plugins/hg4idea/testData/bin/mercurial/similar.py - platform/tools/idea - Git at Google

 # similar.py - mechanisms for finding similar files
 #
 # Copyright 2005-2007 Matt Mackall <mpm@selenic.com>
 #
 # This software may be used and distributed according to the terms of the
 # GNU General Public License version 2 or any later version.

 from i18n import _
 import util
 import mdiff
 import bdiff

 def _findexactmatches(repo, added, removed):
     '''find renamed files that have no changes

     Takes a list of new filectxs and a list of removed filectxs, and yields
     (before, after) tuples of exact matches.
     '''
     numfiles = len(added) + len(removed)

     # Get hashes of removed files.
     hashes = {}
     for i, fctx in enumerate(removed):
         repo.ui.progress(_('searching for exact renames'), i, total=numfiles)
         h = util.sha1(fctx.data()).digest()
         hashes[h] = fctx

     # For each added file, see if it corresponds to a removed file.
     for i, fctx in enumerate(added):
         repo.ui.progress(_('searching for exact renames'), i + len(removed),
                 total=numfiles)
         h = util.sha1(fctx.data()).digest()
         if h in hashes:
             yield (hashes[h], fctx)

     # Done
     repo.ui.progress(_('searching for exact renames'), None)

 def _findsimilarmatches(repo, added, removed, threshold):
     '''find potentially renamed files based on similar file content

     Takes a list of new filectxs and a list of removed filectxs, and yields
     (before, after, score) tuples of partial matches.
     '''
     copies = {}
     for i, r in enumerate(removed):
         repo.ui.progress(_('searching for similar files'), i,
                          total=len(removed))

         # lazily load text
         @util.cachefunc
         def data():
             orig = r.data()
             return orig, mdiff.splitnewlines(orig)

         def score(text):
             orig, lines = data()
             # bdiff.blocks() returns blocks of matching lines
             # count the number of bytes in each
             equal = 0
             matches = bdiff.blocks(text, orig)
             for x1, x2, y1, y2 in matches:
                 for line in lines[y1:y2]:
                     equal += len(line)

             lengths = len(text) + len(orig)
             return equal * 2.0 / lengths

         for a in added:
             bestscore = copies.get(a, (None, threshold))[1]
             myscore = score(a.data())
             if myscore >= bestscore:
                 copies[a] = (r, myscore)
     repo.ui.progress(_('searching'), None)

     for dest, v in copies.iteritems():
         source, score = v
         yield source, dest, score

 def findrenames(repo, added, removed, threshold):
     '''find renamed files -- yields (before, after, score) tuples'''
     parentctx = repo['.']
     workingctx = repo[None]

     # Zero length files will be frequently unrelated to each other, and
     # tracking the deletion/addition of such a file will probably cause more
     # harm than good. We strip them out here to avoid matching them later on.
     addedfiles = set([workingctx[fp] for fp in added
             if workingctx[fp].size() > 0])
     removedfiles = set([parentctx[fp] for fp in removed
             if fp in parentctx and parentctx[fp].size() > 0])

     # Find exact matches.
     for (a, b) in _findexactmatches(repo,
             sorted(addedfiles), sorted(removedfiles)):
         addedfiles.remove(b)
         yield (a.path(), b.path(), 1.0)

     # If the user requested similar files to be matched, search for them also.
     if threshold < 1.0:
         for (a, b, score) in _findsimilarmatches(repo,
                 sorted(addedfiles), sorted(removedfiles), threshold):
             yield (a.path(), b.path(), score)
	# similar.py - mechanisms for finding similar files
	#
	# Copyright 2005-2007 Matt Mackall <mpm@selenic.com>
	#
	# This software may be used and distributed according to the terms of the
	# GNU General Public License version 2 or any later version.

	from i18n import _
	import util
	import mdiff
	import bdiff

	def _findexactmatches(repo, added, removed):
	'''find renamed files that have no changes

	Takes a list of new filectxs and a list of removed filectxs, and yields
	(before, after) tuples of exact matches.
	'''
	numfiles = len(added) + len(removed)

	# Get hashes of removed files.
	hashes = {}
	for i, fctx in enumerate(removed):
	repo.ui.progress(_('searching for exact renames'), i, total=numfiles)
	h = util.sha1(fctx.data()).digest()
	hashes[h] = fctx

	# For each added file, see if it corresponds to a removed file.
	for i, fctx in enumerate(added):
	repo.ui.progress(_('searching for exact renames'), i + len(removed),
	total=numfiles)
	h = util.sha1(fctx.data()).digest()
	if h in hashes:
	yield (hashes[h], fctx)

	# Done
	repo.ui.progress(_('searching for exact renames'), None)

	def _findsimilarmatches(repo, added, removed, threshold):
	'''find potentially renamed files based on similar file content

	Takes a list of new filectxs and a list of removed filectxs, and yields
	(before, after, score) tuples of partial matches.
	'''
	copies = {}
	for i, r in enumerate(removed):
	repo.ui.progress(_('searching for similar files'), i,
	total=len(removed))

	# lazily load text
	@util.cachefunc
	def data():
	orig = r.data()
	return orig, mdiff.splitnewlines(orig)

	def score(text):
	orig, lines = data()
	# bdiff.blocks() returns blocks of matching lines
	# count the number of bytes in each
	equal = 0
	matches = bdiff.blocks(text, orig)
	for x1, x2, y1, y2 in matches:
	for line in lines[y1:y2]:
	equal += len(line)

	lengths = len(text) + len(orig)
	return equal * 2.0 / lengths

	for a in added:
	bestscore = copies.get(a, (None, threshold))[1]
	myscore = score(a.data())
	if myscore >= bestscore:
	copies[a] = (r, myscore)
	repo.ui.progress(_('searching'), None)

	for dest, v in copies.iteritems():
	source, score = v
	yield source, dest, score

	def findrenames(repo, added, removed, threshold):
	'''find renamed files -- yields (before, after, score) tuples'''
	parentctx = repo['.']
	workingctx = repo[None]

	# Zero length files will be frequently unrelated to each other, and
	# tracking the deletion/addition of such a file will probably cause more
	# harm than good. We strip them out here to avoid matching them later on.
	addedfiles = set([workingctx[fp] for fp in added
	if workingctx[fp].size() > 0])
	removedfiles = set([parentctx[fp] for fp in removed
	if fp in parentctx and parentctx[fp].size() > 0])

	# Find exact matches.
	for (a, b) in _findexactmatches(repo,
	sorted(addedfiles), sorted(removedfiles)):
	addedfiles.remove(b)
	yield (a.path(), b.path(), 1.0)

	# If the user requested similar files to be matched, search for them also.
	if threshold < 1.0:
	for (a, b, score) in _findsimilarmatches(repo,
	sorted(addedfiles), sorted(removedfiles), threshold):
	yield (a.path(), b.path(), score)