# HG changeset patch # User mdd # Date 1511994891 -3600 # Node ID c18abd9198c06a2a7eb85722e2a7dea59de66f57 # Parent 1c0beeca2f9c3a260a4eba5a2397ad22a67fc050 implemented Levenshtein algorithm for incredible speedup diff -r 1c0beeca2f9c -r c18abd9198c0 dupecheck.py --- a/dupecheck.py Wed Nov 29 23:04:52 2017 +0100 +++ b/dupecheck.py Wed Nov 29 23:34:51 2017 +0100 @@ -9,9 +9,14 @@ #pylint: disable=invalid-name from __future__ import print_function -import difflib import os, sys +def similarity(a, b): + if DIFFLIB: + return difflib.SequenceMatcher(a=a, b=b).ratio() + else: + return Levenshtein.ratio(a, b) + class dupechecker(object): """ Simple class to scan multiple directories recursive, @@ -24,6 +29,7 @@ self.duplicates = {} self.ratio = 0.85 + def reset(self): self.filelist = [] self.duplicates = {} @@ -67,10 +73,10 @@ sys.stdout.flush() for idx2 in range(idx + 1, listlen): if self.filelist[idx2]: - if difflib.SequenceMatcher(a=self.filelist[idx][0], b=self.filelist[idx2][0]).ratio() > self.ratio: + if similarity(self.filelist[idx][0], self.filelist[idx2][0]) > self.ratio: #print "possible duplicate %d %s" % (idx2, item2[0]) key = os.path.join(self.filelist[idx][2], self.filelist[idx][1]) - if not key in self.duplicates.keys(): + if not key in self.duplicates: self.duplicates[key] = [] self.duplicates[key].append( os.path.join( @@ -79,14 +85,14 @@ )) # unset the found duplicate, so that this will not be scanned again self.filelist[idx2] = None - print("\n\n\n") + print("\n\n") def output(self): """ Dump found duplicates to console """ idx = 1 - for base in self.duplicates.keys(): + for base in self.duplicates: print("Duplicate file set #%i" % idx) print(base) for dup in self.duplicates[base]: @@ -102,13 +108,27 @@ parser = argparse.ArgumentParser(\ description='Movie database filename duplicate checker') parser.add_argument('--ratio', type=float, default=0.85, \ - help='filename duplicate threshold 0.1 < ratio 1.0') + help='filename duplicate threshold 0.1 < ratio 1.0 (default 0.85)') + parser.add_argument('--difflib', action='store_true', default=False, \ + help='force the use of difflib instead Levenshtein') parser.add_argument('basedir', metavar='basedir', nargs='+', \ help='one or more base directories') args = parser.parse_args() dupe = dupechecker() dupe.ratio = args.ratio + if args.difflib: + DIFFLIB = True + import difflib + else: + try: + import Levenshtein + DIFFLIB = False + except ImportError: + import difflib + DIFFLIB = True + print("Consider 'pip install python-Levenshtein' for faster analyze") + for srcstr in args.basedir: dupe.scandir(srcstr)