implemented Levenshtein algorithm for incredible speedup V1.0

Wed, 29 Nov 2017 23:34:51 +0100

author: mdd
date: Wed, 29 Nov 2017 23:34:51 +0100
changeset 22: c18abd9198c0
parent 21: 1c0beeca2f9c
child 23: 9bf1a8d2c26e

implemented Levenshtein-based filename similarity (via the optional python-Levenshtein package, with difflib as fallback) for incredible speedup

dupecheck.py file | annotate | diff | comparison | revisions
--- a/dupecheck.py	Wed Nov 29 23:04:52 2017 +0100
+++ b/dupecheck.py	Wed Nov 29 23:34:51 2017 +0100
@@ -9,9 +9,14 @@
 #pylint: disable=invalid-name
 
 from __future__ import print_function
-import difflib
 import os, sys
 
+def similarity(a, b):
+    if DIFFLIB:
+        return difflib.SequenceMatcher(a=a, b=b).ratio()
+    else:
+        return Levenshtein.ratio(a, b)
+
 class dupechecker(object):
     """
     Simple class to scan multiple directories recursive,
@@ -24,6 +29,7 @@
         self.duplicates = {}
         self.ratio = 0.85
 
+
     def reset(self):
         self.filelist = []
         self.duplicates = {}
@@ -67,10 +73,10 @@
             sys.stdout.flush()
             for idx2 in range(idx + 1, listlen):
                 if self.filelist[idx2]:
-                    if difflib.SequenceMatcher(a=self.filelist[idx][0], b=self.filelist[idx2][0]).ratio() > self.ratio:
+                    if similarity(self.filelist[idx][0], self.filelist[idx2][0]) > self.ratio:
                         #print "possible duplicate %d %s" % (idx2, item2[0])
                         key = os.path.join(self.filelist[idx][2], self.filelist[idx][1])
-                        if not key in self.duplicates.keys():
+                        if not key in self.duplicates:
                             self.duplicates[key] = []
                         self.duplicates[key].append(
                             os.path.join(
@@ -79,14 +85,14 @@
                             ))
                         # unset the found duplicate, so that this will not be scanned again
                         self.filelist[idx2] = None
-        print("\n\n\n")
+        print("\n\n")
 
     def output(self):
         """
         Dump found duplicates to console
         """
         idx = 1
-        for base in self.duplicates.keys():
+        for base in self.duplicates:
             print("Duplicate file set #%i" % idx)
             print(base)
             for dup in self.duplicates[base]:
@@ -102,13 +108,27 @@
     parser = argparse.ArgumentParser(\
         description='Movie database filename duplicate checker')
     parser.add_argument('--ratio', type=float, default=0.85, \
-        help='filename duplicate threshold 0.1 < ratio 1.0')
+        help='filename duplicate threshold 0.1 < ratio 1.0 (default 0.85)')
+    parser.add_argument('--difflib', action='store_true', default=False, \
+        help='force the use of difflib instead Levenshtein')
     parser.add_argument('basedir', metavar='basedir', nargs='+', \
         help='one or more base directories')
 
     args = parser.parse_args()
     dupe = dupechecker()
     dupe.ratio = args.ratio
+    if args.difflib:
+        DIFFLIB = True
+        import difflib
+    else:
+        try:
+            import Levenshtein
+            DIFFLIB = False
+        except ImportError:
+            import difflib
+            DIFFLIB = True
+            print("Consider 'pip install python-Levenshtein' for faster analyze")
+
 
     for srcstr in args.basedir:
         dupe.scandir(srcstr)

mercurial