cleanup dupechecker

Wed, 29 Nov 2017 23:04:52 +0100

author
mdd
date
Wed, 29 Nov 2017 23:04:52 +0100
changeset 21
1c0beeca2f9c
parent 20
5b433bdd2023
child 22
c18abd9198c0

cleanup dupechecker

dupecheck.py file | annotate | diff | comparison | revisions
--- a/dupecheck.py	Wed Nov 29 18:00:32 2017 +0100
+++ b/dupecheck.py	Wed Nov 29 23:04:52 2017 +0100
@@ -1,58 +1,116 @@
 #!/usr/bin/env python
+"""
+Toolkit / executable to scan for duplicate filenames in a movie database
+
+2017 by mdd
+"""
+
+#pylint: disable=line-too-long
+#pylint: disable=invalid-name
 
 from __future__ import print_function
 import difflib
 import os, sys
 
-BASEDIR="../DREAMBOX"
+class dupechecker(object):
+    """
+    Simple class to scan multiple directories recursively,
+    build a list of movie filenames,
+    analyze the list for duplicates and dump them.
+    """
+    def __init__(self):
+        self.basedir = ""
+        self.filelist = []
+        self.duplicates = {}
+        self.ratio = 0.85
 
-FILELIST=[]
-DUPLICATES={}
+    def reset(self):
+        self.filelist = []
+        self.duplicates = {}
 
-print("Reading files...")
-for root, subdirs, files in os.walk(BASEDIR):
-    for filename in files:
-        if filename.endswith(".ts"):
-            file_path = os.path.join(root, filename)
-            title = filename.split(" - ")
-            if len(title) == 1:
-                title = title[0]
-            else:
-                title = " - ".join(title[2:])
-            title = title[:-3].lower()
-            FILELIST.append([title, filename, root])
-        elif filename.endswith(".mkv"):
-            title = filename[:-4].lower()
-            FILELIST.append([title, filename, root])
-        elif filename.endswith(".mp4"):
-            title = filename[:-4].lower()
-            FILELIST.append([title, filename, root])
-print("%i files found, running duplicate testing loop" % len(FILELIST))
+    def scandir(self, basedir):
+        """
+        Scan a base directory for movie files and add them to
+        the list for analysis
+        """
+        self.basedir = basedir
+        print("Scanning directory: %s" % basedir)
+        for root, subdirs, files in os.walk(basedir):
+            for filename in files:
+                ext = os.path.splitext(filename)[1].lower()
+                if ext == ".ts":
+                    #file_path = os.path.join(root, filename)
+                    title = filename.split(" - ")
+                    if len(title) == 1:
+                        title = title[0]
+                    else:
+                        title = " - ".join(title[2:])
+                    title = title[:-3].lower()
+                    self.filelist.append([title, filename, root])
+                elif ext in ['.mkv', '.avi', '.mpg', '.mpeg', '.mp4']:
+                    title = filename[:-4].lower()
+                    self.filelist.append([title, filename, root])
+
+    def analyze(self):
+        """
+        Analyze the scanlist for duplicates
+        """
+        print("%i files to analyze, running duplicate testing loop..." % (
+            len(self.filelist)))
 
-listlen = len(FILELIST)
-for idx in range(listlen):
-    if not FILELIST[idx]:
-        continue
-    print("\r%d %s\033[K" % (idx, FILELIST[idx][0]),
-        end='')
-    sys.stdout.flush()
-    for idx2 in range(idx + 1, listlen):
-        if FILELIST[idx2] and difflib.SequenceMatcher(a = FILELIST[idx][0], b = FILELIST[idx2][0]).ratio() > 0.85:
-            #print "possible duplicate %d %s" % (idx2, item2[0])
-            key = os.path.join(FILELIST[idx][2], FILELIST[idx][1])
-            if not key in DUPLICATES.keys():
-                DUPLICATES[key] = []
-            DUPLICATES[key].append(
-                os.path.join(FILELIST[idx2][2], FILELIST[idx2][1]))
-            # unset the found duplicate, so that this will not be scanned again
-            FILELIST[idx2] = None
+        listlen = len(self.filelist)
+        for idx in range(listlen):
+            if not self.filelist[idx]:
+                continue
+            print("\r%d %s\033[K" % (
+                idx, self.filelist[idx][0]), end='')
+            sys.stdout.flush()
+            for idx2 in range(idx + 1, listlen):
+                if self.filelist[idx2]:
+                    if difflib.SequenceMatcher(a=self.filelist[idx][0], b=self.filelist[idx2][0]).ratio() > self.ratio:
+                        #print "possible duplicate %d %s" % (idx2, item2[0])
+                        key = os.path.join(self.filelist[idx][2], self.filelist[idx][1])
+                        if not key in self.duplicates.keys():
+                            self.duplicates[key] = []
+                        self.duplicates[key].append(
+                            os.path.join(
+                                self.filelist[idx2][2],
+                                self.filelist[idx2][1]
+                            ))
+                        # unset the found duplicate, so that this will not be scanned again
+                        self.filelist[idx2] = None
+        print("\n\n\n")
 
-print("\n\n\n")
-idx = 1
-for base in DUPLICATES.keys():
-    print("Duplicate file set #%i" % idx)
-    print(base)
-    for dup in DUPLICATES[base]:
-        print(dup)
-    print()
-    idx += 1
+    def output(self):
+        """
+        Dump found duplicates to console
+        """
+        idx = 1
+        for base in self.duplicates.keys():
+            print("Duplicate file set #%i" % idx)
+            print(base)
+            for dup in self.duplicates[base]:
+                print(dup)
+            print()
+            idx += 1
+
+
+if __name__ == "__main__":
+    # parse command line options
+    import argparse
+
+    parser = argparse.ArgumentParser(\
+        description='Movie database filename duplicate checker')
+    parser.add_argument('--ratio', type=float, default=0.85, \
+        help='filename duplicate threshold 0.1 < ratio 1.0')
+    parser.add_argument('basedir', metavar='basedir', nargs='+', \
+        help='one or more base directories')
+
+    args = parser.parse_args()
+    dupe = dupechecker()
+    dupe.ratio = args.ratio
+
+    for srcstr in args.basedir:
+        dupe.scandir(srcstr)
+    dupe.analyze()
+    dupe.output()

mercurial