added ability to provide a list of title prefix strings to ignore in duplicate checking

Thu, 04 Oct 2018 02:06:57 +0200

author: mdd
date: Thu, 04 Oct 2018 02:06:57 +0200
changeset 35: 14c966c10648
parent 34: 344802cf307d
child 36: a1ad6f4728be

Added the ability to provide a list of title prefix strings to ignore during duplicate checking.

dupecheck.py file | annotate | diff | comparison | revisions
--- a/dupecheck.py	Thu Oct 04 00:43:26 2018 +0200
+++ b/dupecheck.py	Thu Oct 04 02:06:57 2018 +0200
@@ -10,7 +10,9 @@
 #pylint: disable=invalid-name
 
 from __future__ import print_function
-import os, sys
+import os, sys, re
+
+RE_PARENTHESES = re.compile("[\(\[].*?[\)\]]")
 
 def similarity(a, b):
     if DIFFLIB:
@@ -43,6 +45,7 @@
         self.filelist = []
         self.duplicates = {}
         self.ratio = 0.85
+        self.ignore_fileprefix = []
 
 
     def reset(self):
@@ -67,12 +70,18 @@
                     else:
                         title = " - ".join(title[2:])
                     title = title[:-3].lower()
+
+                    # remove parentheses with contents in title
+                    title = RE_PARENTHESES.sub("", title)
+
                     self.filelist.append([title, filename, root, ext])
                 elif ext in ['.mkv', '.avi', '.mpg', '.mpeg', '.mp4']:
                     title = filename[:-4].lower()
+                    title = RE_PARENTHESES.sub("", title)
                     self.filelist.append([title, filename, root, ext])
                 elif ext in extra:
                     title = filename[:-4].lower()
+                    title = RE_PARENTHESES.sub("", title)
                     self.filelist.append([title, filename, root, ext])
 
     def fixnames(self):
@@ -132,10 +141,21 @@
         """
         Analyze the scanlist for duplicates
         """
+        listlen = len(self.filelist)
         print("%i files to analyze, running duplicate testing loop..." % (
-            len(self.filelist)))
+            listlen))
 
-        listlen = len(self.filelist)
+        # remove potentially unwanted entries from the list
+        if len(self.ignore_fileprefix) > 0:
+            for idx in reversed(range(listlen)):
+                for tst in self.ignore_fileprefix:
+                    if tst == '':
+                        continue
+                    if self.filelist[idx][0].startswith(tst):
+                        del self.filelist[idx]
+                        break
+            listlen = len(self.filelist)
+
         for idx in range(listlen):
             if not self.filelist[idx]:
                 continue
@@ -204,6 +224,10 @@
             DIFFLIB = True
             print("Consider 'pip install python-Levenshtein' for faster analyze")
 
+    if os.path.isfile("dupecheck-ignore.txt"):
+        # read the entire file line by line into buffer
+        print("Loading ignore filename prefixes file for dupe checking...")
+        dupe.ignore_fileprefix = [line.rstrip('\n').rstrip('\r') for line in open("dupecheck-ignore.txt", "rb")]
 
     if args.fixnames:
         for srcstr in args.basedir:

mercurial