# HG changeset patch # User mdd # Date 1538611617 -7200 # Node ID 14c966c10648492672f083b91d9e61b09909b760 # Parent 344802cf307d362f8fb050f215593ffa3b64fd1e added ability to provide a list of title prefix strings to ignore in duplicate checking diff -r 344802cf307d -r 14c966c10648 dupecheck.py --- a/dupecheck.py Thu Oct 04 00:43:26 2018 +0200 +++ b/dupecheck.py Thu Oct 04 02:06:57 2018 +0200 @@ -10,7 +10,9 @@ #pylint: disable=invalid-name from __future__ import print_function -import os, sys +import os, sys, re + +RE_PARENTHESES = re.compile("[\(\[].*?[\)\]]") def similarity(a, b): if DIFFLIB: @@ -43,6 +45,7 @@ self.filelist = [] self.duplicates = {} self.ratio = 0.85 + self.ignore_fileprefix = [] def reset(self): @@ -67,12 +70,18 @@ else: title = " - ".join(title[2:]) title = title[:-3].lower() + + # remove parentheses with contents in title + title = RE_PARENTHESES.sub("", title) + self.filelist.append([title, filename, root, ext]) elif ext in ['.mkv', '.avi', '.mpg', '.mpeg', '.mp4']: title = filename[:-4].lower() + title = RE_PARENTHESES.sub("", title) self.filelist.append([title, filename, root, ext]) elif ext in extra: title = filename[:-4].lower() + title = RE_PARENTHESES.sub("", title) self.filelist.append([title, filename, root, ext]) def fixnames(self): @@ -132,10 +141,21 @@ """ Analyze the scanlist for duplicates """ + listlen = len(self.filelist) print("%i files to analyze, running duplicate testing loop..." % ( - len(self.filelist))) + listlen)) - listlen = len(self.filelist) + # remove potentially unwanted entries from the list + if len(self.ignore_fileprefix) > 0: + for idx in reversed(range(listlen)): + for tst in self.ignore_fileprefix: + if tst == '': + continue + if self.filelist[idx][0].startswith(tst): + del self.filelist[idx] + break + listlen = len(self.filelist) + for idx in range(listlen): if not self.filelist[idx]: continue @@ -204,6 +224,10 @@ DIFFLIB = True print("Consider 'pip install python-Levenshtein' for faster analyze") + if os.path.isfile("dupecheck-ignore.txt"): + # read the entire file line by line into buffer + print("Loading ignore filename prefixes file for dupe checking...") + dupe.ignore_fileprefix = [line.rstrip('\n').rstrip('\r') for line in open("dupecheck-ignore.txt", "rb")] if args.fixnames: for srcstr in args.basedir: