dupecheck.py

changeset 35
14c966c10648
parent 33
83bcb5931ee3
child 36
a1ad6f4728be
Legend: equal | deleted | inserted | replaced
34:344802cf307d 35:14c966c10648
8 8
9 #pylint: disable=line-too-long 9 #pylint: disable=line-too-long
10 #pylint: disable=invalid-name 10 #pylint: disable=invalid-name
11 11
12 from __future__ import print_function 12 from __future__ import print_function
13 import os, sys 13 import os, sys, re
14
15 RE_PARENTHESES = re.compile("[\(\[].*?[\)\]]")
14 16
15 def similarity(a, b): 17 def similarity(a, b):
16 if DIFFLIB: 18 if DIFFLIB:
17 return difflib.SequenceMatcher(a=a, b=b).ratio() 19 return difflib.SequenceMatcher(a=a, b=b).ratio()
18 else: 20 else:
41 def __init__(self): 43 def __init__(self):
42 self.basedir = "" 44 self.basedir = ""
43 self.filelist = [] 45 self.filelist = []
44 self.duplicates = {} 46 self.duplicates = {}
45 self.ratio = 0.85 47 self.ratio = 0.85
48 self.ignore_fileprefix = []
46 49
47 50
48 def reset(self): 51 def reset(self):
49 self.filelist = [] 52 self.filelist = []
50 self.duplicates = {} 53 self.duplicates = {}
65 if len(title) == 1: 68 if len(title) == 1:
66 title = title[0] 69 title = title[0]
67 else: 70 else:
68 title = " - ".join(title[2:]) 71 title = " - ".join(title[2:])
69 title = title[:-3].lower() 72 title = title[:-3].lower()
73
74 # remove parentheses with contents in title
75 title = RE_PARENTHESES.sub("", title)
76
70 self.filelist.append([title, filename, root, ext]) 77 self.filelist.append([title, filename, root, ext])
71 elif ext in ['.mkv', '.avi', '.mpg', '.mpeg', '.mp4']: 78 elif ext in ['.mkv', '.avi', '.mpg', '.mpeg', '.mp4']:
72 title = filename[:-4].lower() 79 title = filename[:-4].lower()
80 title = RE_PARENTHESES.sub("", title)
73 self.filelist.append([title, filename, root, ext]) 81 self.filelist.append([title, filename, root, ext])
74 elif ext in extra: 82 elif ext in extra:
75 title = filename[:-4].lower() 83 title = filename[:-4].lower()
84 title = RE_PARENTHESES.sub("", title)
76 self.filelist.append([title, filename, root, ext]) 85 self.filelist.append([title, filename, root, ext])
77 86
78 def fixnames(self): 87 def fixnames(self):
79 """ 88 """
80 Search for defect filenames and remove illegal characters 89 Search for defect filenames and remove illegal characters
130 139
131 def analyze(self): 140 def analyze(self):
132 """ 141 """
133 Analyze the scanlist for duplicates 142 Analyze the scanlist for duplicates
134 """ 143 """
144 listlen = len(self.filelist)
135 print("%i files to analyze, running duplicate testing loop..." % ( 145 print("%i files to analyze, running duplicate testing loop..." % (
136 len(self.filelist))) 146 listlen))
137 147
138 listlen = len(self.filelist) 148 # remove potentially unwanted entries from the list
149 if len(self.ignore_fileprefix) > 0:
150 for idx in reversed(range(listlen)):
151 for tst in self.ignore_fileprefix:
152 if tst == '':
153 continue
154 if self.filelist[idx][0].startswith(tst):
155 del self.filelist[idx]
156 break
157 listlen = len(self.filelist)
158
139 for idx in range(listlen): 159 for idx in range(listlen):
140 if not self.filelist[idx]: 160 if not self.filelist[idx]:
141 continue 161 continue
142 print("\r%d %s\033[K" % ( 162 print("\r%d %s\033[K" % (
143 idx, self.filelist[idx][0]), end='') 163 idx, self.filelist[idx][0]), end='')
202 except ImportError: 222 except ImportError:
203 import difflib 223 import difflib
204 DIFFLIB = True 224 DIFFLIB = True
205 print("Consider 'pip install python-Levenshtein' for faster analyze") 225 print("Consider 'pip install python-Levenshtein' for faster analyze")
206 226
227 if os.path.isfile("dupecheck-ignore.txt"):
228 # read the entire file line by line into buffer
229 print("Loading ignore filename prefixes file for dupe checking...")
230 dupe.ignore_fileprefix = [line.rstrip('\n').rstrip('\r') for line in open("dupecheck-ignore.txt", "rb")]
207 231
208 if args.fixnames: 232 if args.fixnames:
209 for srcstr in args.basedir: 233 for srcstr in args.basedir:
210 dupe.scandir(srcstr, ['.txt']) 234 dupe.scandir(srcstr, ['.txt'])
211 if len(dupe.filelist) > 0: 235 if len(dupe.filelist) > 0:

mercurial