dupecheck.py

Thu, 04 Oct 2018 02:06:57 +0200

author
mdd
date
Thu, 04 Oct 2018 02:06:57 +0200
changeset 35
14c966c10648
parent 33
83bcb5931ee3
child 36
a1ad6f4728be
permissions
-rw-r--r--

added ability to provide a list of title prefix strings to ignore in duplicate checking

3
569fa9a431b9 added filename duplicate checker
mdd
parents:
diff changeset
1 #!/usr/bin/env python
33
83bcb5931ee3 function to fix weird filenames
mdd
parents: 32
diff changeset
2 # -*- coding: utf-8 -*-
21
1c0beeca2f9c cleanup dupechecker
mdd
parents: 15
diff changeset
3 """
1c0beeca2f9c cleanup dupechecker
mdd
parents: 15
diff changeset
4 Toolkit / executable to scan for duplicate filenames in movie database
1c0beeca2f9c cleanup dupechecker
mdd
parents: 15
diff changeset
5
1c0beeca2f9c cleanup dupechecker
mdd
parents: 15
diff changeset
6 2017 by mdd
1c0beeca2f9c cleanup dupechecker
mdd
parents: 15
diff changeset
7 """
1c0beeca2f9c cleanup dupechecker
mdd
parents: 15
diff changeset
8
1c0beeca2f9c cleanup dupechecker
mdd
parents: 15
diff changeset
9 #pylint: disable=line-too-long
1c0beeca2f9c cleanup dupechecker
mdd
parents: 15
diff changeset
10 #pylint: disable=invalid-name
3
569fa9a431b9 added filename duplicate checker
mdd
parents:
diff changeset
11
4
a7e9e7974c22 prepare for speedup
mdd
parents: 3
diff changeset
12 from __future__ import print_function
35
14c966c10648 added ability to provide a list of title prefix strings to ignore in duplicate checking
mdd
parents: 33
diff changeset
13 import os, sys, re
14c966c10648 added ability to provide a list of title prefix strings to ignore in duplicate checking
mdd
parents: 33
diff changeset
14
14c966c10648 added ability to provide a list of title prefix strings to ignore in duplicate checking
mdd
parents: 33
diff changeset
# Matches a round or square bracket group together with its contents
# (non-greedy), e.g. "(2010)" or "[uncut]".  Raw string, so the backslash
# escapes reach the regex engine instead of being read as (invalid) string
# escapes.
RE_PARENTHESES = re.compile(r"[\(\[].*?[\)\]]")
3
569fa9a431b9 added filename duplicate checker
mdd
parents:
diff changeset
16
22
c18abd9198c0 implemented Levenshtein algorithm for incredible speedup
mdd
parents: 21
diff changeset
def similarity(a, b):
    """
    Return the similarity ratio of the strings a and b, where 1.0 means
    identical and 0.0 means nothing in common.

    The backend is selected by the module level DIFFLIB flag that the
    command line section sets: a true value selects difflib, a false value
    selects the Levenshtein extension.  If the flag was never set (module
    imported as a library) Levenshtein is tried first and difflib is the
    fallback, instead of failing with a NameError.
    """
    if not globals().get("DIFFLIB", False):
        try:
            # after the first success this is served from sys.modules
            import Levenshtein
        except ImportError:
            # remember the failure so the import is not retried on every call
            globals()["DIFFLIB"] = True
        else:
            return Levenshtein.ratio(a, b)
    import difflib
    return difflib.SequenceMatcher(a=a, b=b).ratio()
c18abd9198c0 implemented Levenshtein algorithm for incredible speedup
mdd
parents: 21
diff changeset
22
32
df89a8fba2a2 added stats calc to dupechecker
mdd
parents: 22
diff changeset
# unit suffixes used by humansize(), each one 1024 times the previous
suffixes = ['b', 'K', 'M', 'G', 'T', 'P']
def humansize(nbytes):
    """
    Format a byte count as a short human readable string like "1.5 K"

    The value is divided by 1024 until it fits the unit, printed with at
    most two decimals (trailing zeros removed) and followed by the unit
    suffix.  Values beyond the last suffix stay in that unit.
    """
    size = float(nbytes)
    i = 0
    # Test the rounded value: the output is rounded to two decimals, so a
    # size just below the boundary would otherwise print as "1024 K"
    # instead of "1 M".
    while round(size, 2) >= 1024 and i < len(suffixes) - 1:
        size /= 1024.
        i += 1
    f = ('%.2f' % size).rstrip('0').rstrip('.')
    return '%s %s' % (f, suffixes[i])
df89a8fba2a2 added stats calc to dupechecker
mdd
parents: 22
diff changeset
31
33
83bcb5931ee3 function to fix weird filenames
mdd
parents: 32
diff changeset
def replace_all(text, dic):
    """
    Return text with every key of dic replaced by its value

    The replacements are applied one after another in the iteration order
    of dic, so a later replacement also sees the result of earlier ones.
    """
    # items() exists on Python 2 and 3, iteritems() only on Python 2
    for i, j in dic.items():
        text = text.replace(i, j)
    return text
83bcb5931ee3 function to fix weird filenames
mdd
parents: 32
diff changeset
36
21
1c0beeca2f9c cleanup dupechecker
mdd
parents: 15
diff changeset
class dupechecker(object):
    """
    Simple class to scan multiple directories recursively,
    build a list of movie filenames,
    analyze the list for duplicates and dump them.

    Every entry of filelist is a list [title, filename, root, ext]: the
    normalized title used for the comparison, the file name as found on
    disk, the directory that contains it and the lower-cased extension.
    """
    def __init__(self):
        # directory given to the most recent scandir() call
        self.basedir = ""
        # [title, filename, root, ext] entries collected by scandir()
        self.filelist = []
        # path of the first file of a set -> paths of the similar files
        self.duplicates = {}
        # two titles count as duplicates above this similarity
        self.ratio = 0.85
        # titles starting with one of these strings are skipped by analyze()
        self.ignore_fileprefix = []

    def reset(self):
        """
        Forget all scanned files and all found duplicates
        """
        self.filelist = []
        self.duplicates = {}

    def scandir(self, basedir, extra=None):
        """
        Scan a base directory for movie files and add them to
        the list for analyze

        basedir is walked recursively.  extra is an optional collection of
        additional lower-case extensions (with leading dot) to collect
        besides the built-in movie extensions.

        The title is the file name without extension, lower-cased and with
        bracket groups removed.  For .ts files of the form
        "part1 - part2 - title.ts" only the text after the second " - " is
        used.
        """
        if extra is None:
            extra = ()
        self.basedir = basedir
        print("Scanning directory: %s" % basedir)
        for root, _subdirs, files in os.walk(basedir):
            for filename in files:
                # splitext instead of a fixed slice: extensions differ in
                # length (".ts", ".mkv", ".mpeg", user supplied ones)
                stem, ext = os.path.splitext(filename)
                ext = ext.lower()
                if ext == ".ts":
                    parts = stem.split(" - ")
                    if len(parts) > 1:
                        # NOTE(review): a name with exactly one " - " ends
                        # up with an empty title here, same as before;
                        # confirm the expected naming scheme
                        stem = " - ".join(parts[2:])
                elif ext not in ('.mkv', '.avi', '.mpg', '.mpeg', '.mp4') \
                        and ext not in extra:
                    continue

                # remove parentheses with contents in title
                title = RE_PARENTHESES.sub("", stem.lower())
                self.filelist.append([title, filename, root, ext])

    def fixnames(self):
        """
        Search for defect filenames and remove illegal characters

        Only .mkv and .txt entries are handled.  Sharp s is written as
        "ss", every other character outside the allowed set becomes "-".
        Each changed name is printed and the file is renamed on disk; a
        file is left alone when the cleaned name already exists.
        """
        illegal = re.compile(r'[^A-Za-z0-9\.\_\-\(\)\&öäüÖÄÜ\' ]')
        for item in self.filelist:
            if item[3] not in ('.mkv', '.txt'):
                continue
            # umlauts are in the allowed set, only sharp s is transliterated
            cleanfn = illegal.sub('-', replace_all(item[1], {'ß': 'ss'}))
            if item[1] == cleanfn:
                continue
            print (item[1])
            target = os.path.join(item[2], cleanfn)
            if os.path.exists(target):
                # os.rename silently replaces an existing file on POSIX
                print("Skipped, target already exists: %s" % target)
                continue
            os.rename(os.path.join(item[2], item[1]), target)
            # keep the entry in sync with the name on disk
            item[1] = cleanfn

    def statistics(self):
        """
        Summarize disk usage and print stats about found filetypes

        Prints one line per extension with the number of files and their
        summed size, followed by a TOTAL line.
        """
        stats = {}
        for item in self.filelist:
            entry = stats.setdefault(item[3], [0, 0.0])
            entry[0] += 1
            entry[1] += os.stat(os.path.join(item[2], item[1])).st_size
        print ("%5s %6s %10s" % (
            "File:",
            "Count:",
            "Size:"))
        sum_count = 0
        sum_size = 0.0
        for ext, (count, size) in stats.items():
            sum_count += count
            sum_size += size
            print ("%5s %6i %10s" % (
                ext, count,
                humansize(size)))
        print ("%5s %6i %10s" % (
            "TOTAL", sum_count,
            humansize(sum_size)))

    def analyze(self):
        """
        Analyze the scanlist for duplicates

        Entries whose title starts with one of the ignore_fileprefix
        strings are removed from filelist first.  Titles are always lower
        case, so the prefixes are lower-cased before the comparison.
        Every remaining title is compared with all later ones; matches
        above self.ratio are collected in self.duplicates and their
        filelist slot is set to None so they are not compared again.
        """
        print("%i files to analyze, running duplicate testing loop..." % (
            len(self.filelist)))

        # remove potentially unwanted entries from the list
        prefixes = tuple(
            tst.lower() for tst in self.ignore_fileprefix if tst != '')
        if prefixes:
            # slice assignment keeps the same list object
            self.filelist[:] = [
                item for item in self.filelist
                if not (item and item[0].startswith(prefixes))]

        listlen = len(self.filelist)
        for idx, current in enumerate(self.filelist):
            if not current:
                continue
            # progress line, "\033[K" clears the rest of the terminal line
            print("\r%d %s\033[K" % (
                idx, current[0]), end='')
            sys.stdout.flush()
            key = os.path.join(current[2], current[1])
            for idx2 in range(idx + 1, listlen):
                other = self.filelist[idx2]
                if not other:
                    continue
                if similarity(current[0], other[0]) > self.ratio:
                    self.duplicates.setdefault(key, []).append(
                        os.path.join(other[2], other[1]))
                    # unset the found duplicate, so that this will not be scanned again
                    self.filelist[idx2] = None
        print("\n\n")

    def output(self):
        """
        Dump found duplicates to console
        """
        for idx, (base, dups) in enumerate(self.duplicates.items(), 1):
            print("Duplicate file set #%i" % idx)
            print(base)
            for dup in dups:
                print(dup)
            print()
1c0beeca2f9c cleanup dupechecker
mdd
parents: 15
diff changeset
193
1c0beeca2f9c cleanup dupechecker
mdd
parents: 15
diff changeset
194
1c0beeca2f9c cleanup dupechecker
mdd
parents: 15
diff changeset
if __name__ == "__main__":
    # Command line entry point: scan the given base directories, then
    # either fix file names, print statistics or report duplicates.
    # parse command line options
    import argparse

    parser = argparse.ArgumentParser(
        description='Movie database filename duplicate checker')
    parser.add_argument('--ratio', type=float, default=0.85,
                        help='filename duplicate threshold 0.1 < ratio < 1.0 (default 0.85)')
    parser.add_argument('--difflib', action='store_true', default=False,
                        help='force the use of difflib instead Levenshtein')
    parser.add_argument('--stats', action='store_true', default=False,
                        help='generate stats summary instead of check for duplicates')
    parser.add_argument('--fixnames', action='store_true', default=False,
                        help='scan for mkv and txt, fix broken filenames for windows')
    parser.add_argument('basedir', metavar='basedir', nargs='+',
                        help='one or more base directories')

    args = parser.parse_args()
    dupe = dupechecker()
    dupe.ratio = args.ratio

    # Select the similarity backend.  DIFFLIB and the imported module are
    # module level names that similarity() reads.
    if args.difflib:
        DIFFLIB = True
        import difflib
    else:
        try:
            import Levenshtein
            DIFFLIB = False
        except ImportError:
            import difflib
            DIFFLIB = True
            print("Consider 'pip install python-Levenshtein' for faster analyze")

    if os.path.isfile("dupecheck-ignore.txt"):
        # read the entire file line by line into buffer
        print("Loading ignore filename prefixes file for dupe checking...")
        # Text mode, so the prefixes have the same string type as the
        # scanned titles; the with block closes the file again.
        with open("dupecheck-ignore.txt", "r") as ignore_file:
            dupe.ignore_fileprefix = [
                line.rstrip('\n').rstrip('\r') for line in ignore_file]

    if args.fixnames:
        # .txt files are only collected for this mode
        for srcstr in args.basedir:
            dupe.scandir(srcstr, ['.txt'])
        if len(dupe.filelist) > 0:
            print ("Checking %i file names..." % len(dupe.filelist))
            dupe.fixnames()
        sys.exit(0)

    for srcstr in args.basedir:
        dupe.scandir(srcstr)

    # --fixnames has already exited above, so only --stats is tested here
    if args.stats:
        dupe.statistics()
    else:
        dupe.analyze()
        dupe.output()
33
83bcb5931ee3 function to fix weird filenames
mdd
parents: 32
diff changeset
249

mercurial