dupecheck.py

changeset 33
83bcb5931ee3
parent 32
df89a8fba2a2
child 35
14c966c10648
equal deleted inserted replaced
32:df89a8fba2a2 33:83bcb5931ee3
1 #!/usr/bin/env python 1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
2 """ 3 """
3 Toolkit / executable to scan for duplicate filenames in movie database 4 Toolkit / executable to scan for duplicate filenames in movie database
4 5
5 2017 by mdd 6 2017 by mdd
6 """ 7 """
23 while nbytes >= 1024 and i < len(suffixes)-1: 24 while nbytes >= 1024 and i < len(suffixes)-1:
24 nbytes /= 1024. 25 nbytes /= 1024.
25 i += 1 26 i += 1
26 f = ('%.2f' % nbytes).rstrip('0').rstrip('.') 27 f = ('%.2f' % nbytes).rstrip('0').rstrip('.')
27 return '%s %s' % (f, suffixes[i]) 28 return '%s %s' % (f, suffixes[i])
29
def replace_all(text, dic):
    """
    Replace every occurrence of each key of dic in text by its value

    text -- string to work on
    dic  -- mapping of search string to replacement string

    The replacements are applied one after another in the iteration
    order of the mapping. Returns the resulting string.
    """
    # items() is available on Python 2 and 3, iteritems() only on Python 2
    for search, replacement in dic.items():
        text = text.replace(search, replacement)
    return text
28 34
29 class dupechecker(object): 35 class dupechecker(object):
30 """ 36 """
31 Simple class to scan multiple directories recursive, 37 Simple class to scan multiple directories recursive,
32 build a list of movie filenames. 38 build a list of movie filenames.
41 47
42 def reset(self): 48 def reset(self):
43 self.filelist = [] 49 self.filelist = []
44 self.duplicates = {} 50 self.duplicates = {}
45 51
46 def scandir(self, basedir): 52 def scandir(self, basedir, extra=[]):
47 """ 53 """
48 Scan a base directory for movie files and add them to 54 Scan a base directory for movie files and add them to
49 the list for analyze 55 the list for analyze
50 """ 56 """
51 self.basedir = basedir 57 self.basedir = basedir
63 title = title[:-3].lower() 69 title = title[:-3].lower()
64 self.filelist.append([title, filename, root, ext]) 70 self.filelist.append([title, filename, root, ext])
65 elif ext in ['.mkv', '.avi', '.mpg', '.mpeg', '.mp4']: 71 elif ext in ['.mkv', '.avi', '.mpg', '.mpeg', '.mp4']:
66 title = filename[:-4].lower() 72 title = filename[:-4].lower()
67 self.filelist.append([title, filename, root, ext]) 73 self.filelist.append([title, filename, root, ext])
74 elif ext in extra:
75 title = filename[:-4].lower()
76 self.filelist.append([title, filename, root, ext])
77
def fixnames(self):
    """
    Search for defect filenames and replace illegal characters

    Only entries of self.filelist with the extension .mkv or .txt are
    checked. 'ß' is rewritten as 'ss', every other character outside
    the allowed set (letters, digits, german umlauts, space and
    . _ - ( ) & ') is replaced by '-'. Each name that needs fixing is
    printed and the file is renamed inside its own directory.
    """
    import re
    # compiled once, the pattern is the same for every file
    illegal = re.compile(r'[^A-Za-z0-9\.\_\-\(\)\&öäüÖÄÜ\' ]')
    for item in self.filelist:
        if item[3] not in ['.mkv', '.txt']:
            continue
        # umlauts are kept as they are, the pattern above allows them
        cleanfn = replace_all(item[1], {
            'ß':'ss',
            })
        cleanfn = illegal.sub('-', cleanfn)
        if item[1] == cleanfn:
            continue
        print (item[1])
        source = os.path.join(item[2], item[1])
        target = os.path.join(item[2], cleanfn)
        # os.rename() silently replaces an existing file on POSIX, so
        # never rename onto a name that is already taken
        if os.path.exists(target):
            print ("skipped, target already exists: %s" % cleanfn)
            continue
        os.rename(source, target)
        # keep the scan entry pointing at the file that now exists
        item[1] = cleanfn
68 101
69 def statistics(self): 102 def statistics(self):
70 """ 103 """
71 Summarize disk usage and print stats about found filetypes 104 Summarize disk usage and print stats about found filetypes
72 """ 105 """
80 item[2], item[1])).st_size 113 item[2], item[1])).st_size
81 print ("%5s %6s %10s" % ( 114 print ("%5s %6s %10s" % (
82 "File:", 115 "File:",
83 "Count:", 116 "Count:",
84 "Size:")) 117 "Size:"))
118 sum_count = 0
119 sum_size = 0.0
85 for ext in stats.keys(): 120 for ext in stats.keys():
121 sum_count += stats[ext][0]
122 sum_size += stats[ext][1]
86 print ("%5s %6i %10s" % ( 123 print ("%5s %6i %10s" % (
87 ext, stats[ext][0], 124 ext, stats[ext][0],
88 humansize(stats[ext][1]))) 125 humansize(stats[ext][1])))
126 print ("%5s %6i %10s" % (
127 "TOTAL", sum_count,
128 humansize(sum_size)))
89 129
90 130
91 def analyze(self): 131 def analyze(self):
92 """ 132 """
93 Analyze the scanlist for duplicates 133 Analyze the scanlist for duplicates
142 help='filename duplicate threshold 0.1 < ratio 1.0 (default 0.85)') 182 help='filename duplicate threshold 0.1 < ratio 1.0 (default 0.85)')
143 parser.add_argument('--difflib', action='store_true', default=False, \ 183 parser.add_argument('--difflib', action='store_true', default=False, \
144 help='force the use of difflib instead Levenshtein') 184 help='force the use of difflib instead Levenshtein')
145 parser.add_argument('--stats', action='store_true', default=False, \ 185 parser.add_argument('--stats', action='store_true', default=False, \
146 help='generate stats summary instead of check for duplicates') 186 help='generate stats summary instead of check for duplicates')
187 parser.add_argument('--fixnames', action='store_true', default=False, \
188 help='scan for mkv and txt, fix broken filenames for windows')
147 parser.add_argument('basedir', metavar='basedir', nargs='+', \ 189 parser.add_argument('basedir', metavar='basedir', nargs='+', \
148 help='one or more base directories') 190 help='one or more base directories')
149 191
150 args = parser.parse_args() 192 args = parser.parse_args()
151 dupe = dupechecker() 193 dupe = dupechecker()
161 import difflib 203 import difflib
162 DIFFLIB = True 204 DIFFLIB = True
163 print("Consider 'pip install python-Levenshtein' for faster analyze") 205 print("Consider 'pip install python-Levenshtein' for faster analyze")
164 206
165 207
208 if args.fixnames:
209 for srcstr in args.basedir:
210 dupe.scandir(srcstr, ['.txt'])
211 if len(dupe.filelist) > 0:
212 print ("Checking %i file names..." % len(dupe.filelist))
213 dupe.fixnames()
214 dupe.filelist = []
215 sys.exit(0)
216
166 for srcstr in args.basedir: 217 for srcstr in args.basedir:
167 dupe.scandir(srcstr) 218 dupe.scandir(srcstr)
168 if args.stats: 219
220 if args.stats or args.fixnames:
169 dupe.statistics() 221 dupe.statistics()
170 else: 222 else:
171 dupe.analyze() 223 dupe.analyze()
172 dupe.output() 224 dupe.output()
225

mercurial