# HG changeset patch
# User mdd
# Date 1538606568 -7200
# Node ID 83bcb5931ee32af5a991a4f20c70fbe488c50901
# Parent df89a8fba2a205f65330ea51b3328306fc374d26
function to fix weird filenames

diff -r df89a8fba2a2 -r 83bcb5931ee3 dupecheck.py
--- a/dupecheck.py	Tue Dec 12 03:25:44 2017 +0100
+++ b/dupecheck.py	Thu Oct 04 00:42:48 2018 +0200
@@ -1,4 +1,5 @@
 #!/usr/bin/env python
+# -*- coding: utf-8 -*-
 """
 Toolkit / executable to scan for duplicate filenames in movie database
 
@@ -26,6 +27,11 @@
     f = ('%.2f' % nbytes).rstrip('0').rstrip('.')
     return '%s %s' % (f, suffixes[i])
 
+def replace_all(text, dic):
+    for i, j in dic.iteritems():
+        text = text.replace(i, j)
+    return text
+
 class dupechecker(object):
     """
     Simple class to scan multiple directories recursive,
@@ -43,7 +49,7 @@
         self.filelist = []
         self.duplicates = {}
 
-    def scandir(self, basedir):
+    def scandir(self, basedir, extra=[]):
         """
         Scan a base directory for movie files and add them to the
         list for analyze
@@ -65,6 +71,33 @@
             elif ext in ['.mkv', '.avi', '.mpg', '.mpeg', '.mp4']:
                 title = filename[:-4].lower()
                 self.filelist.append([title, filename, root, ext])
+            elif ext in extra:
+                title = filename[:-4].lower()
+                self.filelist.append([title, filename, root, ext])
+
+    def fixnames(self):
+        """
+        Search for defect filenames and remove illegal characters
+        """
+        import re
+        for item in self.filelist:
+            if not item[3] in ['.mkv', '.txt']:
+                continue
+            # any non-alphanumeric characters in filename?
+            cleanfn = replace_all(item[1], {
+                #'ä':'ae', 'Ä':'Ae',
+                #'ö':'oe', 'Ö':'Oe',
+                #'ü':'ue', 'Ü':'Ue',
+                'ß':'ss',
+                })
+            cleanfn = re.sub(r'[^A-Za-z0-9\.\_\-\(\)\&öäüÖÄÜ\' ]', '-', cleanfn)
+            if item[1] == cleanfn:
+                continue
+            print (item[1])
+            os.rename(
+                os.path.join(item[2], item[1]),
+                os.path.join(item[2], cleanfn)
+                )
 
     def statistics(self):
         """
@@ -82,10 +115,17 @@
             "File:",
             "Count:",
             "Size:"))
+        sum_count = 0
+        sum_size = 0.0
 
         for ext in stats.keys():
+            sum_count += stats[ext][0]
+            sum_size += stats[ext][1]
             print ("%5s %6i %10s" % (
                 ext, stats[ext][0],
                 humansize(stats[ext][1])))
+        print ("%5s %6i %10s" % (
+            "TOTAL", sum_count,
+            humansize(sum_size)))
 
     def analyze(self):
@@ -144,6 +184,8 @@
         help='force the use of difflib instead Levenshtein')
     parser.add_argument('--stats', action='store_true', default=False, \
         help='generate stats summary instead of check for duplicates')
+    parser.add_argument('--fixnames', action='store_true', default=False, \
+        help='scan for mkv and txt, fix broken filenames for windows')
     parser.add_argument('basedir', metavar='basedir', nargs='+', \
         help='one or more base directories')
 
@@ -163,10 +205,21 @@
         print("Consider 'pip install python-Levenshtein' for faster analyze")
 
+    if args.fixnames:
+        for srcstr in args.basedir:
+            dupe.scandir(srcstr, ['.txt'])
+        if len(dupe.filelist) > 0:
+            print ("Checking %i file names..." % len(dupe.filelist))
+            dupe.fixnames()
+        dupe.filelist = []
+        sys.exit(0)
+
     for srcstr in args.basedir:
         dupe.scandir(srcstr)
 
-    if args.stats:
+
+    if args.stats or args.fixnames:
        dupe.statistics()
     else:
         dupe.analyze()
         dupe.output()
+
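
Note (not part of the patch above): the new fixnames() pass cleans a file name in two steps. First replace_all() applies a literal substitution table; only 'ß' -> 'ss' is active, the umlaut transliterations are commented out because the regex whitelist below already allows öäüÖÄÜ. Second, re.sub() replaces every character that is not a letter, digit, dot, underscore, hyphen, parenthesis, ampersand, German umlaut, apostrophe or space with '-'. The following stand-alone sketch reproduces that logic for illustration only; it is written for Python 3 (dict.items() instead of the Python 2 dict.iteritems() used in the patched script), and the sample file name is made up.

# Illustrative sketch of the cleanup performed by fixnames(); not part of dupecheck.py.
import re

def replace_all(text, dic):
    # Apply each literal substitution from the mapping.
    for old, new in dic.items():
        text = text.replace(old, new)
    return text

def clean_filename(filename):
    # Step 1: literal substitutions ('ß' -> 'ss' is the only active entry in the patch).
    cleaned = replace_all(filename, {'ß': 'ss'})
    # Step 2: replace anything outside the whitelist with '-'.
    cleaned = re.sub(r"[^A-Za-z0-9\._\-\(\)&öäüÖÄÜ' ]", '-', cleaned)
    return cleaned

if __name__ == '__main__':
    # Example with a made-up name: colon, question mark and asterisks become '-'.
    print(clean_filename("Straße: Ein Film? *final*.mkv"))
    # -> Strasse- Ein Film- -final-.mkv

With the patch applied, the same cleanup is triggered through the new command line switch, e.g. ./dupecheck.py --fixnames <basedir>, where <basedir> stands for one or more directories to scan.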