# HG changeset patch
# User mdd
# Date 1538606568 -7200
# Node ID 83bcb5931ee32af5a991a4f20c70fbe488c50901
# Parent df89a8fba2a205f65330ea51b3328306fc374d26
function to fix weird filenames

diff -r df89a8fba2a2 -r 83bcb5931ee3 dupecheck.py
--- a/dupecheck.py	Tue Dec 12 03:25:44 2017 +0100
+++ b/dupecheck.py	Thu Oct 04 00:42:48 2018 +0200
@@ -1,4 +1,5 @@
 #!/usr/bin/env python
+# -*- coding: utf-8 -*-
 """
 Toolkit / executable to scan for duplicate filenames in movie database
 
@@ -26,6 +27,11 @@
     f = ('%.2f' % nbytes).rstrip('0').rstrip('.')
     return '%s %s' % (f, suffixes[i])
 
+def replace_all(text, dic):
+    for i, j in dic.iteritems():
+        text = text.replace(i, j)
+    return text
+
 class dupechecker(object):
     """
     Simple class to scan multiple directories recursive,
@@ -43,7 +49,7 @@
         self.filelist = []
         self.duplicates = {}
 
-    def scandir(self, basedir):
+    def scandir(self, basedir, extra=[]):
         """
         Scan a base directory for movie files and add them to the
         list for analyze
@@ -65,6 +71,33 @@
             elif ext in ['.mkv', '.avi', '.mpg', '.mpeg', '.mp4']:
                 title = filename[:-4].lower()
                 self.filelist.append([title, filename, root, ext])
+            elif ext in extra:
+                title = filename[:-4].lower()
+                self.filelist.append([title, filename, root, ext])
+
+    def fixnames(self):
+        """
+        Search for defect filenames and remove illegal characters
+        """
+        import re
+        for item in self.filelist:
+            if not item[3] in ['.mkv', '.txt']:
+                continue
+            # any non-alphanumeric characters in filename?
+            cleanfn = replace_all(item[1], {
+                #'ä':'ae', 'Ä':'Ae',
+                #'ö':'oe', 'Ö':'Oe',
+                #'ü':'ue', 'Ü':'Ue',
+                'ß':'ss',
+                })
+            cleanfn = re.sub(r'[^A-Za-z0-9\.\_\-\(\)\&öäüÖÄÜ\' ]', '-', cleanfn)
+            if item[1] == cleanfn:
+                continue
+            print (item[1])
+            os.rename(
+                os.path.join(item[2], item[1]),
+                os.path.join(item[2], cleanfn)
+                )
 
     def statistics(self):
         """
@@ -82,10 +115,17 @@
             "File:",
             "Count:",
             "Size:"))
+        sum_count = 0
+        sum_size = 0.0
 
         for ext in stats.keys():
+            sum_count += stats[ext][0]
+            sum_size += stats[ext][1]
             print ("%5s %6i %10s" % (
                 ext, stats[ext][0],
                 humansize(stats[ext][1])))
+        print ("%5s %6i %10s" % (
+            "TOTAL", sum_count,
+            humansize(sum_size)))
 
     def analyze(self):
@@ -144,6 +184,8 @@
         help='force the use of difflib instead Levenshtein')
     parser.add_argument('--stats', action='store_true', default=False, \
         help='generate stats summary instead of check for duplicates')
+    parser.add_argument('--fixnames', action='store_true', default=False, \
+        help='scan for mkv and txt, fix broken filenames for windows')
     parser.add_argument('basedir', metavar='basedir', nargs='+', \
         help='one or more base directories')
 
@@ -163,10 +205,21 @@
         print("Consider 'pip install python-Levenshtein' for faster analyze")
 
+    if args.fixnames:
+        for srcstr in args.basedir:
+            dupe.scandir(srcstr, ['.txt'])
+        if len(dupe.filelist) > 0:
+            print ("Checking %i file names..." % len(dupe.filelist))
+            dupe.fixnames()
+        dupe.filelist = []
+        sys.exit(0)
+
     for srcstr in args.basedir:
         dupe.scandir(srcstr)
 
-    if args.stats:
+
+    if args.stats or args.fixnames:
        dupe.statistics()
     else:
         dupe.analyze()
         dupe.output()
+
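
Note (not part of the patch above): the new fixnames() pass cleans a file name in two steps. First replace_all() applies a literal substitution table; only 'ß' -> 'ss' is active, the umlaut transliterations are commented out because the regex whitelist below already allows öäüÖÄÜ. Second, re.sub() replaces every character that is not a letter, digit, dot, underscore, hyphen, parenthesis, ampersand, German umlaut, apostrophe or space with '-'. The following stand-alone sketch reproduces that logic for illustration only; it is written for Python 3 (dict.items() instead of the Python 2 dict.iteritems() used in the patched script), and the sample file name is made up.

# Illustrative sketch of the cleanup performed by fixnames(); not part of dupecheck.py.
import re

def replace_all(text, dic):
    # Apply each literal substitution from the mapping.
    for old, new in dic.items():
        text = text.replace(old, new)
    return text

def clean_filename(filename):
    # Step 1: literal substitutions ('ß' -> 'ss' is the only active entry in the patch).
    cleaned = replace_all(filename, {'ß': 'ss'})
    # Step 2: replace anything outside the whitelist with '-'.
    cleaned = re.sub(r"[^A-Za-z0-9\._\-\(\)&öäüÖÄÜ' ]", '-', cleaned)
    return cleaned

if __name__ == '__main__':
    # Example with a made-up name: colon, question mark and asterisks become '-'.
    print(clean_filename("Straße: Ein Film? *final*.mkv"))
    # -> Strasse- Ein Film- -final-.mkv

With the patch applied, the same cleanup is triggered through the new command line switch, e.g. ./dupecheck.py --fixnames <basedir>, where <basedir> stands for one or more directories to scan.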