dupecheck.py

changeset 36
a1ad6f4728be
parent 35
14c966c10648
child 37
5be334b71b08
equal deleted inserted replaced
35:14c966c10648 36:a1ad6f4728be
1 #!/usr/bin/env python 1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*- 2 # -*- coding: utf-8 -*-
3 """ 3 """
4 Toolkit / executable to scan for duplicate filenames in movie database 4 Toolkit / executable to scan for duplicate filenames in movie database
5 5
6 2017 by mdd 6 2017-2019 by mdd
7 """ 7 """
8 8
9 #pylint: disable=line-too-long 9 #pylint: disable=line-too-long
10 #pylint: disable=invalid-name 10 #pylint: disable=invalid-name
11 11
12 from __future__ import print_function 12 from __future__ import print_function
13 import os, sys, re 13 import os, sys, re
14 import time
14 15
# Matches a parenthesised or bracketed group including its contents, e.g.
# "(2010)" or "[HD]"; used to strip such annotations from movie titles.
# Raw string so that "\(" etc. reach the regex engine instead of being
# treated as (invalid) string escape sequences.
RE_PARENTHESES = re.compile(r"[\(\[].*?[\)\]]")
16 17
17 def similarity(a, b): 18 def similarity(a, b):
18 if DIFFLIB: 19 if DIFFLIB:
44 self.basedir = "" 45 self.basedir = ""
45 self.filelist = [] 46 self.filelist = []
46 self.duplicates = {} 47 self.duplicates = {}
47 self.ratio = 0.85 48 self.ratio = 0.85
48 self.ignore_fileprefix = [] 49 self.ignore_fileprefix = []
50 self.ssh = None
51 self.ssh_data = None
49 52
50 53
def reset(self):
    """
    Forget all scanned files and all detected duplicates
    """
    self.filelist, self.duplicates = [], {}
54 57
def __scandir_files(self, root, files, extra=None):
    """
    Derive a comparison title for every movie file in a file listing
    and append [title, filename, root, ext] to self.filelist

    root   -- directory (or other origin label) stored with each entry
    files  -- iterable of plain file names
    extra  -- optional collection of additional lower-case extensions
              (including the dot) to accept besides the built-in ones

    The title is the lower-cased file name without its extension and
    without any parenthesised / bracketed parts. For ".ts" recordings
    named "<a> - <b> - <title>.ts" only the <title> part is used.
    Files with any other extension are ignored.
    """
    movie_extensions = ('.mkv', '.avi', '.mpg', '.mpeg', '.mp4')
    if extra is None:
        extra = ()

    for filename in files:
        ext = os.path.splitext(filename)[1]
        # Strip exactly the detected extension instead of a fixed number
        # of characters, so ".mpeg" and arbitrary extras are cut correctly.
        stem = filename[:len(filename) - len(ext)]
        ext = ext.lower()

        if ext == ".ts":
            parts = stem.split(" - ")
            if len(parts) >= 3:
                # drop the two leading fields, keep everything after them
                title = " - ".join(parts[2:])
            else:
                # fewer than three fields: there is no prefix to drop,
                # keep the whole name rather than an empty title
                title = stem
        elif ext in movie_extensions or ext in extra:
            title = stem
        else:
            continue

        # remove parentheses with contents in title
        title = RE_PARENTHESES.sub("", title.lower())
        self.filelist.append([title, filename, root, ext])
82
83
def scandir(self, basedir, extra=None):
    """
    Scan a base directory for movie files and add them to
    the list for analyze

    basedir -- directory that is walked recursively
    extra   -- optional collection of additional file extensions that
               is handed on to the per-directory file scan
    """
    self.basedir = basedir
    print("Scanning directory: %s" % basedir)
    # None replaces the former shared mutable default; hand on a list
    # so the per-directory scan always receives a real collection.
    extensions = [] if extra is None else extra
    for root, _subdirs, files in os.walk(basedir):
        self.__scandir_files(root, files, extensions)
def scandir_remote(self, extra=None):
    """
    connect to remote ssh servers and get file lists for duplicate check

    The hosts are read from REMOTE_HOSTS in config.py; every entry is
    used as a mapping providing at least 'host' and 'basedir' (plus
    whatever the ssh connect helper reads). Only "*.ts" files directly
    inside 'basedir' are listed.

    extra -- accepted but not used: only *.ts files are listed remotely

    Exits the program if config.py / REMOTE_HOSTS or Paramiko is missing.
    """
    print("getting filelist from remote hosts...")
    try:
        from config import REMOTE_HOSTS
    except ImportError:
        print("Please configure REMOTE_HOSTS in config.py!")
        sys.exit(1)
    try:
        import paramiko
    except ImportError:
        print("Please install Paramiko!")
        sys.exit(1)

    self.ssh = paramiko.SSHClient()
    # NOTE(review): AutoAddPolicy accepts unknown host keys without any
    # verification - confirm this is acceptable for the hosts in use.
    self.ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())

    for host in REMOTE_HOSTS:
        self.ssh_data = host
        try:
            # "&&" so that nothing is listed when the cd fails (with ";"
            # the listing would silently come from the login directory).
            # NOTE(review): basedir is inserted unquoted into the shell
            # command - it must come from trusted configuration.
            lines = self.__ssh_exec(
                'cd %s && ls -1 *.ts' % host['basedir'])[0]
            # Drop non-ASCII characters but keep the names as text:
            # a bare encode() yields bytes on Python 3, which never
            # compare equal to the str extensions used by the file scan.
            names = [line.strip().encode('ascii', 'ignore').decode('ascii')
                     for line in lines]
            self.__scandir_files(
                "%s: %s" % (host['host'], host['basedir']), names)
        finally:
            # drop this host's connection before moving on to the next one
            self.__ssh_disconnect()
127
def __ssh_exec(self, command):
    """
    establish ssh connection and execute command
    the connection remains open for following commands until ssh_disconnect is called

    command -- command line that is executed on the remote host

    Returns a tuple (stdout lines, stderr lines) as delivered by
    readlines(), or None when no ssh client has been created yet
    (self.ssh is None). Connection data is taken from self.ssh_data
    (keys 'host', 'port', 'user', 'pass', 'key').
    """
    if self.ssh is None:
        return None

    # Connect only when there is no usable transport, so consecutive
    # commands share one connection.
    transport = self.ssh.get_transport()
    if not transport or not transport.is_active():
        print("SSH: connecting to %s" % self.ssh_data['host'])
        self.ssh.connect(
            hostname=self.ssh_data['host'],
            port=self.ssh_data['port'],
            username=self.ssh_data['user'],
            password=self.ssh_data['pass'],
            pkey=self.ssh_data['key'])

    # Send the command (non-blocking)
    _stdin, stdout, stderr = self.ssh.exec_command(command)

    # Wait until the command has terminated or output is available
    channel = stdout.channel
    while not channel.exit_status_ready() and not channel.recv_ready():
        time.sleep(1)

    return stdout.readlines(), stderr.readlines()
153
def __ssh_disconnect(self):
    """
    check if ssh is connected and disconnect

    Does nothing when no ssh client has been created (self.ssh is None).
    """
    if self.ssh is None:
        return

    # Announce the disconnect only when a live connection exists; the
    # former check was inverted and skipped exactly that case.
    transport = self.ssh.get_transport()
    if transport and transport.is_active():
        print("SSH: disconnecting")

    # Close client connection. Closing a client that is not connected is
    # harmless, so this also cleans up after a dead transport.
    self.ssh.close()
86 164
87 def fixnames(self): 165 def fixnames(self):
88 """ 166 """
89 Search for defect filenames and remove illegal characters 167 Search for defect filenames and remove illegal characters
90 """ 168 """
202 help='filename duplicate threshold 0.1 < ratio 1.0 (default 0.85)') 280 help='filename duplicate threshold 0.1 < ratio 1.0 (default 0.85)')
203 parser.add_argument('--difflib', action='store_true', default=False, \ 281 parser.add_argument('--difflib', action='store_true', default=False, \
204 help='force the use of difflib instead Levenshtein') 282 help='force the use of difflib instead Levenshtein')
205 parser.add_argument('--stats', action='store_true', default=False, \ 283 parser.add_argument('--stats', action='store_true', default=False, \
206 help='generate stats summary instead of check for duplicates') 284 help='generate stats summary instead of check for duplicates')
285 parser.add_argument('--remote', action='store_true', default=False, \
286 help='Connect to ssh remotes, eg. dupecheck for dreambox local storage')
207 parser.add_argument('--fixnames', action='store_true', default=False, \ 287 parser.add_argument('--fixnames', action='store_true', default=False, \
208 help='scan for mkv and txt, fix broken filenames for windows') 288 help='scan for mkv and txt, fix broken filenames for windows')
209 parser.add_argument('basedir', metavar='basedir', nargs='+', \ 289 parser.add_argument('basedir', metavar='basedir', nargs='+', \
210 help='one or more base directories') 290 help='one or more base directories')
211 291
236 print ("Checking %i file names..." % len(dupe.filelist)) 316 print ("Checking %i file names..." % len(dupe.filelist))
237 dupe.fixnames() 317 dupe.fixnames()
238 dupe.filelist = [] 318 dupe.filelist = []
239 sys.exit(0) 319 sys.exit(0)
240 320
321 if args.remote:
322 dupe.scandir_remote()
323
241 for srcstr in args.basedir: 324 for srcstr in args.basedir:
242 dupe.scandir(srcstr) 325 dupe.scandir(srcstr)
243 326
244 if args.stats or args.fixnames: 327 if args.stats or args.fixnames:
245 dupe.statistics() 328 dupe.statistics()

mercurial