# HG changeset patch
# User mdd
# Date 1550063455 -3600
# Node ID a1ad6f4728be2c30a41e6f6896930c5fa9d977bc
# Parent  14c966c10648492672f083b91d9e61b09909b760
added support for remote ssh dupe checking against local basedir

diff -r 14c966c10648 -r a1ad6f4728be .hgignore
--- a/.hgignore	Thu Oct 04 02:06:57 2018 +0200
+++ b/.hgignore	Wed Feb 13 14:10:55 2019 +0100
@@ -1,5 +1,7 @@
 syntax: glob
 
+config.py
+
 *.pyc
 eit.old/*
 testfiles/*
diff -r 14c966c10648 -r a1ad6f4728be config-dist.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/config-dist.py	Wed Feb 13 14:10:55 2019 +0100
@@ -0,0 +1,12 @@
+# Template for the remote hosts read by "dupecheck.py --remote".
+# Copy this file to config.py (ignored by Mercurial) and adjust the values.
+REMOTE_HOSTS = [
+    {
+        'host': 'dm820',
+        'port': 22,
+        'user': 'root',
+        'pass': 'password',
+        'key': None,  # handed to paramiko's connect() as pkey
+        'basedir': '/media/hdd/movie'
+    },
+]
diff -r 14c966c10648 -r a1ad6f4728be dupecheck.py
--- a/dupecheck.py	Thu Oct 04 02:06:57 2018 +0200
+++ b/dupecheck.py	Wed Feb 13 14:10:55 2019 +0100
@@ -3,7 +3,7 @@
 """
 Toolkit / executable to scan for duplicate filenames in movie database
 
-2017 by mdd
+2017-2019 by mdd
 """
 
 #pylint: disable=line-too-long
@@ -11,6 +11,7 @@
 
 from __future__ import print_function
 import os, sys, re
+import time
 
 RE_PARENTHESES = re.compile("[\(\[].*?[\)\]]")
 
@@ -46,12 +47,43 @@
         self.duplicates = {}
         self.ratio = 0.85
         self.ignore_fileprefix = []
+        self.ssh = None
+        self.ssh_data = None
 
     def reset(self):
         self.filelist = []
         self.duplicates = {}
 
 
+    def __scandir_files(self, root, files, extra=None):
+        """
+        Add the movie files out of a list of file names to the filelist.
+
+        root is stored with each entry as its location, files holds plain
+        file names and extra may hold additional lowercase extensions
+        (with leading dot) to accept besides the built-in ones.
+        """
+        known = ['.mkv', '.avi', '.mpg', '.mpeg', '.mp4']
+        if extra:
+            known.extend(extra)
+        for filename in files:
+            stem, ext = os.path.splitext(filename)
+            ext = ext.lower()
+            if ext == ".ts":
+                # the first two " - " separated fields are dropped
+                # (presumably date and channel of a recording); a name
+                # with fewer fields is used by its last field so that
+                # the title never ends up empty
+                parts = stem.split(" - ")
+                stem = " - ".join(parts[2:]) if len(parts) > 2 else parts[-1]
+            elif ext not in known:
+                continue
+            # splitext instead of fixed-width slicing keeps the title
+            # right for extensions like ".mpeg"; parentheses with their
+            # contents are removed from the title
+            title = RE_PARENTHESES.sub("", stem.lower())
+            self.filelist.append([title, filename, root, ext])
+
     def scandir(self, basedir, extra=[]):
         """
         Scan a base directory for movie files and add them to
@@ -60,29 +92,95 @@
         self.basedir = basedir
         print("Scanning directory: %s" % basedir)
         for root, subdirs, files in os.walk(basedir):
-            for filename in files:
-                ext = os.path.splitext(filename)[1].lower()
-                if ext == ".ts":
-                    #file_path = os.path.join(root, filename)
-                    title = filename.split(" - ")
-                    if len(title) == 1:
-                        title = title[0]
-                    else:
-                        title = " - ".join(title[2:])
-                    title = title[:-3].lower()
-
-                    # remove parentheses with contents in title
-                    title = RE_PARENTHESES.sub("", title)
-
-                    self.filelist.append([title, filename, root, ext])
-                elif ext in ['.mkv', '.avi', '.mpg', '.mpeg', '.mp4']:
-                    title = filename[:-4].lower()
-                    title = RE_PARENTHESES.sub("", title)
-                    self.filelist.append([title, filename, root, ext])
-                elif ext in extra:
-                    title = filename[:-4].lower()
-                    title = RE_PARENTHESES.sub("", title)
-                    self.filelist.append([title, filename, root, ext])
+            self.__scandir_files(root, files, extra)
+
+    def scandir_remote(self, extra=None):
+        """
+        Connect to the ssh hosts listed in config.REMOTE_HOSTS and add the
+        *.ts files found in each host's basedir to the filelist.
+
+        extra is handed on to the file filter like in scandir(). The
+        program exits if config.py or Paramiko cannot be imported.
+        """
+        print("getting filelist from remote hosts...")
+        try:
+            from config import REMOTE_HOSTS
+        except ImportError:
+            print("Please configure REMOTE_HOSTS in config.py!")
+            sys.exit(1)
+        try:
+            import paramiko
+        except ImportError:
+            print("Please install Paramiko!")
+            sys.exit(1)
+        try:
+            from shlex import quote
+        except ImportError:
+            from pipes import quote  # Python 2
+
+        self.ssh = paramiko.SSHClient()
+        # NOTE: AutoAddPolicy accepts unknown host keys without asking
+        self.ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
+
+        for host in REMOTE_HOSTS:
+            self.ssh_data = host
+            # quote the path and use "&&" so that ls never runs in the
+            # login directory when basedir is odd or missing
+            command = 'cd %s && ls -1 *.ts' % quote(host['basedir'])
+            try:
+                lst = self.__ssh_exec(command)[0]
+            finally:
+                # always hang up, the next host needs its own connection
+                self.__ssh_disconnect()
+            cleanlist = []
+            for item in lst:
+                # drop the newline and any non-ASCII characters
+                item = item.strip().encode('ascii', 'ignore')
+                if item:
+                    cleanlist.append(item)
+            self.__scandir_files(
+                "%s: %s" % (host['host'], host['basedir']), cleanlist, extra)
+
+    def __ssh_exec(self, command):
+        """
+        Run command on the host described by self.ssh_data.
+
+        Connects first if there is no active transport; the connection
+        remains open for following commands until __ssh_disconnect is
+        called. Returns the stdout and stderr lines as two lists, both
+        empty if no ssh client has been set up.
+        """
+        if self.ssh is None:
+            return [], []
+        transport = self.ssh.get_transport()
+        if not transport or not transport.is_active():
+            print("SSH: connecting to %s" % self.ssh_data['host'])
+            self.ssh.connect(
+                self.ssh_data['host'], port=self.ssh_data['port'],
+                username=self.ssh_data['user'],
+                password=self.ssh_data['pass'], pkey=self.ssh_data['key'])
+
+        # Send the command (non-blocking)
+        _stdin, stdout, stderr = self.ssh.exec_command(command)
+
+        # Wait until the command has exited or produced output
+        while not stdout.channel.exit_status_ready() and not stdout.channel.recv_ready():
+            time.sleep(0.1)
+
+        return stdout.readlines(), stderr.readlines()
+
+    def __ssh_disconnect(self):
+        """
+        Close the ssh connection, if one has been set up.
+        """
+        if self.ssh is None:
+            return
+        transport = self.ssh.get_transport()
+        if transport is not None and transport.is_active():
+            print("SSH: disconnecting")
+        # close in any case, so that the command for the next host is
+        # not sent over a connection to the previous one
+        self.ssh.close()
 
     def fixnames(self):
         """
@@ -204,6 +302,8 @@
                         help='force the use of difflib instead Levenshtein')
     parser.add_argument('--stats', action='store_true', default=False, \
                         help='generate stats summary instead of check for duplicates')
+    parser.add_argument('--remote', action='store_true', default=False, \
+                        help='Connect to ssh remotes, eg. dupecheck for dreambox local storage')
     parser.add_argument('--fixnames', action='store_true', default=False, \
                         help='scan for mkv and txt, fix broken filenames for windows')
     parser.add_argument('basedir', metavar='basedir', nargs='+', \
@@ -238,6 +338,9 @@
         dupe.filelist = []
         sys.exit(0)
 
+    if args.remote:
+        dupe.scandir_remote()
+
     for srcstr in args.basedir:
         dupe.scandir(srcstr)
 