dupecheck.py

author       mdd
date         Wed, 29 Nov 2017 23:34:51 +0100
changeset    22:c18abd9198c0
parent       21:1c0beeca2f9c
child        32:df89a8fba2a2
permissions  -rwxr-xr-x

implemented Levenshtein algorithm for incredible speedup
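The speedup comes from replacing difflib's pure-Python SequenceMatcher with the C-backed ratio() from the python-Levenshtein package. A minimal sketch of the two calls the script switches between (the sample titles are made up; both return a similarity between 0.0 and 1.0, though the exact values can differ slightly):

    import difflib
    import Levenshtein  # pip install python-Levenshtein

    a, b = "some movie title", "some movie title hd"
    print(difflib.SequenceMatcher(a=a, b=b).ratio())  # pure Python, stdlib only
    print(Levenshtein.ratio(a, b))                    # C extension, much faster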

#!/usr/bin/env python
"""
Toolkit / executable to scan for duplicate filenames in movie database

2017 by mdd
"""

#pylint: disable=line-too-long
#pylint: disable=invalid-name

from __future__ import print_function
import os
import sys

def similarity(a, b):
    """
    Return a similarity ratio between 0.0 and 1.0 for two strings,
    using difflib or Levenshtein depending on the DIFFLIB flag set
    in the __main__ block below.
    """
    if DIFFLIB:
        return difflib.SequenceMatcher(a=a, b=b).ratio()
    return Levenshtein.ratio(a, b)

class dupechecker(object):
    """
    Simple class to recursively scan multiple directories,
    build a list of movie filenames, analyze the list for
    duplicates and dump them to the console.
    """
    def __init__(self):
        self.basedir = ""
        self.filelist = []
        self.duplicates = {}
        self.ratio = 0.85  # similarity threshold for duplicate detection


    def reset(self):
        """Clear the file list and any previously found duplicates."""
        self.filelist = []
        self.duplicates = {}

    def scandir(self, basedir):
        """
        Scan a base directory recursively for movie files and add
        them to the list for analysis.
        """
        self.basedir = basedir
        print("Scanning directory: %s" % basedir)
        for root, _subdirs, files in os.walk(basedir):
            for filename in files:
                ext = os.path.splitext(filename)[1].lower()
                if ext == ".ts":
                    # .ts recordings: keep only the part after the second
                    # " - " separator, i.e. the title of the recording
                    title = filename.split(" - ")
                    if len(title) == 1:
                        title = title[0]
                    else:
                        title = " - ".join(title[2:])
                    title = title[:-3].lower()  # strip the ".ts" extension
                    self.filelist.append([title, filename, root])
                elif ext in ['.mkv', '.avi', '.mpg', '.mpeg', '.mp4']:
                    # other containers: title is the filename without extension
                    title = os.path.splitext(filename)[0].lower()
                    self.filelist.append([title, filename, root])
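    # A worked example of the title extraction above (filenames are made up):
    #   "SomeChannel - 2017-11-29 - Some Movie.ts"  ->  title "some movie"
    #   "Some Movie (1999).mkv"                     ->  title "some movie (1999)"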

    def analyze(self):
        """
        Analyze the collected file list for duplicates by comparing
        every title against every later one (an O(n^2) pairwise loop).
        """
        print("%i files to analyze, running duplicate testing loop..." % (
            len(self.filelist)))

        listlen = len(self.filelist)
        for idx in range(listlen):
            if not self.filelist[idx]:
                continue
            # progress indicator; "\033[K" clears to the end of the terminal line
            print("\r%d %s\033[K" % (
                idx, self.filelist[idx][0]), end='')
            sys.stdout.flush()
            for idx2 in range(idx + 1, listlen):
                if self.filelist[idx2]:
                    if similarity(self.filelist[idx][0], self.filelist[idx2][0]) > self.ratio:
                        key = os.path.join(self.filelist[idx][2], self.filelist[idx][1])
                        if key not in self.duplicates:
                            self.duplicates[key] = []
                        self.duplicates[key].append(
                            os.path.join(
                                self.filelist[idx2][2],
                                self.filelist[idx2][1]
                            ))
                        # mark the duplicate as handled so it is not compared again
                        self.filelist[idx2] = None
        print("\n\n")

    def output(self):
        """
        Dump found duplicates to console
        """
        idx = 1
        for base in self.duplicates:
            print("Duplicate file set #%i" % idx)
            print(base)
            for dup in self.duplicates[base]:
                print(dup)
            print()
            idx += 1


if __name__ == "__main__":
    # parse command line options
    import argparse

    parser = argparse.ArgumentParser(
        description='Movie database filename duplicate checker')
    parser.add_argument('--ratio', type=float, default=0.85,
        help='filename similarity threshold, 0.1 < ratio <= 1.0 (default 0.85)')
    parser.add_argument('--difflib', action='store_true', default=False,
        help='force the use of difflib instead of Levenshtein')
    parser.add_argument('basedir', metavar='basedir', nargs='+',
        help='one or more base directories to scan')

    args = parser.parse_args()
    dupe = dupechecker()
    dupe.ratio = args.ratio
    if args.difflib:
        DIFFLIB = True
        import difflib
    else:
        try:
            import Levenshtein
            DIFFLIB = False
        except ImportError:
            import difflib
            DIFFLIB = True
            print("Consider 'pip install python-Levenshtein' for faster analyze")


    for srcstr in args.basedir:
        dupe.scandir(srcstr)
    dupe.analyze()
    dupe.output()
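
Invocation follows the argparse setup above; a hypothetical run (the paths are examples only):

    ./dupecheck.py --ratio 0.9 /srv/movies /srv/recordings

This scans both trees, compares every extracted title pair and prints each set of likely duplicates, one block per original file.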
