# HG changeset patch
# User mdd
# Date 1513045544 -3600
# Node ID df89a8fba2a205f65330ea51b3328306fc374d26
# Parent  52371bbcde5c7bf31d7abb7aa32b68bb44525f98
added stats calc to dupechecker

diff -r 52371bbcde5c -r df89a8fba2a2 dupecheck.py
--- a/dupecheck.py	Tue Dec 12 03:02:31 2017 +0100
+++ b/dupecheck.py	Tue Dec 12 03:25:44 2017 +0100
@@ -17,6 +17,21 @@
     else:
         return Levenshtein.ratio(a, b)
 
+suffixes = ['b', 'K', 'M', 'G', 'T', 'P']
+def humansize(nbytes):
+    """
+    Format a byte count as a short human readable string
+
+    The value is divided by 1024 until it drops below 1024 or the
+    largest suffix is reached, e.g. 1536 -> '1.5 K', 0 -> '0 b'
+    """
+    i = 0
+    while nbytes >= 1024 and i < len(suffixes)-1:
+        nbytes /= 1024.
+        i += 1
+    f = ('%.2f' % nbytes).rstrip('0').rstrip('.')
+    return '%s %s' % (f, suffixes[i])
+
 class dupechecker(object):
     """
     Simple class to scan multiple directories recursive,
@@ -52,10 +67,42 @@
                     else:
                         title = " - ".join(title[2:])
                     title = title[:-3].lower()
-                    self.filelist.append([title, filename, root])
+                    self.filelist.append([title, filename, root, ext])
                 elif ext in ['.mkv', '.avi', '.mpg', '.mpeg', '.mp4']:
                     title = filename[:-4].lower()
-                    self.filelist.append([title, filename, root])
+                    self.filelist.append([title, filename, root, ext])
+
+    def statistics(self):
+        """
+        Summarize disk usage and print stats about found filetypes
+
+        Prints one row per file extension with the number of files
+        and their accumulated size. Files that cannot be stat'ed
+        (deleted since the scan, dangling symlink) are counted but
+        add nothing to the size.
+        """
+        stats = {}
+        for item in self.filelist:
+            # item is [title, filename, root, ext]
+            ext = item[3]
+            if ext not in stats:
+                stats[ext] = [0, 0.0]
+            stats[ext][0] += 1
+            try:
+                stats[ext][1] += os.stat(
+                    os.path.join(item[2], item[1])).st_size
+            except OSError:
+                # don't abort the whole summary for one unreadable file
+                pass
+        print ("%5s %6s %10s" % (
+            "File:",
+            "Count:",
+            "Size:"))
+        for ext in sorted(stats):
+            print ("%5s %6i %10s" % (
+                ext, stats[ext][0],
+                humansize(stats[ext][1])))
+
 
     def analyze(self):
         """
@@ -111,6 +158,8 @@
             help='filename duplicate threshold 0.1 < ratio 1.0 (default 0.85)')
     parser.add_argument('--difflib', action='store_true', default=False, \
             help='force the use of difflib instead Levenshtein')
+    parser.add_argument('--stats', action='store_true', default=False, \
+            help='generate stats summary instead of check for duplicates')
     parser.add_argument('basedir', metavar='basedir', nargs='+', \
             help='one or more base directories')
 
@@ -132,5 +181,8 @@
     for srcstr in args.basedir:
         dupe.scandir(srcstr)
 
-    dupe.analyze()
-    dupe.output()
+    if args.stats:
+        dupe.statistics()
+    else:
+        dupe.analyze()
+        dupe.output()