On Tue, Sep 04 2012, Dmitry Kurochkin wrote: > The script removes duplicate message files. It takes no options. > > Files are assumed duplicates if their content is the same except for > ignored headers. Currently, the only ignored header is Received:. > --- > contrib/notmuch-remove-duplicates.py | 95 ++++++++++++++++++++++++++++++++++ > 1 file changed, 95 insertions(+) > create mode 100755 contrib/notmuch-remove-duplicates.py > > diff --git a/contrib/notmuch-remove-duplicates.py b/contrib/notmuch-remove-duplicates.py > new file mode 100755 > index 0000000..dbe2e25 > --- /dev/null > +++ b/contrib/notmuch-remove-duplicates.py > @@ -0,0 +1,95 @@ > +#!/usr/bin/env python > + > +import sys > + > +IGNORED_HEADERS = [ "Received:" ] > + > +if len(sys.argv) != 1: > + print "Usage: %s" % sys.argv[0] > + print > + print "The script removes duplicate message files. Takes no options." > + print "Requires notmuch python module." > + print > + print "Files are assumed duplicates if their content is the same" > + print "except for the following headers: %s." % ", ".join(IGNORED_HEADERS) > + exit(1) It's much better to put this inside a main() function, which is then called only if the script is run directly. > + > +import notmuch > +import os > +import time > + > +class MailComparator: > + """Checks if mail files are duplicates.""" > + def __init__(self, filename): > + self.filename = filename > + self.mail = self.readFile(self.filename) > + > + def isDuplicate(self, filename): > + return self.mail == self.readFile(filename) > + > + @staticmethod > + def readFile(filename): > + with open(filename) as f: > + data = "" > + while True: > + line = f.readline() > + for header in IGNORED_HEADERS: > + if line.startswith(header): Header names should be matched case-insensitively, but this comparison is case-sensitive. > + # skip header continuation lines > + while True: > + line = f.readline() > + if len(line) == 0 or line[0] not in [" ", "\t"]: > + break > + break This will also skip the line that immediately follows the ignored header. 
> + else: > + data += line > + if line == "\n": > + break > + data += f.read() > + return data > + > +db = notmuch.Database() > +query = db.create_query('*') > +print "Number of messages: %s" % query.count_messages() > + > +files_count = 0 > +for root, dirs, files in os.walk(db.get_path()): > + if not root.startswith(os.path.join(db.get_path(), ".notmuch/")): > + files_count += len(files) > +print "Number of files: %s" % files_count > +print "Estimated number of duplicates: %s" % (files_count - query.count_messages()) > + > +msgs = query.search_messages() > +msg_count = 0 > +suspected_duplicates_count = 0 > +duplicates_count = 0 > +timestamp = time.time() > +for msg in msgs: > + msg_count += 1 > + if len(msg.get_filenames()) > 1: > + filenames = msg.get_filenames() > + comparator = MailComparator(filenames.next()) > + for filename in filenames: Strictly speaking, you need to compare each file to each file, and not just every file to the first file. > + if os.path.realpath(comparator.filename) == os.path.realpath(filename): > + print "Message '%s' has filenames pointing to the > same file: '%s' '%s'" % (msg.get_message_id(), comparator.filename, > filename) So why aren't those removed? > + elif comparator.isDuplicate(filename): > + os.remove(filename) > + duplicates_count += 1 > + else: > + #print "Potential duplicates: %s" % msg.get_message_id() > + suspected_duplicates_count += 1 > + > + new_timestamp = time.time() > + if new_timestamp - timestamp > 1: > + timestamp = new_timestamp > + sys.stdout.write("\rProcessed %s messages, removed %s duplicates..." % (msg_count, duplicates_count)) > + sys.stdout.flush() > + > +print "\rFinished. Processed %s messages, removed %s duplicates." % (msg_count, duplicates_count) > +if duplicates_count > 0: > + print "You might want to run 'notmuch new' now." > + > +if suspected_duplicates_count > 0: > + print > + print "Found %s messages with duplicate IDs but different content." 
% suspected_duplicates_count > + print "Perhaps we should ignore more headers." Please consider the following instead (not tested): #!/usr/bin/env python import collections import notmuch import os import re import sys import time IGNORED_HEADERS = [ 'Received' ] isIgnoredHeadersLine = re.compile( r'^(?:%s)\s*:' % '|'.join(IGNORED_HEADERS), re.IGNORECASE).search doesStartWithWS = re.compile(r'^\s').search def usage(argv0): print """Usage: %s [] The script removes duplicate message files. Takes no options." Requires notmuch python module." Files are assumed duplicates if their content is the same" except for the following headers: %s.""" % (argv0, ', '.join(IGNORED_HEADERS)) def readMailFile(filename): with open(filename) as fd: data = [] skip_header = False for line in fd: if doesStartWithWS(line): if not skip_header: data.append(line) elif isIgnoredHeadersLine(line): skip_header = True else: data.append(line) if line == '\n': break data.append(fd.read()) return ''.join(data) def dedupMessage(msg): filenames = msg.get_filenames() if len(filenames) <= 1: return (0, 0) realpaths = collections.defaultdict(list) contents = collections.defaultdict(list) for filename in filenames: real = os.path.realpath(filename) lst = realpaths[real] lst.append(filename) if len(lst) == 1: contents[readMailFile(real)].append(real) duplicates = 0 for filenames in contents.itervalues(): if len(filenames) > 1: print 'Files with the same content:' print ' ', filenames.pop() duplicates += len(filenames) for filename in filenames: del realpaths[filename] # os.remane(filename) for real, filenames in realpaths.iteritems(): if len(filenames) > 1: print 'Files pointing to the same message:' print ' ', filenames.pop() duplicates += len(filenames) # for filename in filenames: # os.remane(filename) return (duplicates, len(realpaths) - 1) def dedupQuery(query): print 'Number of messages: %s' % query.count_messages() msg_count = 0 suspected_count = 0 duplicates_count = 0 timestamp = time.time() msgs = 
query.search_messages() for msg in msgs: msg_count += 1 d, s = dedupMessage(msg) duplicates_count += d suspected_count += d new_timestamp = time.time() if new_timestamp - timestamp > 1: timestamp = new_timestamp sys.stdout.write('\rProcessed %s messages, removed %s duplicates...' % (msg_count, duplicates_count)) sys.stdout.flush() print '\rFinished. Processed %s messages, removed %s duplicates.' % ( msg_count, duplicates_count) if duplicates_count > 0: print 'You might want to run "notmuch new" now.' if suspected_duplicates_count > 0: print """ Found %d messages with duplicate IDs but different content. Perhaps we should ignore more headers.""" % suspected_count def main(argv): if len(argv) == 1: query = '*' elif len(argv) == 2: query = argv[1] else: usage(argv[0]) return 1 db = notmuch.Database() query = db.create_query(query) dedupQuery(db, query) return 0 if __name__ == '__main__': sys.exit(main(sys.argv)) -- Best regards, _ _ .o. | Liege of Serenely Enlightened Majesty of o' \,=./ `o ..o | Computer Science, Michał “mina86” Nazarewicz (o o) ooo +------------------ooO--(_)--Ooo--