* [PATCH] Add notmuch-remove-duplicates.py script to contrib.
From: Dmitry Kurochkin @ 2012-09-04 18:53 UTC
To: notmuch

The script removes duplicate message files. It takes no options.

Files are assumed duplicates if their content is the same except for
ignored headers. Currently, the only ignored header is Received:.
---
 contrib/notmuch-remove-duplicates.py | 95 ++++++++++++++++++++++++++++++++++
 1 file changed, 95 insertions(+)
 create mode 100755 contrib/notmuch-remove-duplicates.py

diff --git a/contrib/notmuch-remove-duplicates.py b/contrib/notmuch-remove-duplicates.py
new file mode 100755
index 0000000..dbe2e25
--- /dev/null
+++ b/contrib/notmuch-remove-duplicates.py
@@ -0,0 +1,95 @@
+#!/usr/bin/env python
+
+import sys
+
+IGNORED_HEADERS = [ "Received:" ]
+
+if len(sys.argv) != 1:
+    print "Usage: %s" % sys.argv[0]
+    print
+    print "The script removes duplicate message files. Takes no options."
+    print "Requires notmuch python module."
+    print
+    print "Files are assumed duplicates if their content is the same"
+    print "except for the following headers: %s." % ", ".join(IGNORED_HEADERS)
+    exit(1)
+
+import notmuch
+import os
+import time
+
+class MailComparator:
+    """Checks if mail files are duplicates."""
+    def __init__(self, filename):
+        self.filename = filename
+        self.mail = self.readFile(self.filename)
+
+    def isDuplicate(self, filename):
+        return self.mail == self.readFile(filename)
+
+    @staticmethod
+    def readFile(filename):
+        with open(filename) as f:
+            data = ""
+            while True:
+                line = f.readline()
+                for header in IGNORED_HEADERS:
+                    if line.startswith(header):
+                        # skip header continuation lines
+                        while True:
+                            line = f.readline()
+                            if len(line) == 0 or line[0] not in [" ", "\t"]:
+                                break
+                        break
+                else:
+                    data += line
+                if line == "\n":
+                    break
+            data += f.read()
+        return data
+
+db = notmuch.Database()
+query = db.create_query('*')
+print "Number of messages: %s" % query.count_messages()
+
+files_count = 0
+for root, dirs, files in os.walk(db.get_path()):
+    if not root.startswith(os.path.join(db.get_path(), ".notmuch/")):
+        files_count += len(files)
+print "Number of files: %s" % files_count
+print "Estimated number of duplicates: %s" % (files_count - query.count_messages())
+
+msgs = query.search_messages()
+msg_count = 0
+suspected_duplicates_count = 0
+duplicates_count = 0
+timestamp = time.time()
+for msg in msgs:
+    msg_count += 1
+    if len(msg.get_filenames()) > 1:
+        filenames = msg.get_filenames()
+        comparator = MailComparator(filenames.next())
+        for filename in filenames:
+            if os.path.realpath(comparator.filename) == os.path.realpath(filename):
+                print "Message '%s' has filenames pointing to the same file: '%s' '%s'" % (msg.get_message_id(), comparator.filename, filename)
+            elif comparator.isDuplicate(filename):
+                os.remove(filename)
+                duplicates_count += 1
+            else:
+                #print "Potential duplicates: %s" % msg.get_message_id()
+                suspected_duplicates_count += 1
+
+    new_timestamp = time.time()
+    if new_timestamp - timestamp > 1:
+        timestamp = new_timestamp
+        sys.stdout.write("\rProcessed %s messages, removed %s duplicates..." % (msg_count, duplicates_count))
+        sys.stdout.flush()
+
+print "\rFinished. Processed %s messages, removed %s duplicates." % (msg_count, duplicates_count)
+if duplicates_count > 0:
+    print "You might want to run 'notmuch new' now."
+
+if suspected_duplicates_count > 0:
+    print
+    print "Found %s messages with duplicate IDs but different content." % suspected_duplicates_count
+    print "Perhaps we should ignore more headers."
-- 
1.7.10.4
* Re: [PATCH] Add notmuch-remove-duplicates.py script to contrib.
From: Michal Nazarewicz @ 2012-09-04 19:43 UTC
To: Dmitry Kurochkin, notmuch

On Tue, Sep 04 2012, Dmitry Kurochkin wrote:
> The script removes duplicate message files. It takes no options.
>
> Files are assumed duplicates if their content is the same except for
> ignored headers. Currently, the only ignored header is Received:.
> ---
>  contrib/notmuch-remove-duplicates.py | 95 ++++++++++++++++++++++++++++++++++
>  1 file changed, 95 insertions(+)
>  create mode 100755 contrib/notmuch-remove-duplicates.py
>
> diff --git a/contrib/notmuch-remove-duplicates.py b/contrib/notmuch-remove-duplicates.py
> new file mode 100755
> index 0000000..dbe2e25
> --- /dev/null
> +++ b/contrib/notmuch-remove-duplicates.py
> @@ -0,0 +1,95 @@
> +#!/usr/bin/env python
> +
> +import sys
> +
> +IGNORED_HEADERS = [ "Received:" ]
> +
> +if len(sys.argv) != 1:
> +    print "Usage: %s" % sys.argv[0]
> +    print
> +    print "The script removes duplicate message files. Takes no options."
> +    print "Requires notmuch python module."
> +    print
> +    print "Files are assumed duplicates if their content is the same"
> +    print "except for the following headers: %s." % ", ".join(IGNORED_HEADERS)
> +    exit(1)

It's much better put inside a main() function, which is then called only
if the script is run directly.

> +
> +import notmuch
> +import os
> +import time
> +
> +class MailComparator:
> +    """Checks if mail files are duplicates."""
> +    def __init__(self, filename):
> +        self.filename = filename
> +        self.mail = self.readFile(self.filename)
> +
> +    def isDuplicate(self, filename):
> +        return self.mail == self.readFile(filename)
> +
> +    @staticmethod
> +    def readFile(filename):
> +        with open(filename) as f:
> +            data = ""
> +            while True:
> +                line = f.readline()
> +                for header in IGNORED_HEADERS:
> +                    if line.startswith(header):

Case of headers should be ignored, but this does not ignore it.

> +                        # skip header continuation lines
> +                        while True:
> +                            line = f.readline()
> +                            if len(line) == 0 or line[0] not in [" ", "\t"]:
> +                                break
> +                        break

This will ignore the line just after the ignored header.

> +                else:
> +                    data += line
> +                if line == "\n":
> +                    break
> +            data += f.read()
> +        return data
> +
> +db = notmuch.Database()
> +query = db.create_query('*')
> +print "Number of messages: %s" % query.count_messages()
> +
> +files_count = 0
> +for root, dirs, files in os.walk(db.get_path()):
> +    if not root.startswith(os.path.join(db.get_path(), ".notmuch/")):
> +        files_count += len(files)
> +print "Number of files: %s" % files_count
> +print "Estimated number of duplicates: %s" % (files_count - query.count_messages())
> +
> +msgs = query.search_messages()
> +msg_count = 0
> +suspected_duplicates_count = 0
> +duplicates_count = 0
> +timestamp = time.time()
> +for msg in msgs:
> +    msg_count += 1
> +    if len(msg.get_filenames()) > 1:
> +        filenames = msg.get_filenames()
> +        comparator = MailComparator(filenames.next())
> +        for filename in filenames:

Strictly speaking, you need to compare each file to each file, and not
just every file to the first file.

> +            if os.path.realpath(comparator.filename) == os.path.realpath(filename):
> +                print "Message '%s' has filenames pointing to the
> same file: '%s' '%s'" % (msg.get_message_id(), comparator.filename,
> filename)

So why aren't those removed?

> +            elif comparator.isDuplicate(filename):
> +                os.remove(filename)
> +                duplicates_count += 1
> +            else:
> +                #print "Potential duplicates: %s" % msg.get_message_id()
> +                suspected_duplicates_count += 1
> +
> +    new_timestamp = time.time()
> +    if new_timestamp - timestamp > 1:
> +        timestamp = new_timestamp
> +        sys.stdout.write("\rProcessed %s messages, removed %s duplicates..." % (msg_count, duplicates_count))
> +        sys.stdout.flush()
> +
> +print "\rFinished. Processed %s messages, removed %s duplicates." % (msg_count, duplicates_count)
> +if duplicates_count > 0:
> +    print "You might want to run 'notmuch new' now."
> +
> +if suspected_duplicates_count > 0:
> +    print
> +    print "Found %s messages with duplicate IDs but different content." % suspected_duplicates_count
> +    print "Perhaps we should ignore more headers."

Please consider the following instead (not tested):

#!/usr/bin/env python

import collections
import notmuch
import os
import re
import sys
import time


IGNORED_HEADERS = [ 'Received' ]


isIgnoredHeadersLine = re.compile(
    r'^(?:%s)\s*:' % '|'.join(IGNORED_HEADERS),
    re.IGNORECASE).search

doesStartWithWS = re.compile(r'^\s').search


def usage(argv0):
    print """Usage: %s [<query-string>]

The script removes duplicate message files. Takes no options."
Requires notmuch python module."

Files are assumed duplicates if their content is the same"
except for the following headers: %s.""" % (argv0, ', '.join(IGNORED_HEADERS))


def readMailFile(filename):
    with open(filename) as fd:
        data = []
        skip_header = False
        for line in fd:
            if doesStartWithWS(line):
                if not skip_header:
                    data.append(line)
            elif isIgnoredHeadersLine(line):
                skip_header = True
            else:
                data.append(line)
            if line == '\n':
                break
        data.append(fd.read())
        return ''.join(data)


def dedupMessage(msg):
    filenames = msg.get_filenames()
    if len(filenames) <= 1:
        return (0, 0)

    realpaths = collections.defaultdict(list)
    contents = collections.defaultdict(list)
    for filename in filenames:
        real = os.path.realpath(filename)
        lst = realpaths[real]
        lst.append(filename)
        if len(lst) == 1:
            contents[readMailFile(real)].append(real)

    duplicates = 0

    for filenames in contents.itervalues():
        if len(filenames) > 1:
            print 'Files with the same content:'
            print '  ', filenames.pop()
            duplicates += len(filenames)
            for filename in filenames:
                del realpaths[filename]
                # os.remane(filename)

    for real, filenames in realpaths.iteritems():
        if len(filenames) > 1:
            print 'Files pointing to the same message:'
            print '  ', filenames.pop()
            duplicates += len(filenames)
            # for filename in filenames:
            #     os.remane(filename)

    return (duplicates, len(realpaths) - 1)


def dedupQuery(query):
    print 'Number of messages: %s' % query.count_messages()
    msg_count = 0
    suspected_count = 0
    duplicates_count = 0
    timestamp = time.time()
    msgs = query.search_messages()
    for msg in msgs:
        msg_count += 1
        d, s = dedupMessage(msg)
        duplicates_count += d
        suspected_count += d

        new_timestamp = time.time()
        if new_timestamp - timestamp > 1:
            timestamp = new_timestamp
            sys.stdout.write('\rProcessed %s messages, removed %s duplicates...'
                             % (msg_count, duplicates_count))
            sys.stdout.flush()

    print '\rFinished. Processed %s messages, removed %s duplicates.' % (
        msg_count, duplicates_count)
    if duplicates_count > 0:
        print 'You might want to run "notmuch new" now.'

    if suspected_duplicates_count > 0:
        print """
Found %d messages with duplicate IDs but different content.
Perhaps we should ignore more headers.""" % suspected_count


def main(argv):
    if len(argv) == 1:
        query = '*'
    elif len(argv) == 2:
        query = argv[1]
    else:
        usage(argv[0])
        return 1

    db = notmuch.Database()
    query = db.create_query(query)
    dedupQuery(db, query)
    return 0


if __name__ == '__main__':
    sys.exit(main(sys.argv))

-- 
Best regards,                                         _     _
.o. | Liege of Serenely Enlightened Majesty of      o' \,=./ `o
..o | Computer Science,  Michał “mina86” Nazarewicz    (o o)
ooo +----<email/xmpp: mpn@google.com>--------------ooO--(_)--Ooo--
* Re: [PATCH] Add notmuch-remove-duplicates.py script to contrib.
From: Dmitry Kurochkin @ 2012-09-04 20:12 UTC
To: Michal Nazarewicz, notmuch

Hi Michal.

Michal Nazarewicz <mina86@mina86.com> writes:

> On Tue, Sep 04 2012, Dmitry Kurochkin wrote:
>> The script removes duplicate message files. It takes no options.
>>
>> Files are assumed duplicates if their content is the same except for
>> ignored headers. Currently, the only ignored header is Received:.
>> ---
>>  contrib/notmuch-remove-duplicates.py | 95 ++++++++++++++++++++++++++++++++++
>>  1 file changed, 95 insertions(+)
>>  create mode 100755 contrib/notmuch-remove-duplicates.py
>>
>> diff --git a/contrib/notmuch-remove-duplicates.py b/contrib/notmuch-remove-duplicates.py
>> new file mode 100755
>> index 0000000..dbe2e25
>> --- /dev/null
>> +++ b/contrib/notmuch-remove-duplicates.py
>> @@ -0,0 +1,95 @@
>> +#!/usr/bin/env python
>> +
>> +import sys
>> +
>> +IGNORED_HEADERS = [ "Received:" ]
>> +
>> +if len(sys.argv) != 1:
>> +    print "Usage: %s" % sys.argv[0]
>> +    print
>> +    print "The script removes duplicate message files. Takes no options."
>> +    print "Requires notmuch python module."
>> +    print
>> +    print "Files are assumed duplicates if their content is the same"
>> +    print "except for the following headers: %s." % ", ".join(IGNORED_HEADERS)
>> +    exit(1)
>
> It's much better put inside a main() function, which is then called only
> if the script is run directly.
>

Good point. My python skill is pretty low :)

>> +
>> +import notmuch
>> +import os
>> +import time
>> +
>> +class MailComparator:
>> +    """Checks if mail files are duplicates."""
>> +    def __init__(self, filename):
>> +        self.filename = filename
>> +        self.mail = self.readFile(self.filename)
>> +
>> +    def isDuplicate(self, filename):
>> +        return self.mail == self.readFile(filename)
>> +
>> +    @staticmethod
>> +    def readFile(filename):
>> +        with open(filename) as f:
>> +            data = ""
>> +            while True:
>> +                line = f.readline()
>> +                for header in IGNORED_HEADERS:
>> +                    if line.startswith(header):
>
> Case of headers should be ignored, but this does not ignore it.
>

It does.

>> +                        # skip header continuation lines
>> +                        while True:
>> +                            line = f.readline()
>> +                            if len(line) == 0 or line[0] not in [" ", "\t"]:
>> +                                break
>> +                        break
>
> This will ignore the line just after the ignored header.
>

The first header line is ignored as well because line is added to data
in the else block.

>> +                else:
>> +                    data += line
>> +                if line == "\n":
>> +                    break
>> +            data += f.read()
>> +        return data
>> +
>> +db = notmuch.Database()
>> +query = db.create_query('*')
>> +print "Number of messages: %s" % query.count_messages()
>> +
>> +files_count = 0
>> +for root, dirs, files in os.walk(db.get_path()):
>> +    if not root.startswith(os.path.join(db.get_path(), ".notmuch/")):
>> +        files_count += len(files)
>> +print "Number of files: %s" % files_count
>> +print "Estimated number of duplicates: %s" % (files_count - query.count_messages())
>> +
>> +msgs = query.search_messages()
>> +msg_count = 0
>> +suspected_duplicates_count = 0
>> +duplicates_count = 0
>> +timestamp = time.time()
>> +for msg in msgs:
>> +    msg_count += 1
>> +    if len(msg.get_filenames()) > 1:
>> +        filenames = msg.get_filenames()
>> +        comparator = MailComparator(filenames.next())
>> +        for filename in filenames:
>
> Strictly speaking, you need to compare each file to each file, and not
> just every file to the first file.
>
>> +            if os.path.realpath(comparator.filename) == os.path.realpath(filename):
>> +                print "Message '%s' has filenames pointing to the
>> same file: '%s' '%s'" % (msg.get_message_id(), comparator.filename,
>> filename)
>
> So why aren't those removed?
>

Because it is the same file indexed twice (probably because of
symlinks). We do not want to remove the only message file.

>> +            elif comparator.isDuplicate(filename):
>> +                os.remove(filename)
>> +                duplicates_count += 1
>> +            else:
>> +                #print "Potential duplicates: %s" % msg.get_message_id()
>> +                suspected_duplicates_count += 1
>> +
>> +    new_timestamp = time.time()
>> +    if new_timestamp - timestamp > 1:
>> +        timestamp = new_timestamp
>> +        sys.stdout.write("\rProcessed %s messages, removed %s duplicates..." % (msg_count, duplicates_count))
>> +        sys.stdout.flush()
>> +
>> +print "\rFinished. Processed %s messages, removed %s duplicates." % (msg_count, duplicates_count)
>> +if duplicates_count > 0:
>> +    print "You might want to run 'notmuch new' now."
>> +
>> +if suspected_duplicates_count > 0:
>> +    print
>> +    print "Found %s messages with duplicate IDs but different content." % suspected_duplicates_count
>> +    print "Perhaps we should ignore more headers."
>
> Please consider the following instead (not tested):
>

Thanks for reviewing my poor python code :) I am afraid I do not have
enough interest in improving it. I just implemented a simple solution
for my problem. Though it looks like you already took time to rewrite
the script. Would be great if you send it as a proper patch obsoleting
this one.

Regards,
  Dmitry

>
> #!/usr/bin/env python
>
> import collections
> import notmuch
> import os
> import re
> import sys
> import time
>
>
> IGNORED_HEADERS = [ 'Received' ]
>
>
> isIgnoredHeadersLine = re.compile(
>     r'^(?:%s)\s*:' % '|'.join(IGNORED_HEADERS),
>     re.IGNORECASE).search
>
> doesStartWithWS = re.compile(r'^\s').search
>
>
> def usage(argv0):
>     print """Usage: %s [<query-string>]
>
> The script removes duplicate message files. Takes no options."
> Requires notmuch python module."
>
> Files are assumed duplicates if their content is the same"
> except for the following headers: %s.""" % (argv0, ', '.join(IGNORED_HEADERS))
>
>
> def readMailFile(filename):
>     with open(filename) as fd:
>         data = []
>         skip_header = False
>         for line in fd:
>             if doesStartWithWS(line):
>                 if not skip_header:
>                     data.append(line)
>             elif isIgnoredHeadersLine(line):
>                 skip_header = True
>             else:
>                 data.append(line)
>             if line == '\n':
>                 break
>         data.append(fd.read())
>         return ''.join(data)
>
>
> def dedupMessage(msg):
>     filenames = msg.get_filenames()
>     if len(filenames) <= 1:
>         return (0, 0)
>
>     realpaths = collections.defaultdict(list)
>     contents = collections.defaultdict(list)
>     for filename in filenames:
>         real = os.path.realpath(filename)
>         lst = realpaths[real]
>         lst.append(filename)
>         if len(lst) == 1:
>             contents[readMailFile(real)].append(real)
>
>     duplicates = 0
>
>     for filenames in contents.itervalues():
>         if len(filenames) > 1:
>             print 'Files with the same content:'
>             print '  ', filenames.pop()
>             duplicates += len(filenames)
>             for filename in filenames:
>                 del realpaths[filename]
>                 # os.remane(filename)
>
>     for real, filenames in realpaths.iteritems():
>         if len(filenames) > 1:
>             print 'Files pointing to the same message:'
>             print '  ', filenames.pop()
>             duplicates += len(filenames)
>             # for filename in filenames:
>             #     os.remane(filename)
>
>     return (duplicates, len(realpaths) - 1)
>
>
> def dedupQuery(query):
>     print 'Number of messages: %s' % query.count_messages()
>     msg_count = 0
>     suspected_count = 0
>     duplicates_count = 0
>     timestamp = time.time()
>     msgs = query.search_messages()
>     for msg in msgs:
>         msg_count += 1
>         d, s = dedupMessage(msg)
>         duplicates_count += d
>         suspected_count += d
>
>         new_timestamp = time.time()
>         if new_timestamp - timestamp > 1:
>             timestamp = new_timestamp
>             sys.stdout.write('\rProcessed %s messages, removed %s duplicates...'
>                              % (msg_count, duplicates_count))
>             sys.stdout.flush()
>
>     print '\rFinished. Processed %s messages, removed %s duplicates.' % (
>         msg_count, duplicates_count)
>     if duplicates_count > 0:
>         print 'You might want to run "notmuch new" now.'
>
>     if suspected_duplicates_count > 0:
>         print """
> Found %d messages with duplicate IDs but different content.
> Perhaps we should ignore more headers.""" % suspected_count
>
>
> def main(argv):
>     if len(argv) == 1:
>         query = '*'
>     elif len(argv) == 2:
>         query = argv[1]
>     else:
>         usage(argv[0])
>         return 1
>
>     db = notmuch.Database()
>     query = db.create_query(query)
>     dedupQuery(db, query)
>     return 0
>
>
> if __name__ == '__main__':
>     sys.exit(main(sys.argv))
>
> -- 
> Best regards,                                         _     _
> .o. | Liege of Serenely Enlightened Majesty of      o' \,=./ `o
> ..o | Computer Science,  Michał “mina86” Nazarewicz    (o o)
> ooo +----<email/xmpp: mpn@google.com>--------------ooO--(_)--Ooo--
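[As background for the for/else exchange above: in Python, a for loop's else clause runs only
when the loop finishes without hitting break, which is what the patch relies on to decide
whether a line is kept. A stripped-down sketch of that control flow; keep_line is a
hypothetical helper for illustration only, not part of either script:]

    IGNORED_HEADERS = ["Received:"]

    def keep_line(line):
        # Mirrors the patch's loop: break when an ignored header matches,
        # otherwise fall through to the else clause and keep the line.
        for header in IGNORED_HEADERS:
            if line.startswith(header):
                break        # matched an ignored header: drop the line
        else:
            return True      # loop finished without break: keep the line
        return False

    assert keep_line("Subject: hello\n")
    assert not keep_line("Received: from example.org\n")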
* Re: [PATCH] Add notmuch-remove-duplicates.py script to contrib.
From: Michal Nazarewicz @ 2012-09-04 20:26 UTC
To: Dmitry Kurochkin, notmuch

>> On Tue, Sep 04 2012, Dmitry Kurochkin wrote:
>>> +class MailComparator:
>>> +    """Checks if mail files are duplicates."""
>>> +    def __init__(self, filename):
>>> +        self.filename = filename
>>> +        self.mail = self.readFile(self.filename)
>>> +
>>> +    def isDuplicate(self, filename):
>>> +        return self.mail == self.readFile(filename)
>>> +
>>> +    @staticmethod
>>> +    def readFile(filename):
>>> +        with open(filename) as f:
>>> +            data = ""
>>> +            while True:
>>> +                line = f.readline()
>>> +                for header in IGNORED_HEADERS:
>>> +                    if line.startswith(header):

> Michal Nazarewicz <mina86@mina86.com> writes:
>> Case of headers should be ignored, but this does not ignore it.

On Tue, Sep 04 2012, Dmitry Kurochkin wrote:
> It does.

Wait, how? If line is “received:” how does it start with “Received:”?

>>> +            if os.path.realpath(comparator.filename) == os.path.realpath(filename):
>>> +                print "Message '%s' has filenames pointing to the
>>> same file: '%s' '%s'" % (msg.get_message_id(), comparator.filename,
>>> filename)
>>
>> So why aren't those removed?
>>
>
> Because it is the same file indexed twice (probably because of
> symlinks). We do not want to remove the only message file.

Ah, right, with symlinks this is troublesome, but then again, we can
check if there is at least one non-symlink. If there is, delete
everything else; if there is not, delete all but one arbitrarily chosen
symlink.

>>> +            elif comparator.isDuplicate(filename):
>>> +                os.remove(filename)
>>> +                duplicates_count += 1
>>> +            else:
>>> +                #print "Potential duplicates: %s" % msg.get_message_id()
>>> +                suspected_duplicates_count += 1
>>> +
>>> +    new_timestamp = time.time()
>>> +    if new_timestamp - timestamp > 1:
>>> +        timestamp = new_timestamp
>>> +        sys.stdout.write("\rProcessed %s messages, removed %s duplicates..." % (msg_count, duplicates_count))
>>> +        sys.stdout.flush()
>>> +
>>> +print "\rFinished. Processed %s messages, removed %s duplicates." % (msg_count, duplicates_count)
>>> +if duplicates_count > 0:
>>> +    print "You might want to run 'notmuch new' now."
>>> +
>>> +if suspected_duplicates_count > 0:
>>> +    print
>>> +    print "Found %s messages with duplicate IDs but different content." % suspected_duplicates_count
>>> +    print "Perhaps we should ignore more headers."
>>
>> Please consider the following instead (not tested):
>
> Thanks for reviewing my poor python code :) I am afraid I do not have
> enough interest in improving it. I just implemented a simple solution
> for my problem. Though it looks like you already took time to rewrite
> the script. Would be great if you send it as a proper patch obsoleting
> this one.

Bah, I probably won't have time to properly test it.

-- 
Best regards,                                         _     _
.o. | Liege of Serenely Enlightened Majesty of      o' \,=./ `o
..o | Computer Science,  Michał “mina86” Nazarewicz    (o o)
ooo +----<email/xmpp: mpn@google.com>--------------ooO--(_)--Ooo--
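[A minimal sketch of the deletion policy Michal describes above, assuming a hypothetical
keep_one() helper that is handed a list of paths already known to resolve to the same
message file. Illustrative only, not part of either patch; a real run would likely want a
dry-run mode before anything is removed:]

    import os

    def keep_one(paths):
        # Prefer keeping a regular file; if every path is a symlink,
        # keep one arbitrarily chosen symlink and remove the rest.
        keep = next((p for p in paths if not os.path.islink(p)), paths[0])
        for path in paths:
            if path != keep:
                os.remove(path)
        return keep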
* Re: [PATCH] Add notmuch-remove-duplicates.py script to contrib.
From: Dmitry Kurochkin @ 2012-09-04 20:33 UTC
To: Michal Nazarewicz, notmuch

Michal Nazarewicz <mina86@mina86.com> writes:

>>> On Tue, Sep 04 2012, Dmitry Kurochkin wrote:
>>>> +class MailComparator:
>>>> +    """Checks if mail files are duplicates."""
>>>> +    def __init__(self, filename):
>>>> +        self.filename = filename
>>>> +        self.mail = self.readFile(self.filename)
>>>> +
>>>> +    def isDuplicate(self, filename):
>>>> +        return self.mail == self.readFile(filename)
>>>> +
>>>> +    @staticmethod
>>>> +    def readFile(filename):
>>>> +        with open(filename) as f:
>>>> +            data = ""
>>>> +            while True:
>>>> +                line = f.readline()
>>>> +                for header in IGNORED_HEADERS:
>>>> +                    if line.startswith(header):
>
>> Michal Nazarewicz <mina86@mina86.com> writes:
>>> Case of headers should be ignored, but this does not ignore it.
>
> On Tue, Sep 04 2012, Dmitry Kurochkin wrote:
>> It does.
>
> Wait, how? If line is “received:” how does it start with “Received:”?
>

Sorry, I misunderstood your comment. Indeed, it does not ignore the
case.

>>>> +            if os.path.realpath(comparator.filename) == os.path.realpath(filename):
>>>> +                print "Message '%s' has filenames pointing to the
>>>> same file: '%s' '%s'" % (msg.get_message_id(), comparator.filename,
>>>> filename)
>>>
>>> So why aren't those removed?
>>>
>>
>> Because it is the same file indexed twice (probably because of
>> symlinks). We do not want to remove the only message file.
>
> Ah, right, with symlinks this is troublesome, but then again, we can
> check if there is at least one non-symlink. If there is, delete
> everything else; if there is not, delete all but one arbitrarily chosen
> symlink.
>

Sure, we could do that.

>>>> +            elif comparator.isDuplicate(filename):
>>>> +                os.remove(filename)
>>>> +                duplicates_count += 1
>>>> +            else:
>>>> +                #print "Potential duplicates: %s" % msg.get_message_id()
>>>> +                suspected_duplicates_count += 1
>>>> +
>>>> +    new_timestamp = time.time()
>>>> +    if new_timestamp - timestamp > 1:
>>>> +        timestamp = new_timestamp
>>>> +        sys.stdout.write("\rProcessed %s messages, removed %s duplicates..." % (msg_count, duplicates_count))
>>>> +        sys.stdout.flush()
>>>> +
>>>> +print "\rFinished. Processed %s messages, removed %s duplicates." % (msg_count, duplicates_count)
>>>> +if duplicates_count > 0:
>>>> +    print "You might want to run 'notmuch new' now."
>>>> +
>>>> +if suspected_duplicates_count > 0:
>>>> +    print
>>>> +    print "Found %s messages with duplicate IDs but different content." % suspected_duplicates_count
>>>> +    print "Perhaps we should ignore more headers."
>>>
>>> Please consider the following instead (not tested):
>
>> Thanks for reviewing my poor python code :) I am afraid I do not have
>> enough interest in improving it. I just implemented a simple solution
>> for my problem. Though it looks like you already took time to rewrite
>> the script. Would be great if you send it as a proper patch obsoleting
>> this one.
>
> Bah, I probably won't have time to properly test it.
>

Same problem :)

Regards,
  Dmitry

> -- 
> Best regards,                                         _     _
> .o. | Liege of Serenely Enlightened Majesty of      o' \,=./ `o
> ..o | Computer Science,  Michał “mina86” Nazarewicz    (o o)
> ooo +----<email/xmpp: mpn@google.com>--------------ooO--(_)--Ooo--
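[For the record, the case-insensitivity point comes down to matching header names without
regard to case, e.g. with a precompiled regex along the lines of Michal's proposal. A minimal
sketch, not taken from either patch:]

    import re

    IGNORED_HEADERS = ["Received"]
    is_ignored_header = re.compile(
        r'^(?:%s)\s*:' % '|'.join(IGNORED_HEADERS), re.IGNORECASE).match

    # Matches "Received:", "received:" and "RECEIVED :" alike.
    assert is_ignored_header("received: from example.org")
    assert not is_ignored_header("Subject: hello")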
Thread overview: 5+ messages
2012-09-04 18:53 [PATCH] Add notmuch-remove-duplicates.py script to contrib Dmitry Kurochkin
2012-09-04 19:43 ` Michal Nazarewicz
2012-09-04 20:12   ` Dmitry Kurochkin
2012-09-04 20:26     ` Michal Nazarewicz
2012-09-04 20:33       ` Dmitry Kurochkin