unofficial mirror of notmuch@notmuchmail.org

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
 #!/usr/bin/env python

import sys

# Header prefixes whose lines are left out when two message files are
# compared; each entry is matched against the start of a header line.
IGNORED_HEADERS = [ "Received:" ]

# The script takes no arguments: anything extra on the command line gets the
# usage text and a non-zero exit status.
if len(sys.argv) != 1:
    # Single-argument print(...) calls produce the same output on Python 2
    # and Python 3.  Blank lines use print("") because a bare print() would
    # print "()" on Python 2.
    print("Usage: %s" % sys.argv[0])
    print("")
    print("The script removes duplicate message files.  Takes no options.")
    print("Requires notmuch python module.")
    print("")
    print("Files are assumed duplicates if their content is the same")
    print("except for the following headers: %s." % ", ".join(IGNORED_HEADERS))
    # sys.exit() instead of the exit() helper, which is injected by the site
    # module and is not guaranteed to exist (e.g. under "python -S").
    sys.exit(1)

import notmuch
import os
import time

class MailComparator:
    """Checks if mail files are duplicates.

    Two files count as duplicates when their contents are byte-for-byte
    identical once the ignored headers (together with their folded
    continuation lines) have been removed from the header section.
    """

    def __init__(self, filename, ignored_headers=None):
        """Read the reference file that other files are compared against.

        filename -- path of the reference message file.
        ignored_headers -- optional iterable of header prefixes (such as
            "Received:") to leave out of the comparison.  When None, the
            module-level IGNORED_HEADERS list is used.
        """
        self.filename = filename
        self.ignored_headers = ignored_headers
        self.mail = self.readFile(self.filename, self.ignored_headers)

    def isDuplicate(self, filename):
        """Return True if `filename` matches the reference file.

        The file is normalised with the same ignored headers as the
        reference file before the two are compared.
        """
        return self.mail == self.readFile(filename, self.ignored_headers)

    @staticmethod
    def readFile(filename, ignored_headers=None):
        """Return the content of `filename` without the ignored headers.

        Only the header section (everything up to and including the first
        empty line) is filtered; the body is returned untouched.  A header
        is dropped when its line starts with one of the ignored prefixes,
        and so are the continuation lines (starting with a space or a tab)
        that directly follow it.

        The file is read in binary mode and the result is a byte string, so
        message files in any encoding can be compared without decoding.

        filename -- path of the message file to read.
        ignored_headers -- optional iterable of header prefixes to drop.
            When None, the module-level IGNORED_HEADERS list is used.
        """
        if ignored_headers is None:
            ignored_headers = IGNORED_HEADERS
        # The file is read as bytes, so text prefixes are converted once.
        prefixes = tuple(
            header if isinstance(header, bytes) else header.encode("ascii")
            for header in ignored_headers
        )

        kept = []
        with open(filename, "rb") as f:
            skipping = False
            # readline() is used rather than iterating over the file object
            # so that the f.read() below picks up exactly where the header
            # loop stopped.  The b"" sentinel ends the loop at end of file,
            # which also covers empty files and files without a body.
            for line in iter(f.readline, b""):
                if skipping and line[:1] in (b" ", b"\t"):
                    # Continuation of the ignored header above.
                    continue
                # The line that ends a run of continuation lines is examined
                # like any other line, so it is never lost.
                skipping = line.startswith(prefixes)
                if skipping:
                    continue
                kept.append(line)
                if line == b"\n":
                    # Empty line: end of the header section.
                    break
            # Whatever is left is the message body; keep it as is.
            kept.append(f.read())
        return b"".join(kept)

# Open the notmuch database and select every message in it.
db = notmuch.Database()
query = db.create_query('*')
message_total = query.count_messages()
print("Number of messages: %s" % message_total)

# Count the files below the mail directory.  The ".notmuch" directory at the
# top of the mail directory is removed from `dirs` so that os.walk() neither
# descends into it nor counts the files it contains.
mail_root = db.get_path()
files_count = 0
for root, dirs, files in os.walk(mail_root):
    if root == mail_root and ".notmuch" in dirs:
        dirs.remove(".notmuch")
    files_count += len(files)
print("Number of files: %s" % files_count)
print("Estimated number of duplicates: %s" % (files_count - message_total))

msgs = query.search_messages()
msg_count = 0
suspected_duplicates_count = 0
duplicates_count = 0
timestamp = time.time()
for msg in msgs:
    msg_count += 1
    # get_filenames() is called once for the length check and once more for
    # the iteration.  NOTE(review): presumably len() consumes the returned
    # iterator -- confirm against the notmuch bindings documentation.
    if len(msg.get_filenames()) > 1:
        filenames = msg.get_filenames()
        # The first file is the reference copy and is always kept; builtin
        # next() works on both Python 2 and Python 3 iterators.
        comparator = MailComparator(next(filenames))
        for filename in filenames:
            if os.path.realpath(comparator.filename) == os.path.realpath(filename):
                # Both names resolve to one file on disk: removing it would
                # delete the reference copy as well, so only report it.
                print("Message '%s' has filenames pointing to the same file: '%s' '%s'" % (msg.get_message_id(), comparator.filename, filename))
            elif comparator.isDuplicate(filename):
                os.remove(filename)
                duplicates_count += 1
            else:
                # Same message ID but different content: keep the file and
                # only count it for the summary at the end.
                suspected_duplicates_count += 1

    # Refresh the progress line at most once per second.
    new_timestamp = time.time()
    if new_timestamp - timestamp > 1:
        timestamp = new_timestamp
        sys.stdout.write("\rProcessed %s messages, removed %s duplicates..." % (msg_count, duplicates_count))
        sys.stdout.flush()

print("\rFinished. Processed %s messages, removed %s duplicates." % (msg_count, duplicates_count))
if duplicates_count > 0:
    print("You might want to run 'notmuch new' now.")

if suspected_duplicates_count > 0:
    print("")
    print("Found %s messages with duplicate IDs but different content." % suspected_duplicates_count)
    print("Perhaps we should ignore more headers.")