unofficial mirror of notmuch@notmuchmail.org
 help / color / mirror / code / Atom feed
blob dbe2e252bde83c99a421786c8b4ca8f2c9f4f09e 3301 bytes (raw)
name: contrib/notmuch-remove-duplicates.py 	 # note: path name is non-authoritative(*)

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
 
#!/usr/bin/env python

import sys

IGNORED_HEADERS = [ "Received:" ]

# The script accepts no command line arguments: whenever anything besides the
# program name is present, print the usage text and exit with status 1.
# Each print() call takes a single argument so the output is identical under
# Python 2 (print statement) and Python 3 (print function).
if len(sys.argv) != 1:
    print("Usage: %s" % sys.argv[0])
    print("")
    print("The script removes duplicate message files.  Takes no options.")
    print("Requires notmuch python module.")
    print("")
    print("Files are assumed duplicates if their content is the same")
    print("except for the following headers: %s." % ", ".join(IGNORED_HEADERS))
    # Use sys.exit() instead of the exit() helper: the helper is installed by
    # the site module and is missing when the interpreter runs with -S.
    sys.exit(1)

import notmuch
import os
import time

class MailComparator:
    """Checks if mail files are duplicates.

    Two files are considered duplicates when their contents are equal once
    the ignored header fields (see readFile) have been stripped from the
    header section.
    """

    def __init__(self, filename):
        """Remember *filename* and cache its stripped content in self.mail."""
        self.filename = filename
        self.mail = self.readFile(self.filename)

    def isDuplicate(self, filename):
        """Return True if *filename* has the same stripped content."""
        return self.mail == self.readFile(filename)

    @staticmethod
    def readFile(filename, ignored_headers=None):
        """Return the content of *filename* without the ignored headers.

        Lines are examined up to and including the first empty line ("\\n").
        Every line starting with one of the *ignored_headers* prefixes is
        dropped together with the continuation lines that follow it (lines
        starting with a space or a tab).  Everything after the first empty
        line is returned unchanged.

        ignored_headers -- iterable of line prefixes to drop; defaults to
                           the module-level IGNORED_HEADERS list.

        Errors raised by open() or by reading the file are not caught.
        """
        if ignored_headers is None:
            ignored_headers = IGNORED_HEADERS
        # str.startswith() accepts a tuple, so all prefixes are tested at once.
        prefixes = tuple(ignored_headers)

        parts = []
        with open(filename) as f:
            line = f.readline()
            # readline() returns "" at end of file; stopping there keeps a
            # file without an empty separator line from looping forever.
            while line:
                if line.startswith(prefixes):
                    # Skip the header and its continuation lines.
                    line = f.readline()
                    while line and line[0] in (" ", "\t"):
                        line = f.readline()
                    # 'line' now holds the first line after the skipped
                    # header; re-examine it instead of discarding it.
                    continue
                parts.append(line)
                if line == "\n":
                    break
                line = f.readline()
            # The rest of the file is kept verbatim.
            parts.append(f.read())
        return "".join(parts)

# Open the notmuch database with the binding's default arguments and build a
# query matching every message.
db = notmuch.Database()
query = db.create_query('*')
message_total = query.count_messages()
print("Number of messages: %s" % message_total)

# Count the files below the database path, leaving out the ".notmuch"
# directory found directly under it.
db_path = db.get_path()
files_count = 0
for root, dirs, files in os.walk(db_path):
    if root == db_path and ".notmuch" in dirs:
        # Removing the entry in place stops os.walk() from descending into
        # it, so neither its files nor those of its subdirectories count.
        dirs.remove(".notmuch")
    files_count += len(files)
print("Number of files: %s" % files_count)
print("Estimated number of duplicates: %s" % (files_count - message_total))

msgs = query.search_messages()
msg_count = 0
suspected_duplicates_count = 0
duplicates_count = 0
timestamp = time.time()
for msg in msgs:
    msg_count += 1
    if len(msg.get_filenames()) > 1:
        # NOTE(review): the filenames are fetched a second time, presumably
        # because len() consumes the first iterator -- confirm against the
        # notmuch bindings.
        filenames = msg.get_filenames()
        # The first file is the reference; every other file of this message
        # is compared against it.  next() works on Python 2 and Python 3.
        comparator = MailComparator(next(filenames))
        for filename in filenames:
            if os.path.realpath(comparator.filename) == os.path.realpath(filename):
                # Both names resolve to the same file: never remove it.
                print("Message '%s' has filenames pointing to the same file: '%s' '%s'" % (msg.get_message_id(), comparator.filename, filename))
            elif comparator.isDuplicate(filename):
                os.remove(filename)
                duplicates_count += 1
            else:
                # Same message ID but different content: keep the file and
                # only count it for the summary printed at the end.
                suspected_duplicates_count += 1

    # Refresh the progress line at most once per second.
    new_timestamp = time.time()
    if new_timestamp - timestamp > 1:
        timestamp = new_timestamp
        sys.stdout.write("\rProcessed %s messages, removed %s duplicates..." % (msg_count, duplicates_count))
        sys.stdout.flush()

# The leading "\r" makes the final line overwrite the progress line.
print("\rFinished. Processed %s messages, removed %s duplicates." % (msg_count, duplicates_count))
if duplicates_count > 0:
    print("You might want to run 'notmuch new' now.")

if suspected_duplicates_count > 0:
    print("")
    print("Found %s messages with duplicate IDs but different content." % suspected_duplicates_count)
    print("Perhaps we should ignore more headers.")

debug log:

solving dbe2e25 ...
found dbe2e25 in https://yhetil.org/notmuch/1346784785-19746-1-git-send-email-dmitry.kurochkin@gmail.com/

applying [1/1] https://yhetil.org/notmuch/1346784785-19746-1-git-send-email-dmitry.kurochkin@gmail.com/
diff --git a/contrib/notmuch-remove-duplicates.py b/contrib/notmuch-remove-duplicates.py
new file mode 100755
index 0000000..dbe2e25

Checking patch contrib/notmuch-remove-duplicates.py...
Applied patch contrib/notmuch-remove-duplicates.py cleanly.

index at:
100755 dbe2e252bde83c99a421786c8b4ca8f2c9f4f09e	contrib/notmuch-remove-duplicates.py

(*) Git path names are given by the tree(s) the blob belongs to.
    Blobs themselves have no identifier aside from the hash of their contents.^

Code repositories for project(s) associated with this public inbox

	https://yhetil.org/notmuch.git/

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).