1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
#!/usr/bin/env python
import sys
# Header line prefixes that are left out when two mail files are compared;
# files that differ only in these headers are treated as duplicates.
IGNORED_HEADERS = ["Received:"]

# The script takes no arguments: anything beyond the program name gets the
# usage text and a non-zero exit status. This check runs before the notmuch
# import further down (presumably so the help is shown even when the module
# is not installed -- confirm).
if len(sys.argv) != 1:
    usage_lines = (
        "Usage: %s" % sys.argv[0],
        "",
        "The script removes duplicate message files. Takes no options.",
        "Requires notmuch python module.",
        "",
        "Files are assumed duplicates if their content is the same",
        "except for the following headers: %s." % ", ".join(IGNORED_HEADERS),
    )
    print("\n".join(usage_lines))
    # sys.exit() instead of the bare exit() helper: the latter is injected by
    # the site module for interactive sessions and is not guaranteed to exist
    # (e.g. under "python -S").
    sys.exit(1)
import notmuch
import os
import time
class MailComparator:
    """Checks if mail files are duplicates.

    An instance remembers the normalised content of one reference file and
    compares other files against it. "Normalised" means the raw file text
    with every header listed in the module-level IGNORED_HEADERS (and its
    continuation lines) removed from the header section.
    """

    def __init__(self, filename):
        """Read and remember the normalised content of `filename`.

        Raises whatever open()/read raise if the file cannot be read.
        """
        self.filename = filename
        self.mail = self.readFile(self.filename)

    def isDuplicate(self, filename):
        """Return True if `filename` has the same normalised content as the
        reference file given to the constructor."""
        return self.mail == self.readFile(filename)

    @staticmethod
    def readFile(filename):
        """Return the content of `filename` without the ignored headers.

        The header section runs up to and including the first empty line
        ("\\n"). Inside it, every line starting with one of IGNORED_HEADERS
        is dropped together with its continuation lines (lines starting with
        a space or a tab). All other header lines, the empty separator line
        and the whole body are returned unchanged. A file without an empty
        line is treated as consisting of headers only.
        """
        with open(filename) as f:
            ignored_prefixes = tuple(IGNORED_HEADERS)
            kept = []
            skipping = False  # True while inside an ignored header
            while True:
                # readline() keeps the file position exact for the f.read()
                # that picks up the body below.
                line = f.readline()
                if not line:
                    # EOF before any empty line: stop instead of spinning on
                    # the "" that readline() keeps returning.
                    break
                if skipping and line[0] in (" ", "\t"):
                    continue  # continuation of an ignored header
                # The line that ends a continuation run is a regular header
                # line (or the separator) and must be examined, not lost.
                skipping = line.startswith(ignored_prefixes)
                if skipping:
                    continue
                kept.append(line)
                if line == "\n":
                    break  # end of the header section
            kept.append(f.read())
            return "".join(kept)
def _count_mail_files(mail_root):
    """Return the number of files below `mail_root`, not counting anything
    inside its top-level ".notmuch" directory.

    The directory is pruned from the walk, so files stored directly in
    ".notmuch" are excluded as well as those in its subdirectories.
    """
    total = 0
    at_top = True
    for _root, dirs, files in os.walk(mail_root):
        if at_top:
            at_top = False
            # Removing the entry in place stops os.walk from descending.
            if ".notmuch" in dirs:
                dirs.remove(".notmuch")
        total += len(files)
    return total


# --- Statistics before touching anything ----------------------------------
db = notmuch.Database()
query = db.create_query('*')
message_total = query.count_messages()
print("Number of messages: %s" % message_total)

files_count = _count_mail_files(db.get_path())
print("Number of files: %s" % files_count)
print("Estimated number of duplicates: %s" % (files_count - message_total))

# --- Walk all messages and delete files identical to the first one --------
msgs = query.search_messages()
msg_count = 0
suspected_duplicates_count = 0  # files with the same ID but different content
duplicates_count = 0            # files actually removed
timestamp = time.time()         # time of the last progress line
for msg in msgs:
    msg_count += 1
    # NOTE(review): get_filenames() is called again below instead of reusing
    # this result, presumably because len() exhausts the notmuch iterator --
    # confirm against the bindings before merging the two calls.
    if len(msg.get_filenames()) > 1:
        filenames = msg.get_filenames()
        # The first file is the reference copy; it is never removed.
        comparator = MailComparator(next(filenames))
        for filename in filenames:
            if os.path.realpath(comparator.filename) == os.path.realpath(filename):
                # Same file reached through two paths: deleting it would
                # delete the reference copy too.
                print("Message '%s' has filenames pointing to the same file: '%s' '%s'" % (msg.get_message_id(), comparator.filename, filename))
            elif comparator.isDuplicate(filename):
                os.remove(filename)
                duplicates_count += 1
            else:
                suspected_duplicates_count += 1

    # Refresh the single-line progress display at most once per second.
    new_timestamp = time.time()
    if new_timestamp - timestamp > 1:
        timestamp = new_timestamp
        sys.stdout.write("\rProcessed %s messages, removed %s duplicates..." % (msg_count, duplicates_count))
        sys.stdout.flush()

# --- Summary ---------------------------------------------------------------
print("\rFinished. Processed %s messages, removed %s duplicates." % (msg_count, duplicates_count))
if duplicates_count > 0:
    print("You might want to run 'notmuch new' now.")
if suspected_duplicates_count > 0:
    print("")
    print("Found %s messages with duplicate IDs but different content." % suspected_duplicates_count)
    print("Perhaps we should ignore more headers.")
|