From: "Eric Wong (Contractor, The Linux Foundation)" <e@80x24.org>
To: meta@public-inbox.org
Cc: "Eric Wong (Contractor, The Linux Foundation)" <e@80x24.org>
Subject: [PATCH 06/12] v2: generate better Message-IDs for duplicates
Date: Wed, 18 Apr 2018 09:13:10 +0000 [thread overview]
Message-ID: <20180418091316.29114-7-e@80x24.org> (raw)
In-Reply-To: <20180418091316.29114-1-e@80x24.org>
While hunting duplicates, I noticed a leading '-' in some
Message-IDs as a result of RFC4648 encoding. While a leading '-'
seems to be allowed by RFC5322 and is URL-friendly (RFC4648), such
Message-IDs are uncommon and are more difficult to pass as
arguments to command-line tools. So prefix them with a datestamp,
which also gives readers at least some sense of the age. And
shorten the "localhost" hostname to "z" to save space.
---
MANIFEST | 1 +
lib/PublicInbox/Import.pm | 18 ++++++------
lib/PublicInbox/V2Writable.pm | 6 ++--
scripts/dupe-finder | 54 +++++++++++++++++++++++++++++++++++
t/v2writable.t | 5 ++--
5 files changed, 71 insertions(+), 13 deletions(-)
create mode 100644 scripts/dupe-finder
diff --git a/MANIFEST b/MANIFEST
index 58b3634..00a0970 100644
--- a/MANIFEST
+++ b/MANIFEST
@@ -124,6 +124,7 @@ script/public-inbox-watch
script/public-inbox.cgi
scripts/dc-dlvr
scripts/dc-dlvr.pre
+scripts/dupe-finder
scripts/edit-sa-prefs
scripts/import_maildir
scripts/import_slrnspool
diff --git a/lib/PublicInbox/Import.pm b/lib/PublicInbox/Import.pm
index 9e8900f..c7a96e1 100644
--- a/lib/PublicInbox/Import.pm
+++ b/lib/PublicInbox/Import.pm
@@ -14,6 +14,7 @@ use PublicInbox::Address;
use PublicInbox::MsgTime qw(msg_timestamp msg_datestamp);
use PublicInbox::ContentId qw(content_digest);
use PublicInbox::MDA;
+use POSIX qw(strftime);
sub new {
my ($class, $git, $name, $email, $ibx) = @_;
@@ -330,7 +331,7 @@ sub v1_mid0 ($) {
my $mids = mids($hdr);
if (!scalar(@$mids)) { # spam often has no Message-Id
- my $mid0 = digest2mid(content_digest($mime));
+ my $mid0 = digest2mid(content_digest($mime), $hdr);
append_mid($hdr, $mid0);
return $mid0;
}
@@ -445,18 +446,19 @@ sub atfork_child {
}
}
-sub digest2mid ($) {
- my ($dig) = @_;
+sub digest2mid ($$) {
+ my ($dig, $hdr) = @_;
my $b64 = $dig->clone->b64digest;
# Make our own URLs nicer:
# See "Base 64 Encoding with URL and Filename Safe Alphabet" in RFC4648
$b64 =~ tr!+/=!-_!d;
- # We can make this more meaningful with a date prefix or other things,
- # but this is only needed for crap that fails to generate a Message-ID
- # or reuses one. In other words, it's usually spammers who hit this
- # so they don't deserve nice Message-IDs :P
- $b64 . '@localhost';
+ # Add a date prefix to prevent a leading '-' in case that trips
+ # up some tools (e.g. if a Message-ID were expected as a
+ # command-line arg)
+ my $dt = msg_datestamp($hdr);
+ $dt = POSIX::strftime('%Y%m%d%H%M%S', gmtime($dt));
+ "$dt.$b64" . '@z';
}
sub clean_purge_buffer {
diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm
index 66f8a8a..0dcdeda 100644
--- a/lib/PublicInbox/V2Writable.pm
+++ b/lib/PublicInbox/V2Writable.pm
@@ -174,19 +174,19 @@ sub num_for_harder {
my $hdr = $mime->header_obj;
my $dig = content_digest($mime);
- $$mid0 = PublicInbox::Import::digest2mid($dig);
+ $$mid0 = PublicInbox::Import::digest2mid($dig, $hdr);
my $num = $self->{mm}->mid_insert($$mid0);
unless (defined $num) {
# it's hard to spoof the last Received: header
my @recvd = $hdr->header_raw('Received');
$dig->add("Received: $_") foreach (@recvd);
- $$mid0 = PublicInbox::Import::digest2mid($dig);
+ $$mid0 = PublicInbox::Import::digest2mid($dig, $hdr);
$num = $self->{mm}->mid_insert($$mid0);
# fall back to a random Message-ID and give up determinism:
until (defined($num)) {
$dig->add(rand);
- $$mid0 = PublicInbox::Import::digest2mid($dig);
+ $$mid0 = PublicInbox::Import::digest2mid($dig, $hdr);
warn "using random Message-ID <$$mid0> as fallback\n";
$num = $self->{mm}->mid_insert($$mid0);
}
diff --git a/scripts/dupe-finder b/scripts/dupe-finder
new file mode 100644
index 0000000..1402237
--- /dev/null
+++ b/scripts/dupe-finder
@@ -0,0 +1,54 @@
+#!/usr/bin/perl -w
+# Copyright (C) 2018 all contributors <meta@public-inbox.org>
+# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
+#
+# ad-hoc tool for finding duplicates, unstable!
+use strict;
+use warnings;
+use PublicInbox::Inbox;
+use PublicInbox::Over;
+use PublicInbox::Search;
+use PublicInbox::Config;
+my $repo = shift;
+my $ibx;
+if (index($repo, '@') > 0) {
+ $ibx = PublicInbox::Config->new->lookup($repo);
+} elsif (-d $repo) {
+ $ibx = { mainrepo => $repo, address => 'unnamed@example.com' };
+ $ibx = PublicInbox::Inbox->new($ibx);
+} else {
+ $ibx = PublicInbox::Config->new->lookup_name($repo);
+}
+$ibx or die "No inbox";
+$ibx->search or die "search not available for inbox";
+my $dbh = $ibx->search->{over_ro}->connect;
+my $over = PublicInbox::Over->new($dbh->sqlite_db_filename);
+
+sub emit ($) {
+ my ($nums) = @_;
+ foreach my $n (@$nums) {
+ my $smsg = $over->get_art($n) or next;
+ print STDERR "$n $smsg->{blob} $smsg->{mid}\n";
+ my $msg = $ibx->msg_by_smsg($smsg) or next;
+ print "From $smsg->{blob}\@$n Thu Jan 1 00:00:00 1970\n";
+ $$msg =~ s/^(>*From )/>$1/gm;
+ print $$msg, "\n";
+ }
+}
+
+my $sth = $dbh->prepare(<<'');
+SELECT id,num FROM id2num WHERE num > 0 ORDER BY id
+
+$sth->execute;
+my $prev_id = -1;
+my ($id, $num, @nums);
+while (1) {
+ ($id, $num) = $sth->fetchrow_array;
+ defined $id or last;
+ if ($prev_id != $id) {
+ emit(\@nums) if scalar(@nums) > 1;
+ @nums = ();
+ }
+ $prev_id = $id;
+ push @nums, $num;
+}
diff --git a/t/v2writable.t b/t/v2writable.t
index 85fb6a6..d37fb06 100644
--- a/t/v2writable.t
+++ b/t/v2writable.t
@@ -68,7 +68,7 @@ if ('ensure git configs are correct') {
[ $sec->header_obj->header_raw('Message-Id') ],
'no new Message-Id added');
- my $sane_mid = qr/\A<[\w\-]+\@localhost>\z/;
+ my $sane_mid = qr/\A<[\w\-\.]+\@\w+>\z/;
@warn = ();
$mime->header_set('Message-Id', '<a-mid@b>');
$mime->body_set('different');
@@ -82,7 +82,8 @@ if ('ensure git configs are correct') {
@warn = ();
$mime->header_set('Message-Id', '<a-mid@b>');
$mime->body_set('this one needs a random mid');
- my $gen = PublicInbox::Import::digest2mid(content_digest($mime));
+ my $hdr = $mime->header_obj;
+ my $gen = PublicInbox::Import::digest2mid(content_digest($mime), $hdr);
unlike($gen, qr![\+/=]!, 'no URL-unfriendly chars in Message-Id');
my $fake = PublicInbox::MIME->new($mime->as_string);
$fake->header_set('Message-Id', "<$gen>");
--
EW
next prev parent reply other threads:[~2018-04-18 9:13 UTC|newest]
Thread overview: 13+ messages / expand[flat|nested] mbox.gz Atom feed top
2018-04-18 9:13 [PATCH 00/12] better dedupe, contiguous article numbers Eric Wong (Contractor, The Linux Foundation)
2018-04-18 9:13 ` [PATCH 01/12] feed: respect feedmax, again Eric Wong (Contractor, The Linux Foundation)
2018-04-18 9:13 ` [PATCH 02/12] v1: remove articles from overview DB Eric Wong (Contractor, The Linux Foundation)
2018-04-18 9:13 ` [PATCH 03/12] compact: do not merge v2 repos by default Eric Wong (Contractor, The Linux Foundation)
2018-04-18 9:13 ` [PATCH 04/12] v2writable: reduce partititions by one Eric Wong (Contractor, The Linux Foundation)
2018-04-18 9:13 ` [PATCH 05/12] search: preserve References in Xapian smsg for x=t view Eric Wong (Contractor, The Linux Foundation)
2018-04-18 9:13 ` Eric Wong (Contractor, The Linux Foundation) [this message]
2018-04-18 9:13 ` [PATCH 07/12] v2: improve deduplication checks Eric Wong (Contractor, The Linux Foundation)
2018-04-18 9:13 ` [PATCH 08/12] import: cat_blob drops leading 'From ' lines like Inbox Eric Wong (Contractor, The Linux Foundation)
2018-04-18 9:13 ` [PATCH 09/12] searchidx: regenerate and avoid article number gaps on full index Eric Wong (Contractor, The Linux Foundation)
2018-04-18 9:13 ` [PATCH 10/12] extmsg: remove expensive git path checks Eric Wong (Contractor, The Linux Foundation)
2018-04-18 9:13 ` [PATCH 11/12] use %H consistently to disable abbreviations Eric Wong (Contractor, The Linux Foundation)
2018-04-18 9:13 ` [PATCH 12/12] searchidx: increase term positions for all text terms Eric Wong (Contractor, The Linux Foundation)
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
List information: https://public-inbox.org/README
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20180418091316.29114-7-e@80x24.org \
--to=e@80x24.org \
--cc=meta@public-inbox.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).