From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.2 (2018-09-13) on dcvr.yhbt.net X-Spam-Level: X-Spam-ASN: X-Spam-Status: No, score=-4.0 required=3.0 tests=ALL_TRUSTED,BAYES_00 shortcircuit=no autolearn=ham autolearn_force=no version=3.4.2 Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id 1323A1F9FC for ; Mon, 18 Oct 2021 05:09:06 +0000 (UTC) From: Eric Wong To: meta@public-inbox.org Subject: [PATCH 2/2] v2: mirrors don't clobber msgs w/ reused Message-IDs Date: Mon, 18 Oct 2021 05:09:05 +0000 Message-Id: <20211018050905.21275-3-e@80x24.org> In-Reply-To: <20211018050905.21275-1-e@80x24.org> References: <20211018050905.21275-1-e@80x24.org> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit List-Id: For odd messages with reused Message-IDs, the second message showing up in a mirror (via git-fetch + -index) should never clobber an entry with a different blob in over. This is noticeable only if the messages arrive in-between indexing runs. Fixes: 4441a38481ed ("v2: index forwards (via `git log --reverse')") --- MANIFEST | 1 + lib/PublicInbox/V2Writable.pm | 7 ++++++- t/v2index-late-dupe.t | 37 +++++++++++++++++++++++++++++++++++ 3 files changed, 44 insertions(+), 1 deletion(-) create mode 100644 t/v2index-late-dupe.t diff --git a/MANIFEST b/MANIFEST index b5aae77747dd..af1522d71bd1 100644 --- a/MANIFEST +++ b/MANIFEST @@ -552,6 +552,7 @@ t/v1-add-remove-add.t t/v1reindex.t t/v2-add-remove-add.t t/v2dupindex.t +t/v2index-late-dupe.t t/v2mda.t t/v2mirror.t t/v2reindex.t diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm index 3914383cc9d3..ed5182ae8460 100644 --- a/lib/PublicInbox/V2Writable.pm +++ b/lib/PublicInbox/V2Writable.pm @@ -813,8 +813,8 @@ sub index_oid { # cat_async callback } } } + my $oidx = $self->{oidx}; if (!defined($num)) { # reuse if reindexing (or duplicates) - my $oidx = $self->{oidx}; for my $mid (@$mids) { ($num, $mid0) = $oidx->num_mid0_for_oid($oid, $mid); last if defined $num; @@ -822,6 +822,11 @@ sub index_oid { # cat_async callback } $mid0 //= do { # is this a number we got before? $num = $arg->{mm_tmp}->num_for($mids->[0]); + + # don't clobber existing if Message-ID is reused: + if (my $x = defined($num) ? $oidx->get_art($num) : undef) { + undef($num) if $x->{blob} ne $oid; + } defined($num) ? $mids->[0] : undef; }; if (!defined($num)) { diff --git a/t/v2index-late-dupe.t b/t/v2index-late-dupe.t new file mode 100644 index 000000000000..c83e3409044f --- /dev/null +++ b/t/v2index-late-dupe.t @@ -0,0 +1,37 @@ +# Copyright (C) all contributors +# License: AGPL-3.0+ +# +# this simulates a mirror path: git fetch && -index +use strict; use v5.10.1; use PublicInbox::TestCommon; +use Test::More; # redundant, used for bisect +require_mods 'v2'; +require PublicInbox::Import; +require PublicInbox::Inbox; +require PublicInbox::Git; +my ($tmpdir, $for_destroy) = tmpdir(); +my $inboxdir = "$tmpdir/i"; +PublicInbox::Import::init_bare(my $e0 = "$inboxdir/git/0.git"); +open my $fh, '>', "$inboxdir/inbox.lock" or xbail $!; +my $git = PublicInbox::Git->new($e0); +my $im = PublicInbox::Import->new($git, qw(i i@example.com)); +$im->{lock_path} = undef; +$im->{path_type} = 'v2'; +my $eml = eml_load('t/plack-qp.eml'); +ok($im->add($eml), 'add original'); +$im->done; +run_script([qw(-index -Lbasic), $inboxdir]); +is($?, 0, 'basic index'); +my $ibx = PublicInbox::Inbox->new({ inboxdir => $inboxdir }); +my $orig = $ibx->over->get_art(1); + +my @mid = $eml->header_raw('Message-ID'); +$eml->header_set('Message-ID', @mid, ''); +ok($im->add($eml), 'add another'); +$im->done; +run_script([qw(-index -Lbasic), $inboxdir]); +is($?, 0, 'basic index again'); + +my $after = $ibx->over->get_art(1); +is_deeply($after, $orig, 'original unchanged') or note explain([$orig,$after]); + +done_testing;