From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.2 (2018-09-13) on dcvr.yhbt.net X-Spam-Level: X-Spam-Status: No, score=-4.0 required=3.0 tests=ALL_TRUSTED,BAYES_00, URIBL_BLOCKED shortcircuit=no autolearn=ham autolearn_force=no version=3.4.2 Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id DE5A51F66E for ; Tue, 1 Sep 2020 05:55:45 +0000 (UTC) From: Eric Wong To: meta@public-inbox.org Subject: [PATCH] t/v2dupindex: test indexing mirrors with duplicate messages Date: Tue, 1 Sep 2020 05:55:45 +0000 Message-Id: <20200901055545.30440-1-e@80x24.org> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit List-Id: While it's not a known problem, our deduplicating logic may change in the future, or a BOFH could be manually injecting duplicate messages directly into the git epoch repositories. Ensure indexing in mirrors doesn't break when there are duplicates. This is in preparation for detached indices for multi-inbox search. --- MANIFEST | 1 + t/v2dupindex.t | 61 ++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 62 insertions(+) create mode 100644 t/v2dupindex.t diff --git a/MANIFEST b/MANIFEST index b65e96b0..44670c7e 100644 --- a/MANIFEST +++ b/MANIFEST @@ -357,6 +357,7 @@ t/utf8.eml t/v1-add-remove-add.t t/v1reindex.t t/v2-add-remove-add.t +t/v2dupindex.t t/v2mda.t t/v2mirror.t t/v2reindex.t diff --git a/t/v2dupindex.t b/t/v2dupindex.t new file mode 100644 index 00000000..b1abccd9 --- /dev/null +++ b/t/v2dupindex.t @@ -0,0 +1,61 @@ +#!perl -w +# Copyright (C) 2020 all contributors +# License: AGPL-3.0+ + +# we can index a message from a mirror which bypasses dedupe.
+use strict; +use Test::More; +use PublicInbox::TestCommon; +require_git(2.6); +require_mods(qw(DBD::SQLite)); +my ($tmpdir, $for_destroy) = tmpdir(); +use_ok 'PublicInbox::Import'; +use_ok 'PublicInbox::Git'; +use_ok 'PublicInbox::InboxWritable'; +my $ibx = PublicInbox::InboxWritable->new({ + inboxdir => $tmpdir, + name => 'test-v2dupindex', + version => 2, + indexlevel => 'basic', + -primary_address => 'test@example.com', +}, { nproc => 1 }); +$ibx->init_inbox(1); +my $v2w = $ibx->importer; +$v2w->add(eml_load('t/plack-qp.eml')); +$v2w->add(eml_load('t/mda-mime.eml')); +$v2w->done; + +my $git0 = PublicInbox::Git->new("$tmpdir/git/0.git"); +my $im = PublicInbox::Import->new($git0, undef, undef, $ibx); +$im->{path_type} = 'v2'; +$im->{lock_path} = undef; + +# bypass duplicate filters (->header_set is optional) +my $eml = eml_load('t/plack-qp.eml'); +$eml->header_set('X-This-Is-Not-Checked-By-ContentHash', 'blah'); +ok($im->add($eml), 'add seen message directly'); +ok($im->add(eml_load('t/mda-mime.eml')), 'add another seen message directly'); + +ok($im->add(eml_load('t/iso-2202-jp.eml')), 'add another new message'); +$im->done; + +# mimic a fresh clone by dropping indices +my @sqlite = (glob("$tmpdir/*sqlite3*"), glob("$tmpdir/xap*/*sqlite3*")); +is(unlink(@sqlite), scalar(@sqlite), 'unlinked SQLite indices'); +my @shards = glob("$tmpdir/xap*/?"); +is(scalar(@shards), 0, 'no Xapian shards to drop'); + +my $rdr = { 2 => \(my $err = '') }; +ok(run_script([qw(-index -Lbasic), $tmpdir], undef, $rdr), '-indexed'); +my @n = $ibx->over->dbh->selectrow_array('SELECT COUNT(*) FROM over'); +is_deeply(\@n, [ 3 ], 'identical message not re-indexed'); +my $mm = $ibx->mm->{dbh}->selectall_arrayref(<<''); +SELECT num,mid FROM msgmap ORDER BY num ASC + +is_deeply($mm, [ + [ 1, 'qp@example.com' ], + [ 2, 'multipart-html-sucks@11' ], + [ 3, '199707281508.AAA24167@hoyogw.example' ] +], 'msgmap omits redundant message'); + +done_testing;