From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.2 (2018-09-13) on dcvr.yhbt.net X-Spam-Level: X-Spam-Status: No, score=-3.8 required=3.0 tests=ALL_TRUSTED,AWL,BAYES_00 shortcircuit=no autolearn=ham autolearn_force=no version=3.4.2 Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id 3143C1FA13 for ; Wed, 9 Dec 2020 09:25:13 +0000 (UTC) From: Eric Wong To: meta@public-inbox.org Subject: [PATCH 3/3] extsearchidx: enforce -index before -extindex Date: Wed, 9 Dec 2020 09:25:12 +0000 Message-Id: <20201209092512.25282-4-e@80x24.org> In-Reply-To: <20201209092512.25282-1-e@80x24.org> References: <20201209092512.25282-1-e@80x24.org> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit List-Id: We cannot set xref3 data without the `xnum' column to tie it to the per-inbox over.sqlite3 DB. So ensure we don't read brand-new history that only exists in git, but instead rely on last_commit and last_xap15-$EPOCH metadata in msgmap to decide how far we can index. Before this change, it was possible to miss messages in the extindex if -index did not run (which will be fixable by upcoming --reindex support in -extindex). --- lib/PublicInbox/ExtSearchIdx.pm | 7 ++++- lib/PublicInbox/V2Writable.pm | 20 +++++++++--- t/extsearch.t | 54 +++++++++++++++++++++++++++++++++ 3 files changed, 76 insertions(+), 5 deletions(-) diff --git a/lib/PublicInbox/ExtSearchIdx.pm b/lib/PublicInbox/ExtSearchIdx.pm index b0a12bca..84449cb4 100644 --- a/lib/PublicInbox/ExtSearchIdx.pm +++ b/lib/PublicInbox/ExtSearchIdx.pm @@ -291,7 +291,12 @@ sub _sync_inbox ($$$) { } elsif ($v == 1) { my $uv = $ibx->uidvalidity; my $lc = $self->{oidx}->eidx_meta("lc-v1:$ekey//$uv"); - my $stk = prepare_stack($sync, $lc ? "$lc..HEAD" : 'HEAD'); + my $head = $ibx->mm->last_commit; + unless (defined $head) { + warn "E: $ibx->{inboxdir} is not indexed\n"; + return; + } + my $stk = prepare_stack($sync, $lc ? "$lc..$head" : $head); my $unit = { stack => $stk, git => $ibx->git }; push @{$sync->{todo}}, $unit; } else { diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm index 07a7fa42..bef3a67a 100644 --- a/lib/PublicInbox/V2Writable.pm +++ b/lib/PublicInbox/V2Writable.pm @@ -1073,10 +1073,22 @@ sub sync_prepare ($$) { $pfx //= $sync->{ibx}->{inboxdir}; } - # reindex stops at the current heads and we later rerun index_sync - # without {reindex} - my $reindex_heads = $self->last_commits($sync) if $sync->{reindex}; - + my $reindex_heads; + if ($self->{ibx_map}) { + # ExtSearchIdx won't index messages unless they're in + # over.sqlite3 for a given inbox, so don't read beyond + # what's in the per-inbox index. + $reindex_heads = []; + my $v = PublicInbox::Search::SCHEMA_VERSION; + my $mm = $sync->{ibx}->mm; + for my $i (0..$sync->{epoch_max}) { + $reindex_heads->[$i] = $mm->last_commit_xap($v, $i); + } + } elsif ($sync->{reindex}) { # V2 inbox + # reindex stops at the current heads and we later + # rerun index_sync without {reindex} + $reindex_heads = $self->last_commits($sync); + } if ($sync->{max_size} = $sync->{-opt}->{max_size}) { $sync->{index_oid} = $self->can('index_oid'); } diff --git a/t/extsearch.t b/t/extsearch.t index 96512227..70a60b5a 100644 --- a/t/extsearch.t +++ b/t/extsearch.t @@ -176,6 +176,60 @@ is(scalar(@it), 2, 'two inboxes'); like($it[0]->get_document->get_data, qr/v2test/, 'docdata matched v2'); like($it[1]->get_document->get_data, qr/v1test/, 'docdata matched v1'); +if ('inject w/o indexing') { + use PublicInbox::Import; + use PublicInbox::Search; + my $schema_version = PublicInbox::Search::SCHEMA_VERSION(); + my $v1ibx = PublicInbox::Config->new->lookup_name('v1test'); + my $last_v1_commit = $v1ibx->mm->last_commit; + my $v2ibx = PublicInbox::Config->new->lookup_name('v2test'); + my $last_v2_commit = $v2ibx->mm->last_commit_xap($schema_version, 0); + my $git0 = PublicInbox::Git->new("$v2ibx->{inboxdir}/git/0.git"); + chomp(my $cmt = $git0->qx(qw(rev-parse HEAD^0))); + is($last_v2_commit, $cmt, 'v2 index up-to-date'); + + my $v2im = PublicInbox::Import->new($git0, undef, undef, $v2ibx); + $v2im->{lock_path} = undef; + $v2im->{path_type} = 'v2'; + $v2im->add(eml_load('t/mda-mime.eml')); + $v2im->done; + chomp(my $tip = $git0->qx(qw(rev-parse HEAD^0))); + isnt($tip, $cmt, '0.git v2 updated'); + + # inject a message w/o updating index + rename("$home/v1test/public-inbox", "$home/v1test/skip-index") or + BAIL_OUT $!; + open(my $eh, '<', 't/iso-2202-jp.eml') or BAIL_OUT $!; + run_script(['-mda', '--no-precheck'], $env, { 0 => $eh}) or + BAIL_OUT '-mda'; + rename("$home/v1test/skip-index", "$home/v1test/public-inbox") or + BAIL_OUT $!; + + my ($in, $out, $err); + $in = $out = $err = ''; + my $opt = { 0 => \$in, 1 => \$out, 2 => \$err }; + ok(run_script([qw(-extindex -v -v --all), "$home/extindex"], + undef, undef), 'extindex noop'); + $es->{xdb}->reopen; + my $mset = $es->mset('mid:199707281508.AAA24167@hoyogw.example'); + is($mset->size, 0, 'did not attempt to index unindexed v1 message'); + $mset = $es->mset('mid:multipart-html-sucks@11'); + is($mset->size, 0, 'did not attempt to index unindexed v2 message'); + ok(run_script([qw(-index --all)]), 'indexed v1 and v2 inboxes'); + + isnt($v1ibx->mm->last_commit, $last_v1_commit, '-index v1 worked'); + isnt($v2ibx->mm->last_commit_xap($schema_version, 0), + $last_v2_commit, '-index v2 worked'); + ok(run_script([qw(-extindex --all), "$home/extindex"]), + 'extindex updates'); + + $es->{xdb}->reopen; + $mset = $es->mset('mid:199707281508.AAA24167@hoyogw.example'); + is($mset->size, 1, 'got v1 message'); + $mset = $es->mset('mid:multipart-html-sucks@11'); + is($mset->size, 1, 'got v2 message'); +} + if ('remove v1test and test gc') { xsys([qw(git config --unset publicinbox.v1test.inboxdir)], { GIT_CONFIG => $cfg_path });