From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.2 (2018-09-13) on dcvr.yhbt.net X-Spam-Level: X-Spam-ASN: X-Spam-Status: No, score=-4.0 required=3.0 tests=ALL_TRUSTED,BAYES_00 shortcircuit=no autolearn=ham autolearn_force=no version=3.4.2 Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id A897F1FA18 for ; Mon, 11 Oct 2021 08:06:20 +0000 (UTC) From: Eric Wong To: meta@public-inbox.org Subject: [PATCH 6/6] extindex: avoid invalid blobs after unref Date: Mon, 11 Oct 2021 08:06:20 +0000 Message-Id: <20211011080620.27478-7-e@80x24.org> In-Reply-To: <20211011080620.27478-1-e@80x24.org> References: <20211011080620.27478-1-e@80x24.org> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit List-Id: When unref-ing a blob from xref3, make sure the "preferred" smsg->{blob} doesn't point to the blob we just unrefed. This is necessary because we periodically checkpoint our extindex process to allow -watch and -mda processes to run. This also gets rid of a lot of redundant code for ->remove_xref3, since it's all handled in ExtSearchIdx, now. --- lib/PublicInbox/ExtSearchIdx.pm | 70 ++++++++++++++++++--------------- lib/PublicInbox/OverIdx.pm | 44 --------------------- t/extsearch.t | 2 +- t/over.t | 5 ++- 4 files changed, 43 insertions(+), 78 deletions(-) diff --git a/lib/PublicInbox/ExtSearchIdx.pm b/lib/PublicInbox/ExtSearchIdx.pm index ce9cea25da5e..c2ab0447e176 100644 --- a/lib/PublicInbox/ExtSearchIdx.pm +++ b/lib/PublicInbox/ExtSearchIdx.pm @@ -138,21 +138,42 @@ sub remove_doc ($$) { sub _unref_doc ($$$$$;$) { my ($sync, $docid, $ibx, $xnum, $oidbin, $eml) = @_; - my $s = 'DELETE FROM xref3 WHERE ibx_id = ? AND oidbin = ?'; + my $smsg; + if (ref($docid)) { + $smsg = $docid; + $docid = $smsg->{num}; + } + my $s = 'DELETE FROM xref3 WHERE oidbin = ?'; + $s .= ' AND ibx_id = ?' if defined($ibx); $s .= ' AND xnum = ?' if defined($xnum); my $del = $sync->{self}->{oidx}->dbh->prepare_cached($s); - $del->bind_param(1, $ibx->{-ibx_id}); - $del->bind_param(2, $oidbin, SQL_BLOB); - $del->bind_param(3, $xnum) if defined($xnum); + my $col = 0; + $del->bind_param(++$col, $oidbin, SQL_BLOB); + $del->bind_param(++$col, $ibx->{-ibx_id}) if $ibx; + $del->bind_param(++$col, $xnum) if defined($xnum); $del->execute; - my $xr3 = $sync->{self}->{oidx}->get_xref3($docid, 1); + my $xr3 = $sync->{self}->{oidx}->get_xref3($docid); if (scalar(@$xr3) == 0) { # all gone remove_doc($sync->{self}, $docid); } else { # enqueue for reindex of remaining messages - my $ekey = $ibx->{-gc_eidx_key} // $ibx->eidx_key; - my $idx = $sync->{self}->idx_shard($docid); - $idx->ipc_do('remove_eidx_info', $docid, $ekey, $eml); - $sync->{self}->{oidx}->eidxq_add($docid); # yes, add + if ($ibx) { + my $ekey = $ibx->{-gc_eidx_key} // $ibx->eidx_key; + my $idx = $sync->{self}->idx_shard($docid); + $idx->ipc_do('remove_eidx_info', $docid, $ekey, $eml); + } # else: we can't remove_eidx_info in reindex-only path + + # replace invalidated blob ASAP with something which should be + # readable since we may commit the transaction on checkpoint. + # eidxq processing will re-apply boost + $smsg //= $sync->{self}->{oidx}->get_art($docid); + my $hex = unpack('H*', $oidbin); + if ($smsg && $smsg->{blob} eq $hex) { + $xr3->[0] =~ /:([a-f0-9]{40,}+)\z/ or + die "BUG: xref $xr3->[0] has no OID"; + $sync->{self}->{oidx}->update_blob($smsg, $1); + } + # yes, add, we'll need to re-apply boost + $sync->{self}->{oidx}->eidxq_add($docid); } @$xr3 } @@ -235,24 +256,12 @@ sub do_step ($) { # main iterator for adding messages to the index do_finalize($req); } -sub _blob_missing ($$) { # called when $smsg->{blob} is bad +sub _blob_missing ($$) { # called when a known $smsg->{blob} is gone my ($req, $smsg) = @_; - my $self = $req->{self}; - my $xref3 = $self->{oidx}->get_xref3($smsg->{num}); - my @keep = grep(!/:$smsg->{blob}\z/, @$xref3); - if (@keep) { - warn "E: $smsg->{blob} gone, removing #$smsg->{num}\n"; - $keep[0] =~ /:([a-f0-9]{40,}+)\z/ or - die "BUG: xref $keep[0] has no OID"; - my $oidhex = $1; - $self->{oidx}->remove_xref3($smsg->{num}, $smsg->{blob}); - $self->{oidx}->update_blob($smsg, $oidhex) or warn <{num} gone ($smsg->{blob} => $oidhex) -EOM - } else { - warn "E: $smsg->{blob} gone, removing #$smsg->{num}\n"; - remove_doc($self, $smsg->{num}); - } + # xnum and ibx are unknown, we only call this when an entry from + # /ei*/over.sqlite3 is bad, not on entries from xap*/over.sqlite3 + my $oidbin = pack('H*', $smsg->{blob}); + _unref_doc($req, $smsg, undef, undef, $oidbin); } sub ck_existing { # git->cat_async callback @@ -548,7 +557,8 @@ sub _reindex_finalize ($$$) { for my $ary (values %$by_chash) { for my $x (reverse @$ary) { warn "removing #$docid xref3 $x->{blob}\n"; - my $n = $self->{oidx}->remove_xref3($docid, $x->{blob}); + my $bin = pack('H*', $x->{blob}); + my $n = _unref_doc($sync, $docid, undef, undef, $bin); die "BUG: $x->{blob} invalidated #$docid" if $n == 0; } my $x = pop(@$ary) // die "BUG: #$docid {by_chash} empty"; @@ -579,16 +589,14 @@ sub _reindex_oid { # git->cat_async callback my $expect_oid = $req->{xr3r}->[$req->{ix}]->[2]; my $docid = $orig_smsg->{num}; if (is_bad_blob($oid, $type, $size, $expect_oid)) { - my $remain = $self->{oidx}->remove_xref3($docid, $expect_oid); + my $oidbin = pack('H*', $expect_oid); + my $remain = _unref_doc($sync, $docid, undef, undef, $oidbin); if ($remain == 0) { warn "W: #$docid gone or corrupted\n"; - remove_doc($self, $docid); } elsif (my $next_oid = $req->{xr3r}->[++$req->{ix}]->[2]) { - # n.b. we can't remove_eidx_info here $self->git->cat_async($next_oid, \&_reindex_oid, $req); } else { warn "BUG: #$docid gone (UNEXPECTED)\n"; - remove_doc($self, $docid); } return; } diff --git a/lib/PublicInbox/OverIdx.pm b/lib/PublicInbox/OverIdx.pm index 46f7a066275a..d6d706f7fed0 100644 --- a/lib/PublicInbox/OverIdx.pm +++ b/lib/PublicInbox/OverIdx.pm @@ -606,50 +606,6 @@ INSERT OR IGNORE INTO xref3 (docid, ibx_id, xnum, oidbin) VALUES (?, ?, ?, ?) $sth->execute; } -# returns remaining reference count to $docid -sub remove_xref3 { - my ($self, $docid, $oidhex, $eidx_key, $rm_eidx_info) = @_; - begin_lazy($self); - my $oidbin = pack('H*', $oidhex); - my ($sth, $ibx_id); - if (defined $eidx_key) { - $ibx_id = ibx_id($self, $eidx_key); - $sth = $self->{dbh}->prepare_cached(<<''); -DELETE FROM xref3 WHERE docid = ? AND ibx_id = ? AND oidbin = ? - - $sth->bind_param(1, $docid); - $sth->bind_param(2, $ibx_id); - $sth->bind_param(3, $oidbin, SQL_BLOB); - } else { - $sth = $self->{dbh}->prepare_cached(<<''); -DELETE FROM xref3 WHERE docid = ? AND oidbin = ? - - $sth->bind_param(1, $docid); - $sth->bind_param(2, $oidbin, SQL_BLOB); - } - $sth->execute; - $sth = $self->{dbh}->prepare_cached(<<'', undef, 1); -SELECT COUNT(*) FROM xref3 WHERE docid = ? - - $sth->execute($docid); - my $nr = $sth->fetchrow_array; - if ($nr == 0) { - delete_by_num($self, $docid); - } elsif (defined($ibx_id) && $rm_eidx_info) { - # if deduplication rules in ContentHash change, it's - # possible a docid can have multiple rows with the - # same ibx_id. This governs whether or not we call - # ->remove_eidx_info in ExtSearchIdx. - $sth = $self->{dbh}->prepare_cached(<<'', undef, 1); -SELECT COUNT(*) FROM xref3 WHERE docid = ? AND ibx_id = ? - - $sth->execute($docid, $ibx_id); - my $count = $sth->fetchrow_array; - $$rm_eidx_info = ($count == 0); - } - $nr; -} - # for when an xref3 goes missing, this does NOT update {ts} sub update_blob { my ($self, $smsg, $oidhex) = @_; diff --git a/t/extsearch.t b/t/extsearch.t index 896e270414bd..1b6235ba0eca 100644 --- a/t/extsearch.t +++ b/t/extsearch.t @@ -375,7 +375,7 @@ if ('reindex catches content bifurcation') { is($oidx->max, $oldmax, 'oidx->max unchanged'); $oidx->dbh_close; ok(run_script([qw(-extindex --reindex --all), "$home/extindex"], - undef, $opt), 'extindex --reindex'); + undef, $opt), 'extindex --reindex') or diag explain($opt); $oidx->dbh; ok($oidx->max > $oldmax, 'oidx->max bumped'); like($err, qr/split into 2 due to deduplication change/, diff --git a/t/over.t b/t/over.t index a92d2f77b039..1f2df7cf065a 100644 --- a/t/over.t +++ b/t/over.t @@ -90,8 +90,9 @@ $over->eidx_prep; 'example.kee:2018:deadbeefcafe' ], 'xref3 works forw two'); - @arg = qw(1349 adeadba7cafe example.key); - is($over->remove_xref3(@arg), 1, 'remove first'); + is($over->dbh->do(<<''), 1, 'remove first'); +DELETE FROM xref3 WHERE xnum = 2019 AND docid = 1349 + $xref3 = $over->get_xref3(1349); is_deeply($xref3, [ 'example.kee:2018:deadbeefcafe' ], 'confirm removal successful');