From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.2 (2018-09-13) on dcvr.yhbt.net X-Spam-Level: X-Spam-ASN: X-Spam-Status: No, score=-4.0 required=3.0 tests=ALL_TRUSTED,BAYES_00 shortcircuit=no autolearn=ham autolearn_force=no version=3.4.2 Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id 317111F8C8 for ; Wed, 6 Oct 2021 09:44:50 +0000 (UTC) From: Eric Wong To: meta@public-inbox.org Subject: [PATCH] extindex: --gc checkpoints Date: Wed, 6 Oct 2021 09:44:50 +0000 Message-Id: <20211006094450.29451-1-e@80x24.org> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit List-Id: We need to ensure -extindex --gc runs don't prevent other work from happening in the meantime. I actually caused my -extindex to OOM due to the lack of checkpoints :x We'll also hoist out the shard scanning into its own sub in preparation for lei/store usage. --- lib/PublicInbox/ExtSearchIdx.pm | 91 +++++++++++++++++++++++---------- lib/PublicInbox/SearchIdx.pm | 32 ++++-------- 2 files changed, 73 insertions(+), 50 deletions(-) diff --git a/lib/PublicInbox/ExtSearchIdx.pm b/lib/PublicInbox/ExtSearchIdx.pm index 3a1856c84709..7cc8dd952559 100644 --- a/lib/PublicInbox/ExtSearchIdx.pm +++ b/lib/PublicInbox/ExtSearchIdx.pm @@ -369,19 +369,16 @@ SELECT oidbin FROM xref3 WHERE docid = ? AND ibx_id = ? } } -sub eidx_gc { - my ($self, $opt) = @_; - $self->{cfg} or die "E: GC requires ->attach_config\n"; - $opt->{-idx_gc} = 1; - $self->idx_init($opt); # acquire lock via V2Writable::_idx_init - - my $dbh = $self->{oidx}->dbh; - $dbh->do('PRAGMA case_sensitive_like = ON'); # only place we use LIKE - my $x3_doc = $dbh->prepare('SELECT docid FROM xref3 WHERE ibx_id = ?'); - my $ibx_ck = $dbh->prepare('SELECT ibx_id,eidx_key FROM inboxes'); - my $lc_i = $dbh->prepare(<<''); -SELECT key FROM eidx_meta WHERE key LIKE ? ESCAPE ? - +sub eidx_gc_scan_inboxes ($$) { + my ($self, $sync) = @_; + my ($x3_doc, $ibx_ck); +restart: + $x3_doc = $self->{oidx}->dbh->prepare(<{oidx}->dbh->prepare(<execute; while (my ($ibx_id, $eidx_key) = $ibx_ck->fetchrow_array) { next if $self->{ibx_map}->{$eidx_key}; @@ -390,44 +387,84 @@ SELECT key FROM eidx_meta WHERE key LIKE ? ESCAPE ? $x3_doc->execute($ibx_id); while (defined(my $docid = $x3_doc->fetchrow_array)) { gc_unref_doc($self, $ibx_id, $eidx_key, $docid); + if (checkpoint_due($sync)) { + $x3_doc = $ibx_ck = undef; + reindex_checkpoint($self, $sync); + goto restart; + } } - $dbh->prepare_cached(<<'')->execute($ibx_id); + $self->{oidx}->dbh->do(<<'', undef, $ibx_id); DELETE FROM inboxes WHERE ibx_id = ? # drop last_commit info my $pat = $eidx_key; $pat =~ s/([_%\\])/\\$1/g; + $self->{oidx}->dbh->do('PRAGMA case_sensitive_like = ON'); + my $lc_i = $self->{oidx}->dbh->prepare(<<''); +SELECT key FROM eidx_meta WHERE key LIKE ? ESCAPE ? + $lc_i->execute("lc-%:$pat//%", '\\'); while (my ($key) = $lc_i->fetchrow_array) { next if $key !~ m!\Alc-v[1-9]+:\Q$eidx_key\E//!; warn "I: removing $key\n"; - $dbh->prepare_cached(<<'')->execute($key); + $self->{oidx}->dbh->do(<<'', undef, $key); DELETE FROM eidx_meta WHERE key = ? } - warn "I: $eidx_key removed\n"; } +} - # it's not real unless it's in `over', we use parallelism here, - # shards will be reading directly from over, so commit - $self->{oidx}->commit_lazy; - $self->{oidx}->begin_lazy; - - for my $idx (@{$self->{idx_shards}}) { - warn "I: cleaning up shard #$idx->{shard}\n"; - $idx->shard_over_check($self->{oidx}); - } - my $nr = $dbh->do(<<''); +sub eidx_gc_scan_shards ($$) { # TODO: use for lei/store + my ($self, $sync) = @_; + my $nr = $self->{oidx}->dbh->do(<<''); DELETE FROM xref3 WHERE docid NOT IN (SELECT num FROM over) warn "I: eliminated $nr stale xref3 entries\n" if $nr != 0; # fixup from old bugs: - $nr = $dbh->do(<<''); + $nr = $self->{oidx}->dbh->do(<<''); DELETE FROM over WHERE num NOT IN (SELECT docid FROM xref3) warn "I: eliminated $nr stale over entries\n" if $nr != 0; + + my ($cur) = $self->{oidx}->dbh->selectrow_array(<{oidx}->dbh->selectrow_array(<{oidx}->dbh->prepare(<execute($cur); + next if $exists->fetchrow_array != 0; + $self->idx_shard($cur)->ipc_do('xdb_remove_quiet', $cur); + if (checkpoint_due($sync)) { + $exists = undef; + reindex_checkpoint($self, $sync); + goto restart; + } + } +} + +sub eidx_gc { + my ($self, $opt) = @_; + $self->{cfg} or die "E: GC requires ->attach_config\n"; + $opt->{-idx_gc} = 1; + my $sync = { + need_checkpoint => \(my $need_checkpoint = 0), + check_intvl => 10, + next_check => now() + 10, + checkpoint_unlocks => 1, + -opt => $opt, + }; + $self->idx_init($opt); # acquire lock via V2Writable::_idx_init + eidx_gc_scan_inboxes($self, $sync); + eidx_gc_scan_shards($self, $sync); done($self); } diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm index e5c872d52fd3..78db329d9da9 100644 --- a/lib/PublicInbox/SearchIdx.pm +++ b/lib/PublicInbox/SearchIdx.pm @@ -637,14 +637,21 @@ sub update_vmd { sub xdb_remove { my ($self, @docids) = @_; - $self->begin_txn_lazy; - my $xdb = $self->{xdb} or return; + begin_txn_lazy($self); + my $xdb = $self->{xdb} // die 'BUG: missing {xdb}'; for my $docid (@docids) { eval { $xdb->delete_document($docid) }; warn "E: #$docid not in in Xapian? $@\n" if $@; } } +sub xdb_remove_quiet { + my ($self, $docid) = @_; + begin_txn_lazy($self); + my $xdb = $self->{xdb} // die 'BUG: missing {xdb}'; + eval { $xdb->delete_document($docid) }; +} + sub index_git_blob_id { my ($doc, $pfx, $objid) = @_; @@ -1098,25 +1105,4 @@ sub eidx_shard_new { $self; } -# ensure there's no stale Xapian docs by treating $over as canonical -sub over_check { - my ($self, $over) = @_; - begin_txn_lazy($self); - my $sth = $over->dbh->prepare(<<''); -SELECT COUNT(*) FROM over WHERE num = ? - - my $xdb = $self->{xdb}; - my $cur = $xdb->postlist_begin(''); - my $end = $xdb->postlist_end(''); - my $xdir = $self->xdir; - for (; $cur != $end; $cur++) { - my $docid = $cur->get_docid; - $sth->execute($docid); - my $x = $sth->fetchrow_array; - next if $x > 0; - warn "I: removing $xdir #$docid, not in `over'\n"; - $xdb->delete_document($docid); - } -} - 1;