unofficial mirror of meta@public-inbox.org
 help / color / mirror / Atom feed
* [PATCH 0/2] extindex: administrative stuffs
@ 2020-11-29  5:09 Eric Wong
  2020-11-29  5:09 ` [PATCH 1/2] v2writable: detect shard count for ExtSearchIdx properly Eric Wong
  2020-11-29  5:09 ` [PATCH 2/2] extindex: support `--gc' to remove dead inboxes Eric Wong
  0 siblings, 2 replies; 3+ messages in thread
From: Eric Wong @ 2020-11-29  5:09 UTC (permalink / raw)
  To: meta

Welcome to episode #968 of "Deletes Are Hard".

Since inboxes may be removed from the config file and made
inaccessible unbeknownst to public-inbox indexing code; we need
to support a way to remove stale indexed data associated with
them.  PATCH 2/2 makes it happen with garbage collection.

Lightly-tested, but it seems to work reasonably well.

PATCH 1/2 is a bug fix :x

Eric Wong (2):
  v2writable: detect shard count for ExtSearchIdx properly
  extindex: support `--gc' to remove dead inboxes

 lib/PublicInbox/ExtSearchIdx.pm   | 90 ++++++++++++++++++++++++++++++-
 lib/PublicInbox/MiscIdx.pm        | 16 ++++++
 lib/PublicInbox/SearchIdx.pm      | 23 +++++++-
 lib/PublicInbox/SearchIdxShard.pm | 21 ++++++--
 lib/PublicInbox/V2Writable.pm     | 16 ++++--
 script/public-inbox-extindex      | 19 +++++--
 t/extsearch.t                     | 19 ++++++-
 7 files changed, 190 insertions(+), 14 deletions(-)

^ permalink raw reply	[flat|nested] 3+ messages in thread

* [PATCH 1/2] v2writable: detect shard count for ExtSearchIdx properly
  2020-11-29  5:09 [PATCH 0/2] extindex: administrative stuffs Eric Wong
@ 2020-11-29  5:09 ` Eric Wong
  2020-11-29  5:09 ` [PATCH 2/2] extindex: support `--gc' to remove dead inboxes Eric Wong
  1 sibling, 0 replies; 3+ messages in thread
From: Eric Wong @ 2020-11-29  5:09 UTC (permalink / raw)
  To: meta

Otherwise, any explicitly set shard counts were ignored and
we'd be counting CPUs every single time.
---
 lib/PublicInbox/V2Writable.pm | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm
index a3938b56..e9a43000 100644
--- a/lib/PublicInbox/V2Writable.pm
+++ b/lib/PublicInbox/V2Writable.pm
@@ -65,13 +65,21 @@ sub nproc_shards ($) {
 
 sub count_shards ($) {
 	my ($self) = @_;
-	$self->{ibx} ? do {
+	if (my $ibx = $self->{ibx}) {
 		# always load existing shards in case core count changes:
 		# Also, shard count may change while -watch is running
-		my $srch = $self->{ibx}->search or return 0;
-		delete $self->{ibx}->{search};
+		my $srch = $ibx->search or return 0;
+		delete $ibx->{search};
 		$srch->{nshard} // 0
-	} : $self->{nshard}; # self->{nshard} is for ExtSearchIdx
+	} else { # ExtSearchIdx
+		$self->{nshard} // do {
+			if ($self->xdb_sharded) {
+				$self->{nshard} // die 'BUG: {nshard} unset';
+			} else {
+				0;
+			}
+		}
+	}
 }
 
 sub new {

^ permalink raw reply related	[flat|nested] 3+ messages in thread

* [PATCH 2/2] extindex: support `--gc' to remove dead inboxes
  2020-11-29  5:09 [PATCH 0/2] extindex: administrative stuffs Eric Wong
  2020-11-29  5:09 ` [PATCH 1/2] v2writable: detect shard count for ExtSearchIdx properly Eric Wong
@ 2020-11-29  5:09 ` Eric Wong
  1 sibling, 0 replies; 3+ messages in thread
From: Eric Wong @ 2020-11-29  5:09 UTC (permalink / raw)
  To: meta

Inboxes may be removed or newsgroups renamed over time.
Introduce a switch to do garbage collection and eliminate stale
search and xref3 results based on inboxes which remain in the
config file.

This may also fixup stale results leftover from any bugs which
may leave stale data around.

This is also useful in case a clumsy BOFH (me :P) is swapping
between several PI_CONFIGs and accidentally indexed a bunch of
inboxes they didn't intend to.
---
 lib/PublicInbox/ExtSearchIdx.pm   | 90 ++++++++++++++++++++++++++++++-
 lib/PublicInbox/MiscIdx.pm        | 16 ++++++
 lib/PublicInbox/SearchIdx.pm      | 23 +++++++-
 lib/PublicInbox/SearchIdxShard.pm | 21 ++++++--
 script/public-inbox-extindex      | 19 +++++--
 t/extsearch.t                     | 19 ++++++-
 6 files changed, 178 insertions(+), 10 deletions(-)

diff --git a/lib/PublicInbox/ExtSearchIdx.pm b/lib/PublicInbox/ExtSearchIdx.pm
index d780776f..4de47b58 100644
--- a/lib/PublicInbox/ExtSearchIdx.pm
+++ b/lib/PublicInbox/ExtSearchIdx.pm
@@ -87,6 +87,7 @@ sub _ibx_attach { # each_inbox callback
 
 sub attach_config {
 	my ($self, $cfg) = @_;
+	$self->{cfg} = $cfg;
 	$cfg->each_inbox(\&_ibx_attach, $self);
 }
 
@@ -141,7 +142,8 @@ sub do_xpost ($$) {
 		if ($nr == 0) {
 			$idx->shard_remove($oid, $docid);
 		} elsif ($rm_eidx_info) {
-			$idx->shard_remove_eidx_info($docid, $oid, $xibx, $eml);
+			$idx->shard_remove_eidx_info($docid, $oid, $eidx_key,
+							$eml);
 		}
 	}
 }
@@ -324,6 +326,90 @@ sub _sync_inbox ($$$) {
 	$ibx->git->cleanup; # done with this inbox, now
 }
 
+sub unref_doc ($$$$) {
+	my ($self, $ibx_id, $eidx_key, $docid) = @_;
+	my $dbh = $self->{oidx}->dbh;
+
+	# for debug/info purposes, oids may no longer be accessible
+	my $sth = $dbh->prepare_cached(<<'', undef, 1);
+SELECT oidbin FROM xref3 WHERE docid = ? AND ibx_id = ?
+
+	$sth->execute($docid, $ibx_id);
+	my @oid = map { unpack('H*', $_->[0]) } @{$sth->fetchall_arrayref};
+
+	$dbh->prepare_cached(<<'')->execute($docid, $ibx_id);
+DELETE FROM xref3 WHERE docid = ? AND ibx_id = ?
+
+	my $remain = $self->{oidx}->get_xref3($docid);
+	my $idx = $self->idx_shard($docid);
+	if (@$remain) {
+		for my $oid (@oid) {
+			warn "I: unref #$docid $eidx_key $oid\n";
+			$idx->shard_remove_eidx_info($docid, $oid, $eidx_key);
+		}
+	} else {
+		for my $oid (@oid) {
+			warn "I: remove #$docid $eidx_key $oid\n";
+			$idx->shard_remove($oid, $docid);
+		}
+	}
+}
+
+sub eidx_gc {
+	my ($self, $opt) = @_;
+	$self->{cfg} or die "E: GC requires ->attach_config\n";
+	$opt->{-idx_gc} = 1;
+	$self->idx_init($opt); # acquire lock via V2Writable::_idx_init
+
+	my $dbh = $self->{oidx}->dbh;
+	my $x3_doc = $dbh->prepare('SELECT docid FROM xref3 WHERE ibx_id = ?');
+	my $ibx_ck = $dbh->prepare('SELECT ibx_id,eidx_key FROM inboxes');
+	my $lc_i = $dbh->prepare('SELECT key FROM eidx_meta WHERE key LIKE ?');
+
+	$ibx_ck->execute;
+	while (my ($ibx_id, $eidx_key) = $ibx_ck->fetchrow_array) {
+		next if $self->{ibx_map}->{$eidx_key};
+		$self->{midx}->remove_eidx_key($eidx_key);
+		warn "I: deleting messages for $eidx_key...\n";
+		$x3_doc->execute($ibx_id);
+		while (defined(my $docid = $x3_doc->fetchrow_array)) {
+			unref_doc($self, $ibx_id, $eidx_key, $docid);
+		}
+		$dbh->prepare_cached(<<'')->execute($ibx_id);
+DELETE FROM inboxes WHERE ibx_id = ?
+
+		# drop last_commit info
+		my $pat = $eidx_key;
+		$pat =~ s/([_%])/\\$1/g;
+		$lc_i->execute("lc-%:$pat//%");
+		while (my ($key) = $lc_i->fetchrow_array) {
+			next if $key !~ m!\Alc-v[1-9]+:\Q$eidx_key\E//!;
+			warn "I: removing $key\n";
+			$dbh->prepare_cached(<<'')->execute($key);
+DELETE FROM eidx_meta WHERE key = ?
+
+		}
+
+		warn "I: $eidx_key removed\n";
+	}
+
+	# it's not real unless it's in `over', we use parallelism here,
+	# shards will be reading directly from over, so commit
+	$self->{oidx}->commit_lazy;
+	$self->{oidx}->begin_lazy;
+
+	for my $idx (@{$self->{idx_shards}}) {
+		warn "I: cleaning up shard #$idx->{shard}\n";
+		$idx->shard_over_check($self->{oidx});
+	}
+	my $nr = $dbh->do(<<'');
+DELETE FROM xref3 WHERE docid NOT IN (SELECT num FROM over)
+
+	warn "I: eliminated $nr stale xref3 entries\n" if $nr != 0;
+
+	done($self);
+}
+
 sub eidx_sync { # main entry point
 	my ($self, $opt) = @_;
 	$self->idx_init($opt); # acquire lock via V2Writable::_idx_init
@@ -413,6 +499,7 @@ sub idx_init { # similar to V2Writable
 				next if $seen{"$st[0]\0$st[1]"}++;
 			} else {
 				warn "W: stat($d) failed (from $alt): $!\n";
+				next if $opt->{-idx_gc};
 			}
 			push @old, $line;
 		}
@@ -424,6 +511,7 @@ sub idx_init { # similar to V2Writable
 			next if $seen{"$st[0]\0$st[1]"}++;
 		} else {
 			warn "W: stat($d) failed (from $ibx->{inboxdir}): $!\n";
+			next if $opt->{-idx_gc};
 		}
 		push @new, $line;
 	}
diff --git a/lib/PublicInbox/MiscIdx.pm b/lib/PublicInbox/MiscIdx.pm
index 642d920b..64591d05 100644
--- a/lib/PublicInbox/MiscIdx.pm
+++ b/lib/PublicInbox/MiscIdx.pm
@@ -53,6 +53,22 @@ sub commit_txn {
 	delete($self->{xdb})->commit_transaction;
 }
 
+sub remove_eidx_key {
+	my ($self, $eidx_key) = @_;
+	my $xdb = $self->{xdb};
+	my $head = $xdb->postlist_begin('Q'.$eidx_key);
+	my $tail = $xdb->postlist_end('Q'.$eidx_key);
+	my @docids; # only one, unless we had bugs
+	for (; $head != $tail; $head++) {
+		push @docids, $head->get_docid;
+	}
+	for my $docid (@docids) {
+		$xdb->delete_document($docid);
+		warn "I: remove inbox docid #$docid ($eidx_key)\n";
+	}
+}
+
+# adds or updates according to $eidx_key
 sub index_ibx {
 	my ($self, $ibx) = @_;
 	my $eidx_key = $ibx->eidx_key;
diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm
index d06c159b..c18c7c36 100644
--- a/lib/PublicInbox/SearchIdx.pm
+++ b/lib/PublicInbox/SearchIdx.pm
@@ -471,7 +471,7 @@ sub remove_eidx_info {
 	my $doc = _get_doc($self, $docid, $oid) or return;
 	eval { $doc->remove_term('O'.$eidx_key) };
 	warn "W: ->remove_term O$eidx_key: $@\n" if $@;
-	for my $l ($eml->header_raw('List-Id')) {
+	for my $l ($eml ? $eml->header_raw('List-Id') : ()) {
 		$l =~ /<([^>]+)>/ or next;
 		my $lid = lc $1;
 		eval { $doc->remove_term('G' . $lid) };
@@ -970,4 +970,25 @@ sub eidx_shard_new {
 	$self;
 }
 
+# ensure there's no stale Xapian docs by treating $over as canonical
+sub over_check {
+	my ($self, $over) = @_;
+	begin_txn_lazy($self);
+	my $sth = $over->dbh->prepare(<<'');
+SELECT COUNT(*) FROM over WHERE num = ?
+
+	my $xdb = $self->{xdb};
+	my $cur = $xdb->postlist_begin('');
+	my $end = $xdb->postlist_end('');
+	my $xdir = $self->xdir;
+	for (; $cur != $end; $cur++) {
+		my $docid = $cur->get_docid;
+		$sth->execute($docid);
+		my $x = $sth->fetchrow_array;
+		next if $x > 0;
+		warn "I: removing $xdir #$docid, not in `over'\n";
+		$xdb->delete_document($docid);
+	}
+}
+
 1;
diff --git a/lib/PublicInbox/SearchIdxShard.pm b/lib/PublicInbox/SearchIdxShard.pm
index dcfeb0be..53fac9b6 100644
--- a/lib/PublicInbox/SearchIdxShard.pm
+++ b/lib/PublicInbox/SearchIdxShard.pm
@@ -57,6 +57,7 @@ sub spawn_worker {
 
 sub eml ($$) {
 	my ($r, $len) = @_;
+	return if $len == 0;
 	my $n = read($r, my $bref, $len) or die "read: $!\n";
 	$n == $len or die "short read: $n != $len\n";
 	PublicInbox::Eml->new(\$bref);
@@ -92,6 +93,10 @@ sub shard_worker_loop ($$$$$) {
 			chomp $eidx_key;
 			$self->remove_eidx_info($docid, $oid, $eidx_key,
 							eml($r, $len));
+		} elsif ($line =~ s/\AO ([^\n]+)\n//) {
+			my $over_fn = $1;
+			$over_fn =~ tr/\0/\n/;
+			$self->over_check(PublicInbox::Over->new($over_fn));
 		} else {
 			chomp $line;
 			my $eidx_key;
@@ -155,10 +160,9 @@ sub shard_add_eidx_info {
 }
 
 sub shard_remove_eidx_info {
-	my ($self, $docid, $oid, $xibx, $eml) = @_;
-	my $eidx_key = $xibx->eidx_key;
+	my ($self, $docid, $oid, $eidx_key, $eml) = @_;
 	if (my $w = $self->{w}) {
-		my $hdr = $eml->header_obj->as_string;
+		my $hdr = $eml ? $eml->header_obj->as_string : '';
 		my $len = length($hdr);
 		print $w "-X $len $docid $oid $eidx_key\n", $hdr or
 			die "failed to write shard: $!";
@@ -212,4 +216,15 @@ sub shard_remove {
 	}
 }
 
+sub shard_over_check {
+	my ($self, $over) = @_;
+	if (my $w = $self->{w}) { # triggers remove_by_oid in a shard child
+		my ($over_fn) = $over->{dbh}->sqlite_db_filename;
+		$over_fn =~ tr/\n/\0/;
+		print $w "O $over_fn\n" or die "failed to write over $!";
+	} else {
+		$self->over_check($over);
+	}
+}
+
 1;
diff --git a/script/public-inbox-extindex b/script/public-inbox-extindex
index 20a0737c..17ad59fa 100644
--- a/script/public-inbox-extindex
+++ b/script/public-inbox-extindex
@@ -16,6 +16,7 @@ usage: public-inbox-extindex [options] EXTINDEX_DIR [INBOX_DIR]
   --jobs=NUM          set or disable parallelization (NUM=0)
   --batch-size=BYTES  flush changes to OS after a given number of bytes
   --max-size=BYTES    do not index messages larger than the given size
+  --gc                perform garbage collection instead of indexing
   --verbose | -v      increase verbosity (may be repeated)
 
 BYTES may use `k', `m', and `g' suffixes (e.g. `10m' for 10 megabytes)
@@ -26,6 +27,7 @@ GetOptions($opt, qw(verbose|v+ reindex rethread compact|c+ jobs|j=i
 		fsync|sync!
 		indexlevel|index-level|L=s max_size|max-size=s
 		batch_size|batch-size=s
+		gc
 		all help|h))
 	or die $help;
 if ($opt->{help}) { print $help; exit 0 };
@@ -36,7 +38,13 @@ my $eidx_dir = shift(@ARGV) // die "E: $help";
 local $SIG{USR1} = 'IGNORE'; # to be overridden in eidx_sync
 require PublicInbox::Admin;
 my $cfg = PublicInbox::Config->new;
-my @ibxs = PublicInbox::Admin::resolve_inboxes(\@ARGV, $opt, $cfg);
+my @ibxs;
+if ($opt->{gc}) {
+	die "E: inbox paths must not be specified with --gc\n" if @ARGV;
+	die "E: --all not compatible --gc\n" if $opt->{all};
+} else {
+	@ibxs = PublicInbox::Admin::resolve_inboxes(\@ARGV, $opt, $cfg);
+}
 PublicInbox::Admin::require_or_die(qw(-search));
 PublicInbox::Config::json() or die "Cpanel::JSON::XS or similar missing\n";
 PublicInbox::Admin::progress_prepare($opt);
@@ -44,5 +52,10 @@ my $env = PublicInbox::Admin::index_prepare($opt, $cfg);
 local %ENV = (%ENV, %$env) if $env;
 require PublicInbox::ExtSearchIdx;
 my $eidx = PublicInbox::ExtSearchIdx->new($eidx_dir, $opt);
-$eidx->attach_inbox($_) for @ibxs;
-$eidx->eidx_sync($opt);
+if ($opt->{gc}) {
+	$eidx->attach_config($cfg);
+	$eidx->eidx_gc($opt);
+} else {
+	$eidx->attach_inbox($_) for @ibxs;
+	$eidx->eidx_sync($opt);
+}
diff --git a/t/extsearch.t b/t/extsearch.t
index 4f1dbadf..10b14baf 100644
--- a/t/extsearch.t
+++ b/t/extsearch.t
@@ -16,7 +16,8 @@ my $host_port = $sock->sockhost . ':' . $sock->sockport;
 my ($home, $for_destroy) = tmpdir();
 local $ENV{HOME} = $home;
 mkdir "$home/.public-inbox" or BAIL_OUT $!;
-open my $fh, '>', "$home/.public-inbox/config" or BAIL_OUT $!;
+my $cfg_path = "$home/.public-inbox/config";
+open my $fh, '>', $cfg_path or BAIL_OUT $!;
 print $fh <<EOF or BAIL_OUT $!;
 [publicinboxMda]
 	spamcheck = none
@@ -55,7 +56,7 @@ ok(run_script([qw(-extindex --all), "$home/extindex"]), 'extindex init');
 
 
 { # TODO: -extindex should write this to config
-	open $fh, '>>', "$home/.public-inbox/config" or BAIL_OUT $!;
+	open $fh, '>>', $cfg_path or BAIL_OUT $!;
 	print $fh <<EOF or BAIL_OUT $!;
 ; for ->ALL
 [extindex "all"]
@@ -148,4 +149,18 @@ is(scalar(@it), 2, 'two inboxes');
 like($it[0]->get_document->get_data, qr/v2test/, 'docdata matched v2');
 like($it[1]->get_document->get_data, qr/v1test/, 'docdata matched v1');
 
+if ('remove v1test and test gc') {
+	xsys([qw(git config --unset publicinbox.v1test.inboxdir)],
+		{ GIT_CONFIG => $cfg_path });
+	my $opt = { 2 => \(my $err = '') };
+	ok(run_script([qw(-extindex --gc), "$home/extindex"], undef, $opt),
+		'extindex --gc');
+	like($err, qr/^I: remove #1 v1\.example /ms, 'removed v1 message');
+	is(scalar(grep(!/^I:/, split(/^/m, $err))), 0,
+		'no non-informational messages');
+	$misc->{xdb}->reopen;
+	@it = $misc->mset('')->items;
+	is(scalar(@it), 1, 'only one inbox left');
+}
+
 done_testing;

^ permalink raw reply related	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2020-11-29  5:09 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-11-29  5:09 [PATCH 0/2] extindex: administrative stuffs Eric Wong
2020-11-29  5:09 ` [PATCH 1/2] v2writable: detect shard count for ExtSearchIdx properly Eric Wong
2020-11-29  5:09 ` [PATCH 2/2] extindex: support `--gc' to remove dead inboxes Eric Wong

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).