unofficial mirror of meta@public-inbox.org
 help / color / mirror / Atom feed
* [PATCH 0/3] extindex: another fix and some cleanups
@ 2020-12-09  9:25 Eric Wong
  2020-12-09  9:25 ` [PATCH 1/3] searchidx: all indexers check for bad blobs Eric Wong
                   ` (2 more replies)
  0 siblings, 3 replies; 4+ messages in thread
From: Eric Wong @ 2020-12-09  9:25 UTC (permalink / raw)
  To: meta

3/3 avoids potential for user-error by enforcing -index
before -extindex.

And some cleanups found while working on --reindex.
--reindex is still running and not OOM-ing, at least...

Eric Wong (3):
  searchidx: all indexers check for bad blobs
  t/extsearch: use indexlevel=basic in inboxes
  extsearchidx: enforce -index before -extindex

 lib/PublicInbox/ExtSearchIdx.pm | 20 +++++------
 lib/PublicInbox/SearchIdx.pm    | 18 ++++++++--
 lib/PublicInbox/V2Writable.pm   | 25 +++++++++++---
 t/extsearch.t                   | 60 +++++++++++++++++++++++++++++++--
 4 files changed, 100 insertions(+), 23 deletions(-)

^ permalink raw reply	[flat|nested] 4+ messages in thread

* [PATCH 1/3] searchidx: all indexers check for bad blobs
  2020-12-09  9:25 [PATCH 0/3] extindex: another fix and some cleanups Eric Wong
@ 2020-12-09  9:25 ` Eric Wong
  2020-12-09  9:25 ` [PATCH 2/3] t/extsearch: use indexlevel=basic in inboxes Eric Wong
  2020-12-09  9:25 ` [PATCH 3/3] extsearchidx: enforce -index before -extindex Eric Wong
  2 siblings, 0 replies; 4+ messages in thread
From: Eric Wong @ 2020-12-09  9:25 UTC (permalink / raw)
  To: meta

This should help us detect bugs in our code or storage
synchronization problems more easily.  This probably won't
detect corrupted git storage, but can detect corrupted SQLite
files.

"Bad blobs, bad blobs, whatcha gonna do when they come for you?"
---
 lib/PublicInbox/ExtSearchIdx.pm | 13 ++-----------
 lib/PublicInbox/SearchIdx.pm    | 18 +++++++++++++++---
 lib/PublicInbox/V2Writable.pm   |  5 ++++-
 3 files changed, 21 insertions(+), 15 deletions(-)

diff --git a/lib/PublicInbox/ExtSearchIdx.pm b/lib/PublicInbox/ExtSearchIdx.pm
index 11f7786d..b0a12bca 100644
--- a/lib/PublicInbox/ExtSearchIdx.pm
+++ b/lib/PublicInbox/ExtSearchIdx.pm
@@ -19,7 +19,8 @@ use v5.10.1;
 use parent qw(PublicInbox::ExtSearch PublicInbox::Lock);
 use Carp qw(croak carp);
 use PublicInbox::Search;
-use PublicInbox::SearchIdx qw(crlf_adjust prepare_stack is_ancestor);
+use PublicInbox::SearchIdx qw(crlf_adjust prepare_stack is_ancestor
+	is_bad_blob);
 use PublicInbox::OverIdx;
 use PublicInbox::MiscIdx;
 use PublicInbox::MID qw(mids);
@@ -91,16 +92,6 @@ sub attach_config {
 	$cfg->each_inbox(\&_ibx_attach, $self);
 }
 
-sub is_bad_blob ($$$$) {
-	my ($oid, $type, $size, $expect_oid) = @_;
-	if ($type ne 'blob') {
-		carp "W: $expect_oid is not a blob (type=$type)";
-		return 1;
-	}
-	croak "BUG: $oid != $expect_oid" if $oid ne $expect_oid;
-	$size == 0 ? 1 : 0; # size == 0 means purged
-}
-
 sub check_batch_limit ($) {
 	my ($req) = @_;
 	my $self = $req->{self};
diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm
index 0124dd11..0fbe6560 100644
--- a/lib/PublicInbox/SearchIdx.pm
+++ b/lib/PublicInbox/SearchIdx.pm
@@ -15,7 +15,7 @@ use PublicInbox::InboxWritable;
 use PublicInbox::MID qw(mids_for_index mids);
 use PublicInbox::MsgIter;
 use PublicInbox::IdxStack;
-use Carp qw(croak);
+use Carp qw(croak carp);
 use POSIX qw(strftime);
 use Time::Local qw(timegm);
 use PublicInbox::OverIdx;
@@ -23,7 +23,7 @@ use PublicInbox::Spawn qw(spawn nodatacow_dir);
 use PublicInbox::Git qw(git_unquote);
 use PublicInbox::MsgTime qw(msg_timestamp msg_datestamp);
 our @EXPORT_OK = qw(crlf_adjust log2stack is_ancestor check_size prepare_stack
-	index_text term_generator add_val);
+	index_text term_generator add_val is_bad_blob);
 my $X = \%PublicInbox::Search::X;
 our ($DB_CREATE_OR_OPEN, $DB_OPEN);
 our $DB_NO_SYNC = 0;
@@ -591,8 +591,19 @@ sub crlf_adjust ($) {
 	}
 }
 
+sub is_bad_blob ($$$$) {
+	my ($oid, $type, $size, $expect_oid) = @_;
+	if ($type ne 'blob') {
+		carp "W: $expect_oid is not a blob (type=$type)";
+		return 1;
+	}
+	croak "BUG: $oid != $expect_oid" if $oid ne $expect_oid;
+	$size == 0 ? 1 : 0; # size == 0 means purged
+}
+
 sub index_both { # git->cat_async callback
 	my ($bref, $oid, $type, $size, $sync) = @_;
+	return if is_bad_blob($oid, $type, $size, $sync->{oid});
 	my ($nr, $max) = @$sync{qw(nr max)};
 	++$$nr;
 	$$max -= $size;
@@ -609,6 +620,7 @@ sub index_both { # git->cat_async callback
 
 sub unindex_both { # git->cat_async callback
 	my ($bref, $oid, $type, $size, $sync) = @_;
+	return if is_bad_blob($oid, $type, $size, $sync->{oid});
 	unindex_eml($sync->{sidx}, $oid, PublicInbox::Eml->new($bref));
 	# may be undef if leftover
 	if (defined(my $cur_cmt = $sync->{cur_cmt})) {
@@ -713,7 +725,7 @@ sub process_stack {
 		$sync->{index_oid} = \&index_both;
 	}
 	while (my ($f, $at, $ct, $oid, $cur_cmt) = $stk->pop_rec) {
-		my $arg = { %$sync, cur_cmt => $cur_cmt };
+		my $arg = { %$sync, cur_cmt => $cur_cmt, oid => $oid };
 		last if $sync->{quit};
 		if ($f eq 'm') {
 			$arg->{autime} = $at;
diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm
index 5aec7561..07a7fa42 100644
--- a/lib/PublicInbox/V2Writable.pm
+++ b/lib/PublicInbox/V2Writable.pm
@@ -17,7 +17,8 @@ use PublicInbox::InboxWritable;
 use PublicInbox::OverIdx;
 use PublicInbox::Msgmap;
 use PublicInbox::Spawn qw(spawn popen_rd);
-use PublicInbox::SearchIdx qw(log2stack crlf_adjust is_ancestor check_size);
+use PublicInbox::SearchIdx qw(log2stack crlf_adjust is_ancestor check_size
+	is_bad_blob);
 use IO::Handle; # ->autoflush
 use File::Temp ();
 
@@ -896,6 +897,7 @@ sub reindex_checkpoint ($$) {
 
 sub index_oid { # cat_async callback
 	my ($bref, $oid, $type, $size, $arg) = @_;
+	return if is_bad_blob($oid, $type, $size, $arg->{oid});
 	my $self = $arg->{self};
 	local $self->{current_info} = "$self->{current_info} $oid";
 	return if $size == 0; # purged
@@ -1147,6 +1149,7 @@ sub unindex_oid_aux ($$$) {
 
 sub unindex_oid ($$;$) { # git->cat_async callback
 	my ($bref, $oid, $type, $size, $sync) = @_;
+	return if is_bad_blob($oid, $type, $size, $sync->{oid});
 	my $self = $sync->{self};
 	local $self->{current_info} = "$self->{current_info} $oid";
 	my $unindexed = $sync->{in_unindex} ? $sync->{unindexed} : undef;

^ permalink raw reply related	[flat|nested] 4+ messages in thread

* [PATCH 2/3] t/extsearch: use indexlevel=basic in inboxes
  2020-12-09  9:25 [PATCH 0/3] extindex: another fix and some cleanups Eric Wong
  2020-12-09  9:25 ` [PATCH 1/3] searchidx: all indexers check for bad blobs Eric Wong
@ 2020-12-09  9:25 ` Eric Wong
  2020-12-09  9:25 ` [PATCH 3/3] extsearchidx: enforce -index before -extindex Eric Wong
  2 siblings, 0 replies; 4+ messages in thread
From: Eric Wong @ 2020-12-09  9:25 UTC (permalink / raw)
  To: meta

There's no need for per-inbox Xapian DBs when using extindex, so
reduce wear on the poor systems this test runs on.
---
 t/extsearch.t | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/t/extsearch.t b/t/extsearch.t
index 97786b21..96512227 100644
--- a/t/extsearch.t
+++ b/t/extsearch.t
@@ -25,8 +25,8 @@ EOF
 close $fh or BAIL_OUT $!;
 my $v2addr = 'v2test@example.com';
 my $v1addr = 'v1test@example.com';
-ok(run_script([qw(-init -V2 v2test --newsgroup v2.example), "$home/v2test",
-	'http://example.com/v2test', $v2addr ]), 'v2test init');
+ok(run_script([qw(-init -Lbasic -V2 v2test --newsgroup v2.example),
+	"$home/v2test", 'http://example.com/v2test', $v2addr ]), 'v2test init');
 my $env = { ORIGINAL_RECIPIENT => $v2addr };
 my $eml = eml_load('t/utf8.eml');
 
@@ -50,7 +50,7 @@ seek($fh, 0, SEEK_SET) or BAIL_OUT $!;
 $env = { ORIGINAL_RECIPIENT => $v1addr };
 run_script(['-mda', '--no-precheck'], $env, { 0 => $fh }) or BAIL_OUT '-mda';
 
-run_script(['-index', "$home/v1test"]) or BAIL_OUT "index $?";
+run_script([qw(-index -Lbasic), "$home/v1test"]) or BAIL_OUT "index $?";
 
 ok(run_script([qw(-extindex --all), "$home/extindex"]), 'extindex init');
 {

^ permalink raw reply related	[flat|nested] 4+ messages in thread

* [PATCH 3/3] extsearchidx: enforce -index before -extindex
  2020-12-09  9:25 [PATCH 0/3] extindex: another fix and some cleanups Eric Wong
  2020-12-09  9:25 ` [PATCH 1/3] searchidx: all indexers check for bad blobs Eric Wong
  2020-12-09  9:25 ` [PATCH 2/3] t/extsearch: use indexlevel=basic in inboxes Eric Wong
@ 2020-12-09  9:25 ` Eric Wong
  2 siblings, 0 replies; 4+ messages in thread
From: Eric Wong @ 2020-12-09  9:25 UTC (permalink / raw)
  To: meta

We cannot set xref3 data without the `xnum' column to
tie it to the per-inbox over.sqlite3 DB.  So ensure we don't
read brand-new history that only exists in git, but instead
rely on last_commit and last_xap15-$EPOCH metadata in msgmap
to decide how far we can index.

Before this change, it was possible to miss messages in
the extindex if -index did not run (which will be fixable by
upcoming --reindex support in -extindex).
---
 lib/PublicInbox/ExtSearchIdx.pm |  7 ++++-
 lib/PublicInbox/V2Writable.pm   | 20 +++++++++---
 t/extsearch.t                   | 54 +++++++++++++++++++++++++++++++++
 3 files changed, 76 insertions(+), 5 deletions(-)

diff --git a/lib/PublicInbox/ExtSearchIdx.pm b/lib/PublicInbox/ExtSearchIdx.pm
index b0a12bca..84449cb4 100644
--- a/lib/PublicInbox/ExtSearchIdx.pm
+++ b/lib/PublicInbox/ExtSearchIdx.pm
@@ -291,7 +291,12 @@ sub _sync_inbox ($$$) {
 	} elsif ($v == 1) {
 		my $uv = $ibx->uidvalidity;
 		my $lc = $self->{oidx}->eidx_meta("lc-v1:$ekey//$uv");
-		my $stk = prepare_stack($sync, $lc ? "$lc..HEAD" : 'HEAD');
+		my $head = $ibx->mm->last_commit;
+		unless (defined $head) {
+			warn "E: $ibx->{inboxdir} is not indexed\n";
+			return;
+		}
+		my $stk = prepare_stack($sync, $lc ? "$lc..$head" : $head);
 		my $unit = { stack => $stk, git => $ibx->git };
 		push @{$sync->{todo}}, $unit;
 	} else {
diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm
index 07a7fa42..bef3a67a 100644
--- a/lib/PublicInbox/V2Writable.pm
+++ b/lib/PublicInbox/V2Writable.pm
@@ -1073,10 +1073,22 @@ sub sync_prepare ($$) {
 		$pfx //= $sync->{ibx}->{inboxdir};
 	}
 
-	# reindex stops at the current heads and we later rerun index_sync
-	# without {reindex}
-	my $reindex_heads = $self->last_commits($sync) if $sync->{reindex};
-
+	my $reindex_heads;
+	if ($self->{ibx_map}) {
+		# ExtSearchIdx won't index messages unless they're in
+		# over.sqlite3 for a given inbox, so don't read beyond
+		# what's in the per-inbox index.
+		$reindex_heads = [];
+		my $v = PublicInbox::Search::SCHEMA_VERSION;
+		my $mm = $sync->{ibx}->mm;
+		for my $i (0..$sync->{epoch_max}) {
+			$reindex_heads->[$i] = $mm->last_commit_xap($v, $i);
+		}
+	} elsif ($sync->{reindex}) { # V2 inbox
+		# reindex stops at the current heads and we later
+		# rerun index_sync without {reindex}
+		$reindex_heads = $self->last_commits($sync);
+	}
 	if ($sync->{max_size} = $sync->{-opt}->{max_size}) {
 		$sync->{index_oid} = $self->can('index_oid');
 	}
diff --git a/t/extsearch.t b/t/extsearch.t
index 96512227..70a60b5a 100644
--- a/t/extsearch.t
+++ b/t/extsearch.t
@@ -176,6 +176,60 @@ is(scalar(@it), 2, 'two inboxes');
 like($it[0]->get_document->get_data, qr/v2test/, 'docdata matched v2');
 like($it[1]->get_document->get_data, qr/v1test/, 'docdata matched v1');
 
+if ('inject w/o indexing') {
+	use PublicInbox::Import;
+	use PublicInbox::Search;
+	my $schema_version = PublicInbox::Search::SCHEMA_VERSION();
+	my $v1ibx = PublicInbox::Config->new->lookup_name('v1test');
+	my $last_v1_commit = $v1ibx->mm->last_commit;
+	my $v2ibx = PublicInbox::Config->new->lookup_name('v2test');
+	my $last_v2_commit = $v2ibx->mm->last_commit_xap($schema_version, 0);
+	my $git0 = PublicInbox::Git->new("$v2ibx->{inboxdir}/git/0.git");
+	chomp(my $cmt = $git0->qx(qw(rev-parse HEAD^0)));
+	is($last_v2_commit, $cmt, 'v2 index up-to-date');
+
+	my $v2im = PublicInbox::Import->new($git0, undef, undef, $v2ibx);
+	$v2im->{lock_path} = undef;
+	$v2im->{path_type} = 'v2';
+	$v2im->add(eml_load('t/mda-mime.eml'));
+	$v2im->done;
+	chomp(my $tip = $git0->qx(qw(rev-parse HEAD^0)));
+	isnt($tip, $cmt, '0.git v2 updated');
+
+	# inject a message w/o updating index
+	rename("$home/v1test/public-inbox", "$home/v1test/skip-index") or
+		BAIL_OUT $!;
+	open(my $eh, '<', 't/iso-2202-jp.eml') or BAIL_OUT $!;
+	run_script(['-mda', '--no-precheck'], $env, { 0 => $eh}) or
+		BAIL_OUT '-mda';
+	rename("$home/v1test/skip-index", "$home/v1test/public-inbox") or
+		BAIL_OUT $!;
+
+	my ($in, $out, $err);
+	$in = $out = $err = '';
+	my $opt = { 0 => \$in, 1 => \$out, 2 => \$err };
+	ok(run_script([qw(-extindex -v -v --all), "$home/extindex"],
+		undef, undef), 'extindex noop');
+	$es->{xdb}->reopen;
+	my $mset = $es->mset('mid:199707281508.AAA24167@hoyogw.example');
+	is($mset->size, 0, 'did not attempt to index unindexed v1 message');
+	$mset = $es->mset('mid:multipart-html-sucks@11');
+	is($mset->size, 0, 'did not attempt to index unindexed v2 message');
+	ok(run_script([qw(-index --all)]), 'indexed v1 and v2 inboxes');
+
+	isnt($v1ibx->mm->last_commit, $last_v1_commit, '-index v1 worked');
+	isnt($v2ibx->mm->last_commit_xap($schema_version, 0),
+		$last_v2_commit, '-index v2 worked');
+	ok(run_script([qw(-extindex --all), "$home/extindex"]),
+		'extindex updates');
+
+	$es->{xdb}->reopen;
+	$mset = $es->mset('mid:199707281508.AAA24167@hoyogw.example');
+	is($mset->size, 1, 'got v1 message');
+	$mset = $es->mset('mid:multipart-html-sucks@11');
+	is($mset->size, 1, 'got v2 message');
+}
+
 if ('remove v1test and test gc') {
 	xsys([qw(git config --unset publicinbox.v1test.inboxdir)],
 		{ GIT_CONFIG => $cfg_path });

^ permalink raw reply related	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2020-12-09  9:25 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-12-09  9:25 [PATCH 0/3] extindex: another fix and some cleanups Eric Wong
2020-12-09  9:25 ` [PATCH 1/3] searchidx: all indexers check for bad blobs Eric Wong
2020-12-09  9:25 ` [PATCH 2/3] t/extsearch: use indexlevel=basic in inboxes Eric Wong
2020-12-09  9:25 ` [PATCH 3/3] extsearchidx: enforce -index before -extindex Eric Wong

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).