unofficial mirror of meta@public-inbox.org
 help / color / mirror / Atom feed
* [PATCH 0/5] extindex: random cleanups
@ 2020-12-07  7:40 Eric Wong
  2020-12-07  7:40 ` [PATCH 1/5] over: gracefully show invalid ibx_id Eric Wong
                   ` (4 more replies)
  0 siblings, 5 replies; 6+ messages in thread
From: Eric Wong @ 2020-12-07  7:40 UTC (permalink / raw)
  To: meta

Still working on --reindex, but found a bunch of cleanups
and tweaks which are worth doing in any case.

Eric Wong (5):
  over: gracefully show invalid ibx_id
  overidx: wrap eidx_key => ibx_id mapping
  extsearchidx: remove needless SHA-1 check
  searchidx: remove $oid parameter from most calls
  shard_add_eidx_info: pass $eidx_key instead of $ibx object

 lib/PublicInbox/ExtSearchIdx.pm   | 27 +++++-----------------
 lib/PublicInbox/Over.pm           |  1 +
 lib/PublicInbox/OverIdx.pm        | 10 +++++---
 lib/PublicInbox/SearchIdx.pm      | 38 +++++++++++++------------------
 lib/PublicInbox/SearchIdxShard.pm | 38 ++++++++++++++-----------------
 lib/PublicInbox/V2Writable.pm     |  2 +-
 6 files changed, 48 insertions(+), 68 deletions(-)

^ permalink raw reply	[flat|nested] 6+ messages in thread

* [PATCH 1/5] over: gracefully show invalid ibx_id
  2020-12-07  7:40 [PATCH 0/5] extindex: random cleanups Eric Wong
@ 2020-12-07  7:40 ` Eric Wong
  2020-12-07  7:40 ` [PATCH 2/5] overidx: wrap eidx_key => ibx_id mapping Eric Wong
                   ` (3 subsequent siblings)
  4 siblings, 0 replies; 6+ messages in thread
From: Eric Wong @ 2020-12-07  7:40 UTC (permalink / raw)
  To: meta

While "public-inbox-extindex --gc" invocations try to ensure
proper ordering, it is still possible for users to change
the `inboxes' tables via sqlite3(1) or similar means.  So
show a "missing://ibx_id=$ibx_id" placeholder to avoid undefined
variable warnings.

URLs such as "imaps://..." will eventually be supported as
eidx_keys, so having a URL-like "missing://" as a placeholder
probably makes sense.
---
 lib/PublicInbox/Over.pm | 1 +
 1 file changed, 1 insertion(+)

diff --git a/lib/PublicInbox/Over.pm b/lib/PublicInbox/Over.pm
index f34e7fc1..51284601 100644
--- a/lib/PublicInbox/Over.pm
+++ b/lib/PublicInbox/Over.pm
@@ -275,6 +275,7 @@ SELECT eidx_key FROM inboxes WHERE ibx_id = ?
 		my $r = $_;
 		$eidx_key_sth->execute($r->[0]);
 		my $eidx_key = $eidx_key_sth->fetchrow_array;
+		$eidx_key //= "missing://ibx_id=$r->[0]";
 		"$eidx_key:$r->[1]:".unpack('H*', $r->[2]);
 	} @$rows ];
 }

^ permalink raw reply related	[flat|nested] 6+ messages in thread

* [PATCH 2/5] overidx: wrap eidx_key => ibx_id mapping
  2020-12-07  7:40 [PATCH 0/5] extindex: random cleanups Eric Wong
  2020-12-07  7:40 ` [PATCH 1/5] over: gracefully show invalid ibx_id Eric Wong
@ 2020-12-07  7:40 ` Eric Wong
  2020-12-07  7:40 ` [PATCH 3/5] extsearchidx: remove needless SHA-1 check Eric Wong
                   ` (2 subsequent siblings)
  4 siblings, 0 replies; 6+ messages in thread
From: Eric Wong @ 2020-12-07  7:40 UTC (permalink / raw)
  To: meta

This wrapper makes callers a little less noisy and will
also be called by ExtSearchIdx.
---
 lib/PublicInbox/OverIdx.pm | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/lib/PublicInbox/OverIdx.pm b/lib/PublicInbox/OverIdx.pm
index 635aa314..38552247 100644
--- a/lib/PublicInbox/OverIdx.pm
+++ b/lib/PublicInbox/OverIdx.pm
@@ -79,6 +79,11 @@ SELECT $id_col FROM $tbl WHERE $val_col = ? LIMIT 1
 	}
 }
 
+sub ibx_id {
+	my ($self, $eidx_key) = @_;
+	id_for($self, 'inboxes', 'ibx_id', eidx_key => $eidx_key);
+}
+
 sub sid {
 	my ($self, $path) = @_;
 	return unless defined $path && $path ne '';
@@ -588,7 +593,7 @@ sub eidx_max {
 sub add_xref3 {
 	my ($self, $docid, $xnum, $oidhex, $eidx_key) = @_;
 	begin_lazy($self);
-	my $ibx_id = id_for($self, 'inboxes', 'ibx_id', eidx_key => $eidx_key);
+	my $ibx_id = ibx_id($self, $eidx_key);
 	my $oidbin = pack('H*', $oidhex);
 	my $sth = $self->{dbh}->prepare_cached(<<'');
 INSERT OR IGNORE INTO xref3 (docid, ibx_id, xnum, oidbin) VALUES (?, ?, ?, ?)
@@ -607,8 +612,7 @@ sub remove_xref3 {
 	my $oidbin = pack('H*', $oidhex);
 	my ($sth, $ibx_id);
 	if (defined $eidx_key) {
-		$ibx_id = id_for($self, 'inboxes', 'ibx_id',
-					eidx_key => $eidx_key);
+		$ibx_id = ibx_id($self, $eidx_key);
 		$sth = $self->{dbh}->prepare_cached(<<'');
 DELETE FROM xref3 WHERE docid = ? AND ibx_id = ? AND oidbin = ?
 

^ permalink raw reply related	[flat|nested] 6+ messages in thread

* [PATCH 3/5] extsearchidx: remove needless SHA-1 check
  2020-12-07  7:40 [PATCH 0/5] extindex: random cleanups Eric Wong
  2020-12-07  7:40 ` [PATCH 1/5] over: gracefully show invalid ibx_id Eric Wong
  2020-12-07  7:40 ` [PATCH 2/5] overidx: wrap eidx_key => ibx_id mapping Eric Wong
@ 2020-12-07  7:40 ` Eric Wong
  2020-12-07  7:40 ` [PATCH 4/5] searchidx: remove $oid parameter from most calls Eric Wong
  2020-12-07  7:40 ` [PATCH 5/5] shard_add_eidx_info: pass $eidx_key instead of $ibx object Eric Wong
  4 siblings, 0 replies; 6+ messages in thread
From: Eric Wong @ 2020-12-07  7:40 UTC (permalink / raw)
  To: meta

There is no need to verify checksums of data already stored in
git.  Doing this ourselves also limits flexibility in moving to
other hashes.
---
 lib/PublicInbox/ExtSearchIdx.pm | 12 ------------
 1 file changed, 12 deletions(-)

diff --git a/lib/PublicInbox/ExtSearchIdx.pm b/lib/PublicInbox/ExtSearchIdx.pm
index 4de47b58..819c7903 100644
--- a/lib/PublicInbox/ExtSearchIdx.pm
+++ b/lib/PublicInbox/ExtSearchIdx.pm
@@ -91,14 +91,6 @@ sub attach_config {
 	$cfg->each_inbox(\&_ibx_attach, $self);
 }
 
-sub git_blob_digest ($) {
-	my ($bref) = @_;
-	my $dig = Digest::SHA->new(1); # XXX SHA256 later
-	$dig->add('blob '.length($$bref)."\0");
-	$dig->add($$bref);
-	$dig;
-}
-
 sub is_bad_blob ($$$$) {
 	my ($oid, $type, $size, $expect_oid) = @_;
 	if ($type ne 'blob') {
@@ -245,10 +237,6 @@ sub cur_ibx_xnum ($$) {
 	my ($req, $bref) = @_;
 	my $ibx = $req->{ibx} or die 'BUG: current {ibx} missing';
 
-	# XXX overkill?
-	git_blob_digest($bref)->hexdigest eq $req->{oid} or die
-		"BUG: blob mismatch $req->{oid}";
-
 	$req->{eml} = PublicInbox::Eml->new($bref);
 	$req->{chash} = content_hash($req->{eml});
 	$req->{mids} = mids($req->{eml});

^ permalink raw reply related	[flat|nested] 6+ messages in thread

* [PATCH 4/5] searchidx: remove $oid parameter from most calls
  2020-12-07  7:40 [PATCH 0/5] extindex: random cleanups Eric Wong
                   ` (2 preceding siblings ...)
  2020-12-07  7:40 ` [PATCH 3/5] extsearchidx: remove needless SHA-1 check Eric Wong
@ 2020-12-07  7:40 ` Eric Wong
  2020-12-07  7:40 ` [PATCH 5/5] shard_add_eidx_info: pass $eidx_key instead of $ibx object Eric Wong
  4 siblings, 0 replies; 6+ messages in thread
From: Eric Wong @ 2020-12-07  7:40 UTC (permalink / raw)
  To: meta

Xapian docids have been tied to the over {num} column for
nearly 3 years now, and OIDs are no longer stored in Xapian
document data.  There's no need to increase code and IPC
complexity by passing the OID around.
---
 lib/PublicInbox/ExtSearchIdx.pm   | 15 +++++-------
 lib/PublicInbox/SearchIdx.pm      | 38 +++++++++++++------------------
 lib/PublicInbox/SearchIdxShard.pm | 37 ++++++++++++++----------------
 lib/PublicInbox/V2Writable.pm     |  2 +-
 4 files changed, 40 insertions(+), 52 deletions(-)

diff --git a/lib/PublicInbox/ExtSearchIdx.pm b/lib/PublicInbox/ExtSearchIdx.pm
index 819c7903..c06b25a9 100644
--- a/lib/PublicInbox/ExtSearchIdx.pm
+++ b/lib/PublicInbox/ExtSearchIdx.pm
@@ -125,17 +125,16 @@ sub do_xpost ($$) {
 	if (my $new_smsg = $req->{new_smsg}) { # 'm' on cross-posted message
 		my $xnum = $req->{xnum};
 		$self->{oidx}->add_xref3($docid, $xnum, $oid, $eidx_key);
-		$idx->shard_add_eidx_info($docid, $oid, $xibx, $eml);
+		$idx->shard_add_eidx_info($docid, $xibx, $eml);
 		check_batch_limit($req);
 	} else { # 'd'
 		my $rm_eidx_info;
 		my $nr = $self->{oidx}->remove_xref3($docid, $oid, $eidx_key,
 							\$rm_eidx_info);
 		if ($nr == 0) {
-			$idx->shard_remove($oid, $docid);
+			$idx->shard_remove($docid);
 		} elsif ($rm_eidx_info) {
-			$idx->shard_remove_eidx_info($docid, $oid, $eidx_key,
-							$eml);
+			$idx->shard_remove_eidx_info($docid, $eidx_key, $eml);
 		}
 	}
 }
@@ -333,13 +332,11 @@ DELETE FROM xref3 WHERE docid = ? AND ibx_id = ?
 	if (@$remain) {
 		for my $oid (@oid) {
 			warn "I: unref #$docid $eidx_key $oid\n";
-			$idx->shard_remove_eidx_info($docid, $oid, $eidx_key);
+			$idx->shard_remove_eidx_info($docid, $eidx_key);
 		}
 	} else {
-		for my $oid (@oid) {
-			warn "I: remove #$docid $eidx_key $oid\n";
-			$idx->shard_remove($oid, $docid);
-		}
+		warn "I: remove #$docid $eidx_key @oid\n";
+		$idx->shard_remove($docid);
 	}
 }
 
diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm
index c18c7c36..0124dd11 100644
--- a/lib/PublicInbox/SearchIdx.pm
+++ b/lib/PublicInbox/SearchIdx.pm
@@ -445,20 +445,20 @@ sub add_message {
 	$smsg->{num};
 }
 
-sub _get_doc ($$$) {
-	my ($self, $docid, $oid) = @_;
+sub _get_doc ($$) {
+	my ($self, $docid) = @_;
 	my $doc = eval { $self->{xdb}->get_document($docid) };
 	$doc // do {
 		warn "E: $@\n" if $@;
-		warn "E: #$docid $oid missing in Xapian\n";
+		warn "E: #$docid missing in Xapian\n";
 		undef;
 	}
 }
 
 sub add_eidx_info {
-	my ($self, $docid, $oid, $eidx_key, $eml) = @_;
+	my ($self, $docid, $eidx_key, $eml) = @_;
 	begin_txn_lazy($self);
-	my $doc = _get_doc($self, $docid, $oid) or return;
+	my $doc = _get_doc($self, $docid) or return;
 	term_generator($self)->set_document($doc);
 	$doc->add_boolean_term('O'.$eidx_key);
 	index_list_id($self, $doc, $eml);
@@ -466,9 +466,9 @@ sub add_eidx_info {
 }
 
 sub remove_eidx_info {
-	my ($self, $docid, $oid, $eidx_key, $eml) = @_;
+	my ($self, $docid, $eidx_key, $eml) = @_;
 	begin_txn_lazy($self);
-	my $doc = _get_doc($self, $docid, $oid) or return;
+	my $doc = _get_doc($self, $docid) or return;
 	eval { $doc->remove_term('O'.$eidx_key) };
 	warn "W: ->remove_term O$eidx_key: $@\n" if $@;
 	for my $l ($eml ? $eml->header_raw('List-Id') : ()) {
@@ -512,25 +512,19 @@ sub smsg_from_doc ($) {
 }
 
 sub xdb_remove {
-	my ($self, $oid, @removed) = @_;
+	my ($self, @docids) = @_;
 	my $xdb = $self->{xdb} or return;
-	for my $num (@removed) {
-		my $doc = _get_doc($self, $num, $oid) or next;
-		my $smsg = smsg_from_doc($doc);
-		my $blob = $smsg->{blob}; # may be undef if --skip-docdata
-		if (!defined($blob) || $blob eq $oid) {
-			$xdb->delete_document($num);
-		} else {
-			warn "E: #$num $oid != $blob in Xapian\n";
-		}
+	for my $docid (@docids) {
+		eval { $xdb->delete_document($docid) };
+		warn "E: #$docid not in in Xapian? $@\n" if $@;
 	}
 }
 
-sub remove_by_oid {
-	my ($self, $oid, $num) = @_;
-	die "BUG: remove_by_oid is v2-only\n" if $self->{oidx};
+sub remove_by_docid {
+	my ($self, $num) = @_;
+	die "BUG: remove_by_docid is v2-only\n" if $self->{oidx};
 	$self->begin_txn_lazy;
-	xdb_remove($self, $oid, $num) if need_xapian($self);
+	xdb_remove($self, $num) if need_xapian($self);
 }
 
 sub index_git_blob_id {
@@ -566,7 +560,7 @@ sub unindex_eml {
 	} else { # just in case msgmap and over.sqlite3 become desynched:
 		$self->{mm}->mid_delete($mids->[0]);
 	}
-	xdb_remove($self, $oid, keys %tmp) if need_xapian($self);
+	xdb_remove($self, keys %tmp) if need_xapian($self);
 }
 
 sub index_mm {
diff --git a/lib/PublicInbox/SearchIdxShard.pm b/lib/PublicInbox/SearchIdxShard.pm
index 53fac9b6..182bbde2 100644
--- a/lib/PublicInbox/SearchIdxShard.pm
+++ b/lib/PublicInbox/SearchIdxShard.pm
@@ -79,19 +79,16 @@ sub shard_worker_loop ($$$$$) {
 			# no need to lock < 512 bytes is atomic under POSIX
 			print $bnote "barrier $shard\n" or
 					die "write failed for barrier $!\n";
-		} elsif ($line =~ /\AD ([a-f0-9]{40,}) ([0-9]+)\n\z/s) {
-			$self->remove_by_oid($1, $2 + 0);
+		} elsif ($line =~ /\AD ([0-9]+)\n\z/s) {
+			$self->remove_by_docid($1 + 0);
 		} elsif ($line =~ s/\A\+X //) {
-			my ($len, $docid, $oid, $eidx_key) =
-							split(/ /, $line, 4);
+			my ($len, $docid, $eidx_key) = split(/ /, $line, 3);
 			chomp $eidx_key;
-			$self->add_eidx_info($docid, $oid, $eidx_key,
-							eml($r, $len));
+			$self->add_eidx_info($docid, $eidx_key, eml($r, $len));
 		} elsif ($line =~ s/\A-X //) {
-			my ($len, $docid, $oid, $eidx_key) =
-							split(/ /, $line, 4);
+			my ($len, $docid, $eidx_key) = split(/ /, $line, 3);
 			chomp $eidx_key;
-			$self->remove_eidx_info($docid, $oid, $eidx_key,
+			$self->remove_eidx_info($docid, $eidx_key,
 							eml($r, $len));
 		} elsif ($line =~ s/\AO ([^\n]+)\n//) {
 			my $over_fn = $1;
@@ -147,27 +144,27 @@ sub index_raw {
 }
 
 sub shard_add_eidx_info {
-	my ($self, $docid, $oid, $xibx, $eml) = @_;
+	my ($self, $docid, $xibx, $eml) = @_;
 	my $eidx_key = $xibx->eidx_key;
 	if (my $w = $self->{w}) {
 		my $hdr = $eml->header_obj->as_string;
 		my $len = length($hdr);
-		print $w "+X $len $docid $oid $eidx_key\n", $hdr or
+		print $w "+X $len $docid $eidx_key\n", $hdr or
 			die "failed to write shard: $!";
 	} else {
-		$self->add_eidx_info($docid, $oid, $eidx_key, $eml);
+		$self->add_eidx_info($docid, $eidx_key, $eml);
 	}
 }
 
 sub shard_remove_eidx_info {
-	my ($self, $docid, $oid, $eidx_key, $eml) = @_;
+	my ($self, $docid, $eidx_key, $eml) = @_;
 	if (my $w = $self->{w}) {
 		my $hdr = $eml ? $eml->header_obj->as_string : '';
 		my $len = length($hdr);
-		print $w "-X $len $docid $oid $eidx_key\n", $hdr or
+		print $w "-X $len $docid $eidx_key\n", $hdr or
 			die "failed to write shard: $!";
 	} else {
-		$self->remove_eidx_info($docid, $oid, $eidx_key, $eml);
+		$self->remove_eidx_info($docid, $eidx_key, $eml);
 	}
 }
 
@@ -208,17 +205,17 @@ sub shard_close {
 }
 
 sub shard_remove {
-	my ($self, $oid, $num) = @_;
-	if (my $w = $self->{w}) { # triggers remove_by_oid in a shard child
-		print $w "D $oid $num\n" or die "failed to write remove $!";
+	my ($self, $num) = @_;
+	if (my $w = $self->{w}) { # triggers remove_by_docid in a shard child
+		print $w "D $num\n" or die "failed to write remove $!";
 	} else { # same process
-		$self->remove_by_oid($oid, $num);
+		$self->remove_by_docid($num);
 	}
 }
 
 sub shard_over_check {
 	my ($self, $over) = @_;
-	if (my $w = $self->{w}) { # triggers remove_by_oid in a shard child
+	if (my $w = $self->{w}) { # triggers remove_by_docid in a shard child
 		my ($over_fn) = $over->{dbh}->sqlite_db_filename;
 		$over_fn =~ tr/\n/\0/;
 		print $w "O $over_fn\n" or die "failed to write over $!";
diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm
index e9a43000..5aec7561 100644
--- a/lib/PublicInbox/V2Writable.pm
+++ b/lib/PublicInbox/V2Writable.pm
@@ -1141,7 +1141,7 @@ sub unindex_oid_aux ($$$) {
 	my @removed = $self->{oidx}->remove_oid($oid, $mid);
 	for my $num (@removed) {
 		my $idx = idx_shard($self, $num);
-		$idx->shard_remove($oid, $num);
+		$idx->shard_remove($num);
 	}
 }
 

^ permalink raw reply related	[flat|nested] 6+ messages in thread

* [PATCH 5/5] shard_add_eidx_info: pass $eidx_key instead of $ibx object
  2020-12-07  7:40 [PATCH 0/5] extindex: random cleanups Eric Wong
                   ` (3 preceding siblings ...)
  2020-12-07  7:40 ` [PATCH 4/5] searchidx: remove $oid parameter from most calls Eric Wong
@ 2020-12-07  7:40 ` Eric Wong
  4 siblings, 0 replies; 6+ messages in thread
From: Eric Wong @ 2020-12-07  7:40 UTC (permalink / raw)
  To: meta

This improves consistency with sibling methods such as
->shard_remove_eidx_info and ->add_xref3.  Passing the
$eidx_key scalar is preferable to the entire $ibx object
for IPC-friendliness.
---
 lib/PublicInbox/ExtSearchIdx.pm   | 2 +-
 lib/PublicInbox/SearchIdxShard.pm | 3 +--
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/lib/PublicInbox/ExtSearchIdx.pm b/lib/PublicInbox/ExtSearchIdx.pm
index c06b25a9..c82d1633 100644
--- a/lib/PublicInbox/ExtSearchIdx.pm
+++ b/lib/PublicInbox/ExtSearchIdx.pm
@@ -125,7 +125,7 @@ sub do_xpost ($$) {
 	if (my $new_smsg = $req->{new_smsg}) { # 'm' on cross-posted message
 		my $xnum = $req->{xnum};
 		$self->{oidx}->add_xref3($docid, $xnum, $oid, $eidx_key);
-		$idx->shard_add_eidx_info($docid, $xibx, $eml);
+		$idx->shard_add_eidx_info($docid, $eidx_key, $eml);
 		check_batch_limit($req);
 	} else { # 'd'
 		my $rm_eidx_info;
diff --git a/lib/PublicInbox/SearchIdxShard.pm b/lib/PublicInbox/SearchIdxShard.pm
index 182bbde2..579ed196 100644
--- a/lib/PublicInbox/SearchIdxShard.pm
+++ b/lib/PublicInbox/SearchIdxShard.pm
@@ -144,8 +144,7 @@ sub index_raw {
 }
 
 sub shard_add_eidx_info {
-	my ($self, $docid, $xibx, $eml) = @_;
-	my $eidx_key = $xibx->eidx_key;
+	my ($self, $docid, $eidx_key, $eml) = @_;
 	if (my $w = $self->{w}) {
 		my $hdr = $eml->header_obj->as_string;
 		my $len = length($hdr);

^ permalink raw reply related	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2020-12-07  7:40 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-12-07  7:40 [PATCH 0/5] extindex: random cleanups Eric Wong
2020-12-07  7:40 ` [PATCH 1/5] over: gracefully show invalid ibx_id Eric Wong
2020-12-07  7:40 ` [PATCH 2/5] overidx: wrap eidx_key => ibx_id mapping Eric Wong
2020-12-07  7:40 ` [PATCH 3/5] extsearchidx: remove needless SHA-1 check Eric Wong
2020-12-07  7:40 ` [PATCH 4/5] searchidx: remove $oid parameter from most calls Eric Wong
2020-12-07  7:40 ` [PATCH 5/5] shard_add_eidx_info: pass $eidx_key instead of $ibx object Eric Wong

This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox,
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).