unofficial mirror of meta@public-inbox.org
 help / color / mirror / Atom feed
From: Eric Wong <e@80x24.org>
To: meta@public-inbox.org
Subject: [PATCH 4/7] index: do not attach inbox to extindex unless updated
Date: Fri, 25 Dec 2020 10:21:12 +0000	[thread overview]
Message-ID: <20201225102115.6745-5-e@80x24.org> (raw)
In-Reply-To: <20201225102115.6745-1-e@80x24.org>

We'll count the number of log changes (regardless of whether they
index or unindex) and only attach inboxes to ExtSearchIdx objects
when those inboxes get new work.  We'll also reduce lock bouncing
and only update external indices after all per-inbox indexing is done.

This also updates existing v2 indexing/unindexing callers
to be more consistent and ensures unindex log entries update
per-inbox last commit information.
---
 lib/PublicInbox/Admin.pm      |  1 +
 lib/PublicInbox/SearchIdx.pm  |  2 ++
 lib/PublicInbox/V2Writable.pm | 26 +++++++++++++++++++-------
 script/public-inbox-index     | 23 ++++++++++++++---------
 4 files changed, 36 insertions(+), 16 deletions(-)

diff --git a/lib/PublicInbox/Admin.pm b/lib/PublicInbox/Admin.pm
index 9a86d206..b468108e 100644
--- a/lib/PublicInbox/Admin.pm
+++ b/lib/PublicInbox/Admin.pm
@@ -271,6 +271,7 @@ EOM
 		$idx = PublicInbox::SearchIdx->new($ibx, 1);
 	}
 	$idx->index_sync($opt);
+	$idx->{nidx} // 0; # returns number processed
 }
 
 sub progress_prepare ($) {
diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm
index c8e309fc..b3361e05 100644
--- a/lib/PublicInbox/SearchIdx.pm
+++ b/lib/PublicInbox/SearchIdx.pm
@@ -615,6 +615,7 @@ sub index_both { # git->cat_async callback
 	$smsg->{num} = index_mm($self, $eml, $oid, $sync) or
 		die "E: could not generate NNTP article number for $oid";
 	add_message($self, $eml, $smsg, $sync);
+	++$self->{nidx};
 	my $cur_cmt = $sync->{cur_cmt} // die 'BUG: {cur_cmt} missing';
 	${$sync->{latest_cmt}} = $cur_cmt;
 }
@@ -629,6 +630,7 @@ sub unindex_both { # git->cat_async callback
 	if (defined(my $cur_cmt = $sync->{cur_cmt})) {
 		${$sync->{latest_cmt}} = $cur_cmt;
 	}
+	++$self->{nidx};
 }
 
 sub with_umask {
diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm
index 2b849ddf..ca52874b 100644
--- a/lib/PublicInbox/V2Writable.pm
+++ b/lib/PublicInbox/V2Writable.pm
@@ -891,12 +891,22 @@ sub reindex_checkpoint ($$) {
 	$mm_tmp->atfork_parent if $mm_tmp;
 }
 
+sub index_finalize ($$) {
+	my ($arg, $index) = @_;
+	++$arg->{self}->{nidx};
+	if (defined(my $cur = $arg->{cur_cmt})) {
+		${$arg->{latest_cmt}} = $cur;
+	} elsif ($index) {
+		die 'BUG: {cur_cmt} missing';
+	} # else { unindexing @leftovers doesn't set {cur_cmt}
+}
+
 sub index_oid { # cat_async callback
 	my ($bref, $oid, $type, $size, $arg) = @_;
-	return if is_bad_blob($oid, $type, $size, $arg->{oid});
+	is_bad_blob($oid, $type, $size, $arg->{oid}) and
+		return index_finalize($arg, 1); # size == 0 purged returns here
 	my $self = $arg->{self};
 	local $self->{current_info} = "$self->{current_info} $oid";
-	return if $size == 0; # purged
 	my ($num, $mid0);
 	my $eml = PublicInbox::Eml->new($$bref);
 	my $mids = mids($eml);
@@ -967,7 +977,7 @@ sub index_oid { # cat_async callback
 	if (do_idx($self, $bref, $eml, $smsg)) {
 		${$arg->{need_checkpoint}} = 1;
 	}
-	${$arg->{latest_cmt}} = $arg->{cur_cmt} // die 'BUG: {cur_cmt} missing';
+	index_finalize($arg, 1);
 }
 
 # only update last_commit for $i on reindex iff newer than current
@@ -1157,11 +1167,12 @@ sub unindex_oid_aux ($$$) {
 }
 
 sub unindex_oid ($$;$) { # git->cat_async callback
-	my ($bref, $oid, $type, $size, $sync) = @_;
-	return if is_bad_blob($oid, $type, $size, $sync->{oid});
-	my $self = $sync->{self};
+	my ($bref, $oid, $type, $size, $arg) = @_;
+	is_bad_blob($oid, $type, $size, $arg->{oid}) and
+		return index_finalize($arg, 0);
+	my $self = $arg->{self};
 	local $self->{current_info} = "$self->{current_info} $oid";
-	my $unindexed = $sync->{in_unindex} ? $sync->{unindexed} : undef;
+	my $unindexed = $arg->{in_unindex} ? $arg->{unindexed} : undef;
 	my $mm = $self->{mm};
 	my $mids = mids(PublicInbox::Eml->new($bref));
 	undef $$bref;
@@ -1186,6 +1197,7 @@ sub unindex_oid ($$;$) { # git->cat_async callback
 		}
 		unindex_oid_aux($self, $oid, $mid);
 	}
+	index_finalize($arg, 0);
 }
 
 sub git { $_[0]->{ibx}->git }
diff --git a/script/public-inbox-index b/script/public-inbox-index
index 87893ef1..a17bf615 100755
--- a/script/public-inbox-index
+++ b/script/public-inbox-index
@@ -63,7 +63,7 @@ my @ibxs = PublicInbox::Admin::resolve_inboxes(\@ARGV, $opt, $cfg);
 PublicInbox::Admin::require_or_die('-index');
 unless (@ibxs) { print STDERR $help; exit 1 }
 
-my (@eidx_dir, %eidx_seen);
+my (@eidx, %eidx_seen);
 my $update_extindex = $opt->{'update-extindex'};
 if (!scalar(@$update_extindex) && (my $ALL = $cfg->ALL)) {
 	# extindex and normal inboxes may have different owners
@@ -84,7 +84,8 @@ for my $ei_name (@$update_extindex) {
 	} else {
 		die "extindex `$ei_name' not configured or found\n";
 	}
-	$eidx_seen{$topdir} //= push(@eidx_dir, $topdir);
+	$eidx_seen{$topdir} //=
+		push(@eidx, PublicInbox::ExtSearchIdx->new($topdir));
 }
 my $mods = {};
 my @eidx_unconfigured;
@@ -95,7 +96,7 @@ foreach my $ibx (@ibxs) {
 	$ibx->{indexlevel} //= $opt->{indexlevel} // ($opt->{xapian_only} ?
 			'full' : $detected);
 	PublicInbox::Admin::scan_ibx_modules($mods, $ibx);
-	if (@eidx_dir && $ibx->{-unconfigured}) {
+	if (@eidx && $ibx->{-unconfigured}) {
 		push @eidx_unconfigured, "  $ibx->{inboxdir}\n";
 	}
 }
@@ -128,18 +129,22 @@ publicInbox.$ibx->{name}.indexSequentialShard not boolean
 EOL
 		$ibx_opt = { %$opt, sequential_shard => $v };
 	}
-	PublicInbox::Admin::index_inbox($ibx, undef, $ibx_opt);
+	my $nidx = PublicInbox::Admin::index_inbox($ibx, undef, $ibx_opt);
 	last if $ibx_opt->{quit};
 	if (my $copt = $opt->{compact_opt}) {
 		local $copt->{jobs} = 0 if $ibx_opt->{sequential_shard};
 		PublicInbox::Xapcmd::run($ibx, 'compact', $copt);
 	}
-	next if $ibx->{-unconfigured};
 	last if $ibx_opt->{quit};
-	for my $dir (@eidx_dir) {
-		my $eidx = PublicInbox::ExtSearchIdx->new($dir);
+	next if $ibx->{-unconfigured} || !$nidx;
+	for my $eidx (@eidx) {
 		$eidx->attach_inbox($ibx);
-		$eidx->eidx_sync($ibx_opt);
-		last if $ibx_opt->{quit};
 	}
 }
+$opt->{-no_fsync} = 1 if !$opt->{fsync};
+my $pr = $opt->{-progress};
+for my $eidx (@eidx) {
+	$pr->("indexing $eidx->{topdir} ...\n") if $pr;
+	$eidx->eidx_sync($opt);
+	last if $opt->{quit};
+}

  parent reply	other threads:[~2020-12-25 10:21 UTC|newest]

Thread overview: 13+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2020-12-25 10:21 [PATCH 0/7] index + extindex interaction improvements Eric Wong
2020-12-25 10:21 ` [PATCH 1/7] index: disable --fast-noop on --reindex Eric Wong
2020-12-25 10:21 ` [PATCH 2/7] extsearchidx: delay SQLite availability checks Eric Wong
2020-12-25 10:21 ` [PATCH 3/7] extsearchidx: close DB handles after use if FD constrained Eric Wong
2020-12-25 10:21 ` Eric Wong [this message]
2020-12-25 10:21 ` [PATCH 5/7] index: fix --no-fsync flag propagation to extindex Eric Wong
2020-12-25 10:21 ` [PATCH 6/7] v2writable: don't verify tip if reindexing Eric Wong
2020-12-25 10:21 ` [PATCH 7/7] index: filter out indexlevel=basic from extindex Eric Wong
2020-12-25 10:39 ` [PATCH 0/7] index + extindex interaction improvements Eric Wong
2020-12-26  1:44   ` [PATCH 0/3] extindex --watch support Eric Wong
2020-12-26  1:44     ` [PATCH 1/3] default to CORE::warn in $SIG{__WARN__} handlers Eric Wong
2020-12-26  1:44     ` [PATCH 2/3] extindex: --watch for inotify-based updates Eric Wong
2020-12-26  1:44     ` [PATCH 3/3] init: use the return value of rel2abs_collapsed Eric Wong

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

  List information: https://public-inbox.org/README

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20201225102115.6745-5-e@80x24.org \
    --to=e@80x24.org \
    --cc=meta@public-inbox.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox,
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).