* [PATCH 01/10] extsearch: rename -eindex to -extindex
2020-11-07 10:56 [PATCH 00/10] extindex: another round of updates Eric Wong
@ 2020-11-07 10:56 ` Eric Wong
2020-11-07 10:56 ` [PATCH 02/10] extsearchidx: avoid needless alternates rewrite in ALL.git Eric Wong
` (8 subsequent siblings)
9 siblings, 0 replies; 11+ messages in thread
From: Eric Wong @ 2020-11-07 10:56 UTC (permalink / raw)
To: meta
"eindex" rhymes with "reindex", which could be confusing;
so rename the command and config prefix to "extindex", which
is hopefully less confusing.
---
MANIFEST | 2 +-
lib/PublicInbox/Config.pm | 2 +-
script/{public-inbox-eindex => public-inbox-extindex} | 4 ++--
t/extsearch.t | 6 +++---
4 files changed, 7 insertions(+), 7 deletions(-)
rename script/{public-inbox-eindex => public-inbox-extindex} (93%)
diff --git a/MANIFEST b/MANIFEST
index 10561cd2..fc79a134 100644
--- a/MANIFEST
+++ b/MANIFEST
@@ -225,7 +225,7 @@ sa_config/user/.spamassassin/user_prefs
script/public-inbox-compact
script/public-inbox-convert
script/public-inbox-edit
-script/public-inbox-eindex
+script/public-inbox-extindex
script/public-inbox-httpd
script/public-inbox-imapd
script/public-inbox-index
diff --git a/lib/PublicInbox/Config.pm b/lib/PublicInbox/Config.pm
index d425cc9b..d2010f7a 100644
--- a/lib/PublicInbox/Config.pm
+++ b/lib/PublicInbox/Config.pm
@@ -91,7 +91,7 @@ sub lookup_name ($$) {
sub lookup_ei {
my ($self, $name) = @_;
- $self->{-ei_by_name}->{$name} //= _fill_ei($self, "eindex.$name");
+ $self->{-ei_by_name}->{$name} //= _fill_ei($self, "extindex.$name");
}
sub each_inbox {
diff --git a/script/public-inbox-eindex b/script/public-inbox-extindex
similarity index 93%
rename from script/public-inbox-eindex
rename to script/public-inbox-extindex
index c26edb93..a58f35ca 100644
--- a/script/public-inbox-eindex
+++ b/script/public-inbox-extindex
@@ -6,7 +6,7 @@ use strict;
use v5.10.1;
use Getopt::Long qw(:config gnu_getopt no_ignore_case auto_abbrev);
my $help = <<EOF; # the following should fit w/o scrolling in 80x24 term:
-usage: public-inbox-eindex [options] EINDEX_DIR [INBOX_DIR]
+usage: public-inbox-extindex [options] EXTINDEX_DIR [INBOX_DIR]
Create and update external (detached) search indices
@@ -19,7 +19,7 @@ usage: public-inbox-eindex [options] EINDEX_DIR [INBOX_DIR]
--verbose | -v increase verbosity (may be repeated)
BYTES may use `k', `m', and `g' suffixes (e.g. `10m' for 10 megabytes)
-See public-inbox-eindex(1) man page for full documentation.
+See public-inbox-extindex(1) man page for full documentation.
EOF
my $opt = { quiet => -1, compact => 0, max_size => undef, fsync => 1 };
GetOptions($opt, qw(verbose|v+ reindex rethread compact|c+ jobs|j=i
diff --git a/t/extsearch.t b/t/extsearch.t
index 8d2c1507..8792fd9e 100644
--- a/t/extsearch.t
+++ b/t/extsearch.t
@@ -33,7 +33,7 @@ seek($fh, 0, SEEK_SET) or BAIL_OUT $!;
run_script(['-mda', '--no-precheck'], $env, { 0 => $fh }) or BAIL_OUT '-mda';
run_script(['-index', "$home/v1test"]) or BAIL_OUT "index $?";
-ok(run_script([qw(-eindex --all), "$home/eindex"]), 'eindex init');
+ok(run_script([qw(-extindex --all), "$home/eindex"]), 'extindex init');
my $es = PublicInbox::ExtSearch->new("$home/eindex");
{
@@ -53,8 +53,8 @@ my $es = PublicInbox::ExtSearch->new("$home/eindex");
my $env = { MAIL_EDITOR => "$^X -i -p -e 's/test message/BEST MSG/'" };
my $cmd = [ qw(-edit -Ft/utf8.eml), "$home/v2test" ];
ok(run_script($cmd, $env, $opt), '-edit');
- ok(run_script([qw(-eindex --all), "$home/eindex"], undef, $opt),
- 'eindex again');
+ ok(run_script([qw(-extindex --all), "$home/eindex"], undef, $opt),
+ 'extindex again');
like($err, qr/discontiguous range/, 'warned about discontiguous range');
my $msg1 = $es->over->get_art(1) or BAIL_OUT 'msg1 missing';
my $msg2 = $es->over->get_art(2) or BAIL_OUT 'msg2 missing';
^ permalink raw reply related [flat|nested] 11+ messages in thread
* [PATCH 02/10] extsearchidx: avoid needless alternates rewrite in ALL.git
2020-11-07 10:56 [PATCH 00/10] extindex: another round of updates Eric Wong
2020-11-07 10:56 ` [PATCH 01/10] extsearch: rename -eindex to -extindex Eric Wong
@ 2020-11-07 10:56 ` Eric Wong
2020-11-07 10:56 ` [PATCH 03/10] searchidxshard: reduce syscalls when writing ->eidx_key Eric Wong
` (7 subsequent siblings)
9 siblings, 0 replies; 11+ messages in thread
From: Eric Wong @ 2020-11-07 10:56 UTC (permalink / raw)
To: meta
As with fill_alternates in V2Writable, we do not need to update
$GIT_DIR/objects/info/alternates if nothing is changed.
---
lib/PublicInbox/ExtSearchIdx.pm | 7 +++++--
1 file changed, 5 insertions(+), 2 deletions(-)
diff --git a/lib/PublicInbox/ExtSearchIdx.pm b/lib/PublicInbox/ExtSearchIdx.pm
index 9d576adb..9da42538 100644
--- a/lib/PublicInbox/ExtSearchIdx.pm
+++ b/lib/PublicInbox/ExtSearchIdx.pm
@@ -380,8 +380,11 @@ sub idx_init { # similar to V2Writable
$new{$line} = 1;
push @new, $line;
}
- push @old, @new;
- PublicInbox::V2Writable::write_alternates($info_dir, $mode, \@old);
+ if (scalar @new) {
+ push @old, @new;
+ my $o = \@old;
+ PublicInbox::V2Writable::write_alternates($info_dir, $mode, $o);
+ }
$self->parallel_init($self->{indexlevel});
$self->umask_prepare;
$self->with_umask(\&PublicInbox::V2Writable::_idx_init, $self, $opt);
^ permalink raw reply related [flat|nested] 11+ messages in thread
* [PATCH 03/10] searchidxshard: reduce syscalls when writing ->eidx_key
2020-11-07 10:56 [PATCH 00/10] extindex: another round of updates Eric Wong
2020-11-07 10:56 ` [PATCH 01/10] extsearch: rename -eindex to -extindex Eric Wong
2020-11-07 10:56 ` [PATCH 02/10] extsearchidx: avoid needless alternates rewrite in ALL.git Eric Wong
@ 2020-11-07 10:56 ` Eric Wong
2020-11-07 10:56 ` [PATCH 04/10] searchidxshard: further improve {current_info} readability Eric Wong
` (6 subsequent siblings)
9 siblings, 0 replies; 11+ messages in thread
From: Eric Wong @ 2020-11-07 10:56 UTC (permalink / raw)
To: meta
We use ->autoflush(1) on this pipe to ensure the shard workers
see data immediately on print; so this means we have to do our
own buffering for optional data.
---
lib/PublicInbox/SearchIdxShard.pm | 7 ++-----
1 file changed, 2 insertions(+), 5 deletions(-)
diff --git a/lib/PublicInbox/SearchIdxShard.pm b/lib/PublicInbox/SearchIdxShard.pm
index e194b7e0..9566d234 100644
--- a/lib/PublicInbox/SearchIdxShard.pm
+++ b/lib/PublicInbox/SearchIdxShard.pm
@@ -114,14 +114,11 @@ sub shard_worker_loop ($$$$$) {
sub index_raw {
my ($self, $msgref, $eml, $smsg, $ibx) = @_;
if (my $w = $self->{w}) {
- if ($ibx) {
- print $w 'X=', $ibx->eidx_key, "\0" or die
- "failed to write shard: $!\n";
- }
+ my @ekey = $ibx ? ('X='.$ibx->eidx_key."\0") : ();
$msgref //= \($eml->as_string);
$smsg->{raw_bytes} //= length($$msgref);
# mid must be last, it can contain spaces (but not LF)
- print $w join(' ', @$smsg{qw(raw_bytes bytes
+ print $w @ekey, join(' ', @$smsg{qw(raw_bytes bytes
num blob ds ts tid mid)}),
"\n", $$msgref or die "failed to write shard $!\n";
} else {
^ permalink raw reply related [flat|nested] 11+ messages in thread
* [PATCH 04/10] searchidxshard: further improve {current_info} readability
2020-11-07 10:56 [PATCH 00/10] extindex: another round of updates Eric Wong
` (2 preceding siblings ...)
2020-11-07 10:56 ` [PATCH 03/10] searchidxshard: reduce syscalls when writing ->eidx_key Eric Wong
@ 2020-11-07 10:56 ` Eric Wong
2020-11-07 10:56 ` [PATCH 05/10] v2writable: less expensive checkpoint for extindex Eric Wong
` (5 subsequent siblings)
9 siblings, 0 replies; 11+ messages in thread
From: Eric Wong @ 2020-11-07 10:56 UTC (permalink / raw)
To: meta
Add a space after \0 to visually disambiguate it from the
{bytes} field.
---
lib/PublicInbox/SearchIdxShard.pm | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/lib/PublicInbox/SearchIdxShard.pm b/lib/PublicInbox/SearchIdxShard.pm
index 9566d234..1333b305 100644
--- a/lib/PublicInbox/SearchIdxShard.pm
+++ b/lib/PublicInbox/SearchIdxShard.pm
@@ -89,7 +89,7 @@ sub shard_worker_loop ($$$$$) {
my $eidx_key;
if ($line =~ s/\AX=(.+)\0//) {
$eidx_key = $1;
- $v2w->{current_info} =~ s/\0/\\0/;
+ $v2w->{current_info} =~ s/\0/\\0 /;
}
# n.b. $mid may contain spaces(!)
my ($len, $bytes, $num, $oid, $ds, $ts, $tid, $mid)
^ permalink raw reply related [flat|nested] 11+ messages in thread
* [PATCH 05/10] v2writable: less expensive checkpoint for extindex
2020-11-07 10:56 [PATCH 00/10] extindex: another round of updates Eric Wong
` (3 preceding siblings ...)
2020-11-07 10:56 ` [PATCH 04/10] searchidxshard: further improve {current_info} readability Eric Wong
@ 2020-11-07 10:56 ` Eric Wong
2020-11-07 10:56 ` [PATCH 06/10] extsearchidx: quiet warning for unindexed `d' messages Eric Wong
` (4 subsequent siblings)
9 siblings, 0 replies; 11+ messages in thread
From: Eric Wong @ 2020-11-07 10:56 UTC (permalink / raw)
To: meta
Since extindex holds no locks on parallel inbox writers,
we can simply use "barrier" IPC shard commands to checkpoint
and avoid respawning shard or git processes.
---
lib/PublicInbox/V2Writable.pm | 13 +++++++++----
1 file changed, 9 insertions(+), 4 deletions(-)
diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm
index 0364857f..224675ab 100644
--- a/lib/PublicInbox/V2Writable.pm
+++ b/lib/PublicInbox/V2Writable.pm
@@ -620,13 +620,13 @@ sub checkpoint ($;$) {
# Now deal with Xapian
if ($wait) {
- my $barrier = $self->barrier_init(scalar @$shards);
+ my $barrier = barrier_init($self, scalar @$shards);
# each shard needs to issue a barrier command
$_->shard_barrier for @$shards;
# wait for each Xapian shard
- $self->barrier_wait($barrier);
+ barrier_wait($self, $barrier);
} else {
$_->shard_commit for @$shards;
}
@@ -860,11 +860,16 @@ sub atfork_child {
sub reindex_checkpoint ($$) {
my ($self, $sync) = @_;
- $self->git->cleanup; # *async_wait
+ $self->git->async_wait_all;
${$sync->{need_checkpoint}} = 0;
my $mm_tmp = $sync->{mm_tmp};
$mm_tmp->atfork_prepare if $mm_tmp;
- $self->done; # release lock
+ die 'BUG: {im} during reindex' if $self->{im};
+ if ($self->{ibx_map}) {
+ checkpoint($self, 1); # no need to release lock on pure index
+ } else {
+ $self->done; # release lock
+ }
if (my $pr = $sync->{-opt}->{-progress}) {
$pr->(sprintf($sync->{-regen_fmt}, ${$sync->{nr}}));
^ permalink raw reply related [flat|nested] 11+ messages in thread
* [PATCH 06/10] extsearchidx: quiet warning for unindexed `d' messages
2020-11-07 10:56 [PATCH 00/10] extindex: another round of updates Eric Wong
` (4 preceding siblings ...)
2020-11-07 10:56 ` [PATCH 05/10] v2writable: less expensive checkpoint for extindex Eric Wong
@ 2020-11-07 10:56 ` Eric Wong
2020-11-07 10:56 ` [PATCH 07/10] extsearch: canonicalize topdir Eric Wong
` (3 subsequent siblings)
9 siblings, 0 replies; 11+ messages in thread
From: Eric Wong @ 2020-11-07 10:56 UTC (permalink / raw)
To: meta
"deleted" messages (via -learn <spam|rm>) in the source inboxes
are likely to already be unindexed, so avoid triggering needless
warnings about the spam message being missing.
---
lib/PublicInbox/ExtSearchIdx.pm | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/lib/PublicInbox/ExtSearchIdx.pm b/lib/PublicInbox/ExtSearchIdx.pm
index 9da42538..2bb9afce 100644
--- a/lib/PublicInbox/ExtSearchIdx.pm
+++ b/lib/PublicInbox/ExtSearchIdx.pm
@@ -164,7 +164,8 @@ sub do_finalize ($) {
} elsif (exists $req->{new_smsg}) { # totally unseen messsage
index_unseen($req);
} else {
- warn "W: ignoring delete $req->{oid} (not found)\n";
+ # `d' message was already unindexed in the v1/v2 inboxes,
+ # so it's too noisy to warn, here.
}
}
^ permalink raw reply related [flat|nested] 11+ messages in thread
* [PATCH 07/10] extsearch: canonicalize topdir
2020-11-07 10:56 [PATCH 00/10] extindex: another round of updates Eric Wong
` (5 preceding siblings ...)
2020-11-07 10:56 ` [PATCH 06/10] extsearchidx: quiet warning for unindexed `d' messages Eric Wong
@ 2020-11-07 10:56 ` Eric Wong
2020-11-07 10:56 ` [PATCH 08/10] v2writable: more accurate {current_info} warnings/progress Eric Wong
` (2 subsequent siblings)
9 siblings, 0 replies; 11+ messages in thread
From: Eric Wong @ 2020-11-07 10:56 UTC (permalink / raw)
To: meta
This makes `ps' output look a bit nicer if there are trailing
slashes involved from the command-line.
---
lib/PublicInbox/ExtSearch.pm | 2 ++
lib/PublicInbox/ExtSearchIdx.pm | 1 +
2 files changed, 3 insertions(+)
diff --git a/lib/PublicInbox/ExtSearch.pm b/lib/PublicInbox/ExtSearch.pm
index 66c99eaa..eb665027 100644
--- a/lib/PublicInbox/ExtSearch.pm
+++ b/lib/PublicInbox/ExtSearch.pm
@@ -9,12 +9,14 @@ use strict;
use v5.10.1;
use PublicInbox::Over;
use PublicInbox::Inbox;
+use File::Spec ();
# for ->reopen, ->mset, ->mset_to_artnums
use parent qw(PublicInbox::Search);
sub new {
my (undef, $topdir) = @_;
+ $topdir = File::Spec->canonpath($topdir);
bless {
topdir => $topdir,
# xpfx => 'ei15'
diff --git a/lib/PublicInbox/ExtSearchIdx.pm b/lib/PublicInbox/ExtSearchIdx.pm
index 2bb9afce..3e7f5604 100644
--- a/lib/PublicInbox/ExtSearchIdx.pm
+++ b/lib/PublicInbox/ExtSearchIdx.pm
@@ -30,6 +30,7 @@ use File::Spec;
sub new {
my (undef, $dir, $opt) = @_;
+ $dir = File::Spec->canonpath($dir);
my $l = $opt->{indexlevel} // 'full';
$l !~ $PublicInbox::SearchIdx::INDEXLEVELS and
die "invalid indexlevel=$l\n";
^ permalink raw reply related [flat|nested] 11+ messages in thread
* [PATCH 08/10] v2writable: more accurate {current_info} warnings/progress
2020-11-07 10:56 [PATCH 00/10] extindex: another round of updates Eric Wong
` (6 preceding siblings ...)
2020-11-07 10:56 ` [PATCH 07/10] extsearch: canonicalize topdir Eric Wong
@ 2020-11-07 10:56 ` Eric Wong
2020-11-07 10:56 ` [PATCH 09/10] extindex: SIGUSR1 supports checkpoint Eric Wong
2020-11-07 10:57 ` [PATCH 10/10] extindex: fix --batch-size support Eric Wong
9 siblings, 0 replies; 11+ messages in thread
From: Eric Wong @ 2020-11-07 10:56 UTC (permalink / raw)
To: meta
With async git blob retrievals, the OID being enqueued and the
OID being processed can be totally unrelated and misleading.
We'll also prefix $INBOX_DIR for v2, and not just the epoch
since we could be indexing multiple inboxes via both -index
and -extindex.
---
lib/PublicInbox/ExtSearchIdx.pm | 7 +++++++
lib/PublicInbox/V2Writable.pm | 24 ++++++++++++++++++------
script/public-inbox-extindex | 1 +
3 files changed, 26 insertions(+), 6 deletions(-)
diff --git a/lib/PublicInbox/ExtSearchIdx.pm b/lib/PublicInbox/ExtSearchIdx.pm
index 3e7f5604..50342802 100644
--- a/lib/PublicInbox/ExtSearchIdx.pm
+++ b/lib/PublicInbox/ExtSearchIdx.pm
@@ -250,17 +250,22 @@ sub cur_ibx_xnum ($$) {
sub index_oid { # git->cat_async callback for 'm'
my ($bref, $oid, $type, $size, $req) = @_;
+ my $self = $req->{self};
+ local $self->{current_info} = "$self->{current_info} $oid";
return if is_bad_blob($oid, $type, $size, $req->{oid});
my $new_smsg = $req->{new_smsg} = bless {
blob => $oid,
}, 'PublicInbox::Smsg';
$new_smsg->{bytes} = $size + crlf_adjust($$bref);
defined($req->{xnum} = cur_ibx_xnum($req, $bref)) or return;
+ ++${$req->{nr}};
do_step($req);
}
sub unindex_oid { # git->cat_async callback for 'd'
my ($bref, $oid, $type, $size, $req) = @_;
+ my $self = $req->{self};
+ local $self->{current_info} = "$self->{current_info} $oid";
return if is_bad_blob($oid, $type, $size, $req->{oid});
return if defined(cur_ibx_xnum($req, $bref)); # was re-added
do_step($req);
@@ -286,6 +291,8 @@ sub _sync_inbox ($$$) {
-opt => $opt,
self => $self,
ibx => $ibx,
+ nr => \(my $nr = 0),
+ -regen_fmt => "%u/?\n",
};
my $v = $ibx->version;
my $ekey = $ibx->eidx_key;
diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm
index 224675ab..18f33655 100644
--- a/lib/PublicInbox/V2Writable.pm
+++ b/lib/PublicInbox/V2Writable.pm
@@ -882,12 +882,13 @@ sub reindex_checkpoint ($$) {
sub index_oid { # cat_async callback
my ($bref, $oid, $type, $size, $arg) = @_;
+ my $self = $arg->{self};
+ local $self->{current_info} = "$self->{current_info} $oid";
return if $size == 0; # purged
my ($num, $mid0);
my $eml = PublicInbox::Eml->new($$bref);
my $mids = mids($eml);
my $chash = content_hash($eml);
- my $self = $arg->{self};
if (scalar(@$mids) == 0) {
warn "E: $oid has no Message-ID, skipping\n";
@@ -1047,6 +1048,11 @@ sub sync_prepare ($$) {
my $pr = $sync->{-opt}->{-progress};
my $regen_max = 0;
my $head = $sync->{ibx}->{ref_head} || 'HEAD';
+ my $pfx;
+ if ($pr) {
+ ($pfx) = ($sync->{ibx}->{inboxdir} =~ m!([^/]+)\z!g);
+ $pfx //= $sync->{ibx}->{inboxdir};
+ }
# reindex stops at the current heads and we later rerun index_sync
# without {reindex}
@@ -1068,7 +1074,7 @@ sub sync_prepare ($$) {
my $range = log_range($sync, $unit, $tip) or next;
# can't use 'rev-list --count' if we use --diff-filter
- $pr->("$i.git counting $range ... ") if $pr;
+ $pr->("$pfx $i.git counting $range ... ") if $pr;
# Don't bump num_highwater on --reindex by using {D}.
# We intentionally do NOT use {D} in the non-reindex case
# because we want NNTP article number gaps from unindexed
@@ -1086,10 +1092,10 @@ sub sync_prepare ($$) {
# our code and blindly injects "d" file history into git repos
if (my @leftovers = keys %{delete($sync->{D}) // {}}) {
warn('W: unindexing '.scalar(@leftovers)." leftovers\n");
+ local $self->{current_info} = 'leftover ';
my $unindex_oid = $self->can('unindex_oid');
for my $oid (@leftovers) {
$oid = unpack('H*', $oid);
- $self->{current_info} = "leftover $oid";
my $req = { %$sync, oid => $oid };
$self->git->cat_async($oid, $unindex_oid, $req);
}
@@ -1121,6 +1127,7 @@ sub unindex_oid_aux ($$$) {
sub unindex_oid ($$;$) { # git->cat_async callback
my ($bref, $oid, $type, $size, $sync) = @_;
my $self = $sync->{self};
+ local $self->{current_info} = "$self->{current_info} $oid";
my $unindexed = $sync->{in_unindex} ? $sync->{unindexed} : undef;
my $mm = $self->{mm};
my $mids = mids(PublicInbox::Eml->new($bref));
@@ -1230,10 +1237,15 @@ sub index_todo ($$$) {
my $all = $self->git;
my $index_oid = $self->can('index_oid');
my $unindex_oid = $self->can('unindex_oid');
- my ($pfx) = ($unit->{git}->{git_dir} =~ m!/([^/]+)\z!g);
- $pfx //= $unit->{git}->{git_dir};
+ my $pfx;
+ if ($unit->{git}->{git_dir} =~ m!/([^/]+)/git/([0-9]+\.git)\z!) {
+ $pfx = "$1 $2"; # v2
+ } else { # v1
+ ($pfx) = ($unit->{git}->{git_dir} =~ m!/([^/]+)\z!g);
+ $pfx //= $unit->{git}->{git_dir};
+ }
+ local $self->{current_info} = "$pfx ";
while (my ($f, $at, $ct, $oid) = $stk->pop_rec) {
- $self->{current_info} = "$pfx $oid";
my $req = { %$sync, autime => $at, cotime => $ct, oid => $oid };
if ($f eq 'm') {
if ($sync->{max_size}) {
diff --git a/script/public-inbox-extindex b/script/public-inbox-extindex
index a58f35ca..bb1e174a 100644
--- a/script/public-inbox-extindex
+++ b/script/public-inbox-extindex
@@ -37,6 +37,7 @@ require PublicInbox::Admin;
my $cfg = PublicInbox::Config->new;
my @ibxs = PublicInbox::Admin::resolve_inboxes(\@ARGV, $opt, $cfg);
PublicInbox::Admin::require_or_die(qw(-search));
+PublicInbox::Admin::progress_prepare($opt);
require PublicInbox::ExtSearchIdx;
my $eidx = PublicInbox::ExtSearchIdx->new($eidx_dir, $opt);
$eidx->attach_inbox($_) for @ibxs;
^ permalink raw reply related [flat|nested] 11+ messages in thread
* [PATCH 09/10] extindex: SIGUSR1 supports checkpoint
2020-11-07 10:56 [PATCH 00/10] extindex: another round of updates Eric Wong
` (7 preceding siblings ...)
2020-11-07 10:56 ` [PATCH 08/10] v2writable: more accurate {current_info} warnings/progress Eric Wong
@ 2020-11-07 10:56 ` Eric Wong
2020-11-07 10:57 ` [PATCH 10/10] extindex: fix --batch-size support Eric Wong
9 siblings, 0 replies; 11+ messages in thread
From: Eric Wong @ 2020-11-07 10:56 UTC (permalink / raw)
To: meta
Matching the behavior of git-fast-import(1), we'll allow a user
to send SIGUSR1 to checkpoint over.sqlite3 and Xapian.
---
lib/PublicInbox/ExtSearchIdx.pm | 23 ++++++++++++-----------
script/public-inbox-extindex | 1 +
2 files changed, 13 insertions(+), 11 deletions(-)
diff --git a/lib/PublicInbox/ExtSearchIdx.pm b/lib/PublicInbox/ExtSearchIdx.pm
index 50342802..7aaf8291 100644
--- a/lib/PublicInbox/ExtSearchIdx.pm
+++ b/lib/PublicInbox/ExtSearchIdx.pm
@@ -284,16 +284,9 @@ sub last_commits {
}
sub _sync_inbox ($$$) {
- my ($self, $opt, $ibx) = @_;
- my $sync = {
- need_checkpoint => \(my $bool = 0),
- reindex => $opt->{reindex},
- -opt => $opt,
- self => $self,
- ibx => $ibx,
- nr => \(my $nr = 0),
- -regen_fmt => "%u/?\n",
- };
+ my ($self, $sync, $ibx) = @_;
+ $sync->{ibx} = $ibx;
+ $sync->{nr} = \(my $nr = 0);
my $v = $ibx->version;
my $ekey = $ibx->eidx_key;
if ($v == 2) {
@@ -324,10 +317,18 @@ sub eidx_sync { # main entry point
local $SIG{__WARN__} = sub {
$warn_cb->($self->{current_info}, ': ', @_);
};
+ my $sync = {
+ need_checkpoint => \(my $need_checkpoint = 0),
+ reindex => $opt->{reindex},
+ -opt => $opt,
+ self => $self,
+ -regen_fmt => "%u/?\n",
+ };
+ local $SIG{USR1} = sub { $need_checkpoint = 1 };
# don't use $_ here, it'll get clobbered by reindex_checkpoint
for my $ibx (@{$self->{ibx_list}}) {
- _sync_inbox($self, $opt, $ibx);
+ _sync_inbox($self, $sync, $ibx);
}
$self->{oidx}->rethread_done($opt);
diff --git a/script/public-inbox-extindex b/script/public-inbox-extindex
index bb1e174a..864a2732 100644
--- a/script/public-inbox-extindex
+++ b/script/public-inbox-extindex
@@ -33,6 +33,7 @@ die "--jobs must be >= 0\n" if defined $opt->{jobs} && $opt->{jobs} < 0;
# require lazily to speed up --help
my $eidx_dir = shift(@ARGV) // die "E: $help";
+local $SIG{USR1} = 'IGNORE'; # to be overridden in eidx_sync
require PublicInbox::Admin;
my $cfg = PublicInbox::Config->new;
my @ibxs = PublicInbox::Admin::resolve_inboxes(\@ARGV, $opt, $cfg);
^ permalink raw reply related [flat|nested] 11+ messages in thread
* [PATCH 10/10] extindex: fix --batch-size support
2020-11-07 10:56 [PATCH 00/10] extindex: another round of updates Eric Wong
` (8 preceding siblings ...)
2020-11-07 10:56 ` [PATCH 09/10] extindex: SIGUSR1 supports checkpoint Eric Wong
@ 2020-11-07 10:57 ` Eric Wong
9 siblings, 0 replies; 11+ messages in thread
From: Eric Wong @ 2020-11-07 10:57 UTC (permalink / raw)
To: meta
Calling PublicInbox::Admin::index_prepare is required for
--batch-size (k|m|g) modifiers and indexBatchSize in the config
file. Otherwise, the default 1m batch size stuck and led
to unexpectedly bad performance on a machine which could index
v2 inboxes faster with larger batch sizes.
---
script/public-inbox-extindex | 2 ++
1 file changed, 2 insertions(+)
diff --git a/script/public-inbox-extindex b/script/public-inbox-extindex
index 864a2732..5d56e2c3 100644
--- a/script/public-inbox-extindex
+++ b/script/public-inbox-extindex
@@ -39,6 +39,8 @@ my $cfg = PublicInbox::Config->new;
my @ibxs = PublicInbox::Admin::resolve_inboxes(\@ARGV, $opt, $cfg);
PublicInbox::Admin::require_or_die(qw(-search));
PublicInbox::Admin::progress_prepare($opt);
+my $env = PublicInbox::Admin::index_prepare($opt, $cfg);
+local %ENV = (%ENV, %$env) if $env;
require PublicInbox::ExtSearchIdx;
my $eidx = PublicInbox::ExtSearchIdx->new($eidx_dir, $opt);
$eidx->attach_inbox($_) for @ibxs;
^ permalink raw reply related [flat|nested] 11+ messages in thread