* [PATCH] extindex: support --reindex --fast
@ 2021-10-09 12:04 Eric Wong
2021-10-09 12:15 ` Eric Wong
0 siblings, 1 reply; 2+ messages in thread
From: Eric Wong @ 2021-10-09 12:04 UTC (permalink / raw)
To: meta
This mode only checks history for missed/stale messages
and doesn't attempt to reindex messages which are already
indexed.
---
Documentation/public-inbox-extindex.pod | 22 ++++++++++++++++++++++
lib/PublicInbox/ExtSearchIdx.pm | 6 +++---
script/public-inbox-extindex | 9 ++++++---
t/extsearch.t | 5 +++++
4 files changed, 36 insertions(+), 6 deletions(-)
diff --git a/Documentation/public-inbox-extindex.pod b/Documentation/public-inbox-extindex.pod
index 2e2e6383b79b..a0fca83c6255 100644
--- a/Documentation/public-inbox-extindex.pod
+++ b/Documentation/public-inbox-extindex.pod
@@ -40,6 +40,28 @@ C<indexlevel> set to C<basic> and their respective Xapian
public-inboxes where cross-posting is common, this allows
significant space savings on Xapian indices.
+=item --gc
+
+Perform garbage collection instead of indexing. Use this if
+inboxes are removed from the extindex, or if messages are
+purged or removed from some inboxes.
+
+=item --reindex
+
+Forces a re-index of all messages in the extindex. This can be
+used for in-place upgrades and bugfixes while read-only server
+processes are utilizing the index. Keep in mind this roughly
+doubles the size of the already-large Xapian database.
+
+The extindex locks will be released roughly every 10s to
+allow L<public-inbox-mda(1)> and L<public-inbox-watch(1)>
+processes to write to the extindex.
+
+=item --fast
+
+Used with C<--reindex>, it will only look for new and stale
+entries and not touch already-indexed messages.
+
=back
=head1 FILES
diff --git a/lib/PublicInbox/ExtSearchIdx.pm b/lib/PublicInbox/ExtSearchIdx.pm
index 7cc8dd952559..20c4cf7807ea 100644
--- a/lib/PublicInbox/ExtSearchIdx.pm
+++ b/lib/PublicInbox/ExtSearchIdx.pm
@@ -811,7 +811,7 @@ sub _reindex_check_unseen ($$$) {
local $sync->{-regen_fmt} =
"$ekey checking unseen %u/".$ibx->over->max."\n";
${$sync->{nr}} = 0;
-
+ my $fast = $sync->{-opt}->{fast};
while (scalar(@{$msgs = $ibx->over->query_xover($beg, $end)})) {
${$sync->{nr}} = $beg;
$beg = $msgs->[-1]->{num} + 1;
@@ -835,7 +835,7 @@ ibx_id = ? AND xnum = ? AND oidbin = ?
# the first time around ASAP:
if (scalar(@$docids) == 0) {
reindex_unseen($self, $sync, $ibx, $xsmsg);
- } else { # already seen, reindex later
+ } elsif (!$fast) { # already seen, reindex later
for my $r (@$docids) {
$self->{oidx}->eidxq_add($r->[0]);
}
@@ -853,7 +853,7 @@ sub _reindex_check_stale ($$$) {
my $fetching;
my $ekey = $ibx->eidx_key;
local $sync->{-regen_fmt} =
- "$ekey check stale/missing %u/".$ibx->over->max."\n";
+ "$ekey checking stale/missing %u/".$ibx->over->max."\n";
${$sync->{nr}} = 0;
do {
if (checkpoint_due($sync)) {
diff --git a/script/public-inbox-extindex b/script/public-inbox-extindex
index 1572a1d23d82..c63f5dc26fd2 100755
--- a/script/public-inbox-extindex
+++ b/script/public-inbox-extindex
@@ -18,6 +18,8 @@ usage: public-inbox-extindex [options] [EXTINDEX_DIR] [INBOX_DIR...]
--max-size=BYTES do not index messages larger than the given size
--gc perform garbage collection instead of indexing
--dedupe[=MSGID] fix prior deduplication errors (may be repeated)
+ --reindex index previously indexed inboxes
+ --fast only reindex unseen/stale messages
--verbose | -v increase verbosity (may be repeated)
--dry-run | -n dry-run on --dedupe
@@ -26,7 +28,7 @@ See public-inbox-extindex(1) man page for full documentation.
EOF
my $opt = { quiet => -1, compact => 0, fsync => 1, scan => 1 };
GetOptions($opt, qw(verbose|v+ reindex rethread compact|c+ jobs|j=i
- fsync|sync!
+ fsync|sync! fast
indexlevel|index-level|L=s max_size|max-size=s
batch_size|batch-size=s
dedupe:s@ gc commit-interval=i watch scan! dry-run|n
@@ -59,9 +61,10 @@ if ($opt->{gc}) {
} else {
@ibxs = PublicInbox::Admin::resolve_inboxes(\@ARGV, $opt, $cfg);
}
-if ($opt->{'dry-run'} && !$opt->{dedupe}) {
+$opt->{'dry-run'} && !$opt->{dedupe} and
die "E: --dry-run only affects --dedupe\n";
-}
+$opt->{fast} && !$opt->{reindex} and
+ die "E: --fast only affects --reindex\n";
PublicInbox::Admin::require_or_die(qw(-search));
PublicInbox::Config::json() or die "Cpanel::JSON::XS or similar missing\n";
diff --git a/t/extsearch.t b/t/extsearch.t
index ca586f61c29f..896e270414bd 100644
--- a/t/extsearch.t
+++ b/t/extsearch.t
@@ -336,6 +336,11 @@ if ('reindex catches missed messages') {
$es->{xdb}->reopen;
$mset = $es->mset("mid:$new->{mid}");
is($mset->size, 0, 'stale mid gone Xapian');
+
+ ok(run_script([qw(-extindex --reindex --all --fast), "$home/extindex"],
+ undef, $opt), '--reindex w/ --fast');
+ ok(!run_script([qw(-extindex --all --fast), "$home/extindex"],
+ undef, $opt), '--fast alone makes no sense');
}
if ('reindex catches content bifurcation') {
^ permalink raw reply related [flat|nested] 2+ messages in thread
* Re: [PATCH] extindex: support --reindex --fast
2021-10-09 12:04 [PATCH] extindex: support --reindex --fast Eric Wong
@ 2021-10-09 12:15 ` Eric Wong
0 siblings, 0 replies; 2+ messages in thread
From: Eric Wong @ 2021-10-09 12:15 UTC (permalink / raw)
To: meta
"fast" is relative of course. Still takes 30-40 minutes, but
that's better than 30-40 hours... Though I could see about
parallelizing it.
^ permalink raw reply [flat|nested] 2+ messages in thread
end of thread, other threads:[~2021-10-09 12:15 UTC | newest]
Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-10-09 12:04 [PATCH] extindex: support --reindex --fast Eric Wong
2021-10-09 12:15 ` Eric Wong
This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox,
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).