* [PATCH 01/26] t/convert-compact: skip on missing xapian-compact(1)
2019-05-23 9:36 [PATCH 00/26] xcpdb: ease Xapian DB format migrations Eric Wong
@ 2019-05-23 9:36 ` Eric Wong
2019-05-23 9:36 ` [PATCH 02/26] v1writable: retire in favor of InboxWritable Eric Wong
` (25 subsequent siblings)
26 siblings, 0 replies; 28+ messages in thread
From: Eric Wong @ 2019-05-23 9:36 UTC (permalink / raw)
To: meta
Can't run the test if the required Xapian tools are missing.
---
t/convert-compact.t | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/t/convert-compact.t b/t/convert-compact.t
index 491486d..b45a9b5 100644
--- a/t/convert-compact.t
+++ b/t/convert-compact.t
@@ -5,6 +5,7 @@ use warnings;
use Test::More;
use File::Temp qw/tempdir/;
use PublicInbox::MIME;
+use PublicInbox::Spawn qw(which);
require './t/common.perl';
require_git(2.6);
my @mods = qw(DBD::SQLite Search::Xapian);
@@ -12,6 +13,9 @@ foreach my $mod (@mods) {
eval "require $mod";
plan skip_all => "$mod missing for convert-compact.t" if $@;
}
+which('xapian-compact') or
+ plan skip_all => 'xapian-compact missing for '.__FILE__;
+
use_ok 'PublicInbox::V2Writable';
use PublicInbox::Import;
my $tmpdir = tempdir('convert-compact-XXXXXX', TMPDIR => 1, CLEANUP => 1);
--
EW
^ permalink raw reply related [flat|nested] 28+ messages in thread
* [PATCH 02/26] v1writable: retire in favor of InboxWritable
2019-05-23 9:36 [PATCH 00/26] xcpdb: ease Xapian DB format migrations Eric Wong
2019-05-23 9:36 ` [PATCH 01/26] t/convert-compact: skip on missing xapian-compact(1) Eric Wong
@ 2019-05-23 9:36 ` Eric Wong
2019-05-23 9:36 ` [PATCH 03/26] doc: document the reason for --no-renumber Eric Wong
` (24 subsequent siblings)
26 siblings, 0 replies; 28+ messages in thread
From: Eric Wong @ 2019-05-23 9:36 UTC (permalink / raw)
To: meta
In retrospect, introducing V1Writable was unnecessary and
InboxWritable->importer is in a better position to abstract
away differences between v1 and v2 writers.
So teach InboxWritable to initialize inboxes and get rid
of V1Writable.
---
MANIFEST | 1 -
lib/PublicInbox/InboxWritable.pm | 35 ++++++++++++++++++++++++--------
lib/PublicInbox/V1Writable.pm | 34 -------------------------------
lib/PublicInbox/V2Writable.pm | 6 +++---
script/public-inbox-init | 13 +++---------
t/cgi.t | 4 ++--
t/indexlevels-mirror.t | 5 ++---
t/init.t | 4 ++--
t/nntpd.t | 15 +++-----------
t/v2mirror.t | 1 +
10 files changed, 43 insertions(+), 75 deletions(-)
delete mode 100644 lib/PublicInbox/V1Writable.pm
diff --git a/MANIFEST b/MANIFEST
index 2c356c6..2b101fa 100644
--- a/MANIFEST
+++ b/MANIFEST
@@ -126,7 +126,6 @@ lib/PublicInbox/SpawnPP.pm
lib/PublicInbox/Syscall.pm
lib/PublicInbox/Unsubscribe.pm
lib/PublicInbox/UserContent.pm
-lib/PublicInbox/V1Writable.pm
lib/PublicInbox/V2Writable.pm
lib/PublicInbox/View.pm
lib/PublicInbox/ViewDiff.pm
diff --git a/lib/PublicInbox/InboxWritable.pm b/lib/PublicInbox/InboxWritable.pm
index 2f1ca6f..116f423 100644
--- a/lib/PublicInbox/InboxWritable.pm
+++ b/lib/PublicInbox/InboxWritable.pm
@@ -19,25 +19,44 @@ use constant {
};
sub new {
- my ($class, $ibx) = @_;
- bless $ibx, $class;
+ my ($class, $ibx, $creat_opt) = @_;
+ my $self = bless $ibx, $class;
+
+ # TODO: maybe stop supporting this
+ if ($creat_opt) { # for { nproc => $N }
+ $self->{-creat_opt} = $creat_opt;
+ init_inbox($self) if ($self->{version} || 1) == 1;
+ }
+ $self;
+}
+
+sub init_inbox {
+ my ($self, $partitions, $skip_epoch, $skip_artnum) = @_;
+ # TODO: honor skip_artnum
+ my $v = $self->{version} || 1;
+ if ($v == 1) {
+ my $dir = $self->{mainrepo} or die "no mainrepo in inbox\n";
+ PublicInbox::Import::init_bare($dir);
+ } else {
+ my $v2w = importer($self);
+ $v2w->init_inbox($partitions, $skip_epoch, $skip_artnum);
+ }
}
sub importer {
my ($self, $parallel) = @_;
- $self->{-importer} ||= eval {
+ $self->{-importer} ||= do {
my $v = $self->{version} || 1;
if ($v == 2) {
eval { require PublicInbox::V2Writable };
die "v2 not supported: $@\n" if $@;
- my $v2w = PublicInbox::V2Writable->new($self);
+ my $opt = $self->{-creat_opt};
+ my $v2w = PublicInbox::V2Writable->new($self, $opt);
$v2w->{parallel} = $parallel;
$v2w;
} elsif ($v == 1) {
- my $git = $self->git;
- my $name = $self->{name};
- my $addr = $self->{-primary_address};
- PublicInbox::Import->new($git, $name, $addr, $self);
+ my @arg = (undef, undef, undef, $self);
+ PublicInbox::Import->new(@arg);
} else {
$! = 78; # EX_CONFIG 5.3.5 local configuration error
die "unsupported inbox version: $v\n";
diff --git a/lib/PublicInbox/V1Writable.pm b/lib/PublicInbox/V1Writable.pm
deleted file mode 100644
index 6ca5db4..0000000
--- a/lib/PublicInbox/V1Writable.pm
+++ /dev/null
@@ -1,34 +0,0 @@
-# Copyright (C) 2019 all contributors <meta@public-inbox.org>
-# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
-
-# This interface wraps PublicInbox::Import and makes it closer
-# to V2Writable
-# Used to write to V1 inboxes (see L<public-inbox-v1-format(5)>).
-package PublicInbox::V1Writable;
-use strict;
-use warnings;
-use base qw(PublicInbox::Import);
-use PublicInbox::InboxWritable;
-
-sub new {
- my ($class, $ibx, $creat) = @_;
- my $dir = $ibx->{mainrepo} or die "no mainrepo in inbox\n";
- unless (-d $dir) {
- if ($creat) {
- PublicInbox::Import::init_bare($dir);
- } else {
- die "$dir does not exist\n";
- }
- }
- $ibx = PublicInbox::InboxWritable->new($ibx);
- $class->SUPER::new(undef, undef, undef, $ibx);
-}
-
-sub init_inbox {
- my ($self, $partitions, $skip_epoch, $skip_artnum) = @_;
- # TODO: honor skip_artnum
- my $dir = $self->{-inbox}->{mainrepo} or die "no mainrepo in inbox\n";
- PublicInbox::Import::init_bare($dir);
-}
-
-1;
diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm
index afcac4d..c476cb3 100644
--- a/lib/PublicInbox/V2Writable.pm
+++ b/lib/PublicInbox/V2Writable.pm
@@ -94,13 +94,13 @@ sub new {
}
sub init_inbox {
- my ($self, $parallel, $skip) = @_;
+ my ($self, $parallel, $skip_epoch) = @_;
$self->{parallel} = $parallel;
$self->idx_init;
my $epoch_max = -1;
git_dir_latest($self, \$epoch_max);
- if (defined $skip && $epoch_max == -1) {
- $epoch_max = $skip;
+ if (defined $skip_epoch && $epoch_max == -1) {
+ $epoch_max = $skip_epoch;
}
$self->git_init($epoch_max >= 0 ? $epoch_max : 0);
$self->done;
diff --git a/script/public-inbox-init b/script/public-inbox-init
index 2cc704c..5724c52 100755
--- a/script/public-inbox-init
+++ b/script/public-inbox-init
@@ -10,7 +10,7 @@ use Getopt::Long qw/:config gnu_getopt no_ignore_case auto_abbrev/;
use PublicInbox::Admin;
PublicInbox::Admin::require_or_die('-base');
require PublicInbox::Config;
-require PublicInbox::Inbox;
+require PublicInbox::InboxWritable;
use File::Temp qw/tempfile/;
use File::Basename qw/dirname/;
use File::Path qw/mkpath/;
@@ -116,15 +116,8 @@ my $ibx = PublicInbox::Inbox->new({
indexlevel => $indexlevel,
});
-if ($version >= 2) {
- require PublicInbox::V2Writable;
- PublicInbox::V2Writable->new($ibx, 1)->init_inbox(0, $skip);
-} elsif ($version == 1) {
- require PublicInbox::V1Writable;
- PublicInbox::V1Writable->new($ibx, 1)->init_inbox(0, $skip);
-} else {
- die "Unsupported -V/--version: $version\n";
-}
+my $creat_opt = {};
+PublicInbox::InboxWritable->new($ibx, $creat_opt)->init_inbox(0, $skip);
# needed for git prior to v2.1.0
umask(0077) if defined $perm;
diff --git a/t/cgi.t b/t/cgi.t
index d3172bf..81130df 100644
--- a/t/cgi.t
+++ b/t/cgi.t
@@ -41,11 +41,11 @@ my $cfgpfx = "publicinbox.test";
use_ok 'PublicInbox::Git';
use_ok 'PublicInbox::Import';
use_ok 'PublicInbox::Inbox';
-use_ok 'PublicInbox::V1Writable';
+use_ok 'PublicInbox::InboxWritable';
use_ok 'PublicInbox::Config';
my $cfg = PublicInbox::Config->new($pi_config);
my $ibx = $cfg->lookup_name('test');
-my $im = PublicInbox::V1Writable->new($ibx);
+my $im = PublicInbox::InboxWritable->new($ibx)->importer;
{
local $ENV{HOME} = $home;
diff --git a/t/indexlevels-mirror.t b/t/indexlevels-mirror.t
index 3dd4323..d124c75 100644
--- a/t/indexlevels-mirror.t
+++ b/t/indexlevels-mirror.t
@@ -5,6 +5,7 @@ use warnings;
use Test::More;
use PublicInbox::MIME;
use PublicInbox::Inbox;
+use PublicInbox::InboxWritable;
use File::Temp qw/tempdir/;
require './t/common.perl';
require_git(2.6);
@@ -38,9 +39,7 @@ sub import_index_incremental {
-primary_address => 'test@example.com',
indexlevel => $level,
});
- my $cls = "PublicInbox::V${v}Writable";
- use_ok $cls;
- my $im = $cls->new($ibx, {nproc=>1});
+ my $im = PublicInbox::InboxWritable->new($ibx, {nproc=>1})->importer;
$mime->header_set('Message-ID', '<m@1>');
ok($im->add($mime), 'first message added');
$im->done;
diff --git a/t/init.t b/t/init.t
index 86b4eb5..79dcad1 100644
--- a/t/init.t
+++ b/t/init.t
@@ -88,7 +88,7 @@ SKIP: {
qw(http://example.com/skip1 skip1@example.com));
is(system(@cmd), 0, "--skip 1");
my $gits = [ glob("$tmpdir/skip1/git/*.git") ];
- is_deeply(["$tmpdir/skip1/git/1.git"], $gits, 'skip OK');
+ is_deeply($gits, ["$tmpdir/skip1/git/1.git"], 'skip OK');
}
@@ -96,7 +96,7 @@ SKIP: {
qw(http://example.com/skip2 skip2@example.com));
is(system(@cmd), 0, "--skip 2");
my $gits = [ glob("$tmpdir/skip2/git/*.git") ];
- is_deeply(["$tmpdir/skip2/git/2.git"], $gits, 'skipping 2 works, too');
+ is_deeply($gits, ["$tmpdir/skip2/git/2.git"], 'skipping 2 works, too');
}
done_testing();
diff --git a/t/nntpd.t b/t/nntpd.t
index c7ea319..aa62ff6 100644
--- a/t/nntpd.t
+++ b/t/nntpd.t
@@ -9,6 +9,7 @@ foreach my $mod (qw(DBD::SQLite)) {
}
require PublicInbox::SearchIdx;
require PublicInbox::Msgmap;
+require PublicInbox::InboxWritable;
use Email::Simple;
use IO::Socket;
use Socket qw(IPPROTO_TCP TCP_NODELAY);
@@ -30,9 +31,6 @@ my $group = 'test-nntpd';
my $addr = $group . '@example.com';
my $nntpd = 'blib/script/public-inbox-nntpd';
my $init = 'blib/script/public-inbox-init';
-use_ok 'PublicInbox::Import';
-use_ok 'PublicInbox::Inbox';
-use_ok 'PublicInbox::Git';
SKIP: {
skip "git 2.6+ required for V2Writable", 1 if $version == 1;
use_ok 'PublicInbox::V2Writable';
@@ -68,15 +66,8 @@ $ibx = PublicInbox::Inbox->new($ibx);
0, 'enabled newsgroup');
my $len;
- my $im;
- if ($version == 2) {
- $im = PublicInbox::V2Writable->new($ibx);
- } elsif ($version == 1) {
- use_ok 'PublicInbox::V1Writable';
- $im = PublicInbox::V1Writable->new($ibx);
- } else {
- die "unsupported version: $version";
- }
+ $ibx = PublicInbox::InboxWritable->new($ibx);
+ my $im = $ibx->importer;
# ensure successful message delivery
{
diff --git a/t/v2mirror.t b/t/v2mirror.t
index 441e36d..fe05ec4 100644
--- a/t/v2mirror.t
+++ b/t/v2mirror.t
@@ -17,6 +17,7 @@ use File::Temp qw/tempdir/;
use IO::Socket;
use POSIX qw(dup2);
use_ok 'PublicInbox::V2Writable';
+use PublicInbox::InboxWritable;
use PublicInbox::MIME;
use PublicInbox::Config;
# FIXME: too much setup
--
EW
^ permalink raw reply related [flat|nested] 28+ messages in thread
* [PATCH 03/26] doc: document the reason for --no-renumber
2019-05-23 9:36 [PATCH 00/26] xcpdb: ease Xapian DB format migrations Eric Wong
2019-05-23 9:36 ` [PATCH 01/26] t/convert-compact: skip on missing xapian-compact(1) Eric Wong
2019-05-23 9:36 ` [PATCH 02/26] v1writable: retire in favor of InboxWritable Eric Wong
@ 2019-05-23 9:36 ` Eric Wong
2019-05-23 9:36 ` [PATCH 04/26] search: reenable phrase search on non-chert Xapian Eric Wong
` (23 subsequent siblings)
26 siblings, 0 replies; 28+ messages in thread
From: Eric Wong @ 2019-05-23 9:36 UTC (permalink / raw)
To: meta
We're going to need copydatabase, too
---
Documentation/public-inbox-v1-format.pod | 4 ++++
Documentation/public-inbox-v2-format.pod | 4 ++++
script/public-inbox-compact | 2 ++
3 files changed, 10 insertions(+)
diff --git a/Documentation/public-inbox-v1-format.pod b/Documentation/public-inbox-v1-format.pod
index 2a6b8d3..3b0e70e 100644
--- a/Documentation/public-inbox-v1-format.pod
+++ b/Documentation/public-inbox-v1-format.pod
@@ -134,6 +134,10 @@ Since SCHEMA_VERSION 15 and the development of the v2 format,
the "overview" DB also exists in the xapian directory for v1
repositories. See L<public-inbox-v2-format(5)/OVERVIEW DB>
+Our use of the L</OVERVIEW DB> requires Xapian document IDs to
+remain stable. Thus, use of L<xapian-compact(1)> and
+L<copydatabase(1)> requires the C<--no-renumber> switch.
+
=item $GIT_DIR/ssoma.index
This file is no longer used or created by public-inbox, but it is
diff --git a/Documentation/public-inbox-v2-format.pod b/Documentation/public-inbox-v2-format.pod
index 7dfe329..bc58074 100644
--- a/Documentation/public-inbox-v2-format.pod
+++ b/Documentation/public-inbox-v2-format.pod
@@ -117,6 +117,10 @@ Rotational storage devices are NOT recommended for indexing of
large mail archives; but are fine for backup and usable for
small instances.
+Our use of the L</OVERVIEW DB> requires Xapian document IDs to
+remain stable. Thus, use of L<xapian-compact(1)> and
+L<copydatabase(1)> requires the C<--no-renumber> switch.
+
=head2 OVERVIEW DB
Towards the end of v2 development, it became apparent Xapian did
diff --git a/script/public-inbox-compact b/script/public-inbox-compact
index d22e403..395eec3 100755
--- a/script/public-inbox-compact
+++ b/script/public-inbox-compact
@@ -51,6 +51,8 @@ sub commit_changes ($$$) {
$im->lock_release;
remove_tree("$old/old") or die "failed to remove $old/old: $!\n";
}
+
+# we rely on --no-renumber to keep docids synched to NNTP
my @compact = qw(xapian-compact --no-renumber);
if ($v == 2) {
require PublicInbox::V2Writable;
--
EW
^ permalink raw reply related [flat|nested] 28+ messages in thread
* [PATCH 04/26] search: reenable phrase search on non-chert Xapian
2019-05-23 9:36 [PATCH 00/26] xcpdb: ease Xapian DB format migrations Eric Wong
` (2 preceding siblings ...)
2019-05-23 9:36 ` [PATCH 03/26] doc: document the reason for --no-renumber Eric Wong
@ 2019-05-23 9:36 ` Eric Wong
2019-05-23 9:36 ` [PATCH 05/26] xapcmd: new module for wrapping Xapian commands Eric Wong
` (22 subsequent siblings)
26 siblings, 0 replies; 28+ messages in thread
From: Eric Wong @ 2019-05-23 9:36 UTC (permalink / raw)
To: meta
This assumes nobody uses flint or earlier anymore,
as flint predates the existence of this project.
---
lib/PublicInbox/Search.pm | 48 +++++++++++++++++++++++----------------
t/search.t | 1 +
2 files changed, 30 insertions(+), 19 deletions(-)
diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm
index eae10d8..d861cf4 100644
--- a/lib/PublicInbox/Search.pm
+++ b/lib/PublicInbox/Search.pm
@@ -24,8 +24,8 @@ sub load_xapian () {
# n.b. FLAG_PURE_NOT is expensive not suitable for a public
# website as it could become a denial-of-service vector
- # FLAG_PHRASE also seems to cause performance problems
- # sometimes.
+ # FLAG_PHRASE also seems to cause performance problems on chert
+ # (and probably earlier Xapian DBs). glass seems fine...
# TODO: make this an option, maybe?
# or make indexlevel=medium as default
FLAG_PHRASE()|FLAG_BOOLEAN()|FLAG_LOVEHATE()|FLAG_WILDCARD();
@@ -137,26 +137,35 @@ sub xdir ($;$) {
}
}
+sub _xdb ($) {
+ my ($self) = @_;
+ my $dir = xdir($self, 1);
+ my ($xdb, $slow_phrase);
+ my $qpf = \($self->{qp_flags} ||= $QP_FLAGS);
+ if ($self->{version} >= 2) {
+ foreach my $part (<$dir/*>) {
+ -d $part && $part =~ m!/\d+\z! or next;
+ my $sub = Search::Xapian::Database->new($part);
+ if ($xdb) {
+ $xdb->add_database($sub);
+ } else {
+ $xdb = $sub;
+ }
+ $slow_phrase ||= -f "$part/iamchert";
+ }
+ } else {
+ $slow_phrase = -f "$dir/iamchert";
+ $xdb = Search::Xapian::Database->new($dir);
+ }
+ $$qpf |= FLAG_PHRASE() unless $slow_phrase;
+ $xdb;
+}
+
sub xdb ($) {
my ($self) = @_;
$self->{xdb} ||= do {
load_xapian();
- my $dir = xdir($self, 1);
- if ($self->{version} >= 2) {
- my $xdb;
- foreach my $part (<$dir/*>) {
- -d $part && $part =~ m!/\d+\z! or next;
- my $sub = Search::Xapian::Database->new($part);
- if ($xdb) {
- $xdb->add_database($sub);
- } else {
- $xdb = $sub;
- }
- }
- $xdb;
- } else {
- Search::Xapian::Database->new($dir);
- }
+ _xdb($self);
};
}
@@ -194,7 +203,8 @@ sub query {
$self->{over_ro}->recent($opts);
} else {
my $qp = qp($self);
- my $query = $qp->parse_query($query_string, $QP_FLAGS);
+ my $qp_flags = $self->{qp_flags};
+ my $query = $qp->parse_query($query_string, $qp_flags);
$opts->{relevance} = 1 unless exists $opts->{relevance};
_do_enquire($self, $query, $opts);
}
diff --git a/t/search.t b/t/search.t
index c063620..538baef 100644
--- a/t/search.t
+++ b/t/search.t
@@ -30,6 +30,7 @@ my $ro = PublicInbox::Search->new($git_dir);
my $rw_commit = sub {
$rw->commit_txn_lazy if $rw;
$rw = PublicInbox::SearchIdx->new($git_dir, 1);
+ $rw->{qp_flags} = 0; # quiet a warning
$rw->begin_txn_lazy;
};
--
EW
^ permalink raw reply related [flat|nested] 28+ messages in thread
* [PATCH 05/26] xapcmd: new module for wrapping Xapian commands
2019-05-23 9:36 [PATCH 00/26] xcpdb: ease Xapian DB format migrations Eric Wong
` (3 preceding siblings ...)
2019-05-23 9:36 ` [PATCH 04/26] search: reenable phrase search on non-chert Xapian Eric Wong
@ 2019-05-23 9:36 ` Eric Wong
2019-05-23 9:36 ` [PATCH 06/26] admin: hoist out resolve_inboxes for -compact and -index Eric Wong
` (21 subsequent siblings)
26 siblings, 0 replies; 28+ messages in thread
From: Eric Wong @ 2019-05-23 9:36 UTC (permalink / raw)
To: meta
Port public-inbox-compact(1) over to using it, and we will need
to wrap copydatabase(1) to ease glass migrations, too.
---
MANIFEST | 1 +
lib/PublicInbox/Xapcmd.pm | 65 ++++++++++++++++++++++++++++++++++
script/public-inbox-compact | 70 ++-----------------------------------
3 files changed, 68 insertions(+), 68 deletions(-)
create mode 100644 lib/PublicInbox/Xapcmd.pm
diff --git a/MANIFEST b/MANIFEST
index 2b101fa..dfc1f66 100644
--- a/MANIFEST
+++ b/MANIFEST
@@ -139,6 +139,7 @@ lib/PublicInbox/WwwHighlight.pm
lib/PublicInbox/WwwListing.pm
lib/PublicInbox/WwwStream.pm
lib/PublicInbox/WwwText.pm
+lib/PublicInbox/Xapcmd.pm
sa_config/Makefile
sa_config/README
sa_config/root/etc/spamassassin/public-inbox.pre
diff --git a/lib/PublicInbox/Xapcmd.pm b/lib/PublicInbox/Xapcmd.pm
new file mode 100644
index 0000000..586d7e6
--- /dev/null
+++ b/lib/PublicInbox/Xapcmd.pm
@@ -0,0 +1,65 @@
+# Copyright (C) 2018-2019 all contributors <meta@public-inbox.org>
+# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
+package PublicInbox::Xapcmd;
+use strict;
+use warnings;
+use PublicInbox::Spawn qw(which spawn);
+use PublicInbox::Over;
+use File::Temp qw(tempdir);
+use File::Path qw(remove_tree);
+
+sub commit_changes ($$$) {
+ my ($im, $old, $new) = @_;
+ my @st = stat($old) or die "failed to stat($old): $!\n";
+
+ my $over = "$old/over.sqlite3";
+ if (-f $over) {
+ $over = PublicInbox::Over->new($over);
+ $over->connect->sqlite_backup_to_file("$new/over.sqlite3");
+ }
+ rename($old, "$new/old") or die "rename $old => $new/old: $!\n";
+ chmod($st[2] & 07777, $new) or die "chmod $old: $!\n";
+ rename($new, $old) or die "rename $new => $old: $!\n";
+ $im->lock_release;
+ remove_tree("$old/old") or die "failed to remove $old/old: $!\n";
+}
+
+sub run {
+ my ($ibx, $cmd) = @_;
+ my $dir = $ibx->{mainrepo} or die "no mainrepo in inbox\n";
+ which($cmd->[0]) or die "$cmd->[0] not found in PATH\n";
+ $ibx->umask_prepare;
+ my $old = $ibx->search->xdir(1);
+ -d $old or die "$old does not exist\n";
+ my $new = tempdir($cmd->[0].'-XXXXXXXX', CLEANUP => 1, DIR => $dir);
+ my $v = $ibx->{version} || 1;
+ my @cmds;
+ if ($v == 1) {
+ push @cmds, [@$cmd, $old, $new];
+ } else {
+ opendir my $dh, $old or die "Failed to opendir $old: $!\n";
+ while (defined(my $dn = readdir($dh))) {
+ if ($dn =~ /\A\d+\z/) {
+ push @cmds, [@$cmd, "$old/$dn", "$new/$dn"];
+ } elsif ($dn eq '.' || $dn eq '..') {
+ } elsif ($dn =~ /\Aover\.sqlite3/) {
+ } else {
+ warn "W: skipping unknown dir: $old/$dn\n"
+ }
+ }
+ die "No Xapian parts found in $old\n" unless @cmds;
+ }
+ my $im = $ibx->importer(0);
+ $ibx->with_umask(sub {
+ $im->lock_acquire;
+ my %pids = map {; spawn($_) => join(' ', @$_) } @cmds;
+ while (scalar keys %pids) {
+ my $pid = waitpid(-1, 0);
+ my $desc = delete $pids{$pid};
+ die "$desc failed: $?\n" if $?;
+ }
+ commit_changes($im, $old, $new);
+ });
+}
+
+1;
diff --git a/script/public-inbox-compact b/script/public-inbox-compact
index 395eec3..4aa6273 100755
--- a/script/public-inbox-compact
+++ b/script/public-inbox-compact
@@ -8,9 +8,7 @@ use PublicInbox::Search;
use PublicInbox::Config;
use PublicInbox::InboxWritable;
use Cwd 'abs_path';
-use File::Temp qw(tempdir);
-use File::Path qw(remove_tree);
-use PublicInbox::Spawn qw(spawn);
+use PublicInbox::Xapcmd;
my $usage = "Usage: public-inbox-compact REPO_DIR\n";
my $dir = shift or die $usage;
my $config = eval { PublicInbox::Config->new };
@@ -31,71 +29,7 @@ unless ($ibx) {
};
$ibx = PublicInbox::Inbox->new($ibx);
}
-my $v = ($ibx->{version} || 1);
$ibx = PublicInbox::InboxWritable->new($ibx);
-$ibx->umask_prepare;
-
-sub commit_changes ($$$) {
- my ($im, $old, $new) = @_;
- my @st = stat($old) or die "failed to stat($old): $!\n";
-
- my $over = "$old/over.sqlite3";
- if (-f $over) {
- require PublicInbox::Over;
- $over = PublicInbox::Over->new($over);
- $over->connect->sqlite_backup_to_file("$new/over.sqlite3");
- }
- rename($old, "$new/old") or die "rename $old => $new/old: $!\n";
- chmod($st[2] & 07777, $new) or die "chmod $old: $!\n";
- rename($new, $old) or die "rename $new => $old: $!\n";
- $im->lock_release;
- remove_tree("$old/old") or die "failed to remove $old/old: $!\n";
-}
# we rely on --no-renumber to keep docids synched to NNTP
-my @compact = qw(xapian-compact --no-renumber);
-if ($v == 2) {
- require PublicInbox::V2Writable;
- my $v2w = PublicInbox::V2Writable->new($ibx);
- my $xap_v = 'xap'.PublicInbox::Search::SCHEMA_VERSION;
- my $old = "$dir/$xap_v";
- opendir my $dh, $old or die "Failed to opendir $old: $!\n";
- my $new = tempdir('compact-XXXXXXXX', CLEANUP => 1, DIR => $dir);
- $ibx->with_umask(sub {
- $v2w->lock_acquire;
- my %pids;
- while (defined(my $dn = readdir($dh))) {
- if ($dn =~ /\A\d+\z/) {
- my $cmd = [ @compact, "$old/$dn", "$new/$dn" ];
- $pids{spawn($cmd)} = join(' ', @$cmd);
- } elsif ($dn eq '.' || $dn eq '..') {
- } elsif ($dn =~ /\Aover\.sqlite3/) {
- } else {
- warn "W: skipping unknown Xapian DB: $old/$dn\n"
- }
- }
- close $dh;
- die "No Xapian parts found in $old\n" unless keys %pids;
- while (scalar keys %pids) {
- my $pid = waitpid(-1, 0);
- my $desc = delete $pids{$pid};
- die "$desc failed: $?\n" if $?;
- }
- commit_changes($v2w, $old, $new);
- });
-} elsif ($v == 1) {
- require PublicInbox::Import;
- my $im = PublicInbox::Import->new($ibx->git, undef, undef, $ibx);
- my $xap_v = 'xapian'.PublicInbox::Search::SCHEMA_VERSION;
- my $v1_root = "$dir/public-inbox";
- my $old = "$v1_root/$xap_v";
- -d $old or die "$old does not exist\n";
- my $new = tempdir('compact-XXXXXXXX', CLEANUP => 1, DIR => $v1_root);
- $ibx->with_umask(sub {
- $im->lock_acquire;
- PublicInbox::Import::run_die([@compact, $old, $new]);
- commit_changes($im, $old, $new);
- });
-} else {
- die "Unsupported inbox version: $v\n";
-}
+PublicInbox::Xapcmd::run($ibx, [qw(xapian-compact --no-renumber)]);
--
EW
^ permalink raw reply related [flat|nested] 28+ messages in thread
* [PATCH 06/26] admin: hoist out resolve_inboxes for -compact and -index
2019-05-23 9:36 [PATCH 00/26] xcpdb: ease Xapian DB format migrations Eric Wong
` (4 preceding siblings ...)
2019-05-23 9:36 ` [PATCH 05/26] xapcmd: new module for wrapping Xapian commands Eric Wong
@ 2019-05-23 9:36 ` Eric Wong
2019-05-23 9:36 ` [PATCH 07/26] xapcmd: support spawn options Eric Wong
` (20 subsequent siblings)
26 siblings, 0 replies; 28+ messages in thread
From: Eric Wong @ 2019-05-23 9:36 UTC (permalink / raw)
To: meta
Both of these index-affecting commands should work similarly
on the command-line.
public-inbox-index no longer complains about unconfigured
~/.public-inbox/config; but often I found myself being
annoyed by that, anyways...
---
lib/PublicInbox/Admin.pm | 34 ++++++++++++++++++++++++
script/public-inbox-compact | 35 ++++++------------------
script/public-inbox-index | 53 +++++++------------------------------
3 files changed, 52 insertions(+), 70 deletions(-)
diff --git a/lib/PublicInbox/Admin.pm b/lib/PublicInbox/Admin.pm
index 3eff5cd..2784820 100644
--- a/lib/PublicInbox/Admin.pm
+++ b/lib/PublicInbox/Admin.pm
@@ -41,6 +41,40 @@ sub resolve_repo_dir {
}
}
+sub resolve_inboxes {
+ my ($argv, $warn_on_unconfigured) = @_;
+ require PublicInbox::Config;
+ require PublicInbox::Inbox;
+
+ my @ibxs = map { resolve_repo_dir($_) } @$argv;
+ push(@ibxs, resolve_repo_dir()) unless @ibxs;
+
+ my %dir2ibx;
+ if (my $config = eval { PublicInbox::Config->new }) {
+ $config->each_inbox(sub {
+ my ($ibx) = @_;
+ $dir2ibx{abs_path($ibx->{mainrepo})} = $ibx;
+ });
+ } elsif ($warn_on_unconfigured) {
+ # do we really care about this? It's annoying...
+ warn $warn_on_unconfigured, "\n";
+ }
+ for my $i (0..$#ibxs) {
+ my $dir = $ibxs[$i];
+ $ibxs[$i] = $dir2ibx{$dir} ||= do {
+ my $name = "unconfigured-$i";
+ PublicInbox::Inbox->new({
+ name => $name,
+ address => [ "$name\@example.com" ],
+ mainrepo => $dir,
+ # TODO: consumers may want to warn on this:
+ #-unconfigured => 1,
+ });
+ };
+ }
+ @ibxs;
+}
+
# TODO: make Devel::Peek optional, only used for daemon
my @base_mod = qw(Email::MIME Date::Parse Devel::Peek);
my @over_mod = qw(DBD::SQLite DBI);
diff --git a/script/public-inbox-compact b/script/public-inbox-compact
index 4aa6273..709fb92 100755
--- a/script/public-inbox-compact
+++ b/script/public-inbox-compact
@@ -1,35 +1,16 @@
#!/usr/bin/perl -w
-# Copyright (C) 2018 all contributors <meta@public-inbox.org>
+# Copyright (C) 2018-2019 all contributors <meta@public-inbox.org>
# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
use strict;
use warnings;
-use Getopt::Long qw(:config gnu_getopt no_ignore_case auto_abbrev);
-use PublicInbox::Search;
-use PublicInbox::Config;
use PublicInbox::InboxWritable;
-use Cwd 'abs_path';
use PublicInbox::Xapcmd;
+use PublicInbox::Admin;
+PublicInbox::Admin::require_or_die('-index');
my $usage = "Usage: public-inbox-compact REPO_DIR\n";
-my $dir = shift or die $usage;
-my $config = eval { PublicInbox::Config->new };
-my $ibx;
-$dir = abs_path($dir);
-if ($config) {
- $config->each_inbox(sub {
- $ibx = $_[0] if abs_path($_[0]->{mainrepo}) eq $dir
- });
+my @ibxs = PublicInbox::Admin::resolve_inboxes(\@ARGV) or die $usage;
+foreach (@ibxs) {
+ my $ibx = PublicInbox::InboxWritable->new($_);
+ # we rely on --no-renumber to keep docids synched to NNTP
+ PublicInbox::Xapcmd::run($ibx, [qw(xapian-compact --no-renumber)]);
}
-unless ($ibx) {
- warn "W: $dir not configured in ".
- PublicInbox::Config::default_file() . "\n";
- $ibx = {
- mainrepo => $dir,
- name => 'ignored',
- address => [ 'old@example.com' ],
- };
- $ibx = PublicInbox::Inbox->new($ibx);
-}
-$ibx = PublicInbox::InboxWritable->new($ibx);
-
-# we rely on --no-renumber to keep docids synched to NNTP
-PublicInbox::Xapcmd::run($ibx, [qw(xapian-compact --no-renumber)]);
diff --git a/script/public-inbox-index b/script/public-inbox-index
index cf001cc..9399c27 100755
--- a/script/public-inbox-index
+++ b/script/public-inbox-index
@@ -1,5 +1,5 @@
#!/usr/bin/perl -w
-# Copyright (C) 2015-2018 all contributors <meta@public-inbox.org>
+# Copyright (C) 2015-2019 all contributors <meta@public-inbox.org>
# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
# Basic tool to create a Xapian search index for a git repository
# configured for public-inbox.
@@ -10,14 +10,8 @@ use strict;
use warnings;
use Getopt::Long qw(:config gnu_getopt no_ignore_case auto_abbrev);
my $usage = "public-inbox-index REPO_DIR";
-use PublicInbox::Admin qw(resolve_repo_dir);
+use PublicInbox::Admin;
PublicInbox::Admin::require_or_die('-index');
-require PublicInbox::Config;
-
-my $config = eval { PublicInbox::Config->new } || eval {
- warn "public-inbox unconfigured for serving, indexing anyways...\n";
- undef;
-};
my $reindex;
my $prune;
@@ -32,53 +26,26 @@ my %opts = (
GetOptions(%opts) or die "bad command-line args\n$usage";
die "--jobs must be positive\n" if defined $jobs && $jobs < 0;
-my @dirs;
-
-if (@ARGV) {
- @dirs = map { resolve_repo_dir($_) } @ARGV;
-} else {
- @dirs = (resolve_repo_dir());
-}
-
sub usage { print STDERR "Usage: $usage\n"; exit 1 }
-usage() unless @dirs;
-
-defined($config) and $config->each_inbox(sub {
- my ($ibx) = @_;
- for my $i (0..$#dirs) {
- next if $dirs[$i] ne $ibx->{mainrepo};
- $dirs[$i] = $ibx;
- }
-});
-
-my @inboxes;
+# do we really care about this message? It's annoying...
+my $warn = 'public-inbox unconfigured for serving, indexing anyways...';
+my @ibxs = PublicInbox::Admin::resolve_inboxes(\@ARGV, $warn);
+PublicInbox::Admin::require_or_die('-index');
+usage() unless @ibxs;
my $mods = {};
-
-foreach my $dir (@dirs) {
- my $ibx = $dir;
- if (!ref($ibx)) {
- unless (-d $dir) {
- die "$dir does not appear to be an inbox repository\n";
- }
- $ibx = PublicInbox::Inbox->new({
- mainrepo => $dir,
- name => 'unnamed',
- indexlevel => $indexlevel,
- version => -f "$dir/inbox.lock" ? 2 : 1,
- });
- } elsif (defined $indexlevel && !defined($ibx->{indexlevel})) {
+foreach my $ibx (@ibxs) {
+ if (defined $indexlevel && !defined($ibx->{indexlevel})) {
# XXX: users can shoot themselves in the foot, with this...
$ibx->{indexlevel} = $indexlevel;
}
- push @inboxes, $ibx;
PublicInbox::Admin::scan_ibx_modules($mods, $ibx);
}
PublicInbox::Admin::require_or_die(keys %$mods);
require PublicInbox::SearchIdx;
-index_inbox($_) for @inboxes;
+index_inbox($_) for @ibxs;
sub index_inbox {
my ($repo) = @_;
--
EW
^ permalink raw reply related [flat|nested] 28+ messages in thread
* [PATCH 07/26] xapcmd: support spawn options
2019-05-23 9:36 [PATCH 00/26] xcpdb: ease Xapian DB format migrations Eric Wong
` (5 preceding siblings ...)
2019-05-23 9:36 ` [PATCH 06/26] admin: hoist out resolve_inboxes for -compact and -index Eric Wong
@ 2019-05-23 9:36 ` Eric Wong
2019-05-23 9:36 ` [PATCH 08/26] xcpdb: new tool which wraps Xapian's copydatabase(1) Eric Wong
` (19 subsequent siblings)
26 siblings, 0 replies; 28+ messages in thread
From: Eric Wong @ 2019-05-23 9:36 UTC (permalink / raw)
To: meta
copydatabase(1) is exceptionally noisy and its output is
confusing when run in parallel. Support redirects at least, and
env while we're at it to give us future options.
We can also stuff a -jobs parameter into the options to limit
parallelism since it can be useful for low-priority upgrade
jobs.
---
lib/PublicInbox/Xapcmd.pm | 21 +++++++++++++++------
1 file changed, 15 insertions(+), 6 deletions(-)
diff --git a/lib/PublicInbox/Xapcmd.pm b/lib/PublicInbox/Xapcmd.pm
index 586d7e6..999ddd1 100644
--- a/lib/PublicInbox/Xapcmd.pm
+++ b/lib/PublicInbox/Xapcmd.pm
@@ -25,7 +25,8 @@ sub commit_changes ($$$) {
}
sub run {
- my ($ibx, $cmd) = @_;
+ my ($ibx, $cmd, $env, $opt) = @_;
+ $opt ||= {};
my $dir = $ibx->{mainrepo} or die "no mainrepo in inbox\n";
which($cmd->[0]) or die "$cmd->[0] not found in PATH\n";
$ibx->umask_prepare;
@@ -50,13 +51,21 @@ sub run {
die "No Xapian parts found in $old\n" unless @cmds;
}
my $im = $ibx->importer(0);
+ my $max = $opt->{jobs} || scalar(@cmds);
$ibx->with_umask(sub {
$im->lock_acquire;
- my %pids = map {; spawn($_) => join(' ', @$_) } @cmds;
- while (scalar keys %pids) {
- my $pid = waitpid(-1, 0);
- my $desc = delete $pids{$pid};
- die "$desc failed: $?\n" if $?;
+ my %pids;
+ while (@cmds) {
+ while (scalar(keys(%pids)) < $max && scalar(@cmds)) {
+ my $x = shift @cmds;
+ $pids{spawn($x, $env, $opt)} = $x;
+ }
+
+ while (scalar keys %pids) {
+ my $pid = waitpid(-1, 0);
+ my $x = delete $pids{$pid};
+ die join(' ', @$x)." failed: $?\n" if $?;
+ }
}
commit_changes($im, $old, $new);
});
--
EW
^ permalink raw reply related [flat|nested] 28+ messages in thread
* [PATCH 08/26] xcpdb: new tool which wraps Xapian's copydatabase(1)
2019-05-23 9:36 [PATCH 00/26] xcpdb: ease Xapian DB format migrations Eric Wong
` (6 preceding siblings ...)
2019-05-23 9:36 ` [PATCH 07/26] xapcmd: support spawn options Eric Wong
@ 2019-05-23 9:36 ` Eric Wong
2019-05-23 9:36 ` [PATCH 09/26] xapcmd: do not cleanup on errors Eric Wong
` (18 subsequent siblings)
26 siblings, 0 replies; 28+ messages in thread
From: Eric Wong @ 2019-05-23 9:36 UTC (permalink / raw)
To: meta
copydatabase(1) is an existing Xapian tool which is the
recommended way to upgrade existing DBs to the latest Xapian
database format (currently "glass" for stable/released
versions). Our use of Xapian relies on preserving document IDs,
so we'll wrap it like we do xapian-compact(1) and use the
"--no-renumber" switch.
I could not name the tool "public-inbox-copydatabase" since it
would be ambiguous as to which DB it's actually copying. So, I
abbreviated the suffix to "xcpdb" (Xapian CoPy DataBase), which
I hope is acceptable and unambiguous.
---
Documentation/include.mk | 6 ++--
Documentation/public-inbox-xcpdb.pod | 51 ++++++++++++++++++++++++++++
MANIFEST | 2 ++
script/public-inbox-xcpdb | 18 ++++++++++
t/indexlevels-mirror.t | 22 ++++++++++++
5 files changed, 97 insertions(+), 2 deletions(-)
create mode 100644 Documentation/public-inbox-xcpdb.pod
create mode 100755 script/public-inbox-xcpdb
diff --git a/Documentation/include.mk b/Documentation/include.mk
index 6415338..27d6ea6 100644
--- a/Documentation/include.mk
+++ b/Documentation/include.mk
@@ -26,11 +26,13 @@ podtext = $(PODTEXT) $(PODTEXT_OPTS)
# MakeMaker only seems to support manpage sections 1 and 3...
m1 =
-m1 += public-inbox-mda
+m1 += public-inbox-compact
m1 += public-inbox-httpd
+m1 += public-inbox-index
+m1 += public-inbox-mda
m1 += public-inbox-nntpd
m1 += public-inbox-watch
-m1 += public-inbox-index
+m1 += public-inbox-xcpdb
m5 =
m5 += public-inbox-config
m5 += public-inbox-v1-format
diff --git a/Documentation/public-inbox-xcpdb.pod b/Documentation/public-inbox-xcpdb.pod
new file mode 100644
index 0000000..4ff5186
--- /dev/null
+++ b/Documentation/public-inbox-xcpdb.pod
@@ -0,0 +1,51 @@
+=head1 NAME
+
+public-inbox-xcpdb - copy Xapian DBs (for format upgrades)
+
+=head1 SYNOPSIS
+
+ public-inbox-xcpdb INBOX_DIR
+
+=head1 DESCRIPTION
+
+public-inbox-xcpdb is a wrapper for L<copydatabase(1)> for
+upgrading to the latest database format supported by Xapian
+(e.g. "glass" or "honey").
+
+It locks the inbox and prevents other processes such as
+L<public-inbox-watch(1)> and L<public-inbox-mda(1)> from
+writing while it operates.
+
+This is intended for upgrading the database format used by
+Xapian. It DOES NOT upgrade the schema used by the
+public-inbox search interface (see L<public-inbox-index(1)>).
+
+=head1 ENVIRONMENT
+
+=over 8
+
+=item PI_CONFIG
+
+The default config file, normally "~/.public-inbox/config".
+See L<public-inbox-config(5)>
+
+=back
+
+=head1 UPGRADING
+
+=head1 CONTACT
+
+Feedback welcome via plain-text mail to L<mailto:meta@public-inbox.org>
+
+The mail archives are hosted at L<https://public-inbox.org/meta/>
+and L<http://hjrcffqmbrq6wope.onion/meta/>
+
+=head1 COPYRIGHT
+
+Copyright 2019 all contributors L<mailto:meta@public-inbox.org>
+
+License: AGPL-3.0+ L<https://www.gnu.org/licenses/agpl-3.0.txt>
+
+=head1 SEE ALSO
+
+L<copydatabase(1)>, L<public-inbox-index(1)>
diff --git a/MANIFEST b/MANIFEST
index dfc1f66..efd5658 100644
--- a/MANIFEST
+++ b/MANIFEST
@@ -20,6 +20,7 @@ Documentation/public-inbox-overview.pod
Documentation/public-inbox-v1-format.pod
Documentation/public-inbox-v2-format.pod
Documentation/public-inbox-watch.pod
+Documentation/public-inbox-xcpdb.pod
Documentation/standards.perl
Documentation/txt2pre
HACKING
@@ -154,6 +155,7 @@ script/public-inbox-mda
script/public-inbox-nntpd
script/public-inbox-purge
script/public-inbox-watch
+script/public-inbox-xcpdb
script/public-inbox.cgi
scripts/dc-dlvr
scripts/dc-dlvr.pre
diff --git a/script/public-inbox-xcpdb b/script/public-inbox-xcpdb
new file mode 100755
index 0000000..cbf9f55
--- /dev/null
+++ b/script/public-inbox-xcpdb
@@ -0,0 +1,18 @@
+#!/usr/bin/perl -w
+# Copyright (C) 2019 all contributors <meta@public-inbox.org>
+# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
+# xcpdb: Xapian copy database, a wrapper around Xapian's copydatabase(1)
+use PublicInbox::InboxWritable;
+use PublicInbox::Xapcmd;
+use PublicInbox::Admin;
+PublicInbox::Admin::require_or_die('-search');
+my $usage = "Usage: public-inbox-xcpdb INBOX_DIR\n";
+my @ibxs = PublicInbox::Admin::resolve_inboxes(\@ARGV) or die $usage;
+my $cmd = [qw(copydatabase --no-renumber)];
+open my $null, '>', '/dev/null' or die "failed to open /dev/null: $!\n";
+my $rdr = { 1 => fileno($null) };
+foreach (@ibxs) {
+ my $ibx = PublicInbox::InboxWritable->new($_);
+ # we rely on --no-renumber to keep docids synched to NNTP
+ PublicInbox::Xapcmd::run($ibx, $cmd, undef, $rdr);
+}
diff --git a/t/indexlevels-mirror.t b/t/indexlevels-mirror.t
index d124c75..61053b6 100644
--- a/t/indexlevels-mirror.t
+++ b/t/indexlevels-mirror.t
@@ -18,6 +18,7 @@ foreach my $mod (qw(DBD::SQLite)) {
my $path = 'blib/script';
my $index = "$path/public-inbox-index";
+my $xcpdb = "$path/public-inbox-xcpdb";
my $mime = PublicInbox::MIME->create(
header => [
@@ -108,6 +109,13 @@ sub import_index_incremental {
ok($im->remove($mime), '2nd message removed');
$im->done;
+ if ($level ne 'basic') {
+ is(system($xcpdb, $mirror), 0, "v$v xcpdb OK");
+ delete $ro_mirror->{$_} for (qw(over search));
+ ($nr, $msgs) = $ro_mirror->search->query('m:m@2');
+ is($nr, 1, "v$v found m\@2 via Xapian on $level");
+ }
+
# sync the mirror
is(system('git', "--git-dir=$fetch_dir", qw(fetch -q)), 0, 'fetch OK');
is(system($index, $mirror), 0, "v$v index mirror again OK");
@@ -120,6 +128,10 @@ sub import_index_incremental {
is_deeply([glob("$ibx->{mainrepo}/xap*/?/")], [],
'no Xapian partition directories for v2 basic');
}
+ if ($level ne 'basic') {
+ ($nr, $msgs) = $ro_mirror->search->reopen->query('m:m@2');
+ is($nr, 0, "v$v m\@2 gone from Xapian in mirror on $level");
+ }
}
# we can probably cull some other tests and put full/medium tests, here
@@ -131,4 +143,14 @@ for my $level (qw(basic)) {
}
}
+SKIP: {
+ require PublicInbox::Search;
+ PublicInbox::Search::load_xapian() or skip 'Search::Xapian missing', 2;
+ for my $v (1..2) {
+ subtest("v$v indexlevel=medium" => sub {
+ import_index_incremental($v, 'medium');
+ })
+ }
+}
+
done_testing();
--
EW
^ permalink raw reply related [flat|nested] 28+ messages in thread
* [PATCH 09/26] xapcmd: do not cleanup on errors
2019-05-23 9:36 [PATCH 00/26] xcpdb: ease Xapian DB format migrations Eric Wong
` (7 preceding siblings ...)
2019-05-23 9:36 ` [PATCH 08/26] xcpdb: new tool which wraps Xapian's copydatabase(1) Eric Wong
@ 2019-05-23 9:36 ` Eric Wong
2019-05-23 9:36 ` [PATCH 10/26] admin: move index_inbox over Eric Wong
` (17 subsequent siblings)
26 siblings, 0 replies; 28+ messages in thread
From: Eric Wong @ 2019-05-23 9:36 UTC (permalink / raw)
To: meta
We move the old directory into the new directory, so avoid the
situation where a bug or error could cause the tempdir cleanup to run
and destroy both our old and new directories.
---
lib/PublicInbox/Xapcmd.pm | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/lib/PublicInbox/Xapcmd.pm b/lib/PublicInbox/Xapcmd.pm
index 999ddd1..81e2f10 100644
--- a/lib/PublicInbox/Xapcmd.pm
+++ b/lib/PublicInbox/Xapcmd.pm
@@ -32,7 +32,7 @@ sub run {
$ibx->umask_prepare;
my $old = $ibx->search->xdir(1);
-d $old or die "$old does not exist\n";
- my $new = tempdir($cmd->[0].'-XXXXXXXX', CLEANUP => 1, DIR => $dir);
+ my $new = tempdir($cmd->[0].'-XXXXXXXX', DIR => $dir);
my $v = $ibx->{version} || 1;
my @cmds;
if ($v == 1) {
--
EW
^ permalink raw reply related [flat|nested] 28+ messages in thread
* [PATCH 10/26] admin: move index_inbox over
2019-05-23 9:36 [PATCH 00/26] xcpdb: ease Xapian DB format migrations Eric Wong
` (8 preceding siblings ...)
2019-05-23 9:36 ` [PATCH 09/26] xapcmd: do not cleanup on errors Eric Wong
@ 2019-05-23 9:36 ` Eric Wong
2019-05-23 9:36 ` [PATCH 11/26] xcpdb: implement using Perl bindings Eric Wong
` (16 subsequent siblings)
26 siblings, 0 replies; 28+ messages in thread
From: Eric Wong @ 2019-05-23 9:36 UTC (permalink / raw)
To: meta
We will be reindexing after copydatabase, so move index_inbox
into PublicInbox::Admin where other tools can reuse it.
---
lib/PublicInbox/Admin.pm | 32 +++++++++++++++++++++++
script/public-inbox-index | 53 ++++++---------------------------------
2 files changed, 39 insertions(+), 46 deletions(-)
diff --git a/lib/PublicInbox/Admin.pm b/lib/PublicInbox/Admin.pm
index 2784820..94f47ab 100644
--- a/lib/PublicInbox/Admin.pm
+++ b/lib/PublicInbox/Admin.pm
@@ -135,4 +135,36 @@ invalid indexlevel=$indexlevel (must be `basic', `medium', or `full')
die missing_mod_msg($err) ." required for indexlevel=$indexlevel\n";
}
+sub index_inbox {
+ my ($ibx, $opt) = @_;
+ my $jobs = delete $opt->{jobs} if $opt;
+ if (ref($ibx) && ($ibx->{version} || 1) == 2) {
+ eval { require PublicInbox::V2Writable };
+ die "v2 requirements not met: $@\n" if $@;
+ my $v2w = eval {
+ PublicInbox::V2Writable->new($ibx, {nproc=>$jobs});
+ };
+ if (defined $jobs) {
+ if ($jobs == 0) {
+ $v2w->{parallel} = 0;
+ } else {
+ my $n = $v2w->{partitions};
+ if ($jobs != ($n + 1)) {
+ warn
+"Unable to respect --jobs=$jobs, inbox was created with $n partitions\n";
+ }
+ }
+ }
+ my $warn_cb = $SIG{__WARN__} || sub { print STDERR @_ };
+ local $SIG{__WARN__} = sub {
+ $warn_cb->($v2w->{current_info}, ': ', @_);
+ };
+ $v2w->index_sync($opt);
+ } else {
+ require PublicInbox::SearchIdx;
+ my $s = PublicInbox::SearchIdx->new($ibx, 1);
+ $s->index_sync($opt);
+ }
+}
+
1;
diff --git a/script/public-inbox-index b/script/public-inbox-index
index 9399c27..b6e3052 100755
--- a/script/public-inbox-index
+++ b/script/public-inbox-index
@@ -13,18 +13,10 @@ my $usage = "public-inbox-index REPO_DIR";
use PublicInbox::Admin;
PublicInbox::Admin::require_or_die('-index');
-my $reindex;
-my $prune;
-my $jobs = undef;
-my $indexlevel;
-my %opts = (
- '--reindex' => \$reindex,
- '--jobs|j=i' => \$jobs,
- '--prune' => \$prune,
- 'L|indexlevel=s' => \$indexlevel,
-);
-GetOptions(%opts) or die "bad command-line args\n$usage";
-die "--jobs must be positive\n" if defined $jobs && $jobs < 0;
+my $opt = {};
+GetOptions($opt, qw(reindex jobs|j=i prune indexlevel|L=s))
+ or die "bad command-line args\n$usage";
+die "--jobs must be positive\n" if defined $opt->{jobs} && $opt->{jobs} <= 0;
sub usage { print STDERR "Usage: $usage\n"; exit 1 }
@@ -35,43 +27,12 @@ PublicInbox::Admin::require_or_die('-index');
usage() unless @ibxs;
my $mods = {};
foreach my $ibx (@ibxs) {
- if (defined $indexlevel && !defined($ibx->{indexlevel})) {
+ if (defined $opt->{indexlevel} && !defined($ibx->{indexlevel})) {
# XXX: users can shoot themselves in the foot, with this...
- $ibx->{indexlevel} = $indexlevel;
+ $ibx->{indexlevel} = $opt->{indexlevel};
}
PublicInbox::Admin::scan_ibx_modules($mods, $ibx);
}
PublicInbox::Admin::require_or_die(keys %$mods);
-
-require PublicInbox::SearchIdx;
-index_inbox($_) for @ibxs;
-
-sub index_inbox {
- my ($repo) = @_;
- if (ref($repo) && ($repo->{version} || 1) == 2) {
- eval { require PublicInbox::V2Writable };
- die "v2 requirements not met: $@\n" if $@;
- my $v2w = eval {
- PublicInbox::V2Writable->new($repo, {nproc=>$jobs});
- };
- if (defined $jobs) {
- if ($jobs == 0) {
- $v2w->{parallel} = 0;
- } else {
- my $n = $v2w->{partitions};
- if ($jobs != ($n + 1)) {
- warn
-"Unable to respect --jobs=$jobs, inbox was created with $n partitions\n";
- }
- }
- }
- local $SIG{__WARN__} = sub {
- print STDERR $v2w->{current_info}, ': ', @_;
- };
- $v2w->index_sync({ reindex => $reindex, prune => $prune });
- } else {
- my $s = PublicInbox::SearchIdx->new($repo, 1);
- $s->index_sync({ reindex => $reindex });
- }
-}
+PublicInbox::Admin::index_inbox($_, $opt) for @ibxs;
--
EW
^ permalink raw reply related [flat|nested] 28+ messages in thread
* [PATCH 11/26] xcpdb: implement using Perl bindings
2019-05-23 9:36 [PATCH 00/26] xcpdb: ease Xapian DB format migrations Eric Wong
` (9 preceding siblings ...)
2019-05-23 9:36 ` [PATCH 10/26] admin: move index_inbox over Eric Wong
@ 2019-05-23 9:36 ` Eric Wong
2019-05-23 9:36 ` [PATCH 12/26] xapcmd: xcpdb supports compaction Eric Wong
` (15 subsequent siblings)
26 siblings, 0 replies; 28+ messages in thread
From: Eric Wong @ 2019-05-23 9:36 UTC (permalink / raw)
To: meta
By avoiding copydatabase(1) entirely, we can make further changes
to avoid locking the entire inbox for a long operation and
switch to fine-grained locking.
---
lib/PublicInbox/Xapcmd.pm | 77 +++++++++++++++++++++++++++++++++++++--
script/public-inbox-xcpdb | 2 +-
2 files changed, 75 insertions(+), 4 deletions(-)
diff --git a/lib/PublicInbox/Xapcmd.pm b/lib/PublicInbox/Xapcmd.pm
index 81e2f10..ca74ea0 100644
--- a/lib/PublicInbox/Xapcmd.pm
+++ b/lib/PublicInbox/Xapcmd.pm
@@ -24,15 +24,36 @@ sub commit_changes ($$$) {
remove_tree("$old/old") or die "failed to remove $old/old: $!\n";
}
+sub xspawn {
+ my ($cmd, $env, $opt) = @_;
+ if (ref($cmd->[0]) eq 'CODE') {
+ my $cb = shift(@$cmd); # $cb = cpdb()
+ defined(my $pid = fork) or die "fork: $!";
+ return $pid if $pid > 0;
+ eval { $cb->($cmd, $env, $opt) };
+ die $@ if $@;
+ exit 0;
+ } else {
+ spawn($cmd, $env, $opt);
+ }
+}
+
sub run {
my ($ibx, $cmd, $env, $opt) = @_;
$opt ||= {};
my $dir = $ibx->{mainrepo} or die "no mainrepo in inbox\n";
- which($cmd->[0]) or die "$cmd->[0] not found in PATH\n";
+ my $exe = $cmd->[0];
+ my $pfx = $exe;
+ if (ref($exe) eq 'CODE') {
+ $pfx = 'CODE';
+ require Search::Xapian::WritableDatabase;
+ } else {
+ which($exe) or die "$exe not found in PATH\n";
+ }
$ibx->umask_prepare;
my $old = $ibx->search->xdir(1);
-d $old or die "$old does not exist\n";
- my $new = tempdir($cmd->[0].'-XXXXXXXX', DIR => $dir);
+ my $new = tempdir("$pfx-XXXXXXXX", DIR => $dir);
my $v = $ibx->{version} || 1;
my @cmds;
if ($v == 1) {
@@ -58,7 +79,7 @@ sub run {
while (@cmds) {
while (scalar(keys(%pids)) < $max && scalar(@cmds)) {
my $x = shift @cmds;
- $pids{spawn($x, $env, $opt)} = $x;
+ $pids{xspawn($x, $env, $opt)} = $x;
}
while (scalar keys %pids) {
@@ -71,4 +92,54 @@ sub run {
});
}
+sub cpdb_retryable ($$) {
+ my ($src, $err) = @_;
+ if (ref($err) eq 'Search::Xapian::DatabaseModifiedError') {
+ warn "$err, reopening and retrying\n";
+ $src->reopen;
+ return 1;
+ }
+ die $err if $err;
+ 0;
+}
+
+sub cpdb {
+ my ($args, $env, $opt) = @_;
+ my ($old, $new) = @$args;
+ my $src = Search::Xapian::Database->new($old);
+
+ # like copydatabase(1), be sure we don't overwrite anything in case
+ # of other bugs:
+ my $creat = Search::Xapian::DB_CREATE();
+ my $dst = Search::Xapian::WritableDatabase->new($new, $creat);
+ my ($it, $end);
+
+ do {
+ eval {
+ # update the only metadata key for v1:
+ my $lc = $src->get_metadata('last_commit');
+ $dst->set_metadata('last_commit', $lc) if $lc;
+
+ $it = $src->postlist_begin('');
+ $end = $src->postlist_end('');
+ };
+ } while (cpdb_retryable($src, $@));
+
+ do {
+ eval {
+ while ($it != $end) {
+ my $docid = $it->get_docid;
+ my $doc = $src->get_document($docid);
+ $dst->replace_document($docid, $doc);
+ $it->inc;
+ }
+
+ # unlike copydatabase(1), we don't copy spelling
+ # and synonym data (or other user metadata) since
+ # the Perl APIs don't expose iterators for them
+ # (and public-inbox does not use those features)
+ };
+ } while (cpdb_retryable($src, $@));
+}
+
1;
diff --git a/script/public-inbox-xcpdb b/script/public-inbox-xcpdb
index cbf9f55..d494991 100755
--- a/script/public-inbox-xcpdb
+++ b/script/public-inbox-xcpdb
@@ -8,7 +8,7 @@ use PublicInbox::Admin;
PublicInbox::Admin::require_or_die('-search');
my $usage = "Usage: public-inbox-xcpdb INBOX_DIR\n";
my @ibxs = PublicInbox::Admin::resolve_inboxes(\@ARGV) or die $usage;
-my $cmd = [qw(copydatabase --no-renumber)];
+my $cmd = [ \&PublicInbox::Xapcmd::cpdb ];
open my $null, '>', '/dev/null' or die "failed to open /dev/null: $!\n";
my $rdr = { 1 => fileno($null) };
foreach (@ibxs) {
--
EW
^ permalink raw reply related [flat|nested] 28+ messages in thread
* [PATCH 12/26] xapcmd: xcpdb supports compaction
2019-05-23 9:36 [PATCH 00/26] xcpdb: ease Xapian DB format migrations Eric Wong
` (10 preceding siblings ...)
2019-05-23 9:36 ` [PATCH 11/26] xcpdb: implement using Perl bindings Eric Wong
@ 2019-05-23 9:36 ` Eric Wong
2019-05-23 9:36 ` [PATCH 13/26] v2writable: hoist out log_range sub for readability Eric Wong
` (14 subsequent siblings)
26 siblings, 0 replies; 28+ messages in thread
From: Eric Wong @ 2019-05-23 9:36 UTC (permalink / raw)
To: meta
To minimize the delay on active inboxes, it's actually ideal to
run xapian-compact at the end of the per-partition cpdb process,
since the new DB isn't accessible yet and we don't have to
deal with lock contention with -mda or -watch processes. The
downside is the temporary file overhead required (3x instead of 2x).
---
lib/PublicInbox/Xapcmd.pm | 34 ++++++++++++++++++++++++++++++++--
script/public-inbox-xcpdb | 8 ++++++--
2 files changed, 38 insertions(+), 4 deletions(-)
diff --git a/lib/PublicInbox/Xapcmd.pm b/lib/PublicInbox/Xapcmd.pm
index ca74ea0..d2de874 100644
--- a/lib/PublicInbox/Xapcmd.pm
+++ b/lib/PublicInbox/Xapcmd.pm
@@ -8,6 +8,10 @@ use PublicInbox::Over;
use File::Temp qw(tempdir);
use File::Path qw(remove_tree);
+# support testing with dev versions of Xapian which installs
+# commands with a version number suffix (e.g. "xapian-compact-1.5")
+our $XAPIAN_COMPACT = $ENV{XAPIAN_COMPACT} || 'xapian-compact';
+
sub commit_changes ($$$) {
my ($im, $old, $new) = @_;
my @st = stat($old) or die "failed to stat($old): $!\n";
@@ -38,17 +42,23 @@ sub xspawn {
}
}
+sub runnable_or_die ($) {
+ my ($exe) = @_;
+ which($exe) or die "$exe not found in PATH\n";
+}
+
sub run {
my ($ibx, $cmd, $env, $opt) = @_;
$opt ||= {};
my $dir = $ibx->{mainrepo} or die "no mainrepo in inbox\n";
my $exe = $cmd->[0];
my $pfx = $exe;
+ runnable_or_die($XAPIAN_COMPACT) if $opt->{compact};
if (ref($exe) eq 'CODE') {
$pfx = 'CODE';
require Search::Xapian::WritableDatabase;
} else {
- which($exe) or die "$exe not found in PATH\n";
+ runnable_or_die($exe);
}
$ibx->umask_prepare;
my $old = $ibx->search->xdir(1);
@@ -107,11 +117,12 @@ sub cpdb {
my ($args, $env, $opt) = @_;
my ($old, $new) = @$args;
my $src = Search::Xapian::Database->new($old);
+ my $tmp = $opt->{compact} ? "$new.compact" : $new;
# like copydatabase(1), be sure we don't overwrite anything in case
# of other bugs:
my $creat = Search::Xapian::DB_CREATE();
- my $dst = Search::Xapian::WritableDatabase->new($new, $creat);
+ my $dst = Search::Xapian::WritableDatabase->new($tmp, $creat);
my ($it, $end);
do {
@@ -140,6 +151,25 @@ sub cpdb {
# (and public-inbox does not use those features)
};
} while (cpdb_retryable($src, $@));
+
+ return unless $opt->{compact};
+
+ $src = $dst = undef; # flushes and closes
+
+ # this is probably the best place to do xapian-compact
+ # since $dst isn't readable by HTTP or NNTP clients, yet:
+ my $cmd = [ $XAPIAN_COMPACT, '--no-renumber', $tmp, $new ];
+ my $rdr = {};
+ foreach my $fd (0..2) {
+ defined(my $dst = $opt->{$fd}) or next;
+ $rdr->{$fd} = $dst;
+ }
+ my $pid = spawn($cmd, $env, $rdr);
+ my $r = waitpid($pid, 0);
+ if ($? || $r != $pid) {
+ die join(' ', @$cmd)." failed: $? (pid=$pid, reaped=$r)\n";
+ }
+ remove_tree($tmp) or die "failed to remove $tmp: $!\n";
}
1;
diff --git a/script/public-inbox-xcpdb b/script/public-inbox-xcpdb
index d494991..78d37da 100755
--- a/script/public-inbox-xcpdb
+++ b/script/public-inbox-xcpdb
@@ -2,17 +2,21 @@
# Copyright (C) 2019 all contributors <meta@public-inbox.org>
# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
# xcpdb: Xapian copy database, a wrapper around Xapian's copydatabase(1)
+use Getopt::Long qw(:config gnu_getopt no_ignore_case auto_abbrev);
use PublicInbox::InboxWritable;
use PublicInbox::Xapcmd;
use PublicInbox::Admin;
PublicInbox::Admin::require_or_die('-search');
my $usage = "Usage: public-inbox-xcpdb INBOX_DIR\n";
+my $opt = {};
+GetOptions($opt, qw(compact)) or die "bad command-line args\n$usage";
my @ibxs = PublicInbox::Admin::resolve_inboxes(\@ARGV) or die $usage;
+
my $cmd = [ \&PublicInbox::Xapcmd::cpdb ];
open my $null, '>', '/dev/null' or die "failed to open /dev/null: $!\n";
-my $rdr = { 1 => fileno($null) };
+$opt->{1} = fileno($null);
foreach (@ibxs) {
my $ibx = PublicInbox::InboxWritable->new($_);
# we rely on --no-renumber to keep docids synched to NNTP
- PublicInbox::Xapcmd::run($ibx, $cmd, undef, $rdr);
+ PublicInbox::Xapcmd::run($ibx, $cmd, undef, $opt);
}
--
EW
^ permalink raw reply related [flat|nested] 28+ messages in thread
* [PATCH 13/26] v2writable: hoist out log_range sub for readability
2019-05-23 9:36 [PATCH 00/26] xcpdb: ease Xapian DB format migrations Eric Wong
` (11 preceding siblings ...)
2019-05-23 9:36 ` [PATCH 12/26] xapcmd: xcpdb supports compaction Eric Wong
@ 2019-05-23 9:36 ` Eric Wong
2019-05-23 9:36 ` [PATCH 14/26] xcpdb: use fine-grained locking Eric Wong
` (13 subsequent siblings)
26 siblings, 0 replies; 28+ messages in thread
From: Eric Wong @ 2019-05-23 9:36 UTC (permalink / raw)
To: meta
This is preparation to support partial reindexing
---
lib/PublicInbox/V2Writable.pm | 72 ++++++++++++++++++-----------------
1 file changed, 37 insertions(+), 35 deletions(-)
diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm
index c476cb3..3dd606e 100644
--- a/lib/PublicInbox/V2Writable.pm
+++ b/lib/PublicInbox/V2Writable.pm
@@ -808,6 +808,40 @@ sub last_commits {
*is_ancestor = *PublicInbox::SearchIdx::is_ancestor;
+# returns a revision range for git-log(1)
+sub log_range ($$$$$) {
+ my ($self, $git, $ranges, $i, $tip) = @_;
+ my $cur = $ranges->[$i] or return $tip; # all of it
+ my $range = "$cur..$tip";
+ if (is_ancestor($git, $cur, $tip)) { # common case
+ my $n = $git->qx(qw(rev-list --count), $range);
+ chomp($n);
+ if ($n == 0) {
+ $ranges->[$i] = undef;
+ return; # nothing to do
+ }
+ } else {
+ warn <<"";
+discontiguous range: $range
+Rewritten history? (in $git->{git_dir})
+
+ chomp(my $base = $git->qx('merge-base', $tip, $cur));
+ if ($base) {
+ $range = "$base..$tip";
+ warn "found merge-base: $base\n"
+ } else {
+ $range = $tip;
+ warn "discarding history at $cur\n";
+ }
+ warn <<"";
+reindexing $git->{git_dir} starting at
+$range
+
+ $self->{"unindex-range.$i"} = "$base..$cur";
+ }
+ $range;
+}
+
sub index_prepare {
my ($self, $opts, $epoch_max, $ranges) = @_;
my $regen_max = 0;
@@ -818,42 +852,9 @@ sub index_prepare {
-d $git_dir or next; # missing parts are fine
my $git = PublicInbox::Git->new($git_dir);
chomp(my $tip = $git->qx(qw(rev-parse -q --verify), $head));
- next if $?; # new repo
- my $range;
- if (defined(my $cur = $ranges->[$i])) {
- $range = "$cur..$tip";
- if (is_ancestor($git, $cur, $tip)) { # common case
- my $n = $git->qx(qw(rev-list --count), $range);
- chomp($n);
- if ($n == 0) {
- $ranges->[$i] = undef;
- next;
- }
- } else {
- warn <<"";
-discontiguous range: $range
-Rewritten history? (in $git_dir)
-
- my $base = $git->qx('merge-base', $tip, $cur);
- chomp $base;
- if ($base) {
- $range = "$base..$tip";
- warn "found merge-base: $base\n"
- } else {
- $range = $tip;
- warn <<"";
-discarding history at $cur
-
- }
- warn <<"";
-reindexing $git_dir starting at
-$range
- $self->{"unindex-range.$i"} = "$base..$cur";
- }
- } else {
- $range = $tip; # all of it
- }
+ next if $?; # new repo
+ my $range = log_range($self, $git, $ranges, $i, $tip) or next;
$ranges->[$i] = $range;
# can't use 'rev-list --count' if we use --diff-filter
@@ -923,6 +924,7 @@ sub unindex {
qw(-c gc.reflogExpire=now gc --prune=all)]);
}
+# called for public-inbox-index
sub index_sync {
my ($self, $opts) = @_;
$opts ||= {};
--
EW
^ permalink raw reply related [flat|nested] 28+ messages in thread
* [PATCH 14/26] xcpdb: use fine-grained locking
2019-05-23 9:36 [PATCH 00/26] xcpdb: ease Xapian DB format migrations Eric Wong
` (12 preceding siblings ...)
2019-05-23 9:36 ` [PATCH 13/26] v2writable: hoist out log_range sub for readability Eric Wong
@ 2019-05-23 9:36 ` Eric Wong
2019-05-23 9:36 ` [PATCH 15/26] xcpdb: implement progress reporting Eric Wong
` (12 subsequent siblings)
26 siblings, 0 replies; 28+ messages in thread
From: Eric Wong @ 2019-05-23 9:36 UTC (permalink / raw)
To: meta
Copying an entire Xapian DB takes a long time, so update our
reindexing code to support partial reindexing, snapshot the
pre-copydatabase git revisions, perform the lengthy copy,
and do a partial reindex when the copy + renames are done.
---
lib/PublicInbox/Admin.pm | 2 +-
lib/PublicInbox/SearchIdx.pm | 10 +++++-
lib/PublicInbox/V2Writable.pm | 21 ++++++++++---
lib/PublicInbox/Xapcmd.pm | 58 ++++++++++++++++++++++++++++++++---
4 files changed, 80 insertions(+), 11 deletions(-)
diff --git a/lib/PublicInbox/Admin.pm b/lib/PublicInbox/Admin.pm
index 94f47ab..34aa312 100644
--- a/lib/PublicInbox/Admin.pm
+++ b/lib/PublicInbox/Admin.pm
@@ -141,7 +141,7 @@ sub index_inbox {
if (ref($ibx) && ($ibx->{version} || 1) == 2) {
eval { require PublicInbox::V2Writable };
die "v2 requirements not met: $@\n" if $@;
- my $v2w = eval {
+ my $v2w = eval { $ibx->importer(0) } || eval {
PublicInbox::V2Writable->new($ibx, {nproc=>$jobs});
};
if (defined $jobs) {
diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm
index 114420e..0aeeb6b 100644
--- a/lib/PublicInbox/SearchIdx.pm
+++ b/lib/PublicInbox/SearchIdx.pm
@@ -542,8 +542,10 @@ sub do_cat_mail {
$@ ? undef : $mime;
}
+# called by public-inbox-index
sub index_sync {
my ($self, $opts) = @_;
+ delete $self->{lock_path} if $opts->{-skip_lock};
$self->{-inbox}->with_umask(sub { $self->_index_sync($opts) })
}
@@ -692,6 +694,12 @@ sub _last_x_commit {
$lx;
}
+sub reindex_from ($$) {
+ my ($reindex, $last_commit) = @_;
+ return $last_commit unless $reindex;
+ ref($reindex) eq 'HASH' ? $reindex->{from} : '';
+}
+
# indexes all unindexed messages (v1 only)
sub _index_sync {
my ($self, $opts) = @_;
@@ -705,7 +713,7 @@ sub _index_sync {
do {
$xlog = undef;
$last_commit = _last_x_commit($self, $mm);
- $lx = $opts->{reindex} ? '' : $last_commit;
+ $lx = reindex_from($opts->{reindex}, $last_commit);
$self->{over}->rollback_lazy;
$self->{over}->disconnect;
diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm
index 3dd606e..1ee19b2 100644
--- a/lib/PublicInbox/V2Writable.pm
+++ b/lib/PublicInbox/V2Writable.pm
@@ -238,7 +238,7 @@ sub idx_part {
# idempotent
sub idx_init {
- my ($self) = @_;
+ my ($self, $opt) = @_;
return if $self->{idx_parts};
my $ibx = $self->{-inbox};
@@ -264,7 +264,7 @@ sub idx_init {
my $over = $self->{over};
$ibx->umask_prepare;
$ibx->with_umask(sub {
- $self->lock_acquire;
+ $self->lock_acquire unless ($opt && $opt->{-skip_lock});
$over->create;
# -compact can change partition count while -watch is idle
@@ -924,6 +924,19 @@ sub unindex {
qw(-c gc.reflogExpire=now gc --prune=all)]);
}
+sub index_ranges ($$$) {
+ my ($self, $reindex, $epoch_max) = @_;
+ return last_commits($self, $epoch_max) unless $reindex;
+
+ return [] if ref($reindex) ne 'HASH';
+
+ my $ranges = $reindex->{from}; # arrayref;
+ if (ref($ranges) ne 'ARRAY') {
+ die 'BUG: $reindex->{from} not an ARRAY';
+ }
+ $ranges;
+}
+
# called for public-inbox-index
sub index_sync {
my ($self, $opts) = @_;
@@ -931,10 +944,10 @@ sub index_sync {
my $epoch_max;
my $latest = git_dir_latest($self, \$epoch_max);
return unless defined $latest;
- $self->idx_init; # acquire lock
+ $self->idx_init($opts); # acquire lock
my $mm_tmp = $self->{mm}->tmp_clone;
my $reindex = $opts->{reindex};
- my $ranges = $reindex ? [] : $self->last_commits($epoch_max);
+ my $ranges = index_ranges($self, $reindex, $epoch_max);
my $high = $self->{mm}->num_highwater();
my $regen = $self->index_prepare($opts, $epoch_max, $ranges);
diff --git a/lib/PublicInbox/Xapcmd.pm b/lib/PublicInbox/Xapcmd.pm
index d2de874..4555340 100644
--- a/lib/PublicInbox/Xapcmd.pm
+++ b/lib/PublicInbox/Xapcmd.pm
@@ -5,6 +5,7 @@ use strict;
use warnings;
use PublicInbox::Spawn qw(which spawn);
use PublicInbox::Over;
+use PublicInbox::Search;
use File::Temp qw(tempdir);
use File::Path qw(remove_tree);
@@ -12,20 +13,33 @@ use File::Path qw(remove_tree);
# commands with a version number suffix (e.g. "xapian-compact-1.5")
our $XAPIAN_COMPACT = $ENV{XAPIAN_COMPACT} || 'xapian-compact';
-sub commit_changes ($$$) {
- my ($im, $old, $new) = @_;
+sub commit_changes ($$$$) {
+ my ($ibx, $old, $new, $opt) = @_;
+
+ my $reindex = $opt->{reindex};
+ my $im = $ibx->importer(0);
+ $im->lock_acquire if $reindex;
+
my @st = stat($old) or die "failed to stat($old): $!\n";
my $over = "$old/over.sqlite3";
if (-f $over) {
$over = PublicInbox::Over->new($over);
$over->connect->sqlite_backup_to_file("$new/over.sqlite3");
+ $over = undef;
}
rename($old, "$new/old") or die "rename $old => $new/old: $!\n";
chmod($st[2] & 07777, $new) or die "chmod $old: $!\n";
rename($new, $old) or die "rename $new => $old: $!\n";
- $im->lock_release;
remove_tree("$old/old") or die "failed to remove $old/old: $!\n";
+
+ if ($reindex) {
+ $opt->{-skip_lock} = 1;
+ PublicInbox::Admin::index_inbox($ibx, $opt);
+ # implicit lock_release
+ } else {
+ $im->lock_release;
+ }
}
sub xspawn {
@@ -47,6 +61,27 @@ sub runnable_or_die ($) {
which($exe) or die "$exe not found in PATH\n";
}
+sub prepare_reindex ($$) {
+ my ($ibx, $reindex) = @_;
+ if ($ibx->{version} == 1) {
+ my $dir = $ibx->search->xdir(1);
+ my $xdb = Search::Xapian::Database->new($dir);
+ if (my $lc = $xdb->get_metadata('last_commit')) {
+ $reindex->{from} = $lc;
+ }
+ } else { # v2
+ my $v2w = $ibx->importer(0);
+ my $max;
+ $v2w->git_dir_latest(\$max) or return;
+ my $from = $reindex->{from};
+ my $mm = $ibx->mm;
+ my $v = PublicInbox::Search::SCHEMA_VERSION();
+ foreach my $i (0..$max) {
+ $from->[$i] = $mm->last_commit_xap($v, $i);
+ }
+ }
+}
+
sub run {
my ($ibx, $cmd, $env, $opt) = @_;
$opt ||= {};
@@ -54,8 +89,14 @@ sub run {
my $exe = $cmd->[0];
my $pfx = $exe;
runnable_or_die($XAPIAN_COMPACT) if $opt->{compact};
+
+ my $reindex; # v1:{ from => $x40 }, v2:{ from => [ $x40, $x40, .. ] } }
+ my $from; # per-epoch ranges
+
if (ref($exe) eq 'CODE') {
$pfx = 'CODE';
+ $reindex = $opt->{reindex} = {};
+ $from = $reindex->{from} = [];
require Search::Xapian::WritableDatabase;
} else {
runnable_or_die($exe);
@@ -64,7 +105,7 @@ sub run {
my $old = $ibx->search->xdir(1);
-d $old or die "$old does not exist\n";
my $new = tempdir("$pfx-XXXXXXXX", DIR => $dir);
- my $v = $ibx->{version} || 1;
+ my $v = $ibx->{version} ||= 1;
my @cmds;
if ($v == 1) {
push @cmds, [@$cmd, $old, $new];
@@ -85,6 +126,13 @@ sub run {
my $max = $opt->{jobs} || scalar(@cmds);
$ibx->with_umask(sub {
$im->lock_acquire;
+
+ # fine-grained locking if we prepare for reindex
+ if ($reindex) {
+ prepare_reindex($ibx, $reindex);
+ $im->lock_release;
+ }
+ delete($ibx->{$_}) for (qw(mm over search)); # cleanup
my %pids;
while (@cmds) {
while (scalar(keys(%pids)) < $max && scalar(@cmds)) {
@@ -98,7 +146,7 @@ sub run {
die join(' ', @$x)." failed: $?\n" if $?;
}
}
- commit_changes($im, $old, $new);
+ commit_changes($ibx, $old, $new, $opt);
});
}
--
EW
^ permalink raw reply related [flat|nested] 28+ messages in thread
* [PATCH 15/26] xcpdb: implement progress reporting
2019-05-23 9:36 [PATCH 00/26] xcpdb: ease Xapian DB format migrations Eric Wong
` (13 preceding siblings ...)
2019-05-23 9:36 ` [PATCH 14/26] xcpdb: use fine-grained locking Eric Wong
@ 2019-05-23 9:36 ` Eric Wong
2019-05-23 9:36 ` [PATCH 16/26] xcpdb: cleanup error handling and diagnosis Eric Wong
` (11 subsequent siblings)
26 siblings, 0 replies; 28+ messages in thread
From: Eric Wong @ 2019-05-23 9:36 UTC (permalink / raw)
To: meta
Copying an entire Xapian DB is horribly slow whether it's done
via Perl or copydatabase(1). So displaying some progress
indication is good for user experience.
While we're at it, prefix xapian-compact output, too, since the
output of parallel processes ends up clobbering each other.
---
lib/PublicInbox/Xapcmd.pm | 47 +++++++++++++++++++++++++++++++++++----
script/public-inbox-xcpdb | 5 +----
t/indexlevels-mirror.t | 4 ++--
3 files changed, 46 insertions(+), 10 deletions(-)
diff --git a/lib/PublicInbox/Xapcmd.pm b/lib/PublicInbox/Xapcmd.pm
index 4555340..99f0e7c 100644
--- a/lib/PublicInbox/Xapcmd.pm
+++ b/lib/PublicInbox/Xapcmd.pm
@@ -82,9 +82,21 @@ sub prepare_reindex ($$) {
}
}
+sub progress_prepare ($) {
+ my ($opt) = @_;
+ if ($opt->{quiet}) {
+ open my $null, '>', '/dev/null' or
+ die "failed to open /dev/null: $!\n";
+ $opt->{1} = fileno($null);
+ $opt->{-dev_null} = $null;
+ } else {
+ $opt->{-progress} = 1;
+ }
+}
+
sub run {
my ($ibx, $cmd, $env, $opt) = @_;
- $opt ||= {};
+ progress_prepare($opt ||= {});
my $dir = $ibx->{mainrepo} or die "no mainrepo in inbox\n";
my $exe = $cmd->[0];
my $pfx = $exe;
@@ -161,6 +173,8 @@ sub cpdb_retryable ($$) {
0;
}
+# Like copydatabase(1), this is horribly slow; and it doesn't seem due
+# to the overhead of Perl.
sub cpdb {
my ($args, $env, $opt) = @_;
my ($old, $new) = @$args;
@@ -172,6 +186,7 @@ sub cpdb {
my $creat = Search::Xapian::DB_CREATE();
my $dst = Search::Xapian::WritableDatabase->new($tmp, $creat);
my ($it, $end);
+ my ($pfx, $nr, $tot, $fmt); # progress output
do {
eval {
@@ -181,6 +196,13 @@ sub cpdb {
$it = $src->postlist_begin('');
$end = $src->postlist_end('');
+ if ($opt->{-progress}) {
+ $nr = 0;
+ $pfx = (split('/', $old))[-1].':';
+ $tot = $src->get_doccount;
+ $fmt = "$pfx % ".length($tot)."u/$tot\n";
+ warn "$pfx copying $tot documents\n";
+ }
};
} while (cpdb_retryable($src, $@));
@@ -191,6 +213,9 @@ sub cpdb {
my $doc = $src->get_document($docid);
$dst->replace_document($docid, $doc);
$it->inc;
+ if ($fmt && !(++$nr & 1023)) {
+ warn(sprintf($fmt, $nr));
+ }
}
# unlike copydatabase(1), we don't copy spelling
@@ -200,10 +225,12 @@ sub cpdb {
};
} while (cpdb_retryable($src, $@));
+ warn(sprintf($fmt, $nr)) if $fmt;
return unless $opt->{compact};
$src = $dst = undef; # flushes and closes
+ warn "$pfx compacting...\n" if $pfx;
# this is probably the best place to do xapian-compact
# since $dst isn't readable by HTTP or NNTP clients, yet:
my $cmd = [ $XAPIAN_COMPACT, '--no-renumber', $tmp, $new ];
@@ -212,10 +239,22 @@ sub cpdb {
defined(my $dst = $opt->{$fd}) or next;
$rdr->{$fd} = $dst;
}
+
+ my ($r, $w);
+ if ($pfx && pipe($r, $w)) {
+ $rdr->{1} = fileno($w);
+ }
my $pid = spawn($cmd, $env, $rdr);
- my $r = waitpid($pid, 0);
- if ($? || $r != $pid) {
- die join(' ', @$cmd)." failed: $? (pid=$pid, reaped=$r)\n";
+ if ($pfx) {
+ close $w or die "close: \$w: $!";
+ foreach (<$r>) {
+ s/\r/\r$pfx /g;
+ warn "$pfx $_";
+ }
+ }
+ my $rp = waitpid($pid, 0);
+ if ($? || $rp != $pid) {
+ die join(' ', @$cmd)." failed: $? (pid=$pid, reaped=$rp)\n";
}
remove_tree($tmp) or die "failed to remove $tmp: $!\n";
}
diff --git a/script/public-inbox-xcpdb b/script/public-inbox-xcpdb
index 78d37da..5b66337 100755
--- a/script/public-inbox-xcpdb
+++ b/script/public-inbox-xcpdb
@@ -9,12 +9,9 @@ use PublicInbox::Admin;
PublicInbox::Admin::require_or_die('-search');
my $usage = "Usage: public-inbox-xcpdb INBOX_DIR\n";
my $opt = {};
-GetOptions($opt, qw(compact)) or die "bad command-line args\n$usage";
+GetOptions($opt, qw(compact quiet|q)) or die "bad command-line args\n$usage";
my @ibxs = PublicInbox::Admin::resolve_inboxes(\@ARGV) or die $usage;
-
my $cmd = [ \&PublicInbox::Xapcmd::cpdb ];
-open my $null, '>', '/dev/null' or die "failed to open /dev/null: $!\n";
-$opt->{1} = fileno($null);
foreach (@ibxs) {
my $ibx = PublicInbox::InboxWritable->new($_);
# we rely on --no-renumber to keep docids synched to NNTP
diff --git a/t/indexlevels-mirror.t b/t/indexlevels-mirror.t
index 61053b6..57a776f 100644
--- a/t/indexlevels-mirror.t
+++ b/t/indexlevels-mirror.t
@@ -18,7 +18,7 @@ foreach my $mod (qw(DBD::SQLite)) {
my $path = 'blib/script';
my $index = "$path/public-inbox-index";
-my $xcpdb = "$path/public-inbox-xcpdb";
+my @xcpdb = ("$path/public-inbox-xcpdb", '-q');
my $mime = PublicInbox::MIME->create(
header => [
@@ -110,7 +110,7 @@ sub import_index_incremental {
$im->done;
if ($level ne 'basic') {
- is(system($xcpdb, $mirror), 0, "v$v xcpdb OK");
+ is(system(@xcpdb, $mirror), 0, "v$v xcpdb OK");
delete $ro_mirror->{$_} for (qw(over search));
($nr, $msgs) = $ro_mirror->search->query('m:m@2');
is($nr, 1, "v$v found m\@2 via Xapian on $level");
--
EW
^ permalink raw reply related [flat|nested] 28+ messages in thread
* [PATCH 16/26] xcpdb: cleanup error handling and diagnosis
2019-05-23 9:36 [PATCH 00/26] xcpdb: ease Xapian DB format migrations Eric Wong
` (14 preceding siblings ...)
2019-05-23 9:36 ` [PATCH 15/26] xcpdb: implement progress reporting Eric Wong
@ 2019-05-23 9:36 ` Eric Wong
2019-05-23 9:36 ` [PATCH 17/26] xapcmd: avoid EXDEV when finalizing changes Eric Wong
` (10 subsequent siblings)
26 siblings, 0 replies; 28+ messages in thread
From: Eric Wong @ 2019-05-23 9:36 UTC (permalink / raw)
To: meta
Running a full "public-inbox-index --reindex" in parallel
with "public-inbox-xcpdb" on the same inbox can still cause
problems, though.
---
lib/PublicInbox/Xapcmd.pm | 21 +++++++++++++--------
1 file changed, 13 insertions(+), 8 deletions(-)
diff --git a/lib/PublicInbox/Xapcmd.pm b/lib/PublicInbox/Xapcmd.pm
index 99f0e7c..697221d 100644
--- a/lib/PublicInbox/Xapcmd.pm
+++ b/lib/PublicInbox/Xapcmd.pm
@@ -163,13 +163,16 @@ sub run {
}
sub cpdb_retryable ($$) {
- my ($src, $err) = @_;
- if (ref($err) eq 'Search::Xapian::DatabaseModifiedError') {
- warn "$err, reopening and retrying\n";
+ my ($src, $pfx) = @_;
+ if (ref($@) eq 'Search::Xapian::DatabaseModifiedError') {
+ warn "$pfx Xapian DB modified, reopening and retrying\n";
$src->reopen;
return 1;
}
- die $err if $err;
+ if ($@) {
+ warn "$pfx E: ", ref($@), "\n";
+ die;
+ }
0;
}
@@ -186,7 +189,8 @@ sub cpdb {
my $creat = Search::Xapian::DB_CREATE();
my $dst = Search::Xapian::WritableDatabase->new($tmp, $creat);
my ($it, $end);
- my ($pfx, $nr, $tot, $fmt); # progress output
+ my $pfx = '';
+ my ($nr, $tot, $fmt); # progress output
do {
eval {
@@ -196,15 +200,15 @@ sub cpdb {
$it = $src->postlist_begin('');
$end = $src->postlist_end('');
+ $pfx = (split('/', $old))[-1].':';
if ($opt->{-progress}) {
$nr = 0;
- $pfx = (split('/', $old))[-1].':';
$tot = $src->get_doccount;
$fmt = "$pfx % ".length($tot)."u/$tot\n";
warn "$pfx copying $tot documents\n";
}
};
- } while (cpdb_retryable($src, $@));
+ } while (cpdb_retryable($src, $pfx));
do {
eval {
@@ -223,12 +227,13 @@ sub cpdb {
# the Perl APIs don't expose iterators for them
# (and public-inbox does not use those features)
};
- } while (cpdb_retryable($src, $@));
+ } while (cpdb_retryable($src, $pfx));
warn(sprintf($fmt, $nr)) if $fmt;
return unless $opt->{compact};
$src = $dst = undef; # flushes and closes
+ $pfx = undef unless $fmt;
warn "$pfx compacting...\n" if $pfx;
# this is probably the best place to do xapian-compact
--
EW
^ permalink raw reply related [flat|nested] 28+ messages in thread
* [PATCH 17/26] xapcmd: avoid EXDEV when finalizing changes
2019-05-23 9:36 [PATCH 00/26] xcpdb: ease Xapian DB format migrations Eric Wong
` (15 preceding siblings ...)
2019-05-23 9:36 ` [PATCH 16/26] xcpdb: cleanup error handling and diagnosis Eric Wong
@ 2019-05-23 9:36 ` Eric Wong
2019-05-23 9:36 ` [PATCH 18/26] doc: xcpdb: update to reflect the current state Eric Wong
` (9 subsequent siblings)
26 siblings, 0 replies; 28+ messages in thread
From: Eric Wong @ 2019-05-23 9:36 UTC (permalink / raw)
To: meta
By creating temporary directories as deep as possible,
we can allow v2 repositories to have `xap$SCHEMA_VERSION'
(e.g. `xap15') reside on a separate FS.
We also check st_dev ahead-of-time to avoid doing work which
will fail with EXDEV. Of course, another process may still
move/change things around.
---
lib/PublicInbox/Xapcmd.pm | 59 ++++++++++++++++++++++++++-------------
1 file changed, 40 insertions(+), 19 deletions(-)
diff --git a/lib/PublicInbox/Xapcmd.pm b/lib/PublicInbox/Xapcmd.pm
index 697221d..860f90a 100644
--- a/lib/PublicInbox/Xapcmd.pm
+++ b/lib/PublicInbox/Xapcmd.pm
@@ -8,31 +8,36 @@ use PublicInbox::Over;
use PublicInbox::Search;
use File::Temp qw(tempdir);
use File::Path qw(remove_tree);
+use File::Basename qw(dirname);
# support testing with dev versions of Xapian which installs
# commands with a version number suffix (e.g. "xapian-compact-1.5")
our $XAPIAN_COMPACT = $ENV{XAPIAN_COMPACT} || 'xapian-compact';
-sub commit_changes ($$$$) {
- my ($ibx, $old, $new, $opt) = @_;
+sub commit_changes ($$$) {
+ my ($ibx, $tmp, $opt) = @_;
my $reindex = $opt->{reindex};
my $im = $ibx->importer(0);
$im->lock_acquire if $reindex;
- my @st = stat($old) or die "failed to stat($old): $!\n";
+ while (my ($old, $new) = each %$tmp) {
+ my @st = stat($old) or die "failed to stat($old): $!\n";
- my $over = "$old/over.sqlite3";
- if (-f $over) {
- $over = PublicInbox::Over->new($over);
- $over->connect->sqlite_backup_to_file("$new/over.sqlite3");
- $over = undef;
- }
- rename($old, "$new/old") or die "rename $old => $new/old: $!\n";
- chmod($st[2] & 07777, $new) or die "chmod $old: $!\n";
- rename($new, $old) or die "rename $new => $old: $!\n";
- remove_tree("$old/old") or die "failed to remove $old/old: $!\n";
+ my $over = "$old/over.sqlite3";
+ if (-f $over) { # only for v1, v2 over is untouched
+ $over = PublicInbox::Over->new($over);
+ my $tmp_over = "$new/over.sqlite3";
+ $over->connect->sqlite_backup_to_file($tmp_over);
+ $over = undef;
+ }
+ rename($old, "$new/old") or die "rename $old => $new/old: $!\n";
+ chmod($st[2] & 07777, $new) or die "chmod $old: $!\n";
+ rename($new, $old) or die "rename $new => $old: $!\n";
+ my $prev = "$old/old";
+ remove_tree($prev) or die "failed to remove $prev: $!\n";
+ }
if ($reindex) {
$opt->{-skip_lock} = 1;
PublicInbox::Admin::index_inbox($ibx, $opt);
@@ -94,19 +99,23 @@ sub progress_prepare ($) {
}
}
+sub same_fs_or_die ($$) {
+ my ($x, $y) = @_;
+ return if ((stat($x))[0] == (stat($y))[0]); # 0 - st_dev
+ die "$x and $y reside on different filesystems\n";
+}
+
sub run {
my ($ibx, $cmd, $env, $opt) = @_;
progress_prepare($opt ||= {});
my $dir = $ibx->{mainrepo} or die "no mainrepo in inbox\n";
my $exe = $cmd->[0];
- my $pfx = $exe;
runnable_or_die($XAPIAN_COMPACT) if $opt->{compact};
my $reindex; # v1:{ from => $x40 }, v2:{ from => [ $x40, $x40, .. ] } }
my $from; # per-epoch ranges
if (ref($exe) eq 'CODE') {
- $pfx = 'CODE';
$reindex = $opt->{reindex} = {};
$from = $reindex->{from} = [];
require Search::Xapian::WritableDatabase;
@@ -116,16 +125,28 @@ sub run {
$ibx->umask_prepare;
my $old = $ibx->search->xdir(1);
-d $old or die "$old does not exist\n";
- my $new = tempdir("$pfx-XXXXXXXX", DIR => $dir);
+
+ my $tmp = {}; # old partition => new (tmp) partition
my $v = $ibx->{version} ||= 1;
my @cmds;
+
+ # we want temporary directories to be as deep as possible,
+ # so v2 partitions can keep "xap$SCHEMA_VERSION" on a separate FS.
if ($v == 1) {
- push @cmds, [@$cmd, $old, $new];
+ my $old_parent = dirname($old);
+ same_fs_or_die($old_parent, $old);
+ $tmp->{$old} = tempdir('xapcmd-XXXXXXXX', DIR => $old_parent);
+ push @cmds, [ @$cmd, $old, $tmp->{$old} ];
} else {
opendir my $dh, $old or die "Failed to opendir $old: $!\n";
while (defined(my $dn = readdir($dh))) {
if ($dn =~ /\A\d+\z/) {
- push @cmds, [@$cmd, "$old/$dn", "$new/$dn"];
+ my $tmpl = "$dn-XXXXXXXX";
+ my $dst = tempdir($tmpl, DIR => $old);
+ same_fs_or_die($old, $dst);
+ my $cur = "$old/$dn";
+ push @cmds, [@$cmd, $cur, $dst ];
+ $tmp->{$cur} = $dst;
} elsif ($dn eq '.' || $dn eq '..') {
} elsif ($dn =~ /\Aover\.sqlite3/) {
} else {
@@ -158,7 +179,7 @@ sub run {
die join(' ', @$x)." failed: $?\n" if $?;
}
}
- commit_changes($ibx, $old, $new, $opt);
+ commit_changes($ibx, $tmp, $opt);
});
}
--
EW
^ permalink raw reply related [flat|nested] 28+ messages in thread
* [PATCH 18/26] doc: xcpdb: update to reflect the current state
2019-05-23 9:36 [PATCH 00/26] xcpdb: ease Xapian DB format migrations Eric Wong
` (16 preceding siblings ...)
2019-05-23 9:36 ` [PATCH 17/26] xapcmd: avoid EXDEV when finalizing changes Eric Wong
@ 2019-05-23 9:36 ` Eric Wong
2019-05-23 9:36 ` [PATCH 19/26] xapcmd: use "print STDERR" for progress reporting Eric Wong
` (8 subsequent siblings)
26 siblings, 0 replies; 28+ messages in thread
From: Eric Wong @ 2019-05-23 9:36 UTC (permalink / raw)
To: meta
It is no longer a wrapper around copydatabase(1), since
copydatabase did not recover from DatabaseModifiedError.
---
Documentation/public-inbox-xcpdb.pod | 18 ++++++++++++------
1 file changed, 12 insertions(+), 6 deletions(-)
diff --git a/Documentation/public-inbox-xcpdb.pod b/Documentation/public-inbox-xcpdb.pod
index 4ff5186..c47500b 100644
--- a/Documentation/public-inbox-xcpdb.pod
+++ b/Documentation/public-inbox-xcpdb.pod
@@ -8,13 +8,11 @@ public-inbox-xcpdb - copy Xapian DBs (for format upgrades)
=head1 DESCRIPTION
-public-inbox-xcpdb is a wrapper for L<copydatabase(1)> for
+public-inbox-xcpdb is similar to L<copydatabase(1)> for
upgrading to the latest database format supported by Xapian
-(e.g. "glass" or "honey").
-
-It locks the inbox and prevents other processes such as
-L<public-inbox-watch(1)> and L<public-inbox-mda(1)> from
-writing while it operates.
+(e.g. "glass" or "honey"), but is designed to tolerate and
+recover from Xapian database modifications from
+L<public-inbox-watch(1)> or L<public-inbox-mda(1)>.
This is intended for upgrading the database format used by
Xapian. It DOES NOT upgrade the schema used by the
@@ -29,6 +27,14 @@ public-inbox search interface (see L<public-inbox-index(1)>).
The default config file, normally "~/.public-inbox/config".
See L<public-inbox-config(5)>
+=item XAPIAN_FLUSH_THRESHOLD
+
+The number of documents to update before committing changes to
+disk. This environment variable is handled directly by Xapian; refer to
+Xapian API documentation for more details.
+
+Default: 10000
+
=back
=head1 UPGRADING
--
EW
^ permalink raw reply related [flat|nested] 28+ messages in thread
* [PATCH 19/26] xapcmd: use "print STDERR" for progress reporting
2019-05-23 9:36 [PATCH 00/26] xcpdb: ease Xapian DB format migrations Eric Wong
` (17 preceding siblings ...)
2019-05-23 9:36 ` [PATCH 18/26] doc: xcpdb: update to reflect the current state Eric Wong
@ 2019-05-23 9:36 ` Eric Wong
2019-05-23 9:36 ` [PATCH 20/26] xcpdb: show re-indexing progress Eric Wong
` (7 subsequent siblings)
26 siblings, 0 replies; 28+ messages in thread
From: Eric Wong @ 2019-05-23 9:36 UTC (permalink / raw)
To: meta
`warn' is reserved for actual warnings, as it respects
$SIG{__WARN__} and we rely on that override to print
message context information when we are indexing.
---
lib/PublicInbox/Xapcmd.pm | 17 +++++++++--------
1 file changed, 9 insertions(+), 8 deletions(-)
diff --git a/lib/PublicInbox/Xapcmd.pm b/lib/PublicInbox/Xapcmd.pm
index 860f90a..aa3e4c0 100644
--- a/lib/PublicInbox/Xapcmd.pm
+++ b/lib/PublicInbox/Xapcmd.pm
@@ -95,7 +95,7 @@ sub progress_prepare ($) {
$opt->{1} = fileno($null);
$opt->{-dev_null} = $null;
} else {
- $opt->{-progress} = 1;
+ $opt->{-progress} = sub { print STDERR @_ };
}
}
@@ -212,6 +212,7 @@ sub cpdb {
my ($it, $end);
my $pfx = '';
my ($nr, $tot, $fmt); # progress output
+ my $pr = $opt->{-progress};
do {
eval {
@@ -222,11 +223,11 @@ sub cpdb {
$it = $src->postlist_begin('');
$end = $src->postlist_end('');
$pfx = (split('/', $old))[-1].':';
- if ($opt->{-progress}) {
+ if ($pr) {
$nr = 0;
$tot = $src->get_doccount;
$fmt = "$pfx % ".length($tot)."u/$tot\n";
- warn "$pfx copying $tot documents\n";
+ $pr->("$pfx copying $tot documents\n");
}
};
} while (cpdb_retryable($src, $pfx));
@@ -238,8 +239,8 @@ sub cpdb {
my $doc = $src->get_document($docid);
$dst->replace_document($docid, $doc);
$it->inc;
- if ($fmt && !(++$nr & 1023)) {
- warn(sprintf($fmt, $nr));
+ if ($pr && !(++$nr & 1023)) {
+ $pr->(sprintf($fmt, $nr));
}
}
@@ -250,13 +251,13 @@ sub cpdb {
};
} while (cpdb_retryable($src, $pfx));
- warn(sprintf($fmt, $nr)) if $fmt;
+ $pr->(sprintf($fmt, $nr)) if $pr;
return unless $opt->{compact};
$src = $dst = undef; # flushes and closes
$pfx = undef unless $fmt;
- warn "$pfx compacting...\n" if $pfx;
+ $pr->("$pfx compacting...\n") if $pr;
# this is probably the best place to do xapian-compact
# since $dst isn't readable by HTTP or NNTP clients, yet:
my $cmd = [ $XAPIAN_COMPACT, '--no-renumber', $tmp, $new ];
@@ -275,7 +276,7 @@ sub cpdb {
close $w or die "close: \$w: $!";
foreach (<$r>) {
s/\r/\r$pfx /g;
- warn "$pfx $_";
+ $pr->("$pfx $_");
}
}
my $rp = waitpid($pid, 0);
--
EW
^ permalink raw reply related [flat|nested] 28+ messages in thread
* [PATCH 20/26] xcpdb: show re-indexing progress
2019-05-23 9:36 [PATCH 00/26] xcpdb: ease Xapian DB format migrations Eric Wong
` (18 preceding siblings ...)
2019-05-23 9:36 ` [PATCH 19/26] xapcmd: use "print STDERR" for progress reporting Eric Wong
@ 2019-05-23 9:36 ` Eric Wong
2019-05-23 9:36 ` [PATCH 21/26] xcpdb: remove temporary directories on aborts Eric Wong
` (6 subsequent siblings)
26 siblings, 0 replies; 28+ messages in thread
From: Eric Wong @ 2019-05-23 9:36 UTC (permalink / raw)
To: meta
Emit information about reindexing git revision ranges when used
with xcpdb. Additionally, distinguish Xapian copy output from
v2 git epoch counting by increasing directory context info.
For now, v1 batches are emitted. v2 indexing is still
missing progress reporting for batches, as the data structures
for reindexing would benefit from a refactoring, first.
This does not currently affect the use of public-inbox-index,
but may in the future.
---
lib/PublicInbox/SearchIdx.pm | 24 ++++++++++++++++--------
lib/PublicInbox/V2Writable.pm | 10 +++++++++-
lib/PublicInbox/Xapcmd.pm | 8 ++++----
3 files changed, 29 insertions(+), 13 deletions(-)
diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm
index 0aeeb6b..9c29106 100644
--- a/lib/PublicInbox/SearchIdx.pm
+++ b/lib/PublicInbox/SearchIdx.pm
@@ -549,12 +549,12 @@ sub index_sync {
$self->{-inbox}->with_umask(sub { $self->_index_sync($opts) })
}
-sub batch_adjust ($$$$) {
- my ($max, $bytes, $batch_cb, $latest) = @_;
+sub batch_adjust ($$$$$) {
+ my ($max, $bytes, $batch_cb, $latest, $nr) = @_;
$$max -= $bytes;
if ($$max <= 0) {
$$max = BATCH_BYTES;
- $batch_cb->($latest);
+ $batch_cb->($nr, $latest);
}
}
@@ -573,6 +573,7 @@ sub read_log {
my %D;
my $line;
my $newest;
+ my $nr = 0;
while (defined($line = <$log>)) {
if ($line =~ /$addmsg/o) {
my $blob = $1;
@@ -584,7 +585,7 @@ sub read_log {
next;
}
my $mime = do_cat_mail($git, $blob, \$bytes) or next;
- batch_adjust(\$max, $bytes, $batch_cb, $latest);
+ batch_adjust(\$max, $bytes, $batch_cb, $latest, ++$nr);
$add_cb->($self, $mime, $bytes, $blob);
} elsif ($line =~ /$delmsg/o) {
my $blob = $1;
@@ -599,7 +600,7 @@ sub read_log {
my $mime = do_cat_mail($git, $blob, \$bytes) or next;
$del_cb->($self, $mime);
}
- $batch_cb->($latest, $newest);
+ $batch_cb->($nr, $latest, $newest);
}
sub _msgmap_init {
@@ -612,7 +613,7 @@ sub _msgmap_init {
}
sub _git_log {
- my ($self, $range) = @_;
+ my ($self, $opts, $range) = @_;
my $git = $self->{git};
if (index($range, '..') < 0) {
@@ -629,12 +630,17 @@ sub _git_log {
# Count the new files so they can be added newest to oldest
# and still have numbers increasing from oldest to newest
my $fcount = 0;
+ my $pr = $opts->{-progress};
+ $pr->("counting changes\n\t$range ... ") if $pr;
# can't use 'rev-list --count' if we use --diff-filter
my $fh = $git->popen(qw(log --pretty=tformat:%h
--no-notes --no-color --no-renames
--diff-filter=AM), $range);
++$fcount while <$fh>;
+ close $fh;
my $high = $self->{mm}->num_highwater;
+ $pr->("$fcount\n") if $pr; # continue previous line
+ $self->{ntodo} = $fcount;
if (index($range, '..') < 0) {
if ($high && $high == $fcount) {
@@ -707,6 +713,7 @@ sub _index_sync {
my ($last_commit, $lx, $xlog);
my $git = $self->{git};
$git->batch_prepare;
+ my $pr = $opts->{-progress};
my $xdb = $self->begin_txn_lazy;
my $mm = _msgmap_init($self);
@@ -724,14 +731,14 @@ sub _index_sync {
# ensure we leak no FDs to "git log" with Xapian <= 1.2
my $range = $lx eq '' ? $tip : "$lx..$tip";
- $xlog = _git_log($self, $range);
+ $xlog = _git_log($self, $opts, $range);
$xdb = $self->begin_txn_lazy;
} while (_last_x_commit($self, $mm) ne $last_commit);
my $dbh = $mm->{dbh} if $mm;
my $cb = sub {
- my ($commit, $newest) = @_;
+ my ($nr, $commit, $newest) = @_;
if ($dbh) {
if ($newest) {
my $cur = $mm->last_commit || '';
@@ -751,6 +758,7 @@ sub _index_sync {
$git->cleanup;
$xdb = _xdb_release($self);
# let another process do some work... <
+ $pr->("indexed $nr/$self->{ntodo}\n") if $pr && $nr;
if (!$newest) {
$xdb = $self->begin_txn_lazy;
$dbh->begin_work if $dbh;
diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm
index 1ee19b2..1170f32 100644
--- a/lib/PublicInbox/V2Writable.pm
+++ b/lib/PublicInbox/V2Writable.pm
@@ -777,6 +777,9 @@ sub reindex_oid {
$git->cleanup;
$mm_tmp->atfork_prepare;
$self->done; # release lock
+
+ # TODO: print progress info, here
+
# allow -watch or -mda to write...
$self->idx_init; # reacquire lock
$mm_tmp->atfork_parent;
@@ -844,6 +847,7 @@ $range
sub index_prepare {
my ($self, $opts, $epoch_max, $ranges) = @_;
+ my $pr = $opts->{-progress};
my $regen_max = 0;
my $head = $self->{-inbox}->{ref_head} || 'refs/heads/master';
for (my $i = $epoch_max; $i >= 0; $i--) {
@@ -858,10 +862,14 @@ sub index_prepare {
$ranges->[$i] = $range;
# can't use 'rev-list --count' if we use --diff-filter
+ $pr->("$i.git counting changes\n\t$range ... ") if $pr;
+ my $n = 0;
my $fh = $git->popen(qw(log --pretty=tformat:%H
--no-notes --no-color --no-renames
--diff-filter=AM), $range, '--', 'm');
- ++$regen_max while <$fh>;
+ ++$n while <$fh>;
+ $pr->("$n\n") if $pr;
+ $regen_max += $n;
}
\$regen_max;
}
diff --git a/lib/PublicInbox/Xapcmd.pm b/lib/PublicInbox/Xapcmd.pm
index aa3e4c0..0e44804 100644
--- a/lib/PublicInbox/Xapcmd.pm
+++ b/lib/PublicInbox/Xapcmd.pm
@@ -222,10 +222,11 @@ sub cpdb {
$it = $src->postlist_begin('');
$end = $src->postlist_end('');
- $pfx = (split('/', $old))[-1].':';
if ($pr) {
$nr = 0;
$tot = $src->get_doccount;
+ my @p = split('/', $old);
+ $pfx = "$p[-2]/$p[-1]:";
$fmt = "$pfx % ".length($tot)."u/$tot\n";
$pr->("$pfx copying $tot documents\n");
}
@@ -255,7 +256,6 @@ sub cpdb {
return unless $opt->{compact};
$src = $dst = undef; # flushes and closes
- $pfx = undef unless $fmt;
$pr->("$pfx compacting...\n") if $pr;
# this is probably the best place to do xapian-compact
@@ -268,11 +268,11 @@ sub cpdb {
}
my ($r, $w);
- if ($pfx && pipe($r, $w)) {
+ if ($pr && pipe($r, $w)) {
$rdr->{1} = fileno($w);
}
my $pid = spawn($cmd, $env, $rdr);
- if ($pfx) {
+ if ($pr) {
close $w or die "close: \$w: $!";
foreach (<$r>) {
s/\r/\r$pfx /g;
--
EW
^ permalink raw reply related [flat|nested] 28+ messages in thread
* [PATCH 21/26] xcpdb: remove temporary directories on aborts
2019-05-23 9:36 [PATCH 00/26] xcpdb: ease Xapian DB format migrations Eric Wong
` (19 preceding siblings ...)
2019-05-23 9:36 ` [PATCH 20/26] xcpdb: show re-indexing progress Eric Wong
@ 2019-05-23 9:36 ` Eric Wong
2019-05-23 9:37 ` [PATCH 22/26] compact: reuse infrastructure from xcpdb Eric Wong
` (5 subsequent siblings)
26 siblings, 0 replies; 28+ messages in thread
From: Eric Wong @ 2019-05-23 9:36 UTC (permalink / raw)
To: meta
Clean up temporary directories on common termination signals
(INT, HUP, PIPE, TERM), but only if they are not in the process
of being committed via the rename() sequence.
---
lib/PublicInbox/Xapcmd.pm | 39 +++++++++++++++++++++++++++++++++++++--
1 file changed, 37 insertions(+), 2 deletions(-)
diff --git a/lib/PublicInbox/Xapcmd.pm b/lib/PublicInbox/Xapcmd.pm
index 0e44804..06389dd 100644
--- a/lib/PublicInbox/Xapcmd.pm
+++ b/lib/PublicInbox/Xapcmd.pm
@@ -31,13 +31,15 @@ sub commit_changes ($$$) {
$over->connect->sqlite_backup_to_file($tmp_over);
$over = undef;
}
+ chmod($st[2] & 07777, $new) or die "chmod $old: $!\n";
+ # Xtmpdir->DESTROY won't remove $new after this:
rename($old, "$new/old") or die "rename $old => $new/old: $!\n";
- chmod($st[2] & 07777, $new) or die "chmod $old: $!\n";
rename($new, $old) or die "rename $new => $old: $!\n";
my $prev = "$old/old";
remove_tree($prev) or die "failed to remove $prev: $!\n";
}
+ $tmp->done;
if ($reindex) {
$opt->{-skip_lock} = 1;
PublicInbox::Admin::index_inbox($ibx, $opt);
@@ -126,7 +128,7 @@ sub run {
my $old = $ibx->search->xdir(1);
-d $old or die "$old does not exist\n";
- my $tmp = {}; # old partition => new (tmp) partition
+ my $tmp = PublicInbox::Xtmpdirs->new;
my $v = $ibx->{version} ||= 1;
my @cmds;
@@ -286,4 +288,37 @@ sub cpdb {
remove_tree($tmp) or die "failed to remove $tmp: $!\n";
}
+# slightly easier to manage than END{} blocks
+package PublicInbox::Xtmpdirs;
+use strict;
+use warnings;
+use File::Path qw(remove_tree);
+my %owner;
+
+sub new {
+ # http://www.tldp.org/LDP/abs/html/exitcodes.html
+ $SIG{INT} = sub { exit(130) };
+ $SIG{HUP} = $SIG{PIPE} = $SIG{TERM} = sub { exit(1) };
+ my $self = bless {}, $_[0]; # old partition => new (tmp) partition
+ $owner{"$self"} = $$;
+ $self;
+}
+
+sub done {
+ my ($self) = @_;
+ delete $owner{"$self"};
+ $SIG{INT} = $SIG{HUP} = $SIG{PIPE} = $SIG{TERM} = 'DEFAULT';
+ %$self = ();
+}
+
+sub DESTROY {
+ my ($self) = @_;
+ my $owner_pid = delete $owner{"$self"} or return;
+ return if $owner_pid != $$;
+ foreach my $new (values %$self) {
+ remove_tree($new) unless -d "$new/old";
+ }
+ $SIG{INT} = $SIG{HUP} = $SIG{PIPE} = $SIG{TERM} = 'DEFAULT';
+}
+
1;
--
EW
^ permalink raw reply related [flat|nested] 28+ messages in thread
* [PATCH 22/26] compact: reuse infrastructure from xcpdb
2019-05-23 9:36 [PATCH 00/26] xcpdb: ease Xapian DB format migrations Eric Wong
` (20 preceding siblings ...)
2019-05-23 9:36 ` [PATCH 21/26] xcpdb: remove temporary directories on aborts Eric Wong
@ 2019-05-23 9:37 ` Eric Wong
2019-05-23 9:37 ` [PATCH 23/26] xcpdb|compact: support some xapian-compact switches Eric Wong
` (4 subsequent siblings)
26 siblings, 0 replies; 28+ messages in thread
From: Eric Wong @ 2019-05-23 9:37 UTC (permalink / raw)
To: meta
Since -xcpdb is a superset of -compact, we can reuse much of
its code for driving compact.
For compact (only), this is slightly less memory efficient since
it requires an extra process per-partition, but we get to prefix
the output with the partition name for more readable output.
---
lib/PublicInbox/Xapcmd.pm | 132 +++++++++++++++++++-----------------
script/public-inbox-compact | 6 +-
script/public-inbox-xcpdb | 3 +-
3 files changed, 76 insertions(+), 65 deletions(-)
diff --git a/lib/PublicInbox/Xapcmd.pm b/lib/PublicInbox/Xapcmd.pm
index 06389dd..488c616 100644
--- a/lib/PublicInbox/Xapcmd.pm
+++ b/lib/PublicInbox/Xapcmd.pm
@@ -19,7 +19,7 @@ sub commit_changes ($$$) {
my $reindex = $opt->{reindex};
my $im = $ibx->importer(0);
- $im->lock_acquire if $reindex;
+ $im->lock_acquire if !$opt->{-coarse_lock};
while (my ($old, $new) = each %$tmp) {
my @st = stat($old) or die "failed to stat($old): $!\n";
@@ -40,7 +40,7 @@ sub commit_changes ($$$) {
remove_tree($prev) or die "failed to remove $prev: $!\n";
}
$tmp->done;
- if ($reindex) {
+ if (!$opt->{-coarse_lock}) {
$opt->{-skip_lock} = 1;
PublicInbox::Admin::index_inbox($ibx, $opt);
# implicit lock_release
@@ -49,18 +49,13 @@ sub commit_changes ($$$) {
}
}
-sub xspawn {
- my ($cmd, $env, $opt) = @_;
- if (ref($cmd->[0]) eq 'CODE') {
- my $cb = shift(@$cmd); # $cb = cpdb()
- defined(my $pid = fork) or die "fork: $!";
- return $pid if $pid > 0;
- eval { $cb->($cmd, $env, $opt) };
- die $@ if $@;
- exit 0;
- } else {
- spawn($cmd, $env, $opt);
- }
+sub cb_spawn {
+ my ($cb, $args, $opt) = @_; # $cb = cpdb() or compact()
+ defined(my $pid = fork) or die "fork: $!";
+ return $pid if $pid > 0;
+ eval { $cb->($args, $opt) };
+ die $@ if $@;
+ exit 0;
}
sub runnable_or_die ($) {
@@ -108,29 +103,27 @@ sub same_fs_or_die ($$) {
}
sub run {
- my ($ibx, $cmd, $env, $opt) = @_;
+ my ($ibx, $task, $opt) = @_; # task = 'cpdb' or 'compact'
+ my $cb = \&${\"PublicInbox::Xapcmd::$task"};
progress_prepare($opt ||= {});
my $dir = $ibx->{mainrepo} or die "no mainrepo in inbox\n";
- my $exe = $cmd->[0];
runnable_or_die($XAPIAN_COMPACT) if $opt->{compact};
-
my $reindex; # v1:{ from => $x40 }, v2:{ from => [ $x40, $x40, .. ] } }
my $from; # per-epoch ranges
- if (ref($exe) eq 'CODE') {
+ if (!$opt->{-coarse_lock}) {
$reindex = $opt->{reindex} = {};
$from = $reindex->{from} = [];
require Search::Xapian::WritableDatabase;
- } else {
- runnable_or_die($exe);
}
+
$ibx->umask_prepare;
my $old = $ibx->search->xdir(1);
-d $old or die "$old does not exist\n";
my $tmp = PublicInbox::Xtmpdirs->new;
my $v = $ibx->{version} ||= 1;
- my @cmds;
+ my @q;
# we want temporary directories to be as deep as possible,
# so v2 partitions can keep "xap$SCHEMA_VERSION" on a separate FS.
@@ -138,7 +131,7 @@ sub run {
my $old_parent = dirname($old);
same_fs_or_die($old_parent, $old);
$tmp->{$old} = tempdir('xapcmd-XXXXXXXX', DIR => $old_parent);
- push @cmds, [ @$cmd, $old, $tmp->{$old} ];
+ push @q, [ $old, $tmp->{$old} ];
} else {
opendir my $dh, $old or die "Failed to opendir $old: $!\n";
while (defined(my $dn = readdir($dh))) {
@@ -147,7 +140,7 @@ sub run {
my $dst = tempdir($tmpl, DIR => $old);
same_fs_or_die($old, $dst);
my $cur = "$old/$dn";
- push @cmds, [@$cmd, $cur, $dst ];
+ push @q, [ $cur, $dst ];
$tmp->{$cur} = $dst;
} elsif ($dn eq '.' || $dn eq '..') {
} elsif ($dn =~ /\Aover\.sqlite3/) {
@@ -155,30 +148,31 @@ sub run {
warn "W: skipping unknown dir: $old/$dn\n"
}
}
- die "No Xapian parts found in $old\n" unless @cmds;
+ die "No Xapian parts found in $old\n" unless @q;
}
my $im = $ibx->importer(0);
- my $max = $opt->{jobs} || scalar(@cmds);
+ my $max = $opt->{jobs} || scalar(@q);
$ibx->with_umask(sub {
$im->lock_acquire;
# fine-grained locking if we prepare for reindex
- if ($reindex) {
+ if (!$opt->{-coarse_lock}) {
prepare_reindex($ibx, $reindex);
$im->lock_release;
}
+
delete($ibx->{$_}) for (qw(mm over search)); # cleanup
my %pids;
- while (@cmds) {
- while (scalar(keys(%pids)) < $max && scalar(@cmds)) {
- my $x = shift @cmds;
- $pids{xspawn($x, $env, $opt)} = $x;
+ while (@q) {
+ while (scalar(keys(%pids)) < $max && scalar(@q)) {
+ my $args = shift @q;
+ $pids{cb_spawn($cb, $args, $opt)} = $args;
}
while (scalar keys %pids) {
my $pid = waitpid(-1, 0);
- my $x = delete $pids{$pid};
- die join(' ', @$x)." failed: $?\n" if $?;
+ my $args = delete $pids{$pid};
+ die join(' ', @$args)." failed: $?\n" if $?;
}
}
commit_changes($ibx, $tmp, $opt);
@@ -199,10 +193,51 @@ sub cpdb_retryable ($$) {
0;
}
+sub progress_pfx ($) {
+ my @p = split('/', $_[0]);
+
+ # return "xap15/0" for v2, or "xapian15" for v1:
+ ($p[-1] =~ /\A\d+\z/) ? "$p[-2]/$p[-1]" : $p[-1];
+}
+
+# xapian-compact wrapper
+sub compact ($$) {
+ my ($args, $opt) = @_;
+ my ($src, $dst) = @$args;
+ my ($r, $w);
+ my $pfx = $opt->{-progress_pfx} ||= progress_pfx($src);
+ my $pr = $opt->{-progress};
+ my $rdr = {};
+
+ foreach my $fd (0..2) {
+ defined(my $dfd = $opt->{$fd}) or next;
+ $rdr->{$fd} = $dfd;
+ }
+ if ($pr) {
+ $pr->("$pfx compacting...\n");
+ $rdr->{1} = fileno($w) if pipe($r, $w);
+ }
+
+ # we rely on --no-renumber to keep docids synched to NNTP
+ my $cmd = [ $XAPIAN_COMPACT, '--no-renumber', $src, $dst ];
+ my $pid = spawn($cmd, undef, $rdr);
+ if ($pr) {
+ close $w or die "close: \$w: $!";
+ foreach (<$r>) {
+ s/\r/\r$pfx /g;
+ $pr->("$pfx $_");
+ }
+ }
+ my $rp = waitpid($pid, 0);
+ if ($? || $rp != $pid) {
+ die join(' ', @$cmd)." failed: $? (pid=$pid, reaped=$rp)\n";
+ }
+}
+
# Like copydatabase(1), this is horribly slow; and it doesn't seem due
# to the overhead of Perl.
-sub cpdb {
- my ($args, $env, $opt) = @_;
+sub cpdb ($$) {
+ my ($args, $opt) = @_;
my ($old, $new) = @$args;
my $src = Search::Xapian::Database->new($old);
my $tmp = $opt->{compact} ? "$new.compact" : $new;
@@ -212,9 +247,9 @@ sub cpdb {
my $creat = Search::Xapian::DB_CREATE();
my $dst = Search::Xapian::WritableDatabase->new($tmp, $creat);
my ($it, $end);
- my $pfx = '';
my ($nr, $tot, $fmt); # progress output
my $pr = $opt->{-progress};
+ my $pfx = $opt->{-progress_pfx} = progress_pfx($old);
do {
eval {
@@ -227,8 +262,6 @@ sub cpdb {
if ($pr) {
$nr = 0;
$tot = $src->get_doccount;
- my @p = split('/', $old);
- $pfx = "$p[-2]/$p[-1]:";
$fmt = "$pfx % ".length($tot)."u/$tot\n";
$pr->("$pfx copying $tot documents\n");
}
@@ -259,32 +292,9 @@ sub cpdb {
$src = $dst = undef; # flushes and closes
- $pr->("$pfx compacting...\n") if $pr;
# this is probably the best place to do xapian-compact
# since $dst isn't readable by HTTP or NNTP clients, yet:
- my $cmd = [ $XAPIAN_COMPACT, '--no-renumber', $tmp, $new ];
- my $rdr = {};
- foreach my $fd (0..2) {
- defined(my $dst = $opt->{$fd}) or next;
- $rdr->{$fd} = $dst;
- }
-
- my ($r, $w);
- if ($pr && pipe($r, $w)) {
- $rdr->{1} = fileno($w);
- }
- my $pid = spawn($cmd, $env, $rdr);
- if ($pr) {
- close $w or die "close: \$w: $!";
- foreach (<$r>) {
- s/\r/\r$pfx /g;
- $pr->("$pfx $_");
- }
- }
- my $rp = waitpid($pid, 0);
- if ($? || $rp != $pid) {
- die join(' ', @$cmd)." failed: $? (pid=$pid, reaped=$rp)\n";
- }
+ compact([ $tmp, $new ], $opt);
remove_tree($tmp) or die "failed to remove $tmp: $!\n";
}
diff --git a/script/public-inbox-compact b/script/public-inbox-compact
index 709fb92..4f58d5a 100755
--- a/script/public-inbox-compact
+++ b/script/public-inbox-compact
@@ -3,14 +3,16 @@
# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
use strict;
use warnings;
+use Getopt::Long qw(:config gnu_getopt no_ignore_case auto_abbrev);
use PublicInbox::InboxWritable;
use PublicInbox::Xapcmd;
use PublicInbox::Admin;
PublicInbox::Admin::require_or_die('-index');
my $usage = "Usage: public-inbox-compact REPO_DIR\n";
+my $opt = { compact => 1, -coarse_lock => 1 };
+GetOptions($opt, qw(quiet|q)) or die "bad command-line args\n$usage";
my @ibxs = PublicInbox::Admin::resolve_inboxes(\@ARGV) or die $usage;
foreach (@ibxs) {
my $ibx = PublicInbox::InboxWritable->new($_);
- # we rely on --no-renumber to keep docids synched to NNTP
- PublicInbox::Xapcmd::run($ibx, [qw(xapian-compact --no-renumber)]);
+ PublicInbox::Xapcmd::run($ibx, 'compact', $opt);
}
diff --git a/script/public-inbox-xcpdb b/script/public-inbox-xcpdb
index 5b66337..bda7be0 100755
--- a/script/public-inbox-xcpdb
+++ b/script/public-inbox-xcpdb
@@ -11,9 +11,8 @@ my $usage = "Usage: public-inbox-xcpdb INBOX_DIR\n";
my $opt = {};
GetOptions($opt, qw(compact quiet|q)) or die "bad command-line args\n$usage";
my @ibxs = PublicInbox::Admin::resolve_inboxes(\@ARGV) or die $usage;
-my $cmd = [ \&PublicInbox::Xapcmd::cpdb ];
foreach (@ibxs) {
my $ibx = PublicInbox::InboxWritable->new($_);
# we rely on --no-renumber to keep docids synched to NNTP
- PublicInbox::Xapcmd::run($ibx, $cmd, undef, $opt);
+ PublicInbox::Xapcmd::run($ibx, 'cpdb', $opt);
}
--
EW
^ permalink raw reply related [flat|nested] 28+ messages in thread
* [PATCH 23/26] xcpdb|compact: support some xapian-compact switches
2019-05-23 9:36 [PATCH 00/26] xcpdb: ease Xapian DB format migrations Eric Wong
` (21 preceding siblings ...)
2019-05-23 9:37 ` [PATCH 22/26] compact: reuse infrastructure from xcpdb Eric Wong
@ 2019-05-23 9:37 ` Eric Wong
2019-05-23 9:37 ` [PATCH 24/26] xapcmd: cleanup on interrupted xcpdb "--compact" Eric Wong
` (3 subsequent siblings)
26 siblings, 0 replies; 28+ messages in thread
From: Eric Wong @ 2019-05-23 9:37 UTC (permalink / raw)
To: meta
Allow users to specify the --blocksize <B>, --no-full, --fuller
options for xapian-compact(1) for fine-tuning compact behavior
for low-traffic/inactive inboxes.
We also won't support --multipass, since it doesn't seem
compatible with our requirement to use --no-renumber.
We also won't support --single-file, since it only seems
intended for totally dead inboxes; and it doesn't seem
worth the support overhead when "totally dead" turns out
to be a misdiagnosis.
---
lib/PublicInbox/Xapcmd.pm | 17 ++++++++++++-----
script/public-inbox-compact | 3 ++-
script/public-inbox-xcpdb | 3 ++-
3 files changed, 16 insertions(+), 7 deletions(-)
diff --git a/lib/PublicInbox/Xapcmd.pm b/lib/PublicInbox/Xapcmd.pm
index 488c616..74abf99 100644
--- a/lib/PublicInbox/Xapcmd.pm
+++ b/lib/PublicInbox/Xapcmd.pm
@@ -13,6 +13,7 @@ use File::Basename qw(dirname);
# support testing with dev versions of Xapian which installs
# commands with a version number suffix (e.g. "xapian-compact-1.5")
our $XAPIAN_COMPACT = $ENV{XAPIAN_COMPACT} || 'xapian-compact';
+our @COMPACT_OPT = qw(quiet|q blocksize|b=s no-full|n fuller|F);
sub commit_changes ($$$) {
my ($ibx, $tmp, $opt) = @_;
@@ -213,13 +214,19 @@ sub compact ($$) {
defined(my $dfd = $opt->{$fd}) or next;
$rdr->{$fd} = $dfd;
}
- if ($pr) {
- $pr->("$pfx compacting...\n");
- $rdr->{1} = fileno($w) if pipe($r, $w);
- }
+ $rdr->{1} = fileno($w) if $pr && pipe($r, $w);
# we rely on --no-renumber to keep docids synched to NNTP
- my $cmd = [ $XAPIAN_COMPACT, '--no-renumber', $src, $dst ];
+ my $cmd = [ $XAPIAN_COMPACT, '--no-renumber' ];
+ for my $sw (qw(no-full fuller)) {
+ push @$cmd, "--$sw" if $opt->{$sw};
+ }
+ for my $sw (qw(blocksize)) {
+ defined(my $v = $opt->{$sw}) or next;
+ push @$cmd, "--$sw", $v;
+ }
+ $pr->("$pfx `".join(' ', @$cmd)."'\n") if $pr;
+ push @$cmd, $src, $dst;
my $pid = spawn($cmd, undef, $rdr);
if ($pr) {
close $w or die "close: \$w: $!";
diff --git a/script/public-inbox-compact b/script/public-inbox-compact
index 4f58d5a..4bdadfc 100755
--- a/script/public-inbox-compact
+++ b/script/public-inbox-compact
@@ -10,7 +10,8 @@ use PublicInbox::Admin;
PublicInbox::Admin::require_or_die('-index');
my $usage = "Usage: public-inbox-compact REPO_DIR\n";
my $opt = { compact => 1, -coarse_lock => 1 };
-GetOptions($opt, qw(quiet|q)) or die "bad command-line args\n$usage";
+GetOptions($opt, @PublicInbox::Xapcmd::COMPACT_OPT) or
+ die "bad command-line args\n$usage";
my @ibxs = PublicInbox::Admin::resolve_inboxes(\@ARGV) or die $usage;
foreach (@ibxs) {
my $ibx = PublicInbox::InboxWritable->new($_);
diff --git a/script/public-inbox-xcpdb b/script/public-inbox-xcpdb
index bda7be0..badb95e 100755
--- a/script/public-inbox-xcpdb
+++ b/script/public-inbox-xcpdb
@@ -9,7 +9,8 @@ use PublicInbox::Admin;
PublicInbox::Admin::require_or_die('-search');
my $usage = "Usage: public-inbox-xcpdb INBOX_DIR\n";
my $opt = {};
-GetOptions($opt, qw(compact quiet|q)) or die "bad command-line args\n$usage";
+GetOptions($opt, qw(compact), @PublicInbox::Xapcmd::COMPACT_OPT) or
+ die "bad command-line args\n$usage";
my @ibxs = PublicInbox::Admin::resolve_inboxes(\@ARGV) or die $usage;
foreach (@ibxs) {
my $ibx = PublicInbox::InboxWritable->new($_);
--
EW
^ permalink raw reply related [flat|nested] 28+ messages in thread
* [PATCH 24/26] xapcmd: cleanup on interrupted xcpdb "--compact"
2019-05-23 9:36 [PATCH 00/26] xcpdb: ease Xapian DB format migrations Eric Wong
` (22 preceding siblings ...)
2019-05-23 9:37 ` [PATCH 23/26] xcpdb|compact: support some xapian-compact switches Eric Wong
@ 2019-05-23 9:37 ` Eric Wong
2019-05-23 9:37 ` [PATCH 25/26] xcpdb|compact: support --jobs/-j flag like gmake(1) Eric Wong
` (2 subsequent siblings)
26 siblings, 0 replies; 28+ messages in thread
From: Eric Wong @ 2019-05-23 9:37 UTC (permalink / raw)
To: meta
We should not have leftover junk on interrupted invocations.
---
lib/PublicInbox/Xapcmd.pm | 14 ++++++++++++--
1 file changed, 12 insertions(+), 2 deletions(-)
diff --git a/lib/PublicInbox/Xapcmd.pm b/lib/PublicInbox/Xapcmd.pm
index 74abf99..5b6d06b 100644
--- a/lib/PublicInbox/Xapcmd.pm
+++ b/lib/PublicInbox/Xapcmd.pm
@@ -247,7 +247,16 @@ sub cpdb ($$) {
my ($args, $opt) = @_;
my ($old, $new) = @$args;
my $src = Search::Xapian::Database->new($old);
- my $tmp = $opt->{compact} ? "$new.compact" : $new;
+ my ($xtmp, $tmp);
+ if ($opt->{compact}) {
+ my $newdir = dirname($new);
+ same_fs_or_die($newdir, $new);
+ $tmp = tempdir("$new.compact-XXXXXX", DIR => $newdir);
+ $xtmp = PublicInbox::Xtmpdirs->new;
+ $xtmp->{$new} = $tmp;
+ } else {
+ $tmp = $new;
+ }
# like copydatabase(1), be sure we don't overwrite anything in case
# of other bugs:
@@ -295,7 +304,7 @@ sub cpdb ($$) {
} while (cpdb_retryable($src, $pfx));
$pr->(sprintf($fmt, $nr)) if $pr;
- return unless $opt->{compact};
+ return unless $xtmp;
$src = $dst = undef; # flushes and closes
@@ -303,6 +312,7 @@ sub cpdb ($$) {
# since $dst isn't readable by HTTP or NNTP clients, yet:
compact([ $tmp, $new ], $opt);
remove_tree($tmp) or die "failed to remove $tmp: $!\n";
+ $xtmp->done;
}
# slightly easier-to-manage manage than END{} blocks
--
EW
^ permalink raw reply related [flat|nested] 28+ messages in thread
* [PATCH 25/26] xcpdb|compact: support --jobs/-j flag like gmake(1)
2019-05-23 9:36 [PATCH 00/26] xcpdb: ease Xapian DB format migrations Eric Wong
` (23 preceding siblings ...)
2019-05-23 9:37 ` [PATCH 24/26] xapcmd: cleanup on interrupted xcpdb "--compact" Eric Wong
@ 2019-05-23 9:37 ` Eric Wong
2019-05-23 9:37 ` [PATCH 26/26] xapcmd: do not reset %SIG until last Xtmpdir is done Eric Wong
2019-05-23 10:37 ` [PATCH 27/26] doc: various updates to reflect current state Eric Wong
26 siblings, 0 replies; 28+ messages in thread
From: Eric Wong @ 2019-05-23 9:37 UTC (permalink / raw)
To: meta
We don't have to be tied to the number of partitions in case
we made a bad choice at initialization. This doesn't affect
reindexing, but the copying phase is already intensive.
And optimize away the extra process when we only have a single
job which won't parallelize.
The wording for the (v2) reindexing phase could be improved,
later. I also plan to allow repartitioning of existing
Xapian DBs.
---
lib/PublicInbox/Xapcmd.pm | 44 +++++++++++++++++++++++++--------------
1 file changed, 28 insertions(+), 16 deletions(-)
diff --git a/lib/PublicInbox/Xapcmd.pm b/lib/PublicInbox/Xapcmd.pm
index 5b6d06b..a294d53 100644
--- a/lib/PublicInbox/Xapcmd.pm
+++ b/lib/PublicInbox/Xapcmd.pm
@@ -13,7 +13,7 @@ use File::Basename qw(dirname);
# support testing with dev versions of Xapian which installs
# commands with a version number suffix (e.g. "xapian-compact-1.5")
our $XAPIAN_COMPACT = $ENV{XAPIAN_COMPACT} || 'xapian-compact';
-our @COMPACT_OPT = qw(quiet|q blocksize|b=s no-full|n fuller|F);
+our @COMPACT_OPT = qw(jobs|j=i quiet|q blocksize|b=s no-full|n fuller|F);
sub commit_changes ($$$) {
my ($ibx, $tmp, $opt) = @_;
@@ -54,8 +54,7 @@ sub cb_spawn {
my ($cb, $args, $opt) = @_; # $cb = cpdb() or compact()
defined(my $pid = fork) or die "fork: $!";
return $pid if $pid > 0;
- eval { $cb->($args, $opt) };
- die $@ if $@;
+ $cb->($args, $opt);
exit 0;
}
@@ -103,6 +102,31 @@ sub same_fs_or_die ($$) {
die "$x and $y reside on different filesystems\n";
}
+sub process_queue {
+ my ($queue, $cb, $max, $opt) = @_;
+ if ($max <= 1) {
+ while (defined(my $args = shift @$queue)) {
+ $cb->($args, $opt);
+ }
+ return;
+ }
+
+ # run in parallel:
+ my %pids;
+ while (@$queue) {
+ while (scalar(keys(%pids)) < $max && scalar(@$queue)) {
+ my $args = shift @$queue;
+ $pids{cb_spawn($cb, $args, $opt)} = $args;
+ }
+
+ while (scalar keys %pids) {
+ my $pid = waitpid(-1, 0);
+ my $args = delete $pids{$pid};
+ die join(' ', @$args)." failed: $?\n" if $?;
+ }
+ }
+}
+
sub run {
my ($ibx, $task, $opt) = @_; # task = 'cpdb' or 'compact'
my $cb = \&${\"PublicInbox::Xapcmd::$task"};
@@ -163,19 +187,7 @@ sub run {
}
delete($ibx->{$_}) for (qw(mm over search)); # cleanup
- my %pids;
- while (@q) {
- while (scalar(keys(%pids)) < $max && scalar(@q)) {
- my $args = shift @q;
- $pids{cb_spawn($cb, $args, $opt)} = $args;
- }
-
- while (scalar keys %pids) {
- my $pid = waitpid(-1, 0);
- my $args = delete $pids{$pid};
- die join(' ', @$args)." failed: $?\n" if $?;
- }
- }
+ process_queue(\@q, $cb, $max, $opt);
commit_changes($ibx, $tmp, $opt);
});
}
--
EW
^ permalink raw reply related [flat|nested] 28+ messages in thread
* [PATCH 26/26] xapcmd: do not reset %SIG until last Xtmpdir is done
2019-05-23 9:36 [PATCH 00/26] xcpdb: ease Xapian DB format migrations Eric Wong
` (24 preceding siblings ...)
2019-05-23 9:37 ` [PATCH 25/26] xcpdb|compact: support --jobs/-j flag like gmake(1) Eric Wong
@ 2019-05-23 9:37 ` Eric Wong
2019-05-23 10:37 ` [PATCH 27/26] doc: various updates to reflect current state Eric Wong
26 siblings, 0 replies; 28+ messages in thread
From: Eric Wong @ 2019-05-23 9:37 UTC (permalink / raw)
To: meta
To properly handle compact tmpdir cleanup in single process
situations, we need to carefully account for Xtmpdir not
being a singleton and ensure we don't clobber signal
handlers which belong to other Xtmpdirs.
---
lib/PublicInbox/Xapcmd.pm | 11 +++++++++--
1 file changed, 9 insertions(+), 2 deletions(-)
diff --git a/lib/PublicInbox/Xapcmd.pm b/lib/PublicInbox/Xapcmd.pm
index a294d53..999819c 100644
--- a/lib/PublicInbox/Xapcmd.pm
+++ b/lib/PublicInbox/Xapcmd.pm
@@ -22,6 +22,8 @@ sub commit_changes ($$$) {
my $im = $ibx->importer(0);
$im->lock_acquire if !$opt->{-coarse_lock};
+ $SIG{INT} or die 'BUG: $SIG{INT} not handled';
+
while (my ($old, $new) = each %$tmp) {
my @st = stat($old) or die "failed to stat($old): $!\n";
@@ -346,7 +348,12 @@ sub new {
sub done {
my ($self) = @_;
delete $owner{"$self"};
- $SIG{INT} = $SIG{HUP} = $SIG{PIPE} = $SIG{TERM} = 'DEFAULT';
+
+ my %known_pids;
+ $known_pids{$_}++ foreach values %owner;
+ if (!$known_pids{$$}) {
+ $SIG{INT} = $SIG{HUP} = $SIG{PIPE} = $SIG{TERM} = 'DEFAULT';
+ }
%$self = ();
}
@@ -357,7 +364,7 @@ sub DESTROY {
foreach my $new (values %$self) {
remove_tree($new) unless -d "$new/old";
}
- $SIG{INT} = $SIG{HUP} = $SIG{PIPE} = $SIG{TERM} = 'DEFAULT';
+ done($self);
}
1;
--
EW
^ permalink raw reply related [flat|nested] 28+ messages in thread
* [PATCH 27/26] doc: various updates to reflect current state
2019-05-23 9:36 [PATCH 00/26] xcpdb: ease Xapian DB format migrations Eric Wong
` (25 preceding siblings ...)
2019-05-23 9:37 ` [PATCH 26/26] xapcmd: do not reset %SIG until last Xtmpdir is done Eric Wong
@ 2019-05-23 10:37 ` Eric Wong
26 siblings, 0 replies; 28+ messages in thread
From: Eric Wong @ 2019-05-23 10:37 UTC (permalink / raw)
To: meta
-index documentation avoids redundant v1 information and refers
readers to appropriate v1/v2 manpages. Search::Xapian can also
be optional, now, as only the PSGI search interface uses it.
Favor "INBOX_DIR" where appropriate, since "REPO_DIR" can be
confused for code repos which we also support.
XAPIAN_FLUSH_THRESHOLD is documented for all relevant
bulk commands.
---
Documentation/public-inbox-compact.pod | 25 ++++----
Documentation/public-inbox-index.pod | 80 +++++++++---------------
Documentation/public-inbox-v1-format.pod | 12 +++-
Documentation/public-inbox-v2-format.pod | 5 +-
Documentation/public-inbox-xcpdb.pod | 5 +-
lib/PublicInbox/Inbox.pm | 2 +-
script/public-inbox-compact | 2 +-
script/public-inbox-index | 4 +-
script/public-inbox-init | 2 +-
9 files changed, 64 insertions(+), 73 deletions(-)
diff --git a/Documentation/public-inbox-compact.pod b/Documentation/public-inbox-compact.pod
index 4a519ce..7d37f6f 100644
--- a/Documentation/public-inbox-compact.pod
+++ b/Documentation/public-inbox-compact.pod
@@ -9,15 +9,12 @@ public-inbox-compact - compact Xapian DBs
=head1 DESCRIPTION
public-inbox-compact is a wrapper for L<xapian-compact(1)>
-designed for "v2" inboxes. It combines multiple Xapian
-partitions into one to reduce space overhead after an initial
-mass import (using multiple partitions) is done.
+which locks the inbox and prevents other processes such as
+L<public-inbox-watch(1)> or L<public-inbox-mda(1)> from
+writing while it operates.
-It locks the inbox and prevents other processes such as
-L<public-inbox-watch(1)> from writing while it operates.
-
-It also supports "v1" (ssoma) inboxes with limited
-usefulness over L<xapian-compact(1)>
+It enforces the use of the C<--no-renumber> option of
+L<xapian-compact(1)>
=head1 ENVIRONMENT
@@ -28,9 +25,15 @@ usefulness over L<xapian-compact(1)>
The default config file, normally "~/.public-inbox/config".
See L<public-inbox-config(5)>
-=back
+=item XAPIAN_FLUSH_THRESHOLD
+
+The number of documents to update before committing changes to
+disk. This environment variable is handled directly by Xapian, refer to
+Xapian API documentation for more details.
-=head1 UPGRADING
+Default: 10000
+
+=back
=head1 CONTACT
@@ -41,7 +44,7 @@ and L<http://hjrcffqmbrq6wope.onion/meta/>
=head1 COPYRIGHT
-Copyright 2018 all contributors L<mailto:meta@public-inbox.org>
+Copyright 2018-2019 all contributors L<mailto:meta@public-inbox.org>
License: AGPL-3.0+ L<https://www.gnu.org/licenses/agpl-3.0.txt>
diff --git a/Documentation/public-inbox-index.pod b/Documentation/public-inbox-index.pod
index acc9039..2e0ff69 100644
--- a/Documentation/public-inbox-index.pod
+++ b/Documentation/public-inbox-index.pod
@@ -4,14 +4,15 @@ public-inbox-index - create and update search indices
=head1 SYNOPSIS
-public-inbox-index [OPTIONS] REPO_DIR
+public-inbox-index [OPTIONS] INBOX_DIR
=head1 DESCRIPTION
-public-inbox-index creates and updates the search and NNTP
-article number database used by the read-only public-inbox HTTP
-and NNTP interfaces. Currently, this requires L<Search::Xapian>
-and L<DBD::SQlite> and L<DBI> Perl modules.
+public-inbox-index creates and updates the search, overview and
+NNTP article number database used by the read-only public-inbox
+HTTP and NNTP interfaces. Currently, this requires
+L<DBD::SQLite> and L<DBI> Perl modules. L<Search::Xapian>
+is optional, only to support the PSGI search interface.
Once the initial indices are created by public-inbox-index,
L<public-inbox-mda(1)> and L<public-inbox-watch(1)> will
@@ -22,10 +23,10 @@ relying on L<git-fetch(1)> to mirror an existing public-inbox;
or if upgrading to a new version of public-inbox using
the C<--reindex> option.
-Having a search and article number database is essential to
+Having the overview and article number database is essential to
running the NNTP interface, and strongly recommended for the
-HTTP interface as it provides thread grouping in addition
-to normal search functionality.
+HTTP interface as it provides thread grouping in addition to
+normal search functionality.
=head1 OPTIONS
@@ -45,50 +46,11 @@ This does not touch the NNTP article number database.
=head1 FILES
+For v1 (ssoma) repositories described in L<public-inbox-v1-format>.
All public-inbox-specific files are contained within the
-C<$REPO_DIR/public-inbox/> directory. All files are expected to
-grow in size as more messages are archived, so using compaction
-commands (e.g. L<xapian-compact(1)>) is not recommended unless
-the list is no longer active.
+C<$GIT_DIR/public-inbox/> directory.
-=over
-
-=item $REPO_DIR/public-inbox/msgmap.sqlite3
-
-The stable NNTP article number to Message-ID mapping is
-stored in an SQLite3 database.
-
-This is required for users of L<public-inbox-nntpd(1)>, but
-users of the L<PublicInbox::WWW> interface will find it
-useful for attempting recovery from copy-paste truncations of
-URLs containing long Message-IDs.
-
-Avoid removing this file and regenerating it; it may cause
-existing NNTP readers to lose sync and miss (or see duplicate)
-messages.
-
-This file is relatively small, and typically less than 5%
-of the space of the mail stored in a packed git repository.
-
-=item $REPO_DIR/public-inbox/xapian*
-
-The database used by L<Search::Xapian>. This directory name is
-followed by a number indicating the index schema version this
-installation of public-inbox uses.
-
-These directories may be safely deleted or removed in full
-while the NNTP and HTTP interfaces are no longer accessing
-them.
-
-In addition to providing a search interface for the HTTP
-interface, the Xapian database is used to group and combine
-related messages into threads. For NNTP servers, it also
-provides a cache of metadata and header information often
-requested by NNTP clients.
-
-This directory is large, often two to three times the size of
-the objects stored in a packed git repository. Using the
-C<--reindex> option makes it larger, still.
+v2 repositories are described in L<public-inbox-v2-format>.
=back
@@ -100,8 +62,24 @@ C<--reindex> option makes it larger, still.
Used to override the default "~/.public-inbox/config" value.
+=item XAPIAN_FLUSH_THRESHOLD
+
+The number of documents to update before committing changes to
+disk. This environment variable is handled directly by Xapian, refer to
+Xapian API documentation for more details.
+
+Default: our indexing code flushes every megabyte of mail seen
+to keep memory usage low. Setting this environment variable to
+any positive value will switch to a document count-based
+threshold in Xapian.
+
=back
+=head1 UPGRADING
+
+Occasionally, public-inbox will update its schema version and
+require a full index by running this command.
+
=head1 CONTACT
Feedback welcome via plain-text mail to L<mailto:meta@public-inbox.org>
@@ -111,7 +89,7 @@ and L<http://hjrcffqmbrq6wope.onion/meta/>
=head1 COPYRIGHT
-Copyright 2016-2018 all contributors L<mailto:meta@public-inbox.org>
+Copyright 2016-2019 all contributors L<mailto:meta@public-inbox.org>
License: AGPL-3.0+ L<https://www.gnu.org/licenses/agpl-3.0.txt>
diff --git a/Documentation/public-inbox-v1-format.pod b/Documentation/public-inbox-v1-format.pod
index 3b0e70e..c960913 100644
--- a/Documentation/public-inbox-v1-format.pod
+++ b/Documentation/public-inbox-v1-format.pod
@@ -104,6 +104,10 @@ SQLite3 database maintaining a stable mapping of Message-IDs to NNTP
article numbers. Used by L<public-inbox-nntpd(1)> and created
and updated by L<public-inbox-index(1)>.
+Users of the L<PublicInbox::WWW> interface will find it
+useful for attempting recovery from copy-paste truncations of
+URLs containing long Message-IDs.
+
Automatically updated by L<public-inbox-mda(1)>,
L<public-inbox-learn(1)> and L<public-inbox-watch(1)>.
@@ -135,8 +139,12 @@ the "overview" DB also exists in the xapian directory for v1
repositories. See L<public-inbox-v2-format(5)/OVERVIEW DB>
Our use of the L</OVERVIEW DB> requires Xapian document IDs to
-remain stable. Thus, use of L<xapian-compact(1)> and
-L<copydatabase(8)> require the use of C<--no-renumber> switch.
+remain stable. Using L<public-inbox-compact(1)> and
+L<public-inbox-xcpdb(1)> wrappers is recommended over tools
+provided by Xapian.
+
+This directory is large, often two to three times the size of
+the objects stored in a packed git repository.
=item $GIT_DIR/ssoma.index
diff --git a/Documentation/public-inbox-v2-format.pod b/Documentation/public-inbox-v2-format.pod
index bc58074..65a85c1 100644
--- a/Documentation/public-inbox-v2-format.pod
+++ b/Documentation/public-inbox-v2-format.pod
@@ -118,8 +118,9 @@ large mail archives; but are fine for backup and usable for
small instances.
Our use of the L</OVERVIEW DB> requires Xapian document IDs to
-remain stable. Thus, use of L<xapian-compact(1)> and
-L<copydatabase(8)> require the use of C<--no-renumber> switch.
+remain stable. Using L<public-inbox-compact(1)> and
+L<public-inbox-xcpdb(1)> wrappers is recommended over tools
+provided by Xapian.
=head2 OVERVIEW DB
diff --git a/Documentation/public-inbox-xcpdb.pod b/Documentation/public-inbox-xcpdb.pod
index c47500b..5697dcd 100644
--- a/Documentation/public-inbox-xcpdb.pod
+++ b/Documentation/public-inbox-xcpdb.pod
@@ -1,6 +1,6 @@
=head1 NAME
-public-inbox-xcpdb - copy Xapian DBs (for format upgrades)
+public-inbox-xcpdb - upgrade Xapian DB formats
=head1 SYNOPSIS
@@ -16,7 +16,8 @@ L<public-inbox-watch(1)> or L<public-inbox-mda(1)>.
This is intended for upgrading the database format used by
Xapian. It DOES NOT upgrade the schema used by the
-public-inbox search interface (see L<public-inbox-index(1)>).
+public-inbox PSGI search interface (see
+L<public-inbox-index(1)>).
=head1 ENVIRONMENT
diff --git a/lib/PublicInbox/Inbox.pm b/lib/PublicInbox/Inbox.pm
index 0d86771..2771a24 100644
--- a/lib/PublicInbox/Inbox.pm
+++ b/lib/PublicInbox/Inbox.pm
@@ -225,7 +225,7 @@ sub description {
local $/ = "\n";
chomp $desc;
$desc =~ s/\s+/ /smg;
- $desc = '($REPO_DIR/description missing)' if $desc eq '';
+ $desc = '($INBOX_DIR/description missing)' if $desc eq '';
$self->{description} = $desc;
}
diff --git a/script/public-inbox-compact b/script/public-inbox-compact
index 4bdadfc..e8bf31e 100755
--- a/script/public-inbox-compact
+++ b/script/public-inbox-compact
@@ -8,7 +8,7 @@ use PublicInbox::InboxWritable;
use PublicInbox::Xapcmd;
use PublicInbox::Admin;
PublicInbox::Admin::require_or_die('-index');
-my $usage = "Usage: public-inbox-compact REPO_DIR\n";
+my $usage = "Usage: public-inbox-compact INBOX_DIR\n";
my $opt = { compact => 1, -coarse_lock => 1 };
GetOptions($opt, @PublicInbox::Xapcmd::COMPACT_OPT) or
die "bad command-line args\n$usage";
diff --git a/script/public-inbox-index b/script/public-inbox-index
index b6e3052..40187b3 100755
--- a/script/public-inbox-index
+++ b/script/public-inbox-index
@@ -4,12 +4,12 @@
# Basic tool to create a Xapian search index for a git repository
# configured for public-inbox.
# Usage with libeatmydata <https://www.flamingspork.com/projects/libeatmydata/>
-# highly recommended: eatmydata public-inbox-index REPO_DIR
+# highly recommended: eatmydata public-inbox-index INBOX_DIR
use strict;
use warnings;
use Getopt::Long qw(:config gnu_getopt no_ignore_case auto_abbrev);
-my $usage = "public-inbox-index REPO_DIR";
+my $usage = "public-inbox-index INBOX_DIR";
use PublicInbox::Admin;
PublicInbox::Admin::require_or_die('-index');
diff --git a/script/public-inbox-init b/script/public-inbox-init
index 5724c52..985a09f 100755
--- a/script/public-inbox-init
+++ b/script/public-inbox-init
@@ -5,7 +5,7 @@
# Initializes a public-inbox, basically a wrapper for git-init(1)
use strict;
use warnings;
-my $usage = "public-inbox-init NAME REPO_DIR HTTP_URL ADDRESS [ADDRESS..]";
+my $usage = "public-inbox-init NAME INBOX_DIR HTTP_URL ADDRESS [ADDRESS..]";
use Getopt::Long qw/:config gnu_getopt no_ignore_case auto_abbrev/;
use PublicInbox::Admin;
PublicInbox::Admin::require_or_die('-base');
--
EW
^ permalink raw reply related [flat|nested] 28+ messages in thread