From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.2 (2018-09-13) on dcvr.yhbt.net X-Spam-Level: X-Spam-Status: No, score=-4.0 required=3.0 tests=ALL_TRUSTED,BAYES_00 shortcircuit=no autolearn=ham autolearn_force=no version=3.4.2 Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id 85EEA1F55B for ; Sun, 17 May 2020 19:37:21 +0000 (UTC) From: Eric Wong To: meta@public-inbox.org Subject: [PATCH] index: add --batch-size=SIZE option Date: Sun, 17 May 2020 19:37:21 +0000 Message-Id: <20200517193721.29185-1-e@yhbt.net> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit List-Id: On powerful systems, having this option is preferable to XAPIAN_FLUSH_THRESHOLD due to lock granularity and contention with other processes (-learn, -mda, -watch). Setting XAPIAN_FLUSH_THRESHOLD can cause -learn, -mda, and -watch to get stuck until an epoch is completely processed. --- Documentation/public-inbox-index.pod | 42 +++++++++++++++++++++++++--- lib/PublicInbox/SearchIdx.pm | 12 ++++---- lib/PublicInbox/V2Writable.pm | 2 +- script/public-inbox-index | 11 ++++++-- 4 files changed, 53 insertions(+), 14 deletions(-) diff --git a/Documentation/public-inbox-index.pod b/Documentation/public-inbox-index.pod index 8a37580c..2719b39e 100644 --- a/Documentation/public-inbox-index.pod +++ b/Documentation/public-inbox-index.pod @@ -56,6 +56,10 @@ Xapian database. Using this with C<--compact> or running L afterwards is recommended to release free space. +public-inbox protects writes to various indices with L, +so it is safe to reindex while L, +L or L run. + This does not touch the NNTP article number database or affect threading. @@ -72,6 +76,12 @@ Sets or overrides L on a per-invocation basis. See L below. +=item --batch-size SIZE + +Sets or overrides L on a +per-invocation basis. See L +below. + =back =head1 FILES @@ -98,6 +108,27 @@ This option is only available in public-inbox 1.5 or later. Default: none +=item publicinbox.indexBatchSize + +Flushes changes to the filesystem and releases locks after +indexing the given number of bytes. The default value of C<1m> +(one megabyte) is low to minimize memory use and reduce +contention with parallel invocations of L, +L, and L. + +Increase this value on powerful systems improve throughput at +the expense of memory use. The reduction of lock granularity +may not be noticeable on fast systems. + +This option is available in public-inbox 1.6 or later. +public-inbox 1.5 and earlier used the current default, C<1m>. + +For L inboxes, this value is +multiplied by the number of Xapian shards. Thus a typical v2 +inbox with 3 shards will flush every 3 megabytes by default. + +Default: 1m (one megabyte) + =back =head1 ENVIRONMENT @@ -114,10 +145,13 @@ The number of documents to update before committing changes to disk. This environment is handled directly by Xapian, refer to Xapian API documentation for more details. -Default: our indexing code flushes every megabyte of mail seen -to keep memory usage low. Setting this environment variable to -any positive value will switch to a document count-based -threshold in Xapian. +For public-inbox 1.6 and later, use C +instead. Setting C for a large C<--reindex> +may cause L, L and +L tasks to wait long periods of time +during C<--reindex>. + +Default: none, uses C =back diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm index 5f5ae895..b4088933 100644 --- a/lib/PublicInbox/SearchIdx.pm +++ b/lib/PublicInbox/SearchIdx.pm @@ -22,11 +22,9 @@ use PublicInbox::Git qw(git_unquote); use PublicInbox::MsgTime qw(msg_timestamp msg_datestamp); my $X = \%PublicInbox::Search::X; my ($DB_CREATE_OR_OPEN, $DB_OPEN); -use constant { - BATCH_BYTES => defined($ENV{XAPIAN_FLUSH_THRESHOLD}) ? - 0x7fffffff : 1_000_000, - DEBUG => !!$ENV{DEBUG}, -}; +our $BATCH_BYTES = defined($ENV{XAPIAN_FLUSH_THRESHOLD}) ? + 0x7fffffff : 1_000_000; +use constant DEBUG => !!$ENV{DEBUG}; my $xapianlevels = qr/\A(?:full|medium)\z/; @@ -585,7 +583,7 @@ sub batch_adjust ($$$$$) { my ($max, $bytes, $batch_cb, $latest, $nr) = @_; $$max -= $bytes; if ($$max <= 0) { - $$max = BATCH_BYTES; + $$max = $BATCH_BYTES; $batch_cb->($nr, $latest); } } @@ -610,7 +608,7 @@ sub read_log { my $git = $self->{git}; my $latest; my $bytes; - my $max = BATCH_BYTES; + my $max = $BATCH_BYTES; local $/ = "\n"; my %D; my $line; diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm index bf5a0df9..c732b98a 100644 --- a/lib/PublicInbox/V2Writable.pm +++ b/lib/PublicInbox/V2Writable.pm @@ -159,7 +159,7 @@ sub do_idx ($$$$) { my $idx = idx_shard($self, $smsg->{num} % $self->{shards}); $idx->index_raw($msgref, $mime, $smsg); my $n = $self->{transact_bytes} += $smsg->{bytes}; - $n >= (PublicInbox::SearchIdx::BATCH_BYTES * $self->{shards}); + $n >= ($PublicInbox::SearchIdx::BATCH_BYTES * $self->{shards}); } sub _add { diff --git a/script/public-inbox-index b/script/public-inbox-index index 2d0f0eca..0018668e 100755 --- a/script/public-inbox-index +++ b/script/public-inbox-index @@ -16,7 +16,7 @@ use PublicInbox::Xapcmd; my $compact_opt; my $opt = { quiet => -1, compact => 0, maxsize => undef }; GetOptions($opt, qw(verbose|v+ reindex compact|c+ jobs|j=i prune - indexlevel|L=s maxsize|max-size=s)) + indexlevel|L=s maxsize|max-size=s batchsize|batch-size=s)) or die "bad command-line args\n$usage"; die "--jobs must be positive\n" if defined $opt->{jobs} && $opt->{jobs} <= 0; @@ -30,13 +30,20 @@ my $cfg = PublicInbox::Config->new; my @ibxs = PublicInbox::Admin::resolve_inboxes(\@ARGV, undef, $cfg); PublicInbox::Admin::require_or_die('-index'); unless (@ibxs) { print STDERR "Usage: $usage\n"; exit 1 } -my $mods = {}; + my $max_size = $opt->{maxsize} // $cfg->{lc('publicInbox.indexMaxSize')}; if (defined $max_size) { PublicInbox::Admin::parse_unsigned(\$max_size) or die "`publicInbox.indexMaxSize=$max_size' not parsed\n"; } +if (my $bs = $opt->{batchsize} // $cfg->{lc('publicInbox.indexBatchSize')}) { + PublicInbox::Admin::parse_unsigned(\$bs) or + die "`publicInbox.indexBatchSize=$bs' not parsed\n"; + $PublicInbox::SearchIdx::BATCH_BYTES = $bs; +} + +my $mods = {}; foreach my $ibx (@ibxs) { # XXX: users can shoot themselves in the foot, with opt->{indexlevel} $ibx->{indexlevel} //= $opt->{indexlevel} //