From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.2 (2018-09-13) on dcvr.yhbt.net X-Spam-Level: X-Spam-Status: No, score=-4.0 required=3.0 tests=ALL_TRUSTED,BAYES_00 shortcircuit=no autolearn=ham autolearn_force=no version=3.4.2 Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id 0A6C61FA17 for ; Mon, 10 Aug 2020 02:12:07 +0000 (UTC) From: Eric Wong To: meta@public-inbox.org Subject: [PATCH 09/14] index: cleanup internal variables Date: Mon, 10 Aug 2020 02:12:00 +0000 Message-Id: <20200810021205.18909-10-e@yhbt.net> In-Reply-To: <20200810021205.18909-1-e@yhbt.net> References: <20200810021205.18909-1-e@yhbt.net> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit List-Id: Move away from hard-to-read alllowercase naming and favor snake_case or separated-by-dashes. We'll keep `--indexlevel' as-is for now, since it's been around for several releases; but we'll support `--index-level' in the CLI and update our documentation in a few months. We'll also clarify that publicInbox.indexMaxSize is only intended for -index, and not -watch or -mda. --- Documentation/public-inbox-index.pod | 6 ++++- Documentation/public-inbox-init.pod | 2 +- lib/PublicInbox/SearchIdx.pm | 14 +++++------ lib/PublicInbox/V2Writable.pm | 26 +++++++++------------ script/public-inbox-index | 35 ++++++++++++++-------------- script/public-inbox-init | 8 ++----- 6 files changed, 43 insertions(+), 48 deletions(-) diff --git a/Documentation/public-inbox-index.pod b/Documentation/public-inbox-index.pod index 3ae3b008..7a97432e 100644 --- a/Documentation/public-inbox-index.pod +++ b/Documentation/public-inbox-index.pod @@ -158,7 +158,11 @@ value. A single suffix modifier of C, C or C is supported, thus the value of C<1m> to prevents indexing of messages larger than one megabyte. -This is useful for avoiding memory exhaustion in mirrors. +This is useful for avoiding memory exhaustion in mirrors +via git. It does not prevent L or +L from importing (and indexing) +a message. + This option is only available in public-inbox 1.5 or later. Default: none diff --git a/Documentation/public-inbox-init.pod b/Documentation/public-inbox-init.pod index fd9fc637..d0c87563 100644 --- a/Documentation/public-inbox-init.pod +++ b/Documentation/public-inbox-init.pod @@ -31,7 +31,7 @@ L. Default: C<1> -=item --indexlevel +=item -L, --indexlevel Controls the indexing level for L diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm index 1cf3e66c..7f2447fe 100644 --- a/lib/PublicInbox/SearchIdx.pm +++ b/lib/PublicInbox/SearchIdx.pm @@ -67,7 +67,6 @@ sub new { my $dir = $self->xdir; $self->{over} = PublicInbox::OverIdx->new("$dir/over.sqlite3"); $self->{over}->{-no_fsync} = 1 if $ibx->{-no_fsync}; - $self->{index_max_size} = $ibx->{index_max_size}; } elsif ($version == 2) { defined $shard or die "shard is required for v2\n"; # shard is a number @@ -553,10 +552,10 @@ sub index_sync { sub check_size { # check_async cb for -index --max-size=... my ($oid, $type, $size, $arg, $git) = @_; (($type // '') eq 'blob') or die "E: bad $oid in $git->{git_dir}"; - if ($size <= $arg->{index_max_size}) { + if ($size <= $arg->{max_size}) { $git->cat_async($oid, $arg->{index_oid}, $arg); } else { - warn "W: skipping $oid ($size > $arg->{index_max_size})\n"; + warn "W: skipping $oid ($size > $arg->{max_size})\n"; } } @@ -573,7 +572,7 @@ sub v1_checkpoint ($$;$) { $self->{mm}->last_commit($newest); } } else { - ${$sync->{max}} = $BATCH_BYTES; + ${$sync->{max}} = $self->{batch_bytes}; } $self->{mm}->{dbh}->commit; @@ -603,7 +602,7 @@ sub v1_checkpoint ($$;$) { sub process_stack { my ($self, $sync, $stk) = @_; my $git = $self->{ibx}->git; - my $max = $BATCH_BYTES; + my $max = $self->{batch_bytes}; my $nr = 0; $sync->{nr} = \$nr; $sync->{max} = \$max; @@ -617,13 +616,13 @@ sub process_stack { $git->cat_async($oid, \&unindex_both, $self); } } - if ($sync->{index_max_size} = $self->{ibx}->{index_max_size}) { + if ($sync->{max_size} = $sync->{-opt}->{max_size}) { $sync->{index_oid} = \&index_both; } while (my ($f, $at, $ct, $oid) = $stk->pop_rec) { if ($f eq 'm') { my $arg = { %$sync, autime => $at, cotime => $ct }; - if ($sync->{index_max_size}) { + if ($sync->{max_size}) { $git->check_async($oid, \&check_size, $arg); } else { $git->cat_async($oid, \&index_both, $arg); @@ -749,6 +748,7 @@ sub _index_sync { my ($self, $opts) = @_; my $tip = $opts->{ref} || 'HEAD'; my $git = $self->{ibx}->git; + $self->{batch_bytes} = $opts->{batch_size} // $BATCH_BYTES; $git->batch_prepare; my $pr = $opts->{-progress}; my $sync = { reindex => $opts->{reindex}, -opt => $opts }; diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm index 93646e57..8e36b92c 100644 --- a/lib/PublicInbox/V2Writable.pm +++ b/lib/PublicInbox/V2Writable.pm @@ -152,12 +152,6 @@ sub add { $self->{ibx}->with_umask(\&_add, $self, $eml, $check_cb); } -sub batch_bytes ($) { - my ($self) = @_; - ($self->{parallel} ? $self->{shards} : 1) * - $PublicInbox::SearchIdx::BATCH_BYTES; -} - # indexes a message, returns true if checkpointing is needed sub do_idx ($$$$) { my ($self, $msgref, $mime, $smsg) = @_; @@ -166,7 +160,7 @@ sub do_idx ($$$$) { my $idx = idx_shard($self, $smsg->{num} % $self->{shards}); $idx->index_raw($msgref, $mime, $smsg); my $n = $self->{transact_bytes} += $smsg->{raw_bytes}; - $n >= batch_bytes($self); + $n >= $self->{batch_bytes}; } sub _add { @@ -287,6 +281,9 @@ sub _idx_init { # with_umask callback # xcpdb can change shard count while -watch is idle my $nshards = count_shards($self); $self->{shards} = $nshards if $nshards && $nshards != $self->{shards}; + $self->{batch_bytes} = $opt->{batch_size} // + $PublicInbox::SearchIdx::BATCH_BYTES; + $self->{batch_bytes} *= $self->{shards} if $self->{parallel}; # need to create all shards before initializing msgmap FD # idx_shards must be visible to all forked processes @@ -891,7 +888,7 @@ sub reindex_checkpoint ($$) { } # allow -watch or -mda to write... - $self->idx_init; # reacquire lock + $self->idx_init($sync->{-opt}); # reacquire lock $mm_tmp->atfork_parent if $mm_tmp; } @@ -1208,12 +1205,11 @@ sub index_xap_step ($$$;$) { $pr->("Xapian indexlevel=$ibx->{indexlevel} ". "$beg..$end (% $step)\n"); } - my $batch_bytes = batch_bytes($self); for (my $num = $beg; $num <= $end; $num += $step) { my $smsg = $ibx->over->get_art($num) or next; $smsg->{v2w} = $self; $ibx->git->cat_async($smsg->{blob}, \&index_xap_only, $smsg); - if ($self->{transact_bytes} >= $batch_bytes) { + if ($self->{transact_bytes} >= $self->{batch_bytes}) { ${$sync->{nr}} = $num; reindex_checkpoint($self, $sync); } @@ -1236,7 +1232,7 @@ sub index_epoch ($$$) { $self->{current_info} = "$i.git $oid"; if ($f eq 'm') { my $arg = { %$sync, autime => $at, cotime => $ct }; - if ($sync->{index_max_size}) { + if ($sync->{max_size}) { $all->check_async($oid, \&check_size, $arg); } else { $all->cat_async($oid, \&index_oid, $arg); @@ -1255,7 +1251,7 @@ sub index_epoch ($$$) { sub xapian_only { my ($self, $opt, $sync, $art_beg) = @_; - my $seq = $opt->{sequentialshard}; + my $seq = $opt->{sequential_shard}; $art_beg //= 0; local $self->{parallel} = 0 if $seq; $self->idx_init($opt); # acquire lock @@ -1285,14 +1281,14 @@ sub xapian_only { sub index_sync { my ($self, $opt) = @_; $opt //= $_[1] //= {}; - goto \&xapian_only if $opt->{xapianonly}; + goto \&xapian_only if $opt->{xapian_only}; my $pr = $opt->{-progress}; my $epoch_max; my $latest = git_dir_latest($self, \$epoch_max); return unless defined $latest; - my $seq = $opt->{sequentialshard}; + my $seq = $opt->{sequential_shard}; my $art_beg; # the NNTP article number we start xapian_only at my $idxlevel = $self->{ibx}->{indexlevel}; local $self->{ibx}->{indexlevel} = 'basic' if $seq; @@ -1324,7 +1320,7 @@ sub index_sync { $art_beg++ if defined($art_beg); } } - if ($sync->{index_max_size} = $self->{ibx}->{index_max_size}) { + if ($sync->{max_size} = $opt->{max_size}) { $sync->{index_oid} = \&index_oid; } # work forwards through history diff --git a/script/public-inbox-index b/script/public-inbox-index index 9e0907be..b1d29ec1 100755 --- a/script/public-inbox-index +++ b/script/public-inbox-index @@ -17,7 +17,7 @@ usage: $usage options: --no-fsync speed up indexing, risk corruption on power outage - --indexlevel=LEVEL `basic', 'medium', or `full' (default: full) + -L LEVEL `basic', `medium', or `full' (default: full) --compact | -c run public-inbox-compact(1) after indexing --sequential-shard index Xapian shards sequentially for slow storage --jobs=NUM set or disable parallelization (NUM=0) @@ -33,16 +33,17 @@ BYTES may use `k', `m', and `g' suffixes (e.g. `10m' for 10 megabytes) See public-inbox-index(1) man page for full documentation. EOF my $compact_opt; -my $opt = { quiet => -1, compact => 0, maxsize => undef, fsync => 1 }; +my $opt = { quiet => -1, compact => 0, max_size => undef, fsync => 1 }; GetOptions($opt, qw(verbose|v+ reindex rethread compact|c+ jobs|j=i prune - fsync|sync! xapianonly|xapian-only - indexlevel|L=s maxsize|max-size=s batchsize|batch-size=s - sequentialshard|seq-shard|sequential-shard + fsync|sync! xapian_only|xapian-only + indexlevel|index-level|L=s max_size|max-size=s + batch_size|batch-size=s + sequential_shard|seq-shard|sequential-shard help|?)) or die "bad command-line args\n$usage"; if ($opt->{help}) { print $help; exit 0 }; die "--jobs must be >= 0\n" if defined $opt->{jobs} && $opt->{jobs} < 0; -if ($opt->{xapianonly} && !$opt->{reindex}) { +if ($opt->{xapian_only} && !$opt->{reindex}) { die "--xapian-only requires --reindex\n"; } @@ -64,40 +65,38 @@ my @ibxs = PublicInbox::Admin::resolve_inboxes(\@ARGV, undef, $cfg); PublicInbox::Admin::require_or_die('-index'); unless (@ibxs) { print STDERR "Usage: $usage\n"; exit 1 } -my $max_size = $opt->{maxsize} // $cfg->{lc('publicInbox.indexMaxSize')}; +my $max_size = $opt->{max_size} // $cfg->{lc('publicInbox.indexMaxSize')}; if (defined $max_size) { PublicInbox::Admin::parse_unsigned(\$max_size) or die "`publicInbox.indexMaxSize=$max_size' not parsed\n"; + $opt->{max_size} = $max_size; } -my $bs = $opt->{batchsize} // $cfg->{lc('publicInbox.indexBatchSize')}; +my $bs = $opt->{batch_size} // $cfg->{lc('publicInbox.indexBatchSize')}; if (defined $bs) { PublicInbox::Admin::parse_unsigned(\$bs) or die "`publicInbox.indexBatchSize=$bs' not parsed\n"; + $opt->{batch_size} = $bs; } -no warnings 'once'; -local $PublicInbox::SearchIdx::BATCH_BYTES = $bs if defined($bs); -use warnings 'once'; # out-of-the-box builds of Xapian 1.4.x are still limited to 32-bit # https://getting-started-with-xapian.readthedocs.io/en/latest/concepts/indexing/limitations.html local $ENV{XAPIAN_FLUSH_THRESHOLD} ||= '4294967295' if defined($bs); -my $s = $opt->{sequentialshard} // +my $s = $opt->{sequential_shard} // $cfg->{lc('publicInbox.indexSequentialShard')}; if (defined $s) { my $v = $cfg->git_bool($s); defined($v) or die "`publicInbox.indexSequentialShard=$s' not boolean\n"; - $opt->{sequentialshard} = $v; + $opt->{sequential_shard} = $v; } my $mods = {}; foreach my $ibx (@ibxs) { # XXX: users can shoot themselves in the foot, with opt->{indexlevel} - $ibx->{indexlevel} //= $opt->{indexlevel} // ($opt->{xapianonly} ? + $ibx->{indexlevel} //= $opt->{indexlevel} // ($opt->{xapian_only} ? 'full' : PublicInbox::Admin::detect_indexlevel($ibx)); - $ibx->{index_max_size} = $max_size; PublicInbox::Admin::scan_ibx_modules($mods, $ibx); } @@ -112,15 +111,15 @@ for my $ibx (@ibxs) { $ibx->{-no_fsync} = 1 if !$opt->{fsync}; my $ibx_opt = $opt; - if (defined(my $s = $ibx->{indexsequentialshard})) { + if (defined(my $s = $ibx->{lc('indexSequentialShard')})) { defined(my $v = $cfg->git_bool($s)) or die <{name}.indexSequentialShard not boolean EOL - $ibx_opt = { %$opt, sequentialshard => $v }; + $ibx_opt = { %$opt, sequential_shard => $v }; } PublicInbox::Admin::index_inbox($ibx, undef, $ibx_opt); if ($compact_opt) { - local $compact_opt->{jobs} = 0 if $ibx_opt->{sequentialshard}; + local $compact_opt->{jobs} = 0 if $ibx_opt->{sequential_shard}; PublicInbox::Xapcmd::run($ibx, 'compact', $compact_opt); } } diff --git a/script/public-inbox-init b/script/public-inbox-init index 6a959db7..1c8066df 100755 --- a/script/public-inbox-init +++ b/script/public-inbox-init @@ -27,7 +27,7 @@ use Cwd qw/abs_path/; my ($version, $indexlevel, $skip_epoch, $skip_artnum, $jobs); my %opts = ( 'V|version=i' => \$version, - 'L|indexlevel=s' => \$indexlevel, + 'L|index-level|indexlevel=s' => \$indexlevel, 'S|skip|skip-epoch=i' => \$skip_epoch, 'N|skip-artnum=i' => \$skip_artnum, 'j|jobs=i' => \$jobs, @@ -103,11 +103,7 @@ if (-e $pi_config) { exit(1) if $conflict; my $ibx = $cfg->lookup_name($name); - if ($ibx) { - if (!defined($indexlevel) && $ibx->{indexlevel}) { - $indexlevel = $ibx->{indexlevel}; - } - } + $indexlevel //= $ibx->{indexlevel} if $ibx; } my $pi_config_tmp = $fh->filename; close($fh) or die "failed to close $pi_config_tmp: $!\n";