From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.2 (2018-09-13) on dcvr.yhbt.net X-Spam-Level: X-Spam-Status: No, score=-4.0 required=3.0 tests=ALL_TRUSTED,AWL,BAYES_00 shortcircuit=no autolearn=ham autolearn_force=no version=3.4.2 Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id 37FA81FC97 for ; Thu, 20 Aug 2020 20:25:02 +0000 (UTC) From: Eric Wong To: meta@public-inbox.org Subject: [PATCH 22/23] init+index: support --skip-docdata for Xapian Date: Thu, 20 Aug 2020 20:24:56 +0000 Message-Id: <20200820202457.21042-23-e@yhbt.net> In-Reply-To: <20200820202457.21042-1-e@yhbt.net> References: <20200820202457.21042-1-e@yhbt.net> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit List-Id: Since we no longer read document data from Xapian, allow users to opt-out of storing it. This breaks compatibility with previous releases of public-inbox, but gives us a ~1.5% space savings on Xapian storage (and associated I/O and page cache pressure reduction). --- Documentation/public-inbox-index.pod | 8 +++++++ Documentation/public-inbox-init.pod | 10 ++++++++ lib/PublicInbox/Admin.pm | 12 ++++++---- lib/PublicInbox/SearchIdx.pm | 35 +++++++++++++++++++++------- lib/PublicInbox/SearchIdxShard.pm | 2 +- script/public-inbox-convert | 3 ++- script/public-inbox-index | 7 ++++-- script/public-inbox-init | 8 +++++++ t/inbox_idle.t | 2 +- t/index-git-times.t | 11 ++++++++- t/init.t | 13 +++++++++++ 11 files changed, 91 insertions(+), 20 deletions(-) diff --git a/Documentation/public-inbox-index.pod b/Documentation/public-inbox-index.pod index 1ed9f5e7..46a53825 100644 --- a/Documentation/public-inbox-index.pod +++ b/Documentation/public-inbox-index.pod @@ -145,6 +145,14 @@ below. Available in public-inbox 1.6.0 (PENDING). +=item --skip-docdata + +Stop storing document data in Xapian on an existing inbox. + +See L for description and caveats. + +Available in public-inbox 1.6.0 (PENDING). + =back =head1 FILES diff --git a/Documentation/public-inbox-init.pod b/Documentation/public-inbox-init.pod index 4cc7e29f..3f98807a 100644 --- a/Documentation/public-inbox-init.pod +++ b/Documentation/public-inbox-init.pod @@ -95,6 +95,16 @@ default due to contention in the top-level producer process. Default: the number of online CPUs, up to 4 +=item --skip-docdata + +Do not store document data in Xapian, reducing Xapian storage +overhead by around 1.5%. + +Warning: this option prevents rollbacks to public-inbox 1.5.0 +and earlier. + +Available since public-inbox 1.6.0 (PENDING). + =back =head1 ENVIRONMENT diff --git a/lib/PublicInbox/Admin.pm b/lib/PublicInbox/Admin.pm index f5427af7..b8ead6f7 100644 --- a/lib/PublicInbox/Admin.pm +++ b/lib/PublicInbox/Admin.pm @@ -48,13 +48,14 @@ sub resolve_repo_dir { sub detect_indexlevel ($) { my ($ibx) = @_; - # brand new or never before indexed inboxes default to full - return 'full' unless $ibx->over; - delete $ibx->{over}; # don't leave open FD lying around + my $over = $ibx->over; + my $srch = $ibx->search; + delete @$ibx{qw(over search)}; # don't leave open FDs lying around + # brand new or never before indexed inboxes default to full + return 'full' unless $over; my $l = 'basic'; - my $srch = $ibx->search or return $l; - delete $ibx->{search}; # don't leave open FD lying around + return $l unless $srch; if (my $xdb = $srch->xdb) { $l = 'full'; my $m = $xdb->get_metadata('indexlevel'); @@ -65,6 +66,7 @@ sub detect_indexlevel ($) { $ibx->{inboxdir} has unexpected indexlevel in Xapian: $m } + $ibx->{-skip_docdata} = 1 if $xdb->get_metadata('skip_docdata'); } $l; } diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm index 5c39f3d6..be46b2b9 100644 --- a/lib/PublicInbox/SearchIdx.pm +++ b/lib/PublicInbox/SearchIdx.pm @@ -61,6 +61,10 @@ sub new { }, $class; $self->xpfx_init; $self->{-set_indexlevel_once} = 1 if $indexlevel eq 'medium'; + if ($ibx->{-skip_docdata}) { + $self->{-set_skip_docdata_once} = 1; + $self->{-skip_docdata} = 1; + } $ibx->umask_prepare; if ($version == 1) { $self->{lock_path} = "$inboxdir/ssoma.lock"; @@ -359,10 +363,18 @@ sub add_xapian ($$$$) { msg_iter($eml, \&index_xapian, [ $self, $doc ]); index_ids($self, $doc, $eml, $mids); - $smsg->{to} = $smsg->{cc} = ''; # WWW doesn't need these, only NNTP - PublicInbox::OverIdx::parse_references($smsg, $eml, $mids); - my $data = $smsg->to_doc_data; - $doc->set_data($data); + + # by default, we maintain compatibility with v1.5.0 and earlier + # by writing to docdata.glass, users who never exect to downgrade can + # use --skip-docdata + if (!$self->{-skip_docdata}) { + # WWW doesn't need {to} or {cc}, only NNTP + $smsg->{to} = $smsg->{cc} = ''; + PublicInbox::OverIdx::parse_references($smsg, $eml, $mids); + my $data = $smsg->to_doc_data; + $doc->set_data($data); + } + if (my $altid = $self->{-altid}) { foreach my $alt (@$altid) { my $pfx = $alt->{xprefix}; @@ -831,23 +843,28 @@ sub begin_txn_lazy { # store 'indexlevel=medium' in v2 shard=0 and v1 (only one shard) # This metadata is read by Admin::detect_indexlevel: -sub set_indexlevel { +sub set_metadata_once { my ($self) = @_; - if (!$self->{shard} && # undef or 0, not >0 - delete($self->{-set_indexlevel_once})) { - my $xdb = $self->{xdb}; + return if $self->{shard}; # only continue if undef or 0, not >0 + my $xdb = $self->{xdb}; + + if (delete($self->{-set_indexlevel_once})) { my $level = $xdb->get_metadata('indexlevel'); if (!$level || $level ne 'medium') { $xdb->set_metadata('indexlevel', 'medium'); } } + if (delete($self->{-set_skip_docdata_once})) { + $xdb->get_metadata('skip_docdata') or + $xdb->set_metadata('skip_docdata', '1'); + } } sub _commit_txn { my ($self) = @_; if (my $xdb = $self->{xdb}) { - set_indexlevel($self); + set_metadata_once($self); $xdb->commit_transaction; } $self->{over}->commit_lazy if $self->{over}; diff --git a/lib/PublicInbox/SearchIdxShard.pm b/lib/PublicInbox/SearchIdxShard.pm index 59b36087..20077e08 100644 --- a/lib/PublicInbox/SearchIdxShard.pm +++ b/lib/PublicInbox/SearchIdxShard.pm @@ -16,7 +16,7 @@ sub new { my $self = $class->SUPER::new($ibx, 1, $shard); # create the DB before forking: $self->idx_acquire; - $self->set_indexlevel; + $self->set_metadata_once; $self->idx_release; $self->spawn_worker($v2w, $shard) if $v2w->{parallel}; $self; diff --git a/script/public-inbox-convert b/script/public-inbox-convert index d655dcc6..4ff198d1 100755 --- a/script/public-inbox-convert +++ b/script/public-inbox-convert @@ -77,7 +77,8 @@ if ($old) { die "Only conversion from v1 inboxes is supported\n" if $old->version >= 2; require PublicInbox::Admin; -$old->{indexlevel} //= PublicInbox::Admin::detect_indexlevel($old); +my $detected = PublicInbox::Admin::detect_indexlevel($old); +$old->{indexlevel} //= $detected; my $env; if ($opt->{'index'}) { my $mods = {}; diff --git a/script/public-inbox-index b/script/public-inbox-index index 30d24838..9855c67d 100755 --- a/script/public-inbox-index +++ b/script/public-inbox-index @@ -39,7 +39,7 @@ GetOptions($opt, qw(verbose|v+ reindex rethread compact|c+ jobs|j=i prune indexlevel|index-level|L=s max_size|max-size=s batch_size|batch-size=s sequential_shard|seq-shard|sequential-shard - all help|?)) + skip-docdata all help|?)) or die "bad command-line args\n$usage"; if ($opt->{help}) { print $help; exit 0 }; die "--jobs must be >= 0\n" if defined $opt->{jobs} && $opt->{jobs} < 0; @@ -58,9 +58,11 @@ unless (@ibxs) { print STDERR "Usage: $usage\n"; exit 1 } my $mods = {}; foreach my $ibx (@ibxs) { + # detect_indexlevel may also set $ibx->{-skip_docdata} + my $detected = PublicInbox::Admin::detect_indexlevel($ibx); # XXX: users can shoot themselves in the foot, with opt->{indexlevel} $ibx->{indexlevel} //= $opt->{indexlevel} // ($opt->{xapian_only} ? - 'full' : PublicInbox::Admin::detect_indexlevel($ibx)); + 'full' : $detected); PublicInbox::Admin::scan_ibx_modules($mods, $ibx); } @@ -75,6 +77,7 @@ for my $ibx (@ibxs) { PublicInbox::Xapcmd::run($ibx, 'compact', $opt->{compact_opt}); } $ibx->{-no_fsync} = 1 if !$opt->{fsync}; + $ibx->{-skip_docdata} //= $opt->{'skip-docdata'}; my $ibx_opt = $opt; if (defined(my $s = $ibx->{lc('indexSequentialShard')})) { diff --git a/script/public-inbox-init b/script/public-inbox-init index b19c2321..037e8e56 100755 --- a/script/public-inbox-init +++ b/script/public-inbox-init @@ -34,6 +34,7 @@ require PublicInbox::Admin; PublicInbox::Admin::require_or_die('-base'); my ($version, $indexlevel, $skip_epoch, $skip_artnum, $jobs, $show_help); +my $skip_docdata; my $ng = ''; my %opts = ( 'V|version=i' => \$version, @@ -42,6 +43,7 @@ my %opts = ( 'skip-artnum=i' => \$skip_artnum, 'j|jobs=i' => \$jobs, 'ng|newsgroup=s' => \$ng, + 'skip-docdata' => \$skip_docdata, 'help|?' => \$show_help, ); my $usage_cb = sub { @@ -177,6 +179,12 @@ if (defined $jobs) { require PublicInbox::InboxWritable; $ibx = PublicInbox::InboxWritable->new($ibx, $creat_opt); +if ($skip_docdata) { + $ibx->{indexlevel} //= 'full'; # ensure init_inbox writes xdb + $ibx->{indexlevel} eq 'basic' and + die "--skip-docdata ignored with --indexlevel=basic\n"; + $ibx->{-skip_docdata} = $skip_docdata; +} $ibx->init_inbox(0, $skip_epoch, $skip_artnum); # needed for git prior to v2.1.0 diff --git a/t/inbox_idle.t b/t/inbox_idle.t index 61287200..e16ee11b 100644 --- a/t/inbox_idle.t +++ b/t/inbox_idle.t @@ -29,7 +29,7 @@ for my $V (1, 2) { if ($V == 1) { my $sidx = PublicInbox::SearchIdx->new($ibx, 1); $sidx->idx_acquire; - $sidx->set_indexlevel; + $sidx->set_metadata_once; $sidx->idx_release; # allow watching on lockfile } my $pi_config = PublicInbox::Config->new(\< $r }); } -run_script(['-index', $v1dir]) or die 'v1 index failed'; +run_script(['-index', '--skip-docdata', $v1dir]) or die 'v1 index failed'; + my $smsg; { my $cfg = PublicInbox::Config->new; my $ibx = $cfg->lookup($addr); + my $lvl = PublicInbox::Admin::detect_indexlevel($ibx); + is($lvl, 'medium', 'indexlevel detected'); + is($ibx->{-skip_docdata}, 1, '--skip-docdata flag set on -index'); $smsg = $ibx->over->get_art(1); is($smsg->{ds}, 749520000, 'datestamp from git author time'); is($smsg->{ts}, 1285977600, 'timestamp from git committer time'); @@ -70,6 +75,10 @@ SKIP: { my $check_v2 = sub { my $ibx = PublicInbox::Inbox->new({inboxdir => $v2dir, address => $addr}); + my $lvl = PublicInbox::Admin::detect_indexlevel($ibx); + is($lvl, 'medium', 'indexlevel detected after convert'); + is($ibx->{-skip_docdata}, 1, + '--skip-docdata preserved after convert'); my $v2smsg = $ibx->over->get_art(1); is($v2smsg->{ds}, $smsg->{ds}, 'v2 datestamp from git author time'); diff --git a/t/init.t b/t/init.t index 4d2c5049..dad09435 100644 --- a/t/init.t +++ b/t/init.t @@ -95,6 +95,19 @@ SKIP: { my $ibx = PublicInbox::Inbox->new({ inboxdir => $dir }); is(PublicInbox::Admin::detect_indexlevel($ibx), $lvl, 'detected expected level w/o config'); + ok(!$ibx->{-skip_docdata}, 'docdata written by default'); + } + for my $v (1, 2) { + my $name = "v$v-skip-docdata"; + my $dir = "$tmpdir/$name"; + $cmd = [ '-init', $name, "-V$v", '--skip-docdata', + $dir, "http://example.com/$name", + "$name\@example.com" ]; + ok(run_script($cmd), "-init -V$v --skip-docdata"); + my $ibx = PublicInbox::Inbox->new({ inboxdir => $dir }); + is(PublicInbox::Admin::detect_indexlevel($ibx), 'full', + "detected default indexlevel -V$v"); + ok($ibx->{-skip_docdata}, "docdata skip set -V$v"); } # loop for idempotency