From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.2 (2018-09-13) on dcvr.yhbt.net X-Spam-Level: X-Spam-Status: No, score=-4.0 required=3.0 tests=ALL_TRUSTED,AWL,BAYES_00 shortcircuit=no autolearn=ham autolearn_force=no version=3.4.2 Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id 14D36200A9 for ; Tue, 27 Oct 2020 07:55:02 +0000 (UTC) From: Eric Wong To: meta@public-inbox.org Subject: [PATCH 51/52] extsearchidx: support --batch-size checkpoints Date: Tue, 27 Oct 2020 07:54:52 +0000 Message-Id: <20201027075453.19163-52-e@80x24.org> In-Reply-To: <20201027075453.19163-1-e@80x24.org> References: <20201027075453.19163-1-e@80x24.org> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit List-Id: This is needed to limit the RSS of processes and ensure the stored data in over.sqlite3 and Xapian DBs are consistent if interrupted. Without checkpoints, indexing lore causes shard workers to take several GB of memory and thrash/OOM smaller systems. --- lib/PublicInbox/ExtSearchIdx.pm | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/lib/PublicInbox/ExtSearchIdx.pm b/lib/PublicInbox/ExtSearchIdx.pm index 050c4252..9d576adb 100644 --- a/lib/PublicInbox/ExtSearchIdx.pm +++ b/lib/PublicInbox/ExtSearchIdx.pm @@ -106,6 +106,18 @@ sub is_bad_blob ($$$$) { $size == 0 ? 1 : 0; # size == 0 means purged } +sub check_batch_limit ($) { + my ($req) = @_; + my $self = $req->{self}; + my $new_smsg = $req->{new_smsg}; + + # {raw_bytes} may be unset, so just use {bytes} + my $n = $self->{transact_bytes} += $new_smsg->{bytes}; + + # set flag for PublicInbox::V2Writable::index_todo: + ${$req->{need_checkpoint}} = 1 if $n >= $self->{batch_bytes}; +} + sub do_xpost ($$) { my ($req, $smsg) = @_; my $self = $req->{self}; @@ -118,6 +130,7 @@ sub do_xpost ($$) { my $xnum = $req->{xnum}; $self->{oidx}->add_xref3($docid, $xnum, $oid, $xibx->eidx_key); $idx->shard_add_eidx_info($docid, $oid, $xibx, $eml); + check_batch_limit($req); } else { # 'd' $self->{oidx}->remove_xref3($docid, $oid, $xibx->eidx_key); $idx->shard_remove_eidx_info($docid, $oid, $xibx, $eml); @@ -141,6 +154,7 @@ sub index_unseen ($) { my $ibx = delete $req->{ibx} or die 'BUG: {ibx} unset'; $self->{oidx}->add_xref3($docid, $req->{xnum}, $oid, $ibx->eidx_key); $idx->index_raw(undef, $eml, $new_smsg, $ibx); + check_batch_limit($req); } sub do_finalize ($) { @@ -301,7 +315,11 @@ sub eidx_sync { # main entry point local $SIG{__WARN__} = sub { $warn_cb->($self->{current_info}, ': ', @_); }; - _sync_inbox($self, $opt, $_) for (@{$self->{ibx_list}}); + + # don't use $_ here, it'll get clobbered by reindex_checkpoint + for my $ibx (@{$self->{ibx_list}}) { + _sync_inbox($self, $opt, $ibx); + } $self->{oidx}->rethread_done($opt);