From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.2 (2018-09-13) on dcvr.yhbt.net X-Spam-Level: X-Spam-Status: No, score=-4.0 required=3.0 tests=ALL_TRUSTED,BAYES_00 shortcircuit=no autolearn=ham autolearn_force=no version=3.4.2 Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id E783D1FB0A for ; Mon, 23 Nov 2020 07:06:03 +0000 (UTC) From: Eric Wong To: meta@public-inbox.org Subject: [PATCH 11/12] manifest: support faster generation via [extindex "all"] Date: Mon, 23 Nov 2020 07:06:01 +0000 Message-Id: <20201123070602.9698-12-e@80x24.org> In-Reply-To: <20201123070602.9698-1-e@80x24.org> References: <20201123070602.9698-1-e@80x24.org> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit List-Id: For a mirror of lore.kernel.org with >140 inboxes, this speeds up manifest.js.gz generation from ~1s to 40ms on my HW. This is still unacceptable when dealing with thousands of inboxes, but gets us closer to where we need to be. --- lib/PublicInbox/Config.pm | 3 +++ lib/PublicInbox/Inbox.pm | 2 ++ lib/PublicInbox/InboxWritable.pm | 2 -- lib/PublicInbox/ManifestJsGz.pm | 39 ++++++++++++++++++++++++++------ lib/PublicInbox/MiscSearch.pm | 19 ++++++++++++++++ 5 files changed, 56 insertions(+), 9 deletions(-) diff --git a/lib/PublicInbox/Config.pm b/lib/PublicInbox/Config.pm index 039eb445..251008a3 100644 --- a/lib/PublicInbox/Config.pm +++ b/lib/PublicInbox/Config.pm @@ -94,6 +94,9 @@ sub lookup_ei { $self->{-ei_by_name}->{$name} //= _fill_ei($self, "extindex.$name"); } +# special case for [extindex "all"] +sub ALL { lookup_ei($_[0], 'all') } + sub each_inbox { my ($self, $cb, @arg) = @_; # may auto-vivify if config file is non-existent: diff --git a/lib/PublicInbox/Inbox.pm b/lib/PublicInbox/Inbox.pm index a1a072ad..5a22e40d 100644 --- a/lib/PublicInbox/Inbox.pm +++ b/lib/PublicInbox/Inbox.pm @@ -429,4 +429,6 @@ sub on_unlock { sub uidvalidity { $_[0]->{uidvalidity} //= $_[0]->mm->created_at } +sub eidx_key { $_[0]->{newsgroup} // $_[0]->{inboxdir} } + 1; diff --git a/lib/PublicInbox/InboxWritable.pm b/lib/PublicInbox/InboxWritable.pm index d3c255c7..e97c7e2d 100644 --- a/lib/PublicInbox/InboxWritable.pm +++ b/lib/PublicInbox/InboxWritable.pm @@ -319,6 +319,4 @@ sub git_dir_latest { $latest; } -sub eidx_key { $_[0]->{newsgroup} // $_[0]->{inboxdir} } - 1; diff --git a/lib/PublicInbox/ManifestJsGz.pm b/lib/PublicInbox/ManifestJsGz.pm index 3b436827..2c4a231d 100644 --- a/lib/PublicInbox/ManifestJsGz.pm +++ b/lib/PublicInbox/ManifestJsGz.pm @@ -21,6 +21,14 @@ sub url_regexp { $ctx->SUPER::url_regexp('publicInbox.grokManifest', 'match=domain'); } +sub inject_entry ($$$;$) { + my ($ctx, $url_path, $ent, $git_dir) = @_; + $ctx->{-abs2urlpath}->{$git_dir // delete $ent->{git_dir}} = $url_path; + my $modified = $ent->{modified}; + $ctx->{-mtime} = $modified if $modified > ($ctx->{-mtime} // 0); + $ctx->{manifest}->{$url_path} = $ent; +} + sub manifest_add ($$;$$) { my ($ctx, $ibx, $epoch, $default_desc) = @_; my $url_path = "/$ibx->{name}"; @@ -32,15 +40,10 @@ sub manifest_add ($$;$$) { $git = $ibx->git; } my $ent = $git->manifest_entry($epoch, $default_desc) or return; - $ctx->{-abs2urlpath}->{$git->{git_dir}} = $url_path; - my $modified = $ent->{modified}; - if ($modified > ($ctx->{-mtime} // 0)) { - $ctx->{-mtime} = $modified; - } - $ctx->{manifest}->{$url_path} = $ent; + inject_entry($ctx, $url_path, $ent, $git->{git_dir}); } -sub ibx_entry { +sub slow_manifest_add ($$) { my ($ctx, $ibx) = @_; eval { if (defined(my $max = $ibx->max_git_epoch)) { @@ -52,6 +55,28 @@ sub ibx_entry { manifest_add($ctx, $ibx); } }; +} + +sub eidx_manifest_add ($$$) { + my ($ctx, $ALL, $ibx) = @_; + if (my $data = $ALL->misc->inbox_data($ibx)) { + $data = $json->decode($data); + while (my ($url_path, $ent) = each %$data) { + inject_entry($ctx, $url_path, $ent); + } + } else { + warn "E: `${\$ibx->eidx_key}' not indexed by $ALL->{topdir}\n"; + } +} + +sub ibx_entry { + my ($ctx, $ibx) = @_; + my $ALL = $ctx->{www}->{pi_config}->ALL; + if ($ALL) { + eidx_manifest_add($ctx, $ALL, $ibx); + } else { + slow_manifest_add($ctx, $ibx); + } warn "E: $@" if $@; } diff --git a/lib/PublicInbox/MiscSearch.pm b/lib/PublicInbox/MiscSearch.pm index 8beb8349..5a44d751 100644 --- a/lib/PublicInbox/MiscSearch.pm +++ b/lib/PublicInbox/MiscSearch.pm @@ -76,4 +76,23 @@ sub mset { retry_reopen($self, \&misc_enquire_once, [ $self, $qr, $opt ]); } +sub ibx_data_once { + my ($self, $ibx) = @{$_[0]}; + my $xdb = $self->{xdb}; + my $eidx_key = $ibx->eidx_key; # may be {inboxdir}, so private + my $head = $xdb->postlist_begin('Q'.$eidx_key); + my $tail = $xdb->postlist_end('Q'.$eidx_key); + if ($head != $tail) { + my $doc = $xdb->get_document($head->get_docid); + $doc->get_data; + } else { + undef; + } +} + +sub inbox_data { + my ($self, $ibx) = @_; + retry_reopen($self, \&ibx_data_once, [ $self, $ibx ]); +} + 1;