From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.2 (2018-09-13) on dcvr.yhbt.net X-Spam-Level: X-Spam-Status: No, score=-4.0 required=3.0 tests=ALL_TRUSTED,BAYES_00 shortcircuit=no autolearn=ham autolearn_force=no version=3.4.2 Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id B5BB81FB0E for ; Fri, 4 Dec 2020 22:03:50 +0000 (UTC) From: Eric Wong To: meta@public-inbox.org Subject: [PATCH 4/5] newswww: use ->ALL to avoid O(n) inbox scan Date: Fri, 4 Dec 2020 22:03:48 +0000 Message-Id: <20201204220349.4408-5-e@80x24.org> In-Reply-To: <20201204220349.4408-1-e@80x24.org> References: <20201204220349.4408-1-e@80x24.org> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit List-Id: We can avoid doing a Message-ID lookup on every single inbox by using ->ALL to scan its over.sqlite3 DB. This mimics NNTP behavior and picks the first message indexed, though redirecting to /all/$MESSAGE_ID/ could be done. With the current lore.kernel.org set of inboxes (~140), this provides a 10-40% speedup depending on inbox ordering. --- lib/PublicInbox/Config.pm | 4 ++-- lib/PublicInbox/NewsWWW.pm | 30 +++++++++++++++++++++++------- 2 files changed, 25 insertions(+), 9 deletions(-) diff --git a/lib/PublicInbox/Config.pm b/lib/PublicInbox/Config.pm index 9b9d5c19..ba0ead6e 100644 --- a/lib/PublicInbox/Config.pm +++ b/lib/PublicInbox/Config.pm @@ -33,6 +33,7 @@ sub new { $self->{-by_list_id} = {}; $self->{-by_name} = {}; $self->{-by_newsgroup} = {}; + $self->{-by_eidx_key} = {}; $self->{-no_obfuscate} = {}; $self->{-limiters} = {}; $self->{-code_repos} = {}; # nick => PublicInbox::Git object @@ -476,8 +477,7 @@ EOF push @$repo_objs, $repo if $repo; } } - - $ibx + $self->{-by_eidx_key}->{$ibx->eidx_key} = $ibx; } sub _fill_ei ($$) { diff --git a/lib/PublicInbox/NewsWWW.pm b/lib/PublicInbox/NewsWWW.pm index 6bed0103..ade8dfd1 100644 --- a/lib/PublicInbox/NewsWWW.pm +++ b/lib/PublicInbox/NewsWWW.pm @@ -63,7 +63,6 @@ sub call { return redirect($code, $url); } - my $res; my @try = (join('/', @parts)); # trailing slash is in the rest of our WWW, so maybe some users @@ -72,13 +71,30 @@ sub call { pop @parts; push @try, join('/', @parts); } - - foreach my $mid (@try) { - my $arg = [ $mid ]; - $pi_config->each_inbox(\&try_inbox, $arg); - defined($res = $arg->[1]) and last; + my $ALL = $pi_config->ALL; + if (my $over = $ALL ? $ALL->over : undef) { + my $by_eidx_key = $pi_config->{-by_eidx_key}; + for my $mid (@try) { + my ($id, $prev); + while (my $x = $over->next_by_mid($mid, \$id, \$prev)) { + my $xr3 = $over->get_xref3($x->{num}); + for (@$xr3) { + s/:[0-9]+:$x->{blob}\z// or next; + my $ibx = $by_eidx_key->{$_} // next; + my $url = $ibx->base_url or next; + $url .= mid_escape($mid) . '/'; + return redirect(302, $url); + } + } + } + } else { # slow path, scan every inbox + for my $mid (@try) { + my $arg = [ $mid ]; # [1] => result + $pi_config->each_inbox(\&try_inbox, $arg); + return $arg->[1] if $arg->[1]; + } } - $res || [ 404, [qw(Content-Type text/plain)], ["404 Not Found\n"] ]; + [ 404, [qw(Content-Type text/plain)], ["404 Not Found\n"] ]; } 1;