* [PATCH 01/12] nntp: use Inbox->uidvalidity instead of ->mm->created_at
2020-11-27 9:52 [PATCH 00/12] some NNTP-related fixes + speedups Eric Wong
@ 2020-11-27 9:52 ` Eric Wong
2020-11-27 9:52 ` [PATCH 02/12] nntpd: share {groups} hash with {-by_newsgroup} in Config Eric Wong
` (10 subsequent siblings)
11 siblings, 0 replies; 15+ messages in thread
From: Eric Wong @ 2020-11-27 9:52 UTC (permalink / raw)
To: meta
This is memoized, and may allow us some future flexibility w.r.t
PublicInbox::Inbox-like objects. While we're at it, use
defined-or ("//") in case somebody really set a public-inbox
creation time to the Unix epoch.
---
lib/PublicInbox/NNTP.pm | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/lib/PublicInbox/NNTP.pm b/lib/PublicInbox/NNTP.pm
index 2f821fa6..2197d758 100644
--- a/lib/PublicInbox/NNTP.pm
+++ b/lib/PublicInbox/NNTP.pm
@@ -147,7 +147,7 @@ sub list_active_times ($;$) {
wildmat2re($wildmat);
foreach my $ng (@{$self->{nntpd}->{grouplist}}) {
$ng->{newsgroup} =~ $wildmat or next;
- my $c = eval { $ng->mm->created_at } || time;
+ my $c = eval { $ng->uidvalidity } // time;
more($self, "$ng->{newsgroup} $c $ng->{-primary_address}");
}
}
@@ -255,7 +255,7 @@ sub cmd_newgroups ($$$;$$) {
# TODO dists
more($self, '231 list of new newsgroups follows');
foreach my $ng (@{$self->{nntpd}->{grouplist}}) {
- my $c = eval { $ng->mm->created_at } || 0;
+ my $c = eval { $ng->uidvalidity } // 0;
next unless $c > $ts;
group_line($self, $ng);
}
^ permalink raw reply related [flat|nested] 15+ messages in thread
* [PATCH 02/12] nntpd: share {groups} hash with {-by_newsgroup} in Config
2020-11-27 9:52 [PATCH 00/12] some NNTP-related fixes + speedups Eric Wong
2020-11-27 9:52 ` [PATCH 01/12] nntp: use Inbox->uidvalidity instead of ->mm->created_at Eric Wong
@ 2020-11-27 9:52 ` Eric Wong
2020-11-27 9:52 ` [PATCH 03/12] mm: min/max: return 0 instead of undef Eric Wong
` (9 subsequent siblings)
11 siblings, 0 replies; 15+ messages in thread
From: Eric Wong @ 2020-11-27 9:52 UTC (permalink / raw)
To: meta
There's no need to duplicate a potentially large hash,
but we can keep the inexpensive shortcut to it. We may
eventually drop the {groups} shortcut if it's no longer
useful.
---
lib/PublicInbox/Config.pm | 4 +++-
lib/PublicInbox/NNTPD.pm | 23 +++++++++++------------
2 files changed, 14 insertions(+), 13 deletions(-)
diff --git a/lib/PublicInbox/Config.pm b/lib/PublicInbox/Config.pm
index 251008a3..e7aea99b 100644
--- a/lib/PublicInbox/Config.pm
+++ b/lib/PublicInbox/Config.pm
@@ -438,7 +438,9 @@ EOF
}
}
if (my $ng = $ibx->{newsgroup}) {
- $self->{-by_newsgroup}->{$ng} = $ibx;
+ # PublicInbox::NNTPD does stricter (and more expensive checks),
+ # keep this lean for startup speed
+ $self->{-by_newsgroup}->{$ng} = $ibx unless ref($ng);
}
$self->{-by_name}->{$name} = $ibx;
if ($ibx->{obfuscate}) {
diff --git a/lib/PublicInbox/NNTPD.pm b/lib/PublicInbox/NNTPD.pm
index 6b762d89..13b0f678 100644
--- a/lib/PublicInbox/NNTPD.pm
+++ b/lib/PublicInbox/NNTPD.pm
@@ -36,11 +36,10 @@ sub new {
sub refresh_groups {
my ($self, $sig) = @_;
my $pi_config = $sig ? PublicInbox::Config->new : $self->{pi_config};
- my $new = {};
- my @list;
+ my $groups = $pi_config->{-by_newsgroup}; # filled during each_inbox
$pi_config->each_inbox(sub {
- my ($ng) = @_;
- my $ngname = $ng->{newsgroup} or return;
+ my ($ibx) = @_;
+ my $ngname = $ibx->{newsgroup} or return;
if (ref $ngname) {
warn 'multiple newsgroups not supported: '.
join(', ', @$ngname). "\n";
@@ -50,21 +49,21 @@ sub refresh_groups {
# '|', '<', '>', ';', '#', '$', '&',
} elsif ($ngname =~ m![^A-Za-z0-9/_\.\-\~\@\+\=:]!) {
warn "newsgroup name invalid: `$ngname'\n";
- } elsif ($ng->nntp_usable) {
+ delete $groups->{$ngname};
+ } elsif ($ibx->nntp_usable) {
# Only valid if msgmap and search works
- $new->{$ngname} = $ng;
- push @list, $ng;
# preload to avoid fragmentation:
- $ng->description;
- $ng->base_url;
+ $ibx->description;
+ $ibx->base_url;
+ } else {
+ delete $groups->{$ngname};
}
});
- @list = sort { $a->{newsgroup} cmp $b->{newsgroup} } @list;
- $self->{grouplist} = \@list;
+ $self->{grouplist} = [ map { $groups->{$_} } sort(keys %$groups) ];
$self->{pi_config} = $pi_config;
# this will destroy old groups that got deleted
- %{$self->{groups}} = %$new;
+ $self->{groups} = $groups;
}
sub idler_start {
^ permalink raw reply related [flat|nested] 15+ messages in thread
* [PATCH 03/12] mm: min/max: return 0 instead of undef
2020-11-27 9:52 [PATCH 00/12] some NNTP-related fixes + speedups Eric Wong
2020-11-27 9:52 ` [PATCH 01/12] nntp: use Inbox->uidvalidity instead of ->mm->created_at Eric Wong
2020-11-27 9:52 ` [PATCH 02/12] nntpd: share {groups} hash with {-by_newsgroup} in Config Eric Wong
@ 2020-11-27 9:52 ` Eric Wong
2020-11-27 9:52 ` [PATCH 04/12] nntp: use grep operation for wildmat matching Eric Wong
` (8 subsequent siblings)
11 siblings, 0 replies; 15+ messages in thread
From: Eric Wong @ 2020-11-27 9:52 UTC (permalink / raw)
To: meta
This simplifies callers and allows empty newsgroups to be
represented (the WWW UI may be insufficient there, too).
---
lib/PublicInbox/IMAPD.pm | 2 +-
lib/PublicInbox/Msgmap.pm | 7 +++----
lib/PublicInbox/NNTP.pm | 4 +---
lib/PublicInbox/V2Writable.pm | 4 ++--
t/msgmap.t | 2 +-
5 files changed, 8 insertions(+), 11 deletions(-)
diff --git a/lib/PublicInbox/IMAPD.pm b/lib/PublicInbox/IMAPD.pm
index bb705136..366b6922 100644
--- a/lib/PublicInbox/IMAPD.pm
+++ b/lib/PublicInbox/IMAPD.pm
@@ -45,7 +45,7 @@ sub imapd_refresh_ibx { # pi_config->each_inbox cb
eval { $ibx->uidvalidity };
my $mm = delete($ibx->{mm}) or return;
defined($ibx->{uidvalidity}) or return;
- PublicInbox::IMAP::ensure_slices_exist($imapd, $ibx, $mm->max // 0);
+ PublicInbox::IMAP::ensure_slices_exist($imapd, $ibx, $mm->max);
# preload to avoid fragmentation:
$ibx->description;
diff --git a/lib/PublicInbox/Msgmap.pm b/lib/PublicInbox/Msgmap.pm
index f15875e3..a8c874af 100644
--- a/lib/PublicInbox/Msgmap.pm
+++ b/lib/PublicInbox/Msgmap.pm
@@ -36,8 +36,7 @@ sub new_file {
create_tables($dbh);
$self->created_at(time) unless $self->created_at;
- my $max = $self->max // 0;
- $self->num_highwater($max);
+ $self->num_highwater(max($self));
$dbh->commit;
}
$self;
@@ -144,7 +143,7 @@ sub max {
my $sth = $_[0]->{dbh}->prepare_cached('SELECT MAX(num) FROM msgmap',
undef, 1);
$sth->execute;
- $sth->fetchrow_array;
+ $sth->fetchrow_array // 0;
}
sub minmax {
@@ -153,7 +152,7 @@ sub minmax {
my $sth = $_[0]->{dbh}->prepare_cached('SELECT MIN(num) FROM msgmap',
undef, 1);
$sth->execute;
- ($sth->fetchrow_array, max($_[0]));
+ ($sth->fetchrow_array // 0, max($_[0]));
}
sub mid_delete {
diff --git a/lib/PublicInbox/NNTP.pm b/lib/PublicInbox/NNTP.pm
index 2197d758..b641bd23 100644
--- a/lib/PublicInbox/NNTP.pm
+++ b/lib/PublicInbox/NNTP.pm
@@ -244,7 +244,7 @@ sub parse_time ($$;$) {
sub group_line ($$) {
my ($self, $ng) = @_;
my ($min, $max) = $ng->mm->minmax;
- more($self, "$ng->{newsgroup} $max $min n") if defined $min && defined $max;
+ more($self, "$ng->{newsgroup} $max $min n");
}
sub cmd_newgroups ($$$;$$) {
@@ -343,8 +343,6 @@ sub cmd_group ($$) {
$self->{ng} = $ng;
my ($min, $max) = $ng->mm->minmax;
- $min ||= 0;
- $max ||= 0;
$self->{article} = $min;
my $est_size = $max - $min;
"211 $est_size $min $max $group";
diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm
index afba0220..7bef1c89 100644
--- a/lib/PublicInbox/V2Writable.pm
+++ b/lib/PublicInbox/V2Writable.pm
@@ -1367,8 +1367,8 @@ sub index_sync {
# xapian_only works incrementally w/o --reindex
if ($seq && !$opt->{reindex}) {
- $art_beg = $sync->{mm_tmp}->max;
- $art_beg++ if defined($art_beg);
+ $art_beg = $sync->{mm_tmp}->max || -1;
+ $art_beg++;
}
}
# work forwards through history
diff --git a/t/msgmap.t b/t/msgmap.t
index 437e106e..2d31f1de 100644
--- a/t/msgmap.t
+++ b/t/msgmap.t
@@ -12,7 +12,7 @@ my $d = PublicInbox::Msgmap->new($tmpdir, 1);
my %mid2num;
my %num2mid;
my @mids = qw(a@b c@d e@f g@h aa@bb aa@cc);
-is_deeply([$d->minmax], [undef,undef], "empty min max on new DB");
+is_deeply([$d->minmax], [0,0], "zero min max on new DB");
foreach my $mid (@mids) {
my $n = $d->mid_insert($mid);
^ permalink raw reply related [flat|nested] 15+ messages in thread
* [PATCH 04/12] nntp: use grep operation for wildmat matching
2020-11-27 9:52 [PATCH 00/12] some NNTP-related fixes + speedups Eric Wong
` (2 preceding siblings ...)
2020-11-27 9:52 ` [PATCH 03/12] mm: min/max: return 0 instead of undef Eric Wong
@ 2020-11-27 9:52 ` Eric Wong
2020-11-27 9:52 ` [PATCH 05/12] nntp: NEWNEWS: speed up filtering Eric Wong
` (7 subsequent siblings)
11 siblings, 0 replies; 15+ messages in thread
From: Eric Wong @ 2020-11-27 9:52 UTC (permalink / raw)
To: meta
Based on experiences with the IMAP server, this ought to be
significantly faster (as to be demonstrated in the next
commit).
---
lib/PublicInbox/NNTP.pm | 22 +++++++++++-----------
lib/PublicInbox/NNTPD.pm | 4 +++-
2 files changed, 14 insertions(+), 12 deletions(-)
diff --git a/lib/PublicInbox/NNTP.pm b/lib/PublicInbox/NNTP.pm
index b641bd23..68222196 100644
--- a/lib/PublicInbox/NNTP.pm
+++ b/lib/PublicInbox/NNTP.pm
@@ -136,29 +136,29 @@ sub list_headers ($;$) {
sub list_active ($;$) {
my ($self, $wildmat) = @_;
wildmat2re($wildmat);
- foreach my $ng (@{$self->{nntpd}->{grouplist}}) {
- $ng->{newsgroup} =~ $wildmat or next;
- group_line($self, $ng);
+ my $groups = $self->{nntpd}->{groups};
+ for my $ngname (grep(/$wildmat/, @{$self->{nntpd}->{groupnames}})) {
+ group_line($self, $groups->{$ngname});
}
}
sub list_active_times ($;$) {
my ($self, $wildmat) = @_;
wildmat2re($wildmat);
- foreach my $ng (@{$self->{nntpd}->{grouplist}}) {
- $ng->{newsgroup} =~ $wildmat or next;
- my $c = eval { $ng->uidvalidity } // time;
- more($self, "$ng->{newsgroup} $c $ng->{-primary_address}");
+ my $groups = $self->{nntpd}->{groups};
+ for my $ngname (grep(/$wildmat/, @{$self->{nntpd}->{groupnames}})) {
+ my $ibx = $groups->{$ngname};
+ my $c = eval { $ibx->uidvalidity } // time;
+ more($self, "$ngname $c $ibx->{-primary_address}");
}
}
sub list_newsgroups ($;$) {
my ($self, $wildmat) = @_;
wildmat2re($wildmat);
- foreach my $ng (@{$self->{nntpd}->{grouplist}}) {
- $ng->{newsgroup} =~ $wildmat or next;
- my $d = $ng->description;
- more($self, "$ng->{newsgroup} $d");
+ my $groups = $self->{nntpd}->{groups};
+ for my $ngname (grep(/$wildmat/, @{$self->{nntpd}->{groupnames}})) {
+ more($self, "$ngname ".$groups->{$ngname}->description);
}
}
diff --git a/lib/PublicInbox/NNTPD.pm b/lib/PublicInbox/NNTPD.pm
index 13b0f678..4de1944b 100644
--- a/lib/PublicInbox/NNTPD.pm
+++ b/lib/PublicInbox/NNTPD.pm
@@ -60,7 +60,9 @@ sub refresh_groups {
delete $groups->{$ngname};
}
});
- $self->{grouplist} = [ map { $groups->{$_} } sort(keys %$groups) ];
+ my @names = sort(keys %$groups);
+ $self->{grouplist} = [ map { $groups->{$_} } @names ];
+ $self->{groupnames} = \@names;
$self->{pi_config} = $pi_config;
# this will destroy old groups that got deleted
$self->{groups} = $groups;
^ permalink raw reply related [flat|nested] 15+ messages in thread
* [PATCH 05/12] nntp: NEWNEWS: speed up filtering
2020-11-27 9:52 [PATCH 00/12] some NNTP-related fixes + speedups Eric Wong
` (3 preceding siblings ...)
2020-11-27 9:52 ` [PATCH 04/12] nntp: use grep operation for wildmat matching Eric Wong
@ 2020-11-27 9:52 ` Eric Wong
2020-11-27 9:52 ` [PATCH 06/12] miscsearch: implement ->newsgroup_matches Eric Wong
` (6 subsequent siblings)
11 siblings, 0 replies; 15+ messages in thread
From: Eric Wong @ 2020-11-27 9:52 UTC (permalink / raw)
To: meta
With 50K newsgroups, the filtering phase goes from ~2000 seconds
to ~90 MILLISECONDS by relying on the grep perlop. This moves
->over checking out of the main dispatch and amortizes the cost
via long_response. (Fairly scheduled) long_response time in
newnews_i now takes ~360 seconds as opposed to ~30 seconds
before this change, however; but the initial filtering speedup
eliminating 2000s is more than worth it.
---
lib/PublicInbox/NNTP.pm | 49 ++++++++++++++++++++---------------------
1 file changed, 24 insertions(+), 25 deletions(-)
diff --git a/lib/PublicInbox/NNTP.pm b/lib/PublicInbox/NNTP.pm
index 68222196..5cbf5a16 100644
--- a/lib/PublicInbox/NNTP.pm
+++ b/lib/PublicInbox/NNTP.pm
@@ -294,23 +294,28 @@ sub ngpat2re (;$) {
}
sub newnews_i {
- my ($self, $overs, $ts, $prev) = @_;
- my $over = $overs->[0];
- my $msgs = $over->query_ts($ts, $$prev);
- if (scalar @$msgs) {
- more($self, '<' .
- join(">\r\n<", map { $_->{mid} } @$msgs ).
- '>');
- $$prev = $msgs->[-1]->{num};
- } else {
- shift @$overs;
- if (@$overs) { # continue onto next newsgroup
- $$prev = 0;
- return 1;
- } else { # break out of the long response.
- return;
+ my ($self, $names, $ts, $prev) = @_;
+ my $ngname = $names->[0];
+ if (my $ibx = $self->{nntpd}->{groups}->{$ngname}) {
+ if (my $over = $ibx->over) {
+ my $msgs = $over->query_ts($ts, $$prev);
+ if (scalar @$msgs) {
+ more($self, '<' .
+ join(">\r\n<",
+ map { $_->{mid} } @$msgs ) .
+ '>');
+ $$prev = $msgs->[-1]->{num};
+ return 1; # continue on current group
+ }
}
}
+ shift @$names;
+ if (@$names) { # continue onto next newsgroup
+ $$prev = 0;
+ 1;
+ } else { # all done, break out of the long_response
+ undef;
+ }
}
sub cmd_newnews ($$$$;$$) {
@@ -321,17 +326,11 @@ sub cmd_newnews ($$$$;$$) {
my ($keep, $skip) = split('!', $newsgroups, 2);
ngpat2re($keep);
ngpat2re($skip);
- my @overs;
- foreach my $ng (@{$self->{nntpd}->{grouplist}}) {
- $ng->{newsgroup} =~ $keep or next;
- $ng->{newsgroup} =~ $skip and next;
- my $over = $ng->over or next;
- push @overs, $over;
- };
- return '.' unless @overs;
-
+ my @names = grep(!/$skip/, grep(/$keep/,
+ @{$self->{nntpd}->{groupnames}}));
+ return '.' unless scalar(@names);
my $prev = 0;
- long_response($self, \&newnews_i, \@overs, $ts, \$prev);
+ long_response($self, \&newnews_i, \@names, $ts, \$prev);
}
sub cmd_group ($$) {
^ permalink raw reply related [flat|nested] 15+ messages in thread
* [PATCH 06/12] miscsearch: implement ->newsgroup_matches
2020-11-27 9:52 [PATCH 00/12] some NNTP-related fixes + speedups Eric Wong
` (4 preceding siblings ...)
2020-11-27 9:52 ` [PATCH 05/12] nntp: NEWNEWS: speed up filtering Eric Wong
@ 2020-11-27 9:52 ` Eric Wong
2020-11-27 9:52 ` [PATCH 07/12] nntp: LIST ACTIVE.TIMES use angle brackets around address Eric Wong
` (5 subsequent siblings)
11 siblings, 0 replies; 15+ messages in thread
From: Eric Wong @ 2020-11-27 9:52 UTC (permalink / raw)
To: meta
This may be used to speed up newsgroup searches down-the-line,
but the grep perlop isn't too shabby, at the moment.
---
lib/PublicInbox/MiscSearch.pm | 40 +++++++++++++++++++++++++++++++++++
t/extsearch.t | 4 ++++
2 files changed, 44 insertions(+)
diff --git a/lib/PublicInbox/MiscSearch.pm b/lib/PublicInbox/MiscSearch.pm
index 48ef6914..f2e31443 100644
--- a/lib/PublicInbox/MiscSearch.pm
+++ b/lib/PublicInbox/MiscSearch.pm
@@ -76,6 +76,46 @@ sub mset {
retry_reopen($self, \&misc_enquire_once, $qr, $opt);
}
+sub ibx_matches_once { # retry_reopen callback
+ my ($self, $qr, $by_newsgroup) = @_;
+ # double in case no newsgroups are configured:
+ my $limit = scalar(keys %$by_newsgroup) * 2;
+ my $opt = { limit => $limit, offset => 0, relevance => -1 };
+ my $ret = {}; # newsgroup => $ibx of matches
+ while (1) {
+ my $mset = misc_enquire_once($self, $qr, $opt);
+ for my $mi ($mset->items) {
+ my $doc = $mi->get_document;
+ my $end = $doc->termlist_end;
+ my $cur = $doc->termlist_begin;
+ $cur->skip_to('Q');
+ if ($cur != $end) {
+ my $ng = $cur->get_termname; # eidx_key
+ $ng =~ s/\AQ// or warn "BUG: no `Q': $ng";
+ if (my $ibx = $by_newsgroup->{$ng}) {
+ $ret->{$ng} = $ibx;
+ }
+ } else {
+ warn <<EOF;
+W: docid=${\$mi->get_docid} has no `Q' (eidx_key) term
+EOF
+ }
+ }
+ my $nr = $mset->size;
+ return $ret if $nr < $limit;
+ $opt->{offset} += $nr;
+ }
+}
+
+# returns a newsgroup => PublicInbox::Inbox mapping
+sub newsgroup_matches {
+ my ($self, $qs, $pi_cfg) = @_;
+ my $qp = $self->{qp} //= mi_qp_new($self);
+ $qs .= ' type:inbox';
+ my $qr = $qp->parse_query($qs, $PublicInbox::Search::QP_FLAGS);
+ retry_reopen($self, \&ibx_matches_once, $qr, $pi_cfg->{-by_newsgroup});
+}
+
sub ibx_data_once {
my ($self, $ibx) = @_;
my $xdb = $self->{xdb};
diff --git a/t/extsearch.t b/t/extsearch.t
index 0045294b..85cdf74a 100644
--- a/t/extsearch.t
+++ b/t/extsearch.t
@@ -79,5 +79,9 @@ my @it = $misc->mset('')->items;
is(scalar(@it), 2, 'two inboxes');
like($it[0]->get_document->get_data, qr/v2test/, 'docdata matched v2');
like($it[1]->get_document->get_data, qr/v1test/, 'docdata matched v1');
+my $pi_cfg = PublicInbox::Config->new;
+$pi_cfg->fill_all;
+my $ret = $misc->newsgroup_matches('', $pi_cfg);
+is_deeply($pi_cfg->{-by_newsgroup}, $ret, '->newsgroup_matches');
done_testing;
^ permalink raw reply related [flat|nested] 15+ messages in thread
* [PATCH 07/12] nntp: LIST ACTIVE.TIMES use angle brackets around address
2020-11-27 9:52 [PATCH 00/12] some NNTP-related fixes + speedups Eric Wong
` (5 preceding siblings ...)
2020-11-27 9:52 ` [PATCH 06/12] miscsearch: implement ->newsgroup_matches Eric Wong
@ 2020-11-27 9:52 ` Eric Wong
2020-11-27 9:52 ` [PATCH 08/12] nntp: move LIST iterators to long_response Eric Wong
` (4 subsequent siblings)
11 siblings, 0 replies; 15+ messages in thread
From: Eric Wong @ 2020-11-27 9:52 UTC (permalink / raw)
To: meta
This matches the example shown in RFC 3977, section 7.6.1.3
---
lib/PublicInbox/NNTP.pm | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/lib/PublicInbox/NNTP.pm b/lib/PublicInbox/NNTP.pm
index 5cbf5a16..eb2c0b38 100644
--- a/lib/PublicInbox/NNTP.pm
+++ b/lib/PublicInbox/NNTP.pm
@@ -149,7 +149,7 @@ sub list_active_times ($;$) {
for my $ngname (grep(/$wildmat/, @{$self->{nntpd}->{groupnames}})) {
my $ibx = $groups->{$ngname};
my $c = eval { $ibx->uidvalidity } // time;
- more($self, "$ngname $c $ibx->{-primary_address}");
+ more($self, "$ngname $c <$ibx->{-primary_address}>");
}
}
^ permalink raw reply related [flat|nested] 15+ messages in thread
* [PATCH 08/12] nntp: move LIST iterators to long_response
2020-11-27 9:52 [PATCH 00/12] some NNTP-related fixes + speedups Eric Wong
` (6 preceding siblings ...)
2020-11-27 9:52 ` [PATCH 07/12] nntp: LIST ACTIVE.TIMES use angle brackets around address Eric Wong
@ 2020-11-27 9:52 ` Eric Wong
2020-11-27 9:52 ` [PATCH 09/12] t/extsearch: show a more realistic case Eric Wong
` (3 subsequent siblings)
11 siblings, 0 replies; 15+ messages in thread
From: Eric Wong @ 2020-11-27 9:52 UTC (permalink / raw)
To: meta
Iterating through many newsgroups can hog the event loop if many
random seeks are required. Avoid monopolizing the event loop in
that case by using the long_response API.
For now, we can still rely on grep() since it seems to work
reasonably well with 50K test newsgroup names.
---
lib/PublicInbox/NNTP.pm | 77 +++++++++++++++++++++++++----------------
1 file changed, 48 insertions(+), 29 deletions(-)
diff --git a/lib/PublicInbox/NNTP.pm b/lib/PublicInbox/NNTP.pm
index eb2c0b38..af40b86d 100644
--- a/lib/PublicInbox/NNTP.pm
+++ b/lib/PublicInbox/NNTP.pm
@@ -31,9 +31,9 @@ use Errno qw(EAGAIN);
my $ONE_MSGID = qr/\A$MID_EXTRACT\z/;
my @OVERVIEW = qw(Subject From Date Message-ID References);
my $OVERVIEW_FMT = join(":\r\n", @OVERVIEW, qw(Bytes Lines), '') .
- "Xref:full\r\n";
+ "Xref:full\r\n.";
my $LIST_HEADERS = join("\r\n", @OVERVIEW,
- qw(:bytes :lines Xref To Cc)) . "\r\n";
+ qw(:bytes :lines Xref To Cc)) . "\r\n.";
my $CAPABILITIES = <<"";
101 Capability list:\r
VERSION 2\r
@@ -120,46 +120,66 @@ sub cmd_xgtitle ($;$) {
my ($self, $wildmat) = @_;
more($self, '282 list of groups and descriptions follows');
list_newsgroups($self, $wildmat);
- '.'
}
-sub list_overview_fmt ($) {
- my ($self) = @_;
- $self->msg_more($OVERVIEW_FMT);
-}
+sub list_overview_fmt ($) { $OVERVIEW_FMT }
-sub list_headers ($;$) {
- my ($self) = @_;
- $self->msg_more($LIST_HEADERS);
-}
+sub list_headers ($;$) { $LIST_HEADERS }
-sub list_active ($;$) {
- my ($self, $wildmat) = @_;
- wildmat2re($wildmat);
- my $groups = $self->{nntpd}->{groups};
- for my $ngname (grep(/$wildmat/, @{$self->{nntpd}->{groupnames}})) {
- group_line($self, $groups->{$ngname});
+sub list_active_i { # "LIST ACTIVE" and also just "LIST" (no args)
+ my ($self, $groupnames) = @_;
+ my @window = splice(@$groupnames, 0, 100) or return 0;
+ my $ibx;
+ my $groups = $self->{nntpd}->{pi_config}->{-by_newsgroup};
+ for my $ngname (@window) {
+ $ibx = $groups->{$ngname} and group_line($self, $ibx);
}
+ scalar(@$groupnames); # continue if there's more
}
-sub list_active_times ($;$) {
+sub list_active ($;$) { # called by cmd_list
my ($self, $wildmat) = @_;
wildmat2re($wildmat);
- my $groups = $self->{nntpd}->{groups};
- for my $ngname (grep(/$wildmat/, @{$self->{nntpd}->{groupnames}})) {
- my $ibx = $groups->{$ngname};
+ long_response($self, \&list_active_i, [
+ grep(/$wildmat/, @{$self->{nntpd}->{groupnames}}) ]);
+}
+
+sub list_active_times_i {
+ my ($self, $groupnames) = @_;
+ my @window = splice(@$groupnames, 0, 100) or return 0;
+ my $groups = $self->{nntpd}->{pi_config}->{-by_newsgroup};
+ for my $ngname (@window) {
+ my $ibx = $groups->{$ngname} or next;
my $c = eval { $ibx->uidvalidity } // time;
more($self, "$ngname $c <$ibx->{-primary_address}>");
}
+ scalar(@$groupnames); # continue if there's more
}
-sub list_newsgroups ($;$) {
+sub list_active_times ($;$) { # called by cmd_list
my ($self, $wildmat) = @_;
wildmat2re($wildmat);
- my $groups = $self->{nntpd}->{groups};
- for my $ngname (grep(/$wildmat/, @{$self->{nntpd}->{groupnames}})) {
- more($self, "$ngname ".$groups->{$ngname}->description);
+ long_response($self, \&list_active_times_i, [
+ grep(/$wildmat/, @{$self->{nntpd}->{groupnames}}) ]);
+}
+
+sub list_newsgroups_i {
+ my ($self, $groupnames) = @_;
+ my @window = splice(@$groupnames, 0, 100) or return 0;
+ my $groups = $self->{nntpd}->{pi_config}->{-by_newsgroup};
+ my $ibx;
+ for my $ngname (@window) {
+ $ibx = $groups->{$ngname} and
+ more($self, "$ngname ".$ibx->description);
}
+ scalar(@$groupnames); # continue if there's more
+}
+
+sub list_newsgroups ($;$) { # called by cmd_list
+ my ($self, $wildmat) = @_;
+ wildmat2re($wildmat);
+ long_response($self, \&list_newsgroups_i, [
+ grep(/$wildmat/, @{$self->{nntpd}->{groupnames}}) ]);
}
# LIST SUBSCRIPTIONS, DISTRIB.PATS are not supported
@@ -168,6 +188,7 @@ sub cmd_list ($;$$) {
if (scalar @args) {
my $arg = shift @args;
$arg =~ tr/A-Z./a-z_/;
+ my $ret = $arg eq 'active';
$arg = "list_$arg";
$arg = $self->can($arg);
return r501 unless $arg && args_ok($arg, scalar @args);
@@ -175,11 +196,9 @@ sub cmd_list ($;$$) {
$arg->($self, @args);
} else {
more($self, '215 list of newsgroups follows');
- foreach my $ng (@{$self->{nntpd}->{grouplist}}) {
- group_line($self, $ng);
- }
+ long_response($self, \&list_active_i, [ # copy array
+ @{$self->{nntpd}->{groupnames}} ]);
}
- '.'
}
sub listgroup_range_i {
^ permalink raw reply related [flat|nested] 15+ messages in thread
* [PATCH 09/12] t/extsearch: show a more realistic case
2020-11-27 9:52 [PATCH 00/12] some NNTP-related fixes + speedups Eric Wong
` (7 preceding siblings ...)
2020-11-27 9:52 ` [PATCH 08/12] nntp: move LIST iterators to long_response Eric Wong
@ 2020-11-27 9:52 ` Eric Wong
2020-11-27 9:52 ` [PATCH 10/12] nntp: some minor golfing Eric Wong
` (2 subsequent siblings)
11 siblings, 0 replies; 15+ messages in thread
From: Eric Wong @ 2020-11-27 9:52 UTC (permalink / raw)
To: meta
Different messages to different public Inboxes are likely to
have different List-IDs, so show that we can deduplicate based
on content (but per-mailing-list trailers need to go through a
PublicInbox::Filter::* or be disabled by mailing list admins).
---
t/extsearch.t | 18 ++++++++++++++++--
1 file changed, 16 insertions(+), 2 deletions(-)
diff --git a/t/extsearch.t b/t/extsearch.t
index 85cdf74a..778ba32d 100644
--- a/t/extsearch.t
+++ b/t/extsearch.t
@@ -25,14 +25,28 @@ my $v1addr = 'v1test@example.com';
ok(run_script([qw(-init -V2 v2test --newsgroup v2.example), "$home/v2test",
'http://example.com/v2test', $v2addr ]), 'v2test init');
my $env = { ORIGINAL_RECIPIENT => $v2addr };
-open($fh, '<', 't/utf8.eml') or BAIL_OUT("open t/utf8.eml: $!");
+my $eml = eml_load('t/utf8.eml');
+
+$eml->header_set('List-Id', '<v2.example.com>');
+open($fh, '+>', undef) or BAIL_OUT $!;
+$fh->autoflush(1);
+print $fh $eml->as_string or BAIL_OUT $!;
+seek($fh, 0, SEEK_SET) or BAIL_OUT $!;
+
run_script(['-mda', '--no-precheck'], $env, { 0 => $fh }) or BAIL_OUT '-mda';
ok(run_script([qw(-init -V1 v1test), "$home/v1test",
'http://example.com/v1test', $v1addr ]), 'v1test init');
-$env = { ORIGINAL_RECIPIENT => $v1addr };
+
+$eml->header_set('List-Id', '<v1.example.com>');
seek($fh, 0, SEEK_SET) or BAIL_OUT $!;
+truncate($fh, 0) or BAIL_OUT $!;
+print $fh $eml->as_string or BAIL_OUT $!;
+seek($fh, 0, SEEK_SET) or BAIL_OUT $!;
+
+$env = { ORIGINAL_RECIPIENT => $v1addr };
run_script(['-mda', '--no-precheck'], $env, { 0 => $fh }) or BAIL_OUT '-mda';
+
run_script(['-index', "$home/v1test"]) or BAIL_OUT "index $?";
ok(run_script([qw(-extindex --all), "$home/extindex"]), 'extindex init');
^ permalink raw reply related [flat|nested] 15+ messages in thread
* [PATCH 10/12] nntp: some minor golfing
2020-11-27 9:52 [PATCH 00/12] some NNTP-related fixes + speedups Eric Wong
` (8 preceding siblings ...)
2020-11-27 9:52 ` [PATCH 09/12] t/extsearch: show a more realistic case Eric Wong
@ 2020-11-27 9:52 ` Eric Wong
2020-11-27 9:52 ` [PATCH 11/12] nntp: xref: simplify sub signature Eric Wong
2020-11-27 9:52 ` [PATCH 12/12] nntp: xref: use ->ALL extindex if available Eric Wong
11 siblings, 0 replies; 15+ messages in thread
From: Eric Wong @ 2020-11-27 9:52 UTC (permalink / raw)
To: meta
Reduce screen real estate usage to reduce human attention span
requirements.
---
lib/PublicInbox/NNTP.pm | 7 ++-----
1 file changed, 2 insertions(+), 5 deletions(-)
diff --git a/lib/PublicInbox/NNTP.pm b/lib/PublicInbox/NNTP.pm
index af40b86d..967a36a5 100644
--- a/lib/PublicInbox/NNTP.pm
+++ b/lib/PublicInbox/NNTP.pm
@@ -92,8 +92,7 @@ sub process_line ($$) {
err($self, 'error from: %s (%s)', $l, $err);
$res = '503 program fault - command not performed';
}
- return 0 unless defined $res;
- res($self, $res);
+ defined($res) ? res($self, $res) : 0;
}
# The keyword argument is not used (rfc3977 5.2.2)
@@ -109,9 +108,7 @@ sub cmd_capabilities ($;$) {
sub cmd_mode ($$) {
my ($self, $arg) = @_;
- $arg = uc $arg;
- return r501 unless $arg eq 'READER';
- '201 Posting prohibited';
+ uc($arg) eq 'READER' ? '201 Posting prohibited' : r501;
}
sub cmd_slave ($) { '202 slave status noted' }
^ permalink raw reply related [flat|nested] 15+ messages in thread
* [PATCH 11/12] nntp: xref: simplify sub signature
2020-11-27 9:52 [PATCH 00/12] some NNTP-related fixes + speedups Eric Wong
` (9 preceding siblings ...)
2020-11-27 9:52 ` [PATCH 10/12] nntp: some minor golfing Eric Wong
@ 2020-11-27 9:52 ` Eric Wong
2020-11-27 9:52 ` [PATCH 12/12] nntp: xref: use ->ALL extindex if available Eric Wong
11 siblings, 0 replies; 15+ messages in thread
From: Eric Wong @ 2020-11-27 9:52 UTC (permalink / raw)
To: meta
We'll be using the `xref3' table in extindex to speed up xref(),
and that'll require comparisons against $smsg->{blob}. So pass
the entire $smsg through.
---
lib/PublicInbox/NNTP.pm | 54 +++++++++++++++++++++--------------------
1 file changed, 28 insertions(+), 26 deletions(-)
diff --git a/lib/PublicInbox/NNTP.pm b/lib/PublicInbox/NNTP.pm
index 967a36a5..39ff5257 100644
--- a/lib/PublicInbox/NNTP.pm
+++ b/lib/PublicInbox/NNTP.pm
@@ -408,18 +408,19 @@ sub header_append ($$$) {
$hdr->header_set($k, @v, $v);
}
-sub xref ($$$$) {
- my ($self, $ng, $n, $mid) = @_;
- my $ret = $self->{nntpd}->{servername} . " $ng->{newsgroup}:$n";
-
- # num_for is pretty cheap and sometimes we'll lookup the existence
- # of an article without getting even the OVER info. In other words,
- # I'm not sure if its worth optimizing by scanning To:/Cc: and
- # PublicInbox::ExtMsg on the PSGI end is just as expensive
- foreach my $other (@{$self->{nntpd}->{grouplist}}) {
- next if $ng eq $other;
- my $num = eval { $other->mm->num_for($mid) } or next;
- $ret .= " $other->{newsgroup}:$num";
+sub xref ($$$) {
+ my ($self, $cur_ibx, $smsg) = @_;
+ my $nntpd = $self->{nntpd};
+ my $cur_ngname = $cur_ibx->{newsgroup};
+ my $ret = "$nntpd->{servername} $cur_ngname:$smsg->{num}";
+
+ my $mid = $smsg->{mid};
+ my $groups = $nntpd->{pi_config}->{-by_newsgroup};
+ for my $xngname (@{$nntpd->{groupnames}}) {
+ next if $cur_ngname eq $xngname;
+ my $xibx = $groups->{$xngname} or next;
+ my $num = eval { $xibx->mm->num_for($mid) } or next;
+ $ret .= " $xngname:$num";
}
$ret;
}
@@ -443,7 +444,7 @@ sub set_nntp_headers ($$) {
# clobber some existing headers
my $ibx = $smsg->{-ibx};
- my $xref = xref($smsg->{nntp}, $ibx, $smsg->{num}, $mid);
+ my $xref = xref($smsg->{nntp}, $ibx, $smsg);
$hdr->header_set('Xref', $xref);
# RFC 5536 3.1.4
@@ -724,12 +725,12 @@ sub mid_lookup ($$) {
sub xref_range_i {
my ($self, $beg, $end) = @_;
my $ng = $self->{ng};
- my $r = $ng->mm->msg_range($beg, $end);
- @$r or return;
+ my $msgs = $ng->over->query_xover($$beg, $end);
+ scalar(@$msgs) or return;
+ $$beg = $msgs->[-1]->{num} + 1;
more($self, join("\r\n", map {
- my $num = $_->[0];
- "$num ".xref($self, $ng, $num, $_->[1]);
- } @$r));
+ "$_->{num} ".xref($self, $ng, $_);
+ } @$msgs));
1;
}
@@ -740,8 +741,9 @@ sub hdr_xref ($$$) { # optimize XHDR Xref [range] for rtin
my $mid = $1;
my ($ng, $n) = mid_lookup($self, $mid);
return r430 unless $n;
+ my $smsg = $ng->over->get_art($n) or return;
hdr_mid_response($self, $xhdr, $ng, $n, $range,
- xref($self, $ng, $n, $mid));
+ xref($self, $ng, $smsg));
} else { # numeric range
$range = $self->{article} unless defined $range;
my $r = get_range($self, $range);
@@ -872,11 +874,11 @@ sub cmd_xrover ($;$) {
long_response($self, \&xrover_i, @$r);
}
-sub over_line ($$$$) {
- my ($self, $ng, $num, $smsg) = @_;
+sub over_line ($$$) {
+ my ($self, $ng, $smsg) = @_;
# n.b. field access and procedural calls can be
# 10%-15% faster than OO method calls:
- my $s = join("\t", $num,
+ my $s = join("\t", $smsg->{num},
$smsg->{subject},
$smsg->{from},
PublicInbox::Smsg::date($smsg),
@@ -884,7 +886,7 @@ sub over_line ($$$$) {
$smsg->{references},
$smsg->{bytes},
$smsg->{lines},
- "Xref: " . xref($self, $ng, $num, $smsg->{mid}));
+ "Xref: " . xref($self, $ng, $smsg));
utf8::encode($s);
$s
}
@@ -899,8 +901,8 @@ sub cmd_over ($;$) {
# Only set article number column if it's the current group
my $self_ng = $self->{ng};
- $n = 0 if (!$self_ng || $self_ng ne $ng);
- more($self, over_line($self, $ng, $n, $smsg));
+ $smsg->{num} = 0 if (!$self_ng || $self_ng ne $ng);
+ more($self, over_line($self, $ng, $smsg));
'.';
} else {
cmd_xover($self, $range);
@@ -915,7 +917,7 @@ sub xover_i {
# OVERVIEW.FMT
more($self, join("\r\n", map {
- over_line($self, $ng, $_->{num}, $_);
+ over_line($self, $ng, $_);
} @$msgs));
$$beg = $msgs->[-1]->{num} + 1;
}
^ permalink raw reply related [flat|nested] 15+ messages in thread
* [PATCH 12/12] nntp: xref: use ->ALL extindex if available
2020-11-27 9:52 [PATCH 00/12] some NNTP-related fixes + speedups Eric Wong
` (10 preceding siblings ...)
2020-11-27 9:52 ` [PATCH 11/12] nntp: xref: simplify sub signature Eric Wong
@ 2020-11-27 9:52 ` Eric Wong
2020-11-30 19:42 ` xref3 + NNTP problems Eric Wong
11 siblings, 1 reply; 15+ messages in thread
From: Eric Wong @ 2020-11-27 9:52 UTC (permalink / raw)
To: meta
Getting Xref for cross-posted messages is an O(n) operation
where `n' is the number of newsgroups on the server. This works
acceptably when there are dozens of groups, but would be
unnacceptable when there's tens of thousands of newsgroups.
With ~140 newsgroups, a lore.kernel.org mirror already handles
"XHDR Xref $MESSAGE_ID" requests around 30% faster after
creating the xref3.idx_nntp index.
The SQL additions to ExtSearch.pm may be a bit strange and
seem more appropriate for Over.pm; however it currently makes
sense to me since those bits of over.sqlite3 access are
exclusive to ExtSearch and can't be used by traditional
v1/v2 inboxes...
---
lib/PublicInbox/ExtSearch.pm | 56 ++++++++++++++++++++++++++++++++++++
lib/PublicInbox/NNTP.pm | 23 +++++++++------
lib/PublicInbox/OverIdx.pm | 5 ++++
t/extsearch.t | 40 ++++++++++++++++++++++----
t/nntp.t | 7 ++++-
5 files changed, 116 insertions(+), 15 deletions(-)
diff --git a/lib/PublicInbox/ExtSearch.pm b/lib/PublicInbox/ExtSearch.pm
index dd93cd32..20ec3224 100644
--- a/lib/PublicInbox/ExtSearch.pm
+++ b/lib/PublicInbox/ExtSearch.pm
@@ -11,6 +11,7 @@ use PublicInbox::Over;
use PublicInbox::Inbox;
use File::Spec ();
use PublicInbox::MiscSearch;
+use DBI qw(:sql_types); # SQL_BLOB
# for ->reopen, ->mset, ->mset_to_artnums
use parent qw(PublicInbox::Search);
@@ -49,6 +50,61 @@ sub git {
$self->{git} //= PublicInbox::Git->new("$self->{topdir}/ALL.git");
}
+# returns an arrayref of [ $NEWSGROUP_NAME:$ART_NO ] using
+# the `xref3' table
+sub nntp_xref_for { # NNTP only
+ my ($self, $xibx, $xsmsg) = @_;
+ my $dbh = over($self)->dbh;
+
+ my $sth = $dbh->prepare_cached(<<'', undef, 1);
+SELECT ibx_id FROM inboxes WHERE eidx_key = ? LIMIT 1
+
+ $sth->execute($xibx->{newsgroup});
+ my $xibx_id = $sth->fetchrow_array // do {
+ warn "W: `$xibx->{newsgroup}' not found in $self->{topdir}\n";
+ return;
+ };
+
+ $sth = $dbh->prepare_cached(<<'', undef, 1);
+SELECT docid FROM xref3 WHERE oidbin = ? AND xnum = ? AND ibx_id = ? LIMIT 1
+
+ $sth->bind_param(1, pack('H*', $xsmsg->{blob}), SQL_BLOB);
+ $sth->bind_param(2, $xsmsg->{num});
+ $sth->bind_param(3, $xibx_id);
+ $sth->execute;
+ my $docid = $sth->fetchrow_array // do {
+ warn <<EOF;
+W: `$xibx->{newsgroup}:$xsmsg->{num}' not found in $self->{topdir}"
+EOF
+ return;
+ };
+
+ # LIMIT is number of newsgroups on server:
+ $sth = $dbh->prepare_cached(<<'', undef, 1);
+SELECT ibx_id,xnum FROM xref3 WHERE docid = ?
+
+ $sth->execute($docid);
+ my $rows = $sth->fetchall_arrayref;
+
+ my $eidx_key_sth = $dbh->prepare_cached(<<'', undef, 1);
+SELECT eidx_key FROM inboxes WHERE ibx_id = ? LIMIT 1
+
+ my %xref = map {
+ my ($ibx_id, $xnum) = @$_;
+ if ($ibx_id == $xibx_id) {
+ ();
+ } else {
+ $eidx_key_sth->execute($ibx_id);
+ my $eidx_key = $eidx_key_sth->fetchrow_array;
+
+ # only include if there's a newsgroup name
+ $eidx_key && index($eidx_key, '/') >= 0 ?
+ () : ($eidx_key => $xnum)
+ }
+ } @$rows;
+ [ map { "$_:$xref{$_}" } sort keys %xref ]; # match NNTP LIST order
+}
+
sub mm { undef }
sub altid_map { {} }
diff --git a/lib/PublicInbox/NNTP.pm b/lib/PublicInbox/NNTP.pm
index 39ff5257..8eec6b91 100644
--- a/lib/PublicInbox/NNTP.pm
+++ b/lib/PublicInbox/NNTP.pm
@@ -413,14 +413,21 @@ sub xref ($$$) {
my $nntpd = $self->{nntpd};
my $cur_ngname = $cur_ibx->{newsgroup};
my $ret = "$nntpd->{servername} $cur_ngname:$smsg->{num}";
-
- my $mid = $smsg->{mid};
- my $groups = $nntpd->{pi_config}->{-by_newsgroup};
- for my $xngname (@{$nntpd->{groupnames}}) {
- next if $cur_ngname eq $xngname;
- my $xibx = $groups->{$xngname} or next;
- my $num = eval { $xibx->mm->num_for($mid) } or next;
- $ret .= " $xngname:$num";
+ if (my $ALL = $nntpd->{pi_config}->ALL) {
+ if (my $ary = $ALL->nntp_xref_for($cur_ibx, $smsg)) {
+ $ret .= join(' ', '', @$ary) if scalar(@$ary);
+ }
+ # better off wrong than slow if there's thousands of groups,
+ # so no fallback to the slow path below:
+ } else { # slow path
+ my $mid = $smsg->{mid};
+ my $groups = $nntpd->{pi_config}->{-by_newsgroup};
+ for my $xngname (@{$nntpd->{groupnames}}) {
+ next if $cur_ngname eq $xngname;
+ my $xibx = $groups->{$xngname} or next;
+ my $num = eval { $xibx->mm->num_for($mid) } or next;
+ $ret .= " $xngname:$num";
+ }
}
$ret;
}
diff --git a/lib/PublicInbox/OverIdx.pm b/lib/PublicInbox/OverIdx.pm
index 173e3220..8bec08da 100644
--- a/lib/PublicInbox/OverIdx.pm
+++ b/lib/PublicInbox/OverIdx.pm
@@ -542,6 +542,11 @@ CREATE TABLE IF NOT EXISTS xref3 (
$dbh->do('CREATE INDEX IF NOT EXISTS idx_docid ON xref3 (docid)');
+ # performance critical, this is not UNIQUE since we may need to
+ # tolerate some old bugs from indexing mirrors
+ $dbh->do('CREATE INDEX IF NOT EXISTS idx_nntp ON '.
+ 'xref3 (oidbin,xnum,ibx_id)');
+
$dbh->do(<<'');
CREATE TABLE IF NOT EXISTS eidx_meta (
key VARCHAR(255) PRIMARY KEY,
diff --git a/t/extsearch.t b/t/extsearch.t
index 778ba32d..f9f74e5c 100644
--- a/t/extsearch.t
+++ b/t/extsearch.t
@@ -11,6 +11,8 @@ require_git(2.6);
require_mods(qw(DBD::SQLite Search::Xapian));
use_ok 'PublicInbox::ExtSearch';
use_ok 'PublicInbox::ExtSearchIdx';
+my $sock = tcp_server();
+my $host_port = $sock->sockhost . ':' . $sock->sockport;
my ($home, $for_destroy) = tmpdir();
local $ENV{HOME} = $home;
mkdir "$home/.public-inbox" or BAIL_OUT $!;
@@ -35,7 +37,7 @@ seek($fh, 0, SEEK_SET) or BAIL_OUT $!;
run_script(['-mda', '--no-precheck'], $env, { 0 => $fh }) or BAIL_OUT '-mda';
-ok(run_script([qw(-init -V1 v1test), "$home/v1test",
+ok(run_script([qw(-init -V1 v1test --newsgroup v1.example), "$home/v1test",
'http://example.com/v1test', $v1addr ]), 'v1test init');
$eml->header_set('List-Id', '<v1.example.com>');
@@ -51,6 +53,36 @@ run_script(['-index', "$home/v1test"]) or BAIL_OUT "index $?";
ok(run_script([qw(-extindex --all), "$home/extindex"]), 'extindex init');
+
+{ # TODO: -extindex should write this to config
+ open $fh, '>>', "$home/.public-inbox/config" or BAIL_OUT $!;
+ print $fh <<EOF or BAIL_OUT $!;
+; for ->ALL
+[extindex "all"]
+ topdir = $home/extindex
+EOF
+ close $fh or BAIL_OUT $!;
+
+ my $pi_cfg = PublicInbox::Config->new;
+ $pi_cfg->fill_all;
+ ok($pi_cfg->ALL, '->ALL');
+ my $ibx = $pi_cfg->{-by_newsgroup}->{'v2.example'};
+ my $ret = $pi_cfg->ALL->nntp_xref_for($ibx, $ibx->over->get_art(1));
+ is_deeply($ret, ['v1.example:1'], '->nntp_xref_for');
+}
+
+SKIP: {
+ require_mods(qw(Net::NNTP), 1);
+ my ($out, $err) = ("$home/nntpd.out.log", "$home/nntpd.err.log");
+ my $cmd = [ '-nntpd', '-W0' ]; #, "--stdout=$out", "--stderr=$err" ];
+ my $td = start_script($cmd, undef, { 3 => $sock });
+ my $n = Net::NNTP->new($host_port);
+ $n->group('v1.example');
+ my $res = $n->head(1);
+ @$res = grep(/^Xref: /, @$res);
+ like($res->[0], qr/ v1\.example:1 v2\.example:1/, 'nntp_xref works');
+}
+
my $es = PublicInbox::ExtSearch->new("$home/extindex");
{
my $smsg = $es->over->get_art(1);
@@ -58,7 +90,7 @@ my $es = PublicInbox::ExtSearch->new("$home/extindex");
is($es->over->get_art(2), undef, 'only one added');
my $xref3 = $es->over->get_xref3(1);
like($xref3->[0], qr/\A\Qv2.example\E:1:/, 'order preserved 1');
- like($xref3->[1], qr!\A\Q$home/v1test\E:1:!, 'order preserved 2');
+ like($xref3->[1], qr/\A\Qv1.example\E:1:/, 'order preserved 2');
is(scalar(@$xref3), 2, 'only to entries');
}
@@ -93,9 +125,5 @@ my @it = $misc->mset('')->items;
is(scalar(@it), 2, 'two inboxes');
like($it[0]->get_document->get_data, qr/v2test/, 'docdata matched v2');
like($it[1]->get_document->get_data, qr/v1test/, 'docdata matched v1');
-my $pi_cfg = PublicInbox::Config->new;
-$pi_cfg->fill_all;
-my $ret = $misc->newsgroup_matches('', $pi_cfg);
-is_deeply($pi_cfg->{-by_newsgroup}, $ret, '->newsgroup_matches');
done_testing;
diff --git a/t/nntp.t b/t/nntp.t
index 9a482acb..91a2aff7 100644
--- a/t/nntp.t
+++ b/t/nntp.t
@@ -8,6 +8,7 @@ use PublicInbox::Eml;
require_mods(qw(DBD::SQLite Data::Dumper));
use_ok 'PublicInbox::NNTP';
use_ok 'PublicInbox::Inbox';
+use PublicInbox::Config;
{
sub quote_str {
@@ -110,7 +111,11 @@ use_ok 'PublicInbox::Inbox';
my $mime = PublicInbox::Eml->new("Message-ID: <$mid>\r\n\r\n");
my $hdr = $mime->header_obj;
my $mock_self = {
- nntpd => { grouplist => [], servername => 'example.com' },
+ nntpd => {
+ grouplist => [],
+ servername => 'example.com',
+ pi_config => bless {}, 'PublicInbox::Config',
+ },
ng => $ng,
};
my $smsg = { num => 1, mid => $mid, nntp => $mock_self, -ibx => $ng };
^ permalink raw reply related [flat|nested] 15+ messages in thread
* xref3 + NNTP problems...
2020-11-27 9:52 ` [PATCH 12/12] nntp: xref: use ->ALL extindex if available Eric Wong
@ 2020-11-30 19:42 ` Eric Wong
2020-11-30 23:37 ` [PATCH] nntp: make ->ALL Xref generation more fuzzy Eric Wong
0 siblings, 1 reply; 15+ messages in thread
From: Eric Wong @ 2020-11-30 19:42 UTC (permalink / raw)
To: meta
Eric Wong <e@80x24.org> wrote:
> Subject: Re: [PATCH 12/12] nntp: xref: use ->ALL extindex if available
>
> Getting Xref for cross-posted messages is an O(n) operation
> where `n' is the number of newsgroups on the server. This works
> acceptably when there are dozens of groups, but would be
> unnacceptable when there's tens of thousands of newsgroups.
>
> With ~140 newsgroups, a lore.kernel.org mirror already handles
> "XHDR Xref $MESSAGE_ID" requests around 30% faster after
> creating the xref3.idx_nntp index.
xref3-based on ContentHash is great for internal use to ensure
we don't archive redundant messages; but don't get fooled by
reused Message-IDs, either (because users can and do reuse
Message-IDs for completely different messages)
But there's also more cases where content-matching for messages
we don't match because of mailing list-added trailers
(e.g. unsubscribe and other list info info)
So every message posted to linux-mtd@lists.infradead.org
nntp://rskvuqcfnfizkjg6h5jvovwb3wkikzcwskf54lfpymus6mxrzw67b5ad.onion/org.infradead.lists.linux-mtd
fails to match cross posts to Xref:, which is bad, too...
I have an idea to make it less bad, maybe it'll work...
^ permalink raw reply [flat|nested] 15+ messages in thread
* [PATCH] nntp: make ->ALL Xref generation more fuzzy
2020-11-30 19:42 ` xref3 + NNTP problems Eric Wong
@ 2020-11-30 23:37 ` Eric Wong
0 siblings, 0 replies; 15+ messages in thread
From: Eric Wong @ 2020-11-30 23:37 UTC (permalink / raw)
To: meta
Eric Wong <e@80x24.org> wrote:
> So every message posted to linux-mtd@lists.infradead.org
> nntp://rskvuqcfnfizkjg6h5jvovwb3wkikzcwskf54lfpymus6mxrzw67b5ad.onion/org.infradead.lists.linux-mtd
> fails to match cross posts to Xref:, which is bad, too...
>
> I have an idea to make it less bad, maybe it'll work...
Deployed to
nntp://rskvuqcfnfizkjg6h5jvovwb3wkikzcwskf54lfpymus6mxrzw67b5ad.onion/
and it seems to be OK when comparing messages cross-posted
between org.kernel.vger.* and org.infradead.lists.*
-------------8<------------
Subject: [PATCH] nntp: make ->ALL Xref generation more fuzzy
For ->ALL users, this mitigates the regression introduced
by commit 811b8d3cbaa790f59b7b107140b86248da16499b
("nntp: xref: use ->ALL extindex if available"), since
it's common to cross post messages to some mailing
lists with per-list trailers for unsubscribe information.
We won't bother dealing with Bcc-ed messages since those
are nearly all spam when it comes to public mailing lists.
Fixes: 811b8d3cbaa790f5 ("nntp: xref: use ->ALL extindex if available")
Link: https://public-inbox.org/meta/20201130194201.GA6687@dcvr/
---
lib/PublicInbox/ExtSearch.pm | 31 +++++++++++-----------
lib/PublicInbox/NNTP.pm | 50 ++++++++++++++++++++++++++----------
t/extsearch.t | 3 ++-
3 files changed, 53 insertions(+), 31 deletions(-)
diff --git a/lib/PublicInbox/ExtSearch.pm b/lib/PublicInbox/ExtSearch.pm
index 20ec3224..80455d8d 100644
--- a/lib/PublicInbox/ExtSearch.pm
+++ b/lib/PublicInbox/ExtSearch.pm
@@ -50,8 +50,7 @@ sub git {
$self->{git} //= PublicInbox::Git->new("$self->{topdir}/ALL.git");
}
-# returns an arrayref of [ $NEWSGROUP_NAME:$ART_NO ] using
-# the `xref3' table
+# returns a hashref of { $NEWSGROUP_NAME => $ART_NO } using the `xref3' table
sub nntp_xref_for { # NNTP only
my ($self, $xibx, $xsmsg) = @_;
my $dbh = over($self)->dbh;
@@ -69,7 +68,9 @@ SELECT ibx_id FROM inboxes WHERE eidx_key = ? LIMIT 1
SELECT docid FROM xref3 WHERE oidbin = ? AND xnum = ? AND ibx_id = ? LIMIT 1
$sth->bind_param(1, pack('H*', $xsmsg->{blob}), SQL_BLOB);
- $sth->bind_param(2, $xsmsg->{num});
+
+ # NNTP::cmd_over can set {num} to zero according to RFC 3977 8.3.2
+ $sth->bind_param(2, $xsmsg->{num} || $xsmsg->{-orig_num});
$sth->bind_param(3, $xibx_id);
$sth->execute;
my $docid = $sth->fetchrow_array // do {
@@ -81,9 +82,9 @@ EOF
# LIMIT is number of newsgroups on server:
$sth = $dbh->prepare_cached(<<'', undef, 1);
-SELECT ibx_id,xnum FROM xref3 WHERE docid = ?
+SELECT ibx_id,xnum FROM xref3 WHERE docid = ? AND ibx_id != ?
- $sth->execute($docid);
+ $sth->execute($docid, $xibx_id);
my $rows = $sth->fetchall_arrayref;
my $eidx_key_sth = $dbh->prepare_cached(<<'', undef, 1);
@@ -91,18 +92,16 @@ SELECT eidx_key FROM inboxes WHERE ibx_id = ? LIMIT 1
my %xref = map {
my ($ibx_id, $xnum) = @$_;
- if ($ibx_id == $xibx_id) {
- ();
- } else {
- $eidx_key_sth->execute($ibx_id);
- my $eidx_key = $eidx_key_sth->fetchrow_array;
-
- # only include if there's a newsgroup name
- $eidx_key && index($eidx_key, '/') >= 0 ?
- () : ($eidx_key => $xnum)
- }
+
+ $eidx_key_sth->execute($ibx_id);
+ my $eidx_key = $eidx_key_sth->fetchrow_array;
+
+ # only include if there's a newsgroup name
+ $eidx_key && index($eidx_key, '/') >= 0 ?
+ () : ($eidx_key => $xnum)
} @$rows;
- [ map { "$_:$xref{$_}" } sort keys %xref ]; # match NNTP LIST order
+ $xref{$xibx->{newsgroup}} = $xsmsg->{num};
+ \%xref;
}
sub mm { undef }
diff --git a/lib/PublicInbox/NNTP.pm b/lib/PublicInbox/NNTP.pm
index 3b16a66a..e0916011 100644
--- a/lib/PublicInbox/NNTP.pm
+++ b/lib/PublicInbox/NNTP.pm
@@ -17,6 +17,8 @@ use PublicInbox::DS qw(now);
use Digest::SHA qw(sha1_hex);
use Time::Local qw(timegm timelocal);
use PublicInbox::GitAsyncCat;
+use PublicInbox::Address;
+
use constant {
LINE_MAX => 512, # RFC 977 section 2.3
r501 => '501 command syntax error',
@@ -417,27 +419,42 @@ sub header_append ($$$) {
$hdr->header_set($k, @v, $v);
}
+sub xref_by_tc ($$$) {
+ my ($xref, $pi_cfg, $smsg) = @_;
+ my $by_addr = $pi_cfg->{-by_addr};
+ my $groups = $pi_cfg->{-by_newsgroup};
+ my $mid = $smsg->{mid};
+ for my $f (qw(to cc)) {
+ my @ibxs = map {
+ $by_addr->{lc($_)} // ()
+ } (PublicInbox::Address::emails($smsg->{$f} // ''));
+ for my $ibx (@ibxs) {
+ $groups->{my $ngname = $ibx->{newsgroup}} or next;
+ next if defined $xref->{$ngname};
+ $xref->{$ngname} = eval { $ibx->mm->num_for($mid) };
+ }
+ }
+}
+
sub xref ($$$) {
my ($self, $cur_ibx, $smsg) = @_;
my $nntpd = $self->{nntpd};
- my $cur_ngname = $cur_ibx->{newsgroup};
- my $ret = "$nntpd->{servername} $cur_ngname:$smsg->{num}";
+ my $cur_ng = $cur_ibx->{newsgroup};
+ my $xref;
if (my $ALL = $nntpd->{pi_config}->ALL) {
- if (my $ary = $ALL->nntp_xref_for($cur_ibx, $smsg)) {
- $ret .= join(' ', '', @$ary) if scalar(@$ary);
- }
- # better off wrong than slow if there's thousands of groups,
- # so no fallback to the slow path below:
+ $xref = $ALL->nntp_xref_for($cur_ibx, $smsg);
+ xref_by_tc($xref, $nntpd->{pi_config}, $smsg);
} else { # slow path
+ $xref = { $cur_ng => $smsg->{num} };
my $mid = $smsg->{mid};
- my $groups = $nntpd->{pi_config}->{-by_newsgroup};
- for my $xngname (@{$nntpd->{groupnames}}) {
- next if $cur_ngname eq $xngname;
- my $xibx = $groups->{$xngname} or next;
- my $num = eval { $xibx->mm->num_for($mid) } or next;
- $ret .= " $xngname:$num";
+ for my $ibx (values %{$nntpd->{pi_config}->{-by_newsgroup}}) {
+ next if defined($xref->{$ibx->{newsgroup}});
+ my $num = eval { $ibx->mm->num_for($mid) } // next;
+ $xref->{$ibx->{newsgroup}} = $num;
}
}
+ my $ret = "$nntpd->{servername} $cur_ng:".delete($xref->{$cur_ng});
+ $ret .= " $_:$xref->{$_}" for (sort keys %$xref);
$ret;
}
@@ -930,8 +947,13 @@ sub cmd_over ($;$) {
more($self, '224 Overview information follows (multi-line)');
# Only set article number column if it's the current group
+ # (RFC 3977 8.3.2)
my $self_ng = $self->{ng};
- $smsg->{num} = 0 if (!$self_ng || $self_ng ne $ng);
+ if (!$self_ng || $self_ng ne $ng) {
+ # set {-orig_num} for nntp_xref_for
+ $smsg->{-orig_num} = $smsg->{num};
+ $smsg->{num} = 0;
+ }
more($self, over_line($self, $ng, $smsg));
'.';
} else {
diff --git a/t/extsearch.t b/t/extsearch.t
index 2a1b05c7..2b8b88ea 100644
--- a/t/extsearch.t
+++ b/t/extsearch.t
@@ -72,7 +72,8 @@ EOF
ok($pi_cfg->ALL, '->ALL');
my $ibx = $pi_cfg->{-by_newsgroup}->{'v2.example'};
my $ret = $pi_cfg->ALL->nntp_xref_for($ibx, $ibx->over->get_art(1));
- is_deeply($ret, ['v1.example:1'], '->nntp_xref_for');
+ is_deeply($ret, { 'v1.example' => 1, 'v2.example' => 1 },
+ '->nntp_xref_for');
}
SKIP: {
^ permalink raw reply related [flat|nested] 15+ messages in thread