From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.6 (2021-04-09) on dcvr.yhbt.net X-Spam-Level: X-Spam-ASN: X-Spam-Status: No, score=-4.2 required=3.0 tests=ALL_TRUSTED,BAYES_00, DKIM_SIGNED,DKIM_VALID,DKIM_VALID_AU,DKIM_VALID_EF shortcircuit=no autolearn=ham autolearn_force=no version=3.4.6 Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id 8DC081F452; Thu, 30 Mar 2023 11:29:51 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=80x24.org; s=selector1; t=1680175791; bh=O9sOJ1Q+ifjSJMZQv8dJnFYbZhpDsQo7bUgNyMFXn6s=; h=Date:From:To:Cc:Subject:References:In-Reply-To:From; b=oAOJTqAfr3kPER9IGGNRh23/loBXBFV09syG3lQHN9cdwF/M4luurcQxFIEvDiUiY hzD/xh8eBs5MKtrbxJojuP0C9IRQaBHrxnIFq7D2S95UUF3E+ngZCpnXCuvbKfdM4Q 8p8KAYuBunsCZiMBCtEcF/0uSVXl9m+YSAVFH8Uc= Date: Thu, 30 Mar 2023 11:29:51 +0000 From: Eric Wong To: Konstantin Ryabitsev Cc: meta@public-inbox.org Subject: Re: Cheap way to check for new messages in a thread Message-ID: <20230330112951.M493025@dcvr> References: <20230327191049.M277377@dcvr> <20230327213849.M743623@dcvr> <20230328194549.M808175@dcvr> <20230328-monsoon-charred-giver-91f26d3024fb@meerkat> <20230328220830.M352242@dcvr> <20230328-oppressed-almighty-61330f9dde22@meerkat> <20230329212558.M622984@dcvr> MIME-Version: 1.0 Content-Type: text/plain; charset=utf-8 Content-Disposition: inline In-Reply-To: <20230329212558.M622984@dcvr> List-Id: Eric Wong wrote: > Konstantin Ryabitsev wrote: > > However, if you do want to add ability to cheaply do a "give me just the > > newest messages in this thread since this datetime", that would be great for > > my needs. :) > > Per-thread search is something I've wanted for a while, anyways, > so I think I'll do /$MSGID/?q= in between ongoing work for This implements the mbox.gz retrieval. I didn't want to deal with HTML nor figuring out how to expose more
elements, yet; but I figure mbox.gz is the most important. Now deployed on 80x24.org/lore: MSGID=20230327080502.GA570847@ziqianlu-desk2 curl -d '' -sSf \ https://80x24.org/lore/all/"$MSGID/?x=m&q=rt:2023-03-29.." | \ zcat | grep -i ^Message-ID: shows the expected messages. -----------8<----------- Subject: [PATCH] www: support POST /$INBOX/$MSGID/?x=m&q= This allows filtering the contents of any existing thread using a search query. It uses the existing THREADID column in Xapian so we can internally add a Xapian OP_FILTER to the results. This new functionality is orthogonal to the existing `t=1' parameter which gives mairix-style thread expansion. It doesn't make sense to use `t=1' with this functionality, but it's not disallowed, either. The indentation change in Over->next_by_mid is to ensure DBI->prepare_cached can share across both ->next_by_mid and ->mid2tid. I also noticed the existing regex for `POST /$INBOX/?x=m&q=' was allowing extra characters. With an added \z, it's now as strict as originally intended and AFAIK nothing was generating invalid URLs for it. Reported-by: Konstantin Ryabitsev Link: https://public-inbox.org/meta/aaniyhk7wfm4e6m5mbukcrhevzoc6ftctyrfwvmz4fkykwwtlj@mverfng6ytas/T/ --- lib/PublicInbox/Mbox.pm | 5 ++++ lib/PublicInbox/Over.pm | 24 ++++++++++++++++++- lib/PublicInbox/Search.pm | 6 +++++ lib/PublicInbox/WWW.pm | 4 +++- t/psgi_v2.t | 50 ++++++++++++++++++++++++++++++++++----- 5 files changed, 81 insertions(+), 8 deletions(-) diff --git a/lib/PublicInbox/Mbox.pm b/lib/PublicInbox/Mbox.pm index 18db9d38..e1abf7ec 100644 --- a/lib/PublicInbox/Mbox.pm +++ b/lib/PublicInbox/Mbox.pm @@ -229,6 +229,11 @@ sub mbox_all { return PublicInbox::WWW::need($ctx, 'Overview'); my $qopts = $ctx->{qopts} = { relevance => -2 }; # ORDER BY docid DESC + + # {threadid} limits results to a given thread + # {threads} collapses results from messages in the same thread, + # allowing us to use ->expand_thread w/o duplicates in our own code + $qopts->{threadid} 
= $over->mid2tid($ctx->{mid}) if defined($ctx->{mid}); $qopts->{threads} = 1 if $q->{t}; $srch->query_approxidate($ctx->{ibx}->git, $q_string); my $mset = $srch->mset($q_string, $qopts); diff --git a/lib/PublicInbox/Over.pm b/lib/PublicInbox/Over.pm index 271e2246..6ba27118 100644 --- a/lib/PublicInbox/Over.pm +++ b/lib/PublicInbox/Over.pm @@ -283,13 +283,35 @@ SELECT eidx_key FROM inboxes WHERE ibx_id = ? $rows; } +sub mid2tid { + my ($self, $mid) = @_; + my $dbh = dbh($self); + + my $sth = $dbh->prepare_cached(<<'', undef, 1); +SELECT id FROM msgid WHERE mid = ? LIMIT 1 + + $sth->execute($mid); + my $id = $sth->fetchrow_array or return; + $sth = $dbh->prepare_cached(<<'', undef, 1); +SELECT num FROM id2num WHERE id = ? AND num > ? +ORDER BY num ASC LIMIT 1 + + $sth->execute($id, 0); + my $num = $sth->fetchrow_array or return; + $sth = $dbh->prepare(<<''); +SELECT tid FROM over WHERE num = ? LIMIT 1 + + $sth->execute($num); + $sth->fetchrow_array; +} + sub next_by_mid { my ($self, $mid, $id, $prev) = @_; my $dbh = dbh($self); unless (defined $$id) { my $sth = $dbh->prepare_cached(<<'', undef, 1); - SELECT id FROM msgid WHERE mid = ? LIMIT 1 +SELECT id FROM msgid WHERE mid = ? 
LIMIT 1 $sth->execute($mid); $$id = $sth->fetchrow_array; diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm index 5133a3b7..6c3d9f93 100644 --- a/lib/PublicInbox/Search.pm +++ b/lib/PublicInbox/Search.pm @@ -386,6 +386,12 @@ sub mset { sortable_serialise($uid_range->[1])); $query = $X{Query}->new(OP_FILTER(), $query, $range); } + if (defined(my $tid = $opt->{threadid})) { + $tid = sortable_serialise($tid); + $query = $X{Query}->new(OP_FILTER(), $query, + $X{Query}->new(OP_VALUE_RANGE(), THREADID, $tid, $tid)); + } + my $xdb = xdb($self); my $enq = $X{Enquire}->new($xdb); $enq->set_query($query); diff --git a/lib/PublicInbox/WWW.pm b/lib/PublicInbox/WWW.pm index 9ffcb879..a8f1ad17 100644 --- a/lib/PublicInbox/WWW.pm +++ b/lib/PublicInbox/WWW.pm @@ -68,7 +68,9 @@ sub call { my ($idx, $fn) = ($3, $4); return invalid_inbox_mid($ctx, $1, $2) || get_attach($ctx, $idx, $fn); - } elsif ($path_info =~ m!$INBOX_RE/!o) { + } elsif ($path_info =~ m!$INBOX_RE/$MID_RE/\z!o) { + return invalid_inbox_mid($ctx, $1, $2) || mbox_results($ctx); + } elsif ($path_info =~ m!$INBOX_RE/\z!o) { return invalid_inbox($ctx, $1) || mbox_results($ctx); } } diff --git a/t/psgi_v2.t b/t/psgi_v2.t index 5b197a9f..0a77adfb 100644 --- a/t/psgi_v2.t +++ b/t/psgi_v2.t @@ -4,6 +4,7 @@ use strict; use v5.10.1; use PublicInbox::TestCommon; +use IO::Uncompress::Gunzip qw(gunzip); require_git(2.6); use PublicInbox::Eml; use PublicInbox::Config; @@ -76,6 +77,30 @@ $new_mid //= do { local $/; <$fh>; }; + +my $m2t = create_inbox 'mid2tid-1', version => 2, indexlevel => 'medium', sub { + my ($im, $ibx) = @_; + for my $n (1..3) { + $im->add(PublicInbox::Eml->new(< +Subject: tid $n +From: x\@example.com +References: + +$n +EOM + $im->add(PublicInbox::Eml->new(< +Subject: unrelated tid $n +From: x\@example.com +References: + +EOM + } +}; + my $cfgpath = "$ibx->{inboxdir}/pi_config"; { open my $fh, '>', $cfgpath or BAIL_OUT $!; @@ -86,6 +111,9 @@ my $cfgpath = "$ibx->{inboxdir}/pi_config"; 
[publicinbox "dup"] inboxdir = $dibx->{inboxdir} address = $dibx->{-primary_address} +[publicinbox "m2t"] + inboxdir = $m2t->{inboxdir} + address = $m2t->{-primary_address} EOF close $fh or BAIL_OUT; } @@ -178,20 +206,18 @@ my $client1 = sub { $cfg->each_inbox(sub { $_[0]->search->reopen }); SKIP: { - eval { require IO::Uncompress::Gunzip }; - skip 'IO::Uncompress::Gunzip missing', 6 if $@; my ($in, $out, $status); my $req = GET('/v2test/a-mid@b/raw'); $req->header('Accept-Encoding' => 'gzip'); $res = $cb->($req); is($res->header('Content-Encoding'), 'gzip', 'gzip encoding'); $in = $res->content; - IO::Uncompress::Gunzip::gunzip(\$in => \$out); + gunzip(\$in => \$out); is($out, $raw, 'gzip response matches'); $res = $cb->(GET('/v2test/a-mid@b/t.mbox.gz')); $in = $res->content; - $status = IO::Uncompress::Gunzip::gunzip(\$in => \$out); + $status = gunzip(\$in => \$out); unlike($out, qr/^From oldbug/sm, 'buggy "From_" line omitted'); like($out, qr/^hello world$/m, 'got first in t.mbox.gz'); like($out, qr/^hello world!$/m, 'got second in t.mbox.gz'); @@ -202,7 +228,7 @@ my $client1 = sub { # search interface $res = $cb->(POST('/v2test/?q=m:a-mid@b&x=m')); $in = $res->content; - $status = IO::Uncompress::Gunzip::gunzip(\$in => \$out); + $status = gunzip(\$in => \$out); unlike($out, qr/^From oldbug/sm, 'buggy "From_" line omitted'); like($out, qr/^hello world$/m, 'got first in mbox POST'); like($out, qr/^hello world!$/m, 'got second in mbox POST'); @@ -213,7 +239,7 @@ my $client1 = sub { # all.mbox.gz interface $res = $cb->(GET('/v2test/all.mbox.gz')); $in = $res->content; - $status = IO::Uncompress::Gunzip::gunzip(\$in => \$out); + $status = gunzip(\$in => \$out); unlike($out, qr/^From oldbug/sm, 'buggy "From_" line omitted'); like($out, qr/^hello world$/m, 'got first in all.mbox'); like($out, qr/^hello world!$/m, 'got second in all.mbox'); @@ -335,6 +361,18 @@ my $client3 = sub { local $SIG{__WARN__} = sub { push @warn, @_ }; $res = 
$cb->(GET('/v2test/?t=1970'.'01'.'01')); is_deeply(\@warn, [], 'no warnings on YYYYMMDD only'); + + $res = $cb->(POST("/m2t/t\@1/?q=dt:19931002000300..&x=m")); + is($res->code, 200, 'got 200 on mid2tid query'); + gunzip(\(my $in = $res->content) => \(my $out)); + my @m = ($out =~ m!^Message-ID: <([^>]+)>\n!gms); + is_deeply(\@m, ['t@3'], 'only got latest result from query'); + + $res = $cb->(POST("/m2t/t\@1/?q=dt:19931002000400..&x=m")); + is($res->code, 404, '404 on out-of-range mid2tid query'); + + $res = $cb->(POST("/m2t/t\@1/?q=s:unrelated&x=m")); + is($res->code, 404, '404 on cross-thread search'); }; test_psgi(sub { $www->call(@_) }, $client3); test_httpd($env, $client3, 4);