unofficial mirror of meta@public-inbox.org
 help / color / mirror / Atom feed
From: Eric Wong <e@80x24.org>
To: Konstantin Ryabitsev <konstantin@linuxfoundation.org>
Cc: meta@public-inbox.org
Subject: Re: Cheap way to check for new messages in a thread
Date: Thu, 30 Mar 2023 11:29:51 +0000	[thread overview]
Message-ID: <20230330112951.M493025@dcvr> (raw)
In-Reply-To: <20230329212558.M622984@dcvr>

Eric Wong <e@80x24.org> wrote:
> Konstantin Ryabitsev <konstantin@linuxfoundation.org> wrote:
> > However, if you do want to add ability to cheaply do a "give me just the
> > newest messages in this thread since this datetime", that would be great for
> > my needs. :)
> 
> Per-thread search is something I've wanted for a while, anyways,
> so I think I'll do /$MSGID/?q= in between ongoing work for

This implements the mbox.gz retrieval.  I didn't want to deal
with HTML or figure out how to expose more <form> elements
yet, but I figure mbox.gz is the most important.

Now deployed on 80x24.org/lore:

MSGID=20230327080502.GA570847@ziqianlu-desk2
curl -d '' -sSf \
   https://80x24.org/lore/all/"$MSGID/?x=m&q=rt:2023-03-29.." | \
   zcat | grep -i ^Message-ID:

shows the expected messages.
-----------8<-----------
Subject: [PATCH] www: support POST /$INBOX/$MSGID/?x=m&q=

This allows filtering the contents of any existing thread using
a search query.  It uses the existing THREADID column in Xapian
so we can internally add a Xapian OP_FILTER to the results.

This new functionality is orthogonal to the existing `t=1'
parameter which gives mairix-style thread expansion.  It doesn't
make sense to use `t=1' with this functionality, but it's not
disallowed, either.

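The indentation change in Over->next_by_mid is to ensure
DBI->prepare_cached can share one statement handle across both
->next_by_mid and ->mid2tid (the cache is keyed on the exact SQL
text, so the heredoc contents must be byte-identical).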

I also noticed the existing regex for `POST /$INBOX/?x=m&q=' was
allowing extra characters.  With an added \z, it's now as strict
as was originally intended, and AFAIK nothing was generating
invalid URLs for it.

Reported-by: Konstantin Ryabitsev <konstantin@linuxfoundation.org>
Link: https://public-inbox.org/meta/aaniyhk7wfm4e6m5mbukcrhevzoc6ftctyrfwvmz4fkykwwtlj@mverfng6ytas/T/
---
 lib/PublicInbox/Mbox.pm   |  5 ++++
 lib/PublicInbox/Over.pm   | 24 ++++++++++++++++++-
 lib/PublicInbox/Search.pm |  6 +++++
 lib/PublicInbox/WWW.pm    |  4 +++-
 t/psgi_v2.t               | 50 ++++++++++++++++++++++++++++++++++-----
 5 files changed, 81 insertions(+), 8 deletions(-)

diff --git a/lib/PublicInbox/Mbox.pm b/lib/PublicInbox/Mbox.pm
index 18db9d38..e1abf7ec 100644
--- a/lib/PublicInbox/Mbox.pm
+++ b/lib/PublicInbox/Mbox.pm
@@ -229,6 +229,11 @@ sub mbox_all {
 		return PublicInbox::WWW::need($ctx, 'Overview');
 
 	my $qopts = $ctx->{qopts} = { relevance => -2 }; # ORDER BY docid DESC
+
+	# {threadid} limits results to a given thread
+	# {threads} collapses results from messages in the same thread,
+	# allowing us to use ->expand_thread w/o duplicates in our own code
+	$qopts->{threadid} = $over->mid2tid($ctx->{mid}) if defined($ctx->{mid});
 	$qopts->{threads} = 1 if $q->{t};
 	$srch->query_approxidate($ctx->{ibx}->git, $q_string);
 	my $mset = $srch->mset($q_string, $qopts);
diff --git a/lib/PublicInbox/Over.pm b/lib/PublicInbox/Over.pm
index 271e2246..6ba27118 100644
--- a/lib/PublicInbox/Over.pm
+++ b/lib/PublicInbox/Over.pm
@@ -283,13 +283,35 @@ SELECT eidx_key FROM inboxes WHERE ibx_id = ?
 	$rows;
 }
 
+sub mid2tid {
+	my ($self, $mid) = @_;
+	my $dbh = dbh($self);
+
+	my $sth = $dbh->prepare_cached(<<'', undef, 1);
+SELECT id FROM msgid WHERE mid = ? LIMIT 1
+
+	$sth->execute($mid);
+	my $id = $sth->fetchrow_array or return;
+	$sth = $dbh->prepare_cached(<<'', undef, 1);
+SELECT num FROM id2num WHERE id = ? AND num > ?
+ORDER BY num ASC LIMIT 1
+
+	$sth->execute($id, 0);
+	my $num = $sth->fetchrow_array or return;
+	$sth = $dbh->prepare(<<'');
+SELECT tid FROM over WHERE num = ? LIMIT 1
+
+	$sth->execute($num);
+	$sth->fetchrow_array;
+}
+
 sub next_by_mid {
 	my ($self, $mid, $id, $prev) = @_;
 	my $dbh = dbh($self);
 
 	unless (defined $$id) {
 		my $sth = $dbh->prepare_cached(<<'', undef, 1);
-	SELECT id FROM msgid WHERE mid = ? LIMIT 1
+SELECT id FROM msgid WHERE mid = ? LIMIT 1
 
 		$sth->execute($mid);
 		$$id = $sth->fetchrow_array;
diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm
index 5133a3b7..6c3d9f93 100644
--- a/lib/PublicInbox/Search.pm
+++ b/lib/PublicInbox/Search.pm
@@ -386,6 +386,12 @@ sub mset {
 					sortable_serialise($uid_range->[1]));
 		$query = $X{Query}->new(OP_FILTER(), $query, $range);
 	}
+	if (defined(my $tid = $opt->{threadid})) {
+		$tid = sortable_serialise($tid);
+		$query = $X{Query}->new(OP_FILTER(), $query,
+				$X{Query}->new(OP_VALUE_RANGE(), THREADID, $tid, $tid));
+	}
+
 	my $xdb = xdb($self);
 	my $enq = $X{Enquire}->new($xdb);
 	$enq->set_query($query);
diff --git a/lib/PublicInbox/WWW.pm b/lib/PublicInbox/WWW.pm
index 9ffcb879..a8f1ad17 100644
--- a/lib/PublicInbox/WWW.pm
+++ b/lib/PublicInbox/WWW.pm
@@ -68,7 +68,9 @@ sub call {
 			my ($idx, $fn) = ($3, $4);
 			return invalid_inbox_mid($ctx, $1, $2) ||
 				get_attach($ctx, $idx, $fn);
-		} elsif ($path_info =~ m!$INBOX_RE/!o) {
+		} elsif ($path_info =~ m!$INBOX_RE/$MID_RE/\z!o) {
+			return invalid_inbox_mid($ctx, $1, $2) || mbox_results($ctx);
+		} elsif ($path_info =~ m!$INBOX_RE/\z!o) {
 			return invalid_inbox($ctx, $1) || mbox_results($ctx);
 		}
 	}
diff --git a/t/psgi_v2.t b/t/psgi_v2.t
index 5b197a9f..0a77adfb 100644
--- a/t/psgi_v2.t
+++ b/t/psgi_v2.t
@@ -4,6 +4,7 @@
 use strict;
 use v5.10.1;
 use PublicInbox::TestCommon;
+use IO::Uncompress::Gunzip qw(gunzip);
 require_git(2.6);
 use PublicInbox::Eml;
 use PublicInbox::Config;
@@ -76,6 +77,30 @@ $new_mid //= do {
 	local $/;
 	<$fh>;
 };
+
+my $m2t = create_inbox 'mid2tid-1', version => 2, indexlevel => 'medium', sub {
+	my ($im, $ibx) = @_;
+	for my $n (1..3) {
+		$im->add(PublicInbox::Eml->new(<<EOM)) or xbail 'add';
+Date: Fri, 02 Oct 1993 00:0$n:00 +0000
+Message-ID: <t\@$n>
+Subject: tid $n
+From: x\@example.com
+References: <a-mid\@b>
+
+$n
+EOM
+		$im->add(PublicInbox::Eml->new(<<EOM)) or xbail 'add';
+Date: Fri, 02 Oct 1993 00:0$n:00 +0000
+Message-ID: <ut\@$n>
+Subject: unrelated tid $n
+From: x\@example.com
+References: <b-mid\@b>
+
+EOM
+	}
+};
+
 my $cfgpath = "$ibx->{inboxdir}/pi_config";
 {
 	open my $fh, '>', $cfgpath or BAIL_OUT $!;
@@ -86,6 +111,9 @@ my $cfgpath = "$ibx->{inboxdir}/pi_config";
 [publicinbox "dup"]
 	inboxdir = $dibx->{inboxdir}
 	address = $dibx->{-primary_address}
+[publicinbox "m2t"]
+	inboxdir = $m2t->{inboxdir}
+	address = $m2t->{-primary_address}
 EOF
 	close $fh or BAIL_OUT;
 }
@@ -178,20 +206,18 @@ my $client1 = sub {
 	$cfg->each_inbox(sub { $_[0]->search->reopen });
 
 	SKIP: {
-		eval { require IO::Uncompress::Gunzip };
-		skip 'IO::Uncompress::Gunzip missing', 6 if $@;
 		my ($in, $out, $status);
 		my $req = GET('/v2test/a-mid@b/raw');
 		$req->header('Accept-Encoding' => 'gzip');
 		$res = $cb->($req);
 		is($res->header('Content-Encoding'), 'gzip', 'gzip encoding');
 		$in = $res->content;
-		IO::Uncompress::Gunzip::gunzip(\$in => \$out);
+		gunzip(\$in => \$out);
 		is($out, $raw, 'gzip response matches');
 
 		$res = $cb->(GET('/v2test/a-mid@b/t.mbox.gz'));
 		$in = $res->content;
-		$status = IO::Uncompress::Gunzip::gunzip(\$in => \$out);
+		$status = gunzip(\$in => \$out);
 		unlike($out, qr/^From oldbug/sm, 'buggy "From_" line omitted');
 		like($out, qr/^hello world$/m, 'got first in t.mbox.gz');
 		like($out, qr/^hello world!$/m, 'got second in t.mbox.gz');
@@ -202,7 +228,7 @@ my $client1 = sub {
 		# search interface
 		$res = $cb->(POST('/v2test/?q=m:a-mid@b&x=m'));
 		$in = $res->content;
-		$status = IO::Uncompress::Gunzip::gunzip(\$in => \$out);
+		$status = gunzip(\$in => \$out);
 		unlike($out, qr/^From oldbug/sm, 'buggy "From_" line omitted');
 		like($out, qr/^hello world$/m, 'got first in mbox POST');
 		like($out, qr/^hello world!$/m, 'got second in mbox POST');
@@ -213,7 +239,7 @@ my $client1 = sub {
 		# all.mbox.gz interface
 		$res = $cb->(GET('/v2test/all.mbox.gz'));
 		$in = $res->content;
-		$status = IO::Uncompress::Gunzip::gunzip(\$in => \$out);
+		$status = gunzip(\$in => \$out);
 		unlike($out, qr/^From oldbug/sm, 'buggy "From_" line omitted');
 		like($out, qr/^hello world$/m, 'got first in all.mbox');
 		like($out, qr/^hello world!$/m, 'got second in all.mbox');
@@ -335,6 +361,18 @@ my $client3 = sub {
 	local $SIG{__WARN__} = sub { push @warn, @_ };
 	$res = $cb->(GET('/v2test/?t=1970'.'01'.'01'));
 	is_deeply(\@warn, [], 'no warnings on YYYYMMDD only');
+
+	$res = $cb->(POST("/m2t/t\@1/?q=dt:19931002000300..&x=m"));
+	is($res->code, 200, 'got 200 on mid2tid query');
+	gunzip(\(my $in = $res->content) => \(my $out));
+	my @m = ($out =~ m!^Message-ID: <([^>]+)>\n!gms);
+	is_deeply(\@m, ['t@3'], 'only got latest result from query');
+
+	$res = $cb->(POST("/m2t/t\@1/?q=dt:19931002000400..&x=m"));
+	is($res->code, 404, '404 on out-of-range mid2tid query');
+
+	$res = $cb->(POST("/m2t/t\@1/?q=s:unrelated&x=m"));
+	is($res->code, 404, '404 on cross-thread search');
 };
 test_psgi(sub { $www->call(@_) }, $client3);
 test_httpd($env, $client3, 4);


  reply	other threads:[~2023-03-30 11:29 UTC|newest]

Thread overview: 17+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2023-03-27 15:08 Cheap way to check for new messages in a thread Konstantin Ryabitsev
2023-03-27 19:10 ` Eric Wong
2023-03-27 20:47   ` Konstantin Ryabitsev
2023-03-27 21:38     ` Eric Wong
2023-03-28 14:04       ` Konstantin Ryabitsev
2023-03-28 19:45         ` Eric Wong
2023-03-28 20:00           ` Konstantin Ryabitsev
2023-03-28 22:08             ` Eric Wong
2023-03-28 23:30               ` Konstantin Ryabitsev
2023-03-29 21:25                 ` Eric Wong
2023-03-30 11:29                   ` Eric Wong [this message]
2023-03-30 16:45                     ` Konstantin Ryabitsev
2023-03-31  1:40                       ` Eric Wong
2023-04-11 11:27                         ` Eric Wong
2023-06-16 19:11                     ` Konstantin Ryabitsev
2023-06-16 23:13                       ` [PATCH] www: use correct threadid for per-thread search Eric Wong
2023-06-21 17:11                         ` Konstantin Ryabitsev

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

  List information: https://public-inbox.org/README

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20230330112951.M493025@dcvr \
    --to=e@80x24.org \
    --cc=konstantin@linuxfoundation.org \
    --cc=meta@public-inbox.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).