From: Eric Wong <e@80x24.org>
To: Konstantin Ryabitsev <konstantin@linuxfoundation.org>
Cc: meta@public-inbox.org
Subject: [PATCH] view: distinguish strict and loose thread matches
Date: Sun, 5 Aug 2018 06:04:40 +0000 [thread overview]
Message-ID: <20180805060440.fhl7zvyis246e3ym@dcvr> (raw)
In-Reply-To: <a4baf526-e888-9492-b831-11bb9338f647@linuxfoundation.org>
The "loose" (Subject:-based) thread matching yields too many
hits for some common subjects (e.g. "[GIT] Networking" on LKML)
and causes thread skeletons to not show the current messages.
Favor strict matches in the query and only add loose matches
if there's space.
While working on this, I noticed the backwards --reindex walk
breaks `tid' on v1 repositories, at least. That bug was hidden
by the Subject: match logic and not discovered until now. It
will be fixed separately.
Reported-by: Konstantin Ryabitsev <konstantin@linuxfoundation.org>
---
lib/PublicInbox/Over.pm | 43 ++++++++++++++++++++++------
lib/PublicInbox/View.pm | 63 +++++++++++++++++++++++++++++++++++++----
2 files changed, 93 insertions(+), 13 deletions(-)
diff --git a/lib/PublicInbox/Over.pm b/lib/PublicInbox/Over.pm
index b2f6883..dda1e6d 100644
--- a/lib/PublicInbox/Over.pm
+++ b/lib/PublicInbox/Over.pm
@@ -11,6 +11,7 @@ package PublicInbox::Over;
use DBD::SQLite;
use PublicInbox::SearchMsg;
use Compress::Zlib qw(uncompress);
+use constant DEFAULT_LIMIT => 1000;
sub dbh_new {
my ($self) = @_;
@@ -53,7 +54,7 @@ sub load_from_row {
sub do_get {
my ($self, $sql, $opts, @args) = @_;
my $dbh = $self->connect;
- my $lim = (($opts->{limit} || 0) + 0) || 1000;
+ my $lim = (($opts->{limit} || 0) + 0) || DEFAULT_LIMIT;
$sql .= "LIMIT $lim";
my $msgs = $dbh->selectall_arrayref($sql, { Slice => {} }, @args);
load_from_row($_) for @$msgs;
@@ -97,21 +98,47 @@ sub get_thread {
SELECT tid,sid FROM over WHERE num = ? LIMIT 1
defined $tid or return nothing; # $sid may be undef
+
+ my $cond_all = '(tid = ? OR sid = ?) AND num > ?';
my $sort_col = 'ds';
$num = 0;
- if ($prev) {
+ if ($prev) { # mboxrd stream, only
$num = $prev->{num} || 0;
$sort_col = 'num';
}
- my $cond = '(tid = ? OR sid = ?) AND num > ?';
- my $msgs = do_get($self, <<"", {}, $tid, $sid, $num);
-SELECT num,ts,ds,ddd FROM over WHERE $cond ORDER BY $sort_col ASC
- return $msgs unless wantarray;
+ my $cols = 'num,ts,ds,ddd';
+ unless (wantarray) {
+ return do_get($self, <<"", {}, $tid, $sid, $num);
+SELECT $cols FROM over WHERE $cond_all
+ORDER BY $sort_col ASC
- my $nr = $dbh->selectrow_array(<<"", undef, $tid, $sid, $num);
-SELECT COUNT(num) FROM over WHERE $cond
+ }
+ # HTML view always wants an array and never uses $prev,
+ # but the mbox stream never wants an array and always has $prev
+ die '$prev not supported with wantarray' if $prev;
+ my $nr = $dbh->selectrow_array(<<"", undef, $tid, $sid, $num);
+SELECT COUNT(num) FROM over WHERE $cond_all
+
+ # giant thread, prioritize strict (tid) matches and throw
+ # in the loose (sid) matches at the end
+ my $msgs = do_get($self, <<"", {}, $tid, $num);
+SELECT $cols FROM over WHERE tid = ? AND num > ?
+ORDER BY $sort_col ASC
+
+ # do we have room for loose matches? get the most recent ones, first:
+ my $lim = DEFAULT_LIMIT - scalar(@$msgs);
+ if ($lim > 0) {
+ my $opts = { limit => $lim };
+ my $loose = do_get($self, <<"", $opts, $tid, $sid, $num);
+SELECT $cols FROM over WHERE tid != ? AND sid = ? AND num > ?
+ORDER BY $sort_col DESC
+
+ # TODO separate strict and loose matches here once --reindex
+ # is fixed to preserve `tid' properly
+ push @$msgs, @$loose;
+ }
($nr, $msgs);
}
diff --git a/lib/PublicInbox/View.pm b/lib/PublicInbox/View.pm
index 58851ed..eb002ae 100644
--- a/lib/PublicInbox/View.pm
+++ b/lib/PublicInbox/View.pm
@@ -365,7 +365,7 @@ sub walk_thread {
while (@q) {
my ($level, $node, $i) = splice(@q, 0, 3);
defined $node or next;
- $cb->($ctx, $level, $node, $i);
+ $cb->($ctx, $level, $node, $i) or return;
++$level;
$i = 0;
unshift @q, map { ($level, $_, $i++) } @{$node->{children}};
@@ -818,10 +818,56 @@ sub indent_for {
$level ? INDENT x ($level - 1) : '';
}
+sub find_mid_root {
+ my ($ctx, $level, $node, $idx) = @_;
+ ++$ctx->{root_idx} if $level == 0;
+ if ($node->{id} eq $ctx->{mid}) {
+ $ctx->{found_mid_at} = $ctx->{root_idx};
+ return 0;
+ }
+ 1;
+}
+
+sub strict_loose_note ($) {
+ my ($nr) = @_;
+ my $msg =
+" -- strict thread matches above, loose matches on Subject: below --\n";
+
+ if ($nr > PublicInbox::Over::DEFAULT_LIMIT()) {
+ $msg .=
+" -- use mbox.gz link to download all $nr messages --\n";
+ }
+ $msg;
+}
+
sub thread_results {
my ($ctx, $msgs) = @_;
require PublicInbox::SearchThread;
- PublicInbox::SearchThread::thread($msgs, *sort_ds, $ctx->{-inbox});
+ my $ibx = $ctx->{-inbox};
+ my $rootset = PublicInbox::SearchThread::thread($msgs, *sort_ds, $ibx);
+
+ # FIXME: `tid' is broken on --reindex, so that needs to be fixed
+ # and preserved in the future. This bug is hidden by `sid' matches
+ # in get_thread, so we never noticed it until now. And even when
+ # reindexing is fixed, we'll keep this code until a SCHEMA_VERSION
+ # bump since reindexing is expensive and users may not do it
+
+ # loose threading could've returned too many results,
+ # put the root the message we care about at the top:
+ my $mid = $ctx->{mid};
+ if (defined($mid) && scalar(@$rootset) > 1) {
+ $ctx->{root_idx} = -1;
+ my $nr = scalar @$msgs;
+ walk_thread($rootset, $ctx, *find_mid_root);
+ my $idx = $ctx->{found_mid_at};
+ if (defined($idx) && $idx != 0) {
+ my $tip = splice(@$rootset, $idx, 1);
+ @$rootset = reverse @$rootset;
+ unshift @$rootset, $tip;
+ $ctx->{sl_note} = strict_loose_note($nr);
+ }
+ }
+ $rootset
}
sub missing_thread {
@@ -864,6 +910,10 @@ sub skel_dump {
my $cur = $ctx->{cur};
my $mid = $smsg->{mid};
+ if ($level == 0 && $ctx->{skel_dump_roots}++) {
+ $$dst .= delete $ctx->{sl_note} || '';
+ }
+
my $f = ascii_html($smsg->from_name);
my $obfs_ibx = $ctx->{-obfs_ibx};
obfuscate_addrs($obfs_ibx, $f) if $obfs_ibx;
@@ -882,7 +932,7 @@ sub skel_dump {
delete $ctx->{cur};
$$dst .= "<b>$d<a\nid=r\nhref=\"#t\">".
"$attr [this message]</a></b>\n";
- return;
+ return 1;
} else {
$ctx->{prev_msg} = $mid;
}
@@ -922,6 +972,7 @@ sub skel_dump {
$m = $ctx->{-upfx}.mid_escape($mid).'/';
}
$$dst .= $d . "<a\nhref=\"$m\"$id>" . $end;
+ 1;
}
sub _skel_ghost {
@@ -947,6 +998,7 @@ sub _skel_ghost {
}
my $dst = $ctx->{dst};
$$dst .= $d;
+ 1;
}
sub sort_ds {
@@ -973,7 +1025,7 @@ sub acc_topic {
$topic = [ $ds, 1, { $subj => $mid }, $subj ];
$ctx->{-cur_topic} = $topic;
push @{$ctx->{order}}, $topic;
- return;
+ return 1;
}
$topic = $ctx->{-cur_topic}; # should never be undef
@@ -987,11 +1039,12 @@ sub acc_topic {
}
$seen->{$subj} = $mid; # latest for subject
} else { # ghost message
- return if $level != 0; # ignore child ghosts
+ return 1 if $level != 0; # ignore child ghosts
$topic = [ -666, 0, {} ];
$ctx->{-cur_topic} = $topic;
push @{$ctx->{order}}, $topic;
}
+ 1;
}
sub dump_topics {
--
EW
next prev parent reply other threads:[~2018-08-05 6:04 UTC|newest]
Thread overview: 8+ messages / expand[flat|nested] mbox.gz Atom feed top
2018-08-03 18:26 Threading/searching problem Konstantin Ryabitsev
2018-08-03 19:20 ` Eric Wong
2018-08-03 19:38 ` Konstantin Ryabitsev
2018-08-05 6:04 ` Eric Wong [this message]
2018-08-05 8:19 ` [RFC] overidx: preserve `tid' column on re-indexing Eric Wong
2018-08-05 21:41 ` Eric Wong
2018-08-06 20:05 ` [PATCH] view: distinguish strict and loose thread matches Konstantin Ryabitsev
2018-08-06 20:10 ` Konstantin Ryabitsev
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
List information: https://public-inbox.org/README
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20180805060440.fhl7zvyis246e3ym@dcvr \
--to=e@80x24.org \
--cc=konstantin@linuxfoundation.org \
--cc=meta@public-inbox.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).