unofficial mirror of meta@public-inbox.org
 help / color / mirror / Atom feed
* [PATCH] searchidx: preserve thread_id for ghost root vivification
@ 2017-05-07 10:54 Eric Wong
  2017-05-07 11:03 ` [PATCH v2] searchidx: fix " Eric Wong
  0 siblings, 1 reply; 2+ messages in thread
From: Eric Wong @ 2017-05-07 10:54 UTC (permalink / raw)
  To: meta

Due to the asynchronous nature of SMTP, it is possible for the
root message of a thread (with no References/In-Reply-To)
to arrive last in a series.  We must preserve the thread_id
of the ghost message in this case, as we do for vivifying
non-root ghosts.
---
 MANIFEST                     |  1 +
 lib/PublicInbox/SearchIdx.pm |  2 +-
 t/search-thr-index.t         | 58 ++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 60 insertions(+), 1 deletion(-)
 create mode 100644 t/search-thr-index.t

diff --git a/MANIFEST b/MANIFEST
index f16843a..d1e0952 100644
--- a/MANIFEST
+++ b/MANIFEST
@@ -157,6 +157,7 @@ t/psgi_attach.t
 t/psgi_mount.t
 t/psgi_text.t
 t/qspawn.t
+t/search-thr-index.t
 t/search.t
 t/spamcheck_spamc.t
 t/spawn.t
diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm
index 8a529c6..e4e3c81 100644
--- a/lib/PublicInbox/SearchIdx.pm
+++ b/lib/PublicInbox/SearchIdx.pm
@@ -330,7 +330,7 @@ sub link_message {
 			merge_threads($self, $tid, $ptid);
 		}
 	} else {
-		$tid = $self->next_thread_id;
+		$tid = defined $old_tid ? $old_tid : $self->next_thread_id;
 	}
 	$doc->add_term(xpfx('thread') . $tid);
 }
diff --git a/t/search-thr-index.t b/t/search-thr-index.t
new file mode 100644
index 0000000..6549554
--- /dev/null
+++ b/t/search-thr-index.t
@@ -0,0 +1,58 @@
+# Copyright (C) 2017 all contributors <meta@public-inbox.org>
+# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
+use strict;
+use warnings;
+use Test::More;
+use File::Temp qw/tempdir/;
+use Email::MIME;
+eval { require PublicInbox::SearchIdx; };
+plan skip_all => "Xapian missing for search" if $@;
+my $tmpdir = tempdir('pi-search-thr-index.XXXXXX', TMPDIR => 1, CLEANUP => 1);
+my $git_dir = "$tmpdir/a.git";
+
+is(0, system(qw(git init -q --bare), $git_dir), "git init (main)");
+my $rw = PublicInbox::SearchIdx->new($git_dir, 1);
+ok($rw, "search indexer created");
+my $data = <<'EOF';
+Subject: [RFC 00/14]
+Message-Id: <1-bw@g>
+
+Subject: [RFC 09/14]
+Message-Id: <10-bw@g>
+In-Reply-To: <1-bw@g>
+References: <1-bw@g>
+
+Subject: [RFC 03/14]
+Message-Id: <4-bw@g>
+In-Reply-To: <1-bw@g>
+References: <1-bw@g>
+
+EOF
+
+my $num = 0;
+# nb. using internal API, fragile!
+my $xdb = $rw->_xdb_acquire;
+$xdb->begin_transaction;
+my @mids;
+
+foreach (reverse split(/\n\n/, $data)) {
+	$_ .= "\n";
+	my $mime = Email::MIME->new(\$_);
+	$mime->header_set('From' => 'bw@g');
+	$mime->header_set('To' => 'git@vger.kernel.org');
+	my $bytes = bytes::length($mime->as_string);
+	my $doc_id = $rw->add_message($mime, $bytes, ++$num, 'ignored');
+	my $mid = $mime->header('Message-Id');
+	push @mids, $mid;
+	ok($doc_id, 'message added: '. $mid);
+}
+
+my $prev;
+foreach my $mid (@mids) {
+	my $res = $rw->get_thread($mid);
+	is(3, $res->{total}, "got all messages from $mid");
+}
+
+done_testing();
+
+1;
-- 
EW


^ permalink raw reply related	[flat|nested] 2+ messages in thread

* [PATCH v2] searchidx: fix ghost root vivification
  2017-05-07 10:54 [PATCH] searchidx: preserve thread_id for ghost root vivification Eric Wong
@ 2017-05-07 11:03 ` Eric Wong
  0 siblings, 0 replies; 2+ messages in thread
From: Eric Wong @ 2017-05-07 11:03 UTC (permalink / raw)
  To: meta

Due to the asynchronous nature of SMTP, it is possible for the
root message of a thread (with no References/In-Reply-To)
to arrive last in a series.  We must preserve the thread_id
of the ghost message in this case, as we do when vivifying
non-root ghosts.

Otherwise, this causes threads to be broken when the root
arrives last.
---
 MANIFEST                     |  1 +
 lib/PublicInbox/Search.pm    |  3 ++-
 lib/PublicInbox/SearchIdx.pm |  2 +-
 t/search-thr-index.t         | 58 ++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 62 insertions(+), 2 deletions(-)
 create mode 100644 t/search-thr-index.t

diff --git a/MANIFEST b/MANIFEST
index f16843a..d1e0952 100644
--- a/MANIFEST
+++ b/MANIFEST
@@ -157,6 +157,7 @@ t/psgi_attach.t
 t/psgi_mount.t
 t/psgi_text.t
 t/qspawn.t
+t/search-thr-index.t
 t/search.t
 t/spamcheck_spamc.t
 t/spawn.t
diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm
index bc2b698..82a6e54 100644
--- a/lib/PublicInbox/Search.pm
+++ b/lib/PublicInbox/Search.pm
@@ -41,7 +41,8 @@ use constant {
 	# 12 - change YYYYMMDD value column to numeric
 	# 13 - fix threading for empty References/In-Reply-To
 	#      (commit 83425ef12e4b65cdcecd11ddcb38175d4a91d5a0)
-	SCHEMA_VERSION => 13,
+	# 14 - fix ghost root vivification
+	SCHEMA_VERSION => 14,
 
 	# n.b. FLAG_PURE_NOT is expensive not suitable for a public website
 	# as it could become a denial-of-service vector
diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm
index 8a529c6..e4e3c81 100644
--- a/lib/PublicInbox/SearchIdx.pm
+++ b/lib/PublicInbox/SearchIdx.pm
@@ -330,7 +330,7 @@ sub link_message {
 			merge_threads($self, $tid, $ptid);
 		}
 	} else {
-		$tid = $self->next_thread_id;
+		$tid = defined $old_tid ? $old_tid : $self->next_thread_id;
 	}
 	$doc->add_term(xpfx('thread') . $tid);
 }
diff --git a/t/search-thr-index.t b/t/search-thr-index.t
new file mode 100644
index 0000000..6549554
--- /dev/null
+++ b/t/search-thr-index.t
@@ -0,0 +1,58 @@
+# Copyright (C) 2017 all contributors <meta@public-inbox.org>
+# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
+use strict;
+use warnings;
+use Test::More;
+use File::Temp qw/tempdir/;
+use Email::MIME;
+eval { require PublicInbox::SearchIdx; };
+plan skip_all => "Xapian missing for search" if $@;
+my $tmpdir = tempdir('pi-search-thr-index.XXXXXX', TMPDIR => 1, CLEANUP => 1);
+my $git_dir = "$tmpdir/a.git";
+
+is(0, system(qw(git init -q --bare), $git_dir), "git init (main)");
+my $rw = PublicInbox::SearchIdx->new($git_dir, 1);
+ok($rw, "search indexer created");
+my $data = <<'EOF';
+Subject: [RFC 00/14]
+Message-Id: <1-bw@g>
+
+Subject: [RFC 09/14]
+Message-Id: <10-bw@g>
+In-Reply-To: <1-bw@g>
+References: <1-bw@g>
+
+Subject: [RFC 03/14]
+Message-Id: <4-bw@g>
+In-Reply-To: <1-bw@g>
+References: <1-bw@g>
+
+EOF
+
+my $num = 0;
+# nb. using internal API, fragile!
+my $xdb = $rw->_xdb_acquire;
+$xdb->begin_transaction;
+my @mids;
+
+foreach (reverse split(/\n\n/, $data)) {
+	$_ .= "\n";
+	my $mime = Email::MIME->new(\$_);
+	$mime->header_set('From' => 'bw@g');
+	$mime->header_set('To' => 'git@vger.kernel.org');
+	my $bytes = bytes::length($mime->as_string);
+	my $doc_id = $rw->add_message($mime, $bytes, ++$num, 'ignored');
+	my $mid = $mime->header('Message-Id');
+	push @mids, $mid;
+	ok($doc_id, 'message added: '. $mid);
+}
+
+my $prev;
+foreach my $mid (@mids) {
+	my $res = $rw->get_thread($mid);
+	is(3, $res->{total}, "got all messages from $mid");
+}
+
+done_testing();
+
+1;
-- 
EW


^ permalink raw reply related	[flat|nested] 2+ messages in thread

end of thread, other threads:[~2017-05-07 11:03 UTC | newest]

Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-05-07 10:54 [PATCH] searchidx: preserve thread_id for ghost root vivification Eric Wong
2017-05-07 11:03 ` [PATCH v2] searchidx: fix " Eric Wong

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).