unofficial mirror of meta@public-inbox.org
 help / color / mirror / Atom feed
From: Eric Wong <e@80x24.org>
To: meta@public-inbox.org
Subject: [RFC 6/7] index: allow search/lookups on X-Alt-Message-ID
Date: Thu, 24 Oct 2019 00:12:40 +0000	[thread overview]
Message-ID: <20191024001241.14224-7-e@80x24.org> (raw)
In-Reply-To: <20191024001241.14224-1-e@80x24.org>

Since we replace extra Message-ID headers with X-Alt-Message-ID
to placate NNTP clients, we should allow searching and indexing
on X-Alt-Message-ID just like we do with Message-ID.
---
 lib/PublicInbox/MID.pm       | 27 +++++++++++++++++++++------
 lib/PublicInbox/OverIdx.pm   |  4 ++--
 lib/PublicInbox/SearchIdx.pm |  4 ++--
 t/mid.t                      |  7 ++++++-
 t/v2writable.t               | 16 ++++++++++++++++
 5 files changed, 47 insertions(+), 11 deletions(-)

diff --git a/lib/PublicInbox/MID.pm b/lib/PublicInbox/MID.pm
index 14089f91..d7a42c38 100644
--- a/lib/PublicInbox/MID.pm
+++ b/lib/PublicInbox/MID.pm
@@ -7,7 +7,7 @@ use strict;
 use warnings;
 use base qw/Exporter/;
 our @EXPORT_OK = qw/mid_clean id_compress mid2path mid_mime mid_escape MID_ESC
-	mids references/;
+	mids references mids_for_index/;
 use URI::Escape qw(uri_escape_utf8);
 use Digest::SHA qw/sha1_hex/;
 require PublicInbox::Address;
@@ -54,11 +54,10 @@ sub mid2path {
 # Only for v1 code paths:
 sub mid_mime ($) { mids($_[0]->header_obj)->[0] }
 
-sub mids ($) {
-	my ($hdr) = @_;
+# only intended for Message-ID and X-Alt-Message-ID
+sub extract_mids {
 	my @mids;
-	my @v = $hdr->header_raw('Message-Id');
-	foreach my $v (@v) {
+	for my $v (@_) {
 		my @cur = ($v =~ /<([^>]+)>/sg);
 		if (@cur) {
 			push(@mids, @cur);
@@ -66,7 +65,23 @@ sub mids ($) {
 			push(@mids, $v);
 		}
 	}
-	uniq_mids(\@mids);
+	\@mids;
+}
+
+sub mids ($) {
+	my ($hdr) = @_;
+	my @mids = $hdr->header_raw('Message-Id');
+	uniq_mids(extract_mids(@mids));
+}
+
+# we allow searching on X-Alt-Message-ID since PublicInbox::NNTP uses them
+# to placate some clients, and we want to ensure NNTP-only clients can
+# import and index without relying on HTTP endpoints
+sub mids_for_index ($) {
+	my ($hdr) = @_;
+	my @mids = $hdr->header_raw('Message-Id');
+	my @alts = $hdr->header_raw('X-Alt-Message-ID');
+	uniq_mids(extract_mids(@mids, @alts));
 }
 
 # last References should be IRT, but some mail clients do things
diff --git a/lib/PublicInbox/OverIdx.pm b/lib/PublicInbox/OverIdx.pm
index 01ca6f11..189bd21d 100644
--- a/lib/PublicInbox/OverIdx.pm
+++ b/lib/PublicInbox/OverIdx.pm
@@ -13,7 +13,7 @@ use warnings;
 use base qw(PublicInbox::Over);
 use IO::Handle;
 use DBI qw(:sql_types); # SQL_BLOB
-use PublicInbox::MID qw/id_compress mids references/;
+use PublicInbox::MID qw/id_compress mids_for_index references/;
 use PublicInbox::SearchMsg qw(subject_normalized);
 use Compress::Zlib qw(compress);
 use PublicInbox::Search;
@@ -256,7 +256,7 @@ sub add_overview {
 		lines => $lines,
 		blob => $oid,
 	}, 'PublicInbox::SearchMsg';
-	my $mids = mids($mime->header_obj);
+	my $mids = mids_for_index($mime->header_obj);
 	my $refs = parse_references($smsg, $mid0, $mids);
 	my $subj = $smsg->subject;
 	my $xpath;
diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm
index aed3875a..b2d71a1f 100644
--- a/lib/PublicInbox/SearchIdx.pm
+++ b/lib/PublicInbox/SearchIdx.pm
@@ -12,7 +12,7 @@ use warnings;
 use base qw(PublicInbox::Search PublicInbox::Lock);
 use PublicInbox::MIME;
 use PublicInbox::InboxWritable;
-use PublicInbox::MID qw/mid_clean id_compress mid_mime mids/;
+use PublicInbox::MID qw/mid_clean id_compress mid_mime mids_for_index/;
 use PublicInbox::MsgIter;
 use Carp qw(croak);
 use POSIX qw(strftime);
@@ -344,7 +344,7 @@ sub add_xapian ($$$$$) {
 sub add_message {
 	# mime = Email::MIME object
 	my ($self, $mime, $bytes, $num, $oid, $mid0) = @_;
-	my $mids = mids($mime->header_obj);
+	my $mids = mids_for_index($mime->header_obj);
 	$mid0 = $mids->[0] unless defined $mid0; # v1 compatibility
 	unless (defined $num) { # v1
 		$self->_msgmap_init;
diff --git a/t/mid.t b/t/mid.t
index 9ad10a99..98b0c200 100644
--- a/t/mid.t
+++ b/t/mid.t
@@ -1,7 +1,7 @@
 # Copyright (C) 2016-2019 all contributors <meta@public-inbox.org>
 # License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
 use Test::More;
-use PublicInbox::MID qw(mid_escape mids references);
+use PublicInbox::MID qw(mid_escape mids references mids_for_index);
 
 is(mid_escape('foo!@(bar)'), 'foo!@(bar)');
 is(mid_escape('foo%!@(bar)'), 'foo%25!@(bar)');
@@ -10,6 +10,7 @@ is(mid_escape('foo%!@(bar)'), 'foo%25!@(bar)');
 {
 	use Email::MIME;
 	my $mime = Email::MIME->create;
+	$mime->header_set('X-Alt-Message-ID', '<alt-id-for-nntp>');
 	$mime->header_set('Message-Id', '<mid-1@a>');
 	is_deeply(['mid-1@a'], mids($mime->header_obj), 'mids in common case');
 	$mime->header_set('Message-Id', '<mid-1@a>', '<mid-2@b>');
@@ -40,6 +41,10 @@ is(mid_escape('foo%!@(bar)'), 'foo%25!@(bar)');
 	$mime->header_set('To', 'u@example.com');
 	$mime->header_set('References', '<hello> <world> <n> <u@example.com>');
 	is_deeply(references($mime->header_obj), [qw(hello world)]);
+
+	is_deeply([qw(helloworld alt-id-for-nntp)],
+		mids_for_index($mime->header_obj),
+		'X-Alt-Message-ID can be indexed');
 }
 
 done_testing();
diff --git a/t/v2writable.t b/t/v2writable.t
index c2daac2f..2b825768 100644
--- a/t/v2writable.t
+++ b/t/v2writable.t
@@ -115,6 +115,7 @@ if ('ensure git configs are correct') {
 
 {
 	$mime->header_set('Message-Id', '<abcde@1>', '<abcde@2>');
+	$mime->header_set('X-Alt-Message-Id', '<alt-id-for-nntp>');
 	$mime->header_set('References', '<zz-mid@b>');
 	ok($im->add($mime), 'message with multiple Message-ID');
 	$im->done;
@@ -127,6 +128,21 @@ if ('ensure git configs are correct') {
 	is($mset2->size, 1, 'message found by second MID');
 	is((($mset1->items)[0])->get_docid, (($mset2->items)[0])->get_docid,
 		'same document') if ($mset1->size);
+
+	my $alt = $srch->reopen->query('m:alt-id-for-nntp', { mset => 1 });
+	is($alt->size, 1, 'message found by alt MID (NNTP)');
+	is((($alt->items)[0])->get_docid, (($mset1->items)[0])->get_docid,
+		'same document') if ($mset1->size);
+	$mime->header_set('X-Alt-Message-Id');
+
+	my %uniq;
+	for my $mid (qw(abcde@1 abcde@2 alt-id-for-nntp)) {
+		my $msgs = $ibx->over->get_thread($mid);
+		my $key = join(' ', sort(map { $_->{num} } @$msgs));
+		$uniq{$key}++;
+	}
+	is(scalar(keys(%uniq)), 1, 'all alt Message-ID queries give same smsg');
+	is_deeply([values(%uniq)], [3], '3 queries, 3 results');
 }
 
 {

  parent reply	other threads:[~2019-10-24  0:12 UTC|newest]

Thread overview: 8+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2019-10-24  0:12 [PATCH 0/7] redundant header madness Eric Wong
2019-10-24  0:12 ` [PATCH 1/7] search: support multiple From/To/Cc/Subject headers Eric Wong
2019-10-24  0:12 ` [PATCH 2/7] view: display redundant headers in permalink Eric Wong
2019-10-24  0:12 ` [PATCH 3/7] view: move '<' and '>' outside <a> Eric Wong
2019-10-24  0:12 ` [PATCH 4/7] view: improve warning for multiple Message-IDs Eric Wong
2019-10-24  0:12 ` [PATCH 5/7] linkify: support adding "(raw)" link for Message-IDs Eric Wong
2019-10-24  0:12 ` Eric Wong [this message]
2019-10-24  0:12 ` [RFC 7/7] view: show X-Alt-Message-ID in permalink view, too Eric Wong

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

  List information: https://public-inbox.org/README

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20191024001241.14224-7-e@80x24.org \
    --to=e@80x24.org \
    --cc=meta@public-inbox.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link.

Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox,
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).