unofficial mirror of meta@public-inbox.org
 help / color / mirror / Atom feed
From: "Eric Wong (Contractor, The Linux Foundation)" <e@80x24.org>
To: meta@public-inbox.org
Subject: [PATCH 14/34] searchidx: support indexing multiple MIDs
Date: Tue,  6 Mar 2018 08:42:22 +0000	[thread overview]
Message-ID: <20180306084242.19988-15-e@80x24.org> (raw)
In-Reply-To: <20180306084242.19988-1-e@80x24.org>

A single Xapian document can carry multiple terms, so use this
feature to ensure that a message with multiple Message-IDs (MIDs)
can be found by any one of them.
---
 lib/PublicInbox/Search.pm            |   1 -
 lib/PublicInbox/SearchIdx.pm         | 121 ++++++++++++++++++++++-------------
 lib/PublicInbox/SearchIdxSkeleton.pm |  26 ++------
 lib/PublicInbox/SearchMsg.pm         |   7 ++
 t/v2writable.t                       |  15 ++++-
 5 files changed, 105 insertions(+), 65 deletions(-)

diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm
index 74f406a..fb7a126 100644
--- a/lib/PublicInbox/Search.pm
+++ b/lib/PublicInbox/Search.pm
@@ -373,7 +373,6 @@ sub lookup_mail { # no ghosts!
 
 sub each_smsg_by_mid {
 	my ($self, $mid, $cb) = @_;
-	$mid = mid_clean($mid);
 	my $xdb = $self->{xdb};
 	# XXX retry_reopen isn't necessary for V2Writable, but the PSGI
 	# interface will need it...
diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm
index 61dc057..1c10728 100644
--- a/lib/PublicInbox/SearchIdx.pm
+++ b/lib/PublicInbox/SearchIdx.pm
@@ -281,29 +281,19 @@ sub index_body ($$$) {
 
 sub add_message {
 	my ($self, $mime, $bytes, $num, $blob) = @_; # mime = Email::MIME object
-	my $db = $self->{xdb};
-
-	my ($doc_id, $old_tid);
-	my @mids = mid_mime($mime);
-	if (@mids > 1) {
-		warn "Multi-MID: ( ",join(' | ', @mids)," )\n";
-	}
-	my $mid = mid_clean($mids[0]);
+	my $doc_id;
+	my $mids = mids($mime->header_obj);
 	my $skel = $self->{skeleton};
 
 	eval {
-		die 'Message-ID too long' if length($mid) > MAX_MID_SIZE;
-		my $smsg = $self->lookup_message($mid);
-		if ($smsg) {
-			# convert a ghost to a regular message
-			# it will also clobber any existing regular message
-			$doc_id = $smsg->{doc_id};
-			$old_tid = $smsg->thread_id unless $skel;
-		}
-		$smsg = PublicInbox::SearchMsg->new($mime);
+		my $smsg = PublicInbox::SearchMsg->new($mime);
 		my $doc = $smsg->{doc};
-		$doc->add_term('Q' . $mid);
-
+		foreach my $mid (@$mids) {
+			# FIXME: may be abused to prevent archival
+			length($mid) > MAX_MID_SIZE and
+				die 'Message-ID too long';
+			$doc->add_term('Q' . $mid);
+		}
 		my $subj = $smsg->subject;
 		my $xpath;
 		if ($subj ne '') {
@@ -366,31 +356,30 @@ sub add_message {
 		# populates smsg->references for smsg->to_doc_data
 		my $refs = parse_references($smsg);
 		my $data = $smsg->to_doc_data($blob);
-		if ($skel) {
-			push @values, $mid, $xpath, $data;
-			$skel->index_skeleton(\@values);
-		} else {
-			link_message($self, $smsg, $refs, $old_tid);
+		foreach my $mid (@$mids) {
+			$tg->index_text($mid, 1, 'XM');
 		}
-		$tg->index_text($mid, 1, 'XM');
 		$doc->set_data($data);
-
 		if (my $altid = $self->{-altid}) {
 			foreach my $alt (@$altid) {
-				my $id = $alt->mid2alt($mid);
-				next unless defined $id;
-				$doc->add_term($alt->{xprefix} . $id);
+				foreach my $mid (@$mids) {
+					my $id = $alt->mid2alt($mid);
+					next unless defined $id;
+					$doc->add_term($alt->{xprefix} . $id);
+				}
 			}
 		}
-		if (defined $doc_id) {
-			$db->replace_document($doc_id, $doc);
+		if ($skel) {
+			push @values, $mids, $xpath, $data;
+			$skel->index_skeleton(\@values);
+			$doc_id = $self->{xdb}->add_document($doc);
 		} else {
-			$doc_id = $db->add_document($doc);
+			$doc_id = link_and_save($self, $doc, $mids, $refs);
 		}
 	};
 
 	if ($@) {
-		warn "failed to index message <$mid>: $@\n";
+		warn "failed to index message <".join('> <',@$mids).">: $@\n";
 		return undef;
 	}
 	$doc_id;
@@ -467,27 +456,62 @@ sub parse_references ($) {
 	\@keep;
 }
 
-sub link_message {
-	my ($self, $smsg, $refs, $old_tid) = @_;
+sub link_doc {
+	my ($self, $doc, $refs, $old_tid) = @_;
 	my $tid;
 
 	if (@$refs) {
-
 		# first ref *should* be the thread root,
 		# but we can never trust clients to do the right thing
 		my $ref = shift @$refs;
-		$tid = $self->_resolve_mid_to_tid($ref);
-		$self->merge_threads($tid, $old_tid) if defined $old_tid;
+		$tid = resolve_mid_to_tid($self, $ref);
+		merge_threads($self, $tid, $old_tid) if defined $old_tid;
 
 		# the rest of the refs should point to this tid:
 		foreach $ref (@$refs) {
-			my $ptid = $self->_resolve_mid_to_tid($ref);
+			my $ptid = resolve_mid_to_tid($self, $ref);
 			merge_threads($self, $tid, $ptid);
 		}
 	} else {
 		$tid = defined $old_tid ? $old_tid : $self->next_thread_id;
 	}
-	$smsg->{doc}->add_term('G' . $tid);
+	$doc->add_term('G' . $tid);
+	$tid;
+}
+
+sub link_and_save {
+	my ($self, $doc, $mids, $refs) = @_;
+	my $db = $self->{xdb};
+	my $old_tid;
+	my $doc_id;
+	my $vivified = 0;
+	foreach my $mid (@$mids) {
+		$self->each_smsg_by_mid($mid, sub {
+			my ($cur) = @_;
+			my $type = $cur->type;
+			my $cur_tid = $cur->thread_id;
+			$old_tid = $cur_tid unless defined $old_tid;
+			if ($type eq 'mail') {
+				# do not break existing mail messages,
+				# just merge the threads
+				merge_threads($self, $old_tid, $cur_tid);
+				return 1;
+			}
+			if ($type ne 'ghost') {
+				die "<$mid> has a bad type: $type\n";
+			}
+			my $tid = link_doc($self, $doc, $refs, $old_tid);
+			$old_tid = $tid unless defined $old_tid;
+			$doc_id = $cur->{doc_id};
+			$self->{xdb}->replace_document($doc_id, $doc);
+			++$vivified;
+			1;
+		});
+	}
+	# not really important, but we return any vivified ghost docid, here:
+	return $doc_id if defined $doc_id;
+	link_doc($self, $doc, $refs, $old_tid);
+	$self->{xdb}->add_document($doc);
 }
 
 sub index_git_blob_id {
@@ -709,11 +733,22 @@ sub _index_sync {
 }
 
 # this will create a ghost as necessary
-sub _resolve_mid_to_tid {
+sub resolve_mid_to_tid {
 	my ($self, $mid) = @_;
+	my $tid;
+	$self->each_smsg_by_mid($mid, sub {
+		my ($smsg) = @_;
+		my $cur_tid = $smsg->thread_id;
+		if (defined $tid) {
+			merge_threads($self, $tid, $cur_tid);
+		} else {
+			$tid = $smsg->thread_id;
+		}
+		1;
+	});
+	return $tid if defined $tid;
 
-	my $smsg = $self->lookup_message($mid) || $self->create_ghost($mid);
-	$smsg->thread_id;
+	$self->create_ghost($mid)->thread_id;
 }
 
 sub create_ghost {
diff --git a/lib/PublicInbox/SearchIdxSkeleton.pm b/lib/PublicInbox/SearchIdxSkeleton.pm
index 333f965..063c83e 100644
--- a/lib/PublicInbox/SearchIdxSkeleton.pm
+++ b/lib/PublicInbox/SearchIdxSkeleton.pm
@@ -92,34 +92,20 @@ sub index_skeleton_real ($$) {
 	my ($self, $values) = @_;
 	my $doc_data = pop @$values;
 	my $xpath = pop @$values;
-	my $mid = pop @$values;
+	my $mids = pop @$values;
 	my $ts = $values->[PublicInbox::Search::TS];
-	my $smsg = $self->lookup_message($mid);
-	my ($old_tid, $doc_id);
-	if ($smsg) {
-		# convert a ghost to a regular message
-		# it will also clobber any existing regular message
-		$doc_id = $smsg->{doc_id};
-		$old_tid = $smsg->thread_id;
-	} else {
-		$smsg = PublicInbox::SearchMsg->new(undef);
-		$smsg->{mid} = $mid;
-	}
+	my $smsg = PublicInbox::SearchMsg->new(undef);
 	my $doc = $smsg->{doc};
 	$doc->add_term('XPATH' . $xpath) if defined $xpath;
-	$doc->add_term('Q' . $mid);
+	foreach my $mid (@$mids) {
+		$doc->add_term('Q' . $mid);
+	}
 	PublicInbox::SearchIdx::add_values($doc, $values);
 	$doc->set_data($doc_data);
 	$smsg->{ts} = $ts;
 	$smsg->load_from_data($doc_data);
 	my @refs = ($smsg->references =~ /<([^>]+)>/g);
-	$self->link_message($smsg, \@refs, $old_tid);
-	my $db = $self->{xdb};
-	if (defined $doc_id) {
-		$db->replace_document($doc_id, $doc);
-	} else {
-		$doc_id = $db->add_document($doc);
-	}
+	$self->link_and_save($doc, $mids, \@refs);
 }
 
 1;
diff --git a/lib/PublicInbox/SearchMsg.pm b/lib/PublicInbox/SearchMsg.pm
index 014f490..a556534 100644
--- a/lib/PublicInbox/SearchMsg.pm
+++ b/lib/PublicInbox/SearchMsg.pm
@@ -176,4 +176,11 @@ sub path {
 	$self->{path} = _get_term_val($self, 'XPATH', qr/\AXPATH/); # path
 }
 
+sub type {
+	my ($self) = @_;
+	my $type = $self->{type};
+	return $type if defined $type;
+	$self->{type} = _get_term_val($self, 'T', qr/\AT/);
+}
+
 1;
diff --git a/t/v2writable.t b/t/v2writable.t
index bc2437a..44191c1 100644
--- a/t/v2writable.t
+++ b/t/v2writable.t
@@ -80,5 +80,18 @@ ok($im->add($mime), 'ordinary message added');
 	is(scalar(@mids), 1, 'new generated');
 }
 
-$im->done;
+{
+	$mime->header_set('Message-Id', '<abcde@1>', '<abcde@2>');
+	ok($im->add($mime), 'message with multiple Message-ID');
+	$im->done;
+	my @found;
+	$ibx->search->each_smsg_by_mid('abcde@1', sub { push @found, @_; 1 });
+	is(scalar(@found), 1, 'message found by first MID');
+	$ibx->search->each_smsg_by_mid('abcde@2', sub { push @found, @_; 1 });
+	is(scalar(@found), 2, 'message found by second MID');
+	is($found[0]->{doc_id}, $found[1]->{doc_id}, 'same document');
+	ok($found[1]->{doc_id} > 0, 'doc_id is positive');
+}
+
+
 done_testing();
-- 
EW


  parent reply	other threads:[~2018-03-06  8:42 UTC|newest]

Thread overview: 36+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2018-03-06  8:42 [v2 PATCH 00/34] duplicate handling, smaller Xapian DBs, date fixes Eric Wong (Contractor, The Linux Foundation)
2018-03-06  8:42 ` [PATCH 01/34] v2writable: delete ::Import obj when ->done Eric Wong (Contractor, The Linux Foundation)
2018-03-06  8:42 ` [PATCH 02/34] search: remove informational "warning" message Eric Wong (Contractor, The Linux Foundation)
2018-03-06  8:42 ` [PATCH 03/34] searchidx: add PID to error message when die-ing Eric Wong (Contractor, The Linux Foundation)
2018-03-06  8:42 ` [PATCH 04/34] content_id: special treatment for Message-Id headers Eric Wong (Contractor, The Linux Foundation)
2018-03-06  8:42 ` [PATCH 05/34] evcleanup: disable outside of daemon Eric Wong (Contractor, The Linux Foundation)
2018-03-06  8:42 ` [PATCH 06/34] v2writable: deduplicate detection on add Eric Wong (Contractor, The Linux Foundation)
2018-03-06  8:42 ` [PATCH 07/34] evcleanup: do not create event loop if nothing was registered Eric Wong (Contractor, The Linux Foundation)
2018-03-06  8:42 ` [PATCH 08/34] mid: add `mids' and `references' methods for extraction Eric Wong (Contractor, The Linux Foundation)
2018-03-06  8:42 ` [PATCH 09/34] content_id: use `mids' and `references' for MID extraction Eric Wong (Contractor, The Linux Foundation)
2018-03-06  8:42 ` [PATCH 10/34] searchidx: use new `references' method for parsing References Eric Wong (Contractor, The Linux Foundation)
2018-03-06  8:42 ` [PATCH 11/34] content_id: no need to be human-friendly Eric Wong (Contractor, The Linux Foundation)
2018-03-06  8:42 ` [PATCH 12/34] v2writable: inject new Message-IDs on true duplicates Eric Wong (Contractor, The Linux Foundation)
2018-03-06  8:42 ` [PATCH 13/34] search: revert to using 'Q' as a uniQue id per-Xapian conventions Eric Wong (Contractor, The Linux Foundation)
2018-03-06  8:42 ` Eric Wong (Contractor, The Linux Foundation) [this message]
2018-03-06  8:42 ` [PATCH 15/34] mid: be strict with References, but loose on Message-Id Eric Wong (Contractor, The Linux Foundation)
2018-03-06  8:42 ` [PATCH 16/34] searchidx: avoid excessive XNQ indexing with diffs Eric Wong (Contractor, The Linux Foundation)
2018-03-06  8:42 ` [PATCH 17/34] searchidxskeleton: add a note about locking Eric Wong (Contractor, The Linux Foundation)
2018-03-06  8:42 ` [PATCH 18/34] v2writable: generated Message-ID goes first Eric Wong (Contractor, The Linux Foundation)
2018-03-06  8:42 ` [PATCH 19/34] searchidx: use add_boolean_term for internal terms Eric Wong (Contractor, The Linux Foundation)
2018-03-06  8:42 ` [PATCH 20/34] searchidx: add NNTP article number as a searchable term Eric Wong (Contractor, The Linux Foundation)
2018-03-06  8:42 ` [PATCH 21/34] mid: truncate excessively long MIDs early Eric Wong (Contractor, The Linux Foundation)
2018-03-06  8:42 ` [PATCH 22/34] nntp: use NNTP article numbers for lookups Eric Wong (Contractor, The Linux Foundation)
2018-03-06  8:42 ` [PATCH 23/34] nntp: fix NEWNEWS command Eric Wong (Contractor, The Linux Foundation)
2018-03-06  8:42 ` [PATCH 24/34] searchidx: store the primary MID in doc data for NNTP Eric Wong (Contractor, The Linux Foundation)
2018-03-06  8:42 ` [PATCH 25/34] import: consolidate object info for v2 imports Eric Wong (Contractor, The Linux Foundation)
2018-03-06  8:42 ` [PATCH 26/34] v2: avoid redundant/repeated configs for git partition repos Eric Wong (Contractor, The Linux Foundation)
2018-03-06  8:42 ` [PATCH 27/34] INSTALL: document more optional dependencies Eric Wong (Contractor, The Linux Foundation)
2018-03-06  8:42 ` [PATCH 28/34] search: favor skeleton DB for lookup_mail Eric Wong (Contractor, The Linux Foundation)
2018-03-06  8:42 ` [PATCH 29/34] search: each_smsg_by_mid uses skeleton if available Eric Wong (Contractor, The Linux Foundation)
2018-03-06  8:42 ` [PATCH 30/34] v2writable: remove unnecessary skeleton commit Eric Wong (Contractor, The Linux Foundation)
2018-03-06  8:42 ` [PATCH 31/34] favor Received: date over Date: header globally Eric Wong (Contractor, The Linux Foundation)
2018-03-06  8:42 ` [PATCH 32/34] import: fall back to Sender for extracting name and email Eric Wong (Contractor, The Linux Foundation)
2018-03-06  8:42 ` [PATCH 33/34] scripts/import_vger_from_mbox: perform mboxrd or mboxo escaping Eric Wong (Contractor, The Linux Foundation)
2018-03-06  8:42 ` [PATCH 34/34] v2writable: detect and use previous partition count Eric Wong (Contractor, The Linux Foundation)
2018-03-06  8:53 ` [v2 PATCH 00/34] duplicate handling, smaller Xapian DBs, date fixes Eric Wong

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

  List information: https://public-inbox.org/README

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20180306084242.19988-15-e@80x24.org \
    --to=e@80x24.org \
    --cc=meta@public-inbox.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link.

Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox,
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).