From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on dcvr.yhbt.net X-Spam-Level: X-Spam-ASN: X-Spam-Status: No, score=-4.0 required=3.0 tests=ALL_TRUSTED,BAYES_00 shortcircuit=no autolearn=ham autolearn_force=no version=3.4.0 Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id 3BFC11FAF0 for ; Tue, 6 Mar 2018 08:42:43 +0000 (UTC) From: "Eric Wong (Contractor, The Linux Foundation)" To: meta@public-inbox.org Subject: [PATCH 14/34] searchidx: support indexing multiple MIDs Date: Tue, 6 Mar 2018 08:42:22 +0000 Message-Id: <20180306084242.19988-15-e@80x24.org> In-Reply-To: <20180306084242.19988-1-e@80x24.org> References: <20180306084242.19988-1-e@80x24.org> List-Id: It's possible to have a message handle multiple terms; so use this feature to ensure messages with multiple MIDs can be found by either one. --- lib/PublicInbox/Search.pm | 1 - lib/PublicInbox/SearchIdx.pm | 121 ++++++++++++++++++++++------------- lib/PublicInbox/SearchIdxSkeleton.pm | 26 ++------ lib/PublicInbox/SearchMsg.pm | 7 ++ t/v2writable.t | 15 ++++- 5 files changed, 105 insertions(+), 65 deletions(-) diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm index 74f406a..fb7a126 100644 --- a/lib/PublicInbox/Search.pm +++ b/lib/PublicInbox/Search.pm @@ -373,7 +373,6 @@ sub lookup_mail { # no ghosts! sub each_smsg_by_mid { my ($self, $mid, $cb) = @_; - $mid = mid_clean($mid); my $xdb = $self->{xdb}; # XXX retry_reopen isn't necessary for V2Writable, but the PSGI # interface will need it... 
diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm index 61dc057..1c10728 100644 --- a/lib/PublicInbox/SearchIdx.pm +++ b/lib/PublicInbox/SearchIdx.pm @@ -281,29 +281,19 @@ sub index_body ($$$) { sub add_message { my ($self, $mime, $bytes, $num, $blob) = @_; # mime = Email::MIME object - my $db = $self->{xdb}; - - my ($doc_id, $old_tid); - my @mids = mid_mime($mime); - if (@mids > 1) { - warn "Multi-MID: ( ",join(' | ', @mids)," )\n"; - } - my $mid = mid_clean($mids[0]); + my $doc_id; + my $mids = mids($mime->header_obj); my $skel = $self->{skeleton}; eval { - die 'Message-ID too long' if length($mid) > MAX_MID_SIZE; - my $smsg = $self->lookup_message($mid); - if ($smsg) { - # convert a ghost to a regular message - # it will also clobber any existing regular message - $doc_id = $smsg->{doc_id}; - $old_tid = $smsg->thread_id unless $skel; - } - $smsg = PublicInbox::SearchMsg->new($mime); + my $smsg = PublicInbox::SearchMsg->new($mime); my $doc = $smsg->{doc}; - $doc->add_term('Q' . $mid); - + foreach my $mid (@$mids) { + # FIXME: may be abused to prevent archival + length($mid) > MAX_MID_SIZE and + die 'Message-ID too long'; + $doc->add_term('Q' . $mid); + } my $subj = $smsg->subject; my $xpath; if ($subj ne '') { @@ -366,31 +356,30 @@ sub add_message { # populates smsg->references for smsg->to_doc_data my $refs = parse_references($smsg); my $data = $smsg->to_doc_data($blob); - if ($skel) { - push @values, $mid, $xpath, $data; - $skel->index_skeleton(\@values); - } else { - link_message($self, $smsg, $refs, $old_tid); + foreach my $mid (@$mids) { + $tg->index_text($mid, 1, 'XM'); } - $tg->index_text($mid, 1, 'XM'); $doc->set_data($data); - if (my $altid = $self->{-altid}) { foreach my $alt (@$altid) { - my $id = $alt->mid2alt($mid); - next unless defined $id; - $doc->add_term($alt->{xprefix} . $id); + foreach my $mid (@$mids) { + my $id = $alt->mid2alt($mid); + next unless defined $id; + $doc->add_term($alt->{xprefix} . 
$id); + } } } - if (defined $doc_id) { - $db->replace_document($doc_id, $doc); + if ($skel) { + push @values, $mids, $xpath, $data; + $skel->index_skeleton(\@values); + $doc_id = $self->{xdb}->add_document($doc); } else { - $doc_id = $db->add_document($doc); + $doc_id = link_and_save($self, $doc, $mids, $refs); } }; if ($@) { - warn "failed to index message <$mid>: $@\n"; + warn "failed to index message <".join('> <',@$mids).">: $@\n"; return undef; } $doc_id; @@ -467,27 +456,62 @@ sub parse_references ($) { \@keep; } -sub link_message { - my ($self, $smsg, $refs, $old_tid) = @_; +sub link_doc { + my ($self, $doc, $refs, $old_tid) = @_; my $tid; if (@$refs) { - # first ref *should* be the thread root, # but we can never trust clients to do the right thing my $ref = shift @$refs; - $tid = $self->_resolve_mid_to_tid($ref); - $self->merge_threads($tid, $old_tid) if defined $old_tid; + $tid = resolve_mid_to_tid($self, $ref); + merge_threads($self, $tid, $old_tid) if defined $old_tid; # the rest of the refs should point to this tid: foreach $ref (@$refs) { - my $ptid = $self->_resolve_mid_to_tid($ref); + my $ptid = resolve_mid_to_tid($self, $ref); merge_threads($self, $tid, $ptid); } } else { $tid = defined $old_tid ? $old_tid : $self->next_thread_id; } - $smsg->{doc}->add_term('G' . $tid); + $doc->add_term('G' . 
$tid); + $tid; +} + +sub link_and_save { + my ($self, $doc, $mids, $refs) = @_; + my $db = $self->{xdb}; + my $old_tid; + my $doc_id; + my $vivified = 0; + foreach my $mid (@$mids) { + $self->each_smsg_by_mid($mid, sub { + my ($cur) = @_; + my $type = $cur->type; + my $cur_tid = $cur->thread_id; + $old_tid = $cur_tid unless defined $old_tid; + if ($type eq 'mail') { + # do not break existing mail messages, + # just merge the threads + merge_threads($self, $old_tid, $cur_tid); + return 1; + } + if ($type ne 'ghost') { + die "<$mid> has a bad type: $type\n"; + } + my $tid = link_doc($self, $doc, $refs, $old_tid); + $old_tid = $tid unless defined $old_tid; + $doc_id = $cur->{doc_id}; + $self->{xdb}->replace_document($doc_id, $doc); + ++$vivified; + 1; + }); + } + # not really important, but we return any vivified ghost docid, here: + return $doc_id if defined $doc_id; + link_doc($self, $doc, $refs, $old_tid); + $self->{xdb}->add_document($doc); } sub index_git_blob_id { @@ -709,11 +733,22 @@ sub _index_sync { } # this will create a ghost as necessary -sub _resolve_mid_to_tid { +sub resolve_mid_to_tid { my ($self, $mid) = @_; + my $tid; + $self->each_smsg_by_mid($mid, sub { + my ($smsg) = @_; + my $cur_tid = $smsg->thread_id; + if (defined $tid) { + merge_threads($self, $tid, $cur_tid); + } else { + $tid = $smsg->thread_id; + } + 1; + }); + return $tid if defined $tid; - my $smsg = $self->lookup_message($mid) || $self->create_ghost($mid); - $smsg->thread_id; + $self->create_ghost($mid)->thread_id; } sub create_ghost { diff --git a/lib/PublicInbox/SearchIdxSkeleton.pm b/lib/PublicInbox/SearchIdxSkeleton.pm index 333f965..063c83e 100644 --- a/lib/PublicInbox/SearchIdxSkeleton.pm +++ b/lib/PublicInbox/SearchIdxSkeleton.pm @@ -92,34 +92,20 @@ sub index_skeleton_real ($$) { my ($self, $values) = @_; my $doc_data = pop @$values; my $xpath = pop @$values; - my $mid = pop @$values; + my $mids = pop @$values; my $ts = $values->[PublicInbox::Search::TS]; - my $smsg = 
$self->lookup_message($mid); - my ($old_tid, $doc_id); - if ($smsg) { - # convert a ghost to a regular message - # it will also clobber any existing regular message - $doc_id = $smsg->{doc_id}; - $old_tid = $smsg->thread_id; - } else { - $smsg = PublicInbox::SearchMsg->new(undef); - $smsg->{mid} = $mid; - } + my $smsg = PublicInbox::SearchMsg->new(undef); my $doc = $smsg->{doc}; $doc->add_term('XPATH' . $xpath) if defined $xpath; - $doc->add_term('Q' . $mid); + foreach my $mid (@$mids) { + $doc->add_term('Q' . $mid); + } PublicInbox::SearchIdx::add_values($doc, $values); $doc->set_data($doc_data); $smsg->{ts} = $ts; $smsg->load_from_data($doc_data); my @refs = ($smsg->references =~ /<([^>]+)>/g); - $self->link_message($smsg, \@refs, $old_tid); - my $db = $self->{xdb}; - if (defined $doc_id) { - $db->replace_document($doc_id, $doc); - } else { - $doc_id = $db->add_document($doc); - } + $self->link_and_save($doc, $mids, \@refs); } 1; diff --git a/lib/PublicInbox/SearchMsg.pm b/lib/PublicInbox/SearchMsg.pm index 014f490..a556534 100644 --- a/lib/PublicInbox/SearchMsg.pm +++ b/lib/PublicInbox/SearchMsg.pm @@ -176,4 +176,11 @@ sub path { $self->{path} = _get_term_val($self, 'XPATH', qr/\AXPATH/); # path } +sub type { + my ($self) = @_; + my $type = $self->{type}; + return $type if defined $type; + $self->{type} = _get_term_val($self, 'T', qr/\AT/); +} + 1; diff --git a/t/v2writable.t b/t/v2writable.t index bc2437a..44191c1 100644 --- a/t/v2writable.t +++ b/t/v2writable.t @@ -80,5 +80,18 @@ ok($im->add($mime), 'ordinary message added'); is(scalar(@mids), 1, 'new generated'); } -$im->done; +{ + $mime->header_set('Message-Id', '<abcde@1>', '<abcde@2>'); + ok($im->add($mime), 'message with multiple Message-ID'); + $im->done; + my @found; + $ibx->search->each_smsg_by_mid('abcde@1', sub { push @found, @_; 1 }); + is(scalar(@found), 1, 'message found by first MID'); + $ibx->search->each_smsg_by_mid('abcde@2', sub { push @found, @_; 1 }); + is(scalar(@found), 2, 'message found by second MID'); + 
is($found[0]->{doc_id}, $found[1]->{doc_id}, 'same document'); + ok($found[1]->{doc_id} > 0, 'doc_id is positive'); +} + + done_testing(); -- EW