From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on dcvr.yhbt.net X-Spam-Level: X-Spam-ASN: X-Spam-Status: No, score=-4.0 required=3.0 tests=ALL_TRUSTED,BAYES_00 shortcircuit=no autolearn=ham autolearn_force=no version=3.4.0 Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id 9E8821FAE4 for ; Thu, 22 Feb 2018 21:42:24 +0000 (UTC) From: "Eric Wong (Contractor, The Linux Foundation)" To: meta@public-inbox.org Subject: [PATCH 03/12] search: stop assuming Message-ID is unique Date: Thu, 22 Feb 2018 21:42:13 +0000 Message-Id: <20180222214222.1086-4-e@80x24.org> In-Reply-To: <20180222214222.1086-1-e@80x24.org> References: <20180222214222.1086-1-e@80x24.org> List-Id: In general, they are, but there's no way for us or a general purpose mail server to enforce that. This is a step in allowing us to handle more corner cases which existing lists throw at us. --- lib/PublicInbox/ExtMsg.pm | 2 +- lib/PublicInbox/Search.pm | 14 ++++++++++++-- lib/PublicInbox/SearchIdx.pm | 10 ++++++---- 3 files changed, 19 insertions(+), 7 deletions(-) diff --git a/lib/PublicInbox/ExtMsg.pm b/lib/PublicInbox/ExtMsg.pm index 4e31ef0..90d68db 100644 --- a/lib/PublicInbox/ExtMsg.pm +++ b/lib/PublicInbox/ExtMsg.pm @@ -46,7 +46,7 @@ sub ext_msg { } # try to find the URL with Xapian to avoid forking - my $doc_id = eval { $s->find_unique_doc_id('XMID' . $mid) }; + my $doc_id = eval { $s->find_first_doc_id('XMID' . 
$mid) }; if ($@) { # xapian not configured properly for this repo push @nox, $other; diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm index 3ec96ca..33a1f2d 100644 --- a/lib/PublicInbox/Search.pm +++ b/lib/PublicInbox/Search.pm @@ -56,7 +56,7 @@ my %bool_pfx_internal = ( ); my %bool_pfx_external = ( - mid => 'XMID', # uniQue id (Message-ID) + mid => 'XMID', # Message-ID (full/exact) ); my %prob_prefix = ( @@ -285,7 +285,7 @@ sub lookup_message { my ($self, $mid) = @_; $mid = mid_clean($mid); - my $doc_id = $self->find_unique_doc_id('XMID' . $mid); + my $doc_id = $self->find_first_doc_id('XMID' . $mid); my $smsg; if (defined $doc_id) { # raises on error: @@ -327,6 +327,16 @@ sub find_doc_ids { ($db->postlist_begin($termval), $db->postlist_end($termval)); } +sub find_first_doc_id { + my ($self, $termval) = @_; + + my ($begin, $end) = $self->find_doc_ids($termval); + + return undef if $begin->equal($end); # not found + + $begin->get_docid; +} + # normalize subjects so they are suitable as pathnames for URLs # XXX: consider for removal sub subject_path { diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm index fa5057f..265403a 100644 --- a/lib/PublicInbox/SearchIdx.pm +++ b/lib/PublicInbox/SearchIdx.pm @@ -366,12 +366,14 @@ sub remove_message { $mid = mid_clean($mid); eval { - $doc_id = $self->find_unique_doc_id('XMID' . $mid); - if (defined $doc_id) { - $db->delete_document($doc_id); - } else { + my ($head, $tail) = $self->find_doc_ids('XMID' . $mid); + if ($head->equal($tail)) { warn "cannot remove non-existent <$mid>\n"; } + for (; $head != $tail; $head->inc) { + my $docid = $head->get_docid; + $db->delete_document($docid); + } }; if ($@) { -- EW