From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.1 (2015-04-28) on dcvr.yhbt.net X-Spam-Level: X-Spam-ASN: AS6315 166.70.0.0/16 X-Spam-Status: No, score=-3.7 required=3.0 tests=AWL,BAYES_00, RCVD_IN_DNSWL_LOW,SPF_PASS shortcircuit=no autolearn=ham autolearn_force=no version=3.4.1 Received: from out03.mta.xmission.com (out03.mta.xmission.com [166.70.13.233]) (using TLSv1.2 with cipher ECDHE-RSA-AES128-GCM-SHA256 (128/128 bits)) (No client certificate requested) by dcvr.yhbt.net (Postfix) with ESMTPS id 4E573208E9; Tue, 17 Jul 2018 23:32:00 +0000 (UTC) Received: from in01.mta.xmission.com ([166.70.13.51]) by out03.mta.xmission.com with esmtps (TLS1.2:ECDHE_RSA_AES_128_GCM_SHA256:128) (Exim 4.87) (envelope-from ) id 1ffZRn-0003Hv-K7; Tue, 17 Jul 2018 17:31:59 -0600 Received: from [97.119.167.31] (helo=x220.int.ebiederm.org) by in01.mta.xmission.com with esmtpsa (TLS1.2:ECDHE_RSA_AES_128_GCM_SHA256:128) (Exim 4.87) (envelope-from ) id 1ffZRl-0003Fa-W5; Tue, 17 Jul 2018 17:31:59 -0600 From: "Eric W. Biederman" To: Eric Wong Cc: meta@public-inbox.org, "Eric W. Biederman" Date: Tue, 17 Jul 2018 18:30:57 -0500 Message-Id: <20180717233058.30820-2-ebiederm@xmission.com> X-Mailer: git-send-email 2.17.1 In-Reply-To: <87a7qpjve8.fsf@xmission.com> References: <87a7qpjve8.fsf@xmission.com> X-XM-SPF: eid=1ffZRl-0003Fa-W5;;;mid=<20180717233058.30820-2-ebiederm@xmission.com>;;;hst=in01.mta.xmission.com;;;ip=97.119.167.31;;;frm=ebiederm@xmission.com;;;spf=neutral X-XM-AID: U2FsdGVkX18S/nMkeO2arvO2K1qVoCju2S6VM1Z4nRI= X-SA-Exim-Connect-IP: 97.119.167.31 X-SA-Exim-Mail-From: ebiederm@xmission.com Subject: [PATCH 2/3] SearchIdx: Add the mechanism for making all Xapian indexing optional X-SA-Exim-Version: 4.2.1 (built Thu, 05 May 2016 13:38:54 -0600) X-SA-Exim-Scanned: Yes (on in01.mta.xmission.com) List-Id: Create a new method index_message that holds all of the code to create Xapian indexes. 
The creation of this method simply involved identifying the relevant code and moving it from add_message. A call is added to index_message from add_message to keep everything working as it currently does. The new call is made conditional upon index levels of 'positions' and 'terms', the two things public-inbox uses Xapian to index. Signed-off-by: "Eric W. Biederman" --- lib/PublicInbox/SearchIdx.pm | 171 ++++++++++++++++++----------------- 1 file changed, 88 insertions(+), 83 deletions(-) diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm index cc92c389a152..deb87db3f88a 100644 --- a/lib/PublicInbox/SearchIdx.pm +++ b/lib/PublicInbox/SearchIdx.pm @@ -268,10 +268,94 @@ sub index_body ($$$) { @$lines = (); } +sub index_message ($$$$$) { + my ($self, $mime, $num, $oid, $mids, $mid0) = @_; + my $smsg = PublicInbox::SearchMsg->new($mime); + my $doc = $smsg->{doc}; + my $subj = $smsg->subject; + add_val($doc, PublicInbox::Search::TS(), $smsg->ts); + my @ds = gmtime($smsg->ds); + my $yyyymmdd = strftime('%Y%m%d', @ds); + add_val($doc, PublicInbox::Search::YYYYMMDD(), $yyyymmdd); + my $dt = strftime('%Y%m%d%H%M%S', @ds); + add_val($doc, PublicInbox::Search::DT(), $dt); + + my $tg = $self->term_generator; + + $tg->set_document($doc); + $self->index_text($subj, 1, 'S') if $subj; + $self->index_users($smsg); + + msg_iter($mime, sub { + my ($part, $depth, @idx) = @{$_[0]}; + my $ct = $part->content_type || 'text/plain'; + my $fn = $part->filename; + if (defined $fn && $fn ne '') { + $self->index_text($fn, 1, 'XFN'); + } + + return if $ct =~ m!\btext/x?html\b!i; + + my $s = eval { $part->body_str }; + if ($@) { + if ($ct =~ m!\btext/plain\b!i) { + # Try to assume UTF-8 because Alpine + # seems to do wacky things and set + # charset=X-UNKNOWN + $part->charset_set('UTF-8'); + $s = eval { $part->body_str }; + $s = $part->body if $@; + } + } + defined $s or return; + + my (@orig, @quot); + my $body = $part->body; + my @lines = split(/\n/, $body); + while
(defined(my $l = shift @lines)) { + if ($l =~ /^>/) { + $self->index_body(\@orig, $doc) if @orig; + push @quot, $l; + } else { + $self->index_body(\@quot, 0) if @quot; + push @orig, $l; + } + } + $self->index_body(\@quot, 0) if @quot; + $self->index_body(\@orig, $doc) if @orig; + }); + + foreach my $mid (@$mids) { + $self->index_text($mid, 1, 'XM'); + + # because too many Message-IDs are prefixed with + # "Pine.LNX."... + if ($mid =~ /\w{12,}/) { + my @long = ($mid =~ /(\w{3,}+)/g); + $self->index_text(join(' ', @long), 1, 'XM'); + } + } + $smsg->{to} = $smsg->{cc} = ''; + PublicInbox::OverIdx::parse_references($smsg, $mid0, $mids); + my $data = $smsg->to_doc_data($oid, $mid0); + $doc->set_data($data); + if (my $altid = $self->{-altid}) { + foreach my $alt (@$altid) { + my $pfx = $alt->{xprefix}; + foreach my $mid (@$mids) { + my $id = $alt->mid2alt($mid); + next unless defined $id; + $doc->add_boolean_term($pfx . $id); + } + } + } + $doc->add_boolean_term('Q' . $_) foreach @$mids; + $self->{xdb}->replace_document($num, $doc); +} + sub add_message { # mime = Email::MIME object my ($self, $mime, $bytes, $num, $oid, $mid0) = @_; - my $doc_id; my $mids = mids($mime->header_obj); $mid0 = $mids->[0] unless defined $mid0; # v1 compatibility unless (defined $num) { # v1 @@ -279,98 +363,19 @@ sub add_message { $num = index_mm($self, $mime); } eval { - my $smsg = PublicInbox::SearchMsg->new($mime); - my $doc = $smsg->{doc}; - my $subj = $smsg->subject; - add_val($doc, PublicInbox::Search::TS(), $smsg->ts); - my @ds = gmtime($smsg->ds); - my $yyyymmdd = strftime('%Y%m%d', @ds); - add_val($doc, PublicInbox::Search::YYYYMMDD(), $yyyymmdd); - my $dt = strftime('%Y%m%d%H%M%S', @ds); - add_val($doc, PublicInbox::Search::DT(), $dt); - - my $tg = $self->term_generator; - - $tg->set_document($doc); - $self->index_text($subj, 1, 'S') if $subj; - $self->index_users($smsg); - - msg_iter($mime, sub { - my ($part, $depth, @idx) = @{$_[0]}; - my $ct = $part->content_type || 'text/plain'; 
- my $fn = $part->filename; - if (defined $fn && $fn ne '') { - $self->index_text($fn, 1, 'XFN'); - } - - return if $ct =~ m!\btext/x?html\b!i; - - my $s = eval { $part->body_str }; - if ($@) { - if ($ct =~ m!\btext/plain\b!i) { - # Try to assume UTF-8 because Alpine - # seems to do wacky things and set - # charset=X-UNKNOWN - $part->charset_set('UTF-8'); - $s = eval { $part->body_str }; - $s = $part->body if $@; - } - } - defined $s or return; - - my (@orig, @quot); - my $body = $part->body; - my @lines = split(/\n/, $body); - while (defined(my $l = shift @lines)) { - if ($l =~ /^>/) { - $self->index_body(\@orig, $doc) if @orig; - push @quot, $l; - } else { - $self->index_body(\@quot, 0) if @quot; - push @orig, $l; - } - } - $self->index_body(\@quot, 0) if @quot; - $self->index_body(\@orig, $doc) if @orig; - }); - - foreach my $mid (@$mids) { - $self->index_text($mid, 1, 'XM'); - - # because too many Message-IDs are prefixed with - # "Pine.LNX."... - if ($mid =~ /\w{12,}/) { - my @long = ($mid =~ /(\w{3,}+)/g); - $self->index_text(join(' ', @long), 1, 'XM'); - } + if ($self->{indexlevel} =~ m/(positions|terms)/) { + $self->index_message($mime, $num, $oid, $mids, $mid0) } - $smsg->{to} = $smsg->{cc} = ''; - PublicInbox::OverIdx::parse_references($smsg, $mid0, $mids); - my $data = $smsg->to_doc_data($oid, $mid0); - $doc->set_data($data); - if (my $altid = $self->{-altid}) { - foreach my $alt (@$altid) { - my $pfx = $alt->{xprefix}; - foreach my $mid (@$mids) { - my $id = $alt->mid2alt($mid); - next unless defined $id; - $doc->add_boolean_term($pfx . $id); - } - } - } - if (my $over = $self->{over}) { $over->add_overview($mime, $bytes, $num, $oid, $mid0); } - $doc->add_boolean_term('Q' . $_) foreach @$mids; - $self->{xdb}->replace_document($doc_id = $num, $doc); }; if ($@) { warn "failed to index message <".join('> <',@$mids).">: $@\n"; return undef; } - $doc_id; + $num; } # returns begin and end PostingIterator -- 2.17.1