From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on dcvr.yhbt.net X-Spam-Level: X-Spam-ASN: X-Spam-Status: No, score=-4.0 required=3.0 tests=ALL_TRUSTED,BAYES_00 shortcircuit=no autolearn=ham autolearn_force=no version=3.4.0 Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id EBD121F404 for ; Sun, 1 Apr 2018 23:23:57 +0000 (UTC) From: "Eric Wong (Contractor, The Linux Foundation)" To: meta@public-inbox.org Subject: [PATCH] search: reduce columns stored in Xapian Date: Sun, 1 Apr 2018 23:23:57 +0000 Message-Id: <20180401232357.5438-1-e@80x24.org> List-Id: We can store :bytes and :lines in doc_data since we never sort or search by them. We don't have much use for the Date: stamp at the moment, either. --- lib/PublicInbox/Search.pm | 9 ++---- lib/PublicInbox/SearchIdx.pm | 29 ++++++------------ lib/PublicInbox/SearchIdxSkeleton.pm | 10 ++----- lib/PublicInbox/SearchMsg.pm | 58 +++++++++++++++++++++--------------- 4 files changed, 49 insertions(+), 57 deletions(-) diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm index de296e1..ca389e3 100644 --- a/lib/PublicInbox/Search.pm +++ b/lib/PublicInbox/Search.pm @@ -8,12 +8,9 @@ use strict; use warnings; # values for searching -use constant DS => 0; # Date: header in Unix time -use constant NUM => 1; # NNTP article number -use constant BYTES => 2; # :bytes as defined in RFC 3977 -use constant LINES => 3; # :lines as defined in RFC 3977 -use constant TS => 4; # Received: header in Unix time -use constant YYYYMMDD => 5; # for searching in the WWW UI +use constant TS => 0; # Received: header in Unix time +use constant YYYYMMDD => 1; # for searching in the WWW UI +use constant NUM => 2; # NNTP article number use Search::Xapian qw/:standard/; use PublicInbox::SearchMsg; diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm index 36f97b3..2e0b9a4 100644 --- a/lib/PublicInbox/SearchIdx.pm +++ b/lib/PublicInbox/SearchIdx.pm @@ -114,25 +114,12 @@ sub add_val ($$$) { $doc->add_value($col, $num); } -sub add_values ($$) { - my ($doc, $values) = @_; - - my $ts = $values->[PublicInbox::Search::TS]; +sub add_values { + my ($doc, $ts, $ds, $num) = @_; add_val($doc, PublicInbox::Search::TS, $ts); - - my $num = $values->[PublicInbox::Search::NUM]; - defined($num) and add_val($doc, PublicInbox::Search::NUM, $num); - - my $bytes = $values->[PublicInbox::Search::BYTES]; - defined($bytes) and add_val($doc, PublicInbox::Search::BYTES, $bytes); - - my $lines = $values->[PublicInbox::Search::LINES]; - add_val($doc, PublicInbox::Search::LINES, $lines); - - my $ds = $values->[PublicInbox::Search::DS]; - add_val($doc, PublicInbox::Search::DS, $ds); my $yyyymmdd = strftime('%Y%m%d', gmtime($ds)); add_val($doc, PublicInbox::Search::YYYYMMDD, $yyyymmdd); + defined($num) and add_val($doc, PublicInbox::Search::NUM, $num); } sub index_users ($$) { @@ -295,8 +282,10 @@ sub add_message { } my $lines = $mime->body_raw =~ tr!\n!\n!; - my @values = ($smsg->ds, $num, $bytes, $lines, $smsg->ts); - add_values($doc, \@values); + $smsg->{lines} = $mime->body_raw =~ tr!\n!\n!; + defined $bytes or $bytes = length($mime->as_string); + $smsg->{bytes} = $bytes; + add_values($doc, $smsg->ts, $smsg->ds, $num); my $tg = $self->term_generator; @@ -366,8 +355,8 @@ sub add_message { $self->delete_article($num) if defined $num; # for reindexing if ($skel) { - push @values, $mids, $xpath, $data; - $skel->index_skeleton(\@values); + my @vals = ($smsg->ts, $num, $mids, $xpath, $data); + $skel->index_skeleton(\@vals); $doc->add_boolean_term('Q' . $_) foreach @$mids; $doc->add_boolean_term('XNUM' . $num) if defined $num; $doc_id = $self->{xdb}->add_document($doc); diff --git a/lib/PublicInbox/SearchIdxSkeleton.pm b/lib/PublicInbox/SearchIdxSkeleton.pm index 4f15816..2be6496 100644 --- a/lib/PublicInbox/SearchIdxSkeleton.pm +++ b/lib/PublicInbox/SearchIdxSkeleton.pm @@ -121,18 +121,14 @@ sub remote_remove { die $err if $err; } -# values: [ DS, NUM, BYTES, LINES, TS, MIDS, XPATH, doc_data ] sub index_skeleton_real ($$) { my ($self, $values) = @_; - my $doc_data = pop @$values; - my $xpath = pop @$values; - my $mids = pop @$values; + my ($ts, $num, $mids, $xpath, $doc_data) = @$values; my $smsg = PublicInbox::SearchMsg->new(undef); + $smsg->load_from_data($doc_data); my $doc = $smsg->{doc}; - PublicInbox::SearchIdx::add_values($doc, $values); $doc->set_data($doc_data); - $smsg->load_from_data($doc_data); - my $num = $values->[PublicInbox::Search::NUM]; + PublicInbox::SearchIdx::add_values($doc, $ts, $smsg->ds, $num); my @refs = ($smsg->references =~ /<([^>]+)>/g); $self->delete_article($num) if defined $num; # for reindexing $self->link_and_save($doc, $mids, \@refs, $num, $xpath); diff --git a/lib/PublicInbox/SearchMsg.pm b/lib/PublicInbox/SearchMsg.pm index e55d401..f5510b8 100644 --- a/lib/PublicInbox/SearchMsg.pm +++ b/lib/PublicInbox/SearchMsg.pm @@ -35,20 +35,41 @@ sub get_val ($$) { Search::Xapian::sortable_unserialise($doc->get_value($col)); } +sub to_doc_data { + my ($self, $oid, $mid0) = @_; + $oid = '' unless defined $oid; + join("\n", + $self->subject, + $self->from, + $self->references, + $self->to, + $self->cc, + $oid, + $mid0, + $self->ds, + $self->{bytes}, + $self->{lines} + ); +} + sub load_from_data ($$) { my ($self) = $_[0]; # data = $_[1] - my ($subj, $from, $refs, $to, $cc, $blob, $mid0) = split(/\n/, $_[1]); - $self->{subject} = $subj; - $self->{from} = $from; - $self->{references} = $refs; + ( + $self->{subject}, + $self->{from}, + $self->{references}, - # To: and Cc: are stored to optimize HDR/XHDR in NNTP since - # some NNTP clients will use that for message displays. - $self->{to} = $to; - $self->{cc} = $cc; + # To: and Cc: are stored to optimize HDR/XHDR in NNTP since + # some NNTP clients will use that for message displays. + $self->{to}, + $self->{cc}, - $self->{blob} = $blob; - $self->{mid} = $mid0; + $self->{blob}, + $self->{mid}, + $self->{ds}, + $self->{bytes}, + $self->{lines} + ) = split(/\n/, $_[1]); } sub load_expand { @@ -56,7 +77,6 @@ sub load_expand { my $doc = $self->{doc}; my $data = $doc->get_data or return; $self->{ts} = get_val($doc, &PublicInbox::Search::TS); - $self->{ds} = get_val($doc, &PublicInbox::Search::DS); utf8::decode($data); load_from_data($self, $data); $self; @@ -69,11 +89,9 @@ sub load_doc { } # :bytes and :lines metadata in RFC 3977 -sub bytes ($) { get_val($_[0]->{doc}, &PublicInbox::Search::BYTES) } -sub lines ($) { get_val($_[0]->{doc}, &PublicInbox::Search::LINES) } -sub num ($) { - $_[0]->{num} ||= get_val($_[0]->{doc}, PublicInbox::Search::NUM()) -} +sub bytes ($) { $_[0]->{bytes} } +sub lines ($) { $_[0]->{lines} } +sub num ($) { $_[0]->{num} ||= _get_term_val($_[0], 'XNUM', qr/\AXNUM/) } sub __hdr ($$) { my ($self, $field) = @_; @@ -134,14 +152,6 @@ sub ds { $self->{ds} ||= eval { msg_datestamp($self->{mime}->header_obj); } || 0; } -sub to_doc_data { - my ($self, $oid, $mid0) = @_; - my @rows = ($self->subject, $self->from, $self->references, - $self->to, $self->cc); - $oid = '' unless defined $oid; - join("\n", @rows, $oid, $mid0); -} - sub references { my ($self) = @_; my $x = $self->{references}; -- EW