* [PATCH] search: reduce columns stored in Xapian
@ 2018-04-01 23:23 Eric Wong (Contractor, The Linux Foundation)
0 siblings, 0 replies; 2+ messages in thread
From: Eric Wong (Contractor, The Linux Foundation) @ 2018-04-01 23:23 UTC (permalink / raw)
To: meta
We can store :bytes and :lines in doc_data since we never
sort or search by them. We don't have much use for the Date:
stamp at the moment, either.
---
lib/PublicInbox/Search.pm | 9 ++----
lib/PublicInbox/SearchIdx.pm | 29 ++++++------------
lib/PublicInbox/SearchIdxSkeleton.pm | 10 ++-----
lib/PublicInbox/SearchMsg.pm | 58 +++++++++++++++++++++---------------
4 files changed, 49 insertions(+), 57 deletions(-)
diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm
index de296e1..ca389e3 100644
--- a/lib/PublicInbox/Search.pm
+++ b/lib/PublicInbox/Search.pm
@@ -8,12 +8,9 @@ use strict;
use warnings;
# values for searching
-use constant DS => 0; # Date: header in Unix time
-use constant NUM => 1; # NNTP article number
-use constant BYTES => 2; # :bytes as defined in RFC 3977
-use constant LINES => 3; # :lines as defined in RFC 3977
-use constant TS => 4; # Received: header in Unix time
-use constant YYYYMMDD => 5; # for searching in the WWW UI
+use constant TS => 0; # Received: header in Unix time
+use constant YYYYMMDD => 1; # for searching in the WWW UI
+use constant NUM => 2; # NNTP article number
use Search::Xapian qw/:standard/;
use PublicInbox::SearchMsg;
diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm
index 36f97b3..2e0b9a4 100644
--- a/lib/PublicInbox/SearchIdx.pm
+++ b/lib/PublicInbox/SearchIdx.pm
@@ -114,25 +114,12 @@ sub add_val ($$$) {
$doc->add_value($col, $num);
}
-sub add_values ($$) {
- my ($doc, $values) = @_;
-
- my $ts = $values->[PublicInbox::Search::TS];
+sub add_values {
+ my ($doc, $ts, $ds, $num) = @_;
add_val($doc, PublicInbox::Search::TS, $ts);
-
- my $num = $values->[PublicInbox::Search::NUM];
- defined($num) and add_val($doc, PublicInbox::Search::NUM, $num);
-
- my $bytes = $values->[PublicInbox::Search::BYTES];
- defined($bytes) and add_val($doc, PublicInbox::Search::BYTES, $bytes);
-
- my $lines = $values->[PublicInbox::Search::LINES];
- add_val($doc, PublicInbox::Search::LINES, $lines);
-
- my $ds = $values->[PublicInbox::Search::DS];
- add_val($doc, PublicInbox::Search::DS, $ds);
my $yyyymmdd = strftime('%Y%m%d', gmtime($ds));
add_val($doc, PublicInbox::Search::YYYYMMDD, $yyyymmdd);
+ defined($num) and add_val($doc, PublicInbox::Search::NUM, $num);
}
sub index_users ($$) {
@@ -295,8 +282,10 @@ sub add_message {
}
my $lines = $mime->body_raw =~ tr!\n!\n!;
- my @values = ($smsg->ds, $num, $bytes, $lines, $smsg->ts);
- add_values($doc, \@values);
+ $smsg->{lines} = $mime->body_raw =~ tr!\n!\n!;
+ defined $bytes or $bytes = length($mime->as_string);
+ $smsg->{bytes} = $bytes;
+ add_values($doc, $smsg->ts, $smsg->ds, $num);
my $tg = $self->term_generator;
@@ -366,8 +355,8 @@ sub add_message {
$self->delete_article($num) if defined $num; # for reindexing
if ($skel) {
- push @values, $mids, $xpath, $data;
- $skel->index_skeleton(\@values);
+ my @vals = ($smsg->ts, $num, $mids, $xpath, $data);
+ $skel->index_skeleton(\@vals);
$doc->add_boolean_term('Q' . $_) foreach @$mids;
$doc->add_boolean_term('XNUM' . $num) if defined $num;
$doc_id = $self->{xdb}->add_document($doc);
diff --git a/lib/PublicInbox/SearchIdxSkeleton.pm b/lib/PublicInbox/SearchIdxSkeleton.pm
index 4f15816..2be6496 100644
--- a/lib/PublicInbox/SearchIdxSkeleton.pm
+++ b/lib/PublicInbox/SearchIdxSkeleton.pm
@@ -121,18 +121,14 @@ sub remote_remove {
die $err if $err;
}
-# values: [ DS, NUM, BYTES, LINES, TS, MIDS, XPATH, doc_data ]
sub index_skeleton_real ($$) {
my ($self, $values) = @_;
- my $doc_data = pop @$values;
- my $xpath = pop @$values;
- my $mids = pop @$values;
+ my ($ts, $num, $mids, $xpath, $doc_data) = @$values;
my $smsg = PublicInbox::SearchMsg->new(undef);
+ $smsg->load_from_data($doc_data);
my $doc = $smsg->{doc};
- PublicInbox::SearchIdx::add_values($doc, $values);
$doc->set_data($doc_data);
- $smsg->load_from_data($doc_data);
- my $num = $values->[PublicInbox::Search::NUM];
+ PublicInbox::SearchIdx::add_values($doc, $ts, $smsg->ds, $num);
my @refs = ($smsg->references =~ /<([^>]+)>/g);
$self->delete_article($num) if defined $num; # for reindexing
$self->link_and_save($doc, $mids, \@refs, $num, $xpath);
diff --git a/lib/PublicInbox/SearchMsg.pm b/lib/PublicInbox/SearchMsg.pm
index e55d401..f5510b8 100644
--- a/lib/PublicInbox/SearchMsg.pm
+++ b/lib/PublicInbox/SearchMsg.pm
@@ -35,20 +35,41 @@ sub get_val ($$) {
Search::Xapian::sortable_unserialise($doc->get_value($col));
}
+sub to_doc_data {
+ my ($self, $oid, $mid0) = @_;
+ $oid = '' unless defined $oid;
+ join("\n",
+ $self->subject,
+ $self->from,
+ $self->references,
+ $self->to,
+ $self->cc,
+ $oid,
+ $mid0,
+ $self->ds,
+ $self->{bytes},
+ $self->{lines}
+ );
+}
+
sub load_from_data ($$) {
my ($self) = $_[0]; # data = $_[1]
- my ($subj, $from, $refs, $to, $cc, $blob, $mid0) = split(/\n/, $_[1]);
- $self->{subject} = $subj;
- $self->{from} = $from;
- $self->{references} = $refs;
+ (
+ $self->{subject},
+ $self->{from},
+ $self->{references},
- # To: and Cc: are stored to optimize HDR/XHDR in NNTP since
- # some NNTP clients will use that for message displays.
- $self->{to} = $to;
- $self->{cc} = $cc;
+ # To: and Cc: are stored to optimize HDR/XHDR in NNTP since
+ # some NNTP clients will use that for message displays.
+ $self->{to},
+ $self->{cc},
- $self->{blob} = $blob;
- $self->{mid} = $mid0;
+ $self->{blob},
+ $self->{mid},
+ $self->{ds},
+ $self->{bytes},
+ $self->{lines}
+ ) = split(/\n/, $_[1]);
}
sub load_expand {
@@ -56,7 +77,6 @@ sub load_expand {
my $doc = $self->{doc};
my $data = $doc->get_data or return;
$self->{ts} = get_val($doc, &PublicInbox::Search::TS);
- $self->{ds} = get_val($doc, &PublicInbox::Search::DS);
utf8::decode($data);
load_from_data($self, $data);
$self;
@@ -69,11 +89,9 @@ sub load_doc {
}
# :bytes and :lines metadata in RFC 3977
-sub bytes ($) { get_val($_[0]->{doc}, &PublicInbox::Search::BYTES) }
-sub lines ($) { get_val($_[0]->{doc}, &PublicInbox::Search::LINES) }
-sub num ($) {
- $_[0]->{num} ||= get_val($_[0]->{doc}, PublicInbox::Search::NUM())
-}
+sub bytes ($) { $_[0]->{bytes} }
+sub lines ($) { $_[0]->{lines} }
+sub num ($) { $_[0]->{num} ||= _get_term_val($_[0], 'XNUM', qr/\AXNUM/) }
sub __hdr ($$) {
my ($self, $field) = @_;
@@ -134,14 +152,6 @@ sub ds {
$self->{ds} ||= eval { msg_datestamp($self->{mime}->header_obj); } || 0;
}
-sub to_doc_data {
- my ($self, $oid, $mid0) = @_;
- my @rows = ($self->subject, $self->from, $self->references,
- $self->to, $self->cc);
- $oid = '' unless defined $oid;
- join("\n", @rows, $oid, $mid0);
-}
-
sub references {
my ($self) = @_;
my $x = $self->{references};
--
EW
^ permalink raw reply related [flat|nested] 2+ messages in thread
* [PATCH] search: reduce columns stored in Xapian
@ 2018-04-01 23:31 Eric Wong
0 siblings, 0 replies; 2+ messages in thread
From: Eric Wong @ 2018-04-01 23:31 UTC (permalink / raw)
To: meta
We can store :bytes and :lines in doc_data since we never
sort or search by them. We don't have much use for the Date:
stamp at the moment, either.
---
Publishing this for documentation purposes; it will be obsolete
next, as the v2 changes will bump SCHEMA_VERSION.
lib/PublicInbox/Search.pm | 9 ++----
lib/PublicInbox/SearchIdx.pm | 29 ++++++------------
lib/PublicInbox/SearchIdxSkeleton.pm | 10 ++-----
lib/PublicInbox/SearchMsg.pm | 58 +++++++++++++++++++++---------------
4 files changed, 49 insertions(+), 57 deletions(-)
diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm
index de296e1..ca389e3 100644
--- a/lib/PublicInbox/Search.pm
+++ b/lib/PublicInbox/Search.pm
@@ -8,12 +8,9 @@ use strict;
use warnings;
# values for searching
-use constant DS => 0; # Date: header in Unix time
-use constant NUM => 1; # NNTP article number
-use constant BYTES => 2; # :bytes as defined in RFC 3977
-use constant LINES => 3; # :lines as defined in RFC 3977
-use constant TS => 4; # Received: header in Unix time
-use constant YYYYMMDD => 5; # for searching in the WWW UI
+use constant TS => 0; # Received: header in Unix time
+use constant YYYYMMDD => 1; # for searching in the WWW UI
+use constant NUM => 2; # NNTP article number
use Search::Xapian qw/:standard/;
use PublicInbox::SearchMsg;
diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm
index 36f97b3..2e0b9a4 100644
--- a/lib/PublicInbox/SearchIdx.pm
+++ b/lib/PublicInbox/SearchIdx.pm
@@ -114,25 +114,12 @@ sub add_val ($$$) {
$doc->add_value($col, $num);
}
-sub add_values ($$) {
- my ($doc, $values) = @_;
-
- my $ts = $values->[PublicInbox::Search::TS];
+sub add_values {
+ my ($doc, $ts, $ds, $num) = @_;
add_val($doc, PublicInbox::Search::TS, $ts);
-
- my $num = $values->[PublicInbox::Search::NUM];
- defined($num) and add_val($doc, PublicInbox::Search::NUM, $num);
-
- my $bytes = $values->[PublicInbox::Search::BYTES];
- defined($bytes) and add_val($doc, PublicInbox::Search::BYTES, $bytes);
-
- my $lines = $values->[PublicInbox::Search::LINES];
- add_val($doc, PublicInbox::Search::LINES, $lines);
-
- my $ds = $values->[PublicInbox::Search::DS];
- add_val($doc, PublicInbox::Search::DS, $ds);
my $yyyymmdd = strftime('%Y%m%d', gmtime($ds));
add_val($doc, PublicInbox::Search::YYYYMMDD, $yyyymmdd);
+ defined($num) and add_val($doc, PublicInbox::Search::NUM, $num);
}
sub index_users ($$) {
@@ -295,8 +282,10 @@ sub add_message {
}
my $lines = $mime->body_raw =~ tr!\n!\n!;
- my @values = ($smsg->ds, $num, $bytes, $lines, $smsg->ts);
- add_values($doc, \@values);
+ $smsg->{lines} = $mime->body_raw =~ tr!\n!\n!;
+ defined $bytes or $bytes = length($mime->as_string);
+ $smsg->{bytes} = $bytes;
+ add_values($doc, $smsg->ts, $smsg->ds, $num);
my $tg = $self->term_generator;
@@ -366,8 +355,8 @@ sub add_message {
$self->delete_article($num) if defined $num; # for reindexing
if ($skel) {
- push @values, $mids, $xpath, $data;
- $skel->index_skeleton(\@values);
+ my @vals = ($smsg->ts, $num, $mids, $xpath, $data);
+ $skel->index_skeleton(\@vals);
$doc->add_boolean_term('Q' . $_) foreach @$mids;
$doc->add_boolean_term('XNUM' . $num) if defined $num;
$doc_id = $self->{xdb}->add_document($doc);
diff --git a/lib/PublicInbox/SearchIdxSkeleton.pm b/lib/PublicInbox/SearchIdxSkeleton.pm
index 4f15816..2be6496 100644
--- a/lib/PublicInbox/SearchIdxSkeleton.pm
+++ b/lib/PublicInbox/SearchIdxSkeleton.pm
@@ -121,18 +121,14 @@ sub remote_remove {
die $err if $err;
}
-# values: [ DS, NUM, BYTES, LINES, TS, MIDS, XPATH, doc_data ]
sub index_skeleton_real ($$) {
my ($self, $values) = @_;
- my $doc_data = pop @$values;
- my $xpath = pop @$values;
- my $mids = pop @$values;
+ my ($ts, $num, $mids, $xpath, $doc_data) = @$values;
my $smsg = PublicInbox::SearchMsg->new(undef);
+ $smsg->load_from_data($doc_data);
my $doc = $smsg->{doc};
- PublicInbox::SearchIdx::add_values($doc, $values);
$doc->set_data($doc_data);
- $smsg->load_from_data($doc_data);
- my $num = $values->[PublicInbox::Search::NUM];
+ PublicInbox::SearchIdx::add_values($doc, $ts, $smsg->ds, $num);
my @refs = ($smsg->references =~ /<([^>]+)>/g);
$self->delete_article($num) if defined $num; # for reindexing
$self->link_and_save($doc, $mids, \@refs, $num, $xpath);
diff --git a/lib/PublicInbox/SearchMsg.pm b/lib/PublicInbox/SearchMsg.pm
index e55d401..f5510b8 100644
--- a/lib/PublicInbox/SearchMsg.pm
+++ b/lib/PublicInbox/SearchMsg.pm
@@ -35,20 +35,41 @@ sub get_val ($$) {
Search::Xapian::sortable_unserialise($doc->get_value($col));
}
+sub to_doc_data {
+ my ($self, $oid, $mid0) = @_;
+ $oid = '' unless defined $oid;
+ join("\n",
+ $self->subject,
+ $self->from,
+ $self->references,
+ $self->to,
+ $self->cc,
+ $oid,
+ $mid0,
+ $self->ds,
+ $self->{bytes},
+ $self->{lines}
+ );
+}
+
sub load_from_data ($$) {
my ($self) = $_[0]; # data = $_[1]
- my ($subj, $from, $refs, $to, $cc, $blob, $mid0) = split(/\n/, $_[1]);
- $self->{subject} = $subj;
- $self->{from} = $from;
- $self->{references} = $refs;
+ (
+ $self->{subject},
+ $self->{from},
+ $self->{references},
- # To: and Cc: are stored to optimize HDR/XHDR in NNTP since
- # some NNTP clients will use that for message displays.
- $self->{to} = $to;
- $self->{cc} = $cc;
+ # To: and Cc: are stored to optimize HDR/XHDR in NNTP since
+ # some NNTP clients will use that for message displays.
+ $self->{to},
+ $self->{cc},
- $self->{blob} = $blob;
- $self->{mid} = $mid0;
+ $self->{blob},
+ $self->{mid},
+ $self->{ds},
+ $self->{bytes},
+ $self->{lines}
+ ) = split(/\n/, $_[1]);
}
sub load_expand {
@@ -56,7 +77,6 @@ sub load_expand {
my $doc = $self->{doc};
my $data = $doc->get_data or return;
$self->{ts} = get_val($doc, &PublicInbox::Search::TS);
- $self->{ds} = get_val($doc, &PublicInbox::Search::DS);
utf8::decode($data);
load_from_data($self, $data);
$self;
@@ -69,11 +89,9 @@ sub load_doc {
}
# :bytes and :lines metadata in RFC 3977
-sub bytes ($) { get_val($_[0]->{doc}, &PublicInbox::Search::BYTES) }
-sub lines ($) { get_val($_[0]->{doc}, &PublicInbox::Search::LINES) }
-sub num ($) {
- $_[0]->{num} ||= get_val($_[0]->{doc}, PublicInbox::Search::NUM())
-}
+sub bytes ($) { $_[0]->{bytes} }
+sub lines ($) { $_[0]->{lines} }
+sub num ($) { $_[0]->{num} ||= _get_term_val($_[0], 'XNUM', qr/\AXNUM/) }
sub __hdr ($$) {
my ($self, $field) = @_;
@@ -134,14 +152,6 @@ sub ds {
$self->{ds} ||= eval { msg_datestamp($self->{mime}->header_obj); } || 0;
}
-sub to_doc_data {
- my ($self, $oid, $mid0) = @_;
- my @rows = ($self->subject, $self->from, $self->references,
- $self->to, $self->cc);
- $oid = '' unless defined $oid;
- join("\n", @rows, $oid, $mid0);
-}
-
sub references {
my ($self) = @_;
my $x = $self->{references};
--
EW
^ permalink raw reply related [flat|nested] 2+ messages in thread
end of thread, other threads:[~2018-04-01 23:31 UTC | newest]
Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-04-01 23:31 [PATCH] search: reduce columns stored in Xapian Eric Wong
-- strict thread matches above, loose matches on Subject: below --
2018-04-01 23:23 Eric Wong (Contractor, The Linux Foundation)
This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox,
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).