* [PATCH 1/6] search: split search indexing to a separate file
@ 2015-08-22 11:41 Eric Wong
2015-08-22 11:41 ` [PATCH 2/6] view: misc cleanups and simplifications Eric Wong
` (4 more replies)
0 siblings, 5 replies; 6+ messages in thread
From: Eric Wong @ 2015-08-22 11:41 UTC (permalink / raw)
To: meta
This makes organization easier and reduces the amount of code
loaded for a PSGI, mod_perl or CGI instance.
---
lib/PublicInbox/Search.pm | 363 +------------------------------------------
lib/PublicInbox/SearchIdx.pm | 363 +++++++++++++++++++++++++++++++++++++++++++
public-inbox-index | 4 +-
public-inbox-learn | 4 +-
public-inbox-mda | 2 +-
t/search.t | 6 +-
6 files changed, 379 insertions(+), 363 deletions(-)
create mode 100644 lib/PublicInbox/SearchIdx.pm
diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm
index 580b79f..753f5f3 100644
--- a/lib/PublicInbox/Search.pm
+++ b/lib/PublicInbox/Search.pm
@@ -57,160 +57,20 @@ while (my ($k, $v) = each %all_pfx) {
my $mail_query = Search::Xapian::Query->new(xpfx('type') . 'mail');
+sub xdir {
+ my (undef, $git_dir) = @_;
+ "$git_dir/public-inbox/xapian" . SCHEMA_VERSION;
+}
+
sub new {
- my ($class, $git_dir, $writable) = @_;
- # allow concurrent versions for easier rollback:
- my $dir = "$git_dir/public-inbox/xapian" . SCHEMA_VERSION;
- my $db;
-
- if ($writable) { # not used by the WWW interface
- require Search::Xapian::WritableDatabase;
- my $flag = Search::Xapian::DB_OPEN;
- if ($writable == 1) {
- require File::Path;
- File::Path::mkpath($dir);
- $flag = Search::Xapian::DB_CREATE_OR_OPEN;
- }
- $db = Search::Xapian::WritableDatabase->new($dir, $flag);
- } else {
- $db = Search::Xapian::Database->new($dir);
- }
+ my ($class, $git_dir) = @_;
+ my $dir = $class->xdir($git_dir);
+ my $db = Search::Xapian::Database->new($dir);
bless { xdb => $db, git_dir => $git_dir }, $class;
}
sub reopen { $_[0]->{xdb}->reopen }
-sub add_message {
- my ($self, $mime) = @_; # mime = Email::MIME object
- my $db = $self->{xdb};
-
- my $doc_id;
- my $mid_orig = mid_clean($mime->header_obj->header('Message-ID'));
- my $mid = mid_compressed($mid_orig);
- my $was_ghost = 0;
- my $ct_msg = $mime->header('Content-Type') || 'text/plain';
-
- eval {
- my $smsg = $self->lookup_message($mid);
- my $doc;
-
- if ($smsg) {
- $smsg->ensure_metadata;
- # convert a ghost to a regular message
- # it will also clobber any existing regular message
- $smsg->mime($mime);
- $doc = $smsg->{doc};
-
- my $type = xpfx('type');
- eval {
- $doc->remove_term($type . 'ghost');
- $was_ghost = 1;
- };
-
- # probably does not exist:
- eval { $doc->remove_term($type . 'mail') };
- $doc->add_term($type . 'mail');
- } else {
- $smsg = PublicInbox::SearchMsg->new($mime);
- $doc = $smsg->{doc};
- $doc->add_term(xpfx('mid') . $mid);
- }
-
- my $subj = $smsg->subject;
-
- if (length $subj) {
- $doc->add_term(xpfx('subject') . $subj);
-
- my $path = subject_path($subj);
- $doc->add_term(xpfx('path') . mid_compressed($path));
- }
-
- my $from = $smsg->from_name;
- my $date = $smsg->date;
- my $ts = Search::Xapian::sortable_serialise($smsg->ts);
- $doc->add_value(PublicInbox::Search::TS, $ts);
-
- my $tg = $self->term_generator;
-
- $tg->set_document($doc);
- $tg->index_text($subj, 1, 'S') if $subj;
- $tg->increase_termpos;
- $tg->index_text($subj) if $subj;
- $tg->increase_termpos;
-
- $tg->index_text($smsg->from->format);
- $tg->increase_termpos;
-
- $mime->walk_parts(sub {
- my ($part) = @_;
- return if $part->subparts; # walk_parts already recurses
- my $ct = $part->content_type || $ct_msg;
-
- # account for filter bugs...
- $ct =~ m!\btext/plain\b!i or return;
-
- my (@orig, @quot);
- my $body = $part->body;
- $part->body_set('');
- my @lines = split(/\n/, $body);
- while (defined(my $l = shift @lines)) {
- if ($l =~ /^\s*>/) {
- push @quot, $l;
- } else {
- push @orig, $l;
- }
- }
- if (@quot) {
- $tg->index_text(join("\n", @quot), 0);
- @quot = ();
- $tg->increase_termpos;
- }
- if (@orig) {
- $tg->index_text(join("\n", @orig));
- @orig = ();
- $tg->increase_termpos;
- }
- });
-
- if ($was_ghost) {
- $doc_id = $smsg->doc_id;
- $self->link_message($smsg, 0);
- $doc->set_data($smsg->to_doc_data);
- $db->replace_document($doc_id, $doc);
- } else {
- $self->link_message($smsg, 0);
- $doc->set_data($smsg->to_doc_data);
- $doc_id = $db->add_document($doc);
- }
- };
-
- if ($@) {
- warn "failed to index message <$mid_orig>: $@\n";
- return undef;
- }
- $doc_id;
-}
-
-# returns deleted doc_id on success, undef on missing
-sub remove_message {
- my ($self, $mid_orig) = @_;
- my $db = $self->{xdb};
- my $doc_id;
- $mid_orig = mid_clean($mid_orig);
- my $mid = mid_compressed($mid_orig);
-
- eval {
- $doc_id = $self->find_unique_doc_id('mid', $mid);
- $db->delete_document($doc_id) if defined $doc_id;
- };
-
- if ($@) {
- warn "failed to remove message <$mid_orig>: $@\n";
- return undef;
- }
- $doc_id;
-}
-
# read-only
sub query {
my ($self, $query_string, $opts) = @_;
@@ -300,32 +160,6 @@ sub qp {
$self->{query_parser} = $qp;
}
-sub term_generator { # write-only
- my ($self) = @_;
-
- my $tg = $self->{term_generator};
- return $tg if $tg;
-
- $tg = Search::Xapian::TermGenerator->new;
- $tg->set_stemmer($self->stemmer);
-
- $self->{term_generator} = $tg;
-}
-
-sub next_doc_id { $_[0]->{xdb}->get_lastdocid + 1 }
-
-# increments last_thread_id counter
-# returns a 64-bit integer represented as a hex string
-sub next_thread_id {
- my ($self) = @_;
- my $db = $self->{xdb};
- my $last_thread_id = int($db->get_metadata('last_thread_id') || 0);
-
- $db->set_metadata('last_thread_id', ++$last_thread_id);
-
- $last_thread_id;
-}
-
sub ts_range_processor {
$_[0]->{tsrp} ||= Search::Xapian::NumberValueRangeProcessor->new(TS);
}
@@ -334,76 +168,6 @@ sub date_range_processor {
$_[0]->{drp} ||= Search::Xapian::DateValueRangeProcessor->new(TS);
}
-sub link_message {
- my ($self, $smsg, $is_ghost) = @_;
-
- if ($is_ghost) {
- $smsg->ensure_metadata;
- } else {
- $self->link_message_to_parents($smsg);
- }
-}
-
-sub link_message_to_parents {
- my ($self, $smsg) = @_;
- my $doc = $smsg->{doc};
- my $mid = mid_compressed($smsg->mid);
- my $mime = $smsg->mime;
- my $refs = $mime->header_obj->header('References');
- my @refs = $refs ? ($refs =~ /<([^>]+)>/g) : ();
- my $irt = $mime->header_obj->header('In-Reply-To');
- if ($irt) {
- $irt = mid_compressed(mid_clean($irt));
-
- # maybe some crazies will try to make a circular reference:
- if ($irt eq $mid) {
- $irt = undef;
- } else {
- # last References should be $irt
- # we will de-dupe later
- push @refs, $irt;
- }
- }
-
- my $tid;
- if (@refs) {
- my @crefs = map { mid_compressed($_) } @refs;
- my %uniq = ($mid => 1);
-
- # prevent circular references via References: here:
- @refs = ();
- foreach my $ref (@crefs) {
- next if $uniq{$ref};
- $uniq{$ref} = 1;
- push @refs, $ref;
- }
- }
- if (@refs) {
- $doc->add_term(xpfx('inreplyto') . $irt) if defined $irt;
- $smsg->{references_sorted} = '<'.join('><', @refs).'>';
-
- my $ref_pfx = xpfx('references');
-
- # first ref *should* be the thread root,
- # but we can never trust clients to do the right thing
- my $ref = shift @refs;
- $doc->add_term($ref_pfx . $ref);
- $tid = $self->_resolve_mid_to_tid($ref);
-
- # the rest of the refs should point to this tid:
- foreach $ref (@refs) {
- $doc->add_term($ref_pfx . $ref);
- my $ptid = $self->_resolve_mid_to_tid($ref);
- if ($tid ne $ptid) {
- $self->merge_threads($tid, $ptid);
- }
- }
- } else {
- $tid = $self->next_thread_id;
- }
- $doc->add_term(xpfx('thread') . $tid);
-}
-
sub lookup_message {
my ($self, $mid) = @_;
$mid = mid_clean($mid);
@@ -450,47 +214,6 @@ sub find_doc_ids_for_term {
($db->postlist_begin($term), $db->postlist_end($term));
}
-# this will create a ghost as necessary
-sub _resolve_mid_to_tid {
- my ($self, $mid) = @_;
-
- my $smsg = $self->lookup_message($mid) || $self->create_ghost($mid);
- $smsg->thread_id;
-}
-
-sub create_ghost {
- my ($self, $mid, $tid) = @_;
-
- $mid = mid_compressed($mid);
- $tid = $self->next_thread_id unless defined $tid;
-
- my $doc = Search::Xapian::Document->new;
- $doc->add_term(xpfx('mid') . $mid);
- $doc->add_term(xpfx('thread') . $tid);
- $doc->add_term(xpfx('type') . 'ghost');
-
- my $smsg = PublicInbox::SearchMsg->wrap($doc, $mid);
- $self->link_message($smsg, 1);
- $self->{xdb}->add_document($doc);
-
- $smsg;
-}
-
-sub merge_threads {
- my ($self, $winner_tid, $loser_tid) = @_;
- my ($head, $tail) = $self->find_doc_ids('thread', $loser_tid);
- my $thread_pfx = xpfx('thread');
- my $db = $self->{xdb};
-
- for (; $head != $tail; $head->inc) {
- my $docid = $head->get_docid;
- my $doc = $db->get_document($docid);
- $doc->remove_term($thread_pfx . $loser_tid);
- $doc->add_term($thread_pfx . $winner_tid);
- $db->replace_document($docid, $doc);
- }
-}
-
# normalize subjects so they are suitable as pathnames for URLs
sub subject_path {
my $subj = pop;
@@ -509,79 +232,9 @@ sub subject_normalized {
$subj;
}
-sub do_cat_mail {
- my ($git, $blob) = @_;
- my $mime = eval {
- my $str = $git->cat_file($blob);
- Email::MIME->new($str);
- };
- $@ ? undef : $mime;
-}
-
-sub index_blob {
- my ($self, $git, $blob) = @_;
- my $mime = do_cat_mail($git, $blob) or return;
- eval { $self->add_message($mime) };
- warn "W: index_blob $blob: $@\n" if $@;
-}
-
-sub unindex_blob {
- my ($self, $git, $blob) = @_;
- my $mime = do_cat_mail($git, $blob) or return;
- my $mid = $mime->header_obj->header('Message-ID');
- eval { $self->remove_message($mid) } if defined $mid;
- warn "W: unindex_blob $blob: $@\n" if $@;
-}
-
sub enquire {
my ($self) = @_;
$self->{enquire} ||= Search::Xapian::Enquire->new($self->{xdb});
}
-# indexes all unindexed messages
-sub index_sync {
- my ($self, $head) = @_;
- require PublicInbox::GitCatFile;
- my $db = $self->{xdb};
- my $hex = '[a-f0-9]';
- my $h40 = $hex .'{40}';
- my $addmsg = qr!^:000000 100644 \S+ ($h40) A\t${hex}{2}/${hex}{38}$!;
- my $delmsg = qr!^:100644 000000 ($h40) \S+ D\t${hex}{2}/${hex}{38}$!;
- $head ||= 'HEAD';
-
- $db->begin_transaction;
- eval {
- my $git = PublicInbox::GitCatFile->new($self->{git_dir});
-
- my $latest = $db->get_metadata('last_commit');
- my $range = length $latest ? "$latest..$head" : $head;
- $latest = undef;
-
- # get indexed messages
- my @cmd = ('git', "--git-dir=$self->{git_dir}", "log",
- qw/--reverse --no-notes --no-color --raw -r
- --no-abbrev/, $range);
- my $pid = open(my $log, '-|', @cmd) or
- die('open` '.join(' ', @cmd) . " pipe failed: $!\n");
-
- while (my $line = <$log>) {
- if ($line =~ /$addmsg/o) {
- $self->index_blob($git, $1);
- } elsif ($line =~ /$delmsg/o) {
- $self->unindex_blob($git, $1);
- } elsif ($line =~ /^commit ($h40)/o) {
- $latest = $1;
- }
- }
- close $log;
- $db->set_metadata('last_commit', $latest) if defined $latest;
- };
- if ($@) {
- warn "indexing failed: $@\n";
- $db->cancel_transaction;
- } else {
- $db->commit_transaction;
- }
-}
-
1;
diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm
new file mode 100644
index 0000000..408b21f
--- /dev/null
+++ b/lib/PublicInbox/SearchIdx.pm
@@ -0,0 +1,363 @@
+# Copyright (C) 2015, all contributors <meta@public-inbox.org>
+# License: AGPLv3 or later (https://www.gnu.org/licenses/agpl-3.0.txt)
+# based on notmuch, but with no concept of folders, files or flags
+package PublicInbox::SearchIdx;
+use strict;
+use warnings;
+use base qw(PublicInbox::Search);
+use PublicInbox::MID qw/mid_clean mid_compressed/;
+*xpfx = *PublicInbox::Search::xpfx;
+
+sub new {
+ my ($class, $git_dir, $writable) = @_;
+ my $dir = $class->xdir($git_dir);
+ require Search::Xapian::WritableDatabase;
+ my $flag = Search::Xapian::DB_OPEN;
+ if ($writable == 1) {
+ require File::Path;
+ File::Path::mkpath($dir);
+ $flag = Search::Xapian::DB_CREATE_OR_OPEN;
+ }
+ my $db = Search::Xapian::WritableDatabase->new($dir, $flag);
+ bless { xdb => $db, git_dir => $git_dir }, $class;
+}
+
+sub add_message {
+ my ($self, $mime) = @_; # mime = Email::MIME object
+ my $db = $self->{xdb};
+
+ my $doc_id;
+ my $mid_orig = mid_clean($mime->header_obj->header('Message-ID'));
+ my $mid = mid_compressed($mid_orig);
+ my $was_ghost = 0;
+ my $ct_msg = $mime->header('Content-Type') || 'text/plain';
+
+ eval {
+ my $smsg = $self->lookup_message($mid);
+ my $doc;
+
+ if ($smsg) {
+ $smsg->ensure_metadata;
+ # convert a ghost to a regular message
+ # it will also clobber any existing regular message
+ $smsg->mime($mime);
+ $doc = $smsg->{doc};
+
+ my $type = xpfx('type');
+ eval {
+ $doc->remove_term($type . 'ghost');
+ $was_ghost = 1;
+ };
+
+ # probably does not exist:
+ eval { $doc->remove_term($type . 'mail') };
+ $doc->add_term($type . 'mail');
+ } else {
+ $smsg = PublicInbox::SearchMsg->new($mime);
+ $doc = $smsg->{doc};
+ $doc->add_term(xpfx('mid') . $mid);
+ }
+
+ my $subj = $smsg->subject;
+
+ if (length $subj) {
+ $doc->add_term(xpfx('subject') . $subj);
+
+ my $path = $self->subject_path($subj);
+ $doc->add_term(xpfx('path') . mid_compressed($path));
+ }
+
+ my $from = $smsg->from_name;
+ my $date = $smsg->date;
+ my $ts = Search::Xapian::sortable_serialise($smsg->ts);
+ $doc->add_value(PublicInbox::Search::TS, $ts);
+
+ my $tg = $self->term_generator;
+
+ $tg->set_document($doc);
+ $tg->index_text($subj, 1, 'S') if $subj;
+ $tg->increase_termpos;
+ $tg->index_text($subj) if $subj;
+ $tg->increase_termpos;
+
+ $tg->index_text($smsg->from->format);
+ $tg->increase_termpos;
+
+ $mime->walk_parts(sub {
+ my ($part) = @_;
+ return if $part->subparts; # walk_parts already recurses
+ my $ct = $part->content_type || $ct_msg;
+
+ # account for filter bugs...
+ $ct =~ m!\btext/plain\b!i or return;
+
+ my (@orig, @quot);
+ my $body = $part->body;
+ $part->body_set('');
+ my @lines = split(/\n/, $body);
+ while (defined(my $l = shift @lines)) {
+ if ($l =~ /^\s*>/) {
+ push @quot, $l;
+ } else {
+ push @orig, $l;
+ }
+ }
+ if (@quot) {
+ $tg->index_text(join("\n", @quot), 0);
+ @quot = ();
+ $tg->increase_termpos;
+ }
+ if (@orig) {
+ $tg->index_text(join("\n", @orig));
+ @orig = ();
+ $tg->increase_termpos;
+ }
+ });
+
+ if ($was_ghost) {
+ $doc_id = $smsg->doc_id;
+ $self->link_message($smsg, 0);
+ $doc->set_data($smsg->to_doc_data);
+ $db->replace_document($doc_id, $doc);
+ } else {
+ $self->link_message($smsg, 0);
+ $doc->set_data($smsg->to_doc_data);
+ $doc_id = $db->add_document($doc);
+ }
+ };
+
+ if ($@) {
+ warn "failed to index message <$mid_orig>: $@\n";
+ return undef;
+ }
+ $doc_id;
+}
+
+# returns deleted doc_id on success, undef on missing
+sub remove_message {
+ my ($self, $mid_orig) = @_;
+ my $db = $self->{xdb};
+ my $doc_id;
+ $mid_orig = mid_clean($mid_orig);
+ my $mid = mid_compressed($mid_orig);
+
+ eval {
+ $doc_id = $self->find_unique_doc_id('mid', $mid);
+ $db->delete_document($doc_id) if defined $doc_id;
+ };
+
+ if ($@) {
+ warn "failed to remove message <$mid_orig>: $@\n";
+ return undef;
+ }
+ $doc_id;
+}
+
+sub term_generator { # write-only
+ my ($self) = @_;
+
+ my $tg = $self->{term_generator};
+ return $tg if $tg;
+
+ $tg = Search::Xapian::TermGenerator->new;
+ $tg->set_stemmer($self->stemmer);
+
+ $self->{term_generator} = $tg;
+}
+
+sub next_doc_id { $_[0]->{xdb}->get_lastdocid + 1 }
+
+# increments last_thread_id counter
+# returns a 64-bit integer represented as a hex string
+sub next_thread_id {
+ my ($self) = @_;
+ my $db = $self->{xdb};
+ my $last_thread_id = int($db->get_metadata('last_thread_id') || 0);
+
+ $db->set_metadata('last_thread_id', ++$last_thread_id);
+
+ $last_thread_id;
+}
+
+sub link_message {
+ my ($self, $smsg, $is_ghost) = @_;
+
+ if ($is_ghost) {
+ $smsg->ensure_metadata;
+ } else {
+ $self->link_message_to_parents($smsg);
+ }
+}
+
+sub link_message_to_parents {
+ my ($self, $smsg) = @_;
+ my $doc = $smsg->{doc};
+ my $mid = mid_compressed($smsg->mid);
+ my $mime = $smsg->mime;
+ my $refs = $mime->header_obj->header('References');
+ my @refs = $refs ? ($refs =~ /<([^>]+)>/g) : ();
+ my $irt = $mime->header_obj->header('In-Reply-To');
+ if ($irt) {
+ $irt = mid_compressed(mid_clean($irt));
+
+ # maybe some crazies will try to make a circular reference:
+ if ($irt eq $mid) {
+ $irt = undef;
+ } else {
+ # last References should be $irt
+ # we will de-dupe later
+ push @refs, $irt;
+ }
+ }
+
+ my $tid;
+ if (@refs) {
+ my @crefs = map { mid_compressed($_) } @refs;
+ my %uniq = ($mid => 1);
+
+ # prevent circular references via References: here:
+ @refs = ();
+ foreach my $ref (@crefs) {
+ next if $uniq{$ref};
+ $uniq{$ref} = 1;
+ push @refs, $ref;
+ }
+ }
+ if (@refs) {
+ $doc->add_term(xpfx('inreplyto') . $irt) if defined $irt;
+ $smsg->{references_sorted} = '<'.join('><', @refs).'>';
+
+ my $ref_pfx = xpfx('references');
+
+ # first ref *should* be the thread root,
+ # but we can never trust clients to do the right thing
+ my $ref = shift @refs;
+ $doc->add_term($ref_pfx . $ref);
+ $tid = $self->_resolve_mid_to_tid($ref);
+
+ # the rest of the refs should point to this tid:
+ foreach $ref (@refs) {
+ $doc->add_term($ref_pfx . $ref);
+ my $ptid = $self->_resolve_mid_to_tid($ref);
+ if ($tid ne $ptid) {
+ $self->merge_threads($tid, $ptid);
+ }
+ }
+ } else {
+ $tid = $self->next_thread_id;
+ }
+ $doc->add_term(xpfx('thread') . $tid);
+}
+
+sub index_blob {
+ my ($self, $git, $blob) = @_;
+ my $mime = do_cat_mail($git, $blob) or return;
+ eval { $self->add_message($mime) };
+ warn "W: index_blob $blob: $@\n" if $@;
+}
+
+sub unindex_blob {
+ my ($self, $git, $blob) = @_;
+ my $mime = do_cat_mail($git, $blob) or return;
+ my $mid = $mime->header_obj->header('Message-ID');
+ eval { $self->remove_message($mid) } if defined $mid;
+ warn "W: unindex_blob $blob: $@\n" if $@;
+}
+
+sub do_cat_mail {
+ my ($git, $blob) = @_;
+ my $mime = eval {
+ my $str = $git->cat_file($blob);
+ Email::MIME->new($str);
+ };
+ $@ ? undef : $mime;
+}
+
+# indexes all unindexed messages
+sub index_sync {
+ my ($self, $head) = @_;
+ require PublicInbox::GitCatFile;
+ my $db = $self->{xdb};
+ my $hex = '[a-f0-9]';
+ my $h40 = $hex .'{40}';
+ my $addmsg = qr!^:000000 100644 \S+ ($h40) A\t${hex}{2}/${hex}{38}$!;
+ my $delmsg = qr!^:100644 000000 ($h40) \S+ D\t${hex}{2}/${hex}{38}$!;
+ $head ||= 'HEAD';
+
+ $db->begin_transaction;
+ eval {
+ my $git = PublicInbox::GitCatFile->new($self->{git_dir});
+
+ my $latest = $db->get_metadata('last_commit');
+ my $range = length $latest ? "$latest..$head" : $head;
+ $latest = undef;
+
+ # get indexed messages
+ my @cmd = ('git', "--git-dir=$self->{git_dir}", "log",
+ qw/--reverse --no-notes --no-color --raw -r
+ --no-abbrev/, $range);
+ my $pid = open(my $log, '-|', @cmd) or
+ die('open` '.join(' ', @cmd) . " pipe failed: $!\n");
+
+ while (my $line = <$log>) {
+ if ($line =~ /$addmsg/o) {
+ $self->index_blob($git, $1);
+ } elsif ($line =~ /$delmsg/o) {
+ $self->unindex_blob($git, $1);
+ } elsif ($line =~ /^commit ($h40)/o) {
+ $latest = $1;
+ }
+ }
+ close $log;
+ $db->set_metadata('last_commit', $latest) if defined $latest;
+ };
+ if ($@) {
+ warn "indexing failed: $@\n";
+ $db->cancel_transaction;
+ } else {
+ $db->commit_transaction;
+ }
+}
+
+# this will create a ghost as necessary
+sub _resolve_mid_to_tid {
+ my ($self, $mid) = @_;
+
+ my $smsg = $self->lookup_message($mid) || $self->create_ghost($mid);
+ $smsg->thread_id;
+}
+
+sub create_ghost {
+ my ($self, $mid, $tid) = @_;
+
+ $mid = mid_compressed($mid);
+ $tid = $self->next_thread_id unless defined $tid;
+
+ my $doc = Search::Xapian::Document->new;
+ $doc->add_term(xpfx('mid') . $mid);
+ $doc->add_term(xpfx('thread') . $tid);
+ $doc->add_term(xpfx('type') . 'ghost');
+
+ my $smsg = PublicInbox::SearchMsg->wrap($doc, $mid);
+ $self->link_message($smsg, 1);
+ $self->{xdb}->add_document($doc);
+
+ $smsg;
+}
+
+sub merge_threads {
+ my ($self, $winner_tid, $loser_tid) = @_;
+ my ($head, $tail) = $self->find_doc_ids('thread', $loser_tid);
+ my $thread_pfx = xpfx('thread');
+ my $db = $self->{xdb};
+
+ for (; $head != $tail; $head->inc) {
+ my $docid = $head->get_docid;
+ my $doc = $db->get_document($docid);
+ $doc->remove_term($thread_pfx . $loser_tid);
+ $doc->add_term($thread_pfx . $winner_tid);
+ $db->replace_document($docid, $doc);
+ }
+}
+
+1;
diff --git a/public-inbox-index b/public-inbox-index
index 1104bbc..f39ad9e 100755
--- a/public-inbox-index
+++ b/public-inbox-index
@@ -10,7 +10,7 @@ use strict;
use warnings;
my $usage = "public-inbox-index GIT_DIR";
use PublicInbox::Config;
-eval { require PublicInbox::Search };
+eval { require PublicInbox::SearchIdx };
if ($@) {
print STDERR "Search::Xapian required for $0\n";
exit 1;
@@ -26,6 +26,6 @@ foreach my $dir (@ARGV) {
sub index_dir {
my ($git_dir) = @_;
-d $git_dir or die "$git_dir does not appear to be a git repository\n";
- my $s = PublicInbox::Search->new($git_dir, 1);
+ my $s = PublicInbox::SearchIdx->new($git_dir, 1);
$s->index_sync;
}
diff --git a/public-inbox-learn b/public-inbox-learn
index bd59247..c89ffb5 100755
--- a/public-inbox-learn
+++ b/public-inbox-learn
@@ -78,9 +78,9 @@ foreach my $recipient (keys %dests) {
}
$err or eval {
- require PublicInbox::Search;
+ require PublicInbox::SearchIdx;
umask 0022; # XXX FIXME use git config core.sharedRepository
- my $s = PublicInbox::Search->new($git_dir, 2);
+ my $s = PublicInbox::SearchIdx->new($git_dir, 2);
$s->index_sync;
};
}
diff --git a/public-inbox-mda b/public-inbox-mda
index a3c959a..630ffcb 100755
--- a/public-inbox-mda
+++ b/public-inbox-mda
@@ -90,7 +90,7 @@ sub search_index_sync {
eval {
require PublicInbox::Search;
umask 0022; # XXX FIXME use git config core.sharedRepository
- my $s = PublicInbox::Search->new($git_dir, 2);
+ my $s = PublicInbox::SearchIdx->new($git_dir, 2);
$s->index_sync;
};
}
diff --git a/t/search.t b/t/search.t
index 89a2862..be39410 100644
--- a/t/search.t
+++ b/t/search.t
@@ -3,7 +3,7 @@
use strict;
use warnings;
use Test::More;
-eval { require PublicInbox::Search; };
+eval { require PublicInbox::SearchIdx; };
plan skip_all => "Xapian missing for search" if $@;
use File::Temp qw/tempdir/;
use Email::MIME;
@@ -16,11 +16,11 @@ is(0, system(qw(git init -q --bare), $git_dir), "git init (main)");
eval { PublicInbox::Search->new($git_dir) };
ok($@, "exception raised on non-existent DB");
-my $rw = PublicInbox::Search->new($git_dir, 1);
+my $rw = PublicInbox::SearchIdx->new($git_dir, 1);
my $ro = PublicInbox::Search->new($git_dir);
my $rw_commit = sub {
$rw = undef;
- $rw = PublicInbox::Search->new($git_dir, 1);
+ $rw = PublicInbox::SearchIdx->new($git_dir, 1);
};
{
--
EW
^ permalink raw reply related [flat|nested] 6+ messages in thread
* [PATCH 2/6] view: misc cleanups and simplifications
2015-08-22 11:41 [PATCH 1/6] search: split search indexing to a separate file Eric Wong
@ 2015-08-22 11:41 ` Eric Wong
2015-08-22 11:41 ` [PATCH 3/6] view: reference total followups Eric Wong
` (3 subsequent siblings)
4 siblings, 0 replies; 6+ messages in thread
From: Eric Wong @ 2015-08-22 11:41 UTC (permalink / raw)
To: meta
Less code should be easier to read.
---
lib/PublicInbox/View.pm | 22 +++++++---------------
1 file changed, 7 insertions(+), 15 deletions(-)
diff --git a/lib/PublicInbox/View.pm b/lib/PublicInbox/View.pm
index 4e01507..922e8e9 100644
--- a/lib/PublicInbox/View.pm
+++ b/lib/PublicInbox/View.pm
@@ -93,8 +93,7 @@ sub index_entry {
unless (defined $ts) {
$ts = msg_timestamp($mime);
}
- my $fmt = '%Y-%m-%d %H:%M';
- $ts = POSIX::strftime($fmt, gmtime($ts));
+ $ts = POSIX::strftime('%Y-%m-%d %H:%M', gmtime($ts));
my $rv = "<table\nsummary=l$level><tr>";
if ($level) {
@@ -167,14 +166,8 @@ sub emit_thread_html {
}
my $final_anchor = $state->[3];
my $next = "<a\nid=\"s$final_anchor\">";
-
- if ($final_anchor == 1) {
- $next .= 'only message in thread';
- } else {
- $next .= 'end of thread';
- }
- $next .= "</a>, back to <a\nhref=\"../\">index</a>\n";
-
+ $next .= $final_anchor == 1 ? 'only message in' : 'end of';
+ $next .= " thread</a>, back to <a\nhref=\"../\">index</a>\n";
$fh->write("<hr />" . PRE_WRAP . $next . $foot .
"</pre></body></html>");
$fh->close;
@@ -258,9 +251,9 @@ sub flush_quote {
if ($full_pfx) {
if (!$final && scalar(@$quot) <= MAX_INLINE_QUOTED) {
# show quote inline
- my $rv = join("\n", map { linkify($_); $_ } @$quot);
+ my $rv = join('', map { linkify($_); $_ } @$quot);
@$quot = ();
- return $rv . "\n";
+ return $rv;
}
# show a short snippet of quoted text and link to full version:
@@ -286,7 +279,7 @@ sub flush_quote {
# short version (see above)
my $nr = ++$$n;
my $rv = "<a\nid=q${part_nr}_$nr></a>";
- $rv .= join("\n", map { linkify($_); $_ } @$quot) . "\n";
+ $rv .= join('', map { linkify($_); $_ } @$quot);
@$quot = ();
$rv;
}
@@ -309,7 +302,7 @@ sub add_text_body {
$part->body_set('');
$s = $enc->decode($s);
$s = ascii_html($s);
- my @lines = split(/\n/, $s);
+ my @lines = split(/^/m, $s);
$s = '';
if ($$part_nr > 0) {
@@ -330,7 +323,6 @@ sub add_text_body {
# regular line, OK
linkify($cur);
$s .= $cur;
- $s .= "\n";
} else {
push @quot, $cur;
}
--
EW
^ permalink raw reply related [flat|nested] 6+ messages in thread
* [PATCH 3/6] view: reference total followups
2015-08-22 11:41 [PATCH 1/6] search: split search indexing to a separate file Eric Wong
2015-08-22 11:41 ` [PATCH 2/6] view: misc cleanups and simplifications Eric Wong
@ 2015-08-22 11:41 ` Eric Wong
2015-08-22 11:41 ` [PATCH 4/6] search: consistently pass options and flags Eric Wong
` (2 subsequent siblings)
4 siblings, 0 replies; 6+ messages in thread
From: Eric Wong @ 2015-08-22 11:41 UTC (permalink / raw)
To: meta
In case there are huge threads, readers should know about them
even though we currently lack the navigation to display them.
---
lib/PublicInbox/View.pm | 7 ++++++-
1 file changed, 6 insertions(+), 1 deletion(-)
diff --git a/lib/PublicInbox/View.pm b/lib/PublicInbox/View.pm
index 922e8e9..e333906 100644
--- a/lib/PublicInbox/View.pm
+++ b/lib/PublicInbox/View.pm
@@ -437,7 +437,12 @@ sub html_footer {
"threadlink</a>$idx";
my $res = $srch->get_followups($mid);
if (my $c = $res->{total}) {
- $c = $c == 1 ? '1 followup' : "$c followups";
+ my $nr = scalar @{$res->{msgs}};
+ if ($nr < $c) {
+ $c = "$nr of $c followups";
+ } else {
+ $c = $c == 1 ? '1 followup' : "$c followups";
+ }
$idx .= "\n$c:\n";
$res->{srch} = $srch;
thread_followups(\$idx, $mime, $res);
--
EW
^ permalink raw reply related [flat|nested] 6+ messages in thread
* [PATCH 4/6] search: consistently pass options and flags
2015-08-22 11:41 [PATCH 1/6] search: split search indexing to a separate file Eric Wong
2015-08-22 11:41 ` [PATCH 2/6] view: misc cleanups and simplifications Eric Wong
2015-08-22 11:41 ` [PATCH 3/6] view: reference total followups Eric Wong
@ 2015-08-22 11:41 ` Eric Wong
2015-08-22 11:41 ` [PATCH 5/6] mbox: support uncompressed mbox Eric Wong
2015-08-22 11:41 ` [PATCH 6/6] view: wire up mbox.gz links Eric Wong
4 siblings, 0 replies; 6+ messages in thread
From: Eric Wong @ 2015-08-22 11:41 UTC (permalink / raw)
To: meta
Most of our special query functions require exact matches, so none
of the flags we normally use are necessary for query parsing.
---
lib/PublicInbox/Search.pm | 9 ++++-----
1 file changed, 4 insertions(+), 5 deletions(-)
diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm
index 753f5f3..c61d4cf 100644
--- a/lib/PublicInbox/Search.pm
+++ b/lib/PublicInbox/Search.pm
@@ -82,7 +82,7 @@ sub query {
sub get_subject_path {
my ($self, $path, $opts) = @_;
my $query = $self->qp->parse_query("path:".mid_compressed($path), 0);
- $self->do_enquire($query);
+ $self->do_enquire($query, $opts);
}
# given a message ID, get followups to a message
@@ -94,8 +94,7 @@ sub get_followups {
my $irt = $qp->parse_query("inreplyto:$mid", 0);
my $ref = $qp->parse_query("references:$mid", 0);
my $query = Search::Xapian::Query->new(OP_OR, $irt, $ref);
-
- $self->do_enquire($query);
+ $self->do_enquire($query, $opts);
}
sub get_thread {
@@ -104,8 +103,8 @@ sub get_thread {
return { total => 0, msgs => [] } unless $smsg;
my $qp = $self->qp;
- my $qtid = $qp->parse_query('thread:'.$smsg->thread_id);
- my $qsub = $qp->parse_query('path:'.mid_compressed($smsg->path));
+ my $qtid = $qp->parse_query('thread:'.$smsg->thread_id, 0);
+ my $qsub = $qp->parse_query('path:'.mid_compressed($smsg->path), 0);
my $query = Search::Xapian::Query->new(OP_OR, $qtid, $qsub);
$self->do_enquire($query, $opts);
}
--
EW
^ permalink raw reply related [flat|nested] 6+ messages in thread
* [PATCH 5/6] mbox: support uncompressed mbox
2015-08-22 11:41 [PATCH 1/6] search: split search indexing to a separate file Eric Wong
` (2 preceding siblings ...)
2015-08-22 11:41 ` [PATCH 4/6] search: consistently pass options and flags Eric Wong
@ 2015-08-22 11:41 ` Eric Wong
2015-08-22 11:41 ` [PATCH 6/6] view: wire up mbox.gz links Eric Wong
4 siblings, 0 replies; 6+ messages in thread
From: Eric Wong @ 2015-08-22 11:41 UTC (permalink / raw)
To: meta
Some folks may want to view the mbox inline as a string of raw text,
when guessing URLs. Let them do this...
---
lib/PublicInbox/Mbox.pm | 18 +++++++++++-------
lib/PublicInbox/WWW.pm | 12 +++++++-----
2 files changed, 18 insertions(+), 12 deletions(-)
diff --git a/lib/PublicInbox/Mbox.pm b/lib/PublicInbox/Mbox.pm
index 5f5612a..d49e9b3 100644
--- a/lib/PublicInbox/Mbox.pm
+++ b/lib/PublicInbox/Mbox.pm
@@ -7,10 +7,10 @@ use warnings;
use PublicInbox::MID qw/mid_compressed mid2path/;
sub thread_mbox {
- my ($ctx, $srch) = @_;
+ my ($ctx, $srch, $sfx) = @_;
sub {
my ($response) = @_; # Plack callback
- emit_mbox($response, $ctx, $srch);
+ emit_mbox($response, $ctx, $srch, $sfx);
}
}
@@ -38,14 +38,18 @@ sub emit_msg {
}
sub emit_mbox {
- my ($response, $ctx, $srch) = @_;
- eval { require IO::Compress::Gzip };
- return need_gzip($response) if $@;
+ my ($response, $ctx, $srch, $sfx) = @_;
+ my $type = 'mbox';
+ if ($sfx) {
+ eval { require IO::Compress::Gzip };
+ return need_gzip($response) if $@;
+ $type = 'gzip';
+ }
# http://www.iana.org/assignments/media-types/application/gzip
# http://www.iana.org/assignments/media-types/application/mbox
- my $fh = $response->([200, ['Content-Type' => 'application/gzip']]);
- $fh = PublicInbox::MboxGz->new($fh);
+ my $fh = $response->([200, ['Content-Type' => "application/$type"]]);
+ $fh = PublicInbox::MboxGz->new($fh) if $sfx;
require PublicInbox::GitCatFile;
require Email::Simple;
diff --git a/lib/PublicInbox/WWW.pm b/lib/PublicInbox/WWW.pm
index 30a7a43..33c7110 100644
--- a/lib/PublicInbox/WWW.pm
+++ b/lib/PublicInbox/WWW.pm
@@ -53,9 +53,10 @@ sub run {
} elsif ($path_info =~ m!$LISTNAME_RE/t/(\S+)\.html\z!o) {
invalid_list_mid(\%ctx, $1, $2) || get_thread(\%ctx, $cgi);
- } elsif ($path_info =~ m!$LISTNAME_RE/t/(\S+)\.mbox\.gz!o) {
+ } elsif ($path_info =~ m!$LISTNAME_RE/t/(\S+)\.mbox(\.gz)?\z!o) {
my $sfx = $3;
- invalid_list_mid(\%ctx, $1, $2) || get_thread_mbox(\%ctx, $cgi);
+ invalid_list_mid(\%ctx, $1, $2) ||
+ get_thread_mbox(\%ctx, $cgi, $sfx);
} elsif ($path_info =~ m!$LISTNAME_RE/f/\S+\.txt\z!o) {
invalid_list_mid(\%ctx, $1, $2) ||
@@ -331,15 +332,16 @@ sub msg_pfx {
"../f/$href.html";
}
-# /$LISTNAME/t/$MESSAGE_ID.mbox.gz -> search results as gzipped mbox
+# /$LISTNAME/t/$MESSAGE_ID.mbox -> thread as mbox
+# /$LISTNAME/t/$MESSAGE_ID.mbox.gz -> thread as gzipped mbox
# note: I'm not a big fan of other compression formats since they're
# significantly more expensive on CPU than gzip and less-widely available,
# especially on older systems. Stick to zlib since that's what git uses.
sub get_thread_mbox {
- my ($ctx, $cgi) = @_;
+ my ($ctx, $cgi, $sfx) = @_;
my $srch = searcher($ctx) or return need_search($ctx);
require PublicInbox::Mbox;
- PublicInbox::Mbox::thread_mbox($ctx, $srch);
+ PublicInbox::Mbox::thread_mbox($ctx, $srch, $sfx);
}
1;
--
EW
^ permalink raw reply related [flat|nested] 6+ messages in thread
* [PATCH 6/6] view: wire up mbox.gz links
2015-08-22 11:41 [PATCH 1/6] search: split search indexing to a separate file Eric Wong
` (3 preceding siblings ...)
2015-08-22 11:41 ` [PATCH 5/6] mbox: support uncompressed mbox Eric Wong
@ 2015-08-22 11:41 ` Eric Wong
4 siblings, 0 replies; 6+ messages in thread
From: Eric Wong @ 2015-08-22 11:41 UTC (permalink / raw)
To: meta
To reduce clutter, we will not link to uncompressed versions.
Users should be able to download entire threads for offline
reading; enable this feature for them.
---
lib/PublicInbox/View.pm | 2 ++
1 file changed, 2 insertions(+)
diff --git a/lib/PublicInbox/View.pm b/lib/PublicInbox/View.pm
index e333906..fe37a9f 100644
--- a/lib/PublicInbox/View.pm
+++ b/lib/PublicInbox/View.pm
@@ -168,6 +168,8 @@ sub emit_thread_html {
my $next = "<a\nid=\"s$final_anchor\">";
$next .= $final_anchor == 1 ? 'only message in' : 'end of';
$next .= " thread</a>, back to <a\nhref=\"../\">index</a>\n";
+ $mid = PublicInbox::Hval->new_msgid($mid)->as_href;
+ $next .= "download: <a\nhref=\"$mid.mbox.gz\">mbox.gz</a>\n\n";
$fh->write("<hr />" . PRE_WRAP . $next . $foot .
"</pre></body></html>");
$fh->close;
--
EW
^ permalink raw reply related [flat|nested] 6+ messages in thread
end of thread, other threads:[~2015-08-22 11:41 UTC | newest]
Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-08-22 11:41 [PATCH 1/6] search: split search indexing to a separate file Eric Wong
2015-08-22 11:41 ` [PATCH 2/6] view: misc cleanups and simplifications Eric Wong
2015-08-22 11:41 ` [PATCH 3/6] view: reference total followups Eric Wong
2015-08-22 11:41 ` [PATCH 4/6] search: consistently pass options and flags Eric Wong
2015-08-22 11:41 ` [PATCH 5/6] mbox: support uncompressed mbox Eric Wong
2015-08-22 11:41 ` [PATCH 6/6] view: wire up mbox.gz links Eric Wong
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).