unofficial mirror of meta@public-inbox.org
 help / color / mirror / Atom feed
* [PATCH 1/6] search: split search indexing to a separate file
@ 2015-08-22 11:41 Eric Wong
  2015-08-22 11:41 ` [PATCH 2/6] view: misc cleanups and simplifications Eric Wong
                   ` (4 more replies)
  0 siblings, 5 replies; 6+ messages in thread
From: Eric Wong @ 2015-08-22 11:41 UTC (permalink / raw)
  To: meta

This makes organization easier and reduces the amount of code
loaded for a PSGI, mod_perl or CGI instance.
---
 lib/PublicInbox/Search.pm    | 363 +------------------------------------------
 lib/PublicInbox/SearchIdx.pm | 363 +++++++++++++++++++++++++++++++++++++++++++
 public-inbox-index           |   4 +-
 public-inbox-learn           |   4 +-
 public-inbox-mda             |   2 +-
 t/search.t                   |   6 +-
 6 files changed, 379 insertions(+), 363 deletions(-)
 create mode 100644 lib/PublicInbox/SearchIdx.pm

diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm
index 580b79f..753f5f3 100644
--- a/lib/PublicInbox/Search.pm
+++ b/lib/PublicInbox/Search.pm
@@ -57,160 +57,20 @@ while (my ($k, $v) = each %all_pfx) {
 
 my $mail_query = Search::Xapian::Query->new(xpfx('type') . 'mail');
 
+sub xdir {
+	my (undef, $git_dir) = @_;
+	"$git_dir/public-inbox/xapian" . SCHEMA_VERSION;
+}
+
 sub new {
-	my ($class, $git_dir, $writable) = @_;
-	# allow concurrent versions for easier rollback:
-	my $dir = "$git_dir/public-inbox/xapian" . SCHEMA_VERSION;
-	my $db;
-
-	if ($writable) { # not used by the WWW interface
-		require Search::Xapian::WritableDatabase;
-		my $flag = Search::Xapian::DB_OPEN;
-		if ($writable == 1) {
-			require File::Path;
-			File::Path::mkpath($dir);
-			$flag = Search::Xapian::DB_CREATE_OR_OPEN;
-		}
-		$db = Search::Xapian::WritableDatabase->new($dir, $flag);
-	} else {
-		$db = Search::Xapian::Database->new($dir);
-	}
+	my ($class, $git_dir) = @_;
+	my $dir = $class->xdir($git_dir);
+	my $db = Search::Xapian::Database->new($dir);
 	bless { xdb => $db, git_dir => $git_dir }, $class;
 }
 
 sub reopen { $_[0]->{xdb}->reopen }
 
-sub add_message {
-	my ($self, $mime) = @_; # mime = Email::MIME object
-	my $db = $self->{xdb};
-
-	my $doc_id;
-	my $mid_orig = mid_clean($mime->header_obj->header('Message-ID'));
-	my $mid = mid_compressed($mid_orig);
-	my $was_ghost = 0;
-	my $ct_msg = $mime->header('Content-Type') || 'text/plain';
-
-	eval {
-		my $smsg = $self->lookup_message($mid);
-		my $doc;
-
-		if ($smsg) {
-			$smsg->ensure_metadata;
-			# convert a ghost to a regular message
-			# it will also clobber any existing regular message
-			$smsg->mime($mime);
-			$doc = $smsg->{doc};
-
-			my $type = xpfx('type');
-			eval {
-				$doc->remove_term($type . 'ghost');
-				$was_ghost = 1;
-			};
-
-			# probably does not exist:
-			eval { $doc->remove_term($type . 'mail') };
-			$doc->add_term($type . 'mail');
-		}  else {
-			$smsg = PublicInbox::SearchMsg->new($mime);
-			$doc = $smsg->{doc};
-			$doc->add_term(xpfx('mid') . $mid);
-		}
-
-		my $subj = $smsg->subject;
-
-		if (length $subj) {
-			$doc->add_term(xpfx('subject') . $subj);
-
-			my $path = subject_path($subj);
-			$doc->add_term(xpfx('path') . mid_compressed($path));
-		}
-
-		my $from = $smsg->from_name;
-		my $date = $smsg->date;
-		my $ts = Search::Xapian::sortable_serialise($smsg->ts);
-		$doc->add_value(PublicInbox::Search::TS, $ts);
-
-		my $tg = $self->term_generator;
-
-		$tg->set_document($doc);
-		$tg->index_text($subj, 1, 'S') if $subj;
-		$tg->increase_termpos;
-		$tg->index_text($subj) if $subj;
-		$tg->increase_termpos;
-
-		$tg->index_text($smsg->from->format);
-		$tg->increase_termpos;
-
-		$mime->walk_parts(sub {
-			my ($part) = @_;
-			return if $part->subparts; # walk_parts already recurses
-			my $ct = $part->content_type || $ct_msg;
-
-			# account for filter bugs...
-			$ct =~ m!\btext/plain\b!i or return;
-
-			my (@orig, @quot);
-			my $body = $part->body;
-			$part->body_set('');
-			my @lines = split(/\n/, $body);
-			while (defined(my $l = shift @lines)) {
-				if ($l =~ /^\s*>/) {
-					push @quot, $l;
-				} else {
-					push @orig, $l;
-				}
-			}
-			if (@quot) {
-				$tg->index_text(join("\n", @quot), 0);
-				@quot = ();
-				$tg->increase_termpos;
-			}
-			if (@orig) {
-				$tg->index_text(join("\n", @orig));
-				@orig = ();
-				$tg->increase_termpos;
-			}
-		});
-
-		if ($was_ghost) {
-			$doc_id = $smsg->doc_id;
-			$self->link_message($smsg, 0);
-			$doc->set_data($smsg->to_doc_data);
-			$db->replace_document($doc_id, $doc);
-		} else {
-			$self->link_message($smsg, 0);
-			$doc->set_data($smsg->to_doc_data);
-			$doc_id = $db->add_document($doc);
-		}
-	};
-
-	if ($@) {
-		warn "failed to index message <$mid_orig>: $@\n";
-		return undef;
-	}
-	$doc_id;
-}
-
-# returns deleted doc_id on success, undef on missing
-sub remove_message {
-	my ($self, $mid_orig) = @_;
-	my $db = $self->{xdb};
-	my $doc_id;
-	$mid_orig = mid_clean($mid_orig);
-	my $mid = mid_compressed($mid_orig);
-
-	eval {
-		$doc_id = $self->find_unique_doc_id('mid', $mid);
-		$db->delete_document($doc_id) if defined $doc_id;
-	};
-
-	if ($@) {
-		warn "failed to remove message <$mid_orig>: $@\n";
-		return undef;
-	}
-	$doc_id;
-}
-
 # read-only
 sub query {
 	my ($self, $query_string, $opts) = @_;
@@ -300,32 +160,6 @@ sub qp {
 	$self->{query_parser} = $qp;
 }
 
-sub term_generator { # write-only
-	my ($self) = @_;
-
-	my $tg = $self->{term_generator};
-	return $tg if $tg;
-
-	$tg = Search::Xapian::TermGenerator->new;
-	$tg->set_stemmer($self->stemmer);
-
-	$self->{term_generator} = $tg;
-}
-
-sub next_doc_id { $_[0]->{xdb}->get_lastdocid + 1 }
-
-# increments last_thread_id counter
-# returns a 64-bit integer represented as a hex string
-sub next_thread_id {
-	my ($self) = @_;
-	my $db = $self->{xdb};
-	my $last_thread_id = int($db->get_metadata('last_thread_id') || 0);
-
-	$db->set_metadata('last_thread_id', ++$last_thread_id);
-
-	$last_thread_id;
-}
-
 sub ts_range_processor {
 	$_[0]->{tsrp} ||= Search::Xapian::NumberValueRangeProcessor->new(TS);
 }
@@ -334,76 +168,6 @@ sub date_range_processor {
 	$_[0]->{drp} ||= Search::Xapian::DateValueRangeProcessor->new(TS);
 }
 
-sub link_message {
-	my ($self, $smsg, $is_ghost) = @_;
-
-	if ($is_ghost) {
-		$smsg->ensure_metadata;
-	} else {
-		$self->link_message_to_parents($smsg);
-	}
-}
-
-sub link_message_to_parents {
-	my ($self, $smsg) = @_;
-	my $doc = $smsg->{doc};
-	my $mid = mid_compressed($smsg->mid);
-	my $mime = $smsg->mime;
-	my $refs = $mime->header_obj->header('References');
-	my @refs = $refs ? ($refs =~ /<([^>]+)>/g) : ();
-	my $irt = $mime->header_obj->header('In-Reply-To');
-	if ($irt) {
-		$irt = mid_compressed(mid_clean($irt));
-
-		# maybe some crazies will try to make a circular reference:
-		if ($irt eq $mid) {
-			$irt = undef;
-		} else {
-			# last References should be $irt
-			# we will de-dupe later
-			push @refs, $irt;
-		}
-	}
-
-	my $tid;
-	if (@refs) {
-		my @crefs = map { mid_compressed($_) } @refs;
-		my %uniq = ($mid => 1);
-
-		# prevent circular references via References: here:
-		@refs = ();
-		foreach my $ref (@crefs) {
-			next if $uniq{$ref};
-			$uniq{$ref} = 1;
-			push @refs, $ref;
-		}
-	}
-	if (@refs) {
-		$doc->add_term(xpfx('inreplyto') . $irt) if defined $irt;
-		$smsg->{references_sorted} = '<'.join('><', @refs).'>';
-
-		my $ref_pfx = xpfx('references');
-
-		# first ref *should* be the thread root,
-		# but we can never trust clients to do the right thing
-		my $ref = shift @refs;
-		$doc->add_term($ref_pfx . $ref);
-		$tid = $self->_resolve_mid_to_tid($ref);
-
-		# the rest of the refs should point to this tid:
-		foreach $ref (@refs) {
-			$doc->add_term($ref_pfx . $ref);
-			my $ptid = $self->_resolve_mid_to_tid($ref);
-			if ($tid ne $ptid) {
-				$self->merge_threads($tid, $ptid);
-			}
-		}
-	} else {
-		$tid = $self->next_thread_id;
-	}
-	$doc->add_term(xpfx('thread') . $tid);
-}
-
 sub lookup_message {
 	my ($self, $mid) = @_;
 	$mid = mid_clean($mid);
@@ -450,47 +214,6 @@ sub find_doc_ids_for_term {
 	($db->postlist_begin($term), $db->postlist_end($term));
 }
 
-# this will create a ghost as necessary
-sub _resolve_mid_to_tid {
-	my ($self, $mid) = @_;
-
-	my $smsg = $self->lookup_message($mid) || $self->create_ghost($mid);
-	$smsg->thread_id;
-}
-
-sub create_ghost {
-	my ($self, $mid, $tid) = @_;
-
-	$mid = mid_compressed($mid);
-	$tid = $self->next_thread_id unless defined $tid;
-
-	my $doc = Search::Xapian::Document->new;
-	$doc->add_term(xpfx('mid') . $mid);
-	$doc->add_term(xpfx('thread') . $tid);
-	$doc->add_term(xpfx('type') . 'ghost');
-
-	my $smsg = PublicInbox::SearchMsg->wrap($doc, $mid);
-	$self->link_message($smsg, 1);
-	$self->{xdb}->add_document($doc);
-
-	$smsg;
-}
-
-sub merge_threads {
-	my ($self, $winner_tid, $loser_tid) = @_;
-	my ($head, $tail) = $self->find_doc_ids('thread', $loser_tid);
-	my $thread_pfx = xpfx('thread');
-	my $db = $self->{xdb};
-
-	for (; $head != $tail; $head->inc) {
-		my $docid = $head->get_docid;
-		my $doc = $db->get_document($docid);
-		$doc->remove_term($thread_pfx . $loser_tid);
-		$doc->add_term($thread_pfx . $winner_tid);
-		$db->replace_document($docid, $doc);
-	}
-}
-
 # normalize subjects so they are suitable as pathnames for URLs
 sub subject_path {
 	my $subj = pop;
@@ -509,79 +232,9 @@ sub subject_normalized {
 	$subj;
 }
 
-sub do_cat_mail {
-	my ($git, $blob) = @_;
-	my $mime = eval {
-		my $str = $git->cat_file($blob);
-		Email::MIME->new($str);
-	};
-	$@ ? undef : $mime;
-}
-
-sub index_blob {
-	my ($self, $git, $blob) = @_;
-	my $mime = do_cat_mail($git, $blob) or return;
-	eval { $self->add_message($mime) };
-	warn "W: index_blob $blob: $@\n" if $@;
-}
-
-sub unindex_blob {
-	my ($self, $git, $blob) = @_;
-	my $mime = do_cat_mail($git, $blob) or return;
-	my $mid = $mime->header_obj->header('Message-ID');
-	eval { $self->remove_message($mid) } if defined $mid;
-	warn "W: unindex_blob $blob: $@\n" if $@;
-}
-
 sub enquire {
 	my ($self) = @_;
 	$self->{enquire} ||= Search::Xapian::Enquire->new($self->{xdb});
 }
 
-# indexes all unindexed messages
-sub index_sync {
-	my ($self, $head) = @_;
-	require PublicInbox::GitCatFile;
-	my $db = $self->{xdb};
-	my $hex = '[a-f0-9]';
-	my $h40 = $hex .'{40}';
-	my $addmsg = qr!^:000000 100644 \S+ ($h40) A\t${hex}{2}/${hex}{38}$!;
-	my $delmsg = qr!^:100644 000000 ($h40) \S+ D\t${hex}{2}/${hex}{38}$!;
-	$head ||= 'HEAD';
-
-	$db->begin_transaction;
-	eval {
-		my $git = PublicInbox::GitCatFile->new($self->{git_dir});
-
-		my $latest = $db->get_metadata('last_commit');
-		my $range = length $latest ? "$latest..$head" : $head;
-		$latest = undef;
-
-		# get indexed messages
-		my @cmd = ('git', "--git-dir=$self->{git_dir}", "log",
-			    qw/--reverse --no-notes --no-color --raw -r
-			       --no-abbrev/, $range);
-		my $pid = open(my $log, '-|', @cmd) or
-			die('open` '.join(' ', @cmd) . " pipe failed: $!\n");
-
-		while (my $line = <$log>) {
-			if ($line =~ /$addmsg/o) {
-				$self->index_blob($git, $1);
-			} elsif ($line =~ /$delmsg/o) {
-				$self->unindex_blob($git, $1);
-			} elsif ($line =~ /^commit ($h40)/o) {
-				$latest = $1;
-			}
-		}
-		close $log;
-		$db->set_metadata('last_commit', $latest) if defined $latest;
-	};
-	if ($@) {
-		warn "indexing failed: $@\n";
-		$db->cancel_transaction;
-	} else {
-		$db->commit_transaction;
-	}
-}
-
 1;
diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm
new file mode 100644
index 0000000..408b21f
--- /dev/null
+++ b/lib/PublicInbox/SearchIdx.pm
@@ -0,0 +1,363 @@
+# Copyright (C) 2015, all contributors <meta@public-inbox.org>
+# License: AGPLv3 or later (https://www.gnu.org/licenses/agpl-3.0.txt)
+# based on notmuch, but with no concept of folders, files or flags
+package PublicInbox::SearchIdx;
+use strict;
+use warnings;
+use base qw(PublicInbox::Search);
+use PublicInbox::MID qw/mid_clean mid_compressed/;
+*xpfx = *PublicInbox::Search::xpfx;
+
+sub new {
+	my ($class, $git_dir, $writable) = @_;
+	my $dir = $class->xdir($git_dir);
+	require Search::Xapian::WritableDatabase;
+	my $flag = Search::Xapian::DB_OPEN;
+	if ($writable == 1) {
+		require File::Path;
+		File::Path::mkpath($dir);
+		$flag = Search::Xapian::DB_CREATE_OR_OPEN;
+	}
+	my $db = Search::Xapian::WritableDatabase->new($dir, $flag);
+	bless { xdb => $db, git_dir => $git_dir }, $class;
+}
+
+sub add_message {
+	my ($self, $mime) = @_; # mime = Email::MIME object
+	my $db = $self->{xdb};
+
+	my $doc_id;
+	my $mid_orig = mid_clean($mime->header_obj->header('Message-ID'));
+	my $mid = mid_compressed($mid_orig);
+	my $was_ghost = 0;
+	my $ct_msg = $mime->header('Content-Type') || 'text/plain';
+
+	eval {
+		my $smsg = $self->lookup_message($mid);
+		my $doc;
+
+		if ($smsg) {
+			$smsg->ensure_metadata;
+			# convert a ghost to a regular message
+			# it will also clobber any existing regular message
+			$smsg->mime($mime);
+			$doc = $smsg->{doc};
+
+			my $type = xpfx('type');
+			eval {
+				$doc->remove_term($type . 'ghost');
+				$was_ghost = 1;
+			};
+
+			# probably does not exist:
+			eval { $doc->remove_term($type . 'mail') };
+			$doc->add_term($type . 'mail');
+		}  else {
+			$smsg = PublicInbox::SearchMsg->new($mime);
+			$doc = $smsg->{doc};
+			$doc->add_term(xpfx('mid') . $mid);
+		}
+
+		my $subj = $smsg->subject;
+
+		if (length $subj) {
+			$doc->add_term(xpfx('subject') . $subj);
+
+			my $path = $self->subject_path($subj);
+			$doc->add_term(xpfx('path') . mid_compressed($path));
+		}
+
+		my $from = $smsg->from_name;
+		my $date = $smsg->date;
+		my $ts = Search::Xapian::sortable_serialise($smsg->ts);
+		$doc->add_value(PublicInbox::Search::TS, $ts);
+
+		my $tg = $self->term_generator;
+
+		$tg->set_document($doc);
+		$tg->index_text($subj, 1, 'S') if $subj;
+		$tg->increase_termpos;
+		$tg->index_text($subj) if $subj;
+		$tg->increase_termpos;
+
+		$tg->index_text($smsg->from->format);
+		$tg->increase_termpos;
+
+		$mime->walk_parts(sub {
+			my ($part) = @_;
+			return if $part->subparts; # walk_parts already recurses
+			my $ct = $part->content_type || $ct_msg;
+
+			# account for filter bugs...
+			$ct =~ m!\btext/plain\b!i or return;
+
+			my (@orig, @quot);
+			my $body = $part->body;
+			$part->body_set('');
+			my @lines = split(/\n/, $body);
+			while (defined(my $l = shift @lines)) {
+				if ($l =~ /^\s*>/) {
+					push @quot, $l;
+				} else {
+					push @orig, $l;
+				}
+			}
+			if (@quot) {
+				$tg->index_text(join("\n", @quot), 0);
+				@quot = ();
+				$tg->increase_termpos;
+			}
+			if (@orig) {
+				$tg->index_text(join("\n", @orig));
+				@orig = ();
+				$tg->increase_termpos;
+			}
+		});
+
+		if ($was_ghost) {
+			$doc_id = $smsg->doc_id;
+			$self->link_message($smsg, 0);
+			$doc->set_data($smsg->to_doc_data);
+			$db->replace_document($doc_id, $doc);
+		} else {
+			$self->link_message($smsg, 0);
+			$doc->set_data($smsg->to_doc_data);
+			$doc_id = $db->add_document($doc);
+		}
+	};
+
+	if ($@) {
+		warn "failed to index message <$mid_orig>: $@\n";
+		return undef;
+	}
+	$doc_id;
+}
+
+# returns deleted doc_id on success, undef on missing
+sub remove_message {
+	my ($self, $mid_orig) = @_;
+	my $db = $self->{xdb};
+	my $doc_id;
+	$mid_orig = mid_clean($mid_orig);
+	my $mid = mid_compressed($mid_orig);
+
+	eval {
+		$doc_id = $self->find_unique_doc_id('mid', $mid);
+		$db->delete_document($doc_id) if defined $doc_id;
+	};
+
+	if ($@) {
+		warn "failed to remove message <$mid_orig>: $@\n";
+		return undef;
+	}
+	$doc_id;
+}
+
+sub term_generator { # write-only
+	my ($self) = @_;
+
+	my $tg = $self->{term_generator};
+	return $tg if $tg;
+
+	$tg = Search::Xapian::TermGenerator->new;
+	$tg->set_stemmer($self->stemmer);
+
+	$self->{term_generator} = $tg;
+}
+
+sub next_doc_id { $_[0]->{xdb}->get_lastdocid + 1 }
+
+# increments last_thread_id counter
+# returns a 64-bit integer represented as a hex string
+sub next_thread_id {
+	my ($self) = @_;
+	my $db = $self->{xdb};
+	my $last_thread_id = int($db->get_metadata('last_thread_id') || 0);
+
+	$db->set_metadata('last_thread_id', ++$last_thread_id);
+
+	$last_thread_id;
+}
+
+sub link_message {
+	my ($self, $smsg, $is_ghost) = @_;
+
+	if ($is_ghost) {
+		$smsg->ensure_metadata;
+	} else {
+		$self->link_message_to_parents($smsg);
+	}
+}
+
+sub link_message_to_parents {
+	my ($self, $smsg) = @_;
+	my $doc = $smsg->{doc};
+	my $mid = mid_compressed($smsg->mid);
+	my $mime = $smsg->mime;
+	my $refs = $mime->header_obj->header('References');
+	my @refs = $refs ? ($refs =~ /<([^>]+)>/g) : ();
+	my $irt = $mime->header_obj->header('In-Reply-To');
+	if ($irt) {
+		$irt = mid_compressed(mid_clean($irt));
+
+		# maybe some crazies will try to make a circular reference:
+		if ($irt eq $mid) {
+			$irt = undef;
+		} else {
+			# last References should be $irt
+			# we will de-dupe later
+			push @refs, $irt;
+		}
+	}
+
+	my $tid;
+	if (@refs) {
+		my @crefs = map { mid_compressed($_) } @refs;
+		my %uniq = ($mid => 1);
+
+		# prevent circular references via References: here:
+		@refs = ();
+		foreach my $ref (@crefs) {
+			next if $uniq{$ref};
+			$uniq{$ref} = 1;
+			push @refs, $ref;
+		}
+	}
+	if (@refs) {
+		$doc->add_term(xpfx('inreplyto') . $irt) if defined $irt;
+		$smsg->{references_sorted} = '<'.join('><', @refs).'>';
+
+		my $ref_pfx = xpfx('references');
+
+		# first ref *should* be the thread root,
+		# but we can never trust clients to do the right thing
+		my $ref = shift @refs;
+		$doc->add_term($ref_pfx . $ref);
+		$tid = $self->_resolve_mid_to_tid($ref);
+
+		# the rest of the refs should point to this tid:
+		foreach $ref (@refs) {
+			$doc->add_term($ref_pfx . $ref);
+			my $ptid = $self->_resolve_mid_to_tid($ref);
+			if ($tid ne $ptid) {
+				$self->merge_threads($tid, $ptid);
+			}
+		}
+	} else {
+		$tid = $self->next_thread_id;
+	}
+	$doc->add_term(xpfx('thread') . $tid);
+}
+
+sub index_blob {
+	my ($self, $git, $blob) = @_;
+	my $mime = do_cat_mail($git, $blob) or return;
+	eval { $self->add_message($mime) };
+	warn "W: index_blob $blob: $@\n" if $@;
+}
+
+sub unindex_blob {
+	my ($self, $git, $blob) = @_;
+	my $mime = do_cat_mail($git, $blob) or return;
+	my $mid = $mime->header_obj->header('Message-ID');
+	eval { $self->remove_message($mid) } if defined $mid;
+	warn "W: unindex_blob $blob: $@\n" if $@;
+}
+
+sub do_cat_mail {
+	my ($git, $blob) = @_;
+	my $mime = eval {
+		my $str = $git->cat_file($blob);
+		Email::MIME->new($str);
+	};
+	$@ ? undef : $mime;
+}
+
+# indexes all unindexed messages
+sub index_sync {
+	my ($self, $head) = @_;
+	require PublicInbox::GitCatFile;
+	my $db = $self->{xdb};
+	my $hex = '[a-f0-9]';
+	my $h40 = $hex .'{40}';
+	my $addmsg = qr!^:000000 100644 \S+ ($h40) A\t${hex}{2}/${hex}{38}$!;
+	my $delmsg = qr!^:100644 000000 ($h40) \S+ D\t${hex}{2}/${hex}{38}$!;
+	$head ||= 'HEAD';
+
+	$db->begin_transaction;
+	eval {
+		my $git = PublicInbox::GitCatFile->new($self->{git_dir});
+
+		my $latest = $db->get_metadata('last_commit');
+		my $range = length $latest ? "$latest..$head" : $head;
+		$latest = undef;
+
+		# get indexed messages
+		my @cmd = ('git', "--git-dir=$self->{git_dir}", "log",
+			    qw/--reverse --no-notes --no-color --raw -r
+			       --no-abbrev/, $range);
+		my $pid = open(my $log, '-|', @cmd) or
+			die('open` '.join(' ', @cmd) . " pipe failed: $!\n");
+
+		while (my $line = <$log>) {
+			if ($line =~ /$addmsg/o) {
+				$self->index_blob($git, $1);
+			} elsif ($line =~ /$delmsg/o) {
+				$self->unindex_blob($git, $1);
+			} elsif ($line =~ /^commit ($h40)/o) {
+				$latest = $1;
+			}
+		}
+		close $log;
+		$db->set_metadata('last_commit', $latest) if defined $latest;
+	};
+	if ($@) {
+		warn "indexing failed: $@\n";
+		$db->cancel_transaction;
+	} else {
+		$db->commit_transaction;
+	}
+}
+
+# this will create a ghost as necessary
+sub _resolve_mid_to_tid {
+	my ($self, $mid) = @_;
+
+	my $smsg = $self->lookup_message($mid) || $self->create_ghost($mid);
+	$smsg->thread_id;
+}
+
+sub create_ghost {
+	my ($self, $mid, $tid) = @_;
+
+	$mid = mid_compressed($mid);
+	$tid = $self->next_thread_id unless defined $tid;
+
+	my $doc = Search::Xapian::Document->new;
+	$doc->add_term(xpfx('mid') . $mid);
+	$doc->add_term(xpfx('thread') . $tid);
+	$doc->add_term(xpfx('type') . 'ghost');
+
+	my $smsg = PublicInbox::SearchMsg->wrap($doc, $mid);
+	$self->link_message($smsg, 1);
+	$self->{xdb}->add_document($doc);
+
+	$smsg;
+}
+
+sub merge_threads {
+	my ($self, $winner_tid, $loser_tid) = @_;
+	my ($head, $tail) = $self->find_doc_ids('thread', $loser_tid);
+	my $thread_pfx = xpfx('thread');
+	my $db = $self->{xdb};
+
+	for (; $head != $tail; $head->inc) {
+		my $docid = $head->get_docid;
+		my $doc = $db->get_document($docid);
+		$doc->remove_term($thread_pfx . $loser_tid);
+		$doc->add_term($thread_pfx . $winner_tid);
+		$db->replace_document($docid, $doc);
+	}
+}
+
+1;
diff --git a/public-inbox-index b/public-inbox-index
index 1104bbc..f39ad9e 100755
--- a/public-inbox-index
+++ b/public-inbox-index
@@ -10,7 +10,7 @@ use strict;
 use warnings;
 my $usage = "public-inbox-index GIT_DIR";
 use PublicInbox::Config;
-eval { require PublicInbox::Search };
+eval { require PublicInbox::SearchIdx };
 if ($@) {
 	print STDERR "Search::Xapian required for $0\n";
 	exit 1;
@@ -26,6 +26,6 @@ foreach my $dir (@ARGV) {
 sub index_dir {
 	my ($git_dir) = @_;
 	-d $git_dir or die "$git_dir does not appear to be a git repository\n";
-	my $s = PublicInbox::Search->new($git_dir, 1);
+	my $s = PublicInbox::SearchIdx->new($git_dir, 1);
 	$s->index_sync;
 }
diff --git a/public-inbox-learn b/public-inbox-learn
index bd59247..c89ffb5 100755
--- a/public-inbox-learn
+++ b/public-inbox-learn
@@ -78,9 +78,9 @@ foreach my $recipient (keys %dests) {
 	}
 
 	$err or eval {
-		require PublicInbox::Search;
+		require PublicInbox::SearchIdx;
 		umask 0022; # XXX FIXME use git config core.sharedRepository
-		my $s = PublicInbox::Search->new($git_dir, 2);
+		my $s = PublicInbox::SearchIdx->new($git_dir, 2);
 		$s->index_sync;
 	};
 }
diff --git a/public-inbox-mda b/public-inbox-mda
index a3c959a..630ffcb 100755
--- a/public-inbox-mda
+++ b/public-inbox-mda
@@ -90,7 +90,7 @@ sub search_index_sync {
 	eval {
 		require PublicInbox::Search;
 		umask 0022; # XXX FIXME use git config core.sharedRepository
-		my $s = PublicInbox::Search->new($git_dir, 2);
+		my $s = PublicInbox::SearchIdx->new($git_dir, 2);
 		$s->index_sync;
 	};
 }
diff --git a/t/search.t b/t/search.t
index 89a2862..be39410 100644
--- a/t/search.t
+++ b/t/search.t
@@ -3,7 +3,7 @@
 use strict;
 use warnings;
 use Test::More;
-eval { require PublicInbox::Search; };
+eval { require PublicInbox::SearchIdx; };
 plan skip_all => "Xapian missing for search" if $@;
 use File::Temp qw/tempdir/;
 use Email::MIME;
@@ -16,11 +16,11 @@ is(0, system(qw(git init -q --bare), $git_dir), "git init (main)");
 eval { PublicInbox::Search->new($git_dir) };
 ok($@, "exception raised on non-existent DB");
 
-my $rw = PublicInbox::Search->new($git_dir, 1);
+my $rw = PublicInbox::SearchIdx->new($git_dir, 1);
 my $ro = PublicInbox::Search->new($git_dir);
 my $rw_commit = sub {
 	$rw = undef;
-	$rw = PublicInbox::Search->new($git_dir, 1);
+	$rw = PublicInbox::SearchIdx->new($git_dir, 1);
 };
 
 {
-- 
EW


^ permalink raw reply related	[flat|nested] 6+ messages in thread

* [PATCH 2/6] view: misc cleanups and simplifications
  2015-08-22 11:41 [PATCH 1/6] search: split search indexing to a separate file Eric Wong
@ 2015-08-22 11:41 ` Eric Wong
  2015-08-22 11:41 ` [PATCH 3/6] view: reference total followups Eric Wong
                   ` (3 subsequent siblings)
  4 siblings, 0 replies; 6+ messages in thread
From: Eric Wong @ 2015-08-22 11:41 UTC (permalink / raw)
  To: meta

Less code should be easier to read.
---
 lib/PublicInbox/View.pm | 22 +++++++---------------
 1 file changed, 7 insertions(+), 15 deletions(-)

diff --git a/lib/PublicInbox/View.pm b/lib/PublicInbox/View.pm
index 4e01507..922e8e9 100644
--- a/lib/PublicInbox/View.pm
+++ b/lib/PublicInbox/View.pm
@@ -93,8 +93,7 @@ sub index_entry {
 	unless (defined $ts) {
 		$ts = msg_timestamp($mime);
 	}
-	my $fmt = '%Y-%m-%d %H:%M';
-	$ts = POSIX::strftime($fmt, gmtime($ts));
+	$ts = POSIX::strftime('%Y-%m-%d %H:%M', gmtime($ts));
 
 	my $rv = "<table\nsummary=l$level><tr>";
 	if ($level) {
@@ -167,14 +166,8 @@ sub emit_thread_html {
 	}
 	my $final_anchor = $state->[3];
 	my $next = "<a\nid=\"s$final_anchor\">";
-
-	if ($final_anchor == 1) {
-		$next .= 'only message in thread';
-	} else {
-		$next .= 'end of thread';
-	}
-	$next .= "</a>, back to <a\nhref=\"../\">index</a>\n";
-
+	$next .= $final_anchor == 1 ? 'only message in' : 'end of';
+	$next .= " thread</a>, back to <a\nhref=\"../\">index</a>\n";
 	$fh->write("<hr />" . PRE_WRAP . $next . $foot .
 		   "</pre></body></html>");
 	$fh->close;
@@ -258,9 +251,9 @@ sub flush_quote {
 	if ($full_pfx) {
 		if (!$final && scalar(@$quot) <= MAX_INLINE_QUOTED) {
 			# show quote inline
-			my $rv = join("\n", map { linkify($_); $_ } @$quot);
+			my $rv = join('', map { linkify($_); $_ } @$quot);
 			@$quot = ();
-			return $rv . "\n";
+			return $rv;
 		}
 
 		# show a short snippet of quoted text and link to full version:
@@ -286,7 +279,7 @@ sub flush_quote {
 		# short version (see above)
 		my $nr = ++$$n;
 		my $rv = "<a\nid=q${part_nr}_$nr></a>";
-		$rv .= join("\n", map { linkify($_); $_ } @$quot) . "\n";
+		$rv .= join('', map { linkify($_); $_ } @$quot);
 		@$quot = ();
 		$rv;
 	}
@@ -309,7 +302,7 @@ sub add_text_body {
 	$part->body_set('');
 	$s = $enc->decode($s);
 	$s = ascii_html($s);
-	my @lines = split(/\n/, $s);
+	my @lines = split(/^/m, $s);
 	$s = '';
 
 	if ($$part_nr > 0) {
@@ -330,7 +323,6 @@ sub add_text_body {
 			# regular line, OK
 			linkify($cur);
 			$s .= $cur;
-			$s .= "\n";
 		} else {
 			push @quot, $cur;
 		}
-- 
EW


^ permalink raw reply related	[flat|nested] 6+ messages in thread

* [PATCH 3/6] view: reference total followups
  2015-08-22 11:41 [PATCH 1/6] search: split search indexing to a separate file Eric Wong
  2015-08-22 11:41 ` [PATCH 2/6] view: misc cleanups and simplifications Eric Wong
@ 2015-08-22 11:41 ` Eric Wong
  2015-08-22 11:41 ` [PATCH 4/6] search: consistently pass options and flags Eric Wong
                   ` (2 subsequent siblings)
  4 siblings, 0 replies; 6+ messages in thread
From: Eric Wong @ 2015-08-22 11:41 UTC (permalink / raw)
  To: meta

In case there are huge threads, readers should know about them
even though we currently lack the navigation to display them.
---
 lib/PublicInbox/View.pm | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/lib/PublicInbox/View.pm b/lib/PublicInbox/View.pm
index 922e8e9..e333906 100644
--- a/lib/PublicInbox/View.pm
+++ b/lib/PublicInbox/View.pm
@@ -437,7 +437,12 @@ sub html_footer {
 		       "threadlink</a>$idx";
 		my $res = $srch->get_followups($mid);
 		if (my $c = $res->{total}) {
-			$c = $c == 1 ? '1 followup' : "$c followups";
+			my $nr = scalar @{$res->{msgs}};
+			if ($nr < $c) {
+				$c = "$nr of $c followups";
+			} else {
+				$c = $c == 1 ? '1 followup' : "$c followups";
+			}
 			$idx .= "\n$c:\n";
 			$res->{srch} = $srch;
 			thread_followups(\$idx, $mime, $res);
-- 
EW


^ permalink raw reply related	[flat|nested] 6+ messages in thread

* [PATCH 4/6] search: consistently pass options and flags
  2015-08-22 11:41 [PATCH 1/6] search: split search indexing to a separate file Eric Wong
  2015-08-22 11:41 ` [PATCH 2/6] view: misc cleanups and simplifications Eric Wong
  2015-08-22 11:41 ` [PATCH 3/6] view: reference total followups Eric Wong
@ 2015-08-22 11:41 ` Eric Wong
  2015-08-22 11:41 ` [PATCH 5/6] mbox: support uncompressed mbox Eric Wong
  2015-08-22 11:41 ` [PATCH 6/6] view: wire up mbox.gz links Eric Wong
  4 siblings, 0 replies; 6+ messages in thread
From: Eric Wong @ 2015-08-22 11:41 UTC (permalink / raw)
  To: meta

Most of our special query functions require exact matches, so none
of the flags we normally use are necessary for query parsing.
---
 lib/PublicInbox/Search.pm | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm
index 753f5f3..c61d4cf 100644
--- a/lib/PublicInbox/Search.pm
+++ b/lib/PublicInbox/Search.pm
@@ -82,7 +82,7 @@ sub query {
 sub get_subject_path {
 	my ($self, $path, $opts) = @_;
 	my $query = $self->qp->parse_query("path:".mid_compressed($path), 0);
-	$self->do_enquire($query);
+	$self->do_enquire($query, $opts);
 }
 
 # given a message ID, get followups to a message
@@ -94,8 +94,7 @@ sub get_followups {
 	my $irt = $qp->parse_query("inreplyto:$mid", 0);
 	my $ref = $qp->parse_query("references:$mid", 0);
 	my $query = Search::Xapian::Query->new(OP_OR, $irt, $ref);
-
-	$self->do_enquire($query);
+	$self->do_enquire($query, $opts);
 }
 
 sub get_thread {
@@ -104,8 +103,8 @@ sub get_thread {
 
 	return { total => 0, msgs => [] } unless $smsg;
 	my $qp = $self->qp;
-	my $qtid = $qp->parse_query('thread:'.$smsg->thread_id);
-	my $qsub = $qp->parse_query('path:'.mid_compressed($smsg->path));
+	my $qtid = $qp->parse_query('thread:'.$smsg->thread_id, 0);
+	my $qsub = $qp->parse_query('path:'.mid_compressed($smsg->path), 0);
 	my $query = Search::Xapian::Query->new(OP_OR, $qtid, $qsub);
 	$self->do_enquire($query, $opts);
 }
-- 
EW


^ permalink raw reply related	[flat|nested] 6+ messages in thread

* [PATCH 5/6] mbox: support uncompressed mbox
  2015-08-22 11:41 [PATCH 1/6] search: split search indexing to a separate file Eric Wong
                   ` (2 preceding siblings ...)
  2015-08-22 11:41 ` [PATCH 4/6] search: consistently pass options and flags Eric Wong
@ 2015-08-22 11:41 ` Eric Wong
  2015-08-22 11:41 ` [PATCH 6/6] view: wire up mbox.gz links Eric Wong
  4 siblings, 0 replies; 6+ messages in thread
From: Eric Wong @ 2015-08-22 11:41 UTC (permalink / raw)
  To: meta

Some folks may want to view the mbox inline as a string of raw text,
when guessing URLs.  Let them do this...
---
 lib/PublicInbox/Mbox.pm | 18 +++++++++++-------
 lib/PublicInbox/WWW.pm  | 12 +++++++-----
 2 files changed, 18 insertions(+), 12 deletions(-)

diff --git a/lib/PublicInbox/Mbox.pm b/lib/PublicInbox/Mbox.pm
index 5f5612a..d49e9b3 100644
--- a/lib/PublicInbox/Mbox.pm
+++ b/lib/PublicInbox/Mbox.pm
@@ -7,10 +7,10 @@ use warnings;
 use PublicInbox::MID qw/mid_compressed mid2path/;
 
 sub thread_mbox {
-	my ($ctx, $srch) = @_;
+	my ($ctx, $srch, $sfx) = @_;
 	sub {
 		my ($response) = @_; # Plack callback
-		emit_mbox($response, $ctx, $srch);
+		emit_mbox($response, $ctx, $srch, $sfx);
 	}
 }
 
@@ -38,14 +38,18 @@ sub emit_msg {
 }
 
 sub emit_mbox {
-	my ($response, $ctx, $srch) = @_;
-	eval { require IO::Compress::Gzip };
-	return need_gzip($response) if $@;
+	my ($response, $ctx, $srch, $sfx) = @_;
+	my $type = 'mbox';
+	if ($sfx) {
+		eval { require IO::Compress::Gzip };
+		return need_gzip($response) if $@;
+		$type = 'gzip';
+	}
 
 	# http://www.iana.org/assignments/media-types/application/gzip
 	# http://www.iana.org/assignments/media-types/application/mbox
-	my $fh = $response->([200, ['Content-Type' => 'application/gzip']]);
-	$fh = PublicInbox::MboxGz->new($fh);
+	my $fh = $response->([200, ['Content-Type' => "application/$type"]]);
+	$fh = PublicInbox::MboxGz->new($fh) if $sfx;
 
 	require PublicInbox::GitCatFile;
 	require Email::Simple;
diff --git a/lib/PublicInbox/WWW.pm b/lib/PublicInbox/WWW.pm
index 30a7a43..33c7110 100644
--- a/lib/PublicInbox/WWW.pm
+++ b/lib/PublicInbox/WWW.pm
@@ -53,9 +53,10 @@ sub run {
 	} elsif ($path_info =~ m!$LISTNAME_RE/t/(\S+)\.html\z!o) {
 		invalid_list_mid(\%ctx, $1, $2) || get_thread(\%ctx, $cgi);
 
-	} elsif ($path_info =~ m!$LISTNAME_RE/t/(\S+)\.mbox\.gz!o) {
+	} elsif ($path_info =~ m!$LISTNAME_RE/t/(\S+)\.mbox(\.gz)?\z!o) {
 		my $sfx = $3;
-		invalid_list_mid(\%ctx, $1, $2) || get_thread_mbox(\%ctx, $cgi);
+		invalid_list_mid(\%ctx, $1, $2) ||
+			get_thread_mbox(\%ctx, $cgi, $sfx);
 
 	} elsif ($path_info =~ m!$LISTNAME_RE/f/\S+\.txt\z!o) {
 		invalid_list_mid(\%ctx, $1, $2) ||
@@ -331,15 +332,16 @@ sub msg_pfx {
 	"../f/$href.html";
 }
 
-# /$LISTNAME/t/$MESSAGE_ID.mbox.gz        -> search results as gzipped mbox
+# /$LISTNAME/t/$MESSAGE_ID.mbox           -> thread as mbox
+# /$LISTNAME/t/$MESSAGE_ID.mbox.gz        -> thread as gzipped mbox
 # note: I'm not a big fan of other compression formats since they're
 # significantly more expensive on CPU than gzip and less-widely available,
 # especially on older systems.  Stick to zlib since that's what git uses.
 sub get_thread_mbox {
-	my ($ctx, $cgi) = @_;
+	my ($ctx, $cgi, $sfx) = @_;
 	my $srch = searcher($ctx) or return need_search($ctx);
 	require PublicInbox::Mbox;
-	PublicInbox::Mbox::thread_mbox($ctx, $srch);
+	PublicInbox::Mbox::thread_mbox($ctx, $srch, $sfx);
 }
 
 1;
-- 
EW


^ permalink raw reply related	[flat|nested] 6+ messages in thread

* [PATCH 6/6] view: wire up mbox.gz links
  2015-08-22 11:41 [PATCH 1/6] search: split search indexing to a separate file Eric Wong
                   ` (3 preceding siblings ...)
  2015-08-22 11:41 ` [PATCH 5/6] mbox: support uncompressed mbox Eric Wong
@ 2015-08-22 11:41 ` Eric Wong
  4 siblings, 0 replies; 6+ messages in thread
From: Eric Wong @ 2015-08-22 11:41 UTC (permalink / raw)
  To: meta

To reduce clutter, we will not link to uncompressed versions.
Users should be able to download entire threads for offline
reading; enable this feature for them.
---
 lib/PublicInbox/View.pm | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/lib/PublicInbox/View.pm b/lib/PublicInbox/View.pm
index e333906..fe37a9f 100644
--- a/lib/PublicInbox/View.pm
+++ b/lib/PublicInbox/View.pm
@@ -168,6 +168,8 @@ sub emit_thread_html {
 	my $next = "<a\nid=\"s$final_anchor\">";
 	$next .= $final_anchor == 1 ? 'only message in' : 'end of';
 	$next .= " thread</a>, back to <a\nhref=\"../\">index</a>\n";
+	$mid = PublicInbox::Hval->new_msgid($mid)->as_href;
+	$next .= "download: <a\nhref=\"$mid.mbox.gz\">mbox.gz</a>\n\n";
 	$fh->write("<hr />" . PRE_WRAP . $next . $foot .
 		   "</pre></body></html>");
 	$fh->close;
-- 
EW


^ permalink raw reply related	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2015-08-22 11:41 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-08-22 11:41 [PATCH 1/6] search: split search indexing to a separate file Eric Wong
2015-08-22 11:41 ` [PATCH 2/6] view: misc cleanups and simplifications Eric Wong
2015-08-22 11:41 ` [PATCH 3/6] view: reference total followups Eric Wong
2015-08-22 11:41 ` [PATCH 4/6] search: consistently pass options and flags Eric Wong
2015-08-22 11:41 ` [PATCH 5/6] mbox: support uncompressed mbox Eric Wong
2015-08-22 11:41 ` [PATCH 6/6] view: wire up mbox.gz links Eric Wong

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).