From: "Eric Wong (Contractor, The Linux Foundation)" <e@80x24.org>
To: meta@public-inbox.org
Subject: [PATCH 08/13] v2writable: support reindexing Xapian
Date: Thu, 22 Mar 2018 09:40:10 +0000 [thread overview]
Message-ID: <20180322094015.14422-9-e@80x24.org> (raw)
In-Reply-To: <20180322094015.14422-1-e@80x24.org>
This still requires a msgmap.sqlite3 file to exist, but
it allows us to tweak Xapian indexing rules and reindex
the Xapian database online while -watch is running.
---
lib/PublicInbox/Msgmap.pm | 40 +++++++++--
lib/PublicInbox/SearchIdx.pm | 14 +++-
lib/PublicInbox/SearchIdxSkeleton.pm | 1 +
lib/PublicInbox/V2Writable.pm | 134 +++++++++++++++++++++++++++++++----
script/public-inbox-index | 25 +++++--
5 files changed, 189 insertions(+), 25 deletions(-)
diff --git a/lib/PublicInbox/Msgmap.pm b/lib/PublicInbox/Msgmap.pm
index 78922d3..1283305 100644
--- a/lib/PublicInbox/Msgmap.pm
+++ b/lib/PublicInbox/Msgmap.pm
@@ -24,9 +24,8 @@ sub new {
new_file($class, "$d/msgmap.sqlite3", $writable);
}
-sub new_file {
- my ($class, $f, $writable) = @_;
-
+sub dbh_new {
+ my ($f, $writable) = @_;
my $dbh = DBI->connect("dbi:SQLite:dbname=$f",'','', {
AutoCommit => 1,
RaiseError => 1,
@@ -35,6 +34,13 @@ sub new_file {
sqlite_use_immediate_transaction => 1,
});
$dbh->do('PRAGMA case_sensitive_like = ON');
+ $dbh;
+}
+
+sub new_file {
+ my ($class, $f, $writable) = @_;
+
+ my $dbh = dbh_new($f, $writable);
my $self = bless { dbh => $dbh }, $class;
if ($writable) {
@@ -49,12 +55,13 @@ sub new_file {
# used to keep track of used numeric mappings for v2 reindex
sub tmp_clone {
my ($self) = @_;
- my ($fh, $fn) = tempfile(EXLOCK => 0);
+ my ($fh, $fn) = tempfile('msgmap-XXXXXXXX', EXLOCK => 0, TMPDIR => 1);
$self->{dbh}->sqlite_backup_to_file($fn);
my $tmp = ref($self)->new_file($fn, 1);
$tmp->{dbh}->do('PRAGMA synchronous = OFF');
$tmp->{tmp_name} = $fn; # SQLite won't work if unlinked, apparently
- $fh = undef;
+ $tmp->{pid} = $$;
+ close $fh or die "failed to close $fn: $!";
$tmp;
}
@@ -205,7 +212,28 @@ sub mid_set {
sub DESTROY {
my ($self) = @_;
delete $self->{dbh};
- unlink $self->{tmp_name} if defined $self->{tmp_name};
+ my $f = delete $self->{tmp_name};
+ if (defined $f && $self->{pid} == $$) {
+ unlink $f or warn "failed to unlink $f: $!\n";
+ }
+}
+
+sub atfork_parent {
+ my ($self) = @_;
+ my $f = $self->{tmp_name} or die "not a temporary clone\n";
+ delete $self->{dbh} and die "tmp_clone dbh not prepared for parent";
+ my $dbh = $self->{dbh} = dbh_new($f, 1);
+ $dbh->do('PRAGMA synchronous = OFF');
+}
+
+sub atfork_prepare {
+ my ($self) = @_;
+ my $f = $self->{tmp_name} or die "not a temporary clone\n";
+ $self->{pid} == $$ or
+ die "BUG: atfork_prepare not called from $self->{pid}\n";
+ $self->{dbh} or die "temporary clone not open\n";
+ # must clobber prepared statements
+ %$self = (tmp_name => $f, pid => $$);
}
1;
diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm
index ef723a4..7ac16ec 100644
--- a/lib/PublicInbox/SearchIdx.pm
+++ b/lib/PublicInbox/SearchIdx.pm
@@ -352,7 +352,7 @@ sub add_message {
# populates smsg->references for smsg->to_doc_data
my $refs = parse_references($smsg);
- $mid0 = $mids->[0] unless defined $mid0;
+ $mid0 = $mids->[0] unless defined $mid0; # v1 compatibility
my $data = $smsg->to_doc_data($oid, $mid0);
foreach my $mid (@$mids) {
$tg->index_text($mid, 1, 'XM');
@@ -369,10 +369,12 @@ sub add_message {
}
}
+ $self->delete_article($num) if defined $num; # for reindexing
if ($skel) {
push @values, $mids, $xpath, $data;
$skel->index_skeleton(\@values);
$doc->add_boolean_term('Q' . $_) foreach @$mids;
+ $doc->add_boolean_term('XNUM' . $num) if defined $num;
$doc_id = $self->{xdb}->add_document($doc);
} else {
$doc_id = link_and_save($self, $doc, $mids, $refs,
@@ -421,6 +423,16 @@ sub remove_message {
}
}
+sub delete_article {
+ my ($self, $num) = @_;
+ my $ndel = 0;
+ batch_do($self, 'XNUM' . $num, sub {
+ my ($ids) = @_;
+ $ndel += scalar @$ids;
+ $self->{xdb}->delete_document($_) for @$ids;
+ });
+}
+
# MID is a hint in V2
sub remove_by_oid {
my ($self, $oid, $mid) = @_;
diff --git a/lib/PublicInbox/SearchIdxSkeleton.pm b/lib/PublicInbox/SearchIdxSkeleton.pm
index 78a1730..4f15816 100644
--- a/lib/PublicInbox/SearchIdxSkeleton.pm
+++ b/lib/PublicInbox/SearchIdxSkeleton.pm
@@ -134,6 +134,7 @@ sub index_skeleton_real ($$) {
$smsg->load_from_data($doc_data);
my $num = $values->[PublicInbox::Search::NUM];
my @refs = ($smsg->references =~ /<([^>]+)>/g);
+ $self->delete_article($num) if defined $num; # for reindexing
$self->link_and_save($doc, $mids, \@refs, $num, $xpath);
}
diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm
index 46bfebb..550a74d 100644
--- a/lib/PublicInbox/V2Writable.pm
+++ b/lib/PublicInbox/V2Writable.pm
@@ -359,6 +359,23 @@ sub git_init {
$git_dir
}
+sub git_dir_latest {
+ my ($self, $max) = @_;
+ $$max = -1;
+ my $pfx = "$self->{-inbox}->{mainrepo}/git";
+ return unless -d $pfx;
+ my $latest;
+ opendir my $dh, $pfx or die "opendir $pfx: $!\n";
+ while (defined(my $git_dir = readdir($dh))) {
+ $git_dir =~ m!\A(\d+)\.git\z! or next;
+ if ($1 > $$max) {
+ $$max = $1;
+ $latest = "$pfx/$git_dir";
+ }
+ }
+ $latest;
+}
+
sub importer {
my ($self) = @_;
my $im = $self->{im};
@@ -375,20 +392,9 @@ sub importer {
return $self->import_init($git, 0);
}
}
- my $latest;
- my $max = -1;
my $new = 0;
- my $pfx = "$self->{-inbox}->{mainrepo}/git";
- if (-d $pfx) {
- foreach my $git_dir (glob("$pfx/*.git")) {
- $git_dir =~ m!/(\d+)\.git\z! or next;
- my $n = $1;
- if ($n > $max) {
- $max = $n;
- $latest = $git_dir;
- }
- }
- }
+ my $max;
+ my $latest = git_dir_latest($self, \$max);
if (defined $latest) {
my $git = PublicInbox::Git->new($latest);
my $packed_bytes = $git->packed_bytes;
@@ -466,6 +472,8 @@ sub lookup_content {
sub atfork_child {
my ($self) = @_;
+ my $fh = delete $self->{reindex_pipe};
+ close $fh if $fh;
if (my $parts = $self->{idx_parts}) {
$_->atfork_child foreach @$parts;
}
@@ -474,4 +482,104 @@ sub atfork_child {
}
}
+sub mark_deleted {
+ my ($self, $D, $git, $oid) = @_;
+ my $msgref = $git->cat_file($oid);
+ my $mime = PublicInbox::MIME->new($$msgref);
+ my $mids = mids($mime->header_obj);
+ my $cid = content_id($mime);
+ foreach my $mid (@$mids) {
+ $D->{$mid.$cid} = 1;
+ }
+}
+
+sub reindex_oid {
+ my ($self, $mm_tmp, $D, $git, $oid) = @_;
+ my $len;
+ my $msgref = $git->cat_file($oid, \$len);
+ my $mime = PublicInbox::MIME->new($$msgref);
+ my $mids = mids($mime->header_obj);
+ my $cid = content_id($mime);
+
+ # get the NNTP article number we used before, highest number wins
+ # and gets deleted from mm_tmp;
+ my $mid0;
+ my $num = -1;
+ my $del = 0;
+ foreach my $mid (@$mids) {
+ $del += (delete $D->{$mid.$cid} || 0);
+ my $n = $mm_tmp->num_for($mid);
+ if (defined $n && $n > $num) {
+ $mid0 = $mid;
+ $num = $n;
+ }
+ }
+ if (!defined($mid0) || $del) {
+ return if (!defined($mid0) && $del); # expected for deletes
+
+ my $id = '<' . join('> <', @$mids) . '>';
+ defined($mid0) or
+ warn "Skipping $id, no article number found\n";
+ if ($del && defined($mid0)) {
+ warn "$id was deleted $del " .
+ "time(s) but mapped to article #$num\n";
+ }
+ return;
+
+ }
+ $mm_tmp->mid_delete($mid0) or
+ die "failed to delete <$mid0> for article #$num\n";
+
+ my $nparts = $self->{partitions};
+ my $part = $num % $nparts;
+ my $idx = $self->idx_part($part);
+ $idx->index_raw($len, $msgref, $num, $oid, $mid0, $mime);
+ my $n = $self->{transact_bytes} += $len;
+ if ($n > (PublicInbox::SearchIdx::BATCH_BYTES * $nparts)) {
+ $git->cleanup;
+ $mm_tmp->atfork_prepare;
+ $self->done; # release lock
+ # allow -watch or -mda to write...
+ $self->idx_init; # reacquire lock
+ $mm_tmp->atfork_parent;
+ }
+}
+
+sub reindex {
+ my ($self) = @_;
+ my $ibx = $self->{-inbox};
+ my $pfx = "$ibx->{mainrepo}/git";
+ my $max_git;
+ my $latest = git_dir_latest($self, \$max_git);
+ return unless defined $latest;
+ my @cmd = qw(log --raw -r --pretty=tformat:%h
+ --no-notes --no-color --no-abbrev);
+ my $head = $ibx->{ref_head} || 'refs/heads/master';
+ $self->idx_init; # acquire lock
+ my $x40 = qr/[a-f0-9]{40}/;
+ my $mm_tmp = $self->{skel}->{mm}->tmp_clone;
+ my $D = {};
+
+ # work backwards through history
+ for (my $cur = $max_git; $cur >= 0; $cur--) {
+ die "already reindexing!\n" if delete $self->{reindex_pipe};
+ my $cmt;
+ my $git_dir = "$pfx/$cur.git";
+ my $git = PublicInbox::Git->new($git_dir);
+ my $fh = $self->{reindex_pipe} = $git->popen(@cmd, $head);
+ while (<$fh>) {
+ if (/\A$x40$/o) {
+ chomp($cmt = $_);
+ } elsif (/\A:\d{6} 100644 $x40 ($x40) [AM]\tm$/o) {
+ $self->reindex_oid($mm_tmp, $D, $git, $1);
+ } elsif (m!\A:\d{6} 100644 $x40 ($x40) [AM]\t_/D$!o) {
+ $self->mark_deleted($D, $git, $1);
+ }
+ }
+ delete $self->{reindex_pipe};
+ }
+ my ($min, $max) = $mm_tmp->minmax;
+ defined $max and die "leftover article numbers at $min..$max\n";
+}
+
1;
diff --git a/script/public-inbox-index b/script/public-inbox-index
index 1debbaa..cea3573 100755
--- a/script/public-inbox-index
+++ b/script/public-inbox-index
@@ -31,6 +31,9 @@ my @dirs;
sub resolve_repo_dir {
my ($cd) = @_;
my $prefix = defined $cd ? $cd : './';
+ if (-d $prefix && -f "$prefix/inbox.lock") { # v2
+ return abs_path($prefix);
+ }
my @cmd = qw(git rev-parse --git-dir);
my $cmd = join(' ', @cmd);
@@ -75,14 +78,26 @@ foreach my $k (keys %$config) {
}
foreach my $dir (@dirs) {
+ if (!ref($dir) && -f "$dir/inbox.lock") { # v2
+ my $ibx = { mainrepo => $dir, name => 'unnamed' };
+ $dir = PublicInbox::Inbox->new($ibx);
+ }
index_dir($dir);
}
sub index_dir {
- my ($git_dir) = @_;
- if (!ref $git_dir && ! -d $git_dir) {
- die "$git_dir does not appear to be a git repository\n";
+ my ($repo) = @_;
+ if (!ref $repo && ! -d $repo) {
+ die "$repo does not appear to be an inbox repository\n";
+ }
+ if (ref($repo) && ($repo->{version} || 1) == 2) {
+ eval { require PublicInbox::V2Writable };
+ die "v2 requirements not met: $@\n" if $@;
+ my $v2w = PublicInbox::V2Writable->new($repo);
+ $v2w->reindex;
+ $v2w->done;
+ } else {
+ my $s = PublicInbox::SearchIdx->new($repo, 1);
+ $s->index_sync({ reindex => $reindex });
}
- my $s = PublicInbox::SearchIdx->new($git_dir, 1);
- $s->index_sync({ reindex => $reindex });
}
--
EW
next prev parent reply other threads:[~2018-03-22 9:40 UTC|newest]
Thread overview: 15+ messages / expand[flat|nested] mbox.gz Atom feed top
2018-03-22 9:40 [PATCH 00/13] reindexing, feeds, date fixes Eric Wong (Contractor, The Linux Foundation)
2018-03-22 9:40 ` [PATCH 01/13] content_id: do not take Message-Id into account Eric Wong (Contractor, The Linux Foundation)
2018-03-22 9:40 ` [PATCH 02/13] introduce InboxWritable class Eric Wong (Contractor, The Linux Foundation)
2018-03-22 9:40 ` [PATCH 03/13] import: discard all the same headers as MDA Eric Wong (Contractor, The Linux Foundation)
2018-03-22 9:40 ` [PATCH 04/13] InboxWritable: add mbox/maildir parsing + import logic Eric Wong (Contractor, The Linux Foundation)
2018-03-22 9:40 ` [PATCH 05/13] use both Date: and Received: times Eric Wong (Contractor, The Linux Foundation)
2018-03-22 9:40 ` [PATCH 06/13] msgmap: add tmp_clone to create an anonymous copy Eric Wong (Contractor, The Linux Foundation)
2018-03-22 9:40 ` [PATCH 07/13] fix syntax warnings Eric Wong (Contractor, The Linux Foundation)
2018-03-22 9:40 ` Eric Wong (Contractor, The Linux Foundation) [this message]
2018-03-26 20:08 ` [PATCH 08/13] v2writable: support reindexing Xapian Eric Wong
2018-03-22 9:40 ` [PATCH 09/13] t/altid.t: extra tests for mid_set Eric Wong (Contractor, The Linux Foundation)
2018-03-22 9:40 ` [PATCH 10/13] v2writable: add NNTP article number regeneration support Eric Wong (Contractor, The Linux Foundation)
2018-03-22 9:40 ` [PATCH 11/13] v2writable: clarify header cleanups Eric Wong (Contractor, The Linux Foundation)
2018-03-22 9:40 ` [PATCH 12/13] v2writable: DEBUG_DIFF respects $TMPDIR Eric Wong (Contractor, The Linux Foundation)
2018-03-22 9:40 ` [PATCH 13/13] feed: $INBOX/new.atom endpoint supports v2 inboxes Eric Wong (Contractor, The Linux Foundation)
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
List information: https://public-inbox.org/README
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20180322094015.14422-9-e@80x24.org \
--to=e@80x24.org \
--cc=meta@public-inbox.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox,
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).