unofficial mirror of meta@public-inbox.org
 help / color / mirror / Atom feed
From: "Eric Wong (Contractor, The Linux Foundation)" <e@80x24.org>
To: meta@public-inbox.org
Subject: [PATCH 02/27] v2writable: support "barrier" operation to avoid reforking
Date: Mon, 19 Mar 2018 08:14:34 +0000	[thread overview]
Message-ID: <20180319081459.10645-3-e@80x24.org> (raw)
In-Reply-To: <20180319081459.10645-1-e@80x24.org>

Stopping and starting a bunch of processes to look up duplicates
or removals is inefficient.  Take advantage of checkpointing
in "git fast-import" and transactions in Xapian and SQLite.
---
 lib/PublicInbox/Import.pm            | 10 ++++++++-
 lib/PublicInbox/SearchIdxPart.pm     | 12 ++++++++++
 lib/PublicInbox/SearchIdxSkeleton.pm | 43 ++++++++++++++++++++++++++++++++----
 lib/PublicInbox/V2Writable.pm        | 34 +++++++++++++++++++++++++++-
 t/v2writable.t                       |  3 ++-
 5 files changed, 95 insertions(+), 7 deletions(-)

diff --git a/lib/PublicInbox/Import.pm b/lib/PublicInbox/Import.pm
index 664bec6..8406c9e 100644
--- a/lib/PublicInbox/Import.pm
+++ b/lib/PublicInbox/Import.pm
@@ -133,7 +133,6 @@ sub check_remove_v1 {
 	(undef, $cur);
 }
 
-# used for v2 (maybe)
 sub checkpoint {
 	my ($self) = @_;
 	return unless $self->{pid};
@@ -141,6 +140,15 @@ sub checkpoint {
 	undef;
 }
 
+sub progress {
+	my ($self, $msg) = @_;
+	return unless $self->{pid};
+	print { $self->{out} } "progress $msg\n" or wfail;
+	$self->{in}->getline eq "progress $msg\n" or die
+		"progress $msg not received\n";
+	undef;
+}
+
 # used for v2
 sub get_mark {
 	my ($self, $mark) = @_;
diff --git a/lib/PublicInbox/SearchIdxPart.pm b/lib/PublicInbox/SearchIdxPart.pm
index 6d8cb2a..dd7ace6 100644
--- a/lib/PublicInbox/SearchIdxPart.pm
+++ b/lib/PublicInbox/SearchIdxPart.pm
@@ -49,6 +49,11 @@ sub partition_worker_loop ($$$) {
 		} elsif ($line eq "close\n") {
 			$self->_xdb_release;
 			$xdb = $txn = undef;
+		} elsif ($line eq "barrier\n") {
+			$xdb->commit_transaction if $txn;
+			$txn = undef;
+			print { $self->{skeleton}->{w} } "barrier $part\n" or
+					die "write failed to skeleton: $!\n";
 		} else {
 			chomp $line;
 			my ($len, $artnum, $oid, $mid0) = split(/ /, $line);
@@ -81,4 +86,11 @@ sub atfork_child {
 	close $_[0]->{w} or die "failed to close write pipe: $!\n";
 }
 
+# called by V2Writable:
+sub barrier {
+	my $w = $_[0]->{w};
+	print $w "barrier\n" or die "failed to print: $!";
+	$w->flush or die "failed to flush: $!";
+}
+
 1;
diff --git a/lib/PublicInbox/SearchIdxSkeleton.pm b/lib/PublicInbox/SearchIdxSkeleton.pm
index 40b28c5..4cb10f5 100644
--- a/lib/PublicInbox/SearchIdxSkeleton.pm
+++ b/lib/PublicInbox/SearchIdxSkeleton.pm
@@ -15,21 +15,25 @@ sub new {
 
 	my ($r, $w);
 	pipe($r, $w) or die "pipe failed: $!\n";
-	binmode $r, ':raw';
-	binmode $w, ':raw';
+	my ($barrier_wait, $barrier_note);
+	pipe($barrier_wait, $barrier_note) or die "pipe failed: $!\n";
+	binmode $_, ':raw' foreach ($r, $w, $barrier_wait, $barrier_note);
 	my $pid = fork;
 	defined $pid or die "fork failed: $!\n";
 	if ($pid == 0) {
 		$v2writable->atfork_child;
 		$v2writable = undef;
 		close $w;
-		eval { skeleton_worker_loop($self, $r) };
+		close $barrier_wait;
+		eval { skeleton_worker_loop($self, $r, $barrier_note) };
 		die "skeleton worker died: $@\n" if $@;
 		exit;
 	}
 	$self->{w} = $w;
 	$self->{pid} = $pid;
 	close $r;
+	close $barrier_note;
+	$self->{barrier_wait} = $barrier_wait;
 
 	$w->autoflush(1);
 
@@ -40,11 +44,13 @@ sub new {
 }
 
 sub skeleton_worker_loop {
-	my ($self, $r) = @_;
+	my ($self, $r, $barrier_note) = @_;
+	$barrier_note->autoflush(1);
 	$0 = 'pi-v2-skeleton';
 	my $xdb = $self->_xdb_acquire;
 	$xdb->begin_transaction;
 	my $txn = 1;
+	my $barrier = undef;
 	while (my $line = $r->getline) {
 		if ($line eq "commit\n") {
 			$xdb->commit_transaction if $txn;
@@ -52,6 +58,21 @@ sub skeleton_worker_loop {
 		} elsif ($line eq "close\n") {
 			$self->_xdb_release;
 			$xdb = $txn = undef;
+		} elsif ($line =~ /\Abarrier_init (\d+)\n\z/) {
+			my $n = $1 - 1;
+			die "barrier in-progress\n" if defined $barrier;
+			$barrier = { map { $_ => 1 } (0..$n) };
+		} elsif ($line =~ /\Abarrier (\d+)\n\z/) {
+			my $part = $1;
+			die "no barrier in-progress\n" unless defined $barrier;
+			delete $barrier->{$1} or die "unknown barrier: $part\n";
+			if ((scalar keys %$barrier) == 0) {
+				$barrier = undef;
+				$xdb->commit_transaction if $txn;
+				$txn = undef;
+				print $barrier_note "barrier_done\n" or die
+					"print failed to barrier note: $!";
+			}
 		} else {
 			my $len = int($line);
 			my $n = read($r, my $msg, $len) or die "read: $!\n";
@@ -107,4 +128,18 @@ sub index_skeleton_real ($$) {
 	$self->link_and_save($doc, $mids, \@refs, $num, $xpath);
 }
 
+# write to the subprocess
+sub barrier_init {
+	my ($self, $nparts) = @_;
+	my $w = $_[0]->{w};
+	print $w "barrier_init $nparts\n" or die "failed to write: $!";
+	$w->flush or die "failed to flush: $!";
+}
+
+sub barrier_wait {
+	my ($self) = @_;
+	my $l = $self->{barrier_wait}->getline;
+	$l eq "barrier_done\n" or die "bad response from barrier_wait: $l\n";
+}
+
 1;
diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm
index 7728b91..6e2a8d6 100644
--- a/lib/PublicInbox/V2Writable.pm
+++ b/lib/PublicInbox/V2Writable.pm
@@ -113,7 +113,7 @@ sub num_for {
 		};
 
 		# crap, Message-ID is already known, hope somebody just resent:
-		$self->done; # write barrier, clears $self->{skel}
+		$self->barrier;
 		foreach my $m (@$mids) {
 			# read-only lookup now safe to do after above barrier
 			my $existing = $self->lookup_content($mime, $m);
@@ -228,6 +228,37 @@ sub checkpoint {
 	$self->searchidx_checkpoint(1);
 }
 
+# issue a write barrier to ensure all data is visible to other processes
+# and read-only ops.  Order of data importance is: git > SQLite > Xapian
+sub barrier {
+	my ($self) = @_;
+
+	# For safety, we ensure git checkpoint is complete before because
+	# the data in git is still more important than what is in Xapian.
+	# Performance may be gained by delaying ->progress call but we
+	# lose safety
+	if (my $im = $self->{im}) {
+		$im->checkpoint;
+		$im->progress('checkpoint');
+	}
+	my $skel = $self->{skel};
+	my $parts = $self->{idx_parts};
+	if ($parts && $skel) {
+		my $dbh = $skel->{mm}->{dbh};
+		$dbh->commit; # SQLite data is second in importance
+
+		# Now deal with Xapian
+		$skel->barrier_init(scalar(@$parts));
+		# each partition needs to issue a barrier command to skel:
+		$_->barrier foreach @$parts;
+
+		$skel->barrier_wait; # wait for each Xapian partition
+
+		$dbh->begin_work;
+	}
+	$self->{transact_bytes} = 0;
+}
+
 sub searchidx_checkpoint {
 	my ($self, $more) = @_;
 
@@ -349,6 +380,7 @@ sub lookup_content {
 	my $ibx = $self->{-inbox};
 
 	my $srch = $ibx->search;
+	$srch->reopen;
 	my $cid = content_id($mime);
 	my $found;
 	$srch->each_smsg_by_mid($mid, sub {
diff --git a/t/v2writable.t b/t/v2writable.t
index 404c865..7d276da 100644
--- a/t/v2writable.t
+++ b/t/v2writable.t
@@ -55,7 +55,7 @@ if ('ensure git configs are correct') {
 {
 	my @warn;
 	local $SIG{__WARN__} = sub { push @warn, @_ };
-	is(undef, $im->add($mime), 'obvious duplicate rejected');
+	is($im->add($mime), undef, 'obvious duplicate rejected');
 	like(join(' ', @warn), qr/resent/, 'warned about resent message');
 
 	@warn = ();
@@ -105,6 +105,7 @@ if ('ensure git configs are correct') {
 	ok($im->add($mime), 'message with multiple Message-ID');
 	$im->done;
 	my @found;
+	$ibx->search->reopen;
 	$ibx->search->each_smsg_by_mid('abcde@1', sub { push @found, @_; 1 });
 	is(scalar(@found), 1, 'message found by first MID');
 	$ibx->search->each_smsg_by_mid('abcde@2', sub { push @found, @_; 1 });
-- 
EW


  parent reply	other threads:[~2018-03-19  8:14 UTC|newest]

Thread overview: 29+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2018-03-19  8:14 [PATCH 00/27] v2 public-inbox-watch support Eric Wong (Contractor, The Linux Foundation)
2018-03-19  8:14 ` [PATCH 01/27] content_id: use Sender header if From is not available Eric Wong (Contractor, The Linux Foundation)
2018-03-19  8:14 ` Eric Wong (Contractor, The Linux Foundation) [this message]
2018-03-19  8:14 ` [PATCH 03/27] use string ref for Email::Simple->new Eric Wong (Contractor, The Linux Foundation)
2018-03-19  8:14 ` [PATCH 04/27] v2writable: remove unnecessary idx_init call Eric Wong (Contractor, The Linux Foundation)
2018-03-19  8:14 ` [PATCH 05/27] searchidx: do not delete documents while iterating Eric Wong (Contractor, The Linux Foundation)
2018-03-19  8:14 ` [PATCH 06/27] search: allow ->reopen to be chainable Eric Wong (Contractor, The Linux Foundation)
2018-03-19  8:14 ` [PATCH 07/27] v2writable: implement remove correctly Eric Wong (Contractor, The Linux Foundation)
2018-03-19  8:14 ` [PATCH 08/27] skeleton: barrier init requires a lock Eric Wong (Contractor, The Linux Foundation)
2018-03-19  8:14 ` [PATCH 09/27] import: (v2) delete writes the blob into history in subdir Eric Wong (Contractor, The Linux Foundation)
2018-03-19  8:14 ` [PATCH 10/27] import: (v2): write deletes to a separate '_' subdirectory Eric Wong (Contractor, The Linux Foundation)
2018-03-19  8:14 ` [PATCH 11/27] import: implement barrier operation for v1 repos Eric Wong (Contractor, The Linux Foundation)
2018-03-19  8:14 ` [PATCH 12/27] mid: mid_mime uses v2-compatible mids function Eric Wong (Contractor, The Linux Foundation)
2018-03-19  8:14 ` [PATCH 13/27] watchmaildir: use content_digest to generate Message-Id Eric Wong (Contractor, The Linux Foundation)
2018-03-19  8:14 ` [PATCH 14/27] import: force Message-ID generation for v1 here Eric Wong (Contractor, The Linux Foundation)
2018-03-19  8:14 ` [PATCH 15/27] import: switch to URL-safe Base64 for Message-IDs Eric Wong (Contractor, The Linux Foundation)
2018-03-19  8:14 ` [PATCH 16/27] v2writable: test for idempotent removals Eric Wong (Contractor, The Linux Foundation)
2018-03-19  8:14 ` [PATCH 17/27] import: enable locking under v2 Eric Wong (Contractor, The Linux Foundation)
2018-03-19  8:14 ` [PATCH 18/27] index: s/GIT_DIR/REPO_DIR/ Eric Wong (Contractor, The Linux Foundation)
2018-03-19  8:14 ` [PATCH 19/27] Lock: new base class for writable lockers Eric Wong (Contractor, The Linux Foundation)
2018-03-19  8:14 ` [PATCH 20/27] t/watch_maildir: note the reason for FIFO creation Eric Wong (Contractor, The Linux Foundation)
2018-03-19  8:14 ` [PATCH 21/27] v2writable: ensure ->done is idempotent Eric Wong (Contractor, The Linux Foundation)
2018-03-19  8:14 ` [PATCH 22/27] watchmaildir: support v2 repositories Eric Wong (Contractor, The Linux Foundation)
2018-03-19  8:14 ` [PATCH 23/27] searchidxpart: s/barrier/remote_barrier/ Eric Wong (Contractor, The Linux Foundation)
2018-03-19  8:14 ` [PATCH 24/27] v2writable: allow disabling parallelization Eric Wong (Contractor, The Linux Foundation)
2018-03-19  8:14 ` [PATCH 25/27] scripts/import_vger_from_mbox: filter out same headers as MDA Eric Wong (Contractor, The Linux Foundation)
2018-03-19  8:14 ` [PATCH 26/27] v2writable: add DEBUG_DIFF env support Eric Wong (Contractor, The Linux Foundation)
2018-03-19  8:14 ` [PATCH 27/27] v2writable: remove "resent" message for duplicate Message-IDs Eric Wong (Contractor, The Linux Foundation)
2018-03-19  8:18   ` SQUASH: " Eric Wong

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

  List information: https://public-inbox.org/README

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20180319081459.10645-3-e@80x24.org \
    --to=e@80x24.org \
    --cc=meta@public-inbox.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

  Be sure your reply has a Subject: header at the top and a blank line
  before the message body.
This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox,
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).