unofficial mirror of meta@public-inbox.org
 help / color / mirror / Atom feed
From: "Eric Wong (Contractor, The Linux Foundation)" <e@80x24.org>
To: meta@public-inbox.org
Subject: [PATCH 11/12] v2writable: round-robin to partitions based on article number
Date: Thu, 22 Feb 2018 21:42:21 +0000	[thread overview]
Message-ID: <20180222214222.1086-12-e@80x24.org> (raw)
In-Reply-To: <20180222214222.1086-1-e@80x24.org>

Instead of relying on the git object_id hash to choose a partition,
round-robin across the partitions based on the NNTP article
number.  This makes the partition pipes less of a source of
contention, which occurred when two (or more) sequential
messages ended up going to the same partition.
---
 lib/PublicInbox/V2Writable.pm | 34 +++++++++++++++++++++-------------
 1 file changed, 21 insertions(+), 13 deletions(-)

diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm
index cb74ab1..cf19c76 100644
--- a/lib/PublicInbox/V2Writable.pm
+++ b/lib/PublicInbox/V2Writable.pm
@@ -17,6 +17,9 @@ $Email::MIME::ContentType::STRICT_PARAMS = 0;
 # an estimate of the post-packed size to the raw uncompressed size
 my $PACKING_FACTOR = 0.4;
 
+# assume 2 cores if GNU nproc(1) is not available
+my $NPROC = int($ENV{NPROC} || `nproc 2>/dev/null` || 2);
+
 sub new {
 	my ($class, $v2ibx, $creat) = @_;
 	my $dir = $v2ibx->{mainrepo} or die "no mainrepo in inbox\n";
@@ -33,7 +36,7 @@ sub new {
 		im => undef, #  PublicInbox::Import
 		xap_rw => undef, # PublicInbox::V2SearchIdx
 		xap_ro => undef,
-		partitions => 4,
+		partitions => $NPROC,
 		transact_bytes => 0,
 		# limit each repo to 1GB or so
 		rotate_bytes => int((1024 * 1024 * 1024) / $PACKING_FACTOR),
@@ -59,11 +62,11 @@ sub add {
 	my $oid = $im->{last_object_id};
 	my ($len, $msgref) = @{$im->{last_object}};
 
+	$self->idx_init;
+	my $num = $self->{all}->index_mm($mime);
 	my $nparts = $self->{partitions};
-	my $part = hex(substr($oid, 0, 8)) % $nparts;
+	my $part = $num % $nparts;
 	my $idx = $self->idx_part($part);
-	my $all = $self->{all};
-	my $num = $all->index_mm($mime);
 	$idx->index_raw($len, $msgref, $num, $oid);
 	my $n = $self->{transact_bytes} += $len;
 	if ($n > (PublicInbox::SearchIdx::BATCH_BYTES * $nparts)) {
@@ -75,21 +78,23 @@ sub add {
 
 sub idx_part {
 	my ($self, $part) = @_;
-	my $idx = $self->{idx_parts};
-	return $idx->[$part] if $idx; # fast path
+	$self->{idx_parts}->[$part];
+}
 
+sub idx_init {
+	my ($self) = @_;
+	return if $self->{idx_parts};
 	# first time initialization:
-	my $all = $self->{all} = 
+	my $all = $self->{all} =
 		PublicInbox::SearchIdxThread->new($self->{-inbox});
 
 	# need to create all parts before initializing msgmap FD
 	my $max = $self->{partitions} - 1;
-	$idx = $self->{idx_parts} = [];
+	my $idx = $self->{idx_parts} = [];
 	for my $i (0..$max) {
 		push @$idx, PublicInbox::SearchIdxPart->new($self, $i, $all);
 	}
 	$all->_msgmap_init->{dbh}->begin_work;
-	$idx->[$part];
 }
 
 sub remove {
@@ -127,10 +132,12 @@ sub searchidx_checkpoint {
 	# order matters, we can only close {all} after all partitions
 	# are done because the partitions also write to {all}
 
-	my $parts = $self->{idx_parts};
-	foreach my $idx (@$parts) {
-		$idx->remote_commit;
-		$idx->remote_close unless $more;
+	if (my $parts = $self->{idx_parts}) {
+		foreach my $idx (@$parts) {
+			$idx->remote_commit;
+			$idx->remote_close unless $more;
+		}
+		delete $self->{idx_parts} unless $more;
 	}
 
 	if (my $all = $self->{all}) {
@@ -140,6 +147,7 @@ sub searchidx_checkpoint {
 		}
 		$all->remote_commit;
 		$all->remote_close unless $more;
+		delete $self->{all} unless $more;
 	}
 	$self->{transact_bytes} = 0;
 }
-- 
EW


  parent reply	other threads:[~2018-02-22 21:42 UTC|newest]

Thread overview: 14+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2018-02-22 21:42 [WIP PATCH 0/12] v2: git repo rotation + parallel Xapian indexing Eric Wong (Contractor, The Linux Foundation)
2018-02-22 21:42 ` [PATCH 01/12] import: allow the epoch (0s) as a valid time Eric Wong (Contractor, The Linux Foundation)
2018-02-22 21:42 ` [PATCH 02/12] extmsg: fix broken Xapian MID lookup Eric Wong (Contractor, The Linux Foundation)
2018-02-22 21:42 ` [PATCH 03/12] search: stop assuming Message-ID is unique Eric Wong (Contractor, The Linux Foundation)
2018-02-22 21:42 ` [PATCH 04/12] www: stop assuming mainrepo == git_dir Eric Wong (Contractor, The Linux Foundation)
2018-02-22 21:42 ` [PATCH 05/12] v2writable: initial cut for repo-rotation Eric Wong (Contractor, The Linux Foundation)
2018-02-22 21:42 ` [PATCH 06/12] git: reload alternates file on missing blob Eric Wong (Contractor, The Linux Foundation)
2018-02-22 21:42 ` [PATCH 07/12] v2: support Xapian + SQLite indexing Eric Wong (Contractor, The Linux Foundation)
2018-02-22 21:42 ` [PATCH 08/12] import_vger_from_inbox: allow "-V" option Eric Wong (Contractor, The Linux Foundation)
2018-02-22 21:42 ` [PATCH 09/12] import_vger_from_mbox: use PublicInbox::MIME and avoid clobbering Eric Wong (Contractor, The Linux Foundation)
2018-02-22 21:42 ` [PATCH 10/12] v2: parallelize Xapian indexing Eric Wong (Contractor, The Linux Foundation)
2018-02-22 21:42 ` Eric Wong (Contractor, The Linux Foundation) [this message]
2018-02-22 21:42 ` [PATCH 12/12] searchidxpart: increase pipe size for partitions Eric Wong (Contractor, The Linux Foundation)
2018-02-23  1:22 ` [WIP PATCH 0/12] v2: git repo rotation + parallel Xapian indexing Eric Wong

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

  List information: https://public-inbox.org/README

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20180222214222.1086-12-e@80x24.org \
    --to=e@80x24.org \
    --cc=meta@public-inbox.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

  Be sure your reply has a Subject: header at the top and a blank line
  before the message body.
This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox,
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).