From: "Eric Wong (Contractor, The Linux Foundation)" <e@80x24.org>
To: meta@public-inbox.org
Subject: [PATCH 11/12] v2writable: round-robin to partitions based on article number
Date: Thu, 22 Feb 2018 21:42:21 +0000 [thread overview]
Message-ID: <20180222214222.1086-12-e@80x24.org> (raw)
In-Reply-To: <20180222214222.1086-1-e@80x24.org>
Instead of relying on the git object_id hash to choose a partition,
round-robin across the partitions based on the NNTP article
number. This reduces contention on the partition pipes, which
occurred when two (or more) sequential messages ended up
going to the same partition.
---
lib/PublicInbox/V2Writable.pm | 34 +++++++++++++++++++++-------------
1 file changed, 21 insertions(+), 13 deletions(-)
diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm
index cb74ab1..cf19c76 100644
--- a/lib/PublicInbox/V2Writable.pm
+++ b/lib/PublicInbox/V2Writable.pm
@@ -17,6 +17,9 @@ $Email::MIME::ContentType::STRICT_PARAMS = 0;
# an estimate of the post-packed size to the raw uncompressed size
my $PACKING_FACTOR = 0.4;
+# assume 2 cores if GNU nproc(1) is not available
+my $NPROC = int($ENV{NPROC} || `nproc 2>/dev/null` || 2);
+
sub new {
my ($class, $v2ibx, $creat) = @_;
my $dir = $v2ibx->{mainrepo} or die "no mainrepo in inbox\n";
@@ -33,7 +36,7 @@ sub new {
im => undef, # PublicInbox::Import
xap_rw => undef, # PublicInbox::V2SearchIdx
xap_ro => undef,
- partitions => 4,
+ partitions => $NPROC,
transact_bytes => 0,
# limit each repo to 1GB or so
rotate_bytes => int((1024 * 1024 * 1024) / $PACKING_FACTOR),
@@ -59,11 +62,11 @@ sub add {
my $oid = $im->{last_object_id};
my ($len, $msgref) = @{$im->{last_object}};
+ $self->idx_init;
+ my $num = $self->{all}->index_mm($mime);
my $nparts = $self->{partitions};
- my $part = hex(substr($oid, 0, 8)) % $nparts;
+ my $part = $num % $nparts;
my $idx = $self->idx_part($part);
- my $all = $self->{all};
- my $num = $all->index_mm($mime);
$idx->index_raw($len, $msgref, $num, $oid);
my $n = $self->{transact_bytes} += $len;
if ($n > (PublicInbox::SearchIdx::BATCH_BYTES * $nparts)) {
@@ -75,21 +78,23 @@ sub add {
sub idx_part {
my ($self, $part) = @_;
- my $idx = $self->{idx_parts};
- return $idx->[$part] if $idx; # fast path
+ $self->{idx_parts}->[$part];
+}
+sub idx_init {
+ my ($self) = @_;
+ return if $self->{idx_parts};
# first time initialization:
- my $all = $self->{all} =
+ my $all = $self->{all} =
PublicInbox::SearchIdxThread->new($self->{-inbox});
# need to create all parts before initializing msgmap FD
my $max = $self->{partitions} - 1;
- $idx = $self->{idx_parts} = [];
+ my $idx = $self->{idx_parts} = [];
for my $i (0..$max) {
push @$idx, PublicInbox::SearchIdxPart->new($self, $i, $all);
}
$all->_msgmap_init->{dbh}->begin_work;
- $idx->[$part];
}
sub remove {
@@ -127,10 +132,12 @@ sub searchidx_checkpoint {
# order matters, we can only close {all} after all partitions
# are done because the partitions also write to {all}
- my $parts = $self->{idx_parts};
- foreach my $idx (@$parts) {
- $idx->remote_commit;
- $idx->remote_close unless $more;
+ if (my $parts = $self->{idx_parts}) {
+ foreach my $idx (@$parts) {
+ $idx->remote_commit;
+ $idx->remote_close unless $more;
+ }
+ delete $self->{idx_parts} unless $more;
}
if (my $all = $self->{all}) {
@@ -140,6 +147,7 @@ sub searchidx_checkpoint {
}
$all->remote_commit;
$all->remote_close unless $more;
+ delete $self->{all} unless $more;
}
$self->{transact_bytes} = 0;
}
--
EW
next prev parent reply other threads:[~2018-02-22 21:42 UTC|newest]
Thread overview: 14+ messages / expand[flat|nested] mbox.gz Atom feed top
2018-02-22 21:42 [WIP PATCH 0/12] v2: git repo rotation + parallel Xapian indexing Eric Wong (Contractor, The Linux Foundation)
2018-02-22 21:42 ` [PATCH 01/12] import: allow the epoch (0s) as a valid time Eric Wong (Contractor, The Linux Foundation)
2018-02-22 21:42 ` [PATCH 02/12] extmsg: fix broken Xapian MID lookup Eric Wong (Contractor, The Linux Foundation)
2018-02-22 21:42 ` [PATCH 03/12] search: stop assuming Message-ID is unique Eric Wong (Contractor, The Linux Foundation)
2018-02-22 21:42 ` [PATCH 04/12] www: stop assuming mainrepo == git_dir Eric Wong (Contractor, The Linux Foundation)
2018-02-22 21:42 ` [PATCH 05/12] v2writable: initial cut for repo-rotation Eric Wong (Contractor, The Linux Foundation)
2018-02-22 21:42 ` [PATCH 06/12] git: reload alternates file on missing blob Eric Wong (Contractor, The Linux Foundation)
2018-02-22 21:42 ` [PATCH 07/12] v2: support Xapian + SQLite indexing Eric Wong (Contractor, The Linux Foundation)
2018-02-22 21:42 ` [PATCH 08/12] import_vger_from_inbox: allow "-V" option Eric Wong (Contractor, The Linux Foundation)
2018-02-22 21:42 ` [PATCH 09/12] import_vger_from_mbox: use PublicInbox::MIME and avoid clobbering Eric Wong (Contractor, The Linux Foundation)
2018-02-22 21:42 ` [PATCH 10/12] v2: parallelize Xapian indexing Eric Wong (Contractor, The Linux Foundation)
2018-02-22 21:42 ` Eric Wong (Contractor, The Linux Foundation) [this message]
2018-02-22 21:42 ` [PATCH 12/12] searchidxpart: increase pipe size for partitions Eric Wong (Contractor, The Linux Foundation)
2018-02-23 1:22 ` [WIP PATCH 0/12] v2: git repo rotation + parallel Xapian indexing Eric Wong
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
List information: https://public-inbox.org/README
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20180222214222.1086-12-e@80x24.org \
--to=e@80x24.org \
--cc=meta@public-inbox.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox,
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).