From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.2 (2018-09-13) on dcvr.yhbt.net X-Spam-Level: X-Spam-ASN: X-Spam-Status: No, score=-4.0 required=3.0 tests=ALL_TRUSTED,BAYES_00 shortcircuit=no autolearn=ham autolearn_force=no version=3.4.2 Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id E93EF1F8C6 for ; Fri, 25 Jun 2021 01:06:39 +0000 (UTC) From: Eric Wong To: meta@public-inbox.org Subject: [PATCH] extindex: maintain pack symlinks and use "git multi-pack-index" Date: Fri, 25 Jun 2021 01:06:39 +0000 Message-Id: <20210625010639.32244-1-e@80x24.org> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit List-Id: This is a fair amount of complexity, but it speeds up "git cat-file --batch" startup by 3-4% with 50K packfiles with a hot kernel cache. This appears extremely sensitive to RAM available to the kernel page cache with my SATA 2 SSD. Faster storage and more RAM can bring pack loading times down. 2.60s vs 2.69s were the best cases on my workstation with and without the multi-pack-index, respectively; however, times could be all over the place (even in the minutes) with more activity on my workstation. Getting sub-minute times requires a git patch to speed up alt_odb_usable(); otherwise, prepare to wait several minutes. 
--- lib/PublicInbox/ExtSearchIdx.pm | 49 +++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) diff --git a/lib/PublicInbox/ExtSearchIdx.pm b/lib/PublicInbox/ExtSearchIdx.pm index 08f2295a..a14f0652 100644 --- a/lib/PublicInbox/ExtSearchIdx.pm +++ b/lib/PublicInbox/ExtSearchIdx.pm @@ -20,6 +20,7 @@ use parent qw(PublicInbox::ExtSearch PublicInbox::Lock); use Carp qw(croak carp); use Sys::Hostname qw(hostname); use POSIX qw(strftime); +use File::Glob qw(bsd_glob GLOB_NOSORT); use PublicInbox::Search; use PublicInbox::SearchIdx qw(prepare_stack is_ancestor is_bad_blob); use PublicInbox::OverIdx; @@ -930,6 +931,31 @@ sub _idx_init { # with_umask callback $self->{midx} = PublicInbox::MiscIdx->new($self); } +sub symlink_packs ($$) { + my ($ibx, $pd) = @_; + my $ret = 0; + my $glob = "$ibx->{inboxdir}/git/*.git/objects/pack/*.idx"; + for my $idx (bsd_glob($glob, GLOB_NOSORT)) { + my $src = substr($idx, 0, -length('.idx')); + my $dst = $pd . substr($src, rindex($src, '/')); + if (-f "$src.pack" and + symlink("$src.pack", "$dst.pack") and + symlink($idx, "$dst.idx") and + -f $idx) { + ++$ret; + # .promisor and .keep are optional + # XXX should we symlink .keep here? 
+ for my $s (qw(promisor)) { + symlink("$src.$s", "$dst.$s") if -f "$src.$s"; + } + } elsif (!$!{EEXIST}) { + warn "W: ln -s $src.{pack,idx} => $dst.*: $!\n"; + unlink "$dst.pack", "$dst.idx"; + } + } + $ret; +} + sub idx_init { # similar to V2Writable my ($self, $opt) = @_; return if $self->{idx_shards}; @@ -985,7 +1011,24 @@ sub idx_init { # similar to V2Writable } } } + # git-multi-pack-index(1) can speed up "git cat-file" startup slightly + my $dh; + my $git_midx = 0; + my $pd = "$ALL/objects/pack"; + if (!mkdir($pd) && $!{EEXIST} && opendir($dh, $pd)) { + # drop stale symlinks + while (defined(my $dn = readdir($dh))) { + if ($dn =~ /\.(?:idx|pack|promisor)\z/) { + my $f = "$pd/$dn"; + unlink($f) if -l $f && !-e $f; + } + } + undef $dh; + } for my $ibx (@{$self->{ibx_list}}) { + # create symlinks for multi-pack-index + $git_midx += symlink_packs($ibx, $pd); + # add new lines to our alternates file my $line = $ibx->git->{git_dir} . "/objects\n"; chomp(my $d = $line); if (my @st = stat($d)) { @@ -1001,6 +1044,12 @@ sub idx_init { # similar to V2Writable my $o = \@old; PublicInbox::V2Writable::write_alternates($info_dir, $mode, $o); } + $git_midx and $self->with_umask(sub { + my @cmd = ('multi-pack-index'); + push @cmd, '--no-progress' if ($opt->{quiet}//0) > 1; + system('git', "--git-dir=$ALL", @cmd, 'write'); + # ignore errors, fairly new command, may not exist + }); $self->parallel_init($self->{indexlevel}); $self->with_umask(\&_idx_init, $self, $opt); $self->{oidx}->begin_lazy;