unofficial mirror of meta@public-inbox.org
 help / color / mirror / Atom feed
* [PATCH] cindex: support sha256 coderepos alongside sha1
@ 2023-04-19 21:54 Eric Wong
  2023-04-20  0:53 ` [PATCH 2/1] cindex: limit parallelism of extensions.objectFormat check Eric Wong
  0 siblings, 1 reply; 2+ messages in thread
From: Eric Wong @ 2023-04-19 21:54 UTC (permalink / raw)
  To: meta

This special support is only needed for --prune at the moment
since the indexing side works on a per-repo basis.  There are no
automated tests yet, but it seems to work well on my sha256
projects when sharing a cindex with sha1 projects.
---
 lib/PublicInbox/CodeSearchIdx.pm | 100 +++++++++++++++++++++----------
 lib/PublicInbox/Git.pm           |   4 +-
 lib/PublicInbox/Import.pm        |  10 +++-
 3 files changed, 79 insertions(+), 35 deletions(-)

diff --git a/lib/PublicInbox/CodeSearchIdx.pm b/lib/PublicInbox/CodeSearchIdx.pm
index 78032c00..54dbf785 100644
--- a/lib/PublicInbox/CodeSearchIdx.pm
+++ b/lib/PublicInbox/CodeSearchIdx.pm
@@ -31,6 +31,7 @@ use PublicInbox::Config qw(glob2re);
 use PublicInbox::Spawn qw(spawn popen_rd);
 use PublicInbox::OnDestroy;
 use PublicInbox::CidxLogP;
+use PublicInbox::Git qw(%OFMT2HEXLEN);
 use Socket qw(MSG_EOR);
 use Carp ();
 our (
@@ -44,7 +45,6 @@ our (
 	@RDONLY_XDB, # Xapian::Database
 	@IDX_SHARDS, # clones of self
 	$MAX_SIZE,
-	$TMP_GIT, # PublicInbox::Git object for --prune
 	$REINDEX, # PublicInbox::SharedKV
 	@GIT_DIR_GONE, # [ git_dir1, git_dir2 ]
 	%TO_PRUNE, # (docid => docid) mapping (hash in case of retry_reopen)
@@ -58,6 +58,9 @@ our (
 	%ACTIVE_GIT_DIR, # GIT_DIR => undef mapping for prune
 	$IDX_TODO, # [ $git0, $root0, $git1, $root1, ...]
 	$GIT_TODO, # [ GIT_DIR0, GIT_DIR1, ...]
+	%HEXLEN2TMPGIT, # ((40|64) => PublicInbox::Git for prune)
+	%ALT_FH, # '', or 'sha256' => tmp IO for TMPGIT alternates
+	$TMPDIR, # File::Temp->newdir object
 );
 
 # stop walking history if we see >$SEEN_MAX existing commits, this assumes
@@ -701,28 +704,33 @@ sub event_step { # may be requeued via DS
 			scalar(@cmt) == 1 or warn
 "BUG? shard[$self->{shard}] #$PRUNE_CUR has multiple commits: @cmt";
 			for my $o (@cmt) {
-				$TMP_GIT->check_async($o, \&prune_cb,
-							[$self, $PRUNE_CUR])
+				$HEXLEN2TMPGIT{length($o)}->check_async($o,
+						\&prune_cb, [$self, $PRUNE_CUR])
 			}
 		}
 	}
-	$TMP_GIT->async_wait_all;
+	$_->async_wait_all for (values %HEXLEN2TMPGIT);
 	cidx_ckpoint($self);
 	return PublicInbox::DS::requeue($self) if $PRUNE_CUR <= $PRUNE_MAX;
 	send($PRUNE_OP_P, "prune_done $self->{shard}", MSG_EOR);
 	$PRUNE_NR //= 0;
 	progress($self, "prune [$self->{shard}] $PRUNE_NR done");
-	$TMP_GIT->cleanup;
-	$TMP_GIT = $PRUNE_OP_P = $PRUNE_CUR = $PRUNE_MAX = undef;
-	%ACTIVE_GIT_DIR = ();
+	$_->cleanup for (values %HEXLEN2TMPGIT);
+	$PRUNE_OP_P = $PRUNE_CUR = $PRUNE_MAX = undef;
+	undef %ACTIVE_GIT_DIR;
+	undef %HEXLEN2TMPGIT;
 }
 
 sub prune_start { # via wq_io_do in IDX_SHARDS
-	my ($self, $git_dir, @active_git_dir) = @_;
+	my ($self, $tmpdir, @active_git_dir) = @_;
 	$PRUNE_CUR = 1;
 	$PRUNE_OP_P = delete $self->{0} // die 'BUG: no {0} op_p';
 	%ACTIVE_GIT_DIR = map { $_ => undef } @active_git_dir;
-	$TMP_GIT = PublicInbox::Git->new($git_dir); # TMP_GIT copy
+	for my $git_dir (<$tmpdir/*.git>) {
+		my ($hexlen) = ($git_dir =~ m!/hexlen([0-9]+)\.git\z!);
+		$hexlen or die "BUG: no hexlen in $git_dir";
+		$HEXLEN2TMPGIT{$hexlen} = PublicInbox::Git->new($git_dir);
+	}
 	$self->begin_txn_lazy;
 	$PRUNE_MAX = $self->{xdb}->get_lastdocid // 1;
 	event_step($self);
@@ -750,24 +758,6 @@ sub parent_quit {
 	warn "# SIG$_[0] received, quitting...\n";
 }
 
-sub init_tmp_git_dir ($) {
-	my ($self) = @_;
-	require File::Temp;
-	require PublicInbox::Import;
-	my $tmp = File::Temp->newdir('cidx-all-git-XXXX', TMPDIR => 1);
-	PublicInbox::Import::init_bare("$tmp", 'cidx-all');
-	my $f = "$tmp/objects/info/alternates";
-	open my $fh, '>', $f or die "open($f): $!";
-	my $o;
-	for (@{$self->{git_dirs}}) { # TODO: sha256 check?
-		$o = $_.'/objects';
-		say $fh $o if -d $o;
-	}
-	close $fh or die "close($f): $!";
-	$TMP_GIT = PublicInbox::Git->new("$tmp");
-	$TMP_GIT->{-tmp} = $tmp;
-}
-
 sub prep_umask ($) {
 	my ($self) = @_;
 	if ($self->{-cidx_internal}) { # respect core.sharedRepository
@@ -789,16 +779,60 @@ sub prep_umask ($) {
 	}
 }
 
-sub start_prune ($) {
+sub prep_alternate { # awaitpid callback for config extensions.objectFormat
+	my ($pid, $objdir, $out, $send_prune) = @_; # $out: tmp FH of stdout
+	my ($status, $sig) = ($? >> 8, $? & 127);
+	my $fmt;
+	if (!$sig && $status == 1) { # key unset, git defaults to SHA-1
+		$fmt = 'sha1';
+	} elsif (!$sig && $status == 0) {
+		seek($out, 0, SEEK_SET) or die "seek: $!";
+		chomp($fmt = <$out> // 'sha1');
+	} else { # killed by a signal or unexpected exit code, skip objdir
+		return warn("git config \$?=$? for objdir=$objdir");
+	}
+	my $hexlen = $OFMT2HEXLEN{$fmt} // return warn <<EOM;
+E: ignoring objdir=$objdir, unknown extensions.objectFormat=$fmt
+EOM
+	unless ($ALT_FH{$fmt}) { # first objdir seen for this format
+		my $git_dir = "$TMPDIR/hexlen$hexlen.git";
+		PublicInbox::Import::init_bare($git_dir, 'cidx-all', $fmt);
+		my $f = "$git_dir/objects/info/alternates";
+		open $ALT_FH{$fmt}, '>', $f or die "open($f): $!";
+	}
+	say { $ALT_FH{$fmt} } $objdir or die "say: $!"; # one objdir per line
+	# send_prune fires on the last one
+}
+
+sub init_prune ($) {
 	my ($self) = @_;
 	return (@$PRUNE_DONE = map { 1 } @IDX_SHARDS) if !$self->{-opt}->{prune};
-	init_tmp_git_dir($self);
+
+	require File::Temp;
+	require PublicInbox::Import;
+	$TMPDIR = File::Temp->newdir('cidx-all-git-XXXX', TMPDIR => 1);
+	my $send_prune = PublicInbox::OnDestroy->new($$, \&send_prune, $self);
+	my $cmd = [ 'git', undef, 'config', 'extensions.objectFormat' ];
+	for (@{$self->{git_dirs}}) {
+		my $o = $_.'/objects';
+		next if !-d $o;
+		$cmd->[1] = "--git-dir=$_";
+		open my $out, '+>', undef or die "open(tmp): $!";
+		my $pid = spawn($cmd, undef, { 1 => $out });
+		awaitpid($pid, \&prep_alternate, $o, $out, $send_prune);
+	}
+}
+
+sub send_prune { # OnDestroy when `git config extensions.objectFormat' are done
+	my ($self) = @_;
+	for (values %ALT_FH) { close $_ or die "close: $!" }
+	%ALT_FH = ();
 	my @active_git_dir = (@{$self->{git_dirs}}, @GIT_DIR_GONE);
 	my ($c, $p) = PublicInbox::PktOp->pair;
 	$c->{ops}->{prune_done} = [ $self ];
 	for my $s (@IDX_SHARDS) {
 		$s->wq_io_do('prune_start', [ $p->{op_p} ],
-				$TMP_GIT->{git_dir}, @active_git_dir)
+				"$TMPDIR", @active_git_dir)
 	}
 }
 
@@ -812,8 +846,8 @@ sub cidx_run { # main entry point
 	local $LIVE = {};
 	local $PRUNE_DONE = [];
 	local $IDX_TODO = [];
-	local ($DO_QUIT, $TMP_GIT, $REINDEX, $TXN_BYTES, @GIT_DIR_GONE,
-		$GIT_TODO, $REPO_CTX);
+	local ($DO_QUIT, $REINDEX, $TXN_BYTES, @GIT_DIR_GONE,
+		$GIT_TODO, $REPO_CTX, %ALT_FH, $TMPDIR, %HEXLEN2TMPGIT);
 	local $BATCH_BYTES = $self->{-opt}->{batch_size} //
 				$PublicInbox::SearchIdx::BATCH_BYTES;
 	local @IDX_SHARDS = cidx_init($self);
@@ -859,7 +893,7 @@ sub cidx_run { # main entry point
 	local $LIVE_JOBS = $self->{-opt}->{jobs} ||
 			PublicInbox::IPC::detect_nproc() || 2;
 	local @RDONLY_XDB = $self->xdb_shards_flat;
-	start_prune($self);
+	init_prune($self);
 	scan_git_dirs($self) if $self->{-opt}->{scan} // 1;
 
 	local @PublicInbox::DS::post_loop_do = (\&shards_active);
diff --git a/lib/PublicInbox/Git.pm b/lib/PublicInbox/Git.pm
index 3108ed85..61ba8aa1 100644
--- a/lib/PublicInbox/Git.pm
+++ b/lib/PublicInbox/Git.pm
@@ -22,7 +22,9 @@ use IO::Poll qw(POLLIN);
 use Carp qw(croak carp);
 use PublicInbox::SHA ();
 use PublicInbox::DS qw(awaitpid);
-our @EXPORT_OK = qw(git_unquote git_quote);
+our %HEXLEN2SHA = (40 => 1, 64 => 256);
+our %OFMT2HEXLEN = (sha1 => 40, sha256 => 64);
+our @EXPORT_OK = qw(git_unquote git_quote %HEXLEN2SHA %OFMT2HEXLEN);
 our $PIPE_BUFSIZ = 65536; # Linux default
 our $in_cleanup;
 our $RDTIMEO = 60_000; # milliseconds
diff --git a/lib/PublicInbox/Import.pm b/lib/PublicInbox/Import.pm
index 39719bcb..59462e9a 100644
--- a/lib/PublicInbox/Import.pm
+++ b/lib/PublicInbox/Import.pm
@@ -461,13 +461,21 @@ my @INIT_FILES = ('HEAD' => undef, # filled in at runtime
 EOC
 
 sub init_bare {
-	my ($dir, $head) = @_; # or self
+	my ($dir, $head, $fmt) = @_; # or self
 	$dir = $dir->{git}->{git_dir} if ref($dir);
 	require File::Path;
 	File::Path::make_path(map { $dir.$_ } qw(/objects/info /refs/heads));
 	$INIT_FILES[1] //= 'ref: '.default_branch."\n";
 	my @fn_contents = @INIT_FILES;
 	$fn_contents[1] = "ref: refs/heads/$head\n" if defined $head;
+	$fn_contents[3] = <<EOM if defined($fmt) && $fmt ne 'sha1';
+[core]
+	repositoryFormatVersion = 1
+	filemode = true
+	bare = true
+[extensions]
+	objectFormat = $fmt
+EOM
 	while (my ($fn, $contents) = splice(@fn_contents, 0, 2)) {
 		my $f = $dir.'/'.$fn;
 		next if -f $f;

^ permalink raw reply related	[flat|nested] 2+ messages in thread

* [PATCH 2/1] cindex: limit parallelism of extensions.objectFormat check
  2023-04-19 21:54 [PATCH] cindex: support sha256 coderepos alongside sha1 Eric Wong
@ 2023-04-20  0:53 ` Eric Wong
  0 siblings, 0 replies; 2+ messages in thread
From: Eric Wong @ 2023-04-20  0:53 UTC (permalink / raw)
  To: meta

We can't safely spawn all `git config' processes of every
indexed git directory at once due to system resource limits
(RLIMIT_NPROC, RLIMIT_NOFILE). So queue them up and limit
parallelism that way.
---
 lib/PublicInbox/CodeSearchIdx.pm | 33 +++++++++++++++++++++-----------
 1 file changed, 22 insertions(+), 11 deletions(-)

diff --git a/lib/PublicInbox/CodeSearchIdx.pm b/lib/PublicInbox/CodeSearchIdx.pm
index 54dbf785..97123133 100644
--- a/lib/PublicInbox/CodeSearchIdx.pm
+++ b/lib/PublicInbox/CodeSearchIdx.pm
@@ -61,6 +61,7 @@ our (
 	%HEXLEN2TMPGIT, # ((40|64) => PublicInbox::Git for prune)
 	%ALT_FH, # '', or 'sha256' => tmp IO for TMPGIT alternates
 	$TMPDIR, # File::Temp->newdir object
+	@PRUNE_QUEUE, # GIT_DIRs to prepare for pruning
 );
 
 # stop walking history if we see >$SEEN_MAX existing commits, this assumes
@@ -779,9 +780,11 @@ sub prep_umask ($) {
 	}
 }
 
-sub prep_alternate { # awaitpid callback for config extensions.objectFormat
+sub prep_alternate_end { # awaitpid callback for config extensions.objectFormat
 	my ($pid, $objdir, $out, $send_prune) = @_;
 	my $status = $? >> 8;
+	my $next_dir = shift(@PRUNE_QUEUE);
+	prep_alternate_start($next_dir, $send_prune) if defined($next_dir);
 	my $fmt;
 	if ($status == 1) { # unset, default is '' (SHA-1)
 		$fmt = 'sha1';
@@ -801,7 +804,20 @@ EOM
 		open $ALT_FH{$fmt}, '>', $f or die "open($f): $!";
 	}
 	say { $ALT_FH{$fmt} } $out or die "say: $!";
-	# send_prune fires on the last one
+}
+
+sub prep_alternate_start { # spawn `git config' for the next usable GIT_DIR
+	my ($git_dir, $send_prune) = @_;
+	my $o = $git_dir.'/objects';
+	while (!-d $o) { # skip GIT_DIRs lacking objects/, stop if none left
+		$git_dir = shift(@PRUNE_QUEUE) // return;
+		$o = $git_dir.'/objects';
+	}
+	my $cmd = [ 'git', "--git-dir=$git_dir",
+			qw(config extensions.objectFormat) ];
+	open my $out, '+>', undef or die "open(tmp): $!"; # anonymous tmpfile
+	my $pid = spawn($cmd, undef, { 1 => $out });
+	awaitpid($pid, \&prep_alternate_end, $o, $out, $send_prune);
 }
 
 sub init_prune ($) {
@@ -812,14 +828,9 @@ sub init_prune ($) {
 	require PublicInbox::Import;
 	$TMPDIR = File::Temp->newdir('cidx-all-git-XXXX', TMPDIR => 1);
 	my $send_prune = PublicInbox::OnDestroy->new($$, \&send_prune, $self);
-	my $cmd = [ 'git', undef, 'config', 'extensions.objectFormat' ];
-	for (@{$self->{git_dirs}}) {
-		my $o = $_.'/objects';
-		next if !-d $o;
-		$cmd->[1] = "--git-dir=$_";
-		open my $out, '+>', undef or die "open(tmp): $!";
-		my $pid = spawn($cmd, undef, { 1 => $out });
-		awaitpid($pid, \&prep_alternate, $o, $out, $send_prune);
+	@PRUNE_QUEUE = @{$self->{git_dirs}};
+	for (1..$LIVE_JOBS) {
+		prep_alternate_start(shift(@PRUNE_QUEUE) // last, $send_prune);
 	}
 }
 
@@ -846,7 +857,7 @@ sub cidx_run { # main entry point
 	local $LIVE = {};
 	local $PRUNE_DONE = [];
 	local $IDX_TODO = [];
-	local ($DO_QUIT, $REINDEX, $TXN_BYTES, @GIT_DIR_GONE,
+	local ($DO_QUIT, $REINDEX, $TXN_BYTES, @GIT_DIR_GONE, @PRUNE_QUEUE,
 		$GIT_TODO, $REPO_CTX, %ALT_FH, $TMPDIR, %HEXLEN2TMPGIT);
 	local $BATCH_BYTES = $self->{-opt}->{batch_size} //
 				$PublicInbox::SearchIdx::BATCH_BYTES;

^ permalink raw reply related	[flat|nested] 2+ messages in thread

end of thread, other threads:[~2023-04-20  0:53 UTC | newest]

Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-04-19 21:54 [PATCH] cindex: support sha256 coderepos alongside sha1 Eric Wong
2023-04-20  0:53 ` [PATCH 2/1] cindex: limit parallelism of extensions.objectFormat check Eric Wong

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).