From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.6 (2021-04-09) on dcvr.yhbt.net X-Spam-Level: X-Spam-ASN: X-Spam-Status: No, score=-4.2 required=3.0 tests=ALL_TRUSTED,AWL,BAYES_00, DKIM_SIGNED,DKIM_VALID,DKIM_VALID_AU,DKIM_VALID_EF, T_SCC_BODY_TEXT_LINE shortcircuit=no autolearn=ham autolearn_force=no version=3.4.6 Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id DA8C61F724 for ; Mon, 13 Nov 2023 13:15:54 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=80x24.org; s=selector1; t=1699881354; bh=+LsI/s93eHJhjEyWdqDjR47jzwU3N86wGCMw8Ju4I9c=; h=From:To:Subject:Date:In-Reply-To:References:From; b=K372Oezu6Fstjr43oXnPRf9SGktn2IoDv3ig8BfE16QVuOUremgteug6gTYDn84tO 4/guaOTwbVTCQa+aXO7fik6r1f9e3cHqGLRzxze6sa/PSXQV4RxYeuBlz3ZSYgxY8g ohjaz83DuTpzGA1F8XG9C+hNvBjtINXFi37T4O9M= From: Eric Wong To: meta@public-inbox.org Subject: [PATCH 11/18] cindex: delay associate until prune+indexing finish Date: Mon, 13 Nov 2023 13:15:44 +0000 Message-Id: <20231113131551.843230-12-e@80x24.org> In-Reply-To: <20231113131551.843230-1-e@80x24.org> References: <20231113131551.843230-1-e@80x24.org> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit List-Id: Prune can get rid of invalid commits while indexing can add new candidates for association, so we don't dump coderepo roots for association until those are squared away. However, we can dump inbox info since we don't touch inboxes while -cindex is running. --- lib/PublicInbox/CidxComm.pm | 6 ++-- lib/PublicInbox/CodeSearchIdx.pm | 48 +++++++++++++++++++++----------- 2 files changed, 35 insertions(+), 19 deletions(-) diff --git a/lib/PublicInbox/CidxComm.pm b/lib/PublicInbox/CidxComm.pm index c7ab3c10..80a235e9 100644 --- a/lib/PublicInbox/CidxComm.pm +++ b/lib/PublicInbox/CidxComm.pm @@ -13,8 +13,8 @@ use parent qw(PublicInbox::DS); use PublicInbox::Syscall qw(EPOLLIN EPOLLONESHOT); sub new { - my ($cls, $rd, $cidx) = @_; - my $self = bless { cidx => $cidx }, $cls; + my ($cls, $rd, $cidx, $drs) = @_; + my $self = bless { cidx => $cidx, drs => $drs }, $cls; $self->SUPER::new($rd, EPOLLIN|EPOLLONESHOT); } @@ -22,7 +22,7 @@ sub event_step { my ($self) = @_; my $rd = $self->{sock} // return warn('BUG?: no {sock}'); $self->close; # EPOLL_CTL_DEL - delete($self->{cidx})->cidx_read_comm($rd); + delete($self->{cidx})->cidx_read_comm($rd, delete $self->{drs}); } 1; diff --git a/lib/PublicInbox/CodeSearchIdx.pm b/lib/PublicInbox/CodeSearchIdx.pm index 0bd26af2..04c514fe 100644 --- a/lib/PublicInbox/CodeSearchIdx.pm +++ b/lib/PublicInbox/CodeSearchIdx.pm @@ -76,8 +76,8 @@ our ( $NPROC, $XHC, # XapClient $REPO_CTX, # current repo being indexed in shards - $IDX_TODO, # [ $git0, $root0, $git1, $root1, ...] - $GIT_TODO, # [ GIT_DIR0, GIT_DIR1, ...] + $IDX_TODO, # PublicInbox::Git object arrayref + $GIT_TODO, # PublicInbox::Git object arrayref %ALT_FH, # hexlen => tmp IO for TMPDIR git alternates $TMPDIR, # File::Temp->newdir object for prune @PRUNE_QUEUE, # GIT_DIRs to prepare for pruning @@ -337,8 +337,10 @@ sub prune_done { # called via prune_do completion return if $DO_QUIT || !$PRUNE_DONE; die "BUG: \$PRUNE_DONE->[$n] already defined" if $PRUNE_DONE->[$n]; $PRUNE_DONE->[$n] = 1; - grep(defined, @$PRUNE_DONE) == @IDX_SHARDS and - progress($self, 'prune done') + if (grep(defined, @$PRUNE_DONE) == @IDX_SHARDS) { + progress($self, 'prune done'); + index_next($self); # may kick dump_roots_start + } } sub seen ($$) { @@ -506,10 +508,15 @@ sub assoc_max_init ($) { $max < 0 ? ((2 ** 31) - 1) : $max; } +sub start_xhc () { + my ($xhc, $pid) = PublicInbox::XapClient::start_helper("-j$NPROC"); + awaitpid($pid, \&cmd_done, ['xap_helper', "-j$NPROC"]); + $xhc; +} + sub dump_roots_start { my ($self, $associate) = @_; - ($XHC, my $pid) = PublicInbox::XapClient::start_helper("-j$NPROC"); - awaitpid($pid, \&cmd_done, ['xap_helper', "-j$NPROC"]); + $XHC //= start_xhc; $associate // die 'BUG: no $associate'; $TODO{associating} = 1; # keep shards_active() happy progress($self, 'dumping IDs from coderepos'); @@ -559,6 +566,7 @@ EOM sub dump_ibx_start { my ($self, $associate) = @_; + $XHC //= start_xhc; my ($sort_opt, $fold_opt); pipe(local $sort_opt->{0}, $DUMP_IBX_WPIPE); pipe(local $fold_opt->{0}, local $sort_opt->{1}); @@ -581,11 +589,10 @@ sub index_next ($) { fp_start($self, $git, $prep_repo); ct_start($self, $git, $prep_repo); } elsif ($TMPDIR) { - return if delete($TODO{dump_roots_start}); + delete $TODO{dump_roots_start}; delete $TODO{dump_ibx_start}; # runs OnDestroy once return dump_ibx($self, shift @IBXQ) if @IBXQ; undef $DUMP_IBX_WPIPE; # done dumping inboxes - undef $XHC; delete $TODO{associate}; } # else: wait for shards_active (post_loop_do) callback @@ -604,7 +611,7 @@ sub next_repos { # OnDestroy cb } sub index_done { # OnDestroy cb called when done indexing each code repo - my ($repo_ctx) = @_; + my ($repo_ctx, $drs) = @_; my ($self, $repo, $active) = @$repo_ctx{qw(self repo active)}; return if $DO_QUIT; @@ -615,6 +622,7 @@ sub index_done { # OnDestroy cb called when done indexing each code repo $active->{$n} = undef; my ($c, $p) = PublicInbox::PktOp->pair; $c->{ops}->{repo_stored} = [ $self, $repo_ctx ]; + $c->{-cidx_dump_roots_start} = $drs if $drs; $IDX_SHARDS[$n]->wq_io_do('store_repo', [ $p->{op_p} ], $repo); # repo_stored will fire once store_repo is done } @@ -638,8 +646,9 @@ sub index_repo { # run_git cb $repo->{git_dir} = $git->{git_dir}; my $repo_ctx = $REPO_CTX = { self => $self, repo => $repo }; delete $git->{-cidx_gits_fini}; # may fire gits_fini + my $drs = delete $git->{-cidx_dump_roots_start}; my $index_done = PublicInbox::OnDestroy->new($$, \&index_done, - $repo_ctx); + $repo_ctx, $drs); my ($c, $p) = PublicInbox::PktOp->pair; $c->{ops}->{shard_done} = [ $self, $repo_ctx, $index_done ]; for my $n (0..$#shard_in) { @@ -738,6 +747,7 @@ EOM @shards; } +# called when all git coderepos are done sub gits_fini { undef $GITS_NR; PublicInbox::DS::enqueue_reap(); # kick @post_loop_do @@ -749,6 +759,9 @@ sub scan_git_dirs ($) { $GITS_NR = @$GIT_TODO; my $gits_fini = PublicInbox::OnDestroy->new($$, \&gits_fini); $_->{-cidx_gits_fini} = $gits_fini for @$GIT_TODO; + if (my $drs = $TODO{dump_roots_start}) { + $_->{-cidx_dump_roots_start} = $drs for @$GIT_TODO; + } progress($self, "scanning $GITS_NR code repositories..."); } @@ -797,7 +810,7 @@ sub kill_shards { $_->wq_kill(@_) for (@IDX_SHARDS) } sub parent_quit { $DO_QUIT = POSIX->can("SIG$_[0]")->(); - $XHC = undef; + $XHC = 0; # stops the process kill_shards(@_); warn "# SIG$_[0] received, quitting...\n"; } @@ -870,6 +883,7 @@ sub cmd_done { # run_await cb for sort, xapian-delve, sed failures sub associate { my ($self) = @_; return if $DO_QUIT; + $XHC = 0; # should not be recreated again @IDX_SHARDS or return warn("# aborting on no shards\n"); unlink("$TMPDIR/root2id"); my @pending = keys %{$self->{PENDING}}; @@ -949,7 +963,8 @@ sub init_prune ($) { require_progs('prune', 'xapian-delve' => \@delve, sed => \@sed, comm => \@COMM, awk => \@AWK); for (0..$#IDX_SHARDS) { push @delve, "$self->{xpfx}/$_" } - my $run_prune = PublicInbox::OnDestroy->new($$, \&run_prune, $self); + my $run_prune = PublicInbox::OnDestroy->new($$, \&run_prune, $self, + $TODO{dump_roots_start}); my ($sort_opt, $sed_opt, $delve_opt); pipe(local $sed_opt->{0}, local $delve_opt->{1}); pipe(local $sort_opt->{0}, local $sed_opt->{1}); @@ -975,7 +990,7 @@ sub dump_git_commits { # run_await cb } sub run_prune { # OnDestroy when `git config extensions.objectFormat' are done - my ($self) = @_; + my ($self, $drs) = @_; return if $DO_QUIT; # setup the following pipeline: ( # git --git-dir=hexlen40.git cat-file \ @@ -991,7 +1006,7 @@ sub run_prune { # OnDestroy when `git config extensions.objectFormat' are done run_await(\@AWK, $CMD_ENV, $awk_opt, \&cmd_done); run_await([@SORT, '-u'], $CMD_ENV, $sort_opt, \&cmd_done); my $comm_rd = popen_rd(\@COMM, $CMD_ENV, $comm_opt, \&cmd_done, \@COMM); - PublicInbox::CidxComm->new($comm_rd, $self); # calls cidx_read_comm + PublicInbox::CidxComm->new($comm_rd, $self, $drs); # ->cidx_read_comm my $git_ver = PublicInbox::Git::git_version(); push @PRUNE_BATCH, '--buffer' if $git_ver ge v2.6; @@ -1007,7 +1022,7 @@ EOM } sub cidx_read_comm { # via PublicInbox::CidxComm::event_step - my ($self, $comm_rd) = @_; + my ($self, $comm_rd, $drs) = @_; return if $DO_QUIT; $_->wq_do('prune_init') for @IDX_SHARDS; while (defined(my $cmt = <$comm_rd>)) { @@ -1022,6 +1037,7 @@ sub cidx_read_comm { # via PublicInbox::CidxComm::event_step } my ($c, $p) = PublicInbox::PktOp->pair; $c->{ops}->{prune_done} = [ $self ]; + $c->{-cidx_dump_roots_start} = $drs; $_->wq_io_do('prune_commit', [ $p->{op_p} ]) for @IDX_SHARDS; } @@ -1103,8 +1119,8 @@ sub show_roots { # for diagnostics sub do_inits { # called via PublicInbox::DS::add_timer my ($self) = @_; - init_prune($self); init_associate_postfork($self); + init_prune($self); scan_git_dirs($self) if $self->{-opt}->{scan} // 1; my $max = $TODO{associate} ? max($LIVE_JOBS, $NPROC) : $LIVE_JOBS; index_next($self) for (1..$max);