* [PATCH 01/15] cindex: fix store_repo+repo_stored on no-op
From: Eric Wong @ 2023-11-30 11:40 UTC (permalink / raw)
To: meta
It's possible to update the fingerprint for a given repo when
there are no commits left to index because they were already
indexed for another repo.  Thus we always vivify $repo_ctx->{active}
before calling store_repo, since $active may have been undef.
---
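A minimal standalone Perl sketch (not part of this patch; the shard
number is made up) of the autovivification subtlety being fixed:
assigning through a copied undef value vivifies a detached hash,
while assigning through the hash slot itself vivifies {active}
where repo_stored will later look:

  use strict;
  use warnings;

  my $repo_ctx = { repo => { shard_n => 3 } }; # {active} never set (no-op index)
  my ($active) = @$repo_ctx{qw(active)};       # copies the undef slot value
  my $n = $repo_ctx->{repo}->{shard_n};

  $active->{$n} = undef; # vivifies a detached hash; {active} stays undef
  print defined($repo_ctx->{active}) ? "seen\n" : "lost\n"; # lost

  $repo_ctx->{active}->{$n} = undef; # vivifies the hash inside $repo_ctx itself
  printf "%d shard(s) active\n", scalar keys %{$repo_ctx->{active}}; # 1
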
lib/PublicInbox/CodeSearchIdx.pm | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/lib/PublicInbox/CodeSearchIdx.pm b/lib/PublicInbox/CodeSearchIdx.pm
index 7d696099..bd67a57e 100644
--- a/lib/PublicInbox/CodeSearchIdx.pm
+++ b/lib/PublicInbox/CodeSearchIdx.pm
@@ -613,14 +613,14 @@ sub next_repos { # OnDestroy cb
sub index_done { # OnDestroy cb called when done indexing each code repo
my ($repo_ctx, $drs) = @_;
- my ($self, $repo, $active) = @$repo_ctx{qw(self repo active)};
-
return if $DO_QUIT;
+ my ($self, $repo, $active) = @$repo_ctx{qw(self repo active)};
+ # $active may be undef here, but it's fine to vivify
my $n = grep { ! $repo_ctx->{shard_ok}->{$_} } keys %$active;
die "E: $repo->{git_dir} $n shards failed" if $n;
$repo_ctx->{shard_ok} = {}; # reset for future shard_done
$n = $repo->{shard_n};
- $active->{$n} = undef;
+ $repo_ctx->{active}->{$n} = undef; # may vivify $repo_ctx->{active}
my ($c, $p) = PublicInbox::PktOp->pair;
$c->{ops}->{repo_stored} = [ $self, $repo_ctx, $drs ];
$IDX_SHARDS[$n]->wq_io_do('store_repo', [ $p->{op_p} ], $repo);
* [PATCH 02/15] codesearch: allow inbox count to exceed matches
From: Eric Wong @ 2023-11-30 11:40 UTC (permalink / raw)
To: meta
It's entirely possible for public inboxes to have zero patches
in them, so the number of match slots may not match the
number of joined ekeys.
---
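For reference, a tiny Perl sketch (not part of this patch; the data
is made up) of the comparison semantics: arrays in numeric
comparisons yield their element counts, so the relaxed check only
warns when the ekeys are outnumbered:

  use strict;
  use warnings;

  my @ekeys    = qw(a.example b.example c.example); # hypothetical joined inboxes
  my @ibx2root = ([ 0 ], [ 1 ]);                    # hypothetical match slots

  # numeric comparison uses scalar(@array), i.e. the element counts:
  warn "mismatched ekeys and ibx2root\n" if @ekeys < @ibx2root; # no warning here
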
lib/PublicInbox/CodeSearch.pm | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/lib/PublicInbox/CodeSearch.pm b/lib/PublicInbox/CodeSearch.pm
index 5c5774cf..60deb2ae 100644
--- a/lib/PublicInbox/CodeSearch.pm
+++ b/lib/PublicInbox/CodeSearch.pm
@@ -69,7 +69,7 @@ sub join_data {
W: $self->{topdir} join data for $self->{-cfg_f} missing: @m
EOM
undef;
- } elsif (@{$cur->{ekeys}} != @{$cur->{ibx2root}}) {
+ } elsif (@{$cur->{ekeys}} < @{$cur->{ibx2root}}) {
warn <<EOM;
W: $self->{topdir} join data for $self->{-cfg_f} mismatched ekeys and ibx2root
EOM
* [PATCH 03/15] config: reject newlines consistently in dir names
From: Eric Wong @ 2023-11-30 11:40 UTC (permalink / raw)
To: meta
Explicitly drop support for "\n" in git coderepo pathnames, as
we already do for inbox and extindex/cindex directories.  Gcf2
(our libgit2 helper) was always broken with "\n" in pathnames,
and I'm not sure whether cgit config files work with them, either.
Dealing with newline characters requires extra complexity that
isn't worth taking on when managing alternates files.
---
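As a standalone illustration (not part of this patch; the path is
made up) of why "\n" can never work here: alternates files are
newline-delimited, so one bad entry turns into two bogus ones:

  use strict;
  use warnings;

  # hypothetical objects directory with an embedded newline:
  my $objdir = "/srv/git/evil\n/repo.git/objects";

  # $GIT_DIR/objects/info/alternates readers split on "\n":
  my @seen = split /\n/, $objdir;
  printf "%d entries instead of 1\n", scalar @seen; # 2 entries instead of 1
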
lib/PublicInbox/Config.pm | 32 ++++++++++++++------------------
1 file changed, 14 insertions(+), 18 deletions(-)
diff --git a/lib/PublicInbox/Config.pm b/lib/PublicInbox/Config.pm
index 779e3140..6bebf790 100644
--- a/lib/PublicInbox/Config.pm
+++ b/lib/PublicInbox/Config.pm
@@ -361,12 +361,19 @@ sub parse_cgitrc {
cgit_repo_merge($self, $repo->{dir}, $repo) if $repo;
}
+sub valid_dir ($$) {
+ my $dir = get_1($_[0], $_[1]) // return;
+ index($dir, "\n") < 0 ? $dir : do {
+ warn "E: `$_[1]=$dir' must not contain `\\n'\n";
+ undef;
+ }
+}
+
# parse a code repo, only git is supported at the moment
sub fill_coderepo {
my ($self, $nick) = @_;
my $pfx = "coderepo.$nick";
- my $dir = $self->{"$pfx.dir"} // return undef; # aka "GIT_DIR"
- my $git = PublicInbox::Git->new($dir);
+ my $git = PublicInbox::Git->new(valid_dir($self, "$pfx.dir") // return);
if (defined(my $cgits = $self->{"$pfx.cgiturl"})) {
$git->{cgit_url} = $cgits = _array($cgits);
$self->{"$pfx.cgiturl"} = $cgits;
@@ -450,18 +457,15 @@ sub _fill_ibx {
my $v = $self->{"$pfx.$k"};
$ibx->{$k} = $v if defined $v;
}
- for my $k (qw(filter inboxdir newsgroup replyto httpbackendmax feedmax
+ for my $k (qw(filter newsgroup replyto httpbackendmax feedmax
indexlevel indexsequentialshard boost)) {
my $v = get_1($self, "$pfx.$k") // next;
$ibx->{$k} = $v;
}
# "mainrepo" is backwards compatibility:
- my $dir = $ibx->{inboxdir} //= $self->{"$pfx.mainrepo"} // return;
- if (index($dir, "\n") >= 0) {
- warn "E: `$dir' must not contain `\\n'\n";
- return;
- }
+ my $dir = $ibx->{inboxdir} = valid_dir($self, "$pfx.inboxdir") //
+ valid_dir($self, "$pfx.mainrepo") // return;
for my $k (qw(obfuscate)) {
my $v = $self->{"$pfx.$k"} // next;
if (defined(my $bval = git_bool($v))) {
@@ -548,12 +552,8 @@ sub _fill_ei ($$) {
my ($self, $name) = @_;
eval { require PublicInbox::ExtSearch } or return;
my $pfx = "extindex.$name";
- my $d = $self->{"$pfx.topdir"} // return;
+ my $d = valid_dir($self, "$pfx.topdir") // return;
-d $d or return;
- if (index($d, "\n") >= 0) {
- warn "E: `$d' must not contain `\\n'\n";
- return;
- }
my $es = PublicInbox::ExtSearch->new($d);
for my $k (qw(indexlevel indexsequentialshard)) {
my $v = get_1($self, "$pfx.$k") // next;
@@ -573,12 +573,8 @@ sub _fill_csrch ($$) {
return if $name ne '' && !valid_foo_name($name, 'cindex');
eval { require PublicInbox::CodeSearch } or return;
my $pfx = "cindex.$name";
- my $d = $self->{"$pfx.topdir"} // return;
+ my $d = valid_dir($self, "$pfx.topdir") // return;
-d $d or return;
- if (index($d, "\n") >= 0) {
- warn "E: `$d' must not contain `\\n'\n";
- return;
- }
my $csrch = PublicInbox::CodeSearch->new($d, $self);
for my $k (qw(localprefix)) {
my $v = $self->{"$pfx.$k"} // next;
* [PATCH 04/15] cindex: only create {-cidx_err} field on failures
From: Eric Wong @ 2023-11-30 11:40 UTC (permalink / raw)
To: meta
We only use it as a boolean flag, and there's no need to waste
space for common, non-error cases.
---
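A minimal sketch (not part of this patch) of the idiom the hunk
relies on: warn() returns a true value, so the error branch can
store its result directly as the flag while the common success
case stores nothing at all:

  use strict;
  use warnings;

  my %git; # stand-in for the PublicInbox::Git object fields
  my $flag = $git{-cidx_err} = warn "W: some command failed\n";
  print "flag=$flag\n"; # flag=1; no need to keep a copy of $? around
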
lib/PublicInbox/CodeSearchIdx.pm | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/lib/PublicInbox/CodeSearchIdx.pm b/lib/PublicInbox/CodeSearchIdx.pm
index bd67a57e..cf6a6efe 100644
--- a/lib/PublicInbox/CodeSearchIdx.pm
+++ b/lib/PublicInbox/CodeSearchIdx.pm
@@ -383,8 +383,8 @@ sub git_dir_hash ($) { hex(substr(sha256_hex($_[0]), 0, 8)) }
sub _cb { # run_await cb
my ($pid, $cmd, undef, $opt, $cb, $self, $git, @arg) = @_;
return if $DO_QUIT;
- ($git->{-cidx_err} = $?) ? warn("W: @$cmd (\$?=$?)\n") :
- $cb->($opt, $self, $git, @arg);
+ $? ? ($git->{-cidx_err} = warn("W: @$cmd (\$?=$?)\n")) :
+ $cb->($opt, $self, $git, @arg);
}
sub run_git {
* [PATCH 05/15] cindex: keep batch pipe for pruning SHA-256 repos
From: Eric Wong @ 2023-11-30 11:40 UTC (permalink / raw)
To: meta
This fixes the case where we're running both SHA-256 and SHA-1.
There are no tests for SHA-256 yet, but the bug is pretty obvious
upon reading the code.
---
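A minimal sketch (not part of this patch) of the `local' behavior
behind the bug: localizing a hash slot restores the old value at
scope exit, closing a handle that was only held there, which is
wrong for a pipe end that must outlive the setup code:

  use strict;
  use warnings;

  my %batch_opt;
  {
          pipe(my $rd, local $batch_opt{1}) or die "pipe: $!";
          # ... write end usable here ...
  }
  # `local' restored the slot, so the write end is already gone and a
  # later consumer (the second dump_git_commits pass) has nothing left:
  print defined($batch_opt{1}) ? "still open\n" : "write end closed\n";
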
lib/PublicInbox/CodeSearchIdx.pm | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/lib/PublicInbox/CodeSearchIdx.pm b/lib/PublicInbox/CodeSearchIdx.pm
index cf6a6efe..26018232 100644
--- a/lib/PublicInbox/CodeSearchIdx.pm
+++ b/lib/PublicInbox/CodeSearchIdx.pm
@@ -1087,7 +1087,7 @@ sub run_prune { # OnDestroy when `git config extensions.objectFormat' are done
# ) | awk | sort | comm | cidx_read_comm()
my ($awk_opt, $sort_opt, $batch_opt);
my $comm_opt = { -C => "$TMPDIR" };
- pipe(local $awk_opt->{0}, local $batch_opt->{1});
+ pipe(local $awk_opt->{0}, $batch_opt->{1});
pipe(local $sort_opt->{0}, local $awk_opt->{1});
pipe(local $comm_opt->{0}, local $sort_opt->{1});
run_await(\@AWK, $CMD_ENV, $awk_opt, \&cmd_done);
* [PATCH 06/15] cindex: store extensions.objectFormat with repo data
From: Eric Wong @ 2023-11-30 11:40 UTC (permalink / raw)
To: meta
This will allow WWW to use a combined LeiALE-like
object to reduce the number of git processes.
---
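For illustration only (not the store_repo code itself; the
repository path is made up), a standalone sketch of the
objectFormat probe being folded into store_repo; note that
`git config' exits 1 when the key is simply unset, which means
the SHA-1 default:

  use strict;
  use warnings;

  my $git_dir = shift // '.'; # hypothetical repository
  my $fmt = `git --git-dir=$git_dir config extensions.objectFormat`;
  my ($status, $sig) = ($? >> 8, $? & 127);
  if (!$sig && $status == 1) {    # key unset => default format
          $fmt = 'sha1';
  } elsif (!$sig && $status == 0) {
          chomp($fmt ||= 'sha1'); # empty output also means SHA-1
  } else {
          die "git config failed: \$?=$?\n";
  }
  print "objectFormat=$fmt\n";    # sha1 or sha256
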
lib/PublicInbox/CodeSearch.pm | 27 ++++--
lib/PublicInbox/CodeSearchIdx.pm | 161 +++++++++++++++++++++----------
2 files changed, 127 insertions(+), 61 deletions(-)
diff --git a/lib/PublicInbox/CodeSearch.pm b/lib/PublicInbox/CodeSearch.pm
index 60deb2ae..208f7528 100644
--- a/lib/PublicInbox/CodeSearch.pm
+++ b/lib/PublicInbox/CodeSearch.pm
@@ -209,15 +209,20 @@ sub roots2paths { # for diagnostics
\%ret;
}
-sub root_oids ($$) {
+sub docids_of_git_dir ($$) {
my ($self, $git_dir) = @_;
my @ids = $self->docids_by_postlist('P'.$git_dir);
- @ids or warn <<"";
-BUG? (non-fatal) `$git_dir' not indexed in $self->{topdir}
-
warn <<"" if @ids > 1;
BUG: (non-fatal) $git_dir indexed multiple times in $self->{topdir}
+ @ids;
+}
+
+sub root_oids ($$) {
+ my ($self, $git_dir) = @_;
+ my @ids = docids_of_git_dir $self, $git_dir or warn <<"";
+BUG? (non-fatal) `$git_dir' not indexed in $self->{topdir}
+
my %ret;
for my $docid (@ids) {
my @oids = xap_terms('G', $self->xdb, $docid);
@@ -242,15 +247,21 @@ sub paths2roots {
\%ret;
}
+sub load_ct { # retry_reopen cb
+ my ($self, $git_dir) = @_;
+ my @ids = docids_of_git_dir $self, $git_dir or return;
+ for (@ids) {
+ my $doc = $self->get_doc($_) // next;
+ return int_val($doc, CT);
+ }
+}
+
sub load_commit_times { # each_cindex callback
my ($self, $todo) = @_; # todo = [ [ time, git ], [ time, git ] ...]
my (@pending, $rec, $dir, @ids, $doc);
while ($rec = shift @$todo) {
- @ids = $self->docids_by_postlist('P'.$rec->[1]->{git_dir});
+ @ids = docids_of_git_dir $self, $rec->[1]->{git_dir};
if (@ids) {
- warn <<EOM if @ids > 1;
-W: $rec->[1]->{git_dir} indexed multiple times in $self->{topdir}
-EOM
for (@ids) {
$doc = $self->get_doc($_) // next;
$rec->[0] = int_val($doc, CT);
diff --git a/lib/PublicInbox/CodeSearchIdx.pm b/lib/PublicInbox/CodeSearchIdx.pm
index 26018232..7580a49a 100644
--- a/lib/PublicInbox/CodeSearchIdx.pm
+++ b/lib/PublicInbox/CodeSearchIdx.pm
@@ -70,7 +70,7 @@ use PublicInbox::Git qw(%OFMT2HEXLEN);
use PublicInbox::Compat qw(uniqstr);
use PublicInbox::Aspawn qw(run_await);
use Compress::Zlib qw(compress);
-use Carp ();
+use Carp qw(croak);
use Time::Local qw(timegm);
use autodie qw(close pipe open sysread seek sysseek send);
our $DO_QUIT = 15; # signal number
@@ -91,11 +91,11 @@ our (
$NPROC,
$XHC, # XapClient
$REPO_CTX, # current repo being indexed in shards
- $IDX_TODO, # PublicInbox::Git object arrayref
- $GIT_TODO, # PublicInbox::Git object arrayref
+ $IDXQ, # PublicInbox::Git object arrayref
+ $SCANQ, # PublicInbox::Git object arrayref
%ALT_FH, # hexlen => tmp IO for TMPDIR git alternates
$TMPDIR, # File::Temp->newdir object for prune
- @PRUNE_QUEUE, # GIT_DIRs to prepare for pruning
+ @PRUNEQ, # GIT_DIRs to prepare for pruning
%TODO, @IBXQ, @IBX,
@JOIN, # join(1) command for --join
$CMD_ENV, # env for awk(1), comm(1), sort(1) commands during prune
@@ -116,7 +116,7 @@ our $SEEN_MAX = 100000;
# window for commits/emails to determine a inbox <-> coderepo association
my $JOIN_WINDOW = 50000;
-our @PRUNE_BATCH = qw(git _ cat-file --batch-all-objects --batch-check);
+our @PRUNE_BATCH = qw(cat-file --batch-all-objects --batch-check);
# TODO: do we care about committer name + email? or tree OID?
my @FMT = qw(H P ct an ae at s b); # (b)ody must be last
@@ -210,9 +210,23 @@ sub progress {
$pr->($self->{git} ? ("$self->{git}->{git_dir}: ") : (), @msg, "\n");
}
+sub check_objfmt_status ($$$) {
+ my ($git, $chld_err, $fmt) = @_;
+ my ($status, $sig) = ($chld_err >> 8, $chld_err & 127);
+ if (!$sig && $status == 1) { # unset, default is '' (SHA-1)
+ $fmt = 'sha1';
+ } elsif (!$sig && $status == 0) {
+ chomp($fmt ||= 'sha1');
+ }
+ $fmt // warn("git --git-dir=$git->{git_dir} config \$?=$chld_err");
+ $fmt;
+}
+
sub store_repo { # wq_io_do, sends docid back
my ($self, $repo) = @_;
my $op_p = delete($self->{0}) // die 'BUG: no {0} op_p';
+ my $git = bless $repo, 'PublicInbox::Git';
+ my $rd = $git->popen(qw(config extensions.objectFormat));
$self->begin_txn_lazy;
$self->{xdb}->delete_document($_) for @{$repo->{to_delete}};
my $doc = $PublicInbox::Search::X{Document}->new;
@@ -221,6 +235,13 @@ sub store_repo { # wq_io_do, sends docid back
$doc->add_boolean_term('T'.'r');
$doc->add_boolean_term('G'.$_) for @{$repo->{roots}};
$doc->set_data($repo->{fp}); # \n delimited
+ my $fmt = readline($rd);
+ $rd->close;
+ $fmt = check_objfmt_status $git, $?, $fmt;
+ $OFMT2HEXLEN{$fmt} // warn <<EOM; # store unknown formats anyways
+E: unknown extensions.objectFormat=$fmt in $repo->{git_dir}
+EOM
+ $doc->add_boolean_term('H'.$fmt);
my $did = $repo->{docid};
$did ? $self->{xdb}->replace_document($did, $doc)
: ($did = $self->{xdb}->add_document($doc));
@@ -383,6 +404,7 @@ sub git_dir_hash ($) { hex(substr(sha256_hex($_[0]), 0, 8)) }
sub _cb { # run_await cb
my ($pid, $cmd, undef, $opt, $cb, $self, $git, @arg) = @_;
return if $DO_QUIT;
+ return $cb->($opt, $self, $git, @arg) if $opt->{quiet};
$? ? ($git->{-cidx_err} = warn("W: @$cmd (\$?=$?)\n")) :
$cb->($opt, $self, $git, @arg);
}
@@ -436,7 +458,7 @@ sub prep_repo ($$) {
delete $git->{-repo};
return index_next($self);
}
- my $n = git_dir_hash($git->{git_dir}) % $self->{nshard};
+ my $n = git_dir_hash($git->{git_dir}) % scalar(@RDONLY_XDB);
my $shard = bless { %$self, shard => $n }, ref($self);
$repo->{shard_n} = $n;
delete @$shard{qw(lockfh lock_path)};
@@ -446,7 +468,7 @@ sub prep_repo ($$) {
sub check_existing { # retry_reopen callback
my ($shard, $self, $git) = @_;
- my @docids = $shard->docids_by_postlist('P'.$git->{git_dir});
+ my @docids = $shard->docids_of_git_dir($git->{git_dir});
my $docid = shift(@docids) // return get_roots($self, $git);
my $doc = $shard->get_doc($docid) //
die "BUG: no #$docid ($git->{git_dir})";
@@ -581,10 +603,10 @@ sub dump_ibx_start {
sub index_next ($) {
my ($self) = @_;
return if $DO_QUIT;
- if ($IDX_TODO && @$IDX_TODO) {
- index_repo(undef, $self, shift @$IDX_TODO);
- } elsif ($GIT_TODO && @$GIT_TODO) {
- my $git = shift @$GIT_TODO;
+ if ($IDXQ && @$IDXQ) {
+ index_repo(undef, $self, shift @$IDXQ);
+ } elsif ($SCANQ && @$SCANQ) {
+ my $git = shift @$SCANQ;
my $prep_repo = PublicInbox::OnDestroy->new($$, \&prep_repo,
$self, $git);
fp_start($self, $git, $prep_repo);
@@ -631,7 +653,7 @@ sub index_repo { # run_git cb
my (undef, $self, $git) = @_;
return if $DO_QUIT;
return index_next($self) if $git->{-cidx_err};
- return push(@$IDX_TODO, $git) if $REPO_CTX; # busy
+ return push(@$IDXQ, $git) if $REPO_CTX; # busy
my $repo = delete $git->{-repo} or return index_next($self);
my $roots_fh = delete $repo->{roots_fh} // die 'BUG: no {roots_fh}';
seek($roots_fh, 0, SEEK_SET);
@@ -755,12 +777,12 @@ sub gits_fini {
sub scan_git_dirs ($) {
my ($self) = @_;
- @$GIT_TODO = map { PublicInbox::Git->new($_) } @{$self->{git_dirs}};
- $GITS_NR = @$GIT_TODO;
+ @$SCANQ = () unless $self->{-opt}->{scan};
+ $GITS_NR = @$SCANQ or return;
my $gits_fini = PublicInbox::OnDestroy->new($$, \&gits_fini);
- $_->{-cidx_gits_fini} = $gits_fini for @$GIT_TODO;
+ $_->{-cidx_gits_fini} = $gits_fini for @$SCANQ;
if (my $drs = $TODO{dump_roots_start}) {
- $_->{-cidx_dump_roots_start} = $drs for @$GIT_TODO;
+ $_->{-cidx_dump_roots_start} = $drs for @$SCANQ;
}
progress($self, "scanning $GITS_NR code repositories...");
}
@@ -794,9 +816,9 @@ sub prune_commit { # via wq_io_do in IDX_SHARDS
sub shards_active { # post_loop_do
return if $DO_QUIT;
- return if grep(defined, $PRUNE_DONE, $GIT_TODO, $IDX_TODO) != 3;
+ return if grep(defined, $PRUNE_DONE, $SCANQ, $IDXQ) != 3;
return 1 if grep(defined, @$PRUNE_DONE) != @IDX_SHARDS;
- return 1 if $GITS_NR || scalar(@$IDX_TODO) || $REPO_CTX;
+ return 1 if $GITS_NR || scalar(@$IDXQ) || $REPO_CTX;
return 1 if @IBXQ || keys(%TODO);
for my $s (grep { $_->{-wq_s1} } @IDX_SHARDS) {
$s->{-cidx_quit} = 1 if defined($s->{-wq_s1});
@@ -836,18 +858,8 @@ sub prep_umask ($) {
}
}
-sub prep_alternate_end { # run_await cb for config extensions.objectFormat
- my ($pid, $cmd, undef, $opt, $objdir, $run_prune) = @_;
- my ($status, $sig) = ($? >> 8, $? & 127);
- my $next_dir = shift(@PRUNE_QUEUE);
- prep_alternate_start($next_dir, $run_prune) if defined($next_dir);
- my $fmt;
- if (!$sig && $status == 1) { # unset, default is '' (SHA-1)
- $fmt = 'sha1';
- } elsif (!$sig && $status == 0) {
- chomp($fmt = ${$opt->{1}} || 'sha1');
- }
- $fmt // return warn("git config \$?=$? for objdir=$objdir");
+sub prep_alternate_end ($$) {
+ my ($objdir, $fmt) = @_;
my $hexlen = $OFMT2HEXLEN{$fmt} // return warn <<EOM;
E: ignoring objdir=$objdir, unknown extensions.objectFormat=$fmt
EOM
@@ -860,17 +872,55 @@ EOM
say { $ALT_FH{$hexlen} } $objdir;
}
+sub store_objfmt { # via wq_do - make early cidx users happy
+ my ($self, $docid, $git_dir, $fmt) = @_;
+ $self->begin_txn_lazy;
+ my $doc = $self->get_doc($docid) // return
+ warn "BUG? #$docid for $git_dir missing";
+ my @p = xap_terms('P', $doc) or return
+ warn "BUG? #$docid for $git_dir has no P(ath)";
+ @p == 1 or return warn "BUG? #$docid $git_dir multi: @p";
+ $p[0] eq $git_dir or return warn "BUG? #$docid $git_dir != @p";
+ $doc->add_boolean_term('H'.$fmt);
+ $self->{xdb}->replace_document($docid, $doc);
+ # wait for prune_commit to commit...
+}
+
+# TODO: remove prep_alternate_read and store_objfmt 1-2 years after 2.0 is out
+# they are for compatibility with pre-release indices
+sub prep_alternate_read { # run_git cb for config extensions.objectFormat
+ my ($opt, $self, $git, $objdir, $docid, $shard_n, $run_prune) = @_;
+ return if $DO_QUIT;
+ my $chld_err = $?;
+ prep_alternate_start($self, shift(@PRUNEQ), $run_prune) if @PRUNEQ;
+ my $fmt = check_objfmt_status $git, $chld_err, ${$opt->{1}};
+ $IDX_SHARDS[$shard_n]->wq_do('store_objfmt', # async
+ $docid, $git->{git_dir}, $fmt);
+ prep_alternate_end $objdir, $fmt;
+}
+
sub prep_alternate_start {
- my ($git_dir, $run_prune) = @_;
- my $o = $git_dir.'/objects';
+ my ($self, $git, $run_prune) = @_;
+ my $o = $git->git_path('objects');
while (!-d $o) {
- $git_dir = shift(@PRUNE_QUEUE) // return;
- $o = $git_dir.'/objects';
+ $git = shift(@PRUNEQ) // return;
+ $o = $git->git_path('objects');
+ }
+ my $n = git_dir_hash($git->{git_dir}) % scalar(@RDONLY_XDB);
+ local $self->{xdb} = $RDONLY_XDB[$n] // croak("BUG: no shard[$n]");
+ my @ids = $self->docids_by_postlist('P'.$git->{git_dir});
+ my @fmt = @ids ? xap_terms('H', $self->{xdb}, $ids[0]) : ();
+ @fmt > 1 and warn "BUG? multi `H' for shard[$n] #$ids[0]: @fmt";
+
+ if (@fmt) { # cache hit
+ @PRUNEQ and
+ prep_alternate_start($self, shift(@PRUNEQ), $run_prune);
+ prep_alternate_end $o, $fmt[0];
+ } else { # compatibility w/ early cidx format
+ run_git([qw(config extensions.objectFormat)], { quiet => 1 },
+ \&prep_alternate_read, $self, $git, $o, $ids[0], $n,
+ $run_prune);
}
- my $cmd = [ 'git', "--git-dir=$git_dir",
- qw(config extensions.objectFormat) ];
- my $opt = { quiet => 1 };
- run_await($cmd, undef, $opt, \&prep_alternate_end, $o, $run_prune);
}
sub cmd_done { # run_await cb for sort, xapian-delve, sed failures
@@ -1059,21 +1109,22 @@ sub init_prune ($) {
run_await([@SORT, '-u'], $CMD_ENV, $sort_opt, \&cmd_done, $run_prune);
run_await(\@sed, $CMD_ENV, $sed_opt, \&cmd_done, $run_prune);
run_await(\@delve, undef, $delve_opt, \&cmd_done, $run_prune);
- @PRUNE_QUEUE = @{$self->{git_dirs}};
+ @PRUNEQ = @$SCANQ;
for (1..$LIVE_JOBS) {
- prep_alternate_start(shift(@PRUNE_QUEUE) // last, $run_prune);
+ prep_alternate_start($self, shift(@PRUNEQ) // last, $run_prune);
}
}
sub dump_git_commits { # run_await cb
- my ($pid, undef, undef, $batch_opt) = @_;
- (defined($pid) && $?) and die "E: @PRUNE_BATCH: \$?=$?";
+ my ($pid, $cmd, undef, $batch_opt, $self) = @_;
+ (defined($pid) && $?) and die "E: @$cmd \$?=$?";
return if $DO_QUIT;
- my ($hexlen) = keys(%ALT_FH) or return; # done
+ my ($hexlen) = keys(%ALT_FH) or return; # done, DESTROY batch_opt->{1}
close(delete $ALT_FH{$hexlen}); # flushes `say' buffer
-
- $PRUNE_BATCH[1] = "--git-dir=$TMPDIR/hexlen$hexlen.git";
- run_await(\@PRUNE_BATCH, undef, $batch_opt, \&dump_git_commits);
+ progress($self, "preparing $hexlen-byte hex OID commits for prune...");
+ my $g = PublicInbox::Git->new("$TMPDIR/hexlen$hexlen.git");
+ run_await($g->cmd(@PRUNE_BATCH), undef, $batch_opt,
+ \&dump_git_commits, $self);
}
sub run_prune { # OnDestroy when `git config extensions.objectFormat' are done
@@ -1105,12 +1156,13 @@ sub run_prune { # OnDestroy when `git config extensions.objectFormat' are done
warn(sprintf(<<EOM, $git_ver)) if $git_ver lt v2.19;
W: git v2.19+ recommended for high-latency storage (have git v%vd)
EOM
- dump_git_commits(undef, undef, undef, $batch_opt);
+ dump_git_commits(undef, undef, undef, $batch_opt, $self);
}
sub cidx_read_comm { # via PublicInbox::CidxComm::event_step
my ($self, $comm_rd, $drs) = @_;
return if $DO_QUIT;
+ progress($self, 'starting prune...');
$_->wq_do('prune_init') for @IDX_SHARDS;
while (defined(my $cmt = <$comm_rd>)) {
chop($cmt) eq "\n" or die "BUG: no LF in comm output ($cmt)";
@@ -1121,6 +1173,7 @@ sub cidx_read_comm { # via PublicInbox::CidxComm::event_step
for my $git_dir (@GIT_DIR_GONE) {
my $n = git_dir_hash($git_dir) % scalar(@IDX_SHARDS);
$IDX_SHARDS[$n]->wq_do('prune_one', 'P'.$git_dir);
+ last if $DO_QUIT;
}
my ($c, $p) = PublicInbox::PktOp->pair;
$c->{ops}->{prune_done} = [ $self, $drs ];
@@ -1201,9 +1254,11 @@ sub show_json { # for diagnostics (unstable output)
sub do_inits { # called via PublicInbox::DS::add_timer
my ($self) = @_;
- init_join_postfork($self);
- init_prune($self);
- scan_git_dirs($self) if $self->{-opt}->{scan} // 1;
+ grep !!$_, @{$self->{-opt}}{qw(scan prune)} and
+ @$SCANQ = map PublicInbox::Git->new($_), @{$self->{git_dirs}};
+ init_join_postfork $self;
+ init_prune $self;
+ scan_git_dirs $self;
my $max = $TODO{do_join} ? max($LIVE_JOBS, $NPROC) : $LIVE_JOBS;
index_next($self) for (1..$max);
}
@@ -1216,9 +1271,9 @@ sub cidx_run { # main entry point
my $restore = PublicInbox::OnDestroy->new($$,
\&PublicInbox::DS::sig_setmask, $SIGSET);
local $PRUNE_DONE = [];
- local $IDX_TODO = [];
- local $GIT_TODO = [];
- local ($DO_QUIT, $REINDEX, $TXN_BYTES, @GIT_DIR_GONE, @PRUNE_QUEUE,
+ local $IDXQ = [];
+ local $SCANQ = [];
+ local ($DO_QUIT, $REINDEX, $TXN_BYTES, @GIT_DIR_GONE, @PRUNEQ,
$REPO_CTX, %ALT_FH, $TMPDIR, @AWK, @COMM, $CMD_ENV,
%TODO, @IBXQ, @IBX, @JOIN, %JOIN, @JOIN_PFX,
@JOIN_DT, $DUMP_IBX_WPIPE, @OFF2ROOT, $XHC, @SORT, $GITS_NR);
* [PATCH 07/15] git: share unlinked pack checking code with gcf2
From: Eric Wong @ 2023-11-30 11:41 UTC (permalink / raw)
To: meta
It saves some code in case we keep libgit2 around.
---
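A standalone, Linux-only sketch (not the shared code itself) of the
unlinked-pack check being consolidated:

  use strict;
  use warnings;

  sub unlinked_packs_mapped {
          my ($pid) = @_;
          open my $fh, '<', "/proc/$pid/maps" or return 1; # assume the worst
          while (<$fh>) {
                  # a mapped pack/idx git has since deleted => restart needed
                  return 1 if /\.(?:idx|pack) \(deleted\)$/;
          }
          undef;
  }

  print unlinked_packs_mapped($$) ? "restart needed\n" : "packs still live\n";
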
lib/PublicInbox/Gcf2.pm | 16 ++++------------
lib/PublicInbox/Git.pm | 27 ++++++++++++++-------------
2 files changed, 18 insertions(+), 25 deletions(-)
diff --git a/lib/PublicInbox/Gcf2.pm b/lib/PublicInbox/Gcf2.pm
index dcbb201d..78392990 100644
--- a/lib/PublicInbox/Gcf2.pm
+++ b/lib/PublicInbox/Gcf2.pm
@@ -9,7 +9,7 @@ use PublicInbox::Spawn qw(which run_qx); # may set PERL_INLINE_DIRECTORY
use Fcntl qw(SEEK_SET);
use Time::HiRes qw(clock_gettime CLOCK_MONOTONIC);
use IO::Handle; # autoflush
-use PublicInbox::Git;
+use PublicInbox::Git qw($ck_unlinked_packs);
use PublicInbox::Lock;
use autodie qw(close open seek truncate);
@@ -86,16 +86,6 @@ sub add_alt ($$) {
1;
}
-sub have_unlinked_files () {
- # FIXME: port gcf2-like over to git.git so we won't need to
- # deal with libgit2
- return 1 if $^O ne 'linux';
- if (my $s = PublicInbox::IO::try_cat("/proc/$$/maps")) {
- return 1 if /\.(?:idx|pack) \(deleted\)/s;
- }
- undef;
-}
-
# Usage: $^X -MPublicInbox::Gcf2 -e PublicInbox::Gcf2::loop [EXPIRE-TIMEOUT]
# (see lib/PublicInbox/Gcf2Client.pm)
sub loop (;$) {
@@ -104,6 +94,7 @@ sub loop (;$) {
my (%seen, $check_at);
STDERR->autoflush(1);
STDOUT->autoflush(1);
+ my $pid = $$;
while (<STDIN>) {
chomp;
@@ -130,7 +121,8 @@ sub loop (;$) {
$check_at //= $now + $exp;
if ($now > $check_at) {
undef $check_at;
- if (have_unlinked_files()) {
+ if (!$ck_unlinked_packs ||
+ $ck_unlinked_packs->($pid)) {
$gcf2 = new();
%seen = ();
}
diff --git a/lib/PublicInbox/Git.pm b/lib/PublicInbox/Git.pm
index 235a35cd..9c4d938e 100644
--- a/lib/PublicInbox/Git.pm
+++ b/lib/PublicInbox/Git.pm
@@ -24,7 +24,8 @@ use Carp qw(croak carp);
use PublicInbox::SHA qw(sha_all);
our %HEXLEN2SHA = (40 => 1, 64 => 256);
our %OFMT2HEXLEN = (sha1 => 40, sha256 => 64);
-our @EXPORT_OK = qw(git_unquote git_quote %HEXLEN2SHA %OFMT2HEXLEN);
+our @EXPORT_OK = qw(git_unquote git_quote %HEXLEN2SHA %OFMT2HEXLEN
+ $ck_unlinked_packs);
our $in_cleanup;
our $async_warn; # true in read-only daemons
@@ -597,27 +598,27 @@ sub manifest_entry {
$ent;
}
+our $ck_unlinked_packs = $^O eq 'linux' ? sub {
+ # FIXME: port gcf2-like over to git.git so we won't need to
+ # deal with libgit2
+ my $s = try_cat "/proc/$_[0]/maps";
+ $s =~ /\.(?:idx|pack) \(deleted\)/s ? 1 : undef;
+} : undef;
+
# returns true if there are pending cat-file processes
sub cleanup_if_unlinked {
my ($self) = @_;
- return cleanup($self, 1) if $^O ne 'linux';
+ $ck_unlinked_packs or return cleanup($self, 1);
# Linux-specific /proc/$PID/maps access
# TODO: support this inside git.git
- my $ret = 0;
+ my $nr_live = 0;
for my $obj ($self, ($self->{ck} // ())) {
my $sock = $obj->{sock} // next;
my $pid = $sock->attached_pid // next;
- open my $fh, '<', "/proc/$pid/maps" or return cleanup($self, 1);
- while (<$fh>) {
- # n.b. we do not restart for unlinked multi-pack-index
- # since it's not too huge, and the startup cost may
- # be higher.
- /\.(?:idx|pack) \(deleted\)$/ and
- return cleanup($self, 1);
- }
- ++$ret;
+ $ck_unlinked_packs->($pid) and return cleanup($self, 1);
+ ++$nr_live;
}
- $ret;
+ $nr_live;
}
sub event_step {
* [PATCH 08/15] cindex: skip getpid guard for most OnDestroy use
From: Eric Wong @ 2023-11-30 11:41 UTC (permalink / raw)
To: meta
We no longer fork after cidx_init, so there's no need to spend
CPU cycles on the getpid() syscall, especially since glibc no
longer caches it and syscalls have become more expensive
these days due to CPU vulnerability mitigations.
---
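For context, a rough sketch (not the actual PublicInbox::OnDestroy
implementation) of what the now-optional PID guard does: remember
the creating process so a forked child destroying its copy of the
object doesn't run the callback:

  package OnDestroySketch; # illustration only
  use strict;
  use warnings;

  sub new {
          my ($cls, $pid, $cb, @args) = @_; # $pid may be undef (no guard)
          bless { pid => $pid, cb => $cb, args => \@args }, $cls;
  }

  sub DESTROY {
          my ($self) = @_;
          return if defined($self->{pid}) && $self->{pid} != $$; # forked child
          $self->{cb}->(@{$self->{args}});
  }

  package main;
  my $guard = OnDestroySketch->new($$, sub { warn "cleanup ran\n" });

Skipping the guard avoids the `$$' (getpid) check in every DESTROY
once we know no forks can happen after the object is created.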
lib/PublicInbox/CodeSearchIdx.pm | 22 +++++++++++-----------
1 file changed, 11 insertions(+), 11 deletions(-)
diff --git a/lib/PublicInbox/CodeSearchIdx.pm b/lib/PublicInbox/CodeSearchIdx.pm
index 7580a49a..555a1efe 100644
--- a/lib/PublicInbox/CodeSearchIdx.pm
+++ b/lib/PublicInbox/CodeSearchIdx.pm
@@ -365,7 +365,7 @@ sub repo_stored {
$did > 0 or die "BUG: $repo_ctx->{repo}->{git_dir}: docid=$did";
my ($c, $p) = PublicInbox::PktOp->pair;
$c->{ops}->{shard_done} = [ $self, $repo_ctx,
- PublicInbox::OnDestroy->new($$, \&next_repos, $repo_ctx, $drs)];
+ PublicInbox::OnDestroy->new(\&next_repos, $repo_ctx, $drs)];
# shard_done fires when all shards are committed
my @active = keys %{$repo_ctx->{active}};
$IDX_SHARDS[$_]->wq_io_do('shard_commit', [ $p->{op_p} ]) for @active;
@@ -607,8 +607,8 @@ sub index_next ($) {
index_repo(undef, $self, shift @$IDXQ);
} elsif ($SCANQ && @$SCANQ) {
my $git = shift @$SCANQ;
- my $prep_repo = PublicInbox::OnDestroy->new($$, \&prep_repo,
- $self, $git);
+ my $prep_repo = PublicInbox::OnDestroy->new(\&prep_repo,
+ $self, $git);
fp_start($self, $git, $prep_repo);
ct_start($self, $git, $prep_repo);
} elsif ($TMPDIR) {
@@ -669,7 +669,7 @@ sub index_repo { # run_git cb
my $repo_ctx = $REPO_CTX = { self => $self, repo => $repo };
delete $git->{-cidx_gits_fini}; # may fire gits_fini
my $drs = delete $git->{-cidx_dump_roots_start};
- my $index_done = PublicInbox::OnDestroy->new($$, \&index_done,
+ my $index_done = PublicInbox::OnDestroy->new(\&index_done,
$repo_ctx, $drs);
my ($c, $p) = PublicInbox::PktOp->pair;
$c->{ops}->{shard_done} = [ $self, $repo_ctx, $index_done ];
@@ -779,7 +779,7 @@ sub scan_git_dirs ($) {
my ($self) = @_;
@$SCANQ = () unless $self->{-opt}->{scan};
$GITS_NR = @$SCANQ or return;
- my $gits_fini = PublicInbox::OnDestroy->new($$, \&gits_fini);
+ my $gits_fini = PublicInbox::OnDestroy->new(\&gits_fini);
$_->{-cidx_gits_fini} = $gits_fini for @$SCANQ;
if (my $drs = $TODO{dump_roots_start}) {
$_->{-cidx_dump_roots_start} = $drs for @$SCANQ;
@@ -851,7 +851,7 @@ sub prep_umask ($) {
umask == $um or progress($self, 'using umask from ',
$self->{cidx_dir}, ': ',
sprintf('0%03o', $um));
- PublicInbox::OnDestroy->new($$, \&CORE::umask, umask($um));
+ PublicInbox::OnDestroy->new(\&CORE::umask, umask($um));
} else {
$self->{umask} = umask; # for SearchIdx->with_umask
undef;
@@ -1073,11 +1073,11 @@ EOM
($JOIN_DT[1]) = ($QRY_STR =~ /\.\.([0-9]{14})\z/); # YYYYmmddHHMMSS
($JOIN_DT[0]) = ($QRY_STR =~ /\Adt:([0-9]{14})/); # YYYYmmddHHMMSS
$JOIN_DT[0] //= '19700101'.'000000'; # git uses unsigned times
- $TODO{do_join} = PublicInbox::OnDestroy->new($$, \&do_join, $self);
+ $TODO{do_join} = PublicInbox::OnDestroy->new(\&do_join, $self);
$TODO{joining} = 1; # keep shards_active() happy
- $TODO{dump_ibx_start} = PublicInbox::OnDestroy->new($$,
- \&dump_ibx_start, $self, $TODO{do_join});
- $TODO{dump_roots_start} = PublicInbox::OnDestroy->new($$,
+ $TODO{dump_ibx_start} = PublicInbox::OnDestroy->new(\&dump_ibx_start,
+ $self, $TODO{do_join});
+ $TODO{dump_roots_start} = PublicInbox::OnDestroy->new(
\&dump_roots_start, $self, $TODO{do_join});
progress($self, "will join in $QRY_STR date range...");
my $id = -1;
@@ -1100,7 +1100,7 @@ sub init_prune ($) {
require_progs('prune', 'xapian-delve' => \@delve, sed => \@sed,
comm => \@COMM, awk => \@AWK);
for (0..$#IDX_SHARDS) { push @delve, "$self->{xpfx}/$_" }
- my $run_prune = PublicInbox::OnDestroy->new($$, \&run_prune, $self,
+ my $run_prune = PublicInbox::OnDestroy->new(\&run_prune, $self,
$TODO{dump_roots_start});
my ($sort_opt, $sed_opt, $delve_opt);
pipe(local $sed_opt->{0}, local $delve_opt->{1});
* [PATCH 09/15] spawn: drop IO layer support from redirects
From: Eric Wong @ 2023-11-30 11:41 UTC (permalink / raw)
To: meta
When setting up stdin for commands, the write_file API is now
convenient enough that keeping special IO-layer support in the
process spawning code isn't worth it.
When reading stdout of commands, we should probably be using
utf8_maybe everywhere since there'll always be legacy encodings
in git repos.
Reading regular files with :utf8 also results in worse memory
management since the file size cannot be used as a hint.
---
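As a standalone illustration (not public-inbox code; `wc -c' stands
in for a real child) of the replacement idiom for scalar stdin:
stage the input in an anonymous temp file and hand the handle to
the child, rather than asking the spawn layer to apply IO layers:

  use strict;
  use warnings;
  use IO::Handle; # ->flush
  use Fcntl qw(SEEK_SET);

  my $input = "input for the child process\n";
  open my $fh, '+>', undef or die "open: $!"; # anonymous, unlinked temp file
  print $fh $input;
  $fh->flush or die "flush: $!";
  sysseek($fh, 0, SEEK_SET) or die "sysseek: $!";

  my $pid = open(my $rd, '-|') // die "fork: $!";
  if ($pid == 0) { # child: the temp file becomes stdin
          open STDIN, '<&', $fh or die "dup: $!";
          exec 'wc', '-c';
          die "exec: $!";
  }
  print while <$rd>; # parent: prints the byte count from wc
  close $rd;
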
lib/PublicInbox/MailDiff.pm | 3 ++-
lib/PublicInbox/SearchIdx.pm | 5 ++++-
lib/PublicInbox/Spawn.pm | 32 +++++++++++---------------------
3 files changed, 17 insertions(+), 23 deletions(-)
diff --git a/lib/PublicInbox/MailDiff.pm b/lib/PublicInbox/MailDiff.pm
index e4e262ef..125360fe 100644
--- a/lib/PublicInbox/MailDiff.pm
+++ b/lib/PublicInbox/MailDiff.pm
@@ -65,6 +65,7 @@ sub next_smsg ($) {
sub emit_msg_diff {
my ($bref, $self) = @_; # bref is `git diff' output
require PublicInbox::Hval;
+ PublicInbox::Hval::utf8_maybe($$bref);
# will be escaped to `•' in HTML
$self->{ctx}->{ibx}->{obfuscate} and
@@ -81,7 +82,7 @@ sub do_diff {
my $dir = "$self->{tmp}/$n";
$self->dump_eml($dir, $eml);
my $cmd = [ qw(git diff --no-index --no-color -- a), $n ];
- my $opt = { -C => "$self->{tmp}", quiet => 1, 1 => [':utf8', \my $o] };
+ my $opt = { -C => "$self->{tmp}", quiet => 1 };
my $qsp = PublicInbox::Qspawn->new($cmd, undef, $opt);
$qsp->psgi_qx($self->{ctx}->{env}, undef, \&emit_msg_diff, $self);
}
diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm
index 17538027..86c435fd 100644
--- a/lib/PublicInbox/SearchIdx.pm
+++ b/lib/PublicInbox/SearchIdx.pm
@@ -355,8 +355,11 @@ sub index_body_text {
my $rd;
if ($$sref =~ /^(?:diff|---|\+\+\+) /ms) { # start patch-id in parallel
my $git = ($self->{ibx} // $self->{eidx} // $self)->git;
+ my $fh = PublicInbox::IO::write_file '+>:utf8', undef, $$sref;
+ $fh->flush or die "flush: $!";
+ sysseek($fh, 0, SEEK_SET);
$rd = popen_rd($git->cmd(qw(patch-id --stable)), undef,
- { 0 => [ ':utf8', $sref ] });
+ { 0 => $fh });
}
# split off quoted and unquoted blocks:
diff --git a/lib/PublicInbox/Spawn.pm b/lib/PublicInbox/Spawn.pm
index 9c680690..e6b12994 100644
--- a/lib/PublicInbox/Spawn.pm
+++ b/lib/PublicInbox/Spawn.pm
@@ -332,18 +332,6 @@ sub which ($) {
undef;
}
-sub scalar_redirect {
- my ($layer, $opt, $child_fd, $bref) = @_;
- open my $fh, '+>'.$layer, undef;
- $opt->{"fh.$child_fd"} = $fh;
- if ($child_fd == 0) {
- print $fh $$bref;
- $fh->flush or die "flush: $!";
- sysseek($fh, 0, SEEK_SET);
- }
- fileno($fh);
-}
-
sub spawn ($;$$) {
my ($cmd, $env, $opt) = @_;
my $f = which($cmd->[0]) // die "$cmd->[0]: command not found\n";
@@ -354,14 +342,18 @@ sub spawn ($;$$) {
}
for my $child_fd (0..2) {
my $pfd = $opt->{$child_fd};
- if ('ARRAY' eq ref($pfd)) {
- my ($layer, $bref) = @$pfd;
- $pfd = scalar_redirect($layer, $opt, $child_fd, $bref)
- } elsif ('SCALAR' eq ref($pfd)) {
- $pfd = scalar_redirect('', $opt, $child_fd, $pfd);
+ if ('SCALAR' eq ref($pfd)) {
+ open my $fh, '+>', undef;
+ $opt->{"fh.$child_fd"} = $fh; # for read_out_err
+ if ($child_fd == 0) {
+ print $fh $$pfd;
+ $fh->flush or die "flush: $!";
+ sysseek($fh, 0, SEEK_SET);
+ }
+ $pfd = fileno($fh);
} elsif (defined($pfd) && $pfd !~ /\A[0-9]+\z/) {
my $fd = fileno($pfd) //
- die "$pfd not an IO GLOB? $!";
+ croak "BUG: $pfd not an IO GLOB? $!";
$pfd = $fd;
}
$rdr[$child_fd] = $pfd // $child_fd;
@@ -399,9 +391,7 @@ sub read_out_err ($) {
for my $fd (1, 2) { # read stdout/stderr
my $fh = delete($opt->{"fh.$fd"}) // next;
seek($fh, 0, SEEK_SET);
- my $dst = $opt->{$fd};
- $dst = $opt->{$fd} = $dst->[1] if ref($dst) eq 'ARRAY';
- PublicInbox::IO::read_all $fh, 0, $dst
+ PublicInbox::IO::read_all $fh, undef, $opt->{$fd};
}
}
* [PATCH 10/15] cindex: speed up initial scan setup phase
From: Eric Wong @ 2023-11-30 11:41 UTC (permalink / raw)
To: meta
This brings a no-op -cindex scan of a git.kernel.org mirror
down from 70s to 10s with a hot cache on a busy machine.
CPU-intensive SHA-256 fingerprinting of the `git show-ref'
result can be parallelized on shard workers. Future changes can
move more of the initial scan setup phase into shard workers for
more parallelism.
But most of the speedup for unchanged repos comes from
delaying the commit-time read until we've seen that the
fingerprint is out-of-date, since reading commit times
requires far more I/O than reading only the refs needed
for fingerprints.
---
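For illustration only (not the shard worker code; the repository
path is made up), the kind of fingerprint being parallelized: a
SHA-256 over branch and tag hashes, so cheap ref reads decide
whether the expensive commit-time check is needed at all:

  use strict;
  use warnings;
  use Digest::SHA qw(sha256_hex);

  my $git_dir = shift // '.'; # hypothetical repository
  my $refs = `git --git-dir=$git_dir show-ref --heads --tags --hash`;
  die "git show-ref failed: \$?=$?\n" if $?;
  print sha256_hex($refs), "\n"; # changes iff branches or tags moved
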
lib/PublicInbox/CodeSearchIdx.pm | 99 +++++++++++++++++---------------
1 file changed, 53 insertions(+), 46 deletions(-)
diff --git a/lib/PublicInbox/CodeSearchIdx.pm b/lib/PublicInbox/CodeSearchIdx.pm
index 555a1efe..ec0fc6e3 100644
--- a/lib/PublicInbox/CodeSearchIdx.pm
+++ b/lib/PublicInbox/CodeSearchIdx.pm
@@ -105,6 +105,7 @@ our (
@JOIN_DT, # YYYYmmddHHMMSS for dt:
$QRY_STR, # common query string for both code and inbox associations
$DUMP_IBX_WPIPE, # goes to sort(1)
+ $ANY_SHARD, # shard round-robin for scan fingerprinting
@OFF2ROOT,
);
@@ -416,51 +417,42 @@ sub run_git {
# this is different from the grokmirror-compatible fingerprint since we
# only care about --heads (branches) and --tags, and not even their names
-sub fp_start ($$$) {
- my ($self, $git, $prep_repo) = @_;
+sub fp_start ($$) {
+ my ($self, $git) = @_;
return if $DO_QUIT;
open my $refs, '+>', undef;
$git->{-repo}->{refs} = $refs;
- run_git([qw(show-ref --heads --tags --hash)], { 1 => $refs },
- \&fp_fini, $self, $git, $prep_repo);
-}
-
-sub fp_fini { # run_git cb
- my (undef, $self, $git, $prep_repo) = @_;
- my $refs = $git->{-repo}->{refs} // die 'BUG: no {-repo}->{refs}';
- sysseek($refs, 0, SEEK_SET);
- $git->{-repo}->{fp} = sha_all(256, $refs)->hexdigest;
+ my ($c, $p) = PublicInbox::PktOp->pair;
+ my $next_on_err = PublicInbox::OnDestroy->new(\&index_next, $self);
+ $c->{ops}->{fp_done} = [ $self, $git, $next_on_err ];
+ $IDX_SHARDS[++$ANY_SHARD % scalar(@IDX_SHARDS)]->wq_io_do('fp_async',
+ [ $p->{op_p}, $refs ], $git->{git_dir})
}
-sub ct_start ($$$) {
- my ($self, $git, $prep_repo) = @_;
- return if $DO_QUIT;
- run_git([ qw[for-each-ref --sort=-committerdate
- --format=%(committerdate:raw) --count=1
- refs/heads/ refs/tags/] ], undef, # capture like qx
- \&ct_fini, $self, $git, $prep_repo);
+sub fp_async { # via wq_io_do in worker
+ my ($self, $git_dir) = @_;
+ my $op_p = delete $self->{0} // die 'BUG: no {0} op_p';
+ my $refs = delete $self->{1} // die 'BUG: no {1} refs';
+ my $git = PublicInbox::Git->new($git_dir);
+ run_git([qw(show-ref --heads --tags --hash)], { 1 => $refs },
+ \&fp_async_done, $self, $git, $op_p);
}
-sub ct_fini { # run_git cb
- my ($opt, $self, $git, $prep_repo) = @_;
- my ($ct) = split(/\s+/, ${$opt->{1}}); # drop TZ + LF
- $git->{-repo}->{ct} = $ct + 0;
+sub fp_async_done { # run_git cb from worker
+ my ($opt, $self, $git, $op_p) = @_;
+ my $refs = delete $opt->{1} // 'BUG: no {-repo}->{refs}';
+ sysseek($refs, 0, SEEK_SET);
+ send($op_p, 'fp_done '.sha_all(256, $refs)->hexdigest, 0);
}
-# TODO: also index gitweb.owner and the full fingerprint for grokmirror?
-sub prep_repo ($$) {
- my ($self, $git) = @_;
+sub fp_done { # called parent via PktOp by fp_async_done
+ my ($self, $git, $next_on_err, $hex) = @_;
+ $next_on_err->cancel;
return if $DO_QUIT;
- return index_next($self) if $git->{-cidx_err};
- my $repo = $git->{-repo} // die 'BUG: no {-repo}';
- if (!defined($repo->{ct})) {
- warn "W: $git->{git_dir} has no commits, skipping\n";
- delete $git->{-repo};
- return index_next($self);
- }
+ $git->{-repo}->{fp} = $hex;
my $n = git_dir_hash($git->{git_dir}) % scalar(@RDONLY_XDB);
my $shard = bless { %$self, shard => $n }, ref($self);
- $repo->{shard_n} = $n;
+ $git->{-repo}->{shard_n} = $n;
delete @$shard{qw(lockfh lock_path)};
local $shard->{xdb} = $RDONLY_XDB[$n] // die "BUG: shard[$n] undef";
$shard->retry_reopen(\&check_existing, $self, $git);
@@ -469,7 +461,7 @@ sub prep_repo ($$) {
sub check_existing { # retry_reopen callback
my ($shard, $self, $git) = @_;
my @docids = $shard->docids_of_git_dir($git->{git_dir});
- my $docid = shift(@docids) // return get_roots($self, $git);
+ my $docid = shift(@docids) // return prep_repo($self, $git); # new repo
my $doc = $shard->get_doc($docid) //
die "BUG: no #$docid ($git->{git_dir})";
my $old_fp = $REINDEX ? "\0invalid" : $doc->get_data;
@@ -482,7 +474,7 @@ sub check_existing { # retry_reopen callback
warn "BUG: $git->{git_dir} indexed multiple times, culling\n";
$git->{-repo}->{to_delete} = \@docids; # XXX needed?
}
- get_roots($self, $git);
+ prep_repo($self, $git);
}
sub partition_refs ($$$) {
@@ -604,13 +596,9 @@ sub index_next ($) {
my ($self) = @_;
return if $DO_QUIT;
if ($IDXQ && @$IDXQ) {
- index_repo(undef, $self, shift @$IDXQ);
+ index_repo($self, shift @$IDXQ);
} elsif ($SCANQ && @$SCANQ) {
- my $git = shift @$SCANQ;
- my $prep_repo = PublicInbox::OnDestroy->new(\&prep_repo,
- $self, $git);
- fp_start($self, $git, $prep_repo);
- ct_start($self, $git, $prep_repo);
+ fp_start $self, shift @$SCANQ;
} elsif ($TMPDIR) {
delete $TODO{dump_roots_start};
delete $TODO{dump_ibx_start}; # runs OnDestroy once
@@ -649,12 +637,17 @@ sub index_done { # OnDestroy cb called when done indexing each code repo
# repo_stored will fire once store_repo is done
}
-sub index_repo { # run_git cb
- my (undef, $self, $git) = @_;
+sub index_repo {
+ my ($self, $git) = @_;
return if $DO_QUIT;
+ my $repo = $git->{-repo} // die 'BUG: no {-repo}';
return index_next($self) if $git->{-cidx_err};
+ if (!defined($repo->{ct})) {
+ warn "W: $git->{git_dir} has no commits, skipping\n";
+ return index_next($self);
+ }
return push(@$IDXQ, $git) if $REPO_CTX; # busy
- my $repo = delete $git->{-repo} or return index_next($self);
+ delete $git->{-repo};
my $roots_fh = delete $repo->{roots_fh} // die 'BUG: no {roots_fh}';
seek($roots_fh, 0, SEEK_SET);
chomp(my @roots = PublicInbox::IO::read_all $roots_fh);
@@ -685,15 +678,28 @@ sub index_repo { # run_git cb
# shard_done fires when shard_index is done
}
-sub get_roots ($$) {
+sub ct_fini { # run_git cb
+ my ($opt, $self, $git, $index_repo) = @_;
+ my ($ct) = split(/\s+/, ${$opt->{1}}); # drop TZ + LF
+ $git->{-repo}->{ct} = $ct + 0;
+}
+
+# TODO: also index gitweb.owner and the full fingerprint for grokmirror?
+sub prep_repo ($$) {
my ($self, $git) = @_;
return if $DO_QUIT;
+ my $index_repo = PublicInbox::OnDestroy->new(\&index_repo, $self, $git);
my $refs = $git->{-repo}->{refs} // die 'BUG: no {-repo}->{refs}';
sysseek($refs, 0, SEEK_SET);
open my $roots_fh, '+>', undef;
$git->{-repo}->{roots_fh} = $roots_fh;
run_git([ qw(rev-list --stdin --max-parents=0) ],
- { 0 => $refs, 1 => $roots_fh }, \&index_repo, $self, $git)
+ { 0 => $refs, 1 => $roots_fh }, \&PublicInbox::Config::noop,
+ $self, $git, $index_repo);
+ run_git([ qw[for-each-ref --sort=-committerdate
+ --format=%(committerdate:raw) --count=1
+ refs/heads/ refs/tags/] ], undef, # capture like qx
+ \&ct_fini, $self, $git, $index_repo);
}
# for PublicInbox::SearchIdx `git patch-id' call and with_umask
@@ -1295,6 +1301,7 @@ sub cidx_run { # main entry point
init_join_prefork($self)
}
local @IDX_SHARDS = cidx_init($self); # forks workers
+ local $ANY_SHARD = -1;
local $self->{current_info} = '';
local $MY_SIG = {
CHLD => \&PublicInbox::DS::enqueue_reap,
* [PATCH 11/15] inbox: expire resources more aggressively
From: Eric Wong @ 2023-11-30 11:41 UTC (permalink / raw)
To: meta
We no longer trigger git cleanups from the Inbox package since
`git cat-file' users have their own cleanup to support git
coderepos not associated with any inbox.
This change means we unconditionally expire SQLite and Xapian
FDs and some internal caches regardless of git activity. The
old logic was irrelevant to Gcf2 (libgit2) users anyway, since
we couldn't determine whether an inbox was active based
on {inflight} git requests, and upcoming changes will make it
inaccurate for all extindex/cindex users as well.
Opening SQLite and Xapian DBs is fairly cheap, so it's a small
price to pay to reduce memory use and fragmentation.
---
lib/PublicInbox/Inbox.pm | 24 ++++++++----------------
1 file changed, 8 insertions(+), 16 deletions(-)
diff --git a/lib/PublicInbox/Inbox.pm b/lib/PublicInbox/Inbox.pm
index e71ef6d2..7af0ad90 100644
--- a/lib/PublicInbox/Inbox.pm
+++ b/lib/PublicInbox/Inbox.pm
@@ -12,28 +12,20 @@ use List::Util qw(max);
use Carp qw(croak);
use PublicInbox::Compat qw(uniqstr);
-# returns true if further checking is required
+# in case DBs get replaced (Xapcmd does it for v1)
sub check_inodes ($) {
for (qw(over mm)) { $_[0]->{$_}->check_inodes if $_[0]->{$_} }
}
+# search/over/mm hold onto FDs and description+cloneurl may get updated.
+# creating long-lived allocations in the same phase as short-lived
+# allocations also leads to fragmentation, so we don't want some stuff
+# living too long.
sub do_cleanup {
my ($ibx) = @_;
- my $live;
- if (defined $ibx->{git}) {
- $live = $ibx->isa(__PACKAGE__) ? $ibx->{git}->cleanup(1)
- : $ibx->{git}->cleanup_if_unlinked;
- delete($ibx->{git}) unless $live;
- }
- if ($live) {
- check_inodes($ibx);
- } else {
- delete(@$ibx{qw(over mm description cloneurl
- -imap_url -nntp_url -pop3_url)});
- }
- my $srch = $ibx->{search} // $ibx;
+ my ($srch) = delete @$ibx{qw(search over mm description cloneurl)};
+ $srch //= $ibx; # extsearch
delete @$srch{qw(xdb qp)};
- PublicInbox::DS::add_uniq_timer($ibx+0, 5, \&do_cleanup, $ibx) if $live;
}
sub _cleanup_later ($) {
@@ -370,7 +362,7 @@ sub unsubscribe_unlock {
# called by inotify
sub on_unlock {
my ($self) = @_;
- check_inodes($self);
+ check_inodes($self); # DB files may be replaced while holding lock
my $subs = $self->{unlock_subs} or return;
for my $obj (values %$subs) {
eval { $obj->on_inbox_unlock($self) };
* [PATCH 12/15] git_async_cat: use git from "all" extindex if possible
From: Eric Wong @ 2023-11-30 11:41 UTC (permalink / raw)
To: meta
For inboxes associated with an extindex (currently only the
special "all" one), we can share the git process across
all of those inboxes unambiguously when retrieving blobs
by full SHA-1.
The comment referencing my proposed patch is also out-of-date,
as that git speedup has been part of git since 2.33.
---
lib/PublicInbox/GitAsyncCat.pm | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/lib/PublicInbox/GitAsyncCat.pm b/lib/PublicInbox/GitAsyncCat.pm
index 09744b34..f57e0336 100644
--- a/lib/PublicInbox/GitAsyncCat.pm
+++ b/lib/PublicInbox/GitAsyncCat.pm
@@ -9,11 +9,11 @@ our $GCF2C; # singleton PublicInbox::Gcf2Client
sub ibx_async_cat ($$$$) {
my ($ibx, $oid, $cb, $arg) = @_;
- my $git = $ibx->{git} // $ibx->git;
+ my $isrch = $ibx->{isrch};
+ my $git = $isrch ? $isrch->{es}->git : ($ibx->{git} // $ibx->git);
# {topdir} means ExtSearch (likely [extindex "all"]) with potentially
- # 100K alternates. git(1) has a proposed patch for 100K alternates:
- # <https://lore.kernel.org/git/20210624005806.12079-1-e@80x24.org/>
- if (!defined($ibx->{topdir}) && !defined($git->{-tmp}) &&
+ # 100K alternates. git v2.33+ can handle 100k alternates fairly well.
+ if (!$isrch && !defined($ibx->{topdir}) && !defined($git->{-tmp}) &&
($GCF2C //= eval {
require PublicInbox::Gcf2Client;
PublicInbox::Gcf2Client::new();
* [PATCH 13/15] www_listing: support publicInbox.nameIsUrl
From: Eric Wong @ 2023-11-30 11:41 UTC (permalink / raw)
To: meta
This is a convenient (and slightly memory-saving) alternative to
specifying a `publicinbox.*.url' entry for every single inbox
when using publicinbox.wwwListing.
---
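A hypothetical config excerpt (names and paths invented) showing
the new knob together with the listing it affects:

  [publicinbox]
  	wwwListing = all
  	nameIsUrl = true
  [publicinbox "meta"]
  	inboxdir = /srv/inboxes/meta
  	address = meta@example.com
  	; no publicinbox.meta.url needed: the listing treats "meta" as the URL
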
Documentation/public-inbox-config.pod | 19 ++++++++++++++++++-
lib/PublicInbox/WwwListing.pm | 21 +++++++++++++--------
2 files changed, 31 insertions(+), 9 deletions(-)
diff --git a/Documentation/public-inbox-config.pod b/Documentation/public-inbox-config.pod
index d394d31f..d2017704 100644
--- a/Documentation/public-inbox-config.pod
+++ b/Documentation/public-inbox-config.pod
@@ -273,7 +273,9 @@ Default: 25
A comma-delimited list of listings to hide the inbox from.
-Valid values are currently C<www> and C<manifest>.
+Valid values are currently C<www> and C<manifest> for non-C<404>
+values of L</publicinbox.wwwListing> and L</publicinbox.grokManifest>,
+respectively.
Default: none
@@ -379,6 +381,21 @@ TODO support showing cgit listing
Default: C<404>
+=item publicinbox.nameIsUrl
+
+Treat the name of the public inbox as its unqualified URL when
+using C<publicInbox.wwwListing=all>. That is, every
+C<[publicinbox "foo"]> section implicitly sets C<publicinbox.foo.url=foo>.
+
+This is a convenient alternative to specifying
+C<publicinbox.E<lt>nameE<gt>.url> for every single inbox
+when using C<publicInbox.wwwListing=all>, provided your
+inbox URLs are domain-agnostic.
+
+Default: false
+
+New in public-inbox 2.0.0 (PENDING).
+
=item publicinbox.grokmanifest
Controls the generation of a grokmirror-compatible gzipped JSON file
diff --git a/lib/PublicInbox/WwwListing.pm b/lib/PublicInbox/WwwListing.pm
index 21e5b8bc..e3d2e84c 100644
--- a/lib/PublicInbox/WwwListing.pm
+++ b/lib/PublicInbox/WwwListing.pm
@@ -41,10 +41,7 @@ sub list_match_i { # ConfigIter callback
if (defined($section)) {
return if $section !~ m!\Apublicinbox\.([^/]+)\z!;
my $ibx = $cfg->lookup_name($1) or return;
- if (!$ibx->{-hide}->{$ctx->hide_key} &&
- grep(/$re/, @{$ibx->{url} // []})) {
- $ctx->ibx_entry($ibx);
- }
+ $ctx->ibx_entry($ibx) unless $ctx->hide_inbox($ibx, $re);
} else { # undef == "EOF"
$ctx->{-wcb}->($ctx->psgi_triple);
}
@@ -54,13 +51,17 @@ sub url_filter {
my ($ctx, $key, $default) = @_;
$key //= 'publicInbox.wwwListing';
$default //= '404';
- my $v = $ctx->{www}->{pi_cfg}->{lc $key} // $default;
+ my $cfg = $ctx->{www}->{pi_cfg};
+ my $v = $cfg->{lc $key} // $default;
again:
if ($v eq 'match=domain') {
my $h = $ctx->{env}->{HTTP_HOST} // $ctx->{env}->{SERVER_NAME};
$h =~ s/:[0-9]+\z//;
(qr!\A(?:https?:)?//\Q$h\E(?::[0-9]+)?/!i, "url:$h");
} elsif ($v eq 'all') {
+ my $niu = $cfg->{lc 'publicinbox.nameIsUrl'};
+ defined($niu) && $cfg->git_bool($niu) and
+ $ctx->{-name_is_url} = [ '.' ];
(qr/./, undef);
} elsif ($v eq '404') {
(undef, undef);
@@ -76,6 +77,12 @@ EOF
sub hide_key { 'www' }
+sub hide_inbox {
+ my ($ctx, $ibx, $re) = @_;
+ $ibx->{-hide}->{$ctx->hide_key} ||
+ !grep(/$re/, @{$ibx->{url} // $ctx->{-name_is_url} // []})
+}
+
sub add_misc_ibx { # MiscSearch->retry_reopen callback
my ($misc, $ctx, $re, $qs) = @_;
require PublicInbox::SearchQuery;
@@ -104,15 +111,13 @@ sub add_misc_ibx { # MiscSearch->retry_reopen callback
$ctx->ibx_entry($pi_cfg->ALL // die('BUG: ->ALL expected'), {});
}
my $mset = $misc->mset($qs, $opt); # sorts by $MODIFIED (mtime)
- my $hide_key = $ctx->hide_key;
for my $mi ($mset->items) {
my $doc = $mi->get_document;
my ($eidx_key) = PublicInbox::Search::xap_terms('Q', $doc);
$eidx_key // next;
my $ibx = $pi_cfg->lookup_eidx_key($eidx_key) // next;
- next if $ibx->{-hide}->{$hide_key};
- grep(/$re/, @{$ibx->{url} // []}) or next;
+ next if $ctx->hide_inbox($ibx, $re);
$ctx->ibx_entry($ibx, $misc->doc2ibx_cache_ent($doc));
if ($r) { # for descriptions in search_nav_bot
my $pct = PublicInbox::Search::get_pct($mi);
* [PATCH 14/15] inbox: shrink data structures for publicinbox.*.hide
From: Eric Wong @ 2023-11-30 11:41 UTC (permalink / raw)
To: meta
We no longer vivify the intermediate $ibx->{-hide} hashref;
instead we use $ibx->{-hide_$KEY} directly.  This avoids an
extra allocation and extra hash table lookups.
---
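A tiny standalone sketch (not the patch itself; the hide value is
made up) of the flattened layout:

  use strict;
  use warnings;

  my %ibx = (name => 'test');
  for my $v ('www, manifest') { # hypothetical publicinbox.test.hide value
          $ibx{'-hide_'.$_} = 1 for split /\s*,\s*/, $v;
  }
  # one flat key per hidden listing, no intermediate {-hide} hashref:
  print join(' ', sort grep /^-hide_/, keys %ibx), "\n"; # -hide_manifest -hide_www
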
lib/PublicInbox/CodeSearch.pm | 2 +-
lib/PublicInbox/Inbox.pm | 8 ++------
lib/PublicInbox/WwwListing.pm | 2 +-
3 files changed, 4 insertions(+), 8 deletions(-)
diff --git a/lib/PublicInbox/CodeSearch.pm b/lib/PublicInbox/CodeSearch.pm
index 208f7528..f4694686 100644
--- a/lib/PublicInbox/CodeSearch.pm
+++ b/lib/PublicInbox/CodeSearch.pm
@@ -328,7 +328,7 @@ EOM
if (my $git = $dir2cr{$_}) {
$ibx_p2g{$_} = $git;
$ibx2self = 1;
- $ibx->{-hide}->{www} or
+ $ibx->{-hide_www} or
push @{$git->{ibx_score}},
[ $nr, $ibx->{name} ];
push @$gits, $git;
diff --git a/lib/PublicInbox/Inbox.pm b/lib/PublicInbox/Inbox.pm
index 7af0ad90..dd689221 100644
--- a/lib/PublicInbox/Inbox.pm
+++ b/lib/PublicInbox/Inbox.pm
@@ -70,12 +70,8 @@ sub new {
delete $opts->{feedmax};
}
# allow any combination of multi-line or comma-delimited hide entries
- my $hide = {};
- if (defined(my $h = $opts->{hide})) {
- foreach my $v (@$h) {
- $hide->{$_} = 1 foreach (split(/\s*,\s*/, $v));
- }
- $opts->{-hide} = $hide;
+ for $v (@{delete($opts->{hide}) // []}) {
+ $opts->{-'hide_'.$_} = 1 for split(/\s*,\s*/, $v);
}
bless $opts, $class;
}
diff --git a/lib/PublicInbox/WwwListing.pm b/lib/PublicInbox/WwwListing.pm
index e3d2e84c..2d6c74da 100644
--- a/lib/PublicInbox/WwwListing.pm
+++ b/lib/PublicInbox/WwwListing.pm
@@ -79,7 +79,7 @@ sub hide_key { 'www' }
sub hide_inbox {
my ($ctx, $ibx, $re) = @_;
- $ibx->{-hide}->{$ctx->hide_key} ||
+ $ibx->{'-hide_'.$ctx->hide_key} ||
!grep(/$re/, @{$ibx->{url} // $ctx->{-name_is_url} // []})
}
* [PATCH 15/15] codesearch: use retry_reopen for WWW
From: Eric Wong @ 2023-11-30 11:41 UTC (permalink / raw)
To: meta
As with mail search, a cindex may be updated while WWW is
serving requests. Thus we must reopen the Xapian DB when
the revision we're using becomes stale.
---
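A rough sketch (not the actual retry_reopen implementation; the
exception class name depends on which Xapian Perl binding is
loaded) of the retry-on-stale pattern this relies on:

  sub retry_reopen_sketch {
          my ($xdb, $cb, @args) = @_;
          for my $attempt (1, 2) { # one reopen is normally enough
                  my $ret = eval { $cb->(@args) };
                  return $ret unless $@;
                  # rethrow anything that isn't a stale-revision error:
                  die $@ if $attempt == 2 || ref($@) !~ /DatabaseModifiedError/;
                  $xdb->reopen; # pick up the newly committed revision
          }
  }
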
lib/PublicInbox/CodeSearch.pm | 25 +++++++++++++++----------
1 file changed, 15 insertions(+), 10 deletions(-)
diff --git a/lib/PublicInbox/CodeSearch.pm b/lib/PublicInbox/CodeSearch.pm
index f4694686..a2f4bae8 100644
--- a/lib/PublicInbox/CodeSearch.pm
+++ b/lib/PublicInbox/CodeSearch.pm
@@ -256,17 +256,22 @@ sub load_ct { # retry_reopen cb
}
}
+sub load_ct { # retry_reopen cb
+ my ($self, $git_dir) = @_;
+ my @ids = docids_of_git_dir $self, $git_dir or return;
+ for (@ids) {
+ my $doc = $self->get_doc($_) // next;
+ return int_val($doc, CT);
+ }
+}
+
sub load_commit_times { # each_cindex callback
my ($self, $todo) = @_; # todo = [ [ time, git ], [ time, git ] ...]
- my (@pending, $rec, $dir, @ids, $doc);
+ my (@pending, $rec, $ct);
while ($rec = shift @$todo) {
- @ids = docids_of_git_dir $self, $rec->[1]->{git_dir};
- if (@ids) {
- for (@ids) {
- $doc = $self->get_doc($_) // next;
- $rec->[0] = int_val($doc, CT);
- last;
- }
+ $ct = $self->retry_reopen(\&load_ct, $rec->[1]->{git_dir});
+ if (defined $ct) {
+ $rec->[0] = $ct;
} else { # may be in another cindex:
push @pending, $rec;
}
@@ -295,7 +300,7 @@ EOM
$git;
};
}
- my $jd = join_data($self) or return warn <<EOM;
+ my $jd = $self->retry_reopen(\&join_data, $self) or return warn <<EOM;
W: cindex.$name.topdir=$self->{topdir} has no usable join data for $cfg_f
EOM
my ($ekeys, $roots, $ibx2root) = @$jd{qw(ekeys roots ibx2root)};
@@ -366,7 +371,7 @@ sub repos_sorted {
my @recs = map { [ 0, $_ ] } @_; # PublicInbox::Git objects
my @todo = @recs;
$pi_cfg->each_cindex(\&load_commit_times, \@todo);
- @recs = sort { $b->[0] <=> $a->[0] } @recs;
+ @recs = sort { $b->[0] <=> $a->[0] } @recs; # sort by commit time
}
1;
* [PATCH v2] codesearch: use retry_reopen for WWW
From: Eric Wong @ 2023-11-30 21:40 UTC (permalink / raw)
To: meta
As with mail search, a cindex may be updated while WWW is
serving requests. Thus we must reopen the Xapian DB when
the revision we're using becomes stale.
---
v2: avoid reintroducing load_ct as noted in
https://public-inbox.org/meta/20231130213641.M35664@dcvr/
lib/PublicInbox/CodeSearch.pm | 16 ++++++----------
1 file changed, 6 insertions(+), 10 deletions(-)
diff --git a/lib/PublicInbox/CodeSearch.pm b/lib/PublicInbox/CodeSearch.pm
index f4694686..3092718d 100644
--- a/lib/PublicInbox/CodeSearch.pm
+++ b/lib/PublicInbox/CodeSearch.pm
@@ -258,15 +258,11 @@ sub load_ct { # retry_reopen cb
sub load_commit_times { # each_cindex callback
my ($self, $todo) = @_; # todo = [ [ time, git ], [ time, git ] ...]
- my (@pending, $rec, $dir, @ids, $doc);
+ my (@pending, $rec, $ct);
while ($rec = shift @$todo) {
- @ids = docids_of_git_dir $self, $rec->[1]->{git_dir};
- if (@ids) {
- for (@ids) {
- $doc = $self->get_doc($_) // next;
- $rec->[0] = int_val($doc, CT);
- last;
- }
+ $ct = $self->retry_reopen(\&load_ct, $rec->[1]->{git_dir});
+ if (defined $ct) {
+ $rec->[0] = $ct;
} else { # may be in another cindex:
push @pending, $rec;
}
@@ -295,7 +291,7 @@ EOM
$git;
};
}
- my $jd = join_data($self) or return warn <<EOM;
+ my $jd = $self->retry_reopen(\&join_data, $self) or return warn <<EOM;
W: cindex.$name.topdir=$self->{topdir} has no usable join data for $cfg_f
EOM
my ($ekeys, $roots, $ibx2root) = @$jd{qw(ekeys roots ibx2root)};
@@ -366,7 +362,7 @@ sub repos_sorted {
my @recs = map { [ 0, $_ ] } @_; # PublicInbox::Git objects
my @todo = @recs;
$pi_cfg->each_cindex(\&load_commit_times, \@todo);
- @recs = sort { $b->[0] <=> $a->[0] } @recs;
+ @recs = sort { $b->[0] <=> $a->[0] } @recs; # sort by commit time
}
1;