From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.6 (2021-04-09) on dcvr.yhbt.net X-Spam-Level: X-Spam-ASN: X-Spam-Status: No, score=-4.2 required=3.0 tests=ALL_TRUSTED,BAYES_00, DKIM_SIGNED,DKIM_VALID,DKIM_VALID_AU,DKIM_VALID_EF shortcircuit=no autolearn=ham autolearn_force=no version=3.4.6 Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id 090411F532 for ; Tue, 21 Mar 2023 23:07:47 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=80x24.org; s=selector1; t=1679440067; bh=iysL6jiXbitZJUkwlp+RNxIncvmAwwZt8mpsq2qn9BI=; h=From:To:Subject:Date:In-Reply-To:References:From; b=sUqZhUgDAqq6ER01EV7dQM+fb+Dc7JztXUyud04m61Mu7b6ud/3twZbS1cLkwF3vd o+Bb7al4bhZgm3pf8iauGVki+D/VA/Wnlc28Y0XCDg4xrveWDyhp5O8Ka51/9Z+A+j RmE0iimqRhWP4hvQ3f14UOvZSPOk0SoV3kL5EZZk= From: Eric Wong To: meta@public-inbox.org Subject: [PATCH 17/28] cindex: implement --max-size=SIZE Date: Tue, 21 Mar 2023 23:07:32 +0000 Message-Id: <20230321230743.3020032-17-e@80x24.org> In-Reply-To: <20230321230743.3020032-1-e@80x24.org> References: <20230321230701.3019936-1-e@80x24.org> <20230321230743.3020032-1-e@80x24.org> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit List-Id: This matches existing behavior of -index and -extindex, and will hopefully allow me to avoid OOM problems by skipping problematic commits. --- lib/PublicInbox/CodeSearchIdx.pm | 6 ++++++ script/public-inbox-cindex | 4 +++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/lib/PublicInbox/CodeSearchIdx.pm b/lib/PublicInbox/CodeSearchIdx.pm index fcd28671..b185731d 100644 --- a/lib/PublicInbox/CodeSearchIdx.pm +++ b/lib/PublicInbox/CodeSearchIdx.pm @@ -161,6 +161,7 @@ sub shard_index { # via wq_io_do my $op_p = delete($self->{1}) // die 'BUG: no {1} op_p'; my $batch_bytes = $self->{-opt}->{batch_size} // $PublicInbox::SearchIdx::BATCH_BYTES; + my $max_size = $self->{-opt}->{max_size}; # local-ized in parent before fork $TXN_BYTES = $batch_bytes; local $self->{git} = $git; # for patchid @@ -177,6 +178,11 @@ sub shard_index { # via wq_io_do $self->begin_txn_lazy; while (defined($buf = <$rd>)) { chomp($buf); + if ($max_size && length($buf) >= $max_size) { + my ($H, undef) = split(/\n/, $buf, 2); + warn "W: skipping $H (", length($buf)," >= $max_size)\n"; + next; + } $TXN_BYTES -= length($buf); @$cmt{@FMT} = split(/\n/, $buf, scalar(@FMT)); $/ = "\n"; diff --git a/script/public-inbox-cindex b/script/public-inbox-cindex index 420ef4de..e2500b93 100755 --- a/script/public-inbox-cindex +++ b/script/public-inbox-cindex @@ -16,6 +16,7 @@ usage: public-inbox-cindex [options] --project-list=FILE PROJECT_ROOT --update | -u update previously-indexed code repos with `-d' --jobs=NUM set or disable parallelization (NUM=0) --batch-size=BYTES flush changes to OS after a given number of bytes + --max-size=BYTES do not index commit diffs larger than the given size --prune prune old repos and commits --reindex reindex previously indexed repos --verbose | -v increase verbosity (may be repeated) @@ -25,7 +26,8 @@ See public-inbox-cindex(1) man page for full documentation. EOF my $opt = { fsync => 1, scan => 1 }; # --no-scan is hidden GetOptions($opt, qw(quiet|q verbose|v+ reindex jobs|j=i fsync|sync! dangerous - indexlevel|index-level|L=s batch_size|batch-size=s + indexlevel|index-level|L=s + batch_size|batch-size=s max_size|max-size=s project-list=s exclude=s@ d=s update|u scan! prune dry-run|n C=s@ help|h)) or die $help;