From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.6 (2021-04-09) on dcvr.yhbt.net X-Spam-Level: X-Spam-ASN: X-Spam-Status: No, score=-4.2 required=3.0 tests=ALL_TRUSTED,BAYES_00, DKIM_SIGNED,DKIM_VALID,DKIM_VALID_AU,DKIM_VALID_EF shortcircuit=no autolearn=ham autolearn_force=no version=3.4.6 Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id 6F87C1F536 for ; Tue, 21 Mar 2023 23:07:47 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=80x24.org; s=selector1; t=1679440067; bh=5QoBK8aA+K8r9PTyshrojXgtwe/CVCquI/1ki2xaDbk=; h=From:To:Subject:Date:In-Reply-To:References:From; b=S9G/KfHSrSlh+TwuZrjxAN1QqGqz+Ge2WExsU2kD+y7E5sgI/HNdemxtDo6192ANT 6z8PeLsg4Dwas/UkIwUhenuP2JiLncfk+CHkzVQd/xLBfaQ/hFzDs6FJh8Bs40r5MI xU2f2NIkfJttW6Wdg+llvSgElDqSux1J+qZsrzcE= From: Eric Wong To: meta@public-inbox.org Subject: [PATCH 19/28] cindex: truncate or drop body for over-sized commits Date: Tue, 21 Mar 2023 23:07:34 +0000 Message-Id: <20230321230743.3020032-19-e@80x24.org> In-Reply-To: <20230321230743.3020032-1-e@80x24.org> References: <20230321230701.3019936-1-e@80x24.org> <20230321230743.3020032-1-e@80x24.org> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit List-Id: We need to get at least the commit OID indexed to avoid redundant work. 
--- lib/PublicInbox/CodeSearchIdx.pm | 53 +++++++++++++++++++++++++------- 1 file changed, 42 insertions(+), 11 deletions(-) diff --git a/lib/PublicInbox/CodeSearchIdx.pm b/lib/PublicInbox/CodeSearchIdx.pm index 829fe28e..176422d0 100644 --- a/lib/PublicInbox/CodeSearchIdx.pm +++ b/lib/PublicInbox/CodeSearchIdx.pm @@ -41,7 +41,8 @@ our ( $TXN_BYTES, # number of bytes in current shard transaction $DO_QUIT, # signal number @RDONLY_SHARDS, # Xapian::Database - @IDX_SHARDS # clones of self + @IDX_SHARDS, # clones of self + $MAX_SIZE, ); # stop walking history if we see >$SEEN_MAX existing commits, this assumes @@ -159,17 +160,45 @@ sub cidx_ckpoint ($$) { $self->{xdb}->begin_transaction; } +sub truncate_cmt ($$) { + my ($cmt) = @_; # _[1] is $buf (giant) + my ($orig_len, $len); + $len = $orig_len = length($_[1]); + @$cmt{@FMT} = split(/\n/, $_[1], scalar(@FMT)); + undef $_[1]; + $len -= length($cmt->{b}); + + # try to keep the commit message body. + # n.b. this diffstat split may be unreliable but it's not worth + # perfection for giant commits: + my ($bdy) = split(/^---\n/sm, delete($cmt->{b}), 2); + if (($len + length($bdy)) <= $MAX_SIZE) { + $len += length($bdy); + $cmt->{b} = $bdy; + warn <<EOM; +W: $cmt->{H}: truncated body ($orig_len => $len bytes) +W: to be under --max-size=$MAX_SIZE +EOM + } else { + $cmt->{b} = ''; + warn <<EOM; +W: $cmt->{H}: deleted body ($orig_len => $len bytes) +W: to be under --max-size=$MAX_SIZE +EOM + } + $len; +} + # sharded reader for `git log --pretty=format: --stdin' sub shard_index { # via wq_io_do my ($self, $git, $n, $roots) = @_; local $self->{current_info} = "$git->{git_dir} [$n]"; - my $cmt; local $self->{roots} = $roots; my $in = delete($self->{0}) // die 'BUG: no {0} input'; my $op_p = delete($self->{1}) // die 'BUG: no {1} op_p'; my $batch_bytes = $self->{-opt}->{batch_size} // $PublicInbox::SearchIdx::BATCH_BYTES; - my $max_size = $self->{-opt}->{max_size}; + local $MAX_SIZE = $self->{-opt}->{max_size}; # local-ized in parent before fork $TXN_BYTES = 
$batch_bytes; local $self->{git} = $git; # for patchid @@ -180,24 +209,26 @@ sub shard_index { # via wq_io_do # a patch may have \0, see c4201214cbf10636e2c1ab9131573f735b42c8d4 # in linux.git, so we use $/ = "\n\0" to check end-of-patch my $FS = "\n\0"; + my $len; + my $cmt = {}; local $/ = $FS; my $buf = <$rd> // return; # leading $FS $buf eq $FS or die "BUG: not LF-NUL: $buf\n"; $self->begin_txn_lazy; while (defined($buf = <$rd>)) { chomp($buf); - if ($max_size && length($buf) >= $max_size) { - my ($H, undef) = split(/\n/, $buf, 2); - warn "W: skipping $H (", length($buf)," >= $max_size)\n"; - next; + $/ = "\n"; + $len = length($buf); + if (defined($MAX_SIZE) && $len > $MAX_SIZE) { + $len = truncate_cmt($cmt, $buf); + } else { + @$cmt{@FMT} = split(/\n/, $buf, scalar(@FMT)); } - $TXN_BYTES -= length($buf); + $TXN_BYTES -= $len; if ($TXN_BYTES <= 0) { cidx_ckpoint($self, "[$n] $nr"); - $TXN_BYTES = $batch_bytes - length($buf); + $TXN_BYTES = $batch_bytes - $len; } - @$cmt{@FMT} = split(/\n/, $buf, scalar(@FMT)); - $/ = "\n"; add_commit($self, $cmt); last if $DO_QUIT; ++$nr;