From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.6 (2021-04-09) on dcvr.yhbt.net X-Spam-Level: X-Spam-ASN: X-Spam-Status: No, score=-4.2 required=3.0 tests=ALL_TRUSTED,BAYES_00, DKIM_SIGNED,DKIM_VALID,DKIM_VALID_AU,DKIM_VALID_EF shortcircuit=no autolearn=ham autolearn_force=no version=3.4.6 Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id D64191F53C for ; Tue, 21 Mar 2023 23:07:48 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=80x24.org; s=selector1; t=1679440068; bh=jgghjWWcfs3Y4PftfNpeEn7WoERMfFwawA9obx7TVF4=; h=From:To:Subject:Date:In-Reply-To:References:From; b=V52R/ZuOY2Xn2CL1tZZYTbvYjFs/KBxw46Pyt3Sey2JgWaeSIxk0CixpXfndneGuc 9EUVgf/4AQ0e/u36F9Yf2XtrsCsre8jgzigE85E/b8aOOetkI3mLjosIs74QiseY8h dM/WWGqgXkuktMVTKRRhb5xe21JGF0+gxoaxuNYE= From: Eric Wong To: meta@public-inbox.org Subject: [PATCH 25/28] cindex: implement reindex Date: Tue, 21 Mar 2023 23:07:40 +0000 Message-Id: <20230321230743.3020032-25-e@80x24.org> In-Reply-To: <20230321230743.3020032-1-e@80x24.org> References: <20230321230701.3019936-1-e@80x24.org> <20230321230743.3020032-1-e@80x24.org> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit List-Id: This allows changing --indexlevel at the moment and will allow us to fix yet-to-be-discovered bugs or make backwards-compatible improvements in the future. 
--- lib/PublicInbox/CodeSearchIdx.pm | 33 ++++++++++++++++++++++---------- t/cindex.t | 4 ++++ 2 files changed, 27 insertions(+), 10 deletions(-) diff --git a/lib/PublicInbox/CodeSearchIdx.pm b/lib/PublicInbox/CodeSearchIdx.pm index 095c153e..5e6c0d22 100644 --- a/lib/PublicInbox/CodeSearchIdx.pm +++ b/lib/PublicInbox/CodeSearchIdx.pm @@ -43,7 +43,8 @@ our ( @RDONLY_SHARDS, # Xapian::Database @IDX_SHARDS, # clones of self $MAX_SIZE, - $TMP_GIT, # PublicInbox::Git object for --reindex and --prune + $TMP_GIT, # PublicInbox::Git object for --prune + $REINDEX, # PublicInbox::SharedKV ); # stop walking history if we see >$SEEN_MAX existing commits, this assumes @@ -89,12 +90,13 @@ sub new { # TODO: may be used for reshard/compact sub count_shards { scalar($_[0]->xdb_shards_flat) } -sub add_commit ($$) { +sub update_commit ($$) { my ($self, $cmt) = @_; # fields from @FMT my $x = 'Q'.$cmt->{H}; - for (docids_by_postlist($self, $x)) { - $self->{xdb}->delete_document($_) - } + my ($docid, @extra) = sort { $a <=> $b } docids_by_postlist($self, $x); + @extra and warn "W: $cmt->{H} indexed multiple times, pruning ", + join(', ', map { "#$_" } @extra), "\n"; + $self->{xdb}->delete_document($_) for @extra; my $doc = $PublicInbox::Search::X{Document}->new; $doc->add_boolean_term($x); $doc->add_boolean_term('G'.$_) for @{$self->{roots}}; @@ -119,7 +121,8 @@ sub add_commit ($$) { $x = delete $cmt->{b}; $self->index_body_text($doc, \$x) if $x =~ /\S/s; - $self->{xdb}->add_document($doc); + defined($docid) ? 
$self->{xdb}->replace_document($docid, $doc) : + $self->{xdb}->add_document($doc); } sub progress { @@ -235,7 +238,7 @@ sub shard_index { # via wq_io_do cidx_ckpoint($self, "[$n] $nr"); $TXN_BYTES = $batch_bytes - $len; } - add_commit($self, $cmt); + update_commit($self, $cmt); ++$nr; if ($TXN_BYTES <= 0) { cidx_ckpoint($self, "[$n] $nr"); @@ -398,7 +401,7 @@ sub check_existing { # retry_reopen callback my $docid = shift(@docids) // return get_roots($self, $git); my $doc = $shard->{xdb}->get_document($docid) // die "BUG: no #$docid ($git->{git_dir})"; - my $old_fp = $doc->get_data; + my $old_fp = $REINDEX ? "\0invalid" : $doc->get_data; if ($old_fp eq $git->{-repo}->{fp}) { # no change delete $git->{-repo}; return; @@ -426,7 +429,10 @@ sub partition_refs ($$$) { while (defined(my $cmt = <$rfh>)) { chomp $cmt; my $n = hex(substr($cmt, 0, 8)) % scalar(@RDONLY_SHARDS); - if (seen($RDONLY_SHARDS[$n], 'Q'.$cmt)) { + if ($REINDEX && $REINDEX->set_maybe(pack('H*', $cmt), '')) { + say { $shard_in[$n] } $cmt or die "say: $!"; + ++$nchange; + } elsif (seen($RDONLY_SHARDS[$n], 'Q'.$cmt)) { last if ++$seen > $SEEN_MAX; } else { say { $shard_in[$n] } $cmt or die "say: $!"; @@ -687,7 +693,7 @@ sub parent_quit { sub init_tmp_git_dir ($) { my ($self) = @_; - return unless ($self->{-opt}->{prune} || $self->{-opt}->{reindex}); + return unless $self->{-opt}->{prune}; require File::Temp; require PublicInbox::Import; my $tmp = File::Temp->newdir('cidx-all-git-XXXX', TMPDIR => 1); @@ -729,6 +735,13 @@ sub cidx_run { # main entry point $cb->($m, @_); }; load_existing($self); + local $REINDEX; + if ($self->{-opt}->{reindex}) { + require PublicInbox::SharedKV; + $REINDEX = PublicInbox::SharedKV->new; + delete $REINDEX->{lock_path}; + $REINDEX->dbh; + } my @nc = grep { File::Spec->canonpath($_) ne $_ } @{$self->{git_dirs}}; if (@nc) { warn "E: BUG? 
paths in $self->{cidx_dir} not canonicalized:\n"; diff --git a/t/cindex.t b/t/cindex.t index 5d269217..eb66b2e6 100644 --- a/t/cindex.t +++ b/t/cindex.t @@ -93,6 +93,10 @@ EOM ok(run_script([qw(-cindex -qu -d), "$tmp/ext"]), '-cindex -u'); $mset = $csrch->reopen->mset('dfn:for-update'); is(scalar($mset->items), 1, 'got updated result'); + + ok(run_script([qw(-cindex -qu --reindex -d), "$tmp/ext"]), 'reindex'); + $mset = $csrch->reopen->mset('dfn:for-update'); + is(scalar($mset->items), 1, 'same result after reindex'); } if ('--prune') {