From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.6 (2021-04-09) on dcvr.yhbt.net X-Spam-Level: X-Spam-ASN: X-Spam-Status: No, score=-4.2 required=3.0 tests=ALL_TRUSTED,BAYES_00, DKIM_SIGNED,DKIM_VALID,DKIM_VALID_AU,DKIM_VALID_EF shortcircuit=no autolearn=ham autolearn_force=no version=3.4.6 Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id 553EE1F47A for ; Fri, 7 Apr 2023 12:40:54 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=80x24.org; s=selector1; t=1680871254; bh=XUzy587FF52xTeWeQOPi7PFQmOUDRgUDla4qDY1lDXg=; h=From:To:Subject:Date:In-Reply-To:References:From; b=GLIiFUZZ3SoYQlXTXXQfbD3Zpn1xV0HB6ek1//dDrvhAd9+/SqY5sRZ5oj7WXyAt4 GWOP7nh7eUZpzP03ir/HvsW6+w8YKcZ8bHaHyCihzwp/HuUhnYZCHdgUrfKiPexGu1 Y5Mig/sc5nLUkcnqKPO8KtijFWkR2y3mjj21V1t8= From: Eric Wong To: meta@public-inbox.org Subject: [PATCH 2/6] cindex: preserve indexlevel across invocations Date: Fri, 7 Apr 2023 12:40:49 +0000 Message-Id: <20230407124053.2233988-3-e@80x24.org> In-Reply-To: <20230407124053.2233988-1-e@80x24.org> References: <20230407124053.2233988-1-e@80x24.org> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit List-Id: This matches the behavior of mail indexers and ensures `medium' indices don't grow unexpectedly to become `full' indices. --- lib/PublicInbox/CodeSearchIdx.pm | 15 +++++++++-- lib/PublicInbox/SearchIdx.pm | 2 +- t/cindex.t | 45 ++++++++++++++++++++++++++++++++ 3 files changed, 59 insertions(+), 3 deletions(-) diff --git a/lib/PublicInbox/CodeSearchIdx.pm b/lib/PublicInbox/CodeSearchIdx.pm index 5f20325a..3a3fc03e 100644 --- a/lib/PublicInbox/CodeSearchIdx.pm +++ b/lib/PublicInbox/CodeSearchIdx.pm @@ -85,7 +85,6 @@ sub new { xpfx => "$dir/cidx".
PublicInbox::CodeSearch::CIDX_SCHEMA_VER, cidx_dir => $dir, creat => 1, # TODO: get rid of this, should be implicit - indexlevel => $l, transact_bytes => 0, # for checkpoint total_bytes => 0, # for lock_release current_info => '', @@ -617,16 +616,28 @@ sub cidx_init ($) { } $self->lock_acquire; my @shards; + my $l = $self->{indexlevel} //= $self->{-opt}->{indexlevel}; + for my $n (0..($self->{nshard} - 1)) { my $shard = bless { %$self, shard => $n }, ref($self); delete @$shard{qw(lockfh lock_path)}; - $shard->idx_acquire; + my $xdb = $shard->idx_acquire; + if (!$n) { + if (($l // '') eq 'medium') { + $xdb->set_metadata('indexlevel', $l); + } elsif (($l // '') eq 'full') { + $xdb->set_metadata('indexlevel', ''); # unset + } + $l ||= $xdb->get_metadata('indexlevel') || 'full'; + } + $shard->{indexlevel} = $l; $shard->idx_release; $shard->wq_workers_start("cidx shard[$n]", 1, $SIGSET, { siblings => \@shards, # for ipc_atfork_child }, \&shard_done_wait, $self); push @shards, $shard; } + $self->{indexlevel} //= $l; # this warning needs to happen after idx_acquire state $once; warn <{indexlevel} =~ $xapianlevels } +sub need_xapian ($) { ($_[0]->{indexlevel} // 'full') =~ $xapianlevels } sub idx_release { my ($self, $wake) = @_; diff --git a/t/cindex.t b/t/cindex.t index 9da0ba69..d40f73ff 100644 --- a/t/cindex.t +++ b/t/cindex.t @@ -4,11 +4,13 @@ use v5.12; use PublicInbox::TestCommon; use Cwd qw(getcwd abs_path); +use List::Util qw(sum); require_mods(qw(json Search::Xapian)); use_ok 'PublicInbox::CodeSearchIdx'; require PublicInbox::Import; my ($tmp, $for_destroy) = tmpdir(); my $pwd = getcwd(); +my @unused_keys = qw(last_commit has_threadid skip_docdata); # I reworked CodeSearchIdx->shard_worker to handle empty trees # in the initial commit generated by cvs2svn for xapian.git @@ -71,7 +73,48 @@ ok(run_script([qw(-cindex --dangerous -q -d), "$tmp/ext", $zp, "$tmp/wt0"]), ok(-e "$tmp/ext/cidx.lock", 'external dir created'); ok(!-d "$zp/.git/public-inbox-cindex", 'no 
cindex in original coderepo'); +ok(run_script([qw(-cindex -L medium --dangerous -q -d), + "$tmp/med", $zp, "$tmp/wt0"]), 'cindex external medium'); + +my $no_metadata_set = sub { + my ($i, $extra, $xdb) = @_; + for my $xdb (@$xdb) { + for my $k (@unused_keys, @$extra) { + is($xdb->get_metadata($k) // '', '', + "metadata $k unset in shard #$i"); + } + ++$i; + } +}; + +{ + my $mid_size = sum(map { -s $_ } glob("$tmp/med/cidx*/*/*")); + my $full_size = sum(map { -s $_ } glob("$tmp/ext/cidx*/*/*")); + ok($full_size > $mid_size, 'full size > mid size') or + diag "full=$full_size mid=$mid_size"; + for my $l (qw(med ext)) { + ok(run_script([qw(-cindex -q --reindex -u -d), "$tmp/$l"]), + "reindex $l"); + } + $mid_size = sum(map { -s $_ } glob("$tmp/med/cidx*/*/*")); + $full_size = sum(map { -s $_ } glob("$tmp/ext/cidx*/*/*")); + ok($full_size > $mid_size, 'full size > mid size after reindex') or + diag "full=$full_size mid=$mid_size"; + my $csrch = PublicInbox::CodeSearch->new("$tmp/med"); + my ($xdb0, @xdb) = $csrch->xdb_shards_flat; + $no_metadata_set->(0, [], [ $xdb0 ]); + is($xdb0->get_metadata('indexlevel'), 'medium', + 'indexlevel set in shard #0'); + $no_metadata_set->(1, ['indexlevel'], \@xdb); + + ok(run_script([qw(-cindex -q -L full --reindex -u -d), "$tmp/med"]), + 'reindex medium as full'); + @xdb = $csrch->xdb_shards_flat; + $no_metadata_set->(0, ['indexlevel'], \@xdb); +} + use_ok 'PublicInbox::CodeSearch'; + if ('multi-repo search') { my $csrch = PublicInbox::CodeSearch->new("$tmp/ext"); my $mset = $csrch->mset('NUL'); @@ -86,6 +129,8 @@ if ('multi-repo search') { $mset = $csrch->mset('NUL', { git_dir => abs_path("$zp/.git") }); @have = sort(map { $_->get_document->get_data } $mset->items); is_xdeeply(\@have, $exp, 'got expected subjects w/ GIT_DIR filter'); + my @xdb = $csrch->xdb_shards_flat; + $no_metadata_set->(0, ['indexlevel'], \@xdb); } if ('--update') {