From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.2 (2018-09-13) on dcvr.yhbt.net X-Spam-Level: X-Spam-ASN: X-Spam-Status: No, score=-4.0 required=3.0 tests=ALL_TRUSTED,AWL,BAYES_00, T_SCC_BODY_TEXT_LINE shortcircuit=no autolearn=ham autolearn_force=no version=3.4.2 Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id F23921F670 for ; Mon, 7 Mar 2022 10:57:37 +0000 (UTC) From: Eric Wong To: meta@public-inbox.org Subject: [PATCH] index|extindex: support --dangerous flag Date: Mon, 7 Mar 2022 10:57:37 +0000 Message-Id: <20220307105737.7480-1-e@80x24.org> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit List-Id: This enables Xapian::DB_DANGEROUS to support in-place updates. This can speed up the initial index and reduce I/O at the cost of preventing concurrent readers and being unsafe in the face of any abnormal terminations. This is more dangerous than --no-fsync. --no-fsync is only unsafe in the event of a power loss or kernel crash; --dangerous is unsafe even on SIGKILL. --- Documentation/public-inbox-extindex.pod | 14 ++++++++++++-- Documentation/public-inbox-index.pod | 9 ++++++++- lib/PublicInbox/ExtSearchIdx.pm | 3 ++- lib/PublicInbox/MiscIdx.pm | 1 + lib/PublicInbox/SearchIdx.pm | 9 ++++++++- script/public-inbox-extindex | 4 ++-- script/public-inbox-index | 3 ++- t/extsearch.t | 5 +++-- t/indexlevels-mirror.t | 4 ++-- 9 files changed, 40 insertions(+), 12 deletions(-) diff --git a/Documentation/public-inbox-extindex.pod b/Documentation/public-inbox-extindex.pod index 9731dfb0..f71a90e5 100644 --- a/Documentation/public-inbox-extindex.pod +++ b/Documentation/public-inbox-extindex.pod @@ -24,7 +24,17 @@ along with L and L Perl modules. =item --jobs=JOBS -... 
TODO, see L +=item --no-fsync + +=item --dangerous + +=item --rethread + +=item --max-size SIZE + +=item --batch-size SIZE + +These switches behave as they do for L =item --all @@ -119,7 +129,7 @@ L =head1 COPYRIGHT -Copyright 2021 all contributors L +Copyright all contributors L License: AGPL-3.0+ L diff --git a/Documentation/public-inbox-index.pod b/Documentation/public-inbox-index.pod index c92b6de4..011ade3c 100644 --- a/Documentation/public-inbox-index.pod +++ b/Documentation/public-inbox-index.pod @@ -150,6 +150,13 @@ data to accumulate, resulting on latency spikes from writeback. Available in public-inbox 1.6.0+. +=item --dangerous + +Speed up initial index by using in-place updates and denying support for +concurrent readers. This is only effective with Xapian 1.4+. + +Available in public-inbox 1.8.0+. + =item --sequential-shard Sets or overrides L on a @@ -324,7 +331,7 @@ L =head1 COPYRIGHT -Copyright 2016-2021 all contributors L +Copyright all contributors L License: AGPL-3.0+ L diff --git a/lib/PublicInbox/ExtSearchIdx.pm b/lib/PublicInbox/ExtSearchIdx.pm index 4b46fa16..7c44a1a4 100644 --- a/lib/PublicInbox/ExtSearchIdx.pm +++ b/lib/PublicInbox/ExtSearchIdx.pm @@ -1,4 +1,4 @@ -# Copyright (C) 2020-2021 all contributors +# Copyright (C) all contributors # License: AGPL-3.0+ # Detached/external index cross inbox search indexing support @@ -59,6 +59,7 @@ sub new { nproc_shards({ nproc => $opt->{jobs} }); my $oidx = PublicInbox::OverIdx->new("$self->{xpfx}/over.sqlite3"); $self->{-no_fsync} = $oidx->{-no_fsync} = 1 if !$opt->{fsync}; + $self->{-dangerous} = 1 if $opt->{dangerous}; $self->{oidx} = $oidx; $self } diff --git a/lib/PublicInbox/MiscIdx.pm b/lib/PublicInbox/MiscIdx.pm index dc15442d..5faf5c66 100644 --- a/lib/PublicInbox/MiscIdx.pm +++ b/lib/PublicInbox/MiscIdx.pm @@ -31,6 +31,7 @@ sub new { PublicInbox::Syscall::nodatacow_dir($mi_dir); my $flags = $PublicInbox::SearchIdx::DB_CREATE_OR_OPEN; $flags |= $PublicInbox::SearchIdx::DB_NO_SYNC if 
$eidx->{-no_fsync}; + $flags |= $PublicInbox::SearchIdx::DB_DANGEROUS if $eidx->{-dangerous}; $json //= PublicInbox::Config::json(); bless { mi_dir => $mi_dir, diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm index 95b14c3a..85fae4ad 100644 --- a/lib/PublicInbox/SearchIdx.pm +++ b/lib/PublicInbox/SearchIdx.pm @@ -30,6 +30,7 @@ our @EXPORT_OK = qw(log2stack is_ancestor check_size prepare_stack my $X = \%PublicInbox::Search::X; our ($DB_CREATE_OR_OPEN, $DB_OPEN); our $DB_NO_SYNC = 0; +our $DB_DANGEROUS = 0; our $BATCH_BYTES = $ENV{XAPIAN_FLUSH_THRESHOLD} ? 0x7fffffff : # assume a typical 64-bit system has 8x more RAM than a # typical 32-bit system: @@ -115,7 +116,10 @@ sub load_xapian_writable () { my $ver = (eval($xap.'::major_version()') << 16) | (eval($xap.'::minor_version()') << 8) | eval($xap.'::revision()'); - $DB_NO_SYNC = 0x4 if $ver >= 0x10400; + if ($ver >= 0x10400) { + $DB_NO_SYNC = 0x4; + $DB_DANGEROUS = 0x10; + } # Xapian v1.2.21..v1.2.24 were missing close-on-exec on OFD locks $X->{CLOEXEC_UNSET} = 1 if $ver >= 0x010215 && $ver <= 0x010218; 1; @@ -142,6 +146,9 @@ sub idx_acquire { require PublicInbox::Syscall; PublicInbox::Syscall::nodatacow_dir($dir); $self->{-set_has_threadid_once} = 1; + if (($self->{ibx} // $self->{eidx})->{-dangerous}) { + $flag |= $DB_DANGEROUS; + } } } return unless defined $flag; diff --git a/script/public-inbox-extindex b/script/public-inbox-extindex index c63f5dc2..bee824b1 100755 --- a/script/public-inbox-extindex +++ b/script/public-inbox-extindex @@ -1,5 +1,5 @@ #!perl -w -# Copyright (C) 2020-2021 all contributors +# Copyright (C) all contributors # License: AGPL-3.0+ use strict; use v5.10.1; @@ -28,7 +28,7 @@ See public-inbox-extindex(1) man page for full documentation. EOF my $opt = { quiet => -1, compact => 0, fsync => 1, scan => 1 }; GetOptions($opt, qw(verbose|v+ reindex rethread compact|c+ jobs|j=i - fsync|sync! fast + fsync|sync! 
fast dangerous indexlevel|index-level|L=s max_size|max-size=s batch_size|batch-size=s dedupe:s@ gc commit-interval=i watch scan! dry-run|n diff --git a/script/public-inbox-index b/script/public-inbox-index index 053d8b94..a04be9fc 100755 --- a/script/public-inbox-index +++ b/script/public-inbox-index @@ -39,7 +39,7 @@ my $opt = { 'update-extindex' => [], # ":s@" optional arg sets '' if no arg given }; GetOptions($opt, qw(verbose|v+ reindex rethread compact|c+ jobs|j=i prune - fsync|sync! xapian_only|xapian-only + fsync|sync! xapian_only|xapian-only dangerous indexlevel|index-level|L=s max_size|max-size=s batch_size|batch-size=s since|after=s until|before=s @@ -126,6 +126,7 @@ for my $ibx (@ibxs) { PublicInbox::Xapcmd::run($ibx, 'compact', $opt->{compact_opt}); } $ibx->{-no_fsync} = 1 if !$opt->{fsync}; + $ibx->{-dangerous} = 1 if $opt->{dangerous}; $ibx->{-skip_docdata} //= $opt->{'skip-docdata'}; my $ibx_opt = $opt; diff --git a/t/extsearch.t b/t/extsearch.t index dfc190e2..09cbdabe 100644 --- a/t/extsearch.t +++ b/t/extsearch.t @@ -1,5 +1,5 @@ #!perl -w -# Copyright (C) 2020-2021 all contributors +# Copyright (C) all contributors # License: AGPL-3.0+ use strict; use Test::More; @@ -54,7 +54,8 @@ run_script(['-mda', '--no-precheck'], $env, { 0 => $fh }) or BAIL_OUT '-mda'; run_script([qw(-index -Lbasic), "$home/v1test"]) or BAIL_OUT "index $?"; -ok(run_script([qw(-extindex --all), "$home/extindex"]), 'extindex init'); +ok(run_script([qw(-extindex --dangerous --all), "$home/extindex"]), + 'extindex init'); { my $es = PublicInbox::ExtSearch->new("$home/extindex"); ok($es->has_threadid, '->has_threadid'); diff --git a/t/indexlevels-mirror.t b/t/indexlevels-mirror.t index e606e79b..ac85643d 100644 --- a/t/indexlevels-mirror.t +++ b/t/indexlevels-mirror.t @@ -1,5 +1,5 @@ #!perl -w -# Copyright (C) 2019-2021 all contributors +# Copyright (C) all contributors # License: AGPL-3.0+ use strict; use v5.10.1; @@ -34,7 +34,7 @@ my $import_index_incremental = sub { local 
$ENV{PI_CONFIG} = "$tmpdir/config"; # index master (required for v1) - my @cmd = (qw(-index -j0), $ibx->{inboxdir}, "-L$level"); + my @cmd = (qw(-index -j0 --dangerous), $ibx->{inboxdir}, "-L$level"); push @cmd, '-c' if have_xapian_compact; ok(run_script(\@cmd, undef, { 2 => \$err }), 'index master'); my $ro_master = PublicInbox::Inbox->new({