From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.2 (2018-09-13) on dcvr.yhbt.net X-Spam-Level: X-Spam-ASN: X-Spam-Status: No, score=-4.0 required=3.0 tests=ALL_TRUSTED,BAYES_00 shortcircuit=no autolearn=ham autolearn_force=no version=3.4.2 Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id E9F771F8C8; Wed, 22 Sep 2021 09:45:17 +0000 (UTC) Date: Wed, 22 Sep 2021 09:45:17 +0000 From: Eric Wong To: Konstantin Ryabitsev Cc: meta@public-inbox.org Subject: [PATCH] gcf2 + extsearch: check for unlinked files on Linux Message-ID: <20210922094517.GA8771@dcvr> References: <20210921144754.gulkneuulzo27qbw@meerkat.local> <20210921190653.GA16367@dcvr> <20210921205756.2kgyln6kwwbncani@meerkat.local> <20210921213755.GA25671@dcvr> MIME-Version: 1.0 Content-Type: text/plain; charset=utf-8 Content-Disposition: inline In-Reply-To: <20210921213755.GA25671@dcvr> List-Id: Eric Wong wrote: > Konstantin Ryabitsev wrote: > > Well, it may also not be something that's the responsibility of public-inbox > > either, e.g. other long-running daemons don't perform such checks. We can just > > issue a reload after we've done repacking. > > The less operational knowledge required, the better; especially > since I'd like to see more self-hosted instances pop up. I've lightly tested the patch below; but I'm questioning the need for Gcf2 if /all/ is configured (more on that later, need nap) -----------8<--------- Subject: [PATCH] gcf2 + extsearch: check for unlinked files on Linux Check for unlinked mmap-ed files via /proc/$PID/maps every 60s or so. ExtSearch (extindex) is compatible-enough with Inbox objects to be wired into the old per-inbox code, but the startup cost is projected to be much higher down the line when there's >30K inboxes, so we scan /proc/$PID/maps for deleted files before unlinking. With old Inbox objects, it was (and is) simpler to just kill processes w/o checking due to the low startup cost (and non-portability of checking). Reported-by: Konstantin Ryabitsev Link: https://public-inbox.org/meta/20210921144754.gulkneuulzo27qbw@meerkat.local/ --- lib/PublicInbox/ExtSearch.pm | 10 ++++++++-- lib/PublicInbox/Gcf2.pm | 26 +++++++++++++++++++++++--- lib/PublicInbox/Git.pm | 16 +++++++++++++++- lib/PublicInbox/Inbox.pm | 11 +++++++++-- 4 files changed, 55 insertions(+), 8 deletions(-) diff --git a/lib/PublicInbox/ExtSearch.pm b/lib/PublicInbox/ExtSearch.pm index bd301158..cd7cba2e 100644 --- a/lib/PublicInbox/ExtSearch.pm +++ b/lib/PublicInbox/ExtSearch.pm @@ -33,12 +33,18 @@ sub misc { # same as per-inbox ->over, for now... sub over { my ($self) = @_; - $self->{over} //= PublicInbox::Over->new("$self->{xpfx}/over.sqlite3"); + $self->{over} //= do { + PublicInbox::Inbox::_cleanup_later($self); + PublicInbox::Over->new("$self->{xpfx}/over.sqlite3"); + }; } sub git { my ($self) = @_; - $self->{git} //= PublicInbox::Git->new("$self->{topdir}/ALL.git"); + $self->{git} //= do { + PublicInbox::Inbox::_cleanup_later($self); + PublicInbox::Git->new("$self->{topdir}/ALL.git"); + }; } # returns a hashref of { $NEWSGROUP_NAME => $ART_NO } using the `xref3' table diff --git a/lib/PublicInbox/Gcf2.pm b/lib/PublicInbox/Gcf2.pm index 64945ca6..f546208f 100644 --- a/lib/PublicInbox/Gcf2.pm +++ b/lib/PublicInbox/Gcf2.pm @@ -8,6 +8,7 @@ use strict; use v5.10.1; use PublicInbox::Spawn qw(which popen_rd); # may set PERL_INLINE_DIRECTORY use Fcntl qw(LOCK_EX SEEK_SET); +use Time::HiRes qw(clock_gettime CLOCK_MONOTONIC); use IO::Handle; # autoflush BEGIN { my (%CFG, $c_src); @@ -96,11 +97,21 @@ sub add_alt ($$) { 1; } -# Usage: $^X -MPublicInbox::Gcf2 -e PublicInbox::Gcf2::loop +sub have_unlinked_files () { + # FIXME: port gcf2-like over to git.git so we won't need to + # deal with libgit2 + return 1 if $^O ne 'linux'; + open my $fh, '<', "/proc/$$/maps" or return; + while (<$fh>) { return 1 if /\.(?:idx|pack) \(deleted\)$/ } + undef; +} + +# Usage: $^X -MPublicInbox::Gcf2 -e PublicInbox::Gcf2::loop [EXPIRE-TIMEOUT] # (see lib/PublicInbox/Gcf2Client.pm) -sub loop () { +sub loop (;$) { + my $exp = $_[0] || $ARGV[0] || 60; # seconds my $gcf2 = new(); - my %seen; + my (%seen, $check_at); STDERR->autoflush(1); STDOUT->autoflush(1); @@ -116,6 +127,7 @@ sub loop () { $gcf2 = new(); %seen = ($git_dir => add_alt($gcf2,"$git_dir/objects")); + $check_at = clock_gettime(CLOCK_MONOTONIC) + $exp; if ($gcf2->cat_oid(1, $oid)) { warn "I: $$ $oid found after retry\n"; @@ -123,6 +135,14 @@ sub loop () { warn "W: $$ $oid missing after retry\n"; print "$oid missing\n"; # mimic git-cat-file } + } else { # check expiry to deal with deleted pack files + my $now = clock_gettime(CLOCK_MONOTONIC); + $check_at //= $now + $exp; + if ($now > $check_at && have_unlinked_files()) { + undef $check_at; + $gcf2 = new(); + %seen = (); + } } } } diff --git a/lib/PublicInbox/Git.pm b/lib/PublicInbox/Git.pm index 5ef1db2f..cf51239f 100644 --- a/lib/PublicInbox/Git.pm +++ b/lib/PublicInbox/Git.pm @@ -12,7 +12,7 @@ use v5.10.1; use parent qw(Exporter); use POSIX (); use IO::Handle; # ->autoflush -use Errno qw(EINTR EAGAIN); +use Errno qw(EINTR EAGAIN ENOENT); use File::Glob qw(bsd_glob GLOB_NOSORT); use File::Spec (); use Time::HiRes qw(stat); @@ -523,6 +523,20 @@ sub manifest_entry { $ent; } +sub cleanup_if_unlinked { + my ($self) = @_; + return cleanup($self) if $^O ne 'linux'; + # Linux-specific /proc/$PID/maps access + # TODO: support this inside git.git + for my $fld (qw(pid pid_c)) { + my $pid = $self->{$fld} // next; + open my $fh, '<', "/proc/$pid/maps" or next; + while (<$fh>) { + return cleanup($self) if /\.(?:idx|pack) \(deleted\)$/; + } + } +} + 1; __END__ =pod diff --git a/lib/PublicInbox/Inbox.pm b/lib/PublicInbox/Inbox.pm index 20f8c884..02ffc7be 100644 --- a/lib/PublicInbox/Inbox.pm +++ b/lib/PublicInbox/Inbox.pm @@ -22,8 +22,15 @@ my $CLEANUP = {}; # string(inbox) -> inbox sub git_cleanup ($) { my ($self) = @_; - my $git = $self->{git} or return; - $git->cleanup; + if ($self->isa(__PACKAGE__)) { + # normal Inbox; low startup cost, and likely to have many + # many so this keeps process/pipe counts in check + $self->{git}->cleanup if $self->{git}; + } else { + # ExtSearch, high startup cost if ->ALL, and probably + # only one per-daemon, so teardown only if required: + $self->git->cleanup_if_unlinked; + } } sub cleanup_task () {