From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.2 (2018-09-13) on dcvr.yhbt.net X-Spam-Level: X-Spam-Status: No, score=-4.0 required=3.0 tests=ALL_TRUSTED,BAYES_00 shortcircuit=no autolearn=ham autolearn_force=no version=3.4.2 Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id DEC671F9FC for ; Wed, 10 Feb 2021 07:07:49 +0000 (UTC) From: Eric Wong To: meta@public-inbox.org Subject: [PATCH 1/6] lei *external: glob improvements, ls-external filtering Date: Wed, 10 Feb 2021 07:07:44 +0000 Message-Id: <20210210070749.30391-2-e@80x24.org> In-Reply-To: <20210210070749.30391-1-e@80x24.org> References: <20210210070749.30391-1-e@80x24.org> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit List-Id: The "ls-external" now accepts the same glob patterns used by with lei q --{include,only,exclude}. If no glob is detected, it will be treated as a literal substring match (like "grep -F"). Inverting matches is also supported ("grep -v"). --- lib/PublicInbox/LEI.pm | 4 +- lib/PublicInbox/LeiExternal.pm | 74 ++++++++++++++++++++++++---------- t/lei_external.t | 20 +++++++-- 3 files changed, 71 insertions(+), 27 deletions(-) diff --git a/lib/PublicInbox/LEI.pm b/lib/PublicInbox/LEI.pm index dd831c54..eb5a646e 100644 --- a/lib/PublicInbox/LEI.pm +++ b/lib/PublicInbox/LEI.pm @@ -124,8 +124,8 @@ our %CMD = ( # sorted in order of importance/use: qw(boost=i c=s@ mirror=s no-torsocks torsocks=s inbox-version=i), qw(quiet|q verbose|v+), index_opt(), PublicInbox::LeiQuery::curl_opt() ], -'ls-external' => [ '[FILTER...]', 'list publicinbox|extindex locations', - qw(format|f=s z|0 local remote quiet|q) ], +'ls-external' => [ '[FILTER]', 'list publicinbox|extindex locations', + qw(format|f=s z|0 globoff|g invert-match|v local remote) ], 'forget-external' => [ 'LOCATION...|--prune', 'exclude further results from a publicinbox|extindex', qw(prune quiet|q) ], diff --git a/lib/PublicInbox/LeiExternal.pm b/lib/PublicInbox/LeiExternal.pm index b65dc87c..bac15226 100644 --- a/lib/PublicInbox/LeiExternal.pm +++ b/lib/PublicInbox/LeiExternal.pm @@ -22,22 +22,16 @@ sub externals_each { # highest boost first, but stable for alphabetic tie break use sort 'stable'; my @order = sort { $boost{$b} <=> $boost{$a} } sort keys %boost; - return @order if !$cb; - for my $loc (@order) { - $cb->(@arg, $loc, $boost{$loc}); + if (ref($cb) eq 'CODE') { + for my $loc (@order) { + $cb->(@arg, $loc, $boost{$loc}); + } + } elsif (ref($cb) eq 'HASH') { + %$cb = %boost; } @order; # scalar or array } -sub lei_ls_external { - my ($self, @argv) = @_; - my ($OFS, $ORS) = $self->{opt}->{z} ? ("\0", "\0\0") : (" ", "\n"); - externals_each($self, sub { - my ($loc, $boost_val) = @_; - $self->out($loc, $OFS, 'boost=', $boost_val, $ORS); - }); -} - sub ext_canonicalize { my ($location) = @_; if ($location !~ m!\Ahttps?://!) { @@ -52,28 +46,47 @@ sub ext_canonicalize { } } -my %patmap = ('*' => '[^/]*?', '?' => '[^/]', '[' => '[', ']' => ']'); -sub glob2pat { - my ($glob) = @_; - $glob =~ s!(.)!$patmap{$1} || "\Q$1"!ge; - $glob; +my %re_map = ( '*' => '[^/]*?', '?' => '[^/]', + '[' => '[', ']' => ']', ',' => ',' ); + +sub glob2re { + my ($re) = @_; + my $p = ''; + my $in_bracket = 0; + my $qm = 0; + my $changes = ($re =~ s!(.)! + $re_map{$p eq '\\' ? '' : do { + if ($1 eq '[') { ++$in_bracket } + elsif ($1 eq ']') { --$in_bracket } + $p = $1; + }} // do { + $p = $1; + ($p eq '-' && $in_bracket) ? $p : (++$qm, "\Q$p") + }!sge); + # bashism (also supported by curl): {a,b,c} => (a|b|c) + $re =~ s/([^\\]*)\\\{([^,]*?,[^\\]*?)\\\}/ + (my $in_braces = $2) =~ tr!,!|!; + $1."($in_braces)"; + /sge; + ($changes - $qm) ? $re : undef; } +# get canonicalized externals list matching $loc +# $is_exclude denotes it's for --exclude +# otherwise it's for --only/--include is assumed sub get_externals { - my ($self, $loc, $exclude) = @_; + my ($self, $loc, $is_exclude) = @_; return (ext_canonicalize($loc)) if -e $loc; - my @m; my @cur = externals_each($self); my $do_glob = !$self->{opt}->{globoff}; # glob by default - if ($do_glob && ($loc =~ /[\*\?]/s || $loc =~ /\[.*\]/s)) { - my $re = glob2pat($loc); + if ($do_glob && (my $re = glob2re($loc))) { @m = grep(m!$re!, @cur); return @m if scalar(@m); } elsif (index($loc, '/') < 0) { # exact basename match: @m = grep(m!/\Q$loc\E/?\z!, @cur); return @m if scalar(@m) == 1; - } elsif ($exclude) { # URL, maybe: + } elsif ($is_exclude) { # URL, maybe: my $canon = ext_canonicalize($loc); @m = grep(m!\A\Q$canon\E\z!, @cur); return @m if scalar(@m) == 1; @@ -88,6 +101,23 @@ sub get_externals { (); } +sub lei_ls_external { + my ($self, $filter) = @_; + my $do_glob = !$self->{opt}->{globoff}; # glob by default + my ($OFS, $ORS) = $self->{opt}->{z} ? ("\0", "\0\0") : (" ", "\n"); + $filter //= '*'; + my $re = $do_glob ? glob2re($filter) : undef; + $re //= index($filter, '/') < 0 ? + qr!/\Q$filter\E/?\z! : # exact basename match + qr/\Q$filter\E/; # grep -F semantics + my @ext = externals_each($self, my $boost = {}); + @ext = $self->{opt}->{'invert-match'} ? grep(!/$re/, @ext) + : grep(/$re/, @ext); + for my $loc (@ext) { + $self->out($loc, $OFS, 'boost=', $boost->{$loc}, $ORS); + } +} + sub add_external_finish { my ($self, $location) = @_; my $cfg = $self->_lei_cfg(1); diff --git a/t/lei_external.t b/t/lei_external.t index 587990db..0ef6633d 100644 --- a/t/lei_external.t +++ b/t/lei_external.t @@ -1,7 +1,8 @@ #!perl -w -use strict; -use v5.10.1; -use Test::More; +# Copyright (C) 2020-2021 all contributors +# License: AGPL-3.0+ +# internal unit test, see t/lei-externals.t for functional tests +use strict; use v5.10.1; use Test::More; my $cls = 'PublicInbox::LeiExternal'; require_ok $cls; my $canon = $cls->can('ext_canonicalize'); @@ -15,4 +16,17 @@ is($canon->('/this/path/is/nonexistent/'), '/this/path/is/nonexistent', is($canon->('/this//path/'), '/this/path', 'extra slashes gone'); is($canon->('/ALL/CAPS'), '/ALL/CAPS', 'caps preserved'); +my $glob2re = $cls->can('glob2re'); +is($glob2re->('foo'), undef, 'plain string unchanged'); +is_deeply($glob2re->('[f-o]'), '[f-o]' , 'range accepted'); +is_deeply($glob2re->('*'), '[^/]*?' , 'wildcard accepted'); +is_deeply($glob2re->('{a,b,c}'), '(a|b|c)' , 'braces'); +is_deeply($glob2re->('{,b,c}'), '(|b|c)' , 'brace with empty @ start'); +is_deeply($glob2re->('{a,b,}'), '(a|b|)' , 'brace with empty @ end'); +is_deeply($glob2re->('{a}'), undef, 'ungrouped brace'); +is_deeply($glob2re->('{a'), undef, 'open left brace'); +is_deeply($glob2re->('a}'), undef, 'open right brace'); +is_deeply($glob2re->('*.[ch]'), '[^/]*?\\.[ch]', 'suffix glob'); +is_deeply($glob2re->('{[a-z],9,}'), '([a-z]|9|)' , 'brace with range'); + done_testing;