unofficial mirror of meta@public-inbox.org
 help / color / mirror / Atom feed
From: Eric Wong <e@80x24.org>
To: meta@public-inbox.org
Subject: [PATCH 1/6] lei *external: glob improvements, ls-external filtering
Date: Wed, 10 Feb 2021 07:07:44 +0000	[thread overview]
Message-ID: <20210210070749.30391-2-e@80x24.org> (raw)
In-Reply-To: <20210210070749.30391-1-e@80x24.org>

The "ls-external" now accepts the same glob patterns used by
with lei q --{include,only,exclude}.  If no glob is detected, it
will be treated as a literal substring match (like "grep -F").

Inverting matches is also supported ("grep -v").
---
 lib/PublicInbox/LEI.pm         |  4 +-
 lib/PublicInbox/LeiExternal.pm | 74 ++++++++++++++++++++++++----------
 t/lei_external.t               | 20 +++++++--
 3 files changed, 71 insertions(+), 27 deletions(-)

diff --git a/lib/PublicInbox/LEI.pm b/lib/PublicInbox/LEI.pm
index dd831c54..eb5a646e 100644
--- a/lib/PublicInbox/LEI.pm
+++ b/lib/PublicInbox/LEI.pm
@@ -124,8 +124,8 @@ our %CMD = ( # sorted in order of importance/use:
 	qw(boost=i c=s@ mirror=s no-torsocks torsocks=s inbox-version=i),
 	qw(quiet|q verbose|v+),
 	index_opt(), PublicInbox::LeiQuery::curl_opt() ],
-'ls-external' => [ '[FILTER...]', 'list publicinbox|extindex locations',
-	qw(format|f=s z|0 local remote quiet|q) ],
+'ls-external' => [ '[FILTER]', 'list publicinbox|extindex locations',
+	qw(format|f=s z|0 globoff|g invert-match|v local remote) ],
 'forget-external' => [ 'LOCATION...|--prune',
 	'exclude further results from a publicinbox|extindex',
 	qw(prune quiet|q) ],
diff --git a/lib/PublicInbox/LeiExternal.pm b/lib/PublicInbox/LeiExternal.pm
index b65dc87c..bac15226 100644
--- a/lib/PublicInbox/LeiExternal.pm
+++ b/lib/PublicInbox/LeiExternal.pm
@@ -22,22 +22,16 @@ sub externals_each {
 	# highest boost first, but stable for alphabetic tie break
 	use sort 'stable';
 	my @order = sort { $boost{$b} <=> $boost{$a} } sort keys %boost;
-	return @order if !$cb;
-	for my $loc (@order) {
-		$cb->(@arg, $loc, $boost{$loc});
+	if (ref($cb) eq 'CODE') {
+		for my $loc (@order) {
+			$cb->(@arg, $loc, $boost{$loc});
+		}
+	} elsif (ref($cb) eq 'HASH') {
+		%$cb = %boost;
 	}
 	@order; # scalar or array
 }
 
-sub lei_ls_external {
-	my ($self, @argv) = @_;
-	my ($OFS, $ORS) = $self->{opt}->{z} ? ("\0", "\0\0") : (" ", "\n");
-	externals_each($self, sub {
-		my ($loc, $boost_val) = @_;
-		$self->out($loc, $OFS, 'boost=', $boost_val, $ORS);
-	});
-}
-
 sub ext_canonicalize {
 	my ($location) = @_;
 	if ($location !~ m!\Ahttps?://!) {
@@ -52,28 +46,47 @@ sub ext_canonicalize {
 	}
 }
 
-my %patmap = ('*' => '[^/]*?', '?' => '[^/]', '[' => '[', ']' => ']');
-sub glob2pat {
-	my ($glob) = @_;
-        $glob =~ s!(.)!$patmap{$1} || "\Q$1"!ge;
-        $glob;
+my %re_map = ( '*' => '[^/]*?', '?' => '[^/]',
+		'[' => '[', ']' => ']', ',' => ',' );
+
+sub glob2re {
+	my ($re) = @_;
+	my $p = '';
+	my $in_bracket = 0;
+	my $qm = 0;
+	my $changes = ($re =~ s!(.)!
+		$re_map{$p eq '\\' ? '' : do {
+			if ($1 eq '[') { ++$in_bracket }
+			elsif ($1 eq ']') { --$in_bracket }
+			$p = $1;
+		}} // do {
+			$p = $1;
+			($p eq '-' && $in_bracket) ? $p : (++$qm, "\Q$p")
+		}!sge);
+	# bashism (also supported by curl): {a,b,c} => (a|b|c)
+	$re =~ s/([^\\]*)\\\{([^,]*?,[^\\]*?)\\\}/
+		(my $in_braces = $2) =~ tr!,!|!;
+		$1."($in_braces)";
+		/sge;
+	($changes - $qm) ? $re : undef;
 }
 
+# get canonicalized externals list matching $loc
+# $is_exclude denotes it's for --exclude
+# otherwise it's for --only/--include is assumed
 sub get_externals {
-	my ($self, $loc, $exclude) = @_;
+	my ($self, $loc, $is_exclude) = @_;
 	return (ext_canonicalize($loc)) if -e $loc;
-
 	my @m;
 	my @cur = externals_each($self);
 	my $do_glob = !$self->{opt}->{globoff}; # glob by default
-	if ($do_glob && ($loc =~ /[\*\?]/s || $loc =~ /\[.*\]/s)) {
-		my $re = glob2pat($loc);
+	if ($do_glob && (my $re = glob2re($loc))) {
 		@m = grep(m!$re!, @cur);
 		return @m if scalar(@m);
 	} elsif (index($loc, '/') < 0) { # exact basename match:
 		@m = grep(m!/\Q$loc\E/?\z!, @cur);
 		return @m if scalar(@m) == 1;
-	} elsif ($exclude) { # URL, maybe:
+	} elsif ($is_exclude) { # URL, maybe:
 		my $canon = ext_canonicalize($loc);
 		@m = grep(m!\A\Q$canon\E\z!, @cur);
 		return @m if scalar(@m) == 1;
@@ -88,6 +101,23 @@ sub get_externals {
 	();
 }
 
+sub lei_ls_external {
+	my ($self, $filter) = @_;
+	my $do_glob = !$self->{opt}->{globoff}; # glob by default
+	my ($OFS, $ORS) = $self->{opt}->{z} ? ("\0", "\0\0") : (" ", "\n");
+	$filter //= '*';
+	my $re = $do_glob ? glob2re($filter) : undef;
+	$re //= index($filter, '/') < 0 ?
+			qr!/\Q$filter\E/?\z! : # exact basename match
+			qr/\Q$filter\E/; # grep -F semantics
+	my @ext = externals_each($self, my $boost = {});
+	@ext = $self->{opt}->{'invert-match'} ? grep(!/$re/, @ext)
+					: grep(/$re/, @ext);
+	for my $loc (@ext) {
+		$self->out($loc, $OFS, 'boost=', $boost->{$loc}, $ORS);
+	}
+}
+
 sub add_external_finish {
 	my ($self, $location) = @_;
 	my $cfg = $self->_lei_cfg(1);
diff --git a/t/lei_external.t b/t/lei_external.t
index 587990db..0ef6633d 100644
--- a/t/lei_external.t
+++ b/t/lei_external.t
@@ -1,7 +1,8 @@
 #!perl -w
-use strict;
-use v5.10.1;
-use Test::More;
+# Copyright (C) 2020-2021 all contributors <meta@public-inbox.org>
+# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
+# internal unit test, see t/lei-externals.t for functional tests
+use strict; use v5.10.1; use Test::More;
 my $cls = 'PublicInbox::LeiExternal';
 require_ok $cls;
 my $canon = $cls->can('ext_canonicalize');
@@ -15,4 +16,17 @@ is($canon->('/this/path/is/nonexistent/'), '/this/path/is/nonexistent',
 is($canon->('/this//path/'), '/this/path', 'extra slashes gone');
 is($canon->('/ALL/CAPS'), '/ALL/CAPS', 'caps preserved');
 
+my $glob2re = $cls->can('glob2re');
+is($glob2re->('foo'), undef, 'plain string unchanged');
+is_deeply($glob2re->('[f-o]'), '[f-o]' , 'range accepted');
+is_deeply($glob2re->('*'), '[^/]*?' , 'wildcard accepted');
+is_deeply($glob2re->('{a,b,c}'), '(a|b|c)' , 'braces');
+is_deeply($glob2re->('{,b,c}'), '(|b|c)' , 'brace with empty @ start');
+is_deeply($glob2re->('{a,b,}'), '(a|b|)' , 'brace with empty @ end');
+is_deeply($glob2re->('{a}'), undef, 'ungrouped brace');
+is_deeply($glob2re->('{a'), undef, 'open left brace');
+is_deeply($glob2re->('a}'), undef, 'open right brace');
+is_deeply($glob2re->('*.[ch]'), '[^/]*?\\.[ch]', 'suffix glob');
+is_deeply($glob2re->('{[a-z],9,}'), '([a-z]|9|)' , 'brace with range');
+
 done_testing;

  reply	other threads:[~2021-02-10  7:07 UTC|newest]

Thread overview: 8+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2021-02-10  7:07 [PATCH 0/6] more lei stuffs Eric Wong
2021-02-10  7:07 ` Eric Wong [this message]
2021-02-10  7:07 ` [PATCH 2/6] lei_external: remove unnecessary Exporter use Eric Wong
2021-02-10  7:07 ` [PATCH 3/6] test_common: support lei-daemon only testing Eric Wong
2021-02-10  7:07 ` [PATCH 4/6] lei ls-external: support --local and --remote Eric Wong
2021-02-10  7:07 ` [PATCH 5/6] lei: note some TODO items (curl, externals) Eric Wong
2021-02-10  7:07 ` [PATCH 6/6] net_reader: new package split from -watch Eric Wong
2021-02-10  8:38 ` [PATCH 7/6] lei_external: fix+test handling of escaped braces Eric Wong

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

  List information: https://public-inbox.org/README

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20210210070749.30391-2-e@80x24.org \
    --to=e@80x24.org \
    --cc=meta@public-inbox.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).