From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.2 (2018-09-13) on dcvr.yhbt.net X-Spam-Level: X-Spam-ASN: X-Spam-Status: No, score=-4.0 required=3.0 tests=ALL_TRUSTED,BAYES_00 shortcircuit=no autolearn=ham autolearn_force=no version=3.4.2 Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id AF3ED1FC0C for ; Tue, 21 Sep 2021 07:41:59 +0000 (UTC) From: Eric Wong To: meta@public-inbox.org Subject: [PATCH 07/12] lei lcat: support NNTP URLs Date: Tue, 21 Sep 2021 07:41:54 +0000 Message-Id: <20210921074159.20052-8-e@80x24.org> In-Reply-To: <20210921074159.20052-1-e@80x24.org> References: <20210921074159.20052-1-e@80x24.org> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit List-Id: NNTP URLs are probably more prevalent in public message archives than IMAP URLs. --- lib/PublicInbox/LeiLcat.pm | 66 +++++++++++++++++++++------------- lib/PublicInbox/LeiMailSync.pm | 14 +++++--- t/lei-import-nntp.t | 23 ++++++++++++ 3 files changed, 74 insertions(+), 29 deletions(-) diff --git a/lib/PublicInbox/LeiLcat.pm b/lib/PublicInbox/LeiLcat.pm index 1a4a988e..0902c213 100644 --- a/lib/PublicInbox/LeiLcat.pm +++ b/lib/PublicInbox/LeiLcat.pm @@ -11,47 +11,64 @@ use PublicInbox::LeiViewText; use URI::Escape qw(uri_unescape); use PublicInbox::MID qw($MID_EXTRACT); -sub lcat_folder ($$$) { - my ($lei, $lms, $folder) = @_; - $lms //= $lei->lms or return; - my $folders = [ $folder]; +sub lcat_folder ($$;$$) { + my ($lei, $folder, $beg, $end) = @_; + my $lms = $lei->{-lms_ro} //= $lei->lms // return; + my $folders = [ $folder ]; eval { $lms->arg2folder($lei, $folders) }; - if ($@) { - $lei->child_error(0, "# unknown folder: $folder"); - } else { - for my $f (@$folders) { - my $fid = $lms->fid_for($f); - push @{$lei->{lcat_todo}}, { fid => $fid }; - } + return $lei->child_error(0, "# unknown folder: $folder") if $@; + my %range; + if (defined($beg)) { # NNTP article range + $range{min} = $beg; + $range{max} = $end // $beg; + } + for my $f (@$folders) { + my $fid = $lms->fid_for($f); + push @{$lei->{lcat_todo}}, { fid => $fid, %range }; } } sub lcat_imap_uri ($$) { my ($lei, $uri) = @_; - my $lms = $lei->lms or return; - # cf. LeiXsearch->lcat_dump + # cf. LeiXSearch->lcat_dump + my $lms = $lei->{-lms_ro} //= $lei->lms // return; if (defined $uri->uid) { push @{$lei->{lcat_todo}}, $lms->imap_oidhex($lei, $uri); } elsif (defined(my $fid = $lms->fid_for($$uri))) { push @{$lei->{lcat_todo}}, { fid => $fid }; } else { - lcat_folder($lei, $lms, $$uri); + lcat_folder($lei, $$uri); } } +sub lcat_nntp_uri ($$) { + my ($lei, $uri) = @_; + my $mid = $uri->message; # already unescaped by URI::news + return "mid:$mid" if defined($mid); + my $lms = $lei->{-lms_ro} //= $lei->lms // return; + my ($ng, $beg, $end) = $uri->group; + $uri->group($ng); + lcat_folder($lei, $$uri, $beg, $end); + '""'; +} + sub extract_1 ($$) { my ($lei, $x) = @_; - if ($x =~ m!\b(imaps?://[^>]+)!i) { - my $u = $1; - require PublicInbox::URIimap; - lcat_imap_uri($lei, PublicInbox::URIimap->new($u)); - '""'; # blank query, using {lcat_todo} - } elsif ($x =~ m!\b(maildir:.+)!i) { - lcat_folder($lei, undef, $1); + if ($x =~ m!\b(maildir:.+)!i) { + lcat_folder($lei, $1); '""'; # blank query, using {lcat_todo} - } elsif ($x =~ m!\b([a-z]+?://\S+)!i) { - my $u = $1; + } elsif ($x =~ m!\b(([a-z]+)://\S+)!i) { + my ($u, $scheme) = ($1, $2); $u =~ s/[\>\]\)\,\.\;]+\z//; + if ($scheme =~ m!\A(imaps?)\z!i) { + require PublicInbox::URIimap; + lcat_imap_uri($lei, PublicInbox::URIimap->new($u)); + return '""'; # blank query, using {lcat_todo} + } elsif ($scheme =~ m!\A(?:nntps?|s?news)\z!i) { + require PublicInbox::URInntps; + $u = PublicInbox::URInntps->new($u); + return lcat_nntp_uri($lei, $u); + } # http, or something else: require URI; $u = URI->new($u); my $p = $u->path; @@ -93,7 +110,7 @@ sub extract_all { my $strict = !$lei->{opt}->{stdin}; my @q; for my $x (@argv) { - if (my $term = extract_1($lei,$x)) { + if (my $term = extract_1($lei, $x)) { push @q, $term; } elsif ($strict) { return $lei->fail(<<""); @@ -101,6 +118,7 @@ could not extract Message-ID from $x } } + delete $lei->{-lms_ro}; @q ? join(' OR ', @q) : $lei->fail("no Message-ID in: @argv"); } diff --git a/lib/PublicInbox/LeiMailSync.pm b/lib/PublicInbox/LeiMailSync.pm index f83c7de2..522a5ebc 100644 --- a/lib/PublicInbox/LeiMailSync.pm +++ b/lib/PublicInbox/LeiMailSync.pm @@ -197,9 +197,12 @@ INSERT OR IGNORE INTO blob2name (oidbin, fid, name) VALUES (?, ?, ?) sub each_src { my ($self, $folder, $cb, @args) = @_; my $dbh = $self->{dbh} //= dbh_new($self); - my $fid; + my ($fid, @rng); + my $and_ge_le = ''; if (ref($folder) eq 'HASH') { $fid = $folder->{fid} // die "BUG: no `fid'"; + @rng = grep(defined, @$folder{qw(min max)}); + $and_ge_le = 'AND uid >= ? AND uid <= ?' if @rng; } else { $fid = $self->{fmap}->{$folder} //= fid_for($self, $folder) // return; @@ -208,16 +211,17 @@ sub each_src { # minimize implicit txn time to avoid blocking writers by # batching SELECTs. This looks wonky but is necessary since # $cb-> may access the DB on its own. - my $ary = $dbh->selectall_arrayref(<<'', undef, $fid); -SELECT _rowid_,oidbin,uid FROM blob2num WHERE fid = ? + my $ary = $dbh->selectall_arrayref(<<"", undef, $fid, @rng); +SELECT _rowid_,oidbin,uid FROM blob2num WHERE fid = ? $and_ge_le ORDER BY _rowid_ ASC LIMIT 1000 my $min = @$ary ? $ary->[-1]->[0] : undef; while (defined $min) { for my $row (@$ary) { $cb->($row->[1], $row->[2], @args) } - $ary = $dbh->selectall_arrayref(<<'', undef, $fid, $min); -SELECT _rowid_,oidbin,uid FROM blob2num WHERE fid = ? AND _rowid_ > ? + $ary = $dbh->selectall_arrayref(<<"", undef, $fid, @rng, $min); +SELECT _rowid_,oidbin,uid FROM blob2num +WHERE fid = ? $and_ge_le AND _rowid_ > ? ORDER BY _rowid_ ASC LIMIT 1000 $min = @$ary ? $ary->[-1]->[0] : undef; diff --git a/t/lei-import-nntp.t b/t/lei-import-nntp.t index 0b080781..eb1ae312 100644 --- a/t/lei-import-nntp.t +++ b/t/lei-import-nntp.t @@ -25,6 +25,11 @@ test_lei({ tmpdir => $tmpdir }, sub { is(ref(json_utf8->decode($lei_out)), 'ARRAY', 'ls-mail-source JSON'); lei_ok('import', $url); + lei_ok "lcat", "nntp://$host_port/testmessage\@example.com"; + my $local = $lei_out; + lei_ok "lcat", "nntp://example.com/testmessage\@example.com"; + my $remote = $lei_out; + is($local, $remote, 'Message-ID used even from unknown host'); lei_ok(qw(q z:1..)); $out = json_utf8->decode($lei_out); ok(scalar(@$out) > 1, 'got imported messages'); @@ -57,6 +62,11 @@ test_lei({ tmpdir => $tmpdir }, sub { lei_ok('inspect', "$url/$high"); my $x = json_utf8->decode($lei_out); like($x->{$url}->{$high}, qr/\A[a-f0-9]{40,}\z/, 'inspect shows blob'); + lei_ok qw(lcat -f json), "$url/$high"; + my $lcat = json_utf8->decode($lei_out); + is($lcat->[1], undef, 'only one result for lcat'); + is($lcat->[0]->{blob}, $x->{$url}->{$high}, + 'lcat showed correct blob'); lei_ok 'ls-mail-sync'; is($lei_out, "$url\n", 'article number not stored as folder'); @@ -78,6 +88,19 @@ test_lei({ tmpdir => $tmpdir }, sub { is(scalar(grep(/\A[a-f0-9]{40,}\z/, values %{$x->{$url}})), $end - $low + 1, 'all values are git blobs'); + lei_ok qw(lcat -f json), "$url/$low"; + $lcat = json_utf8->decode($lei_out); + is($lcat->[1], undef, 'only one result for lcat'); + is($lcat->[0]->{blob}, $x->{$url}->{$low}, + 'lcat showed correct blob'); + lei_ok qw(lcat -f json), "$url/$low-$end"; + $lcat = json_utf8->decode($lei_out); + pop @$lcat; + for ($low..$end) { + my $tip = shift @$lcat; + is($x->{$url}->{$_}, $tip->{blob}, "blob matches art #$_"); + } + lei_ok 'ls-mail-sync'; is($lei_out, "$url\n", 'article range not stored as folder'); lei_ok qw(q z:0..); my $start = json_utf8->decode($lei_out);