* [PATCH] search: force large mbox result downloads to POST
2017-12-07 20:30 [PATCH] searchview: nofollow on mbox downloads Eric Wong
@ 2017-12-08 21:01 ` Eric Wong
0 siblings, 0 replies; 2+ messages in thread
From: Eric Wong @ 2017-12-08 21:01 UTC (permalink / raw)
To: meta
This should prevent crawlers (including most robots.txt-ignoring
ones) from burning our CPU time without severely compromising
usability for humans.
---
Using nofollow doesn't help with some bots, or with results which
have already ended up in their crawling lists.
lib/PublicInbox/SearchView.pm | 19 ++++++++++++-------
lib/PublicInbox/WWW.pm | 22 ++++++++++++++++++----
2 files changed, 30 insertions(+), 11 deletions(-)
diff --git a/lib/PublicInbox/SearchView.pm b/lib/PublicInbox/SearchView.pm
index 8e0c3cf..13e9c17 100644
--- a/lib/PublicInbox/SearchView.pm
+++ b/lib/PublicInbox/SearchView.pm
@@ -18,12 +18,19 @@ our $LIM = 200;
sub noop {}
+sub mbox_results {
+ my ($ctx) = @_;
+ my $q = PublicInbox::SearchQuery->new($ctx->{qp});
+ my $x = $q->{x};
+ return PublicInbox::Mbox::mbox_all($ctx, $q->{'q'}) if $x eq 'm';
+ sres_top_html($ctx);
+}
+
sub sres_top_html {
my ($ctx) = @_;
my $q = PublicInbox::SearchQuery->new($ctx->{qp});
my $x = $q->{x};
my $query = $q->{'q'};
- return PublicInbox::Mbox::mbox_all($ctx, $query) if $x eq 'm';
my $code = 200;
# double the limit for expanded views:
@@ -60,7 +67,7 @@ retry:
} else {
return adump($_[0], $mset, $q, $ctx) if $x eq 'A';
- $ctx->{-html_tip} = search_nav_top($mset, $q, $ctx) . "\n\n";
+ $ctx->{-html_tip} = search_nav_top($mset, $q, $ctx);
if ($x eq 't') {
$cb = mset_thread($ctx, $mset, $q);
} else {
@@ -131,8 +138,8 @@ sub err_txt {
sub search_nav_top {
my ($mset, $q, $ctx) = @_;
-
- my $rv = '<pre>';
+ my $m = $q->qs_html(x => 'm', r => undef);
+ my $rv = qq{<form\naction="?$m"\nmethod="post"><pre>};
my $initial_q = $ctx->{-uxs_retried};
if (defined $initial_q) {
my $rewritten = $q->{'q'};
@@ -166,10 +173,8 @@ sub search_nav_top {
}
my $A = $q->qs_html(x => 'A', r => undef);
$rv .= qq{|<a\nhref="?$A">Atom feed</a>]};
- my $m = $q->qs_html(x => 'm', r => undef);
- warn "m: $m\n";
$rv .= qq{\n\t\t\t\t\t\tdownload: };
- $rv .= qq{<a\nhref="?$m"\nrel="nofollow">mbox.gz</a>};
+ $rv .= qq{<input\ntype=submit\nvalue="mbox.gz"/></pre></form><pre>};
}
sub search_nav_bot {
diff --git a/lib/PublicInbox/WWW.pm b/lib/PublicInbox/WWW.pm
index f3c702e..3fd77d4 100644
--- a/lib/PublicInbox/WWW.pm
+++ b/lib/PublicInbox/WWW.pm
@@ -53,10 +53,14 @@ sub call {
my $path_info = $env->{PATH_INFO};
my $method = $env->{REQUEST_METHOD};
- if ($method eq 'POST' &&
- $path_info =~ m!$INBOX_RE/(git-upload-pack)\z!) {
- my $path = $2;
- return invalid_inbox($ctx, $1) || serve_git($ctx, $path);
+ if ($method eq 'POST') {
+ if ($path_info =~ m!$INBOX_RE/(git-upload-pack)\z!) {
+ my $path = $2;
+ return invalid_inbox($ctx, $1) ||
+ serve_git($ctx, $path);
+ } elsif ($path_info =~ m!$INBOX_RE/!o) {
+ return invalid_inbox($ctx, $1) || mbox_results($ctx);
+ }
}
elsif ($method !~ /\AGET|HEAD\z/) {
return r(405, 'Method Not Allowed');
@@ -400,6 +404,16 @@ sub serve_git {
PublicInbox::GitHTTPBackend::serve($ctx->{env}, $ctx->{git}, $path);
}
+sub mbox_results {
+ my ($ctx) = @_;
+ if ($ctx->{env}->{QUERY_STRING} =~ /(?:\A|[&;])q=/) {
+ searcher($ctx) or return need_search($ctx);
+ require PublicInbox::SearchView;
+ return PublicInbox::SearchView::mbox_results($ctx);
+ }
+ r404();
+}
+
sub serve_mbox_range {
my ($ctx, $inbox, $range) = @_;
invalid_inbox($ctx, $inbox) || eval {
--
EW
^ permalink raw reply related [flat|nested] 2+ messages in thread