* [PATCH 01/11] search: reduce redundant doc data @ 2015-09-01 8:55 Eric Wong 2015-09-01 8:55 ` [PATCH 02/11] search: allow querying all mail with '' Eric Wong ` (9 more replies) 0 siblings, 10 replies; 13+ messages in thread From: Eric Wong @ 2015-09-01 8:55 UTC (permalink / raw) To: meta Redundant document data increases our database size, pull the smsg->mid off the unique term, the smsg->ts off the value, and only generate the formatted display date off smsg->ts. --- lib/PublicInbox/Search.pm | 7 ++++--- lib/PublicInbox/SearchIdx.pm | 2 -- lib/PublicInbox/SearchMsg.pm | 42 ++++++++++++++++++++++-------------------- lib/PublicInbox/View.pm | 15 ++++++++------- 4 files changed, 34 insertions(+), 32 deletions(-) diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm index d3faaeb..b7b215f 100644 --- a/lib/PublicInbox/Search.pm +++ b/lib/PublicInbox/Search.pm @@ -4,8 +4,9 @@ package PublicInbox::Search; use strict; use warnings; -use PublicInbox::SearchMsg; +use constant TS => 0; use Search::Xapian qw/:standard/; +use PublicInbox::SearchMsg; use Email::MIME; use PublicInbox::MID qw/mid_clean mid_compress/; @@ -15,7 +16,6 @@ our $REPLY_RE = qr/^re:\s+/i; our $LANG = 'english'; use constant { - TS => 0, # SCHEMA_VERSION history # 0 - initial # 1 - subject_path is lower-cased @@ -25,7 +25,8 @@ use constant { # 5 - subject_path drops trailing '.' # 6 - preserve References: order in document data # 7 - remove references and inreplyto terms - SCHEMA_VERSION => 7, + # 8 - remove redundant/unneeded document data + SCHEMA_VERSION => 8, QP_FLAGS => FLAG_PHRASE|FLAG_BOOLEAN|FLAG_LOVEHATE|FLAG_WILDCARD, }; diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm index dec3333..32e0714 100644 --- a/lib/PublicInbox/SearchIdx.pm +++ b/lib/PublicInbox/SearchIdx.pm @@ -81,8 +81,6 @@ sub add_message { $doc->add_term(xpfx('path') . 
mid_compress($path)); } - my $from = $smsg->from_name; - my $date = $smsg->date; my $ts = Search::Xapian::sortable_serialise($smsg->ts); $doc->add_value(PublicInbox::Search::TS, $ts); diff --git a/lib/PublicInbox/SearchMsg.pm b/lib/PublicInbox/SearchMsg.pm index 4ad8a0c..1821b07 100644 --- a/lib/PublicInbox/SearchMsg.pm +++ b/lib/PublicInbox/SearchMsg.pm @@ -13,6 +13,7 @@ use PublicInbox::MID qw/mid_clean mid_compress/; use Encode qw/find_encoding/; my $enc_utf8 = find_encoding('UTF-8'); our $PFX2TERM_RE = undef; +use constant EPOCH_822 => 'Thu, 01 Jan 1970 00:00:00 +0000'; sub new { my ($class, $mime) = @_; @@ -30,13 +31,17 @@ sub wrap { sub load_doc { my ($class, $doc) = @_; my $data = $doc->get_data; + my $ts = eval { + no strict 'subs'; + $doc->get_value(PublicInbox::Search::TS); + }; + $ts = Search::Xapian::sortable_unserialise($ts); $data = $enc_utf8->decode($data); - my ($mid, $subj, $from, $date, $refs) = split(/\n/, $data); + my ($subj, $from, $refs) = split(/\n/, $data); bless { doc => $doc, - mid => $mid, subject => $subj, - date => $date, + ts => $ts, from_name => $from, references_sorted => $refs, }, $class; @@ -77,27 +82,13 @@ sub from_name { sub ts { my ($self) = @_; - my $ts = $self->{ts}; - return $ts if $ts; - $self->{ts} = eval { - str2time($self->date || $self->mime->header('Date')) - } || 0; -} - -sub date { - my ($self) = @_; - my $date = $self->{date}; - return $date if $date; - my $ts = eval { str2time($self->mime->header('Date')) }; - $self->{date} = POSIX::strftime('%Y-%m-%d %H:%M', gmtime($ts)); + $self->{ts} ||= eval { str2time($self->mime->header('Date')) } || 0; } sub to_doc_data { my ($self) = @_; - $self->mid . "\n" . PublicInbox::Search::subject_summary($self->subject) . "\n" . $self->from_name . "\n". - $self->date . "\n" . 
$self->references_sorted; } @@ -139,14 +130,23 @@ sub mini_mime { my @h = ( Subject => $self->subject, 'X-PI-From' => $self->from_name, - 'X-PI-Date' => $self->date, 'X-PI-TS' => $self->ts, 'Message-ID' => "<$self->{mid}>", + + # prevent Email::Simple::Creator from running, + # this header is useless for threading as we use X-PI-TS + # for sorting and display: + 'Date' => EPOCH_822, ); my $refs = $self->{references_sorted}; my $mime = Email::MIME->create(header_str => \@h); - $mime->header_set('References', $refs) if (defined $refs); + my $h = $mime->header_obj; + $h->header_set('References', $refs) if (defined $refs); + + # drop useless headers Email::MIME set for us + $h->header_set('Date'); + $h->header_set('MIME-Version'); $mime; } @@ -155,6 +155,8 @@ sub mid { if (defined $mid) { $self->{mid} = $mid; + } elsif (my $rv = $self->{mid}) { + $rv; } else { $self->ensure_metadata; # needed for ghosts $self->{mid} ||= $self->_extract_mid; diff --git a/lib/PublicInbox/View.pm b/lib/PublicInbox/View.pm index 584a2d7..477c4b6 100644 --- a/lib/PublicInbox/View.pm +++ b/lib/PublicInbox/View.pm @@ -86,12 +86,7 @@ sub index_entry { $subj = "<u\nid=\"u\">$subj</u>"; } - my $ts = $mime->header('X-PI-TS'); - unless (defined $ts) { - $ts = msg_timestamp($mime); - } - $ts = POSIX::strftime('%Y-%m-%d %H:%M', gmtime($ts)); - + my $ts = _msg_date($mime); my $rv = "<table\nsummary=l$level><tr>"; if ($level) { $rv .= '<td><pre>' . (' ' x $level) . 
'</pre></td>'; @@ -561,6 +556,12 @@ sub missing_thread { EOF } +sub _msg_date { + my ($mime) = @_; + my $ts = $mime->header('X-PI-TS') || msg_timestamp($mime); + POSIX::strftime('%Y-%m-%d %H:%M', gmtime($ts)); +} + sub _inline_header { my ($dst, $state, $mime, $level) = @_; my $pfx = ' ' x $level; @@ -568,7 +569,7 @@ sub _inline_header { my $cur = $state->{cur}; my $mid = $mime->header('Message-ID'); my $f = $mime->header('X-PI-From'); - my $d = $mime->header('X-PI-Date'); + my $d = _msg_date($mime); $f = PublicInbox::Hval->new($f); $d = PublicInbox::Hval->new($d); $f = $f->as_html; -- EW ^ permalink raw reply related [flat|nested] 13+ messages in thread
* [PATCH 02/11] search: allow querying all mail with '' 2015-09-01 8:55 [PATCH 01/11] search: reduce redundant doc data Eric Wong @ 2015-09-01 8:55 ` Eric Wong 2015-09-01 8:55 ` [PATCH 03/11] search: show newest results first Eric Wong ` (8 subsequent siblings) 9 siblings, 0 replies; 13+ messages in thread From: Eric Wong @ 2015-09-01 8:55 UTC (permalink / raw) To: meta This makes dumping recent topics easier, hopefully. --- lib/PublicInbox/Search.pm | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm index b7b215f..831c4fd 100644 --- a/lib/PublicInbox/Search.pm +++ b/lib/PublicInbox/Search.pm @@ -74,10 +74,14 @@ sub reopen { $_[0]->{xdb}->reopen } # read-only sub query { my ($self, $query_string, $opts) = @_; - my $query = $self->qp->parse_query($query_string, QP_FLAGS); + my $query; $opts ||= {}; - $opts->{relevance} = 1; + unless ($query_string eq '') { + $query = $self->qp->parse_query($query_string, QP_FLAGS); + $opts->{relevance} = 1; + } + $self->do_enquire($query, $opts); } @@ -104,8 +108,11 @@ sub get_thread { sub do_enquire { my ($self, $query, $opts) = @_; my $enquire = $self->enquire; - - $query = Search::Xapian::Query->new(OP_AND, $query, $mail_query); + if (defined $query) { + $query = Search::Xapian::Query->new(OP_AND,$query,$mail_query); + } else { + $query = $mail_query; + } $enquire->set_query($query); if ($opts->{relevance}) { $enquire->set_sort_by_relevance_then_value(TS, 0); -- EW ^ permalink raw reply related [flat|nested] 13+ messages in thread
* [PATCH 03/11] search: show newest results first 2015-09-01 8:55 [PATCH 01/11] search: reduce redundant doc data Eric Wong 2015-09-01 8:55 ` [PATCH 02/11] search: allow querying all mail with '' Eric Wong @ 2015-09-01 8:55 ` Eric Wong 2015-09-01 8:55 ` [PATCH 04/11] feed: use updated date based on git commit date Eric Wong ` (7 subsequent siblings) 9 siblings, 0 replies; 13+ messages in thread From: Eric Wong @ 2015-09-01 8:55 UTC (permalink / raw) To: meta Like revision control history, older stuff is less relevant, so favor newer stuff, first. --- lib/PublicInbox/Search.pm | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm index 831c4fd..8b32ef3 100644 --- a/lib/PublicInbox/Search.pm +++ b/lib/PublicInbox/Search.pm @@ -115,9 +115,9 @@ sub do_enquire { } $enquire->set_query($query); if ($opts->{relevance}) { - $enquire->set_sort_by_relevance_then_value(TS, 0); + $enquire->set_sort_by_relevance_then_value(TS, 1); } else { - $enquire->set_sort_by_value(TS, 0); + $enquire->set_sort_by_value(TS, 1); } $opts ||= {}; my $offset = $opts->{offset} || 0; -- EW ^ permalink raw reply related [flat|nested] 13+ messages in thread
* [PATCH 04/11] feed: use updated date based on git commit date 2015-09-01 8:55 [PATCH 01/11] search: reduce redundant doc data Eric Wong 2015-09-01 8:55 ` [PATCH 02/11] search: allow querying all mail with '' Eric Wong 2015-09-01 8:55 ` [PATCH 03/11] search: show newest results first Eric Wong @ 2015-09-01 8:55 ` Eric Wong 2015-09-01 8:55 ` [PATCH 05/11] feed: extract atom header generation Eric Wong ` (6 subsequent siblings) 9 siblings, 0 replies; 13+ messages in thread From: Eric Wong @ 2015-09-01 8:55 UTC (permalink / raw) To: meta This will hopefully make life easier for feed readers. --- lib/PublicInbox/Feed.pm | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/lib/PublicInbox/Feed.pm b/lib/PublicInbox/Feed.pm index bc76cdc..71042d7 100644 --- a/lib/PublicInbox/Feed.pm +++ b/lib/PublicInbox/Feed.pm @@ -10,6 +10,7 @@ use PublicInbox::Hval; use PublicInbox::GitCatFile; use PublicInbox::View; use PublicInbox::MID qw/mid_clean mid_compress/; +use POSIX qw/strftime/; use constant { DATEFMT => '%Y-%m-%dT%H:%M:%SZ', # atom standard MAX_PER_PAGE => 25, # this needs to be tunable @@ -33,7 +34,6 @@ sub generate_html_index { sub emit_atom { my ($cb, $ctx) = @_; - require POSIX; my $fh = $cb->([ 200, ['Content-Type' => 'application/xml']]); my $max = $ctx->{max} || MAX_PER_PAGE; my $feed_opts = get_feedopts($ctx); @@ -45,18 +45,23 @@ sub emit_atom { my $type = index($title, '&') >= 0 ? "\ntype=\"html\"" : ''; my $url = $feed_opts->{url} || "http://example.com/"; my $atomurl = $feed_opts->{atomurl}; - $fh->write(qq(<?xml version="1.0" encoding="us-ascii"?>\n) . + my $x = qq(<?xml version="1.0" encoding="us-ascii"?>\n) . qq{<feed\nxmlns="http://www.w3.org/2005/Atom">} . qq{<title$type>$title</title>} . qq{<link\nhref="$url"/>} . qq{<link\nrel="self"\nhref="$atomurl"/>} . - qq{<id>mailto:$addr</id>} . - '<updated>' . POSIX::strftime(DATEFMT, gmtime) . 
'</updated>'); + qq{<id>mailto:$addr</id>}; my $git = PublicInbox::GitCatFile->new($ctx->{git_dir}); each_recent_blob($ctx, sub { - my ($add, undef) = @_; - add_to_feed($feed_opts, $fh, $add, $git); + my ($path, undef, $ts) = @_; + if (defined $x) { + $fh->write($x . '<updated>'. + strftime(DATEFMT, gmtime($ts)) . + '</updated>'); + $x = undef; + } + add_to_feed($feed_opts, $fh, $path, $git); }); $git = undef; # destroy pipes Email::Address->purge_cache; @@ -259,7 +264,7 @@ sub feed_date { my ($date) = @_; my @t = eval { strptime($date) }; - scalar(@t) ? POSIX::strftime(DATEFMT, @t) : 0; + scalar(@t) ? strftime(DATEFMT, @t) : 0; } # returns 0 (skipped) or 1 (added) @@ -363,7 +368,7 @@ sub dump_topics { $subj = PublicInbox::Hval->new($subj)->as_html; $u = PublicInbox::Hval->new($u)->as_html; $dst .= "\n<a\nhref=\"t/$mid/#u\"><b>$subj</b></a>\n- "; - $ts = POSIX::strftime('%Y-%m-%d %H:%M', gmtime($ts)); + $ts = strftime('%Y-%m-%d %H:%M', gmtime($ts)); if ($n == 1) { $dst .= "created by $u @ $ts UTC\n" } else { -- EW ^ permalink raw reply related [flat|nested] 13+ messages in thread
* [PATCH 05/11] feed: extract atom header generation 2015-09-01 8:55 [PATCH 01/11] search: reduce redundant doc data Eric Wong ` (2 preceding siblings ...) 2015-09-01 8:55 ` [PATCH 04/11] feed: use updated date based on git commit date Eric Wong @ 2015-09-01 8:55 ` Eric Wong 2015-09-01 8:55 ` [PATCH 06/11] implement per-thread Atom feeds Eric Wong ` (5 subsequent siblings) 9 siblings, 0 replies; 13+ messages in thread From: Eric Wong @ 2015-09-01 8:55 UTC (permalink / raw) To: meta We'll be using it for per-thread subscriptions --- lib/PublicInbox/Feed.pm | 41 +++++++++++++++++++++++------------------ 1 file changed, 23 insertions(+), 18 deletions(-) diff --git a/lib/PublicInbox/Feed.pm b/lib/PublicInbox/Feed.pm index 71042d7..3540e9a 100644 --- a/lib/PublicInbox/Feed.pm +++ b/lib/PublicInbox/Feed.pm @@ -32,26 +32,26 @@ sub generate_html_index { # private subs +sub atom_header { + my ($feed_opts) = @_; + my $title = $feed_opts->{description}; + $title = PublicInbox::Hval->new_oneline($title)->as_html; + my $type = index($title, '&') >= 0 ? "\ntype=\"html\"" : ''; + + qq(<?xml version="1.0" encoding="us-ascii"?>\n) . + qq{<feed\nxmlns="http://www.w3.org/2005/Atom">} . + qq{<title$type>$title</title>} . + qq(<link\nhref="$feed_opts->{url}"/>) . + qq(<link\nrel="self"\nhref="$feed_opts->{atomurl}"/>) . + qq(<id>mailto:$feed_opts->{id_addr}</id>); +} + sub emit_atom { my ($cb, $ctx) = @_; my $fh = $cb->([ 200, ['Content-Type' => 'application/xml']]); my $max = $ctx->{max} || MAX_PER_PAGE; my $feed_opts = get_feedopts($ctx); - my $addr = $feed_opts->{address}; - $addr = $addr->[0] if ref($addr); - $addr ||= 'public-inbox@example.com'; - my $title = $feed_opts->{description} || "unnamed feed"; - $title = PublicInbox::Hval->new_oneline($title)->as_html; - my $type = index($title, '&') >= 0 ? "\ntype=\"html\"" : ''; - my $url = $feed_opts->{url} || "http://example.com/"; - my $atomurl = $feed_opts->{atomurl}; - my $x = qq(<?xml version="1.0" encoding="us-ascii"?>\n) . 
- qq{<feed\nxmlns="http://www.w3.org/2005/Atom">} . - qq{<title$type>$title</title>} . - qq{<link\nhref="$url"/>} . - qq{<link\nrel="self"\nhref="$atomurl"/>} . - qq{<id>mailto:$addr</id>}; - + my $x = atom_header($feed_opts); my $git = PublicInbox::GitCatFile->new($ctx->{git_dir}); each_recent_blob($ctx, sub { my ($path, undef, $ts) = @_; @@ -219,13 +219,18 @@ sub get_feedopts { if (open my $fh, '<', "$ctx->{git_dir}/description") { chomp($rv{description} = <$fh>); close $fh; + } else { + $rv{description} = '($GIT_DIR/description missing)'; } if ($pi_config && defined $listname && $listname ne '') { - foreach my $key (qw(address)) { - $rv{$key} = $pi_config->get($listname, $key) || ""; - } + my $addr = $pi_config->get($listname, 'address') || ""; + $rv{address} = $addr; + $addr = $addr->[0] if ref($addr); + $rv{id_addr} = $addr; } + $rv{id_addr} ||= 'public-inbox@example.com'; + my $url_base; if ($cgi) { my $path_info = $cgi->path_info; -- EW ^ permalink raw reply related [flat|nested] 13+ messages in thread
* [PATCH 06/11] implement per-thread Atom feeds 2015-09-01 8:55 [PATCH 01/11] search: reduce redundant doc data Eric Wong ` (3 preceding siblings ...) 2015-09-01 8:55 ` [PATCH 05/11] feed: extract atom header generation Eric Wong @ 2015-09-01 8:55 ` Eric Wong 2015-09-01 9:30 ` [13/11 PATCH] feed: fix <updated> tag in Atom feed Eric Wong 2015-09-01 8:55 ` [PATCH 07/11] www: compile mbox regexp only once Eric Wong ` (4 subsequent siblings) 9 siblings, 1 reply; 13+ messages in thread From: Eric Wong @ 2015-09-01 8:55 UTC (permalink / raw) To: meta This allows users to subscribe to only a single thread with their feed reader without subscribing to the rest of the list. Update our endpoint notes while we're at it. --- Documentation/design_www.txt | 31 ++++++++------- lib/PublicInbox/Feed.pm | 92 ++++++++++++++++++++++++++++++++------------ lib/PublicInbox/View.pm | 3 +- lib/PublicInbox/WWW.pm | 13 +++++++ t/cgi.t | 12 ++++++ 5 files changed, 112 insertions(+), 39 deletions(-) diff --git a/Documentation/design_www.txt b/Documentation/design_www.txt index 55e9268..d25afca 100644 --- a/Documentation/design_www.txt +++ b/Documentation/design_www.txt @@ -6,25 +6,30 @@ URL naming /$LISTNAME/atom.xml -> Atom feed #### Optional, relies on Search::Xapian -/$LISTNAME/t/$MESSAGE_ID.html -> HTML content of thread +/$LISTNAME/t/$MESSAGE_ID/ -> HTML content of thread +/$LISTNAME/t/$MESSAGE_ID/atom -> Atom feed for thread +/$LISTNAME/t/$MESSAGE_ID/mbox.gz -> gzipped mbox of thread ### Stable endpoints -/$LISTNAME/m/$MESSAGE_ID.html -> HTML content (short quotes) -/$LISTNAME/m/$MESSAGE_ID.txt -> raw mbox -/$LISTNAME/m/$MESSAGE_ID -> 301 to .html version -/$LISTNAME/f/$MESSAGE_ID.html -> HTML content (full quotes) -/$LISTNAME/f/$MESSAGE_ID -> 301 to .html version -/$LISTNAME/f/$MESSAGE_ID.txt -> 301 to ../m/$MESSAGE_ID.txt +/$LISTNAME/m/$MESSAGE_ID/ -> HTML content (short quotes) +/$LISTNAME/m/$MESSAGE_ID -> 301 to above +/$LISTNAME/m/$MESSAGE_ID/raw -> raw mbox 
+/$LISTNAME/f/$MESSAGE_ID/ -> HTML content (full quotes) +/$LISTNAME/f/$MESSAGE_ID -> 301 to above +/$LISTNAME/f/$MESSAGE_ID/raw (*) -> 301 to ../m/$MESSAGE_ID/raw + +### Legacy endpoints (may be ambiguous given Message-IDs with similar suffies) +/$LISTNAME/m/$MESSAGE_ID.html -> 301 to $MESSAGE_ID/ +/$LISTNAME/m/$MESSAGE_ID.txt -> 301 to $MESSAGE_ID/raw +/$LISTNAME/f/$MESSAGE_ID.html -> 301 to $MESSAGE_ID/ +/$LISTNAME/f/$MESSAGE_ID.txt (*) -> 301 to ../m/$MESSAGE_ID/raw + FIXME: we must refactor/cleanup/add tests for most of our CGI before adding more endpoints and features. -Maybe TODO (these might be expensive) -------------------------------------- -/$LISTNAME/t/$MESSAGE_ID.mbox -> mbox content of thread - -We use file name suffixes on all of these (except /) so URLs may easily -cached/memoized using a static file server. +(*) These URLs were never linked, but only exist as a convenience to folks + who edit existing URLs Encoding notes -------------- diff --git a/lib/PublicInbox/Feed.pm b/lib/PublicInbox/Feed.pm index 3540e9a..1fef984 100644 --- a/lib/PublicInbox/Feed.pm +++ b/lib/PublicInbox/Feed.pm @@ -9,7 +9,7 @@ use Date::Parse qw(strptime); use PublicInbox::Hval; use PublicInbox::GitCatFile; use PublicInbox::View; -use PublicInbox::MID qw/mid_clean mid_compress/; +use PublicInbox::MID qw/mid_clean mid_compress mid2path/; use POSIX qw/strftime/; use constant { DATEFMT => '%Y-%m-%dT%H:%M:%SZ', # atom standard @@ -25,6 +25,11 @@ sub generate { sub { emit_atom($_[0], $ctx) }; } +sub generate_thread_atom { + my ($ctx) = @_; + sub { emit_atom_thread($_[0], $ctx) }; +} + sub generate_html_index { my ($ctx) = @_; sub { emit_html_index($_[0], $ctx) }; @@ -32,15 +37,22 @@ sub generate_html_index { # private subs -sub atom_header { - my ($feed_opts) = @_; - my $title = $feed_opts->{description}; +sub title_tag { + my ($title) = @_; + # try to avoid the type attribute in title: $title = PublicInbox::Hval->new_oneline($title)->as_html; my $type = index($title, '&') >= 0 
? "\ntype=\"html\"" : ''; + "<title$type>$title</title>"; +} + +sub atom_header { + my ($feed_opts, $title) = @_; + + $title = title_tag($feed_opts->{description}) unless (defined $title); qq(<?xml version="1.0" encoding="us-ascii"?>\n) . qq{<feed\nxmlns="http://www.w3.org/2005/Atom">} . - qq{<title$type>$title</title>} . + qq{$title} . qq(<link\nhref="$feed_opts->{url}"/>) . qq(<link\nrel="self"\nhref="$feed_opts->{atomurl}"/>) . qq(<id>mailto:$feed_opts->{id_addr}</id>); @@ -56,19 +68,50 @@ sub emit_atom { each_recent_blob($ctx, sub { my ($path, undef, $ts) = @_; if (defined $x) { - $fh->write($x . '<updated>'. - strftime(DATEFMT, gmtime($ts)) . - '</updated>'); + $fh->write($x . '<updated>' . + strftime(DATEFMT, gmtime($ts)) . + '</updated>'); $x = undef; } add_to_feed($feed_opts, $fh, $path, $git); }); $git = undef; # destroy pipes + _end_feed($fh); +} + +sub _no_thread { + my ($cb) = @_; + my $fh = $cb->([404, ['Content-Type' => 'text/plain']]); + $fh->write("No feed found for thread\n"); + $fh->close; +} + +sub _end_feed { + my ($fh) = @_; Email::Address->purge_cache; - $fh->write("</feed>"); + $fh->write('</feed>'); $fh->close; } +sub emit_atom_thread { + my ($cb, $ctx) = @_; + my $res = $ctx->{srch}->get_thread($ctx->{mid}); + return _no_thread($cb) unless $res->{total}; + my $fh = $cb->([200, ['Content-Type' => 'application/xml']]); + my $feed_opts = get_feedopts($ctx); + + my $html_url = $feed_opts->{atomurl} = $ctx->{self_url}; + $html_url =~ s!/atom\z!/!; + $feed_opts->{url} = $html_url; + $feed_opts->{emit_header} = 1; + + my $git = PublicInbox::GitCatFile->new($ctx->{git_dir}); + foreach my $msg (@{$res->{msgs}}) { + add_to_feed($feed_opts, $fh, mid2path($msg->mid), $git); + } + $git = undef; # destroy pipes + _end_feed($fh); +} sub emit_html_index { my ($cb, $ctx) = @_; @@ -233,7 +276,6 @@ sub get_feedopts { my $url_base; if ($cgi) { - my $path_info = $cgi->path_info; my $base; if (ref($cgi) eq 'CGI') { $base = $cgi->url(-base); @@ -241,13 +283,11 @@ 
sub get_feedopts { $base = $cgi->base->as_string; $base =~ s!/\z!!; } - $url_base = $path_info; - if ($url_base =~ s!/(?:|index\.html)?\z!!) { - $rv{atomurl} = "$base$url_base/atom.xml"; + $url_base = "$base/$listname"; + if (my $mid = $ctx->{mid}) { # per-thread feed: + $rv{atomurl} = "$url_base/t/$mid/atom"; } else { - $url_base =~ s!/atom\.xml\z!!; - $rv{atomurl} = $base . $path_info; - $url_base = $base . $url_base; # XXX is this needed? + $rv{atomurl} = "$url_base/atom.xml"; } } else { $url_base = "http://example.com"; @@ -288,9 +328,12 @@ sub add_to_feed { defined($content) or return 0; $mime = undef; + my $date = $header_obj->header('Date'); + $date = PublicInbox::Hval->new_oneline($date); + $date = feed_date($date->raw) or return 0; + my $title = mime_header($header_obj, 'Subject') or return 0; - $title = PublicInbox::Hval->new_oneline($title)->as_html; - my $type = index($title, '&') >= 0 ? "\ntype=\"html\"" : ''; + $title = title_tag($title); my $from = mime_header($header_obj, 'From') or return 0; my @from = Email::Address->parse($from) or return 0; @@ -298,13 +341,12 @@ sub add_to_feed { my $email = $from[0]->address; $email = PublicInbox::Hval->new_oneline($email)->as_html; - my $date = $header_obj->header('Date'); - $date = PublicInbox::Hval->new_oneline($date); - $date = feed_date($date->raw) or return 0; - + if (delete $feed_opts->{emit_header}) { + $fh->write(atom_header($feed_opts, $title) . + "<updated>$date</updated>"); + } $fh->write("<entry><author><name>$name</name><email>$email</email>" . - "</author><title$type>$title</title>" . - "<updated>$date</updated>" . + "</author>$title$date" . qq{<content\ntype="xhtml">} . qq{<div\nxmlns="http://www.w3.org/1999/xhtml">}); $fh->write($content); @@ -313,7 +355,7 @@ sub add_to_feed { my $h = '[a-f0-9]'; my (@uuid5) = ($add =~ m!\A($h{8})($h{4})($h{4})($h{4})($h{12})!o); my $id = 'urn:uuid:' . 
join('-', @uuid5); - my $midurl = $feed_opts->{midurl} || 'http://example.com/m/'; + my $midurl = $feed_opts->{midurl}; $fh->write(qq{</div></content><link\nhref="$midurl$href"/>}. "<id>$id</id></entry>"); 1; diff --git a/lib/PublicInbox/View.pm b/lib/PublicInbox/View.pm index 477c4b6..a30bf70 100644 --- a/lib/PublicInbox/View.pm +++ b/lib/PublicInbox/View.pm @@ -160,7 +160,8 @@ sub emit_thread_html { my $next = "<a\nid=\"s$final_anchor\">"; $next .= $final_anchor == 1 ? 'only message in' : 'end of'; $next .= " thread</a>, back to <a\nhref=\"../../\">index</a>\n"; - $next .= "download: <a\nhref=\"mbox.gz\">mbox.gz</a>\n\n"; + $next .= "download: <a\nhref=\"mbox.gz\">mbox.gz</a>"; + $next .= " / <a\nhref=\"atom\">Atom feed</a>\n\n"; $fh->write("<hr />" . PRE_WRAP . $next . $foot . "</pre></body></html>"); $fh->close; diff --git a/lib/PublicInbox/WWW.pm b/lib/PublicInbox/WWW.pm index e6eec3d..c99c25f 100644 --- a/lib/PublicInbox/WWW.pm +++ b/lib/PublicInbox/WWW.pm @@ -56,6 +56,9 @@ sub run { invalid_list_mid(\%ctx, $1, $2) || get_thread_mbox(\%ctx, $sfx); + } elsif ($path_info =~ m!$LISTNAME_RE/t/(\S+)/atom\z!o) { + invalid_list_mid(\%ctx, $1, $2) || get_thread_atom(\%ctx); + # legacy redirects } elsif ($path_info =~ m!$LISTNAME_RE/(t|m|f)/(\S+)\.html\z!o) { my $pfx = $2; @@ -348,4 +351,14 @@ sub get_thread_mbox { PublicInbox::Mbox::thread_mbox($ctx, $srch, $sfx); } + +# /$LISTNAME/t/$MESSAGE_ID/atom -> thread as Atom feed +sub get_thread_atom { + my ($ctx) = @_; + searcher($ctx) or return need_search($ctx); + $ctx->{self_url} = self_url($ctx->{cgi}); + require PublicInbox::Feed; + PublicInbox::Feed::generate_thread_atom($ctx); +} + 1; diff --git a/t/cgi.t b/t/cgi.t index fc28ae3..d84e634 100644 --- a/t/cgi.t +++ b/t/cgi.t @@ -200,6 +200,18 @@ EOF } else { like($res->{head}, qr/^Status: 501 /, "search not available"); } + + my $have_xml_feed = eval { require XML::Feed; 1 } if $indexed; + if ($have_xml_feed) { + $path = "/test/t/blahblah%40example.com/atom"; + $res = 
cgi_run($path); + like($res->{head}, qr/^Status: 200 /, "atom returned 200"); + like($res->{head}, qr!^Content-Type: application/xml!m, + "search returned atom"); + my $p = XML::Feed->parse(\($res->{body})); + is($p->format, "Atom", "parsed atom feed"); + is(scalar $p->entries, 3, "parsed three entries"); + } } # redirect list-name-only URLs -- EW ^ permalink raw reply related [flat|nested] 13+ messages in thread
* [13/11 PATCH] feed: fix <updated> tag in Atom feed 2015-09-01 8:55 ` [PATCH 06/11] implement per-thread Atom feeds Eric Wong @ 2015-09-01 9:30 ` Eric Wong 0 siblings, 0 replies; 13+ messages in thread From: Eric Wong @ 2015-09-01 9:30 UTC (permalink / raw) To: meta Fixes commit d44ed46ee92c78aaaed64975c4d6846613963be4 ("implement per-thread Atom feeds") --- lib/PublicInbox/Feed.pm | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/PublicInbox/Feed.pm b/lib/PublicInbox/Feed.pm index 4420fde..9615880 100644 --- a/lib/PublicInbox/Feed.pm +++ b/lib/PublicInbox/Feed.pm @@ -331,6 +331,7 @@ sub add_to_feed { my $date = $header_obj->header('Date'); $date = PublicInbox::Hval->new_oneline($date); $date = feed_date($date->raw) or return 0; + $date = "<updated>$date</updated>"; my $title = mime_header($header_obj, 'Subject') or return 0; $title = title_tag($title); @@ -342,8 +343,7 @@ sub add_to_feed { $email = PublicInbox::Hval->new_oneline($email)->as_html; if (delete $feed_opts->{emit_header}) { - $fh->write(atom_header($feed_opts, $title) . - "<updated>$date</updated>"); + $fh->write(atom_header($feed_opts, $title) . $date); } $fh->write("<entry><author><name>$name</name><email>$email</email>" . "</author>$title$date" . -- EW ^ permalink raw reply related [flat|nested] 13+ messages in thread
* [PATCH 07/11] www: compile mbox regexp only once 2015-09-01 8:55 [PATCH 01/11] search: reduce redundant doc data Eric Wong ` (4 preceding siblings ...) 2015-09-01 8:55 ` [PATCH 06/11] implement per-thread Atom feeds Eric Wong @ 2015-09-01 8:55 ` Eric Wong 2015-09-01 8:55 ` [PATCH 08/11] www: root atom feed is "new.atom" and not "atom.xml" Eric Wong ` (3 subsequent siblings) 9 siblings, 0 replies; 13+ messages in thread From: Eric Wong @ 2015-09-01 8:55 UTC (permalink / raw) To: meta No need for 'x' modifier to span more lines, though --- lib/PublicInbox/WWW.pm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/PublicInbox/WWW.pm b/lib/PublicInbox/WWW.pm index c99c25f..278d786 100644 --- a/lib/PublicInbox/WWW.pm +++ b/lib/PublicInbox/WWW.pm @@ -51,7 +51,7 @@ sub run { } elsif ($path_info =~ m!$LISTNAME_RE/t/(\S+)/\z!o) { invalid_list_mid(\%ctx, $1, $2) || get_thread(\%ctx); - } elsif ($path_info =~ m!$LISTNAME_RE/t/(\S+)/mbox(\.gz)?\z!x) { + } elsif ($path_info =~ m!$LISTNAME_RE/t/(\S+)/mbox(\.gz)?\z!o) { my $sfx = $3; invalid_list_mid(\%ctx, $1, $2) || get_thread_mbox(\%ctx, $sfx); -- EW ^ permalink raw reply related [flat|nested] 13+ messages in thread
* [PATCH 08/11] www: root atom feed is "new.atom" and not "atom.xml" 2015-09-01 8:55 [PATCH 01/11] search: reduce redundant doc data Eric Wong ` (5 preceding siblings ...) 2015-09-01 8:55 ` [PATCH 07/11] www: compile mbox regexp only once Eric Wong @ 2015-09-01 8:55 ` Eric Wong 2015-09-01 8:55 ` [PATCH 09/11] completely revamp URL structure to shorten permalinks Eric Wong ` (2 subsequent siblings) 9 siblings, 0 replies; 13+ messages in thread From: Eric Wong @ 2015-09-01 8:55 UTC (permalink / raw) To: meta The MIME type entry for Atom feed relies on "atom", so allow properly-configured static file servers to serve it with the correct Content-Type header. --- Documentation/design_www.txt | 36 ++++++++++++++++++++---------------- lib/PublicInbox/Feed.pm | 4 ++-- lib/PublicInbox/WWW.pm | 4 ++-- t/plack.t | 2 +- 4 files changed, 25 insertions(+), 21 deletions(-) diff --git a/Documentation/design_www.txt b/Documentation/design_www.txt index d25afca..a11c389 100644 --- a/Documentation/design_www.txt +++ b/Documentation/design_www.txt @@ -3,34 +3,38 @@ URL naming ### Unstable endpoints /$LISTNAME/?r=$GIT_COMMIT -> HTML only -/$LISTNAME/atom.xml -> Atom feed +/$LISTNAME/new.atom -> Atom feed #### Optional, relies on Search::Xapian -/$LISTNAME/t/$MESSAGE_ID/ -> HTML content of thread -/$LISTNAME/t/$MESSAGE_ID/atom -> Atom feed for thread -/$LISTNAME/t/$MESSAGE_ID/mbox.gz -> gzipped mbox of thread +/$LISTNAME/t/$MESSAGE_ID/ -> HTML content of thread +/$LISTNAME/t/$MESSAGE_ID/atom -> Atom feed for thread +/$LISTNAME/t/$MESSAGE_ID/mbox.gz -> gzipped mbox of thread ### Stable endpoints -/$LISTNAME/m/$MESSAGE_ID/ -> HTML content (short quotes) -/$LISTNAME/m/$MESSAGE_ID -> 301 to above -/$LISTNAME/m/$MESSAGE_ID/raw -> raw mbox -/$LISTNAME/f/$MESSAGE_ID/ -> HTML content (full quotes) -/$LISTNAME/f/$MESSAGE_ID -> 301 to above -/$LISTNAME/f/$MESSAGE_ID/raw (*) -> 301 to ../m/$MESSAGE_ID/raw +/$LISTNAME/m/$MESSAGE_ID/ -> HTML content (short quotes) +/$LISTNAME/m/$MESSAGE_ID -> 301 to 
above +/$LISTNAME/m/$MESSAGE_ID/raw -> raw mbox +/$LISTNAME/f/$MESSAGE_ID/ -> HTML content (full quotes) +/$LISTNAME/f/$MESSAGE_ID -> 301 to above +/$LISTNAME/f/$MESSAGE_ID/raw [1] -> 301 to ../m/$MESSAGE_ID/raw -### Legacy endpoints (may be ambiguous given Message-IDs with similar suffies) -/$LISTNAME/m/$MESSAGE_ID.html -> 301 to $MESSAGE_ID/ -/$LISTNAME/m/$MESSAGE_ID.txt -> 301 to $MESSAGE_ID/raw -/$LISTNAME/f/$MESSAGE_ID.html -> 301 to $MESSAGE_ID/ -/$LISTNAME/f/$MESSAGE_ID.txt (*) -> 301 to ../m/$MESSAGE_ID/raw +### Legacy endpoints (may be ambiguous given Message-IDs with similar suffixes) +/$LISTNAME/m/$MESSAGE_ID.html -> 301 to $MESSAGE_ID/ +/$LISTNAME/m/$MESSAGE_ID.txt -> 301 to $MESSAGE_ID/raw +/$LISTNAME/f/$MESSAGE_ID.html -> 301 to $MESSAGE_ID/ +/$LISTNAME/f/$MESSAGE_ID.txt [1] -> 301 to ../m/$MESSAGE_ID/raw +/$LISTNAME/atom.xml [2] -> identical to /$LISTNAME/new.atom FIXME: we must refactor/cleanup/add tests for most of our CGI before adding more endpoints and features. -(*) These URLs were never linked, but only exist as a convenience to folks +[1] These URLs were never linked, but only exist as a convenience to folks who edit existing URLs +[2] Do not make this into a 301 since feed readers may not follow them as well + as normal browsers do. 
+ Encoding notes -------------- diff --git a/lib/PublicInbox/Feed.pm b/lib/PublicInbox/Feed.pm index 1fef984..9d58193 100644 --- a/lib/PublicInbox/Feed.pm +++ b/lib/PublicInbox/Feed.pm @@ -287,11 +287,11 @@ sub get_feedopts { if (my $mid = $ctx->{mid}) { # per-thread feed: $rv{atomurl} = "$url_base/t/$mid/atom"; } else { - $rv{atomurl} = "$url_base/atom.xml"; + $rv{atomurl} = "$url_base/new.atom"; } } else { $url_base = "http://example.com"; - $rv{atomurl} = "$url_base/atom.xml"; + $rv{atomurl} = "$url_base/new.atom"; } $rv{url} ||= "$url_base/"; $rv{midurl} = "$url_base/m/"; diff --git a/lib/PublicInbox/WWW.pm b/lib/PublicInbox/WWW.pm index 278d786..a9cb6d7 100644 --- a/lib/PublicInbox/WWW.pm +++ b/lib/PublicInbox/WWW.pm @@ -34,7 +34,7 @@ sub run { invalid_list(\%ctx, $1) || redirect_list_index($cgi); } elsif ($path_info =~ m!$LISTNAME_RE(?:/|/index\.html)?\z!o) { invalid_list(\%ctx, $1) || get_index(\%ctx); - } elsif ($path_info =~ m!$LISTNAME_RE/atom\.xml\z!o) { + } elsif ($path_info =~ m!$LISTNAME_RE/(?:atom\.xml|new\.atom)\z!o) { invalid_list(\%ctx, $1) || get_atom(\%ctx); # single-message pages @@ -128,7 +128,7 @@ sub invalid_list_mid { $ret; } -# /$LISTNAME/atom.xml -> Atom feed, includes replies +# /$LISTNAME/new.atom -> Atom feed, includes replies sub get_atom { my ($ctx) = @_; require PublicInbox::Feed; diff --git a/t/plack.t b/t/plack.t index b3c8764..50c9e60 100644 --- a/t/plack.t +++ b/t/plack.t @@ -83,7 +83,7 @@ EOF test_psgi($app, sub { my ($cb) = @_; - my $atomurl = 'http://example.com/test/atom.xml'; + my $atomurl = 'http://example.com/test/new.atom'; my $res = $cb->(GET('http://example.com/test/')); is(200, $res->code, 'success response received'); like($res->content, qr!href="\Q$atomurl\E"!, -- EW ^ permalink raw reply related [flat|nested] 13+ messages in thread
* [PATCH 09/11] completely revamp URL structure to shorten permalinks 2015-09-01 8:55 [PATCH 01/11] search: reduce redundant doc data Eric Wong ` (6 preceding siblings ...) 2015-09-01 8:55 ` [PATCH 08/11] www: root atom feed is "new.atom" and not "atom.xml" Eric Wong @ 2015-09-01 8:55 ` Eric Wong 2015-09-01 8:55 ` [PATCH 10/11] view: drop extra '</a>' tag Eric Wong 2015-09-01 8:55 ` [PATCH 11/11] view: more robust link generation Eric Wong 9 siblings, 0 replies; 13+ messages in thread From: Eric Wong @ 2015-09-01 8:55 UTC (permalink / raw) To: meta This allows common /m/ links to be used without a prefix, saving 2 precious bytes for permalinks and raw messages. Old URLs continue to redirect. --- Documentation/design_www.txt | 37 +++++---- lib/PublicInbox/Feed.pm | 19 +++-- lib/PublicInbox/View.pm | 48 ++++++------ lib/PublicInbox/WWW.pm | 177 +++++++++++++++++++++++-------------------- t/cgi.t | 20 ++--- t/feed.t | 2 +- t/plack.t | 32 +++++--- t/view.t | 6 +- 8 files changed, 179 insertions(+), 162 deletions(-) diff --git a/Documentation/design_www.txt b/Documentation/design_www.txt index a11c389..b73a798 100644 --- a/Documentation/design_www.txt +++ b/Documentation/design_www.txt @@ -2,29 +2,28 @@ URL naming ---------- ### Unstable endpoints -/$LISTNAME/?r=$GIT_COMMIT -> HTML only -/$LISTNAME/new.atom -> Atom feed +/$LISTNAME/?r=$GIT_COMMIT -> HTML only +/$LISTNAME/new.atom -> Atom feed #### Optional, relies on Search::Xapian -/$LISTNAME/t/$MESSAGE_ID/ -> HTML content of thread -/$LISTNAME/t/$MESSAGE_ID/atom -> Atom feed for thread -/$LISTNAME/t/$MESSAGE_ID/mbox.gz -> gzipped mbox of thread +/$LISTNAME/$MESSAGE_ID/t/ -> HTML content of thread +/$LISTNAME/$MESSAGE_ID/t.atom -> Atom feed for thread +/$LISTNAME/$MESSAGE_ID/t.mbox.gz -> gzipped mbox of thread ### Stable endpoints -/$LISTNAME/m/$MESSAGE_ID/ -> HTML content (short quotes) -/$LISTNAME/m/$MESSAGE_ID -> 301 to above -/$LISTNAME/m/$MESSAGE_ID/raw -> raw mbox -/$LISTNAME/f/$MESSAGE_ID/ -> HTML content (full 
quotes) -/$LISTNAME/f/$MESSAGE_ID -> 301 to above -/$LISTNAME/f/$MESSAGE_ID/raw [1] -> 301 to ../m/$MESSAGE_ID/raw - -### Legacy endpoints (may be ambiguous given Message-IDs with similar suffixes) -/$LISTNAME/m/$MESSAGE_ID.html -> 301 to $MESSAGE_ID/ -/$LISTNAME/m/$MESSAGE_ID.txt -> 301 to $MESSAGE_ID/raw -/$LISTNAME/f/$MESSAGE_ID.html -> 301 to $MESSAGE_ID/ -/$LISTNAME/f/$MESSAGE_ID.txt [1] -> 301 to ../m/$MESSAGE_ID/raw - -/$LISTNAME/atom.xml [2] -> identical to /$LISTNAME/new.atom +/$LISTNAME/$MESSAGE_ID/ -> HTML content (short quotes) +/$LISTNAME/$MESSAGE_ID -> 301 to /$LISTNAME/$MESSAGE_ID/ +/$LISTNAME/$MESSAGE_ID/raw -> raw mbox +/$LISTNAME/$MESSAGE_ID/f/ -> HTML content (full quotes) + +### Legacy endpoints (may be ambiguous given Message-IDs with similar suffixes) +/$LISTNAME/m/$MESSAGE_ID/ -> 301 to /$LISTNAME/$MESSAGE_ID/ +/$LISTNAME/m/$MESSAGE_ID.html -> 301 to /$LISTNAME/$MESSAGE_ID/ +/$LISTNAME/m/$MESSAGE_ID.txt -> 301 to /$LISTNAME/$MESSAGE_ID/raw +/$LISTNAME/f/$MESSAGE_ID.html -> 301 to /$LISTNAME/$MESSAGE_ID/f/ +/$LISTNAME/f/$MESSAGE_ID.txt [1] -> 301 to /$LISTNAME/$MESSAGE_ID/raw + +/$LISTNAME/atom.xml [2] -> identical to /$LISTNAME/new.atom FIXME: we must refactor/cleanup/add tests for most of our CGI before adding more endpoints and features. 
diff --git a/lib/PublicInbox/Feed.pm b/lib/PublicInbox/Feed.pm index 9d58193..4420fde 100644 --- a/lib/PublicInbox/Feed.pm +++ b/lib/PublicInbox/Feed.pm @@ -101,7 +101,7 @@ sub emit_atom_thread { my $feed_opts = get_feedopts($ctx); my $html_url = $feed_opts->{atomurl} = $ctx->{self_url}; - $html_url =~ s!/atom\z!/!; + $html_url =~ s!/t\.atom\z!/!; $feed_opts->{url} = $html_url; $feed_opts->{emit_header} = 1; @@ -285,7 +285,7 @@ sub get_feedopts { } $url_base = "$base/$listname"; if (my $mid = $ctx->{mid}) { # per-thread feed: - $rv{atomurl} = "$url_base/t/$mid/atom"; + $rv{atomurl} = "$url_base/$mid/t.atom"; } else { $rv{atomurl} = "$url_base/new.atom"; } @@ -294,8 +294,7 @@ sub get_feedopts { $rv{atomurl} = "$url_base/new.atom"; } $rv{url} ||= "$url_base/"; - $rv{midurl} = "$url_base/m/"; - $rv{fullurl} = "$url_base/f/"; + $rv{midurl} = "$url_base/"; \%rv; } @@ -317,14 +316,15 @@ sub add_to_feed { my ($feed_opts, $fh, $add, $git) = @_; my $mime = do_cat_mail($git, $add) or return 0; - my $fullurl = $feed_opts->{fullurl} || 'http://example.com/f/'; + my $url = $feed_opts->{url}; + my $midurl = $feed_opts->{midurl}; my $header_obj = $mime->header_obj; my $mid = $header_obj->header('Message-ID'); defined $mid or return 0; $mid = PublicInbox::Hval->new_msgid($mid); - my $href = $mid->as_href . '/'; - my $content = PublicInbox::View->feed_entry($mime, $fullurl . $href); + my $href = $mid->as_href; + my $content = PublicInbox::View->feed_entry($mime, "$midurl$href/f/"); defined($content) or return 0; $mime = undef; @@ -355,8 +355,7 @@ sub add_to_feed { my $h = '[a-f0-9]'; my (@uuid5) = ($add =~ m!\A($h{8})($h{4})($h{4})($h{4})($h{12})!o); my $id = 'urn:uuid:' . join('-', @uuid5); - my $midurl = $feed_opts->{midurl}; - $fh->write(qq{</div></content><link\nhref="$midurl$href"/>}. + $fh->write(qq!</div></content><link\nhref="$midurl$href/"/>!. 
"<id>$id</id></entry>"); 1; } @@ -414,7 +413,7 @@ sub dump_topics { $mid = PublicInbox::Hval->new($mid)->as_href; $subj = PublicInbox::Hval->new($subj)->as_html; $u = PublicInbox::Hval->new($u)->as_html; - $dst .= "\n<a\nhref=\"t/$mid/#u\"><b>$subj</b></a>\n- "; + $dst .= "\n<a\nhref=\"$mid/t/#u\"><b>$subj</b></a>\n- "; $ts = strftime('%Y-%m-%d %H:%M', gmtime($ts)); if ($n == 1) { $dst .= "created by $u @ $ts UTC\n" diff --git a/lib/PublicInbox/View.pm b/lib/PublicInbox/View.pm index a30bf70..2be16b4 100644 --- a/lib/PublicInbox/View.pm +++ b/lib/PublicInbox/View.pm @@ -80,7 +80,7 @@ sub index_entry { $anchor = $seen->{$anchor_idx}; } if ($srch) { - $subj = "<a\nhref=\"${path}t/$href/#u\">$subj</a>"; + $subj = "<a\nhref=\"${path}$href/t/#u\">$subj</a>"; } if ($root_anchor && $root_anchor eq $id) { $subj = "<u\nid=\"u\">$subj</u>"; @@ -101,9 +101,9 @@ sub index_entry { $fh->write($rv .= "\n\n"); my ($fhref, $more_ref); - my $mhref = "${path}m/$href/"; + my $mhref = "${path}$href/"; if ($level > 0) { - $fhref = "${path}f/$href/"; + $fhref = "${path}$href/f/"; $more_ref = \$more; } # scan through all parts, looking for displayable text @@ -112,7 +112,7 @@ sub index_entry { }); $mime->body_set(''); - my $txt = "${path}m/$href/raw"; + my $txt = "${path}$href/raw"; $rv = "\n<a\nhref=\"$mhref\">$more</a> <a\nhref=\"$txt\">raw</a> "; $rv .= html_footer($mime, 0, undef, $ctx); @@ -120,7 +120,7 @@ sub index_entry { unless (defined $anchor) { my $v = PublicInbox::Hval->new_msgid($irt); $v = $v->as_href; - $anchor = "${path}m/$v/"; + $anchor = "${path}$v/"; $seen->{$anchor_idx} = $anchor; } $rv .= " <a\nhref=\"$anchor\">parent</a>"; @@ -160,8 +160,8 @@ sub emit_thread_html { my $next = "<a\nid=\"s$final_anchor\">"; $next .= $final_anchor == 1 ? 
'only message in' : 'end of'; $next .= " thread</a>, back to <a\nhref=\"../../\">index</a>\n"; - $next .= "download: <a\nhref=\"mbox.gz\">mbox.gz</a>"; - $next .= " / <a\nhref=\"atom\">Atom feed</a>\n\n"; + $next .= "download: <a\nhref=\"../t.mbox.gz\">mbox.gz</a>"; + $next .= " / <a\nhref=\"../t.atom\">Atom feed</a>\n\n"; $fh->write("<hr />" . PRE_WRAP . $next . $foot . "</pre></body></html>"); $fh->close; @@ -349,8 +349,8 @@ sub headers_to_html_header { } elsif ($h eq 'Subject') { $title[0] = $v->as_html; if ($srch) { - $rv .= "$h: <a\nid=\"t\"\n" . - "href=\"../../t/$mid_href/\">"; + my $p = $full_pfx ? '' : '../'; + $rv .= "$h: <a\nid=\"t\"\nhref=\"${p}t/#u\">"; $rv .= $v->as_html . "</a>\n"; next; } @@ -359,7 +359,7 @@ sub headers_to_html_header { } $rv .= 'Message-ID: <' . $mid->as_html . '> '; - my $raw_ref = $full_pfx ? 'raw' : "../../m/$mid_href/raw"; + my $raw_ref = $full_pfx ? 'raw' : '../raw'; $rv .= "(<a\nhref=\"$raw_ref\">raw</a>)\n"; if ($srch) { $rv .= "<a\nhref=\"#r\">References: [see below]</a>\n"; @@ -373,7 +373,7 @@ sub headers_to_html_header { } sub thread_inline { - my ($dst, $ctx, $cur) = @_; + my ($dst, $ctx, $cur, $full_pfx) = @_; my $srch = $ctx->{srch}; my $mid = mid_compress(mid_clean($cur->header('Message-ID'))); my $res = $srch->get_thread($mid); @@ -383,9 +383,10 @@ sub thread_inline { $$dst .= "\n[no followups, yet]</a>\n"; return; } + my $upfx = $full_pfx ? '' : '../'; $$dst .= "\n\n~$nr messages in thread: ". 
- "(<a\nhref=\"../../t/$mid/#u\">expand</a>)\n"; + "(<a\nhref=\"${upfx}t/#u\">expand</a>)\n"; my $subj = $srch->subject_path($cur->header('Subject')); my $state = { seen => { $subj => 1 }, @@ -393,7 +394,7 @@ sub thread_inline { cur => $mid, }; for (thread_results(load_results($res))->rootset) { - inline_dump($dst, $state, $_, 0); + inline_dump($dst, $state, $upfx, $_, 0); } $state->{next_msg}; } @@ -461,19 +462,20 @@ sub html_footer { my $href = "mailto:$to?In-Reply-To=$irt&Cc=${cc}&Subject=$subj"; my $srch = $ctx->{srch} if $ctx; - my $idx = $standalone ? " <a\nhref=\"../../\">index</a>" : ''; + my $upfx = $full_pfx ? '../' : '../../'; + my $idx = $standalone ? "<a\nhref=\"$upfx\">index</a>" : ''; if ($idx && $srch) { - my $next = thread_inline(\$idx, $ctx, $mime); + my $next = thread_inline(\$idx, $ctx, $mime, $full_pfx); $irt = $mime->header('In-Reply-To'); if (defined $irt) { $irt = PublicInbox::Hval->new_msgid($irt); $irt = $irt->as_href; - $irt = "<a\nhref=\"../$irt/\">parent</a> "; + $irt = "<a\nhref=\"$upfx$irt/\">parent</a> "; } else { $irt = ' ' x length('parent '); } if ($next) { - $irt .= "<a\nhref=\"../$next/\">next</a> "; + $irt .= "<a\nhref=\"$upfx$next/\">next</a> "; } else { $irt .= ' '; } @@ -564,7 +566,7 @@ sub _msg_date { } sub _inline_header { - my ($dst, $state, $mime, $level) = @_; + my ($dst, $state, $upfx, $mime, $level) = @_; my $pfx = ' ' x $level; my $cur = $state->{cur}; @@ -601,7 +603,7 @@ sub _inline_header { $s = $s->as_html; } my $m = PublicInbox::Hval->new_msgid($mid); - $m = '../' . $m->as_href . '/'; + $m = $upfx . '../' . $m->as_href . '/'; if (defined $s) { $$dst .= "$pfx` <a\nhref=\"$m\">$s</a>\n" . 
"$pfx $f @ $d\n"; @@ -611,14 +613,14 @@ sub _inline_header { } sub inline_dump { - my ($dst, $state, $node, $level) = @_; + my ($dst, $state, $upfx, $node, $level) = @_; return unless $node; return if $state->{stopped}; if (my $mime = $node->message) { - _inline_header($dst, $state, $mime, $level); + _inline_header($dst, $state, $upfx, $mime, $level); } - inline_dump($dst, $state, $node->child, $level+1); - inline_dump($dst, $state, $node->next, $level); + inline_dump($dst, $state, $upfx, $node->child, $level+1); + inline_dump($dst, $state, $upfx, $node->next, $level); } 1; diff --git a/lib/PublicInbox/WWW.pm b/lib/PublicInbox/WWW.pm index a9cb6d7..d666a1b 100644 --- a/lib/PublicInbox/WWW.pm +++ b/lib/PublicInbox/WWW.pm @@ -16,6 +16,7 @@ use URI::Escape qw(uri_escape_utf8 uri_unescape); use constant SSOMA_URL => 'http://ssoma.public-inbox.org/'; use constant PI_URL => 'http://public-inbox.org/'; our $LISTNAME_RE = qr!\A/([\w\.\-]+)!; +our $MID_RE = qr!([^/]+)!; our $pi_config; sub run { @@ -31,56 +32,37 @@ sub run { if ($path_info eq '/') { r404(); } elsif ($path_info =~ m!$LISTNAME_RE\z!o) { - invalid_list(\%ctx, $1) || redirect_list_index($cgi); + invalid_list(\%ctx, $1) || r301(\%ctx, $1); } elsif ($path_info =~ m!$LISTNAME_RE(?:/|/index\.html)?\z!o) { invalid_list(\%ctx, $1) || get_index(\%ctx); } elsif ($path_info =~ m!$LISTNAME_RE/(?:atom\.xml|new\.atom)\z!o) { invalid_list(\%ctx, $1) || get_atom(\%ctx); + # thread display + } elsif ($path_info =~ m!$LISTNAME_RE/$MID_RE/t/\z!o) { + invalid_list_mid(\%ctx, $1, $2) || get_thread(\%ctx); + } elsif ($path_info =~ m!$LISTNAME_RE/$MID_RE/t\.mbox(\.gz)?\z!o) { + my $sfx = $3; + invalid_list_mid(\%ctx, $1, $2) || get_thread_mbox(\%ctx, $sfx); + } elsif ($path_info =~ m!$LISTNAME_RE/$MID_RE/t\.atom\z!o) { + invalid_list_mid(\%ctx, $1, $2) || get_thread_atom(\%ctx); + # single-message pages - } elsif ($path_info =~ m!$LISTNAME_RE/m/(\S+)/\z!o) { + } elsif ($path_info =~ m!$LISTNAME_RE/$MID_RE/\z!o) { 
invalid_list_mid(\%ctx, $1, $2) || get_mid_html(\%ctx); - } elsif ($path_info =~ m!$LISTNAME_RE/m/(\S+)/raw\z!o) { + } elsif ($path_info =~ m!$LISTNAME_RE/$MID_RE/raw\z!o) { invalid_list_mid(\%ctx, $1, $2) || get_mid_txt(\%ctx); # full-message page - } elsif ($path_info =~ m!$LISTNAME_RE/f/(\S+)/\z!o) { + } elsif ($path_info =~ m!$LISTNAME_RE/$MID_RE/f/\z!o) { invalid_list_mid(\%ctx, $1, $2) || get_full_html(\%ctx); - # thread display - } elsif ($path_info =~ m!$LISTNAME_RE/t/(\S+)/\z!o) { - invalid_list_mid(\%ctx, $1, $2) || get_thread(\%ctx); - - } elsif ($path_info =~ m!$LISTNAME_RE/t/(\S+)/mbox(\.gz)?\z!o) { - my $sfx = $3; - invalid_list_mid(\%ctx, $1, $2) || - get_thread_mbox(\%ctx, $sfx); - - } elsif ($path_info =~ m!$LISTNAME_RE/t/(\S+)/atom\z!o) { - invalid_list_mid(\%ctx, $1, $2) || get_thread_atom(\%ctx); - - # legacy redirects - } elsif ($path_info =~ m!$LISTNAME_RE/(t|m|f)/(\S+)\.html\z!o) { - my $pfx = $2; - invalid_list_mid(\%ctx, $1, $3) || - redirect_mid(\%ctx, $pfx, qr/\.html\z/, '/'); - } elsif ($path_info =~ m!$LISTNAME_RE/(m|f)/(\S+)\.txt\z!o) { - my $pfx = $2; - invalid_list_mid(\%ctx, $1, $3) || - redirect_mid(\%ctx, $pfx, qr/\.txt\z/, '/raw'); - } elsif ($path_info =~ m!$LISTNAME_RE/t/(\S+)(\.mbox(?:\.gz)?)\z!o) { - my $end = $3; - invalid_list_mid(\%ctx, $1, $2) || - redirect_mid(\%ctx, 't', $end, '/mbox.gz'); - - # convenience redirects, order matters - } elsif ($path_info =~ m!$LISTNAME_RE/(m|f|t|s)/(\S+)\z!o) { - my $pfx = $2; - invalid_list_mid(\%ctx, $1, $3) || - redirect_mid(\%ctx, $pfx, qr/\z/, '/'); + # convenience redirects order matters + } elsif ($path_info =~ m!$LISTNAME_RE/([^/]{2,})\z!o) { + r301(\%ctx, $1, $2); } else { - r404(); + legacy_redirects(\%ctx, $path_info); } } @@ -163,7 +145,7 @@ sub mid2blob { } } -# /$LISTNAME/m/$MESSAGE_ID.txt -> raw mbox +# /$LISTNAME/$MESSAGE_ID/raw -> raw mbox sub get_mid_txt { my ($ctx) = @_; my $x = mid2blob($ctx) or return r404(); @@ -171,22 +153,21 @@ sub get_mid_txt { 
PublicInbox::Mbox::emit1($x); } -# /$LISTNAME/m/$MESSAGE_ID.html -> HTML content (short quotes) +# /$LISTNAME/$MESSAGE_ID/ -> HTML content (short quotes) sub get_mid_html { my ($ctx) = @_; my $x = mid2blob($ctx) or return r404(); require PublicInbox::View; - my $pfx = msg_pfx($ctx); my $foot = footer($ctx); require Email::MIME; my $mime = Email::MIME->new($x); searcher($ctx); [ 200, [ 'Content-Type' => 'text/html; charset=UTF-8' ], - [ PublicInbox::View::msg_html($ctx, $mime, $pfx, $foot) ] ]; + [ PublicInbox::View::msg_html($ctx, $mime, 'f/', $foot) ] ]; } -# /$LISTNAME/f/$MESSAGE_ID.html -> HTML content (fullquotes) +# /$LISTNAME/$MESSAGE_ID/f/ -> HTML content (fullquotes) sub get_full_html { my ($ctx) = @_; my $x = mid2blob($ctx) or return r404(); @@ -200,7 +181,7 @@ sub get_full_html { [ PublicInbox::View::msg_html($ctx, $mime, undef, $foot)] ]; } -# /$LISTNAME/t/$MESSAGE_ID.html +# /$LISTNAME/$MESSAGE_ID/t/ sub get_thread { my ($ctx) = @_; my $srch = searcher($ctx) or return need_search($ctx); @@ -214,39 +195,6 @@ sub self_url { ref($cgi) eq 'CGI' ? $cgi->self_url : $cgi->uri->as_string; } -sub redirect_list_index { - my ($cgi) = @_; - do_redirect(self_url($cgi) . "/"); -} - -sub redirect_mid { - my ($ctx, $pfx, $old, $sfx) = @_; - my $url = self_url($ctx->{cgi}); - my $anchor = ''; - if (lc($pfx) eq 't' && $sfx eq '/') { - $anchor = '#u'; # <u id='#u'> is used to highlight in View.pm - } - $url =~ s/$old/$sfx/; - do_redirect($url . 
$anchor); -} - -# only hit when somebody tries to guess URLs manually: -sub redirect_mid_txt { - my ($ctx, $pfx) = @_; - my $listname = $ctx->{listname}; - my $url = self_url($ctx->{cgi}); - $url =~ s!/$listname/f/(\S+\.txt)\z!/$listname/m/$1!; - do_redirect($url); -} - -sub do_redirect { - my ($url) = @_; - [ 301, - [ Location => $url, 'Content-Type' => 'text/plain' ], - [ "Redirecting to $url\n" ] - ] -} - sub ctx_get { my ($ctx, $key) = @_; my $val = $ctx->{$key}; @@ -333,14 +281,8 @@ EOF [ 501, [ 'Content-Type' => 'text/html; charset=UTF-8' ], [ $msg ] ]; } -sub msg_pfx { - my ($ctx) = @_; - my $href = PublicInbox::Hval::ascii_html(uri_escape_utf8($ctx->{mid})); - "../../f/$href/"; -} - -# /$LISTNAME/t/$MESSAGE_ID/mbox -> thread as mbox -# /$LISTNAME/t/$MESSAGE_ID/mbox.gz -> thread as gzipped mbox +# /$LISTNAME/$MESSAGE_ID/t.mbox -> thread as mbox +# /$LISTNAME/$MESSAGE_ID/t.mbox.gz -> thread as gzipped mbox # note: I'm not a big fan of other compression formats since they're # significantly more expensive on CPU than gzip and less-widely available, # especially on older systems. Stick to zlib since that's what git uses. 
@@ -352,7 +294,7 @@ sub get_thread_mbox { } -# /$LISTNAME/t/$MESSAGE_ID/atom -> thread as Atom feed +# /$LISTNAME/$MESSAGE_ID/t.atom -> thread as Atom feed sub get_thread_atom { my ($ctx) = @_; searcher($ctx) or return need_search($ctx); @@ -361,4 +303,71 @@ sub get_thread_atom { PublicInbox::Feed::generate_thread_atom($ctx); } +sub legacy_redirects { + my ($ctx, $path_info) = @_; + + # single-message pages + if ($path_info =~ m!$LISTNAME_RE/m/(\S+)/\z!o) { + r301($ctx, $1, $2); + } elsif ($path_info =~ m!$LISTNAME_RE/m/(\S+)/raw\z!o) { + r301($ctx, $1, $2, 'raw'); + + } elsif ($path_info =~ m!$LISTNAME_RE/f/(\S+)/\z!o) { + r301($ctx, $1, $2, 'f/'); + + # thread display + } elsif ($path_info =~ m!$LISTNAME_RE/t/(\S+)/\z!o) { + r301($ctx, $1, $2, 't/#u'); + + } elsif ($path_info =~ m!$LISTNAME_RE/t/(\S+)/mbox(\.gz)?\z!o) { + r301($ctx, $1, $2, "t.mbox$3"); + + # even older legacy redirects + } elsif ($path_info =~ m!$LISTNAME_RE/m/(\S+)\.html\z!o) { + r301($ctx, $1, $2); + + } elsif ($path_info =~ m!$LISTNAME_RE/t/(\S+)\.html\z!o) { + r301($ctx, $1, $2, 't/#u'); + + } elsif ($path_info =~ m!$LISTNAME_RE/f/(\S+)\.html\z!o) { + r301($ctx, $1, $2, 'f/'); + + } elsif ($path_info =~ m!$LISTNAME_RE/(?:m|f)/(\S+)\.txt\z!o) { + r301($ctx, $1, $2, 'raw'); + + } elsif ($path_info =~ m!$LISTNAME_RE/t/(\S+)(\.mbox(?:\.gz)?)\z!o) { + r301($ctx, $1, $2, "t$3"); + + # legacy convenience redirects, order still matters + } elsif ($path_info =~ m!$LISTNAME_RE/m/(\S+)\z!o) { + r301($ctx, $1, $2); + } elsif ($path_info =~ m!$LISTNAME_RE/t/(\S+)\z!o) { + r301($ctx, $1, $2, 't/#u'); + } elsif ($path_info =~ m!$LISTNAME_RE/f/(\S+)\z!o) { + r301($ctx, $1, $2, 'f/'); + + } else { + r404(); + } +} + +sub r301 { + my ($ctx, $listname, $mid, $suffix) = @_; + my $cgi = $ctx->{cgi}; + my $url; + if (ref($cgi) eq 'CGI') { + $url = $cgi->url(-base) . '/'; + } else { + $url = $cgi->base->as_string; + } + + $url .= $listname . '/'; + $url .= (uri_escape_utf8($mid) . 
'/') if (defined $mid); + $url .= $suffix if (defined $suffix); + + [ 301, + [ Location => $url, 'Content-Type' => 'text/plain' ], + [ "Redirecting to $url\n" ] ] +} + 1; diff --git a/t/cgi.t b/t/cgi.t index d84e634..a6600c2 100644 --- a/t/cgi.t +++ b/t/cgi.t @@ -109,7 +109,7 @@ EOF like($res->{body}, qr/<title>test for public-inbox/, "set title in XML feed"); like($res->{body}, - qr!http://test\.example\.com/test/m/blah%40example\.com!, + qr!http://test\.example\.com/test/blah%40example\.com/!, "link id set"); like($res->{body}, qr/what\?/, "reply included"); } @@ -152,26 +152,26 @@ EOF } local $ENV{GIT_DIR} = $maindir; - my $res = cgi_run("/test/m/slashy%2fasdf%40example.com/raw"); + my $res = cgi_run("/test/slashy%2fasdf%40example.com/raw"); like($res->{body}, qr/Message-Id: <\Q$slashy_mid\E>/, "slashy mid raw hit"); - $res = cgi_run("/test/m/blahblah\@example.com/raw"); + $res = cgi_run("/test/blahblah\@example.com/raw"); like($res->{body}, qr/Message-Id: <blahblah\@example\.com>/, "mid raw hit"); - $res = cgi_run("/test/m/blahblah\@example.con/raw"); + $res = cgi_run("/test/blahblah\@example.con/raw"); like($res->{head}, qr/Status: 404 Not Found/, "mid raw miss"); - $res = cgi_run("/test/m/blahblah\@example.com/"); + $res = cgi_run("/test/blahblah\@example.com/"); like($res->{body}, qr/\A<html>/, "mid html hit"); like($res->{head}, qr/Status: 200 OK/, "200 response"); - $res = cgi_run("/test/m/blahblah\@example.con/"); + $res = cgi_run("/test/blahblah\@example.con/"); like($res->{head}, qr/Status: 404 Not Found/, "mid html miss"); - $res = cgi_run("/test/f/blahblah\@example.com/"); + $res = cgi_run("/test/blahblah\@example.com/f/"); like($res->{body}, qr/\A<html>/, "mid html"); like($res->{head}, qr/Status: 200 OK/, "200 response"); - $res = cgi_run("/test/f/blahblah\@example.con/"); + $res = cgi_run("/test/blahblah\@example.con/f/"); like($res->{head}, qr/Status: 404 Not Found/, "mid html miss"); $res = cgi_run("/test/"); @@ -183,7 +183,7 @@ EOF { local 
$ENV{HOME} = $home; local $ENV{PATH} = $main_path; - my $path = "/test/t/blahblah%40example.com/mbox.gz"; + my $path = "/test/blahblah%40example.com/t.mbox.gz"; my $res = cgi_run($path); like($res->{head}, qr/^Status: 501 /, "search not-yet-enabled"); my $indexed = system($index, $maindir) == 0; @@ -203,7 +203,7 @@ EOF my $have_xml_feed = eval { require XML::Feed; 1 } if $indexed; if ($have_xml_feed) { - $path = "/test/t/blahblah%40example.com/atom"; + $path = "/test/blahblah%40example.com/t.atom"; $res = cgi_run($path); like($res->{head}, qr/^Status: 200 /, "atom returned 200"); like($res->{head}, qr!^Content-Type: application/xml!m, diff --git a/t/feed.t b/t/feed.t index a9955f0..e4ec752 100644 --- a/t/feed.t +++ b/t/feed.t @@ -77,7 +77,7 @@ EOF } unlike($feed, qr/drop me/, "long quoted text dropped"); - like($feed, qr!/f/\d%40example\.com/#q!, + like($feed, qr!/\d%40example\.com/f/#q!, "/f/ url generated for long quoted text"); like($feed, qr/inline me here/, "short quoted text kept"); like($feed, qr/keep me/, "unquoted text saved"); diff --git a/t/plack.t b/t/plack.t index 50c9e60..067a593 100644 --- a/t/plack.t +++ b/t/plack.t @@ -88,7 +88,7 @@ EOF is(200, $res->code, 'success response received'); like($res->content, qr!href="\Q$atomurl\E"!, 'atom URL generated'); - like($res->content, qr!href="m/blah%40example\.com/"!, + like($res->content, qr!href="blah%40example\.com/"!, 'index generated'); }); @@ -98,14 +98,14 @@ EOF my $res = $cb->(GET($pfx . '/atom.xml')); is(200, $res->code, 'success response received for atom'); like($res->content, - qr!link\s+href="\Q$pfx\E/m/blah%40example\.com/"!s, + qr!link\s+href="\Q$pfx\E/blah%40example\.com/"!s, 'atom feed generated correct URL'); }); - foreach my $t (qw(f m)) { + foreach my $t (('', 'f/')) { test_psgi($app, sub { my ($cb) = @_; - my $path = "/$t/blah%40example.com/"; + my $path = "/blah%40example.com/$t"; my $res = $cb->(GET($pfx . 
$path)); is(200, $res->code, "success for $path"); like($res->content, qr!<title>hihi - Me</title>!, @@ -114,8 +114,8 @@ EOF } test_psgi($app, sub { my ($cb) = @_; - my $res = $cb->(GET($pfx . '/m/blah%40example.com/raw')); - is(200, $res->code, 'success response received for /m/*/raw'); + my $res = $cb->(GET($pfx . '/blah%40example.com/raw')); + is(200, $res->code, 'success response received for /*/raw'); like($res->content, qr!\AFrom !, "mbox returned"); }); @@ -126,18 +126,25 @@ EOF my $res = $cb->(GET($pfx . "/$t/blah%40example.com.txt")); is(301, $res->code, "redirect for old $t .txt link"); my $location = $res->header('Location'); - like($location, qr!/$t/blah%40example\.com/raw\z!, + like($location, qr!/blah%40example\.com/raw\z!, ".txt redirected to /raw"); }); } - foreach my $t (qw(m f t)) { + + my %umap = ( + 'm' => '', + 'f' => 'f/', + 't' => 't/', + ); + while (my ($t, $e) = each %umap) { test_psgi($app, sub { my ($cb) = @_; my $res = $cb->(GET($pfx . "/$t/blah%40example.com.html")); is(301, $res->code, "redirect for old $t .html link"); my $location = $res->header('Location'); - like($location, qr!/$t/blah%40example\.com/(?:#u)?\z!, - ".html redirected to /raw"); + like($location, + qr!/blah%40example\.com/$e(?:#u)?\z!, + ".html redirected to new location"); }); } foreach my $sfx (qw(mbox mbox.gz)) { @@ -146,8 +153,9 @@ EOF my $res = $cb->(GET($pfx . 
"/t/blah%40example.com.$sfx")); is(301, $res->code, 'redirect for old thread link'); my $location = $res->header('Location'); - like($location, qr!/t/blah%40example\.com/mbox\.gz\z!, - "$sfx redirected to /mbox.gz"); + like($location, + qr!/blah%40example\.com/t\.mbox(?:\.gz)?\z!, + "$sfx redirected to /mbox.gz"); }); } } diff --git a/t/view.t b/t/view.t index 77cf3a3..83823d8 100644 --- a/t/view.t +++ b/t/view.t @@ -44,17 +44,17 @@ EOF my $html = PublicInbox::View::msg_html(undef, $mime); # ghetto tests - like($html, qr!<a\nhref="\.\./\.\./m/hello%40!s, "MID link present"); + like($html, qr!<a\nhref="\.\./raw"!s, "raw link present"); like($html, qr/hello world\b/, "body present"); like($html, qr/> keep this inline/, "short quoted text is inline"); like($html, qr/<a\nid=[^>]+><\/a>> Long and wordy/, "long quoted text is anchored"); # short page - my $pfx = "../../f/hello%40example.com/"; + my $pfx = "../hello%40example.com/f/"; $mime = Email::MIME->new($s); my $short = PublicInbox::View::msg_html(undef, $mime, $pfx); - like($short, qr!<a\nhref="\.\./\.\./f/hello%40example\.com/!s, + like($short, qr!<a\nhref="\.\./hello%40example\.com/f/!s, "MID link present"); like($short, qr/\n> keep this inline/, "short quoted text is inline"); -- EW ^ permalink raw reply related [flat|nested] 13+ messages in thread
* [PATCH 10/11] view: drop extra '</a>' tag 2015-09-01 8:55 [PATCH 01/11] search: reduce redundant doc data Eric Wong ` (7 preceding siblings ...) 2015-09-01 8:55 ` [PATCH 09/11] completely revamp URL structure to shorten permalinks Eric Wong @ 2015-09-01 8:55 ` Eric Wong 2015-09-01 8:55 ` [PATCH 11/11] view: more robust link generation Eric Wong 9 siblings, 0 replies; 13+ messages in thread From: Eric Wong @ 2015-09-01 8:55 UTC (permalink / raw) To: meta Oops. --- lib/PublicInbox/View.pm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/PublicInbox/View.pm b/lib/PublicInbox/View.pm index 2be16b4..45f559e 100644 --- a/lib/PublicInbox/View.pm +++ b/lib/PublicInbox/View.pm @@ -380,7 +380,7 @@ sub thread_inline { my $nr = $res->{total}; if ($nr <= 1) { - $$dst .= "\n[no followups, yet]</a>\n"; + $$dst .= "\n[no followups, yet]\n"; return; } my $upfx = $full_pfx ? '' : '../'; -- EW ^ permalink raw reply related [flat|nested] 13+ messages in thread
* [PATCH 11/11] view: more robust link generation 2015-09-01 8:55 [PATCH 01/11] search: reduce redundant doc data Eric Wong ` (8 preceding siblings ...) 2015-09-01 8:55 ` [PATCH 10/11] view: drop extra '</a>' tag Eric Wong @ 2015-09-01 8:55 ` Eric Wong 2015-09-01 9:08 ` [PATCH 12/11] view: add missing space Eric Wong 9 siblings, 1 reply; 13+ messages in thread From: Eric Wong @ 2015-09-01 8:55 UTC (permalink / raw) To: meta We must avoid double-escaping in cases where we have URLs anchored by "<>" in the plain-text as is common (and AFAIK recommended) convention. So we must use a two step linkification process to prevent double-escaping. --- lib/PublicInbox/View.pm | 62 +++++++++++++++++++++++++++++++++++++------------ 1 file changed, 47 insertions(+), 15 deletions(-) diff --git a/lib/PublicInbox/View.pm b/lib/PublicInbox/View.pm index 45f559e..3d7ba6f 100644 --- a/lib/PublicInbox/View.pm +++ b/lib/PublicInbox/View.pm @@ -10,7 +10,8 @@ use Encode::MIME::Header; use Email::MIME::ContentType qw/parse_content_type/; use PublicInbox::Hval; use PublicInbox::MID qw/mid_clean mid_compress mid2path/; -use Digest::SHA; +use Digest::SHA qw/sha1_hex/; +my $SALT = rand; require POSIX; # TODO: make these constants tunable @@ -235,10 +236,35 @@ my $LINK_RE = qr!\b((?:ftp|https?|nntp):// [\@:\w\.-]+/ ?[\@\w\+\&\?\.\%\;/#=-]*)!x; -sub linkify { - # no newlines added here since it'd break the splitting we do - # to fold quotes - $_[0] =~ s!$LINK_RE!<a\nhref="$1">$1</a>!g; +sub linkify_1 { + my ($link_map, $s) = @_; + $s =~ s!$LINK_RE! + my $url = $1; + # salt this, as this could be exploited to show + # links in the HTML which don't show up in the raw mail. + my $key = sha1_hex($url . $SALT); + $link_map->{$key} = $url; + 'PI-LINK-'. $key; + !ge; + $s; +} + +sub linkify_2 { + my ($link_map, $s) = @_; + + # Added "PI-LINK-" prefix to avoid false-positives on git commits + $s =~ s!\bPI-LINK-([a-f0-9]{40})\b! 
+ my $key = $1; + my $url = $link_map->{$key}; + if (defined $url) { + $url = ascii_html($url); + "<a\nhref=\"$url\">$url</a>"; + } else { + # false positive or somebody tried to mess with us + $key; + } + !ge; + $s; } sub flush_quote { @@ -247,13 +273,15 @@ sub flush_quote { if ($full_pfx) { if (!$final && scalar(@$quot) <= MAX_INLINE_QUOTED) { # show quote inline - my $rv = join('', map { linkify($_); $_ } @$quot); + my %l; + my $rv = join('', map { linkify_1(\%l, $_) } @$quot); @$quot = (); - return $rv; + $rv = ascii_html($rv); + return linkify_2(\%l, $rv); } # show a short snippet of quoted text and link to full version: - @$quot = map { s/^(?:>\s*)+//gm; $_ } @$quot; + @$quot = map { s/^(?:>\s*)+//gm; $_ } @$quot; my $cur = join(' ', @$quot); @$quot = split(/\s+/, $cur); $cur = ''; @@ -268,16 +296,19 @@ sub flush_quote { } while (@$quot && length($cur) < MAX_TRUNC_LEN); @$quot = (); $cur =~ s/ \z/ .../s; + $cur = ascii_html($cur); my $nr = ++$$n; "> [<a\nhref=\"$full_pfx#q${part_nr}_$nr\">$cur</a>]\n"; } else { # show everything in the full version with anchor from # short version (see above) my $nr = ++$$n; - my $rv = "<a\nid=q${part_nr}_$nr></a>"; - $rv .= join('', map { linkify($_); $_ } @$quot); + my $rv = ""; + my %l; + $rv .= join('', map { linkify_1(\%l, $_) } @$quot); @$quot = (); - $rv; + $rv = ascii_html($rv); + "<a\nid=q${part_nr}_$nr></a>" . 
linkify_2(\%l, $rv); } } @@ -297,7 +328,6 @@ sub add_text_body { my $s = $part->body; $part->body_set(''); $s = $enc->decode($s); - $s = ascii_html($s); my @lines = split(/^/m, $s); $s = ''; @@ -309,7 +339,7 @@ sub add_text_body { my @quot; while (defined(my $cur = shift @lines)) { - if ($cur !~ /^>/) { + if ($cur !~ /^>/) { # show the previously buffered quote inline if (scalar @quot) { $s .= flush_quote(\@quot, \$n, $$part_nr, @@ -317,8 +347,10 @@ sub add_text_body { } # regular line, OK - linkify($cur); - $s .= $cur; + my %l; + $cur = linkify_1(\%l, $cur); + $cur = ascii_html($cur); + $s .= linkify_2(\%l, $cur); } else { push @quot, $cur; } -- EW ^ permalink raw reply related [flat|nested] 13+ messages in thread
* [PATCH 12/11] view: add missing space 2015-09-01 8:55 ` [PATCH 11/11] view: more robust link generation Eric Wong @ 2015-09-01 9:08 ` Eric Wong 0 siblings, 0 replies; 13+ messages in thread From: Eric Wong @ 2015-09-01 9:08 UTC (permalink / raw) To: meta This fixes a regression introduced in commit 1b4b2c7b8b2f2df8f114617d2e875eaf5c839ce0 ("completely revamp URL structure to shorten permalinks") --- lib/PublicInbox/View.pm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/PublicInbox/View.pm b/lib/PublicInbox/View.pm index 3d7ba6f..29888f9 100644 --- a/lib/PublicInbox/View.pm +++ b/lib/PublicInbox/View.pm @@ -495,7 +495,7 @@ sub html_footer { my $srch = $ctx->{srch} if $ctx; my $upfx = $full_pfx ? '../' : '../../'; - my $idx = $standalone ? "<a\nhref=\"$upfx\">index</a>" : ''; + my $idx = $standalone ? " <a\nhref=\"$upfx\">index</a>" : ''; if ($idx && $srch) { my $next = thread_inline(\$idx, $ctx, $mime, $full_pfx); $irt = $mime->header('In-Reply-To'); -- EW ^ permalink raw reply related [flat|nested] 13+ messages in thread
end of thread, other threads:[~2015-09-01 9:30 UTC | newest] Thread overview: 13+ messages (download: mbox.gz follow: Atom feed -- links below jump to the message on this page -- 2015-09-01 8:55 [PATCH 01/11] search: reduce redundant doc data Eric Wong 2015-09-01 8:55 ` [PATCH 02/11] search: allow querying all mail with '' Eric Wong 2015-09-01 8:55 ` [PATCH 03/11] search: show newest results first Eric Wong 2015-09-01 8:55 ` [PATCH 04/11] feed: use updated date based on git commit date Eric Wong 2015-09-01 8:55 ` [PATCH 05/11] feed: extract atom header generation Eric Wong 2015-09-01 8:55 ` [PATCH 06/11] implement per-thread Atom feeds Eric Wong 2015-09-01 9:30 ` [13/11 PATCH] feed: fix <updated> tag in Atom feed Eric Wong 2015-09-01 8:55 ` [PATCH 07/11] www: compile mbox regexp only once Eric Wong 2015-09-01 8:55 ` [PATCH 08/11] www: root atom feed is "new.atom" and not "atom.xml" Eric Wong 2015-09-01 8:55 ` [PATCH 09/11] completely revamp URL structure to shorten permalinks Eric Wong 2015-09-01 8:55 ` [PATCH 10/11] view: drop extra '</a>' tag Eric Wong 2015-09-01 8:55 ` [PATCH 11/11] view: more robust link generation Eric Wong 2015-09-01 9:08 ` [PATCH 12/11] view: add missing space Eric Wong
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox; as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).