unofficial mirror of meta@public-inbox.org
 help / color / mirror / Atom feed
* [PATCH 0/3] www: addr2urlmap fixes
@ 2024-09-13 22:07 Eric Wong
  2024-09-13 22:07 ` [PATCH 1/3] view: addr2urlmap matches HTML-escaped addresses, only Eric Wong
                   ` (2 more replies)
  0 siblings, 3 replies; 4+ messages in thread
From: Eric Wong @ 2024-09-13 22:07 UTC (permalink / raw)
  To: meta

All tested manually by now; automated tests pending, but writing
PSGI tests is tedious and slow :<

I've been running 1 and 2 for a few days, at least

Eric Wong (3):
  view: addr2urlmap matches HTML-escaped addresses, only
  view: fix addr2urlmap with Plack::Builder::mount
  view: disable address URL-fication of possible HTML escapes

 lib/PublicInbox/Feed.pm       |  3 ++
 lib/PublicInbox/SearchView.pm |  1 +
 lib/PublicInbox/View.pm       | 65 ++++++++++++++++++++---------------
 3 files changed, 41 insertions(+), 28 deletions(-)

^ permalink raw reply	[flat|nested] 4+ messages in thread

* [PATCH 1/3] view: addr2urlmap matches HTML-escaped addresses, only
  2024-09-13 22:07 [PATCH 0/3] www: addr2urlmap fixes Eric Wong
@ 2024-09-13 22:07 ` Eric Wong
  2024-09-13 22:07 ` [PATCH 2/3] view: fix addr2urlmap with Plack::Builder::mount Eric Wong
  2024-09-13 22:07 ` [PATCH 3/3] view: disable address URL-fication of possible HTML escapes Eric Wong
  2 siblings, 0 replies; 4+ messages in thread
From: Eric Wong @ 2024-09-13 22:07 UTC (permalink / raw)
  To: meta

Some odd email addresses may require HTML escaping, and
the resulting regexp is run on escaped HTML.
---
 lib/PublicInbox/View.pm | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/lib/PublicInbox/View.pm b/lib/PublicInbox/View.pm
index 37c910f8..19f4168e 100644
--- a/lib/PublicInbox/View.pm
+++ b/lib/PublicInbox/View.pm
@@ -196,7 +196,8 @@ sub addr2urlmap ($) {
 		my (%addr2url, $url);
 		while (my ($addr, $ibx) = each %$by_addr) {
 			$url = $ibx->base_url // $ibx->base_url($ctx->{env});
-			$addr2url{$addr} = ascii_html($url) if defined $url;
+			$addr2url{ascii_html($addr)} = ascii_html($url) if
+				defined $url;
 		}
 		# don't allow attackers to randomly change Host: headers
 		# and OOM us if the server handles all hostnames:

^ permalink raw reply related	[flat|nested] 4+ messages in thread

* [PATCH 2/3] view: fix addr2urlmap with Plack::Builder::mount
  2024-09-13 22:07 [PATCH 0/3] www: addr2urlmap fixes Eric Wong
  2024-09-13 22:07 ` [PATCH 1/3] view: addr2urlmap matches HTML-escaped addresses, only Eric Wong
@ 2024-09-13 22:07 ` Eric Wong
  2024-09-13 22:07 ` [PATCH 3/3] view: disable address URL-fication of possible HTML escapes Eric Wong
  2 siblings, 0 replies; 4+ messages in thread
From: Eric Wong @ 2024-09-13 22:07 UTC (permalink / raw)
  To: meta

Plack::App::URLMap does not preserve SCRIPT_NAME set for PSGI
`mount' directives when running response callbacks.  Thus we
must get $ibx->base_url($ctx->{env}) calls to generate correct
full URLs when relying on publicinbox.nameIsUrl up front before
the PSGI response callback is returned.
---
 lib/PublicInbox/Feed.pm       |  3 ++
 lib/PublicInbox/SearchView.pm |  1 +
 lib/PublicInbox/View.pm       | 61 ++++++++++++++++++-----------------
 3 files changed, 36 insertions(+), 29 deletions(-)

diff --git a/lib/PublicInbox/Feed.pm b/lib/PublicInbox/Feed.pm
index a1c1d4d6..b6e4267f 100644
--- a/lib/PublicInbox/Feed.pm
+++ b/lib/PublicInbox/Feed.pm
@@ -19,6 +19,7 @@ sub generate {
 	my ($ctx) = @_;
 	my $msgs = $ctx->{msgs} = recent_msgs($ctx);
 	return _no_thread() unless @$msgs;
+	PublicInbox::View::addr2urlmap $ctx;
 	PublicInbox::WwwAtomStream->response($ctx, \&generate_i);
 }
 
@@ -26,6 +27,7 @@ sub generate_thread_atom {
 	my ($ctx) = @_;
 	my $msgs = $ctx->{msgs} = $ctx->{ibx}->over->get_thread($ctx->{mid});
 	return _no_thread() unless @$msgs;
+	PublicInbox::View::addr2urlmap $ctx;
 	PublicInbox::WwwAtomStream->response($ctx, \&generate_i);
 }
 
@@ -69,6 +71,7 @@ sub new_html {
 	$ctx->{-upfx} = '';
 	$ctx->{-spfx} = '' if $ctx->{ibx}->{coderepo};
 	$ctx->{-hr} = 1;
+	PublicInbox::View::addr2urlmap $ctx;
 	PublicInbox::WwwStream::aresponse($ctx, \&new_html_i);
 }
 
diff --git a/lib/PublicInbox/SearchView.pm b/lib/PublicInbox/SearchView.pm
index 3729c27b..1213b3d3 100644
--- a/lib/PublicInbox/SearchView.pm
+++ b/lib/PublicInbox/SearchView.pm
@@ -33,6 +33,7 @@ sub sres_top_html {
 	my $srch = $ctx->{srch} = $ctx->{ibx}->isrch or
 		return PublicInbox::WWW::need($ctx, 'Search');
 	my $q = PublicInbox::SearchQuery->new($ctx->{qp});
+	PublicInbox::View::addr2urlmap $ctx if $q->{x} eq 't';
 	my $o = $q->{o} // 0;
 	my $asc;
 	if ($o < 0) {
diff --git a/lib/PublicInbox/View.pm b/lib/PublicInbox/View.pm
index 19f4168e..75387dce 100644
--- a/lib/PublicInbox/View.pm
+++ b/lib/PublicInbox/View.pm
@@ -64,10 +64,39 @@ sub no_over_html ($) {
 	$ctx->html_done;
 }
 
+sub addr2urlmap ($) {
+	my ($ctx) = @_;
+	# cache makes a huge difference with /[tT] and large threads
+	my $key = PublicInbox::Git::host_prefix_url($ctx->{env}, '');
+	my $ent = $ctx->{www}->{pi_cfg}->{-addr2urlmap}->{$key} // do {
+		my $by_addr = $ctx->{www}->{pi_cfg}->{-by_addr};
+		my (%addr2url, $url);
+		while (my ($addr, $ibx) = each %$by_addr) {
+			$url = $ibx->base_url // $ibx->base_url($ctx->{env});
+			$addr2url{ascii_html($addr)} = ascii_html($url) if
+				defined $url
+		}
+		# don't allow attackers to randomly change Host: headers
+		# and OOM us if the server handles all hostnames:
+		my $tmp = $ctx->{www}->{pi_cfg}->{-addr2urlmap};
+		my @k = keys %$tmp; # random order
+		delete @$tmp{@k[0..3]} if scalar(@k) > 7;
+		if (scalar keys %addr2url) {
+			my $re = join('|', map { quotemeta } keys %addr2url);
+			$tmp->{$key} = [ qr/\b($re)\b/i, \%addr2url ];
+		} else { # nothing? NUL should never match:
+			[ qr/(\0)/, { "\0" => './' } ];
+		}
+	};
+	$ctx->{-addr2urlmap} = $ent;
+}
+
 # public functions: (unstable)
 
+# GET /$INBOX/$MSGID/ (single message page)
 sub msg_page {
 	my ($ctx) = @_;
+	addr2urlmap $ctx;
 	my $ibx = $ctx->{ibx};
 	$ctx->{-obfs_ibx} = $ibx->{obfuscate} ? $ibx : undef;
 	my $over = $ibx->over or return no_over_html($ctx);
@@ -187,38 +216,10 @@ sub nr_to_s ($$$) {
 	$nr == 1 ? "$nr $singular" : "$nr $plural";
 }
 
-sub addr2urlmap ($) {
-	my ($ctx) = @_;
-	# cache makes a huge difference with /[tT] and large threads
-	my $key = PublicInbox::Git::host_prefix_url($ctx->{env}, '');
-	my $ent = $ctx->{www}->{pi_cfg}->{-addr2urlmap}->{$key} // do {
-		my $by_addr = $ctx->{www}->{pi_cfg}->{-by_addr};
-		my (%addr2url, $url);
-		while (my ($addr, $ibx) = each %$by_addr) {
-			$url = $ibx->base_url // $ibx->base_url($ctx->{env});
-			$addr2url{ascii_html($addr)} = ascii_html($url) if
-				defined $url;
-		}
-		# don't allow attackers to randomly change Host: headers
-		# and OOM us if the server handles all hostnames:
-		my $tmp = $ctx->{www}->{pi_cfg}->{-addr2urlmap};
-		my @k = keys %$tmp; # random order
-		delete @$tmp{@k[0..3]} if scalar(@k) > 7;
-		if (scalar keys %addr2url) {
-			my $re = join('|', map { quotemeta } keys %addr2url);
-			$tmp->{$key} = [ qr/\b($re)\b/i, \%addr2url ];
-		} else { # nothing? NUL should never match:
-			[ qr/(\0)/, { "\0" => './' } ];
-		}
-	};
-	@$ent;
-}
-
 # called by /$INBOX/$MSGID/[tT]/
 sub to_cc_html ($$$$) {
 	my ($ctx, $eml, $field, $t) = @_;
 	my @vals = $eml->header($field) or return ('', 0);
-	my (undef, $addr2url) = addr2urlmap($ctx);
 	my $pairs = PublicInbox::Address::pairs(join(', ', @vals));
 	my ($len, $line_len, $html) = (0, 0, '');
 	my ($pair, $url);
@@ -227,6 +228,7 @@ sub to_cc_html ($$$$) {
 	my @html = split /\n/, ascii_html(join("\n", map {
 		$_->[0] // (split(/\@/, $_->[1]))[0]; # addr user if no name
 	} @$pairs));
+	my (undef, $addr2url) = @{$ctx->{-addr2urlmap}};
 	for my $n (@html) {
 		$pair = shift @$pairs;
 		if ($line_len) { # 9 = display width of ",\t":
@@ -552,6 +554,7 @@ EOM
 	walk_thread($rootset, $ctx, \&pre_thread); # pushes to ctx->{skel}
 
 	push @{$ctx->{skel}}, '</pre>';
+	addr2urlmap $ctx;
 	return stream_thread($rootset, $ctx) unless $ctx->{flat};
 
 	# flat display: lazy load the full message from smsg
@@ -724,7 +727,7 @@ href="d/">diff</a>)</pre><pre>];
 	$hbuf .= "Date: $_\n" for $eml->header('Date');
 	$hbuf = ascii_html($hbuf);
 	my $t = $ts ? '?t='.ts2str($ts) : '';
-	my ($re, $addr2url) = addr2urlmap($ctx);
+	my ($re, $addr2url) = @{$ctx->{-addr2urlmap}};
 	# $url is relative to /$INBOX/$MSGID/
 	$hbuf =~ s#$re#
 		my ($addr, $url) = ($1, $addr2url->{lc $1});

^ permalink raw reply related	[flat|nested] 4+ messages in thread

* [PATCH 3/3] view: disable address URL-fication of possible HTML escapes
  2024-09-13 22:07 [PATCH 0/3] www: addr2urlmap fixes Eric Wong
  2024-09-13 22:07 ` [PATCH 1/3] view: addr2urlmap matches HTML-escaped addresses, only Eric Wong
  2024-09-13 22:07 ` [PATCH 2/3] view: fix addr2urlmap with Plack::Builder::mount Eric Wong
@ 2024-09-13 22:07 ` Eric Wong
  2 siblings, 0 replies; 4+ messages in thread
From: Eric Wong @ 2024-09-13 22:07 UTC (permalink / raw)
  To: meta

In case somebody uses local email address of `lt' or `gt' (with
no domain component, or something matching /#\d+/a), disable
URL-fication of such addresses to prevent breaking HTML output.

Somebody with better Perl regexp knowledge than I can attempt to
write a regexp which functions like \b but avoids matching `&'
to allow such local email addresses.  But I suspect the use of
local-only email addresses to be limited and this isn't a real
problem in practice.
---
 lib/PublicInbox/View.pm | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/lib/PublicInbox/View.pm b/lib/PublicInbox/View.pm
index 75387dce..5d474292 100644
--- a/lib/PublicInbox/View.pm
+++ b/lib/PublicInbox/View.pm
@@ -72,6 +72,9 @@ sub addr2urlmap ($) {
 		my $by_addr = $ctx->{www}->{pi_cfg}->{-by_addr};
 		my (%addr2url, $url);
 		while (my ($addr, $ibx) = each %$by_addr) {
+			# FIXME: use negative look(behind|ahead) in s// for
+			# `&' and `;' to make them not match \b
+			next if $addr =~ /\A(?:gt|lt|#[0-9]+)\z/;
 			$url = $ibx->base_url // $ibx->base_url($ctx->{env});
 			$addr2url{ascii_html($addr)} = ascii_html($url) if
 				defined $url
@@ -83,6 +86,8 @@ sub addr2urlmap ($) {
 		delete @$tmp{@k[0..3]} if scalar(@k) > 7;
 		if (scalar keys %addr2url) {
 			my $re = join('|', map { quotemeta } keys %addr2url);
+			# FIXME: fix this regexp to allow `lt' and `gt' as
+			# local email addresses:
 			$tmp->{$key} = [ qr/\b($re)\b/i, \%addr2url ];
 		} else { # nothing? NUL should never match:
 			[ qr/(\0)/, { "\0" => './' } ];

^ permalink raw reply related	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2024-09-13 22:07 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2024-09-13 22:07 [PATCH 0/3] www: addr2urlmap fixes Eric Wong
2024-09-13 22:07 ` [PATCH 1/3] view: addr2urlmap matches HTML-escaped addresses, only Eric Wong
2024-09-13 22:07 ` [PATCH 2/3] view: fix addr2urlmap with Plack::Builder::mount Eric Wong
2024-09-13 22:07 ` [PATCH 3/3] view: disable address URL-fication of possible HTML escapes Eric Wong

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).