unofficial mirror of meta@public-inbox.org
 help / color / mirror / Atom feed
* [PATCH 0/2] linkification improvements
@ 2016-03-01  3:50 Eric Wong
  2016-03-01  3:50 ` [PATCH 1/2] extract linkification code to a separate package Eric Wong
  2016-03-01  3:50 ` [PATCH 2/2] linkify: do not capture trailing '.' or ';' in URLs Eric Wong
  0 siblings, 2 replies; 3+ messages in thread
From: Eric Wong @ 2016-03-01  3:50 UTC (permalink / raw)
  To: meta

We'll be reusing the linkification code in repobrowse :)

Eric Wong (2):
      extract linkification code to a separate package
      linkify: do not capture trailing '.' or ';' in URLs

 MANIFEST                   |  2 ++
 lib/PublicInbox/Linkify.pm | 65 ++++++++++++++++++++++++++++++++++++++++++++++
 lib/PublicInbox/View.pm    | 58 ++++++++---------------------------------
 t/linkify.t                | 26 +++++++++++++++++++
 4 files changed, 104 insertions(+), 47 deletions(-)
 create mode 100644 lib/PublicInbox/Linkify.pm
 create mode 100644 t/linkify.t

^ permalink raw reply	[flat|nested] 3+ messages in thread

* [PATCH 1/2] extract linkification code to a separate package
  2016-03-01  3:50 [PATCH 0/2] linkification improvements Eric Wong
@ 2016-03-01  3:50 ` Eric Wong
  2016-03-01  3:50 ` [PATCH 2/2] linkify: do not capture trailing '.' or ';' in URLs Eric Wong
  1 sibling, 0 replies; 3+ messages in thread
From: Eric Wong @ 2016-03-01  3:50 UTC (permalink / raw)
  To: meta

This will allow us to more easily reuse it elsewhere.
---
 MANIFEST                   |  1 +
 lib/PublicInbox/Linkify.pm | 57 +++++++++++++++++++++++++++++++++++++++++++++
 lib/PublicInbox/View.pm    | 58 +++++++++-------------------------------------
 3 files changed, 69 insertions(+), 47 deletions(-)
 create mode 100644 lib/PublicInbox/Linkify.pm

diff --git a/MANIFEST b/MANIFEST
index 857a3a7..5d790f9 100644
--- a/MANIFEST
+++ b/MANIFEST
@@ -30,6 +30,7 @@ lib/PublicInbox/Git.pm
 lib/PublicInbox/GitHTTPBackend.pm
 lib/PublicInbox/HTTP.pm
 lib/PublicInbox/Hval.pm
+lib/PublicInbox/Linkify.pm
 lib/PublicInbox/Listener.pm
 lib/PublicInbox/MDA.pm
 lib/PublicInbox/MID.pm
diff --git a/lib/PublicInbox/Linkify.pm b/lib/PublicInbox/Linkify.pm
new file mode 100644
index 0000000..8f634f4
--- /dev/null
+++ b/lib/PublicInbox/Linkify.pm
@@ -0,0 +1,57 @@
+# Copyright (C) 2014-2016 all contributors <meta@public-inbox.org>
+# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
+
+# two-step linkification.
+# intended usage is in the following order:
+#
+#   linkify_1
+#   <escape unsafe chars for HTML>
+#   linkify_2
+#
+# Maybe this could be done more efficiently...
+package PublicInbox::Linkify;
+use strict;
+use warnings;
+use Digest::SHA qw/sha1_hex/;
+
+my $SALT = rand;
+my $LINK_RE = qr!\b((?:ftp|https?|nntp)://
+		 [\@:\w\.-]+/
+		 ?[\@\w\+\&\?\.\%\;/#=-]*)!x;
+
+sub new { bless {}, shift }
+
+sub linkify_1 {
+	my ($self, $s) = @_;
+	$s =~ s!$LINK_RE!
+		my $url = $1;
+		# salt this, as this could be exploited to show
+		# links in the HTML which don't show up in the raw mail.
+		my $key = sha1_hex($url . $SALT);
+
+		# only escape ampersands, others do not match LINK_RE
+		$url =~ s/&/&#38;/g;
+		$self->{$key} = $url;
+		'PI-LINK-'. $key;
+	!ge;
+	$s;
+}
+
+sub linkify_2 {
+	my ($self, $s) = @_;
+
+	# Added "PI-LINK-" prefix to avoid false-positives on git commits
+	$s =~ s!\bPI-LINK-([a-f0-9]{40})\b!
+		my $key = $1;
+		my $url = $self->{$key};
+		if (defined $url) {
+			"<a\nhref=\"$url\">$url</a>";
+		} else {
+			# false positive or somebody tried to mess with us
+			$key;
+		}
+	!ge;
+	$s;
+}
+
+1;
diff --git a/lib/PublicInbox/View.pm b/lib/PublicInbox/View.pm
index 61eb890..4692b22 100644
--- a/lib/PublicInbox/View.pm
+++ b/lib/PublicInbox/View.pm
@@ -12,9 +12,8 @@ use Encode qw/find_encoding/;
 use Encode::MIME::Header;
 use Email::MIME::ContentType qw/parse_content_type/;
 use PublicInbox::Hval;
+use PublicInbox::Linkify;
 use PublicInbox::MID qw/mid_clean id_compress mid2path/;
-use Digest::SHA qw/sha1_hex/;
-my $SALT = rand;
 require POSIX;
 
 # TODO: make these constants tunable
@@ -302,41 +301,6 @@ sub add_filename_line {
 	"$pad " . ascii_html($fn) . " $pad\n";
 }
 
-my $LINK_RE = qr!\b((?:ftp|https?|nntp)://
-		 [\@:\w\.-]+/
-		 ?[\@\w\+\&\?\.\%\;/#=-]*)!x;
-
-sub linkify_1 {
-	my ($link_map, $s) = @_;
-	$s =~ s!$LINK_RE!
-		my $url = $1;
-		# salt this, as this could be exploited to show
-		# links in the HTML which don't show up in the raw mail.
-		my $key = sha1_hex($url . $SALT);
-		$link_map->{$key} = $url;
-		'PI-LINK-'. $key;
-	!ge;
-	$s;
-}
-
-sub linkify_2 {
-	my ($link_map, $s) = @_;
-
-	# Added "PI-LINK-" prefix to avoid false-positives on git commits
-	$s =~ s!\bPI-LINK-([a-f0-9]{40})\b!
-		my $key = $1;
-		my $url = $link_map->{$key};
-		if (defined $url) {
-			$url = ascii_html($url);
-			"<a\nhref=\"$url\">$url</a>";
-		} else {
-			# false positive or somebody tried to mess with us
-			$key;
-		}
-	!ge;
-	$s;
-}
-
 sub flush_quote {
 	my ($quot, $n, $part_nr, $full_pfx, $final, $do_anchor) = @_;
 
@@ -346,11 +310,11 @@ sub flush_quote {
 	if ($full_pfx) {
 		if (!$final && scalar(@$quot) <= MAX_INLINE_QUOTED) {
 			# show quote inline
-			my %l;
-			my $rv = join('', map { linkify_1(\%l, $_) } @$quot);
+			my $l = PublicInbox::Linkify->new;
+			my $rv = join('', map { $l->linkify_1($_) } @$quot);
 			@$quot = ();
 			$rv = ascii_html($rv);
-			return linkify_2(\%l, $rv);
+			return $l->linkify_2($rv);
 		}
 
 		# show a short snippet of quoted text and link to full version:
@@ -375,13 +339,13 @@ sub flush_quote {
 	} else {
 		# show everything in the full version with anchor from
 		# short version (see above)
-		my %l;
-		my $rv .= join('', map { linkify_1(\%l, $_) } @$quot);
+		my $l = PublicInbox::Linkify->new;
+		my $rv .= join('', map { $l->linkify_1($_) } @$quot);
 		@$quot = ();
 		$rv = ascii_html($rv);
-		return linkify_2(\%l, $rv) unless $do_anchor;
+		return $l->linkify_2($rv) unless $do_anchor;
 		my $nr = ++$$n;
-		"<a\nid=q${part_nr}_$nr></a>" . linkify_2(\%l, $rv);
+		"<a\nid=q${part_nr}_$nr></a>" . $l->linkify_2($rv);
 	}
 }
 
@@ -420,10 +384,10 @@ sub add_text_body {
 			}
 
 			# regular line, OK
-			my %l;
-			$cur = linkify_1(\%l, $cur);
+			my $l = PublicInbox::Linkify->new;
+			$cur = $l->linkify_1($cur);
 			$cur = ascii_html($cur);
-			$s .= linkify_2(\%l, $cur);
+			$s .= $l->linkify_2($cur);
 		} else {
 			push @quot, $cur;
 		}
-- 
EW


^ permalink raw reply related	[flat|nested] 3+ messages in thread

* [PATCH 2/2] linkify: do not capture trailing '.' or ';' in URLs
  2016-03-01  3:50 [PATCH 0/2] linkification improvements Eric Wong
  2016-03-01  3:50 ` [PATCH 1/2] extract linkification code to a separate package Eric Wong
@ 2016-03-01  3:50 ` Eric Wong
  1 sibling, 0 replies; 3+ messages in thread
From: Eric Wong @ 2016-03-01  3:50 UTC (permalink / raw)
  To: meta

It seems common for users to end statements with URLs,
while it is rare for a URL itself to end with a '.' or ';'.
So make a guess and assume the URL was intended to not
include the trailing '.' or ';'
---
 MANIFEST                   |  1 +
 lib/PublicInbox/Linkify.pm | 10 +++++++++-
 t/linkify.t                | 26 ++++++++++++++++++++++++++
 3 files changed, 36 insertions(+), 1 deletion(-)
 create mode 100644 t/linkify.t

diff --git a/MANIFEST b/MANIFEST
index 5d790f9..259f42c 100644
--- a/MANIFEST
+++ b/MANIFEST
@@ -80,6 +80,7 @@ t/httpd-corner.psgi
 t/httpd-corner.t
 t/httpd.t
 t/init.t
+t/linkify.t
 t/main-bin/spamc
 t/mda.t
 t/msgmap.t
diff --git a/lib/PublicInbox/Linkify.pm b/lib/PublicInbox/Linkify.pm
index 8f634f4..4eddedd 100644
--- a/lib/PublicInbox/Linkify.pm
+++ b/lib/PublicInbox/Linkify.pm
@@ -25,6 +25,14 @@ sub linkify_1 {
 	my ($self, $s) = @_;
 	$s =~ s!$LINK_RE!
 		my $url = $1;
+		my $end = '';
+
+		# it's fairly common to end URLs in messages with
+		# '.' or ';' to denote the end of a statement.
+		if ($url =~ s/(\.)\z// || $url =~ s/(;)\z//) {
+			$end = $1;
+		}
+
 		# salt this, as this could be exploited to show
 		# links in the HTML which don't show up in the raw mail.
 		my $key = sha1_hex($url . $SALT);
@@ -32,7 +40,7 @@ sub linkify_1 {
 		# only escape ampersands, others do not match LINK_RE
 		$url =~ s/&/&#38;/g;
 		$self->{$key} = $url;
-		'PI-LINK-'. $key;
+		'PI-LINK-'. $key . $end;
 	!ge;
 	$s;
 }
diff --git a/t/linkify.t b/t/linkify.t
new file mode 100644
index 0000000..586691a
--- /dev/null
+++ b/t/linkify.t
@@ -0,0 +1,26 @@
+# Copyright (C) 2016 all contributors <meta@public-inbox.org>
+# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
+use strict;
+use warnings;
+use Test::More;
+use PublicInbox::Linkify;
+
+{
+	my $l = PublicInbox::Linkify->new;
+	my $u = 'http://example.com/url-with-trailing-period';
+	my $s = $u . '.';
+	$s = $l->linkify_1($s);
+	$s = $l->linkify_2($s);
+	is($s, qq(<a\nhref="$u">$u</a>.), 'trailing period not in URL');
+}
+
+{
+	my $l = PublicInbox::Linkify->new;
+	my $u = 'http://example.com/url-with-trailing-semicolon';
+	my $s = $u . ';';
+	$s = $l->linkify_1($s);
+	$s = $l->linkify_2($s);
+	is($s, qq(<a\nhref="$u">$u</a>;), 'trailing semicolon not in URL');
+}
+
+done_testing();
-- 
EW


^ permalink raw reply related	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2016-03-01  3:50 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-03-01  3:50 [PATCH 0/2] linkification improvements Eric Wong
2016-03-01  3:50 ` [PATCH 1/2] extract linkification code to a separate package Eric Wong
2016-03-01  3:50 ` [PATCH 2/2] linkify: do not capture trailing '.' or ';' in URLs Eric Wong

This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox,
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).