unofficial mirror of meta@public-inbox.org
 help / color / mirror / Atom feed
* [PATCH] viewvcs: handle non-UTF-8 commit message
@ 2023-02-21 11:17 Eric Wong
  0 siblings, 0 replies; only message in thread
From: Eric Wong @ 2023-02-21 11:17 UTC (permalink / raw)
  To: meta

Back in the old days, git didn't store commit encodings
and allowed messages in various encodings to enter history.
Assuming such a commit is UTF-8 trips up s/// operations
on buffers read with the `:utf8' PerlIO layer.  So clear
Perl's internal UTF-8 flag if we end up with something
which isn't valid UTF-8.

An example is commit 7eb93c89651c47c8095d476251f2e4314656b292
in git.git ([PATCH] Simplify git script, 2005-09-07).
---
 lib/PublicInbox/ViewVCS.pm |  4 +++-
 t/solver_git.t             | 40 +++++++++++++++++++++++++++++++++++---
 xt/solver.t                |  1 +
 3 files changed, 41 insertions(+), 4 deletions(-)

diff --git a/lib/PublicInbox/ViewVCS.pm b/lib/PublicInbox/ViewVCS.pm
index 0fb77c06..964b7345 100644
--- a/lib/PublicInbox/ViewVCS.pm
+++ b/lib/PublicInbox/ViewVCS.pm
@@ -157,9 +157,11 @@ sub show_commit_start { # ->psgi_qx callback
 	}
 	my $patchid = (split(/ /, $$bref))[0]; # ignore commit
 	$ctx->{-q_value_html} = "patchid:$patchid" if defined $patchid;
-	open my $fh, '<:utf8', "$ctx->{-tmp}/h" or
+	open my $fh, '<', "$ctx->{-tmp}/h" or
 		die "open $ctx->{-tmp}/h: $!";
 	chop(my $buf = do { local $/ = "\0"; <$fh> });
+	utf8::decode($buf);
+	utf8::valid($buf) or utf8::encode($buf); # non-UTF-8 commits exist
 	chomp $buf;
 	my ($P, $p);
 	($P, $p, @{$ctx->{cmt_info}}) = split(/\n/, $buf, 9);
diff --git a/t/solver_git.t b/t/solver_git.t
index c65d9785..e8d9feb9 100644
--- a/t/solver_git.t
+++ b/t/solver_git.t
@@ -218,14 +218,13 @@ SKIP: {
 	my %oid; # (small|big) => OID
 	my $lk = bless { lock_path => $l }, 'PublicInbox::Lock';
 	my $acq = $lk->lock_for_scope;
-	my $stamp = "$binfoo/stamp";
+	my $stamp = "$binfoo/stamp-";
 	if (open my $fh, '<', $stamp) {
 		%oid = map { chomp; split(/=/, $_) } (<$fh>);
 	} else {
 		PublicInbox::Import::init_bare($binfoo);
 		my $cmd = [ qw(git hash-object -w --stdin) ];
 		my $env = { GIT_DIR => $binfoo };
-		open my $fh, '>', "$stamp.$$" or BAIL_OUT;
 		while (my ($label, $size) = each %bin) {
 			pipe(my ($rin, $win)) or BAIL_OUT;
 			my $rout = popen_rd($cmd , $env, { 0 => $rin });
@@ -234,9 +233,33 @@ SKIP: {
 			close $win or BAIL_OUT;
 			chomp(my $x = <$rout>);
 			close $rout or BAIL_OUT "$?";
-			print $fh "$label=$x\n" or BAIL_OUT;
 			$oid{$label} = $x;
 		}
+
+		open my $null, '<', '/dev/null' or xbail "open /dev/null: $!";
+		my $t = xqx([qw(git mktree)], $env, { 0 => $null });
+		xbail "mktree: $?" if $?;
+		chomp($t);
+		my $non_utf8 = "K\x{e5}g";
+		$env->{GIT_AUTHOR_NAME} = $non_utf8;
+		$env->{GIT_AUTHOR_EMAIL} = 'e@example.com';
+		$env->{GIT_COMMITTER_NAME} = $env->{GIT_AUTHOR_NAME};
+		$env->{GIT_COMMITTER_EMAIL} = $env->{GIT_AUTHOR_EMAIL};
+		my $in = \"$non_utf8\n\nK\x{e5}g\n";
+		my $c = xqx([qw(git commit-tree), $t], $env, { 0 => $in });
+		xbail "commit-tree: $?" if $?;
+		chomp($c);
+		$oid{'iso-8859-1'} = $c;
+
+		$c = xqx([qw(git commit-tree -p), $c, $t], $env, { 0 => $in });
+		xbail "commit-tree: $?" if $?;
+		chomp($c);
+		$oid{'8859-parent'} = $c;
+
+		open my $fh, '>', "$stamp.$$" or BAIL_OUT;
+		while (my ($k, $v) = each %oid) {
+			print $fh "$k=$v\n" or xbail "print: $!";
+		}
 		close $fh or BAIL_OUT;
 		rename("$stamp.$$", $stamp) or BAIL_OUT;
 	}
@@ -331,6 +354,17 @@ EOF
 			open STDERR, '>&', $olderr or xbail "open: $!";
 		is($res->code, 200, 'coderepo summary (binfoo)');
 		ok(!-s "$tmpdir/stderr.log");
+
+		$res = $cb->(GET("/binfoo/$oid{'iso-8859-1'}/s/"));
+		is($res->code, 200, 'ISO-8859-1 commit');
+		like($res->content, qr/K&#229;g/, 'ISO-8859-1 commit message');
+		ok(!-s "$tmpdir/stderr.log", 'nothing in stderr');
+
+		$res = $cb->(GET("/binfoo/$oid{'8859-parent'}/s/"));
+		is($res->code, 200, 'commit w/ ISO-8859-parent');
+		like($res->content, qr/K&#229;g/, 'ISO-8859-1 commit message');
+		ok(!-s "$tmpdir/stderr.log", 'nothing in stderr');
+
 		$res = $cb->(GET('/public-inbox/'));
 		is($res->code, 200, 'coderepo summary (public-inbox)');
 
diff --git a/xt/solver.t b/xt/solver.t
index 1b0af3d8..1f004bf5 100644
--- a/xt/solver.t
+++ b/xt/solver.t
@@ -30,6 +30,7 @@ my $todo = {
 		'96f1c7f/s/', # TODO: b=contrib/completion/git-completion.bash
 		'b76f2c0/s/?b=po/zh_CN.po',
 		'c2f3bf071ee90b01f2d629921bb04c4f798f02fa/s/', # tag
+		'7eb93c89651c47c8095d476251f2e4314656b292/s/', # non-UTF-8
 	],
 };
 

^ permalink raw reply related	[flat|nested] only message in thread

only message in thread, other threads:[~2023-02-21 11:17 UTC | newest]

Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-02-21 11:17 [PATCH] viewvcs: handle non-UTF-8 commit message Eric Wong

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).