unofficial mirror of meta@public-inbox.org
 help / color / mirror / Atom feed
From: Eric Wong <e@80x24.org>
To: meta@public-inbox.org
Subject: [PATCH] viewvcs: handle non-UTF-8 commit message
Date: Tue, 21 Feb 2023 11:17:58 +0000	[thread overview]
Message-ID: <20230221111758.3969926-1-e@80x24.org> (raw)

Back in the old days, git didn't store commit encodings
and allowed messages in various encodings to enter history.
Assuming such a commit is UTF-8 trips up s/// operations
on buffers read with the `:utf8' PerlIO layer.  So clear
Perl's internal UTF-8 flag if we end up with something
which isn't valid UTF-8.

An example is commit 7eb93c89651c47c8095d476251f2e4314656b292
in git.git ([PATCH] Simplify git script, 2005-09-07)
---
 lib/PublicInbox/ViewVCS.pm |  4 +++-
 t/solver_git.t             | 40 +++++++++++++++++++++++++++++++++++---
 xt/solver.t                |  1 +
 3 files changed, 41 insertions(+), 4 deletions(-)

diff --git a/lib/PublicInbox/ViewVCS.pm b/lib/PublicInbox/ViewVCS.pm
index 0fb77c06..964b7345 100644
--- a/lib/PublicInbox/ViewVCS.pm
+++ b/lib/PublicInbox/ViewVCS.pm
@@ -157,9 +157,11 @@ sub show_commit_start { # ->psgi_qx callback
 	}
 	my $patchid = (split(/ /, $$bref))[0]; # ignore commit
 	$ctx->{-q_value_html} = "patchid:$patchid" if defined $patchid;
-	open my $fh, '<:utf8', "$ctx->{-tmp}/h" or
+	open my $fh, '<', "$ctx->{-tmp}/h" or
 		die "open $ctx->{-tmp}/h: $!";
 	chop(my $buf = do { local $/ = "\0"; <$fh> });
+	utf8::decode($buf);
+	utf8::valid($buf) or utf8::encode($buf); # non-UTF-8 commits exist
 	chomp $buf;
 	my ($P, $p);
 	($P, $p, @{$ctx->{cmt_info}}) = split(/\n/, $buf, 9);
diff --git a/t/solver_git.t b/t/solver_git.t
index c65d9785..e8d9feb9 100644
--- a/t/solver_git.t
+++ b/t/solver_git.t
@@ -218,14 +218,13 @@ SKIP: {
 	my %oid; # (small|big) => OID
 	my $lk = bless { lock_path => $l }, 'PublicInbox::Lock';
 	my $acq = $lk->lock_for_scope;
-	my $stamp = "$binfoo/stamp";
+	my $stamp = "$binfoo/stamp-";
 	if (open my $fh, '<', $stamp) {
 		%oid = map { chomp; split(/=/, $_) } (<$fh>);
 	} else {
 		PublicInbox::Import::init_bare($binfoo);
 		my $cmd = [ qw(git hash-object -w --stdin) ];
 		my $env = { GIT_DIR => $binfoo };
-		open my $fh, '>', "$stamp.$$" or BAIL_OUT;
 		while (my ($label, $size) = each %bin) {
 			pipe(my ($rin, $win)) or BAIL_OUT;
 			my $rout = popen_rd($cmd , $env, { 0 => $rin });
@@ -234,9 +233,33 @@ SKIP: {
 			close $win or BAIL_OUT;
 			chomp(my $x = <$rout>);
 			close $rout or BAIL_OUT "$?";
-			print $fh "$label=$x\n" or BAIL_OUT;
 			$oid{$label} = $x;
 		}
+
+		open my $null, '<', '/dev/null' or xbail "open /dev/null: $!";
+		my $t = xqx([qw(git mktree)], $env, { 0 => $null });
+		xbail "mktree: $?" if $?;
+		chomp($t);
+		my $non_utf8 = "K\x{e5}g";
+		$env->{GIT_AUTHOR_NAME} = $non_utf8;
+		$env->{GIT_AUTHOR_EMAIL} = 'e@example.com';
+		$env->{GIT_COMMITTER_NAME} = $env->{GIT_AUTHOR_NAME};
+		$env->{GIT_COMMITTER_EMAIL} = $env->{GIT_AUTHOR_EMAIL};
+		my $in = \"$non_utf8\n\nK\x{e5}g\n";
+		my $c = xqx([qw(git commit-tree), $t], $env, { 0 => $in });
+		xbail "commit-tree: $?" if $?;
+		chomp($c);
+		$oid{'iso-8859-1'} = $c;
+
+		$c = xqx([qw(git commit-tree -p), $c, $t], $env, { 0 => $in });
+		xbail "commit-tree: $?" if $?;
+		chomp($c);
+		$oid{'8859-parent'} = $c;
+
+		open my $fh, '>', "$stamp.$$" or BAIL_OUT;
+		while (my ($k, $v) = each %oid) {
+			print $fh "$k=$v\n" or xbail "print: $!";
+		}
 		close $fh or BAIL_OUT;
 		rename("$stamp.$$", $stamp) or BAIL_OUT;
 	}
@@ -331,6 +354,17 @@ EOF
 			open STDERR, '>&', $olderr or xbail "open: $!";
 		is($res->code, 200, 'coderepo summary (binfoo)');
 		ok(!-s "$tmpdir/stderr.log");
+
+		$res = $cb->(GET("/binfoo/$oid{'iso-8859-1'}/s/"));
+		is($res->code, 200, 'ISO-8859-1 commit');
+		like($res->content, qr/K&#229;g/, 'ISO-8859-1 commit message');
+		ok(!-s "$tmpdir/stderr.log", 'nothing in stderr');
+
+		$res = $cb->(GET("/binfoo/$oid{'8859-parent'}/s/"));
+		is($res->code, 200, 'commit w/ ISO-8859-parent');
+		like($res->content, qr/K&#229;g/, 'ISO-8859-1 commit message');
+		ok(!-s "$tmpdir/stderr.log", 'nothing in stderr');
+
 		$res = $cb->(GET('/public-inbox/'));
 		is($res->code, 200, 'coderepo summary (public-inbox)');
 
diff --git a/xt/solver.t b/xt/solver.t
index 1b0af3d8..1f004bf5 100644
--- a/xt/solver.t
+++ b/xt/solver.t
@@ -30,6 +30,7 @@ my $todo = {
 		'96f1c7f/s/', # TODO: b=contrib/completion/git-completion.bash
 		'b76f2c0/s/?b=po/zh_CN.po',
 		'c2f3bf071ee90b01f2d629921bb04c4f798f02fa/s/', # tag
+		'7eb93c89651c47c8095d476251f2e4314656b292/s/', # non-UTF-8
 	],
 };
 

                 reply	other threads:[~2023-02-21 11:17 UTC|newest]

Thread overview: [no followups] expand[flat|nested]  mbox.gz  Atom feed

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

  List information: https://public-inbox.org/README

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20230221111758.3969926-1-e@80x24.org \
    --to=e@80x24.org \
    --cc=meta@public-inbox.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link.

Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox,
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).