From 7adb0e333624c617f7c95796053df60d0a504ec9 Fri Jun 18 11:57:13 2021 From: Vivien Kraus Date: Fri, 18 Jun 2021 11:57:13 +0200 Subject: [PATCH] Let uri-decode handle more cases with Unicode input. Unicode characters are allowed in URIs. However, since uri-decode does not assume UTF-8 (the encoding is a parameter), decoding can fail in two different ways: 1. a character in the URI cannot be represented in the specified encoding, for instance a Unicode character outside Latin-1 (such as ✫) with ISO-8859-1; 2. the percent-decoded bytes are not valid in the specified encoding, for instance a lone %FF in a UTF-8 URI, since the byte 0xFF never occurs in valid UTF-8. Both failures are now reported as uri-error, and non-ASCII characters that the encoding can represent are accepted. --- module/web/uri.scm | 17 ++++++++++++++--- test-suite/tests/web-uri.test | 11 +++++++++++ 2 files changed, 25 insertions(+), 3 deletions(-) diff --git a/module/web/uri.scm b/module/web/uri.scm index 8e0b9bee7..bb3573fab 100644 --- a/module/web/uri.scm +++ b/module/web/uri.scm @@ -424,10 +424,21 @@ ENCODING was ‘#f’." (put-u8 port (char->integer ch)) (lp (1+ i))) (else - (uri-error "Invalid character in encoded URI ~a: ~s" - str ch)))))))))) + (let ((encoded + (catch 'encoding-error + (lambda () + (string->bytevector (string ch) encoding)) + (lambda error + (uri-error "Invalid character in encoded URI ~a: ~s" + str ch))))) + (put-bytevector port encoded) + (lp (1+ i)))))))))))) (if encoding - (bytevector->string bv encoding) + (catch 'decoding-error + (lambda () + (bytevector->string bv encoding)) + (lambda error + (uri-error "Invalid character in encoded URI ~a" str))) ;; Otherwise return raw bytevector bv))) diff --git a/test-suite/tests/web-uri.test b/test-suite/tests/web-uri.test index 95fd82f16..dc1da79e6 100644 --- a/test-suite/tests/web-uri.test +++ b/test-suite/tests/web-uri.test @@ -686,6 +686,17 @@ (pass-if "foo+bar" (equal? "foo bar" (uri-decode "foo+bar"))) + (pass-if "foo✫bar" + (equal? 
"foo✫bar" (uri-decode "foo✫bar"))) + + (pass-if-uri-exception "foo✫bar in non-utf8 encoding" + "Invalid character in encoded URI ~a: ~s" + (uri-decode "foo✫bar" #:encoding "ISO-8859-1")) + + (pass-if-uri-exception "foo%FFbar in utf-8" + "Invalid character in encoded URI ~a" + (uri-decode "foo%FFbar in utf-8")) + (pass-if "foo+bar" (equal? '("foo+bar") (split-and-decode-uri-path "foo+bar")))) -- 2.32.0