From 39fd259ac1aa126ca311c687041baa5568cfdc09 Mon Sep 17 00:00:00 2001 From: Vivien Kraus Date: Fri, 18 Jun 2021 11:57:13 +0200 Subject: [PATCH] Let uri-decode handle more cases with unicode input. Unicode characters are allowed in URIs. However, since Guile URIs are not automatically UTF-8, it is possible that the decoding fails in two different ways: 1. a character in the URI cannot be part of the specified encoding, for instance a non-trivial Unicode character with ISO-8859-1; 2. the percent-encoding sequence cannot be decoded, for instance %FF alone in a UTF-8 URI. --- doc/ref/web.texi | 6 +++++- module/web/uri.scm | 27 +++++++++++++++++++++++---- test-suite/tests/web-uri.test | 30 ++++++++++++++++++++++++++++++ 3 files changed, 58 insertions(+), 5 deletions(-) diff --git a/doc/ref/web.texi b/doc/ref/web.texi index 93cd0214f..108604781 100644 --- a/doc/ref/web.texi +++ b/doc/ref/web.texi @@ -243,7 +243,7 @@ resulting string will omit the fragment (if any). Declare a default port for the given URI scheme. @end deffn -@deffn {Scheme Procedure} uri-decode str [#:encoding=@code{"utf-8"}] [#:decode-plus-to-space? #t] +@deffn {Scheme Procedure} uri-decode str [#:encoding=@code{"utf-8"}] [#:decode-plus-to-space? #t] [#:allow-unescaped-unicode? #t] Percent-decode the given @var{str}, according to @var{encoding}, which should be the name of a character encoding. @@ -265,6 +265,10 @@ replace instances of the plus character @samp{+} with a space character. This is needed when parsing @code{application/x-www-form-urlencoded} data. +If @var{allow-unescaped-unicode?} is true, which is also the default, accept +unescaped unicode characters in the input URI, as if it were an IRI (RFC +3987). + Returns a string of the decoded characters, or a bytevector if @var{encoding} was @code{#f}. @end deffn diff --git a/module/web/uri.scm b/module/web/uri.scm index 8e0b9bee7..a0e85725e 100644 --- a/module/web/uri.scm +++ b/module/web/uri.scm @@ -378,7 +378,7 @@ serialization." 
(define hex-chars (string->char-set "0123456789abcdefABCDEF")) -(define* (uri-decode str #:key (encoding "utf-8") (decode-plus-to-space? #t)) +(define* (uri-decode str #:key (encoding "utf-8") (decode-plus-to-space? #t) (allow-unescaped-unicode? #t)) "Percent-decode the given STR, according to ENCODING, which should be the name of a character encoding. @@ -398,6 +398,10 @@ If DECODE-PLUS-TO-SPACE? is true, which is the default, also replace instances of the plus character (+) with a space character. This is needed when parsing application/x-www-form-urlencoded data. +If ALLOW-UNESCAPED-UNICODE? is true, which is also the default, accept +unescaped unicode characters in the input URI, as if it were an IRI (RFC +3987). + Returns a string of the decoded characters, or a bytevector if ENCODING was ‘#f’." (let* ((len (string-length str)) @@ -424,10 +428,25 @@ ENCODING was ‘#f’." (put-u8 port (char->integer ch)) (lp (1+ i))) (else - (uri-error "Invalid character in encoded URI ~a: ~s" - str ch)))))))))) + (unless allow-unescaped-unicode? + (uri-error "Invalid character in encoded URI ~a: ~s" + str ch)) + (let ((encoded + (catch 'encoding-error + (lambda () + (string->bytevector (string ch) encoding)) + (lambda error + (uri-error "Invalid character in encoded URI ~a: ~s" + str ch))))) + (put-bytevector port encoded) + (lp (1+ i)))))))))))) (if encoding - (bytevector->string bv encoding) + (catch 'decoding-error + (lambda () + (string-normalize-nfc ;; for plain URIs, this does nothing + (bytevector->string bv encoding))) + (lambda error + (uri-error "Invalid character in encoded URI ~a" str))) ;; Otherwise return raw bytevector bv))) diff --git a/test-suite/tests/web-uri.test b/test-suite/tests/web-uri.test index 95fd82f16..81206ec54 100644 --- a/test-suite/tests/web-uri.test +++ b/test-suite/tests/web-uri.test @@ -686,6 +686,36 @@ (pass-if "foo+bar" (equal? "foo bar" (uri-decode "foo+bar"))) + (pass-if "foo✫bar" + (equal? 
"foo✫bar" (uri-decode "foo✫bar"))) + + (pass-if-uri-exception "foo✫bar in non-utf8 encoding" + "Invalid character in encoded URI ~a: ~s" + (uri-decode "foo✫bar" #:encoding "ISO-8859-1")) + + (pass-if-uri-exception "foo%FFbar in utf-8" + "Invalid character in encoded URI ~a" + (uri-decode "foo%FFbar in utf-8")) + + (pass-if-uri-exception "foo✫bar, but unicode MUST be escaped" + "Invalid character in encoded URI ~a: ~s" + (uri-decode "foo✫bar" #:allow-unescaped-unicode? #f)) + + ;; This normalization example uses the ô decomposition in + ;; https://www.unicode.org/reports/tr15/#Norm_Forms + (let ((non-normal (list->string + (list #\f #\o + #\o (integer->char (string->number "0302" 16))))) + (normal (list->string + (list #\f #\o + (integer->char (string->number "00F4" 16)))))) + (pass-if "extended URI decode: the strings are different" + (not (equal? non-normal normal))) + (pass-if "extended URI decode: one is the NFC of the other" + (equal? (string-normalize-nfc non-normal) normal)) + (pass-if "extended URI decode: NFC is performed correctly" + (equal? normal (uri-decode non-normal)))) + (pass-if "foo+bar" (equal? '("foo+bar") (split-and-decode-uri-path "foo+bar")))) -- 2.32.0