unofficial mirror of bug-guix@gnu.org 
 help / color / mirror / code / Atom feed
From: Timothy Sample <samplet@ngyro.com>
To: "Ludovic Courtès" <ludo@gnu.org>
Cc: 35785@debbugs.gnu.org, Einar Largenius <einar.largenius@gmail.com>
Subject: bug#35785: ‘string->uri’ is locale-dependent and breaks in ‘sv_SE’
Date: Tue, 04 Jun 2019 09:56:39 -0400	[thread overview]
Message-ID: <87sgsp8ne0.fsf@ngyro.com> (raw)
In-Reply-To: <87imtlhk3k.fsf@gnu.org> ("Ludovic \=\?utf-8\?Q\?Court\=C3\=A8s\=22'\?\= \=\?utf-8\?Q\?s\?\= message of "Tue, 04 Jun 2019 09:42:55 +0200")

[-- Attachment #1: Type: text/plain, Size: 695 bytes --]

Hi,

Ludovic Courtès <ludo@gnu.org> writes:

> Timothy Sample <samplet@ngyro.com> skribis:
>
> [...]
>
>> I needed to reload the modules like that to make the tests fail without
>> the patch and pass with it.  My understanding is that the bug happens
>> at regex compile time, which happens when the module is loaded.  If I
>> don’t reload the module, the old URI code passes the tests, since the
>> regexes were compiled with a locale that does not trigger the bug.  It’s
>> a little wacky, sure, but it was the best idea I could come up with.
>
> Oooh, I see.  Could you add a comment to explain this?  Then we’re done.

Here it is!  I hope it is clear.


-- Tim


[-- Attachment #2: patch --]
[-- Type: text/x-patch, Size: 5865 bytes --]

From 9ac8643e5315d4baaddb93ee246ba8db0b3448ab Mon Sep 17 00:00:00 2001
From: Timothy Sample <samplet@ngyro.com>
Date: Sun, 2 Jun 2019 14:41:20 -0400
Subject: [PATCH] Make URI handling locale independent.

Fixes <https://bugs.gnu.org/35785>.

* module/web/uri.scm (digits, hex-digits, letters): New variables.
(ipv4-regexp, ipv6-regexp, domain-label-regexp, top-label-regexp,
userinfo-pat, host-pat, ipv6-host-pat, port-pat, scheme-pat): Explicitly
list each character instead of using character ranges.
* test-suite/tests/web-uri.test: Add corresponding tests.
---
 module/web/uri.scm            | 31 +++++++++++++++++++++----------
 test-suite/tests/web-uri.test | 33 ++++++++++++++++++++++++++++++---
 2 files changed, 51 insertions(+), 13 deletions(-)

diff --git a/module/web/uri.scm b/module/web/uri.scm
index 4c6fa5051..b4b89b9cc 100644
--- a/module/web/uri.scm
+++ b/module/web/uri.scm
@@ -1,6 +1,6 @@
 ;;;; (web uri) --- URI manipulation tools
 ;;;;
-;;;; Copyright (C) 1997,2001,2002,2010,2011,2012,2013,2014 Free Software Foundation, Inc.
+;;;; Copyright (C) 1997,2001,2002,2010,2011,2012,2013,2014,2019 Free Software Foundation, Inc.
 ;;;;
 ;;;; This library is free software; you can redistribute it and/or
 ;;;; modify it under the terms of the GNU Lesser General Public
@@ -175,17 +175,28 @@ for ‘build-uri’ except there is no scheme."
 ;;; Converters.
 ;;;
 
+;; Since character ranges in regular expressions may depend on the
+;; current locale, we use explicit lists of characters instead.  See
+;; <https://bugs.gnu.org/35785> for details.
+(define digits "0123456789")
+(define hex-digits "0123456789ABCDEFabcdef")
+(define letters "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz")
+
 ;; See RFC 3986 #3.2.2 for comments on percent-encodings, IDNA (RFC
 ;; 3490), and non-ASCII host names.
 ;;
 (define ipv4-regexp
-  (make-regexp "^([0-9.]+)$"))
+  (make-regexp (string-append "^([" digits ".]+)$")))
 (define ipv6-regexp
-  (make-regexp "^([0-9a-fA-F:.]+)$"))
+  (make-regexp (string-append "^([" hex-digits ":.]+)$")))
 (define domain-label-regexp
-  (make-regexp "^[a-zA-Z0-9]([a-zA-Z0-9-]*[a-zA-Z0-9])?$"))
+  (make-regexp
+   (string-append "^[" letters digits "]"
+                  "([" letters digits "-]*[" letters digits "])?$")))
 (define top-label-regexp
-  (make-regexp "^[a-zA-Z]([a-zA-Z0-9-]*[a-zA-Z0-9])?$"))
+  (make-regexp
+   (string-append "^[" letters "]"
+                  "([" letters digits "-]*[" letters digits "])?$")))
 
 (define (valid-host? host)
   (cond
@@ -203,13 +214,13 @@ for ‘build-uri’ except there is no scheme."
             (regexp-exec top-label-regexp host start)))))))
 
 (define userinfo-pat
-  "[a-zA-Z0-9_.!~*'();:&=+$,-]+")
+  (string-append "[" letters digits "_.!~*'();:&=+$,-]+"))
 (define host-pat
-  "[a-zA-Z0-9.-]+")
+  (string-append "[" letters digits ".-]+"))
 (define ipv6-host-pat
-  "[0-9a-fA-F:.]+")
+  (string-append "[" hex-digits ":.]+"))
 (define port-pat
-  "[0-9]*")
+  (string-append "[" digits "]*"))
 (define authority-regexp
   (make-regexp
    (format #f "^//((~a)@)?((~a)|(\\[(~a)\\]))(:(~a))?$"
@@ -246,7 +257,7 @@ for ‘build-uri’ except there is no scheme."
 ;;;   either.
 
 (define scheme-pat
-  "[a-zA-Z][a-zA-Z0-9+.-]*")
+  (string-append "[" letters "][" letters digits "+.-]*"))
 (define authority-pat
   "[^/?#]*")
 (define path-pat
diff --git a/test-suite/tests/web-uri.test b/test-suite/tests/web-uri.test
index 73391898c..94778acac 100644
--- a/test-suite/tests/web-uri.test
+++ b/test-suite/tests/web-uri.test
@@ -1,6 +1,6 @@
 ;;;; web-uri.test --- URI library          -*- mode: scheme; coding: utf-8; -*-
 ;;;;
-;;;; 	Copyright (C) 2010-2012, 2014, 2017 Free Software Foundation, Inc.
+;;;; 	Copyright (C) 2010-2012, 2014, 2017, 2019 Free Software Foundation, Inc.
 ;;;;
 ;;;; This library is free software; you can redistribute it and/or
 ;;;; modify it under the terms of the GNU Lesser General Public
@@ -121,7 +121,21 @@
 
   (pass-if-uri-exception "http://foo@"
                          "Expected.*host"
-                         (build-uri 'http #:userinfo "foo")))
+                         (build-uri 'http #:userinfo "foo"))
+
+  ;; In this test, we need to reload the '(web uri)' module with a
+  ;; different locale.  This is because some locale-dependent things
+  ;; (e.g., compiled regexes) are computed when the module is loaded.
+  (pass-if-uri-exception "http://illégal.com"
+                         "Expected.*host"
+                         (dynamic-wind
+                           (lambda () #t)
+                           (lambda ()
+                             (with-locale "en_US.utf8"
+                               (reload-module (resolve-module '(web uri)))
+                               (build-uri 'http #:host "illégal.com")))
+                           (lambda ()
+                             (reload-module (resolve-module '(web uri)))))))
 
 (with-test-prefix "build-uri-reference"
   (pass-if "//host/etc/foo"
@@ -290,7 +304,20 @@
            #:port 100
            #:path "/"
            #:query "q"
-           #:fragment "bar")))
+           #:fragment "bar"))
+
+  ;; This test reproduces bug #35785.  See the 'illégal' test above for
+  ;; why we reload the module.
+  (pass-if "http://www.example.com (sv_SE)"
+    (dynamic-wind
+      (lambda () #t)
+      (lambda ()
+        (with-locale "sv_SE.utf8"
+          (reload-module (resolve-module '(web uri)))
+          (uri=? (string->uri "http://www.example.com")
+                 #:scheme 'http #:host "www.example.com" #:path "")))
+      (lambda ()
+        (reload-module (resolve-module '(web uri)))))))
 
 (with-test-prefix "string->uri-reference"
   (pass-if "/foo"
-- 
2.21.0


      reply	other threads:[~2019-06-04 13:57 UTC|newest]

Thread overview: 13+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2019-05-17 20:03 bug#35785: guix won't download if locale is set to swedish Einar Largenius
2019-05-18 11:55 ` Ludovic Courtès
2019-05-19 17:45   ` Einar Largenius
2019-05-20  8:20     ` Ludovic Courtès
2019-05-20  9:14     ` bug#35785: ‘string->uri’ is locale-dependent and breaks in ‘sv_SE’ Ludovic Courtès
2019-05-27 11:05       ` Ricardo Wurmus
2019-05-27 13:39         ` Timothy Sample
2019-05-28 11:17           ` Ludovic Courtès
2019-06-03  0:39             ` Timothy Sample
2019-06-03 13:01               ` Ludovic Courtès
2019-06-03 14:24                 ` Timothy Sample
2019-06-04  7:42                   ` Ludovic Courtès
2019-06-04 13:56                     ` Timothy Sample [this message]

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

  List information: https://guix.gnu.org/

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=87sgsp8ne0.fsf@ngyro.com \
    --to=samplet@ngyro.com \
    --cc=35785@debbugs.gnu.org \
    --cc=einar.largenius@gmail.com \
    --cc=ludo@gnu.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
Code repositories for project(s) associated with this public inbox

	https://git.savannah.gnu.org/cgit/guix.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).