Re: SRFI-14 and locale settings

From: ludovic.courtes@laas.fr (Ludovic Courtès)
Cc: guile-devel@gnu.org
Subject: Re: SRFI-14 and locale settings
Date: Sat, 16 Sep 2006 15:46:31 +0200	[thread overview]
Message-ID: <87y7sjj554.fsf@laas.fr> (raw)
In-Reply-To: <877j05wkb5.fsf@ossau.uklinux.net> (Neil Jerram's message of "Fri, 15 Sep 2006 10:28:14 +0100")

[-- Attachment #1: Type: text/plain, Size: 2117 bytes --]

Hi,

Neil Jerram <neil@ossau.uklinux.net> writes:

> Yes; based on Kevin's and Ludovic's latest emails, I'm happy now with
> the isalpha() solution if we can make it leverage this "i18n"
> classification.

Below is a patch that does what we agreed on: keep using the <ctype.h>
functions for character classification, and recompute the standard
SRFI-14 char sets upon successful `setlocale'.  It also makes char set
computation more efficient and fixes `char-set:punctuation' and
`char-set:symbol' in ASCII.

There are still issues.  The bug in `char-set:punctuation' and
`char-set:symbol' I mention above is due to the fact that there is not
<ctype.h> equivalent to those char sets (in particular, `ispunct ()'
does not match `char-set:punctuation').  Fixing it for ASCII was easy,
but it's not so easy for Latin-1.

The reason we can hardly get `char-set:punctuation' and
`char-set:symbol' for Latin-1 is that we don't want to hard-code too
much Latin-1-specific knowledge: one goal is to have SRFI-14 provide
also sensible results for non-Latin-1 8-bit charsets.

With this patch, all standard char sets are those expected by SRFI-14 in
ASCII.  In Latin-1, `char-set:letter', as well as `lower-case',
`upper-case', and `iso-control' are correct (at least, using current
glibc locales), but `punctuation', for instance, is a superset of what
SRFI-14 expects while `symbol' is (correspondingly) a subset of what it
should be, and `blank' lacks the "no-break space" character (#\0240).

I'm not sure we can do much better than that until Guile fully supports
Unicode.  The right solution, in the end, would be to process the whole
`UnicodeData.txt' and generate a character classification strictly
following the SRFI-14 rules.  In the meantime, I think this patch can be
an acceptable solution.

I'd be glad if some of you could test it, and especially run the test
cases.  I added Latin-1-specific test cases, but they require that a
Latin-1 locale is available, and it will try to guess what that can be
(yes, it looks quite hackish but I couldn't think of anything
better...).

Comments welcome.

Thanks,
Ludovic.


[-- Attachment #2: The SRFI-14 patch against CVS HEAD --]
[-- Type: text/plain, Size: 14387 bytes --]

--- orig/configure.in
+++ mod/configure.in
@@ -598,9 +598,10 @@
 #   readdir_r - recent posix, not on old systems
 #   stat64 - SuS largefile stuff, not on old systems
 #   sysconf - not on old systems
+#   isblank - available as a GNU extension or in C99
 #   _NSGetEnviron - Darwin specific
 #
-AC_CHECK_FUNCS([DINFINITY DQNAN ctermid fesetround ftime fchown getcwd geteuid gettimeofday gmtime_r ioctl lstat mkdir mknod nice readdir_r readlink rename rmdir select setegid seteuid setlocale setpgid setsid sigaction siginterrupt stat64 strftime strptime symlink sync sysconf tcgetpgrp tcsetpgrp times uname waitpid strdup system usleep atexit on_exit chown link fcntl ttyname getpwent getgrent kill getppid getpgrp fork setitimer getitimer strchr strcmp index bcopy memcpy rindex unsetenv _NSGetEnviron])
+AC_CHECK_FUNCS([DINFINITY DQNAN ctermid fesetround ftime fchown getcwd geteuid gettimeofday gmtime_r ioctl lstat mkdir mknod nice readdir_r readlink rename rmdir select setegid seteuid setlocale setpgid setsid sigaction siginterrupt stat64 strftime strptime symlink sync sysconf tcgetpgrp tcsetpgrp times uname waitpid strdup system usleep atexit on_exit chown link fcntl ttyname getpwent getgrent kill getppid getpgrp fork setitimer getitimer strchr strcmp index bcopy memcpy rindex unsetenv isblank _NSGetEnviron])
 
 # Reasons for testing:
 #   netdb.h - not in mingw


--- orig/libguile/posix.c
+++ mod/libguile/posix.c
@@ -34,6 +34,7 @@
 #include "libguile/feature.h"
 #include "libguile/strings.h"
 #include "libguile/srfi-13.h"
+#include "libguile/srfi-14.h"
 #include "libguile/vectors.h"
 #include "libguile/lang.h"
 
@@ -1392,6 +1393,10 @@
       SCM_SYSERROR;
     }
 
+  /* Recompute the standard SRFI-14 character sets in a locale-dependent
+     (actually charset-dependent) way.  */
+  scm_srfi_14_compute_char_sets ();
+
   scm_dynwind_end ();
   return scm_from_locale_string (rv);
 }


--- orig/libguile/srfi-14.c
+++ mod/libguile/srfi-14.c
@@ -17,18 +17,25 @@
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
+#define _GNU_SOURCE  /* Ask for `isblank ()'.  */
 
 #include <string.h>
 #include <ctype.h>
 
+#include <config.h>
+
 #include "libguile.h"
 #include "libguile/srfi-14.h"
 
 
-#define SCM_CHARSET_SET(cs, idx) \
-  (((long *) SCM_SMOB_DATA (cs))[(idx) / SCM_BITS_PER_LONG] |= \
+#define SCM_CHARSET_SET(cs, idx)				\
+  (((long *) SCM_SMOB_DATA (cs))[(idx) / SCM_BITS_PER_LONG] |=	\
     (1L << ((idx) % SCM_BITS_PER_LONG)))
 
+#define SCM_CHARSET_UNSET(cs, idx)				\
+  (((long *) SCM_SMOB_DATA (cs))[(idx) / SCM_BITS_PER_LONG] &=	\
+    (~(1L << ((idx) % SCM_BITS_PER_LONG))))
+
 #define BYTES_PER_CHARSET (SCM_CHARSET_SIZE / 8)
 #define LONGS_PER_CHARSET (SCM_CHARSET_SIZE / SCM_BITS_PER_LONG)
 
@@ -1393,6 +1400,9 @@
 }
 #undef FUNC_NAME
 
+\f
+/* Standard character sets.  */
+
 SCM scm_char_set_lower_case;
 SCM scm_char_set_upper_case;
 SCM scm_char_set_title_case;
@@ -1411,48 +1421,100 @@
 SCM scm_char_set_empty;
 SCM scm_char_set_full;
 
-static SCM
-make_predset (int (*pred) (int))
-{
-  int ch;
-  SCM cs = make_char_set (NULL);
-  for (ch = 0; ch < 256; ch++)
-    if (pred (ch))
-      SCM_CHARSET_SET (cs, ch);
-  return cs;
-}
 
-static SCM
-define_predset (const char *name, int (*pred) (int))
+/* Create an empty character set and return it after binding it to NAME.  */
+static inline SCM
+define_charset (const char *name)
 {
-  SCM cs = make_predset (pred);
+  SCM cs = make_char_set (NULL);
   scm_c_define (name, cs);
   return scm_permanent_object (cs);
 }
 
-static SCM
-make_strset (const char *str)
+/* Membership predicates for the `symbol', `blank' and `punct' char sets.
+
+   XXX: The `punctuation' and `symbol' char sets have no direct equivalent in
+   <ctype.h>.  Thus, the predicates below yield correct results for ASCII,
+   but they do not provide the result described by the SRFI for Latin-1.  The
+   correct Latin-1 result could only be obtained by hard-coding the
+   characters listed by the SRFI, but the problem would remain for other
+   8-bit charsets.
+
+   Similarly, character 0xA0 in Latin-1 (unbreakable space, `#\0240') should
+   be part of `char-set:blank'.  However, glibc's current (2006/09) Latin-1
+   locales (which use the ISO 14652 "i18n" FDCC-set) do not consider it
+   `blank' so it ends up in `char-set:punctuation'.  */
+#ifdef HAVE_ISBLANK
+# define CSET_BLANK_PRED(c)  (isblank (c))
+#else
+# define CSET_BLANK_PRED(c)			\
+   (((c) == ' ') || ((c) == '\t'))
+#endif
+
+#define CSET_SYMBOL_PRED(c)					\
+  (((c) != '\0') && (strchr ("$+<=>^`|~", (c)) != NULL))
+#define CSET_PUNCT_PRED(c)					\
+  ((ispunct (c)) && (!CSET_SYMBOL_PRED (c)))
+
+/* False and true predicates.  */
+#define CSET_TRUE_PRED(c)    (1)
+#define CSET_FALSE_PRED(c)   (0)
+
+
+/* Compute the contents of all the standard character sets.  Computation may
+   need to be re-done at `setlocale'-time because some char sets (e.g.,
+   `char-set:letter') need to reflect the character set supported by Guile.
+
+   For instance, at startup time, the "C" locale is used, thus Guile supports
+   only ASCII; therefore, `char-set:letter' only contains English letters.
+   The user can change this by invoking `setlocale' and specifying a locale
+   with an 8-bit charset, thereby augmenting some of the SRFI-14 standard
+   character sets.
+
+   This works because some of the predicates used below to construct
+   character sets (e.g., `isalpha(3)') are locale-dependent (so
+   charset-dependent, though generally not language-dependent).  For details,
+   please see the `guile-devel' mailing list archive of September 2006.  */
+void
+scm_srfi_14_compute_char_sets (void)
 {
-  SCM cs = make_char_set (NULL);
-  while (*str)
+#define UPDATE_CSET(c, cset, pred)		\
+  do						\
+    {						\
+      if (pred (c))				\
+	SCM_CHARSET_SET ((cset), (c));		\
+      else					\
+	SCM_CHARSET_UNSET ((cset), (c));	\
+    }						\
+  while (0)
+
+  register int ch;
+
+  for (ch = 0; ch < 256; ch++)
     {
-      SCM_CHARSET_SET (cs, *str);
-      str++;
+      UPDATE_CSET (ch, scm_char_set_upper_case, isupper);
+      UPDATE_CSET (ch, scm_char_set_lower_case, islower);
+      UPDATE_CSET (ch, scm_char_set_title_case, CSET_FALSE_PRED);
+      UPDATE_CSET (ch, scm_char_set_letter, isalpha);
+      UPDATE_CSET (ch, scm_char_set_digit, isdigit);
+      UPDATE_CSET (ch, scm_char_set_letter_and_digit, isalnum);
+      UPDATE_CSET (ch, scm_char_set_graphic, isgraph);
+      UPDATE_CSET (ch, scm_char_set_printing, isprint);
+      UPDATE_CSET (ch, scm_char_set_whitespace, isspace);
+      UPDATE_CSET (ch, scm_char_set_iso_control, iscntrl);
+      UPDATE_CSET (ch, scm_char_set_punctuation, CSET_PUNCT_PRED);
+      UPDATE_CSET (ch, scm_char_set_symbol, CSET_SYMBOL_PRED);
+      UPDATE_CSET (ch, scm_char_set_hex_digit, isxdigit);
+      UPDATE_CSET (ch, scm_char_set_blank, CSET_BLANK_PRED);
+      UPDATE_CSET (ch, scm_char_set_ascii, isascii);
+      UPDATE_CSET (ch, scm_char_set_empty, CSET_FALSE_PRED);
+      UPDATE_CSET (ch, scm_char_set_full, CSET_TRUE_PRED);
     }
-  return cs;
-}
 
-static SCM
-define_strset (const char *name, const char *str)
-{
-  SCM cs = make_strset (str);
-  scm_c_define (name, cs);
-  return scm_permanent_object (cs);
+#undef UPDATE_CSET
 }
 
-static int false (int ch) { return 0; }
-static int true (int ch) { return 1; }
-
+\f
 void
 scm_init_srfi_14 (void)
 {
@@ -1461,24 +1523,25 @@
   scm_set_smob_free (scm_tc16_charset, charset_free);
   scm_set_smob_print (scm_tc16_charset, charset_print);
 
-  scm_char_set_upper_case = define_predset ("char-set:upper-case", isupper);
-  scm_char_set_lower_case = define_predset ("char-set:lower-case", islower);
-  scm_char_set_title_case = define_predset ("char-set:title-case", false);
-  scm_char_set_letter = define_predset ("char-set:letter", isalpha);
-  scm_char_set_digit = define_predset ("char-set:digit", isdigit);
-  scm_char_set_letter_and_digit = define_predset ("char-set:letter+digit",
-						  isalnum);
-  scm_char_set_graphic = define_predset ("char-set:graphic", isgraph);
-  scm_char_set_printing = define_predset ("char-set:printing", isprint);
-  scm_char_set_whitespace = define_predset ("char-set:whitespace", isspace);
-  scm_char_set_iso_control = define_predset ("char-set:iso-control", iscntrl);
-  scm_char_set_punctuation = define_predset ("char-set:punctuation", ispunct);
-  scm_char_set_symbol = define_strset ("char-set:symbol", "$+<=>^`|~");
-  scm_char_set_hex_digit = define_predset ("char-set:hex-digit", isxdigit);
-  scm_char_set_blank = define_strset ("char-set:blank", " \t");
-  scm_char_set_ascii = define_predset ("char-set:ascii", isascii);
-  scm_char_set_empty = define_predset ("char-set:empty", false);
-  scm_char_set_full = define_predset ("char-set:full", true);
+  scm_char_set_upper_case = define_charset ("char-set:upper-case");
+  scm_char_set_lower_case = define_charset ("char-set:lower-case");
+  scm_char_set_title_case = define_charset ("char-set:title-case");
+  scm_char_set_letter = define_charset ("char-set:letter");
+  scm_char_set_digit = define_charset ("char-set:digit");
+  scm_char_set_letter_and_digit = define_charset ("char-set:letter+digit");
+  scm_char_set_graphic = define_charset ("char-set:graphic");
+  scm_char_set_printing = define_charset ("char-set:printing");
+  scm_char_set_whitespace = define_charset ("char-set:whitespace");
+  scm_char_set_iso_control = define_charset ("char-set:iso-control");
+  scm_char_set_punctuation = define_charset ("char-set:punctuation");
+  scm_char_set_symbol = define_charset ("char-set:symbol");
+  scm_char_set_hex_digit = define_charset ("char-set:hex-digit");
+  scm_char_set_blank = define_charset ("char-set:blank");
+  scm_char_set_ascii = define_charset ("char-set:ascii");
+  scm_char_set_empty = define_charset ("char-set:empty");
+  scm_char_set_full = define_charset ("char-set:full");
+
+  scm_srfi_14_compute_char_sets ();
 
 #include "libguile/srfi-14.x"
 }


--- orig/libguile/srfi-14.h
+++ mod/libguile/srfi-14.h
@@ -106,7 +106,7 @@
 SCM_API SCM scm_char_set_empty;
 SCM_API SCM scm_char_set_full;
 
-SCM_API void scm_c_init_srfi_14 (void);
+SCM_API void scm_srfi_14_compute_char_sets (void);
 SCM_API void scm_init_srfi_14 (void);
 
 #endif /* SCM_SRFI_14_H */


--- orig/test-suite/tests/srfi-14.test
+++ mod/test-suite/tests/srfi-14.test
@@ -1,4 +1,4 @@
-;;;; srfi-14.test --- Test suite for Guile's SRFI-14 functions. -*- scheme -*-
+;;;; srfi-14.test --- Test suite for Guile's SRFI-14 functions.
 ;;;; Martin Grabmueller, 2001-07-16
 ;;;;
 ;;;; Copyright (C) 2001, 2006 Free Software Foundation, Inc.
@@ -18,7 +18,9 @@
 ;;;; the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
 ;;;; Boston, MA 02110-1301 USA
 
-(use-modules (srfi srfi-14))
+(use-modules (srfi srfi-14)
+             (srfi srfi-1) ;; `every'
+             (test-suite lib))
 
 (define exception:invalid-char-set-cursor
   (cons 'misc-error "^invalid character set cursor"))
@@ -186,3 +188,102 @@
   (pass-if "upper case char set"
      (char-set= (char-set-map char-upcase char-set:lower-case)
 		char-set:upper-case)))
+
+(with-test-prefix "string->char-set"
+
+  (pass-if "some char set"
+     (let ((chars '(#\g #\u #\i #\l #\e)))
+       (char-set= (list->char-set chars)
+		  (string->char-set (apply string chars))))))
+
+;; Make sure we get an ASCII charset and character classification.
+(if (defined? 'setlocale) (setlocale LC_CTYPE "C"))
+
+(with-test-prefix "standard char sets (ASCII)"
+
+  (pass-if "char-set:letter"
+     (char-set= (string->char-set
+		 (string-append "abcdefghijklmnopqrstuvwxyz"
+				"ABCDEFGHIJKLMNOPQRSTUVWXYZ"))
+		char-set:letter))
+
+  (pass-if "char-set:punctuation"
+     (char-set= (string->char-set "!\"#%&'()*,-./:;?@[\\]_{}")
+		char-set:punctuation))
+
+  (pass-if "char-set:symbol"
+     (char-set= (string->char-set "$+<=>^`|~")
+		char-set:symbol)))
+
+\f
+;;;
+;;; 8-bit charsets.
+;;;
+;;; Here, we only test ISO-8859-1 (Latin-1), notably because behavior of
+;;; SRFI-14 for implementations supporting this charset is well-defined.
+;;;
+
+(define (every? pred lst)
+  (not (not (every pred lst))))
+
+(define (find-latin1-locale)
+  ;; Try to find and install an ISO-8859-1 locale.  Return `#f' on failure.
+  (if (defined? 'setlocale)
+      (let loop ((locales (map (lambda (lang)
+				 (string-append lang ".iso88591"))
+			       '("de_DE" "en_GB" "en_US" "es_ES"
+				 "fr_FR" "it_IT"))))
+	(if (null? locales)
+	    #f
+	    (if (false-if-exception (setlocale LC_CTYPE (car locales)))
+		(car locales)
+		(loop (cdr locales)))))
+      #f))
+
+
+(define %latin1 (find-latin1-locale))
+
+(with-test-prefix "Latin-1 (8-bit charset)"
+
+  ;; Note: the membership tests below are not exhaustive.
+
+  (pass-if "char-set:letter (membership)"
+     (if (not %latin1)
+	 (throw 'unresolved)
+	 (let ((letters (char-set->list char-set:letter)))
+	   (every? (lambda (8-bit-char)
+		     (memq 8-bit-char letters))
+		   (append '(#\a #\b #\c)             ;; ASCII
+			   (string->list "çéèâùÉÀÈÊ") ;; French
+			   (string->list "øñÑíßåæðþ"))))))
+
+  (pass-if "char-set:letter (size)"
+     (if (not %latin1)
+	 (throw 'unresolved)
+	 (= (char-set-size char-set:letter) 117)))
+
+  (pass-if "char-set:lower-case (size)"
+     (if (not %latin1)
+	 (throw 'unresolved)
+	 (= (char-set-size char-set:lower-case) (+ 26 33))))
+
+  (pass-if "char-set:upper-case (size)"
+     (if (not %latin1)
+	 (throw 'unresolved)
+	 (= (char-set-size char-set:upper-case) (+ 26 30))))
+
+  (pass-if "char-set:punctuation (membership)"
+     (if (not %latin1)
+	 (thrown 'unresolved)
+	 (let ((punctuation (char-set->list char-set:punctuation)))
+	   (every? (lambda (8-bit-char)
+		     (memq 8-bit-char punctuation))
+		   (append '(#\! #\. #\?)            ;; ASCII
+			   (string->list "¡¿")       ;; Castellano
+			   (string->list "«»"))))))) ;; French
+
+
+;; Local Variables:
+;; mode: scheme
+;; coding: latin-1
+;; End:



[-- Attachment #3: Type: text/plain, Size: 143 bytes --]

_______________________________________________
Guile-devel mailing list
Guile-devel@gnu.org
http://lists.gnu.org/mailman/listinfo/guile-devel