--- orig/configure.in +++ mod/configure.in @@ -598,9 +598,10 @@ # readdir_r - recent posix, not on old systems # stat64 - SuS largefile stuff, not on old systems # sysconf - not on old systems +# isblank - available as a GNU extension or in C99 # _NSGetEnviron - Darwin specific # -AC_CHECK_FUNCS([DINFINITY DQNAN ctermid fesetround ftime fchown getcwd geteuid gettimeofday gmtime_r ioctl lstat mkdir mknod nice readdir_r readlink rename rmdir select setegid seteuid setlocale setpgid setsid sigaction siginterrupt stat64 strftime strptime symlink sync sysconf tcgetpgrp tcsetpgrp times uname waitpid strdup system usleep atexit on_exit chown link fcntl ttyname getpwent getgrent kill getppid getpgrp fork setitimer getitimer strchr strcmp index bcopy memcpy rindex unsetenv _NSGetEnviron]) +AC_CHECK_FUNCS([DINFINITY DQNAN ctermid fesetround ftime fchown getcwd geteuid gettimeofday gmtime_r ioctl lstat mkdir mknod nice readdir_r readlink rename rmdir select setegid seteuid setlocale setpgid setsid sigaction siginterrupt stat64 strftime strptime symlink sync sysconf tcgetpgrp tcsetpgrp times uname waitpid strdup system usleep atexit on_exit chown link fcntl ttyname getpwent getgrent kill getppid getpgrp fork setitimer getitimer strchr strcmp index bcopy memcpy rindex unsetenv isblank _NSGetEnviron]) # Reasons for testing: # netdb.h - not in mingw --- orig/libguile/posix.c +++ mod/libguile/posix.c @@ -34,6 +34,7 @@ #include "libguile/feature.h" #include "libguile/strings.h" #include "libguile/srfi-13.h" +#include "libguile/srfi-14.h" #include "libguile/vectors.h" #include "libguile/lang.h" @@ -1392,6 +1393,10 @@ SCM_SYSERROR; } + /* Recompute the standard SRFI-14 character sets in a locale-dependent + (actually charset-dependent) way. */ + scm_srfi_14_compute_char_sets (); + scm_dynwind_end (); return scm_from_locale_string (rv); } --- orig/libguile/srfi-14.c +++ mod/libguile/srfi-14.c @@ -17,18 +17,27 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ +#define _GNU_SOURCE /* Ask for `isblank ()'. */ #include #include +#ifdef HAVE_CONFIG_H +# include +#endif + #include "libguile.h" #include "libguile/srfi-14.h" -#define SCM_CHARSET_SET(cs, idx) \ - (((long *) SCM_SMOB_DATA (cs))[(idx) / SCM_BITS_PER_LONG] |= \ +#define SCM_CHARSET_SET(cs, idx) \ + (((long *) SCM_SMOB_DATA (cs))[(idx) / SCM_BITS_PER_LONG] |= \ (1L << ((idx) % SCM_BITS_PER_LONG))) +#define SCM_CHARSET_UNSET(cs, idx) \ + (((long *) SCM_SMOB_DATA (cs))[(idx) / SCM_BITS_PER_LONG] &= \ + (~(1L << ((idx) % SCM_BITS_PER_LONG)))) + #define BYTES_PER_CHARSET (SCM_CHARSET_SIZE / 8) #define LONGS_PER_CHARSET (SCM_CHARSET_SIZE / SCM_BITS_PER_LONG) @@ -1393,6 +1402,9 @@ } #undef FUNC_NAME + +/* Standard character sets. */ + SCM scm_char_set_lower_case; SCM scm_char_set_upper_case; SCM scm_char_set_title_case; @@ -1411,48 +1423,123 @@ SCM scm_char_set_empty; SCM scm_char_set_full; -static SCM -make_predset (int (*pred) (int)) -{ - int ch; - SCM cs = make_char_set (NULL); - for (ch = 0; ch < 256; ch++) - if (pred (ch)) - SCM_CHARSET_SET (cs, ch); - return cs; -} -static SCM -define_predset (const char *name, int (*pred) (int)) +/* Create an empty character set and return it after binding it to NAME. */ +static inline SCM +define_charset (const char *name) { - SCM cs = make_predset (pred); + SCM cs = make_char_set (NULL); scm_c_define (name, cs); return scm_permanent_object (cs); } -static SCM -make_strset (const char *str) +/* Membership predicates for the various char sets. + + XXX: The `punctuation' and `symbol' char sets have no direct equivalent in + . Thus, the predicates below yield correct results for ASCII, + but they do not provide the result described by the SRFI for Latin-1. The + correct Latin-1 result could only be obtained by hard-coding the + characters listed by the SRFI, but the problem would remain for other + 8-bit charsets. + + Similarly, character 0xA0 in Latin-1 (unbreakable space, `#\0240') should + be part of `char-set:blank'. However, glibc's current (2006/09) Latin-1 + locales (which use the ISO 14652 "i18n" FDCC-set) do not consider it + `blank' so it ends up in `char-set:punctuation'. */ +#ifdef HAVE_ISBLANK +# define CSET_BLANK_PRED(c) (isblank (c)) +#else +# define CSET_BLANK_PRED(c) \ + (((c) == ' ') || ((c) == '\t')) +#endif + +#define CSET_SYMBOL_PRED(c) \ + (((c) != '\0') && (strchr ("$+<=>^`|~", (c)) != NULL)) +#define CSET_PUNCT_PRED(c) \ + ((ispunct (c)) && (!CSET_SYMBOL_PRED (c))) + +#define CSET_LOWER_PRED(c) (islower (c)) +#define CSET_UPPER_PRED(c) (isupper (c)) +#define CSET_LETTER_PRED(c) (isalpha (c)) +#define CSET_DIGIT_PRED(c) (isdigit (c)) +#define CSET_WHITESPACE_PRED(c) (isspace (c)) +#define CSET_CONTROL_PRED(c) (iscntrl (c)) +#define CSET_HEX_DIGIT_PRED(c) (isxdigit (c)) +#define CSET_ASCII_PRED(c) (isascii (c)) + +/* Some char sets are explicitly defined by the SRFI as a union of other char + sets so we try to follow this closely. */ + +#define CSET_LETTER_AND_DIGIT_PRED(c) \ + (CSET_LETTER_PRED (c) || CSET_DIGIT_PRED (c)) + +#define CSET_GRAPHIC_PRED(c) \ + (CSET_LETTER_PRED (c) || CSET_DIGIT_PRED (c) \ + || CSET_PUNCT_PRED (c) || CSET_SYMBOL_PRED (c)) + +#define CSET_PRINTING_PRED(c) \ + (CSET_GRAPHIC_PRED (c) || CSET_WHITESPACE_PRED (c)) + +/* False and true predicates. */ +#define CSET_TRUE_PRED(c) (1) +#define CSET_FALSE_PRED(c) (0) + + +/* Compute the contents of all the standard character sets. Computation may + need to be re-done at `setlocale'-time because some char sets (e.g., + `char-set:letter') need to reflect the character set supported by Guile. + + For instance, at startup time, the "C" locale is used, thus Guile supports + only ASCII; therefore, `char-set:letter' only contains English letters. + The user can change this by invoking `setlocale' and specifying a locale + with an 8-bit charset, thereby augmenting some of the SRFI-14 standard + character sets. + + This works because some of the predicates used below to construct + character sets (e.g., `isalpha(3)') are locale-dependent (so + charset-dependent, though generally not language-dependent). For details, + please see the `guile-devel' mailing list archive of September 2006. */ +void +scm_srfi_14_compute_char_sets (void) { - SCM cs = make_char_set (NULL); - while (*str) +#define UPDATE_CSET(c, cset, pred) \ + do \ + { \ + if (pred (c)) \ + SCM_CHARSET_SET ((cset), (c)); \ + else \ + SCM_CHARSET_UNSET ((cset), (c)); \ + } \ + while (0) + + register int ch; + + for (ch = 0; ch < 256; ch++) { - SCM_CHARSET_SET (cs, *str); - str++; + UPDATE_CSET (ch, scm_char_set_upper_case, CSET_UPPER_PRED); + UPDATE_CSET (ch, scm_char_set_lower_case, CSET_LOWER_PRED); + UPDATE_CSET (ch, scm_char_set_title_case, CSET_FALSE_PRED); + UPDATE_CSET (ch, scm_char_set_letter, CSET_LETTER_PRED); + UPDATE_CSET (ch, scm_char_set_digit, CSET_DIGIT_PRED); + UPDATE_CSET (ch, scm_char_set_letter_and_digit, + CSET_LETTER_AND_DIGIT_PRED); + UPDATE_CSET (ch, scm_char_set_graphic, CSET_GRAPHIC_PRED); + UPDATE_CSET (ch, scm_char_set_printing, CSET_PRINTING_PRED); + UPDATE_CSET (ch, scm_char_set_whitespace, CSET_WHITESPACE_PRED); + UPDATE_CSET (ch, scm_char_set_iso_control, CSET_CONTROL_PRED); + UPDATE_CSET (ch, scm_char_set_punctuation, CSET_PUNCT_PRED); + UPDATE_CSET (ch, scm_char_set_symbol, CSET_SYMBOL_PRED); + UPDATE_CSET (ch, scm_char_set_hex_digit, CSET_HEX_DIGIT_PRED); + UPDATE_CSET (ch, scm_char_set_blank, CSET_BLANK_PRED); + UPDATE_CSET (ch, scm_char_set_ascii, CSET_ASCII_PRED); + UPDATE_CSET (ch, scm_char_set_empty, CSET_FALSE_PRED); + UPDATE_CSET (ch, scm_char_set_full, CSET_TRUE_PRED); } - return cs; -} -static SCM -define_strset (const char *name, const char *str) -{ - SCM cs = make_strset (str); - scm_c_define (name, cs); - return scm_permanent_object (cs); +#undef UPDATE_CSET } -static int false (int ch) { return 0; } -static int true (int ch) { return 1; } - + void scm_init_srfi_14 (void) { @@ -1461,24 +1548,25 @@ scm_set_smob_free (scm_tc16_charset, charset_free); scm_set_smob_print (scm_tc16_charset, charset_print); - scm_char_set_upper_case = define_predset ("char-set:upper-case", isupper); - scm_char_set_lower_case = define_predset ("char-set:lower-case", islower); - scm_char_set_title_case = define_predset ("char-set:title-case", false); - scm_char_set_letter = define_predset ("char-set:letter", isalpha); - scm_char_set_digit = define_predset ("char-set:digit", isdigit); - scm_char_set_letter_and_digit = define_predset ("char-set:letter+digit", - isalnum); - scm_char_set_graphic = define_predset ("char-set:graphic", isgraph); - scm_char_set_printing = define_predset ("char-set:printing", isprint); - scm_char_set_whitespace = define_predset ("char-set:whitespace", isspace); - scm_char_set_iso_control = define_predset ("char-set:iso-control", iscntrl); - scm_char_set_punctuation = define_predset ("char-set:punctuation", ispunct); - scm_char_set_symbol = define_strset ("char-set:symbol", "$+<=>^`|~"); - scm_char_set_hex_digit = define_predset ("char-set:hex-digit", isxdigit); - scm_char_set_blank = define_strset ("char-set:blank", " \t"); - scm_char_set_ascii = define_predset ("char-set:ascii", isascii); - scm_char_set_empty = define_predset ("char-set:empty", false); - scm_char_set_full = define_predset ("char-set:full", true); + scm_char_set_upper_case = define_charset ("char-set:upper-case"); + scm_char_set_lower_case = define_charset ("char-set:lower-case"); + scm_char_set_title_case = define_charset ("char-set:title-case"); + scm_char_set_letter = define_charset ("char-set:letter"); + scm_char_set_digit = define_charset ("char-set:digit"); + scm_char_set_letter_and_digit = define_charset ("char-set:letter+digit"); + scm_char_set_graphic = define_charset ("char-set:graphic"); + scm_char_set_printing = define_charset ("char-set:printing"); + scm_char_set_whitespace = define_charset ("char-set:whitespace"); + scm_char_set_iso_control = define_charset ("char-set:iso-control"); + scm_char_set_punctuation = define_charset ("char-set:punctuation"); + scm_char_set_symbol = define_charset ("char-set:symbol"); + scm_char_set_hex_digit = define_charset ("char-set:hex-digit"); + scm_char_set_blank = define_charset ("char-set:blank"); + scm_char_set_ascii = define_charset ("char-set:ascii"); + scm_char_set_empty = define_charset ("char-set:empty"); + scm_char_set_full = define_charset ("char-set:full"); + + scm_srfi_14_compute_char_sets (); #include "libguile/srfi-14.x" } --- orig/libguile/srfi-14.h +++ mod/libguile/srfi-14.h @@ -106,7 +106,7 @@ SCM_API SCM scm_char_set_empty; SCM_API SCM scm_char_set_full; -SCM_API void scm_c_init_srfi_14 (void); +SCM_API void scm_srfi_14_compute_char_sets (void); SCM_API void scm_init_srfi_14 (void); #endif /* SCM_SRFI_14_H */ --- orig/test-suite/tests/srfi-14.test +++ mod/test-suite/tests/srfi-14.test @@ -1,4 +1,4 @@ -;;;; srfi-14.test --- Test suite for Guile's SRFI-14 functions. -*- scheme -*- +;;;; srfi-14.test --- Test suite for Guile's SRFI-14 functions. ;;;; Martin Grabmueller, 2001-07-16 ;;;; ;;;; Copyright (C) 2001, 2006 Free Software Foundation, Inc. @@ -18,7 +18,11 @@ ;;;; the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, ;;;; Boston, MA 02110-1301 USA -(use-modules (srfi srfi-14)) +(define-module (test-suite test-srfi-14) + :use-module (srfi srfi-14) + :use-module (srfi srfi-1) ;; `every' + :use-module (test-suite lib)) + (define exception:invalid-char-set-cursor (cons 'misc-error "^invalid character set cursor")) @@ -186,3 +190,128 @@ (pass-if "upper case char set" (char-set= (char-set-map char-upcase char-set:lower-case) char-set:upper-case))) + +(with-test-prefix "string->char-set" + + (pass-if "some char set" + (let ((chars '(#\g #\u #\i #\l #\e))) + (char-set= (list->char-set chars) + (string->char-set (apply string chars)))))) + +;; Make sure we get an ASCII charset and character classification. +(if (defined? 'setlocale) (setlocale LC_CTYPE "C")) + +(with-test-prefix "standard char sets (ASCII)" + + (pass-if "char-set:letter" + (char-set= (string->char-set + (string-append "abcdefghijklmnopqrstuvwxyz" + "ABCDEFGHIJKLMNOPQRSTUVWXYZ")) + char-set:letter)) + + (pass-if "char-set:punctuation" + (char-set= (string->char-set "!\"#%&'()*,-./:;?@[\\]_{}") + char-set:punctuation)) + + (pass-if "char-set:symbol" + (char-set= (string->char-set "$+<=>^`|~") + char-set:symbol)) + + (pass-if "char-set:letter+digit" + (char-set= char-set:letter+digit + (char-set-union char-set:letter char-set:digit))) + + (pass-if "char-set:graphic" + (char-set= char-set:graphic + (char-set-union char-set:letter char-set:digit + char-set:punctuation char-set:symbol))) + + (pass-if "char-set:printing" + (char-set= char-set:printing + (char-set-union char-set:whitespace char-set:graphic)))) + + + +;;; +;;; 8-bit charsets. +;;; +;;; Here, we only test ISO-8859-1 (Latin-1), notably because behavior of +;;; SRFI-14 for implementations supporting this charset is well-defined. +;;; + +(define (every? pred lst) + (not (not (every pred lst)))) + +(define (find-latin1-locale) + ;; Try to find and install an ISO-8859-1 locale. Return `#f' on failure. + (if (defined? 'setlocale) + (let loop ((locales (map (lambda (lang) + (string-append lang ".iso88591")) + '("de_DE" "en_GB" "en_US" "es_ES" + "fr_FR" "it_IT")))) + (if (null? locales) + #f + (if (false-if-exception (setlocale LC_CTYPE (car locales))) + (car locales) + (loop (cdr locales))))) + #f)) + + +(define %latin1 (find-latin1-locale)) + +(with-test-prefix "Latin-1 (8-bit charset)" + + ;; Note: the membership tests below are not exhaustive. + + (pass-if "char-set:letter (membership)" + (if (not %latin1) + (throw 'unresolved) + (let ((letters (char-set->list char-set:letter))) + (every? (lambda (8-bit-char) + (memq 8-bit-char letters)) + (append '(#\a #\b #\c) ;; ASCII + (string->list "çéèâùÉÀÈÊ") ;; French + (string->list "øñÑíßåæðþ")))))) + + (pass-if "char-set:letter (size)" + (if (not %latin1) + (throw 'unresolved) + (= (char-set-size char-set:letter) 117))) + + (pass-if "char-set:lower-case (size)" + (if (not %latin1) + (throw 'unresolved) + (= (char-set-size char-set:lower-case) (+ 26 33)))) + + (pass-if "char-set:upper-case (size)" + (if (not %latin1) + (throw 'unresolved) + (= (char-set-size char-set:upper-case) (+ 26 30)))) + + (pass-if "char-set:punctuation (membership)" + (if (not %latin1) + (thrown 'unresolved) + (let ((punctuation (char-set->list char-set:punctuation))) + (every? (lambda (8-bit-char) + (memq 8-bit-char punctuation)) + (append '(#\! #\. #\?) ;; ASCII + (string->list "¡¿") ;; Castellano + (string->list "«»")))))) ;; French + + (pass-if "char-set:letter+digit" + (char-set= char-set:letter+digit + (char-set-union char-set:letter char-set:digit))) + + (pass-if "char-set:graphic" + (char-set= char-set:graphic + (char-set-union char-set:letter char-set:digit + char-set:punctuation char-set:symbol))) + + (pass-if "char-set:printing" + (char-set= char-set:printing + (char-set-union char-set:whitespace char-set:graphic)))) + +;; Local Variables: +;; mode: scheme +;; coding: latin-1 +;; End: