From 5512fe4f93e4e583ab538ae02dd98e5825252dc9 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Wed, 30 Jan 2013 10:17:25 +0100 Subject: [PATCH] detect and consume byte-order marks for textual ports * libguile/ports.h: * libguile/ports.c (scm_consume_byte_order_mark): New procedure. * libguile/fports.c (scm_open_file): Call consume-byte-order-mark if we are opening a file in "r" mode. * libguile/read.c (scm_i_scan_for_encoding): Don't do anything about byte-order marks. * libguile/load.c (scm_primitive_load): Add a note about the duplicate encoding scan. * test-suite/tests/filesys.test: Add tests for UTF-8, UTF-16BE, and UTF-16LE BOM handling. --- libguile/fports.c | 35 +++++++++-------- libguile/load.c | 3 ++ libguile/ports.c | 85 ++++++++++++++++++++++++++++++++++++++++- libguile/ports.h | 3 +- libguile/read.c | 14 +------ test-suite/tests/filesys.test | 59 +++++++++++++++++++++++++++- 6 files changed, 169 insertions(+), 30 deletions(-) diff --git a/libguile/fports.c b/libguile/fports.c index 10cf671..fbc0530 100644 --- a/libguile/fports.c +++ b/libguile/fports.c @@ -1,5 +1,5 @@ /* Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, - * 2004, 2006, 2007, 2008, 2009, 2010, 2011, 2012 Free Software Foundation, Inc. + * 2004, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013 Free Software Foundation, Inc. * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public License @@ -399,7 +399,7 @@ SCM_DEFINE (scm_open_file, "open-file", 2, 0, 0, #define FUNC_NAME s_scm_open_file { SCM port; - int fdes, flags = 0, use_encoding = 1; + int fdes, flags = 0, scan_for_encoding = 0, consume_bom = 0, binary = 0; unsigned int retries; char *file, *md, *ptr; @@ -415,6 +415,8 @@ SCM_DEFINE (scm_open_file, "open-file", 2, 0, 0, { case 'r': flags |= O_RDONLY; + consume_bom = 1; + scan_for_encoding = 1; break; case 'w': flags |= O_WRONLY | O_CREAT | O_TRUNC; @@ -432,9 +434,12 @@ SCM_DEFINE (scm_open_file, "open-file", 2, 0, 0, { case '+': flags = (flags & ~(O_RDONLY | O_WRONLY)) | O_RDWR; + consume_bom = 0; break; case 'b': - use_encoding = 0; + scan_for_encoding = 0; + consume_bom = 0; + binary = 1; #if defined (O_BINARY) flags |= O_BINARY; #endif @@ -473,21 +478,21 @@ SCM_DEFINE (scm_open_file, "open-file", 2, 0, 0, port = scm_i_fdes_to_port (fdes, scm_i_mode_bits (mode), fport_canonicalize_filename (filename)); - if (use_encoding) - { - /* If this file has a coding declaration, use that as the port - encoding. */ - if (SCM_INPUT_PORT_P (port)) - { - char *enc = scm_i_scan_for_encoding (port); - if (enc != NULL) - scm_i_set_port_encoding_x (port, enc); - } - } - else + if (consume_bom) + scm_consume_byte_order_mark (port); + + if (binary) /* If this is a binary file, use the binary-friendly ISO-8859-1 encoding. */ scm_i_set_port_encoding_x (port, NULL); + else if (scan_for_encoding) + /* If this is an input port and the file has a coding declaration, + use that as the port encoding. */ + { + char *enc = scm_i_scan_for_encoding (port); + if (enc != NULL) + scm_i_set_port_encoding_x (port, enc); + } scm_dynwind_end (); diff --git a/libguile/load.c b/libguile/load.c index 84b6705..476461c 100644 --- a/libguile/load.c +++ b/libguile/load.c @@ -106,6 +106,9 @@ SCM_DEFINE (scm_primitive_load, "primitive-load", 1, 0, 0, scm_dynwind_begin (SCM_F_DYNWIND_REWINDABLE); scm_i_dynwind_current_load_port (port); + /* FIXME: For better or for worse, scm_open_file already scans the + file for an encoding. This scans again; necessary for this + logic, but unnecessary overall. */ encoding = scm_i_scan_for_encoding (port); if (encoding) scm_i_set_port_encoding_x (port, encoding); diff --git a/libguile/ports.c b/libguile/ports.c index 55808e2..9b1be9b 100644 --- a/libguile/ports.c +++ b/libguile/ports.c @@ -1,5 +1,5 @@ /* Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2003, 2004, - * 2006, 2007, 2008, 2009, 2010, 2011, 2012 Free Software Foundation, Inc. + * 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013 Free Software Foundation, Inc. * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public License @@ -2153,6 +2153,89 @@ SCM_DEFINE (scm_set_port_filename_x, "set-port-filename!", 2, 0, 0, } #undef FUNC_NAME +SCM_DEFINE (scm_consume_byte_order_mark, "consume-byte-order-mark", 1, 0, 0, + (SCM port), + "Peek ahead in @var{port} for a byte-order mark (\\uFEFF) encoded\n" + "in UTF-8 or in UTF-16. If found, consume the byte-order mark\n" + "and set the port to the indicated encoding.\n" + "\n" + "As a special case, if the port's encoding is already UTF-16LE\n" + "or UTF-16BE (as opposed to UTF-16), we consider that the user\n" + "has already asked for an explicit byte order. In this case no\n" + "scan is performed, and the byte-order mark (if any) is left in\n" + "the port.\n" + "\n" + "Return @code{#t} if a byte-order mark was consumed, and\n" + "@code{#f} otherwise.") +#define FUNC_NAME s_scm_consume_byte_order_mark +{ + scm_t_port *pt; + const char *enc; + + SCM_VALIDATE_PORT (1, port); + + pt = SCM_PTAB_ENTRY (port); + enc = pt->encoding; + + if (enc && strcasecmp (enc, "UTF-16BE") == 0) + return SCM_BOOL_F; + + if (enc && strcasecmp (enc, "UTF-16LE") == 0) + return SCM_BOOL_F; + + switch (scm_peek_byte_or_eof (port)) + { + case 0xEF: + scm_get_byte_or_eof (port); + switch (scm_peek_byte_or_eof (port)) + { + case 0xBB: + scm_get_byte_or_eof (port); + switch (scm_peek_byte_or_eof (port)) + { + case 0xBF: + scm_get_byte_or_eof (port); + scm_i_set_port_encoding_x (port, "UTF-8"); + return SCM_BOOL_T; + default: + scm_unget_byte (0xBB, port); + scm_unget_byte (0xEF, port); + return SCM_BOOL_F; + } + default: + scm_unget_byte (0xEF, port); + return SCM_BOOL_F; + } + case 0xFE: + scm_get_byte_or_eof (port); + switch (scm_peek_byte_or_eof (port)) + { + case 0xFF: + scm_get_byte_or_eof (port); + scm_i_set_port_encoding_x (port, "UTF-16BE"); + return SCM_BOOL_T; + default: + scm_unget_byte (0xFE, port); + return SCM_BOOL_F; + } + case 0xFF: + scm_get_byte_or_eof (port); + switch (scm_peek_byte_or_eof (port)) + { + case 0xFE: + scm_get_byte_or_eof (port); + scm_i_set_port_encoding_x (port, "UTF-16LE"); + return SCM_BOOL_T; + default: + scm_unget_byte (0xFF, port); + return SCM_BOOL_F; + } + default: + return SCM_BOOL_F; + } +} +#undef FUNC_NAME + /* A fluid specifying the default encoding for newly created ports. If it is a string, that is the encoding. If it is #f, it is in the "native" (Latin-1) encoding. */ diff --git a/libguile/ports.h b/libguile/ports.h index d4d59b7..2f32345 100644 --- a/libguile/ports.h +++ b/libguile/ports.h @@ -4,7 +4,7 @@ #define SCM_PORTS_H /* Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2003, 2004, - * 2006, 2008, 2009, 2010, 2011, 2012 Free Software Foundation, Inc. + * 2006, 2008, 2009, 2010, 2011, 2012, 2013 Free Software Foundation, Inc. * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public License @@ -303,6 +303,7 @@ SCM_API SCM scm_port_column (SCM port); SCM_API SCM scm_set_port_column_x (SCM port, SCM line); SCM_API SCM scm_port_filename (SCM port); SCM_API SCM scm_set_port_filename_x (SCM port, SCM filename); +SCM_API SCM scm_consume_byte_order_mark (SCM port); SCM_INTERNAL const char *scm_i_default_port_encoding (void); SCM_INTERNAL void scm_i_set_default_port_encoding (const char *); SCM_INTERNAL void scm_i_set_port_encoding_x (SCM port, const char *str); diff --git a/libguile/read.c b/libguile/read.c index 222891b..a8f7744 100644 --- a/libguile/read.c +++ b/libguile/read.c @@ -1,5 +1,5 @@ /* Copyright (C) 1995, 1996, 1997, 1999, 2000, 2001, 2003, 2004, 2006, - * 2007, 2008, 2009, 2010, 2011, 2012 Free Software Foundation, Inc. + * 2007, 2008, 2009, 2010, 2011, 2012, 2013 Free Software Foundation, Inc. * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public License @@ -1985,7 +1985,6 @@ scm_i_scan_for_encoding (SCM port) char header[SCM_ENCODING_SEARCH_SIZE+1]; size_t bytes_read, encoding_length, i; char *encoding = NULL; - int utf8_bom = 0; char *pos, *encoding_start; int in_comment; @@ -2027,13 +2026,9 @@ scm_i_scan_for_encoding (SCM port) bytes_read = scm_c_read (port, header, SCM_ENCODING_SEARCH_SIZE); header[bytes_read] = '\0'; - scm_seek (port, scm_from_int (0), scm_from_int (SEEK_SET)); + scm_seek (port, scm_from_int (-bytes_read), scm_from_int (SEEK_CUR)); } - if (bytes_read > 3 - && header[0] == '\xef' && header[1] == '\xbb' && header[2] == '\xbf') - utf8_bom = 1; - /* search past "coding[:=]" */ pos = header; while (1) @@ -2102,11 +2097,6 @@ scm_i_scan_for_encoding (SCM port) /* This wasn't in a comment */ return NULL; - if (utf8_bom && strcmp(encoding, "UTF-8")) - scm_misc_error (NULL, - "the port input declares the encoding ~s but is encoded as UTF-8", - scm_list_1 (scm_from_locale_string (encoding))); - return encoding; } diff --git a/test-suite/tests/filesys.test b/test-suite/tests/filesys.test index a6bfb6e..8bd974d 100644 --- a/test-suite/tests/filesys.test +++ b/test-suite/tests/filesys.test @@ -1,6 +1,6 @@ ;;;; filesys.test --- test file system functions -*- scheme -*- ;;;; -;;;; Copyright (C) 2004, 2006 Free Software Foundation, Inc. +;;;; Copyright (C) 2004, 2006, 2013 Free Software Foundation, Inc. ;;;; ;;;; This library is free software; you can redistribute it and/or ;;;; modify it under the terms of the GNU Lesser General Public @@ -17,6 +17,8 @@ ;;;; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA (define-module (test-suite test-filesys) + #:use-module (ice-9 rdelim) + #:use-module (ice-9 binary-ports) #:use-module (test-suite lib) #:use-module (test-suite guile-test)) @@ -127,3 +129,58 @@ (delete-file (test-file)) (delete-file (test-symlink)) + +(let ((s "\ufeffHello, world!")) + (define* (test-encoding encoding #:optional (ambient "ISO-8859-1")) + (with-fluids ((%default-port-encoding ambient)) + (let* ((bytes (catch 'misc-error + (lambda () + (call-with-values open-bytevector-output-port + (lambda (port get-bytevector) + (set-port-encoding! port encoding) + (display s port) + (get-bytevector)))) + (lambda args + (throw 'unresolved)))) + (name (string-copy "myfile-XXXXXX")) + (port (mkstemp! name))) + (put-bytevector port bytes) + (close-port port) + (let ((contents (call-with-input-file name read-string))) + (delete-file name) + contents)))) + + (pass-if "UTF-8" + (equal? (test-encoding "UTF-8") + "Hello, world!")) + + (pass-if "UTF-16BE" + (equal? (test-encoding "UTF-16BE") + "Hello, world!")) + + (pass-if "UTF-16LE" + (equal? (test-encoding "UTF-16LE") + "Hello, world!")) + + (pass-if "UTF-8 (ambient)" + (equal? (test-encoding "UTF-8" "UTF-8") + "Hello, world!")) + + (pass-if "UTF-8 (UTF-16 ambient)" + (equal? (test-encoding "UTF-8" "UTF-16") + "Hello, world!")) + + ;; Unicode 6.2 section 16.8: + ;; + ;; For compatibility with versions of the Unicode Standard prior to + ;; Version 3.2, the code point U+FEFF has the word-joining semantics + ;; of zero width no-break space when it is not used as a BOM. [...] + ;; + ;; Where the byte order is explicitly specified, such as in UTF-16BE + ;; or UTF-16LE, then all U+FEFF characters -- even at the very + ;; beginning of the text -- are to be interpreted as zero width + ;; no-break spaces. + ;; + (pass-if "UTF-16LE (ambient)" + (equal? (test-encoding "UTF-16LE" "UTF-16LE") + "\ufeffHello, world!"))) -- 1.7.10.4