From: Andy Wingo <wingo@pobox.com>
To: guile-devel <guile-devel@gnu.org>
Subject: byte-order marks
Date: Mon, 28 Jan 2013 22:42:09 +0100 [thread overview]
Message-ID: <87boc956j2.fsf@pobox.com> (raw)
[-- Attachment #1: Type: text/plain, Size: 55 bytes --]
What do people think about this attached patch?
Andy
[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #2: 0001-detect-and-consume-byte-order-marks-for-textual-port.patch --]
[-- Type: text/x-diff, Size: 6546 bytes --]
From 831c3418941f2d643f91e3076ef9458f700a2c59 Mon Sep 17 00:00:00 2001
From: Andy Wingo <wingo@pobox.com>
Date: Mon, 28 Jan 2013 22:41:34 +0100
Subject: [PATCH] detect and consume byte-order marks for textual ports
* libguile/read.c (scm_i_scan_for_encoding): If we see a BOM, use it in
preference to any "coding" declaration, and consume it. This only
happens in textual mode.
* libguile/load.c (scm_primitive_load): Add a note about the duplicate
encoding scan.
* test-suite/tests/filesys.test: Add tests for UTF-8, UTF-16BE, and
UTF-16LE BOM handling.
---
libguile/load.c | 4 ++++
libguile/read.c | 39 +++++++++++++++++++++++++++------------
test-suite/tests/filesys.test | 34 +++++++++++++++++++++++++++++++++-
3 files changed, 64 insertions(+), 13 deletions(-)
diff --git a/libguile/load.c b/libguile/load.c
index 84b6705..b5e430e 100644
--- a/libguile/load.c
+++ b/libguile/load.c
@@ -106,6 +106,10 @@ SCM_DEFINE (scm_primitive_load, "primitive-load", 1, 0, 0,
scm_dynwind_begin (SCM_F_DYNWIND_REWINDABLE);
scm_i_dynwind_current_load_port (port);
+ /* FIXME: For better or for worse, scm_open_file already scans the
+ file for an encoding. This scans again; necessary for this
+ logic, but unnecessary overall. As scanning for an encoding
+ consumes a BOM, this might mean we miss a BOM. */
encoding = scm_i_scan_for_encoding (port);
if (encoding)
scm_i_set_port_encoding_x (port, encoding);
diff --git a/libguile/read.c b/libguile/read.c
index 222891b..1a7462f 100644
--- a/libguile/read.c
+++ b/libguile/read.c
@@ -1,5 +1,5 @@
/* Copyright (C) 1995, 1996, 1997, 1999, 2000, 2001, 2003, 2004, 2006,
- * 2007, 2008, 2009, 2010, 2011, 2012 Free Software Foundation, Inc.
+ * 2007, 2008, 2009, 2010, 2011, 2012, 2013 Free Software Foundation, Inc.
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public License
@@ -1975,9 +1975,10 @@ scm_get_hash_procedure (int c)
#define SCM_ENCODING_SEARCH_SIZE (500)
-/* Search the first few hundred characters of a file for an Emacs-like coding
- declaration. Returns either NULL or a string whose storage has been
- allocated with `scm_gc_malloc ()'. */
+/* Search the first few hundred characters of a file for an Emacs-like
+ coding declaration. Returns either NULL or a string whose storage
+ has been allocated with `scm_gc_malloc ()'. If a BOM is present, it
+ is consumed and used in preference to any coding declaration. */
char *
scm_i_scan_for_encoding (SCM port)
{
@@ -1985,7 +1986,6 @@ scm_i_scan_for_encoding (SCM port)
char header[SCM_ENCODING_SEARCH_SIZE+1];
size_t bytes_read, encoding_length, i;
char *encoding = NULL;
- int utf8_bom = 0;
char *pos, *encoding_start;
int in_comment;
@@ -2030,9 +2030,26 @@ scm_i_scan_for_encoding (SCM port)
scm_seek (port, scm_from_int (0), scm_from_int (SEEK_SET));
}
- if (bytes_read > 3
+ /* If there is a byte-order mark, consume it, and use its
+ encoding. */
+ if (bytes_read >= 3
&& header[0] == '\xef' && header[1] == '\xbb' && header[2] == '\xbf')
- utf8_bom = 1;
+ {
+ pt->read_pos += 3;
+ return "UTF-8";
+ }
+ else if (bytes_read >= 2
+ && header[0] == '\xfe' && header[1] == '\xff')
+ {
+ pt->read_pos += 2;
+ return "UTF-16BE";
+ }
+ else if (bytes_read >= 2
+ && header[0] == '\xff' && header[1] == '\xfe')
+ {
+ pt->read_pos += 2;
+ return "UTF-16LE";
+ }
/* search past "coding[:=]" */
pos = header;
@@ -2102,11 +2119,6 @@ scm_i_scan_for_encoding (SCM port)
/* This wasn't in a comment */
return NULL;
- if (utf8_bom && strcmp(encoding, "UTF-8"))
- scm_misc_error (NULL,
- "the port input declares the encoding ~s but is encoded as UTF-8",
- scm_list_1 (scm_from_locale_string (encoding)));
-
return encoding;
}
@@ -2117,6 +2129,9 @@ SCM_DEFINE (scm_file_encoding, "file-encoding", 1, 0, 0,
"The coding declaration is of the form\n"
"@code{coding: XXXXX} and must appear in a scheme comment.\n"
"\n"
+ "If a UTF-8 or UTF-16 BOM is present, it is consumed, and used in\n"
+ "preference to any coding declaration.\n"
+ "\n"
"Returns a string containing the character encoding of the file\n"
"if a declaration was found, or @code{#f} otherwise.\n")
#define FUNC_NAME s_scm_file_encoding
diff --git a/test-suite/tests/filesys.test b/test-suite/tests/filesys.test
index a6bfb6e..ecbb3f1 100644
--- a/test-suite/tests/filesys.test
+++ b/test-suite/tests/filesys.test
@@ -1,6 +1,6 @@
;;;; filesys.test --- test file system functions -*- scheme -*-
;;;;
-;;;; Copyright (C) 2004, 2006 Free Software Foundation, Inc.
+;;;; Copyright (C) 2004, 2006, 2013 Free Software Foundation, Inc.
;;;;
;;;; This library is free software; you can redistribute it and/or
;;;; modify it under the terms of the GNU Lesser General Public
@@ -17,6 +17,8 @@
;;;; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
(define-module (test-suite test-filesys)
+ #:use-module (ice-9 rdelim)
+ #:use-module (ice-9 binary-ports)
#:use-module (test-suite lib)
#:use-module (test-suite guile-test))
@@ -127,3 +129,33 @@
(delete-file (test-file))
(delete-file (test-symlink))
+
+(let ((s "\ufeffHello, world!"))
+ (define (test-encoding encoding)
+ (with-fluids ((%default-port-encoding "ISO-8859-1"))
+ (let* ((bytes (catch 'misc-error
+ (lambda ()
+ (call-with-values open-bytevector-output-port
+ (lambda (port get-bytevector)
+ (set-port-encoding! port encoding)
+ (display s port)
+ (get-bytevector))))
+ (lambda args
+ (throw 'unresolved))))
+ (name (string-copy "myfile-XXXXXX"))
+ (port (mkstemp! name)))
+ (put-bytevector port bytes)
+ (close-port port)
+ (let ((contents (call-with-input-file name read-string)))
+ (delete-file name)
+ (equal? contents
+ (substring s 1))))))
+
+ (pass-if "UTF-8"
+ (test-encoding "UTF-8"))
+
+ (pass-if "UTF-16BE"
+ (test-encoding "UTF-16BE"))
+
+ (pass-if "UTF-16LE"
+ (test-encoding "UTF-16LE")))
--
1.7.10.4
[-- Attachment #3: Type: text/plain, Size: 26 bytes --]
--
http://wingolog.org/
next reply other threads:[~2013-01-28 21:42 UTC|newest]
Thread overview: 22+ messages / expand[flat|nested] mbox.gz Atom feed top
2013-01-28 21:42 Andy Wingo [this message]
2013-01-28 22:20 ` byte-order marks Mike Gran
2013-01-29 9:03 ` Andy Wingo
2013-01-29 8:22 ` Mark H Weaver
2013-01-29 9:03 ` Andy Wingo
2013-01-29 13:27 ` Ludovic Courtès
2013-01-29 14:04 ` Andy Wingo
2013-01-29 17:09 ` Mark H Weaver
2013-01-29 19:09 ` Mark H Weaver
2013-01-29 20:52 ` Ludovic Courtès
2013-01-29 20:53 ` Ludovic Courtès
2013-01-30 9:20 ` Andy Wingo
2013-01-30 21:18 ` Ludovic Courtès
2013-01-31 8:52 ` Andy Wingo
2013-01-31 4:40 ` [PATCHES] Discard BOMs at stream start for UTF-{8,16,32} encodings Mark H Weaver
2013-01-31 9:39 ` Andy Wingo
2013-01-31 10:33 ` Andy Wingo
2013-01-31 18:01 ` [PATCHES] Discard BOMs at stream start for UTF-{8, 16, 32} encodings Mark H Weaver
2013-01-31 21:42 ` Ludovic Courtès
2013-01-29 19:22 ` byte-order marks Neil Jerram
2013-01-29 21:09 ` Andy Wingo
2013-01-29 21:12 ` Neil Jerram
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
List information: https://www.gnu.org/software/guile/
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=87boc956j2.fsf@pobox.com \
--to=wingo@pobox.com \
--cc=guile-devel@gnu.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).