* byte-order marks @ 2013-01-28 21:42 Andy Wingo 2013-01-28 22:20 ` Mike Gran ` (2 more replies) 0 siblings, 3 replies; 22+ messages in thread From: Andy Wingo @ 2013-01-28 21:42 UTC (permalink / raw) To: guile-devel [-- Attachment #1: Type: text/plain, Size: 55 bytes --] What do people think about this attached patch? Andy [-- Warning: decoded text below may be mangled, UTF-8 assumed --] [-- Attachment #2: 0001-detect-and-consume-byte-order-marks-for-textual-port.patch --] [-- Type: text/x-diff, Size: 6546 bytes --] From 831c3418941f2d643f91e3076ef9458f700a2c59 Mon Sep 17 00:00:00 2001 From: Andy Wingo <wingo@pobox.com> Date: Mon, 28 Jan 2013 22:41:34 +0100 Subject: [PATCH] detect and consume byte-order marks for textual ports * libguile/read.c (scm_i_scan_for_encoding): If we see a BOM, use it in preference to any "coding" declaration, and consume it. This only happens in textual mode. * libguile/load.c (scm_primitive_load): Add a note about the duplicate encoding scan. * test-suite/tests/filesys.test: Add tests for UTF-8, UTF-16BE, and UTF-16LE BOM handling. --- libguile/load.c | 4 ++++ libguile/read.c | 39 +++++++++++++++++++++++++++------------ test-suite/tests/filesys.test | 34 +++++++++++++++++++++++++++++++++- 3 files changed, 64 insertions(+), 13 deletions(-) diff --git a/libguile/load.c b/libguile/load.c index 84b6705..b5e430e 100644 --- a/libguile/load.c +++ b/libguile/load.c @@ -106,6 +106,10 @@ SCM_DEFINE (scm_primitive_load, "primitive-load", 1, 0, 0, scm_dynwind_begin (SCM_F_DYNWIND_REWINDABLE); scm_i_dynwind_current_load_port (port); + /* FIXME: For better or for worse, scm_open_file already scans the + file for an encoding. This scans again; necessary for this + logic, but unnecessary overall. As scanning for an encoding + consumes a BOM, this might mean we miss a BOM. */ encoding = scm_i_scan_for_encoding (port); if (encoding) scm_i_set_port_encoding_x (port, encoding); diff --git a/libguile/read.c b/libguile/read.c index 222891b..1a7462f 100644 --- a/libguile/read.c +++ b/libguile/read.c @@ -1,5 +1,5 @@ /* Copyright (C) 1995, 1996, 1997, 1999, 2000, 2001, 2003, 2004, 2006, - * 2007, 2008, 2009, 2010, 2011, 2012 Free Software Foundation, Inc. + * 2007, 2008, 2009, 2010, 2011, 2012, 2013 Free Software Foundation, Inc. * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public License @@ -1975,9 +1975,10 @@ scm_get_hash_procedure (int c) #define SCM_ENCODING_SEARCH_SIZE (500) -/* Search the first few hundred characters of a file for an Emacs-like coding - declaration. Returns either NULL or a string whose storage has been - allocated with `scm_gc_malloc ()'. */ +/* Search the first few hundred characters of a file for an Emacs-like + coding declaration. Returns either NULL or a string whose storage + has been allocated with `scm_gc_malloc ()'. If a BOM is present, it + is consumed and used in preference to any coding declaration. */ char * scm_i_scan_for_encoding (SCM port) { @@ -1985,7 +1986,6 @@ scm_i_scan_for_encoding (SCM port) char header[SCM_ENCODING_SEARCH_SIZE+1]; size_t bytes_read, encoding_length, i; char *encoding = NULL; - int utf8_bom = 0; char *pos, *encoding_start; int in_comment; @@ -2030,9 +2030,26 @@ scm_i_scan_for_encoding (SCM port) scm_seek (port, scm_from_int (0), scm_from_int (SEEK_SET)); } - if (bytes_read > 3 + /* If there is a byte-order mark, consume it, and use its + encoding. */ + if (bytes_read >= 3 && header[0] == '\xef' && header[1] == '\xbb' && header[2] == '\xbf') - utf8_bom = 1; + { + pt->read_pos += 3; + return "UTF-8"; + } + else if (bytes_read >= 2 + && header[0] == '\xfe' && header[1] == '\xff') + { + pt->read_pos += 2; + return "UTF-16BE"; + } + else if (bytes_read >= 2 + && header[0] == '\xff' && header[1] == '\xfe') + { + pt->read_pos += 2; + return "UTF-16LE"; + } /* search past "coding[:=]" */ pos = header; @@ -2102,11 +2119,6 @@ scm_i_scan_for_encoding (SCM port) /* This wasn't in a comment */ return NULL; - if (utf8_bom && strcmp(encoding, "UTF-8")) - scm_misc_error (NULL, - "the port input declares the encoding ~s but is encoded as UTF-8", - scm_list_1 (scm_from_locale_string (encoding))); - return encoding; } @@ -2117,6 +2129,9 @@ SCM_DEFINE (scm_file_encoding, "file-encoding", 1, 0, 0, "The coding declaration is of the form\n" "@code{coding: XXXXX} and must appear in a scheme comment.\n" "\n" + "If a UTF-8 or UTF-16 BOM is present, it is consumed, and used in\n" + "preference to any coding declaration.\n" + "\n" "Returns a string containing the character encoding of the file\n" "if a declaration was found, or @code{#f} otherwise.\n") #define FUNC_NAME s_scm_file_encoding diff --git a/test-suite/tests/filesys.test b/test-suite/tests/filesys.test index a6bfb6e..ecbb3f1 100644 --- a/test-suite/tests/filesys.test +++ b/test-suite/tests/filesys.test @@ -1,6 +1,6 @@ ;;;; filesys.test --- test file system functions -*- scheme -*- ;;;; -;;;; Copyright (C) 2004, 2006 Free Software Foundation, Inc. +;;;; Copyright (C) 2004, 2006, 2013 Free Software Foundation, Inc. ;;;; ;;;; This library is free software; you can redistribute it and/or ;;;; modify it under the terms of the GNU Lesser General Public @@ -17,6 +17,8 @@ ;;;; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA (define-module (test-suite test-filesys) + #:use-module (ice-9 rdelim) + #:use-module (ice-9 binary-ports) #:use-module (test-suite lib) #:use-module (test-suite guile-test)) @@ -127,3 +129,33 @@ (delete-file (test-file)) (delete-file (test-symlink)) + +(let ((s "\ufeffHello, world!")) + (define (test-encoding encoding) + (with-fluids ((%default-port-encoding "ISO-8859-1")) + (let* ((bytes (catch 'misc-error + (lambda () + (call-with-values open-bytevector-output-port + (lambda (port get-bytevector) + (set-port-encoding! port encoding) + (display s port) + (get-bytevector)))) + (lambda args + (throw 'unresolved)))) + (name (string-copy "myfile-XXXXXX")) + (port (mkstemp! name))) + (put-bytevector port bytes) + (close-port port) + (let ((contents (call-with-input-file name read-string))) + (delete-file name) + (equal? contents + (substring s 1)))))) + + (pass-if "UTF-8" + (test-encoding "UTF-8")) + + (pass-if "UTF-16BE" + (test-encoding "UTF-16BE")) + + (pass-if "UTF-16LE" + (test-encoding "UTF-16LE"))) -- 1.7.10.4 [-- Attachment #3: Type: text/plain, Size: 26 bytes --] -- http://wingolog.org/ ^ permalink raw reply related [flat|nested] 22+ messages in thread
* Re: byte-order marks 2013-01-28 21:42 byte-order marks Andy Wingo @ 2013-01-28 22:20 ` Mike Gran 2013-01-29 9:03 ` Andy Wingo 2013-01-29 8:22 ` Mark H Weaver 2013-01-29 19:22 ` byte-order marks Neil Jerram 2 siblings, 1 reply; 22+ messages in thread From: Mike Gran @ 2013-01-28 22:20 UTC (permalink / raw) To: Andy Wingo, guile-devel > What do people think about this attached patch? > > Andy If you find the word "coding" by scanning 8-bit char by 8-bit char, it can't be UTF-16, since that would be more like "c o d i n g :" with nulls interspersed. While rather unlikely, it is a theoretical possibility that a doc in encodings like ISO-8859-2 through 8859-5 could begin with 0xff 0xfe or 0xfe 0xff. They are valid characters. So if there is a "coding:" line in the doc, I think it should nullify giving precedence to a UTF-16 BOM. -Mike Gran ^ permalink raw reply [flat|nested] 22+ messages in thread
* Re: byte-order marks 2013-01-28 22:20 ` Mike Gran @ 2013-01-29 9:03 ` Andy Wingo 0 siblings, 0 replies; 22+ messages in thread From: Andy Wingo @ 2013-01-29 9:03 UTC (permalink / raw) To: Mike Gran; +Cc: guile-devel On Mon 28 Jan 2013 23:20, Mike Gran <spk121@yahoo.com> writes: > So if there is a "coding:" line in the doc, I think it > should nullify giving precedence to a UTF-16 BOM. OK. Cheers, Andy -- http://wingolog.org/ ^ permalink raw reply [flat|nested] 22+ messages in thread
* Re: byte-order marks 2013-01-28 21:42 byte-order marks Andy Wingo 2013-01-28 22:20 ` Mike Gran @ 2013-01-29 8:22 ` Mark H Weaver 2013-01-29 9:03 ` Andy Wingo 2013-01-29 19:22 ` byte-order marks Neil Jerram 2 siblings, 1 reply; 22+ messages in thread From: Mark H Weaver @ 2013-01-29 8:22 UTC (permalink / raw) To: Andy Wingo; +Cc: guile-devel Hi Andy, Andy Wingo <wingo@pobox.com> writes: > What do people think about this attached patch? I'm strongly opposed to making 'open-input-file' any more clever than it already is. Furthermore, I strongly believe that it should be much less clever than it is now. Our basic textual I/O should be robust by default, and should not second-guess the specified encoding based on flimsy heuristics that work 99% of the time. IMO, our default behavior should allow portable scheme code to write an arbitrary string of characters to a file in some encoding, and later read it back, without having to worry about whether the string starts with something that looks like a BOM, or contains a string that looks like a coding declaration. The string might be from a network, and thus potentially from a malicious source. Frankly, I consider this to be a potential source of security flaws in software built using Guile, and on that basis would advocate removing the existing cleverness from 'open-input-file' in stable-2.0. At the very least it should be removed from master. Regarding byte-order marks, my preference is that users should explictly consume BOMs if that's what they want (ideally using some convenience procedure provided by Guile). Sometimes consuming the BOM is the wrong thing. For example, if the user is copying a file to another file, or to a socket, it may be important to preserve the BOM. If others feel strongly that BOMs should be consumed by default, then the following compromise is about as far as I'd (reluctantly) consider going: * 'open-input-file' could perhaps auto-consume a BOM at the beginning of the stream, but *only* if the BOM is already in the encoding specified by the user (possibly via an explicit call to 'file-encoding'). For example, if the specified port encoding is UTF-8, then EF BB BF would be consumed, but FE FF or FF FE would be left alone. * BOMs absolutely should *not* be used to determine the encoding unless the user has explicitly asked for coding auto-detection. Having said all this, if 'open-input-file' is changed to no longer call 'scm_i_scan_for_file_encoding', then I think it's a fine idea to add BOMs to its list of heuristics, though I tend to agree with Mike that a coding declaration should take precedence, for the reasons he described. However, I strongly believe that 'scm_i_scan_for_file_encoding' is the wrong place to consume BOMs. What do you think? Mark ^ permalink raw reply [flat|nested] 22+ messages in thread
* Re: byte-order marks 2013-01-29 8:22 ` Mark H Weaver @ 2013-01-29 9:03 ` Andy Wingo 2013-01-29 13:27 ` Ludovic Courtès 0 siblings, 1 reply; 22+ messages in thread From: Andy Wingo @ 2013-01-29 9:03 UTC (permalink / raw) To: Mark H Weaver; +Cc: guile-devel Hi Mark, Let me work the other way around, starting at the problem and not a potential solution. There is a file: https://cvs.khronos.org/svn/repos/ogl/trunk/ecosystem/public/sdk/docs/man3/glBlendEquationSeparate.xml It is valid XML. It also has a UTF-8 BOM. It fails to parse in SSAX. The reason is that the XML standard specifies that if an XML file starts with a <?xml ...?>, that the `<' must be the first character. It also recommends in a non-normative section that, for XML files, that the BOM (if any) together with the coding attribute on the XML be used to detect the character encoding. (http://www.w3.org/TR/REC-xml/#sec-guessing). This to me says that, for the purposes of XML, that the BOM is actually outside of the text of the document. And indeed, this does make sense in some way, and given that the Windows world seems to always prepend these marks on their text documents, it is a kind of "container". Anyway, it is someone's responsibility to consume the BOM. So I was thinking: whose? It *can't* be xml->sxml, because it receives a port already, and the BOM should only be interpreted in files from disk, not from e.g. sockets. So it's someone's responsibility outside the XML code. Whose? scm_i_scan_for_file_encoding is only called when opening files from the disk, in textual mode (without the "b" / O_BINARY flag). It seemed a safe place. I agree it's a bit hacky, but we are talking about the BOM here. On Tue 29 Jan 2013 09:22, Mark H Weaver <mhw@netris.org> writes: > IMO, our default behavior should allow portable scheme code to write an > arbitrary string of characters to a file in some encoding, and later > read it back, without having to worry about whether the string starts > with something that looks like a BOM, or contains a string that looks > like a coding declaration. I agree FWIW. > Frankly, I consider this to be a potential source of security flaws in > software built using Guile, and on that basis would advocate removing > the existing cleverness from 'open-input-file' in stable-2.0. At the > very least it should be removed from master. I agree as well. Want to make a patch? > Regarding byte-order marks, my preference is that users should explictly > consume BOMs if that's what they want (ideally using some convenience > procedure provided by Guile). Sometimes consuming the BOM is the wrong > thing. For example, if the user is copying a file to another file, or > to a socket, it may be important to preserve the BOM. If you are copying a binary file, you should use binary APIs. Otherwise you can misinterpret the characters, and potentially write them as a different encoding. Also otherwise, without O_BINARY on Windows, you will end up munging line-ends. So from a portable perspective, reading a file as characters already implies munging the text. If you are copying a textual file, you need to know how to decode it; and in that case a BOM can be helpful. I do not feel strongly about this point however. > If others feel strongly that BOMs should be consumed by default, then > the following compromise is about as far as I'd (reluctantly) consider > going: > > * 'open-input-file' could perhaps auto-consume a BOM at the beginning of > the stream, but *only* if the BOM is already in the encoding specified > by the user (possibly via an explicit call to 'file-encoding'). The problem is that we have no way of knowing what file encoding the user specifies. The encoding could come from the environment, or from some fluid that some other piece of code binds. We are really missing an encoding argument to open-file. > * BOMs absolutely should *not* be used to determine the encoding unless > the user has explicitly asked for coding auto-detection. OK. > Having said all this, if 'open-input-file' is changed to no longer call > 'scm_i_scan_for_file_encoding', then I think it's a fine idea to add > BOMs to its list of heuristics, though I tend to agree with Mike that a > coding declaration should take precedence, for the reasons he described. OK. Incidentally we should relax the scan-for-encoding requirement that the coding be in a comment, as we will begin compiling javascript, lua, etc files in the future. That would perhaps allow XML encodings to be automatically detected as well. > What do you think? I liked that my solution "just worked" with a small amount of code and no changes to the rest of the application. I can't help but think that requiring the user to put in more code is going to infect an endless set of call sites with little "helper" procedures that aren't going to be more correct in aggregate. Andy -- http://wingolog.org/ ^ permalink raw reply [flat|nested] 22+ messages in thread
* Re: byte-order marks 2013-01-29 9:03 ` Andy Wingo @ 2013-01-29 13:27 ` Ludovic Courtès 2013-01-29 14:04 ` Andy Wingo 0 siblings, 1 reply; 22+ messages in thread From: Ludovic Courtès @ 2013-01-29 13:27 UTC (permalink / raw) To: guile-devel Andy Wingo <wingo@pobox.com> skribis: [...] >> Regarding byte-order marks, my preference is that users should explictly >> consume BOMs if that's what they want (ideally using some convenience >> procedure provided by Guile). Sometimes consuming the BOM is the wrong >> thing. For example, if the user is copying a file to another file, or >> to a socket, it may be important to preserve the BOM. > > If you are copying a binary file, you should use binary APIs. Otherwise > you can misinterpret the characters, and potentially write them as a > different encoding. > > Also otherwise, without O_BINARY on Windows, you will end up munging > line-ends. So from a portable perspective, reading a file as > characters already implies munging the text. Agreed. Reading textual data implies interpretation of its byte structure, and the BOM is just part of that meta-data. >> If others feel strongly that BOMs should be consumed by default, then >> the following compromise is about as far as I'd (reluctantly) consider >> going: >> >> * 'open-input-file' could perhaps auto-consume a BOM at the beginning of >> the stream, but *only* if the BOM is already in the encoding specified >> by the user (possibly via an explicit call to 'file-encoding'). > > The problem is that we have no way of knowing what file encoding the > user specifies. The encoding could come from the environment, or from > some fluid that some other piece of code binds. We are really missing > an encoding argument to open-file. Well, ‘%default-port-encoding’ is really an argument to ‘open-file’, though admittedly not a convenient one. However, there’s no way to open a file in binary mode when using ‘open-input-file’, ‘call-with-input-file’, etc. >> Having said all this, if 'open-input-file' is changed to no longer call >> 'scm_i_scan_for_file_encoding', then I think it's a fine idea to add >> BOMs to its list of heuristics, though I tend to agree with Mike that a >> coding declaration should take precedence, for the reasons he described. > > OK. Incidentally we should relax the scan-for-encoding requirement that > the coding be in a comment, as we will begin compiling javascript, lua, > etc files in the future. OTOH, that would make it more likely that the “coding:” sequence is misinterpreted as a coding declaration in contexts that have nothing to do with that. > I liked that my solution "just worked" with a small amount of code and > no changes to the rest of the application. I can't help but think that > requiring the user to put in more code is going to infect an endless set > of call sites with little "helper" procedures that aren't going to be > more correct in aggregate. For textual files, it doesn’t seem unreasonable for ‘open-input-file’ to consume the BOM, IMO. It’s not much different from the ‘eol-style’ transcoders. Ludo’. ^ permalink raw reply [flat|nested] 22+ messages in thread
* Re: byte-order marks 2013-01-29 13:27 ` Ludovic Courtès @ 2013-01-29 14:04 ` Andy Wingo 2013-01-29 17:09 ` Mark H Weaver 0 siblings, 1 reply; 22+ messages in thread From: Andy Wingo @ 2013-01-29 14:04 UTC (permalink / raw) To: Ludovic Courtès; +Cc: guile-devel Hi, [Ludo and Mark and I scribas]: >>> * 'open-input-file' could perhaps auto-consume a BOM at the beginning of >>> the stream, but *only* if the BOM is already in the encoding specified >>> by the user (possibly via an explicit call to 'file-encoding'). >> >> The problem is that we have no way of knowing what file encoding the >> user specifies. The encoding could come from the environment, or from >> some fluid that some other piece of code binds. We are really missing >> an encoding argument to open-file. > > Well, ‘%default-port-encoding’ is really an argument to ‘open-file’, > though admittedly not a convenient one. Dunno :) In the end this reduces to saying "the user always specifies a port encoding". > However, there’s no way to open a file in binary mode when using > ‘open-input-file’, ‘call-with-input-file’, etc. We can add keyword or optional arguments of course. (Not suggesting that we do so at this time though.) >> I liked that my solution "just worked" with a small amount of code and >> no changes to the rest of the application. I can't help but think that >> requiring the user to put in more code is going to infect an endless set >> of call sites with little "helper" procedures that aren't going to be >> more correct in aggregate. > > For textual files, it doesn’t seem unreasonable for ‘open-input-file’ to > consume the BOM, IMO. It’s not much different from the ‘eol-style’ > transcoders. I could go either way. I would prefer for open-input-file to consume a BOM on textual files. But I have another patch that fixes the (sxml simple) problem, so I'm also OK with punting on this issue for now. Andy -- http://wingolog.org/ ^ permalink raw reply [flat|nested] 22+ messages in thread
* Re: byte-order marks 2013-01-29 14:04 ` Andy Wingo @ 2013-01-29 17:09 ` Mark H Weaver 2013-01-29 19:09 ` Mark H Weaver ` (2 more replies) 0 siblings, 3 replies; 22+ messages in thread From: Mark H Weaver @ 2013-01-29 17:09 UTC (permalink / raw) To: Andy Wingo; +Cc: Ludovic Courtès, guile-devel Hi, ludo@gnu.org (Ludovic Courtès) writes: >> For textual files, it doesn’t seem unreasonable for ‘open-input-file’ to >> consume the BOM, IMO. It’s not much different from the ‘eol-style’ >> transcoders. Andy Wingo <wingo@pobox.com> writes: > I could go either way. I would prefer for open-input-file to consume a > BOM on textual files. Having slept on this, I think I agree that 'open-input-file' should auto-consume BOMs. As you say, textual transcoders are already somewhat lossy anyway. If the user wants to preserve such details, they should use binary I/O. However, 'open-input-file' should not auto-detect the encoding by default, and should only consume BOMs that match the specified encoding. 'scm_i_scan_for_file_encoding' should look for (but not consume) BOMs as a last resort, but only if no coding declaration is found. > But I have another patch that fixes the (sxml simple) problem, so I'm > also OK with punting on this issue for now. IMO, BOMs should probably also be consumed by (sxml simple), but again only if the BOM is already in the previously specified encoding. This is to handle the case where the XML is read from a non-file stream whose contents originally comes from a file containing a BOM, e.g. from a web server that losslessly copies a static file to the socket. > [Ludo and Mark and I scribas]: >>>> * 'open-input-file' could perhaps auto-consume a BOM at the beginning of >>>> the stream, but *only* if the BOM is already in the encoding specified >>>> by the user (possibly via an explicit call to 'file-encoding'). >>> >>> The problem is that we have no way of knowing what file encoding the >>> user specifies. The encoding could come from the environment, or from >>> some fluid that some other piece of code binds. We are really missing >>> an encoding argument to open-file. >> >> Well, ‘%default-port-encoding’ is really an argument to ‘open-file’, >> though admittedly not a convenient one. > > Dunno :) In the end this reduces to saying "the user always specifies a > port encoding". A common case, hopefully soon to be nearly ubiquitous, are modern OSes that use UTF-8 locales by default, and where virtually all textual data on the system is encoded using UTF-8. I'd like this to be robust, and not broken by files that contain strings that look like coding declarations. >> However, there’s no way to open a file in binary mode when using >> ‘open-input-file’, ‘call-with-input-file’, etc. > > We can add keyword or optional arguments of course. (Not suggesting > that we do so at this time though.) This has been on my TODO list for a while, and I agree that it would be a good thing. What do you think? Regards, Mark ^ permalink raw reply [flat|nested] 22+ messages in thread
* Re: byte-order marks 2013-01-29 17:09 ` Mark H Weaver @ 2013-01-29 19:09 ` Mark H Weaver 2013-01-29 20:52 ` Ludovic Courtès 2013-01-29 20:53 ` Ludovic Courtès 2013-01-30 9:20 ` Andy Wingo 2 siblings, 1 reply; 22+ messages in thread From: Mark H Weaver @ 2013-01-29 19:09 UTC (permalink / raw) To: Andy Wingo; +Cc: Ludovic Courtès, guile-devel I wrote: > Having slept on this, I think I agree that 'open-input-file' should > auto-consume BOMs. On the other hand, there's a nasty complication. Of course (open-input-file FILENAME) is just (open-file FILENAME "r"), so the auto-consuming logic should be in 'open-file'. So what should (open-file FILENAME "r+") do? The problem is that we don't know if the user will read or write first. If they write first, then they may reasonably assume that what they write will be put at the very beginning of the file, no? Also, Unicode 6.2 section 2.6 table 2-4 says that BOMs are only allowed for the encoding schemes UTF-8, UTF-16, and UTF-32. They are *not* allowed for UTF-16BE, UTF-16LE, UTF-32BE, or UTF-32LE. Unicode 6.2 section 16.8 goes into more detail: For compatibility with versions of the Unicode Standard prior to Version 3.2, the code point U+FEFF has the word-joining semantics of zero width no-break space when it is not used as a BOM. [...] Where the byte order is explicitly specified, such as in UTF-16BE or UTF-16LE, then all U+FEFF characters -- even at the very beginning of the text -- are to be interpreted as zero width no-break spaces. Similarly, where Unicode text has known byte order, initial U+FEFF characters are not required, but for backward compatibility are to be interpreted as zero width no-break spaces. [...] Systems that use the byte order mark must recognize when an initial U+FEFF signals the byte order. In those cases, it is not part of the textual content and should be removed before processing, because otherwise it may be mistaken for a legitimate zero width no-break space. To represent an initial U+FEFF zero width no-break space in a UTF-16 file, use U+FEFF twice in a row. The first one is a byte order mark; the second one is the initial zero width no-break space. [...] This will require some more research and thought. Mark ^ permalink raw reply [flat|nested] 22+ messages in thread
* Re: byte-order marks 2013-01-29 19:09 ` Mark H Weaver @ 2013-01-29 20:52 ` Ludovic Courtès 0 siblings, 0 replies; 22+ messages in thread From: Ludovic Courtès @ 2013-01-29 20:52 UTC (permalink / raw) To: Mark H Weaver; +Cc: Andy Wingo, guile-devel Mark H Weaver <mhw@netris.org> skribis: > I wrote: >> Having slept on this, I think I agree that 'open-input-file' should >> auto-consume BOMs. Good. > So what should (open-file FILENAME "r+") do? What about doing the same as for just “r”? I can’t think of any reasonable scenario where this could be a problem in practice. > Also, Unicode 6.2 section 2.6 table 2-4 says that BOMs are only allowed > for the encoding schemes UTF-8, UTF-16, and UTF-32. They are *not* > allowed for UTF-16BE, UTF-16LE, UTF-32BE, or UTF-32LE. What about this: in ‘open-file’, if %default-port-encoding is one of the BE/LE variants, then skip the BOM logic; otherwise, check the presence of a BOM and consume it? Thanks for investigating! Ludo’. ^ permalink raw reply [flat|nested] 22+ messages in thread
* Re: byte-order marks 2013-01-29 17:09 ` Mark H Weaver 2013-01-29 19:09 ` Mark H Weaver @ 2013-01-29 20:53 ` Ludovic Courtès 2013-01-30 9:20 ` Andy Wingo 2 siblings, 0 replies; 22+ messages in thread From: Ludovic Courtès @ 2013-01-29 20:53 UTC (permalink / raw) To: Mark H Weaver; +Cc: Andy Wingo, guile-devel Mark H Weaver <mhw@netris.org> skribis: >>> However, there’s no way to open a file in binary mode when using >>> ‘open-input-file’, ‘call-with-input-file’, etc. >> >> We can add keyword or optional arguments of course. (Not suggesting >> that we do so at this time though.) > > This has been on my TODO list for a while, and I agree that it would be > a good thing. > > What do you think? +1! (Actually I’d prefer a world where binary mode doesn’t exist at all, but that’s what we have.) Ludo’. ^ permalink raw reply [flat|nested] 22+ messages in thread
* Re: byte-order marks 2013-01-29 17:09 ` Mark H Weaver 2013-01-29 19:09 ` Mark H Weaver 2013-01-29 20:53 ` Ludovic Courtès @ 2013-01-30 9:20 ` Andy Wingo 2013-01-30 21:18 ` Ludovic Courtès 2013-01-31 4:40 ` [PATCHES] Discard BOMs at stream start for UTF-{8,16,32} encodings Mark H Weaver 2 siblings, 2 replies; 22+ messages in thread From: Andy Wingo @ 2013-01-30 9:20 UTC (permalink / raw) To: Mark H Weaver; +Cc: Ludovic Courtès, guile-devel [-- Attachment #1: Type: text/plain, Size: 887 bytes --] Hi, On Tue 29 Jan 2013 18:09, Mark H Weaver <mhw@netris.org> writes: > Having slept on this, I think I agree that 'open-input-file' should > auto-consume BOMs. Patch attached. > However, 'open-input-file' should not auto-detect the encoding by > default, The ball is in your court now :) > and should only consume BOMs that match the specified encoding. What do you mean by "specified encoding"? > 'scm_i_scan_for_file_encoding' should look for (but not consume) BOMs as > a last resort, but only if no coding declaration is found. I removed the BOM handling from this routine entirely. >> But I have another patch that fixes the (sxml simple) problem, so I'm >> also OK with punting on this issue for now. > > IMO, BOMs should probably also be consumed by (sxml simple), but again > only if the BOM is already in the previously specified encoding. I will punt on this one. [-- Warning: decoded text below may be mangled, UTF-8 assumed --] [-- Attachment #2: 0001-detect-and-consume-byte-order-marks-for-textual-port.patch --] [-- Type: text/x-diff, Size: 13043 bytes --] From 5512fe4f93e4e583ab538ae02dd98e5825252dc9 Mon Sep 17 00:00:00 2001 From: Andy Wingo <wingo@pobox.com> Date: Wed, 30 Jan 2013 10:17:25 +0100 Subject: [PATCH] detect and consume byte-order marks for textual ports * libguile/ports.h: * libguile/ports.c (scm_consume_byte_order_mark): New procedure. * libguile/fports.c (scm_open_file): Call consume-byte-order-mark if we are opening a file in "r" mode. * libguile/read.c (scm_i_scan_for_encoding): Don't do anything about byte-order marks. * libguile/load.c (scm_primitive_load): Add a note about the duplicate encoding scan. * test-suite/tests/filesys.test: Add tests for UTF-8, UTF-16BE, and UTF-16LE BOM handling. --- libguile/fports.c | 35 +++++++++-------- libguile/load.c | 3 ++ libguile/ports.c | 85 ++++++++++++++++++++++++++++++++++++++++- libguile/ports.h | 3 +- libguile/read.c | 14 +------ test-suite/tests/filesys.test | 59 +++++++++++++++++++++++++++- 6 files changed, 169 insertions(+), 30 deletions(-) diff --git a/libguile/fports.c b/libguile/fports.c index 10cf671..fbc0530 100644 --- a/libguile/fports.c +++ b/libguile/fports.c @@ -1,5 +1,5 @@ /* Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, - * 2004, 2006, 2007, 2008, 2009, 2010, 2011, 2012 Free Software Foundation, Inc. + * 2004, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013 Free Software Foundation, Inc. * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public License @@ -399,7 +399,7 @@ SCM_DEFINE (scm_open_file, "open-file", 2, 0, 0, #define FUNC_NAME s_scm_open_file { SCM port; - int fdes, flags = 0, use_encoding = 1; + int fdes, flags = 0, scan_for_encoding = 0, consume_bom = 0, binary = 0; unsigned int retries; char *file, *md, *ptr; @@ -415,6 +415,8 @@ SCM_DEFINE (scm_open_file, "open-file", 2, 0, 0, { case 'r': flags |= O_RDONLY; + consume_bom = 1; + scan_for_encoding = 1; break; case 'w': flags |= O_WRONLY | O_CREAT | O_TRUNC; @@ -432,9 +434,12 @@ SCM_DEFINE (scm_open_file, "open-file", 2, 0, 0, { case '+': flags = (flags & ~(O_RDONLY | O_WRONLY)) | O_RDWR; + consume_bom = 0; break; case 'b': - use_encoding = 0; + scan_for_encoding = 0; + consume_bom = 0; + binary = 1; #if defined (O_BINARY) flags |= O_BINARY; #endif @@ -473,21 +478,21 @@ SCM_DEFINE (scm_open_file, "open-file", 2, 0, 0, port = scm_i_fdes_to_port (fdes, scm_i_mode_bits (mode), fport_canonicalize_filename (filename)); - if (use_encoding) - { - /* If this file has a coding declaration, use that as the port - encoding. */ - if (SCM_INPUT_PORT_P (port)) - { - char *enc = scm_i_scan_for_encoding (port); - if (enc != NULL) - scm_i_set_port_encoding_x (port, enc); - } - } - else + if (consume_bom) + scm_consume_byte_order_mark (port); + + if (binary) /* If this is a binary file, use the binary-friendly ISO-8859-1 encoding. */ scm_i_set_port_encoding_x (port, NULL); + else if (scan_for_encoding) + /* If this is an input port and the file has a coding declaration, + use that as the port encoding. */ + { + char *enc = scm_i_scan_for_encoding (port); + if (enc != NULL) + scm_i_set_port_encoding_x (port, enc); + } scm_dynwind_end (); diff --git a/libguile/load.c b/libguile/load.c index 84b6705..476461c 100644 --- a/libguile/load.c +++ b/libguile/load.c @@ -106,6 +106,9 @@ SCM_DEFINE (scm_primitive_load, "primitive-load", 1, 0, 0, scm_dynwind_begin (SCM_F_DYNWIND_REWINDABLE); scm_i_dynwind_current_load_port (port); + /* FIXME: For better or for worse, scm_open_file already scans the + file for an encoding. This scans again; necessary for this + logic, but unnecessary overall. */ encoding = scm_i_scan_for_encoding (port); if (encoding) scm_i_set_port_encoding_x (port, encoding); diff --git a/libguile/ports.c b/libguile/ports.c index 55808e2..9b1be9b 100644 --- a/libguile/ports.c +++ b/libguile/ports.c @@ -1,5 +1,5 @@ /* Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2003, 2004, - * 2006, 2007, 2008, 2009, 2010, 2011, 2012 Free Software Foundation, Inc. + * 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013 Free Software Foundation, Inc. * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public License @@ -2153,6 +2153,89 @@ SCM_DEFINE (scm_set_port_filename_x, "set-port-filename!", 2, 0, 0, } #undef FUNC_NAME +SCM_DEFINE (scm_consume_byte_order_mark, "consume-byte-order-mark", 1, 0, 0, + (SCM port), + "Peek ahead in @var{port} for a byte-order mark (\\uFEFF) encoded\n" + "in UTF-8 or in UTF-16. If found, consume the byte-order mark\n" + "and set the port to the indicated encoding.\n" + "\n" + "As a special case, if the port's encoding is already UTF-16LE\n" + "or UTF-16BE (as opposed to UTF-16), we consider that the user\n" + "has already asked for an explicit byte order. In this case no\n" + "scan is performed, and the byte-order mark (if any) is left in\n" + "the port.\n" + "\n" + "Return @code{#t} if a byte-order mark was consumed, and\n" + "@code{#f} otherwise.") +#define FUNC_NAME s_scm_consume_byte_order_mark +{ + scm_t_port *pt; + const char *enc; + + SCM_VALIDATE_PORT (1, port); + + pt = SCM_PTAB_ENTRY (port); + enc = pt->encoding; + + if (enc && strcasecmp (enc, "UTF-16BE") == 0) + return SCM_BOOL_F; + + if (enc && strcasecmp (enc, "UTF-16LE") == 0) + return SCM_BOOL_F; + + switch (scm_peek_byte_or_eof (port)) + { + case 0xEF: + scm_get_byte_or_eof (port); + switch (scm_peek_byte_or_eof (port)) + { + case 0xBB: + scm_get_byte_or_eof (port); + switch (scm_peek_byte_or_eof (port)) + { + case 0xBF: + scm_get_byte_or_eof (port); + scm_i_set_port_encoding_x (port, "UTF-8"); + return SCM_BOOL_T; + default: + scm_unget_byte (0xBB, port); + scm_unget_byte (0xEF, port); + return SCM_BOOL_F; + } + default: + scm_unget_byte (0xEF, port); + return SCM_BOOL_F; + } + case 0xFE: + scm_get_byte_or_eof (port); + switch (scm_peek_byte_or_eof (port)) + { + case 0xFF: + scm_get_byte_or_eof (port); + scm_i_set_port_encoding_x (port, "UTF-16BE"); + return SCM_BOOL_T; + default: + scm_unget_byte (0xFE, port); + return SCM_BOOL_F; + } + case 0xFF: + scm_get_byte_or_eof (port); + switch (scm_peek_byte_or_eof (port)) + { + case 0xFE: + scm_get_byte_or_eof (port); + scm_i_set_port_encoding_x (port, "UTF-16LE"); + return SCM_BOOL_T; + default: + scm_unget_byte (0xFF, port); + return SCM_BOOL_F; + } + default: + return SCM_BOOL_F; + } +} +#undef FUNC_NAME + /* A fluid specifying the default encoding for newly created ports. If it is a string, that is the encoding. If it is #f, it is in the "native" (Latin-1) encoding. */ diff --git a/libguile/ports.h b/libguile/ports.h index d4d59b7..2f32345 100644 --- a/libguile/ports.h +++ b/libguile/ports.h @@ -4,7 +4,7 @@ #define SCM_PORTS_H /* Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2003, 2004, - * 2006, 2008, 2009, 2010, 2011, 2012 Free Software Foundation, Inc. + * 2006, 2008, 2009, 2010, 2011, 2012, 2013 Free Software Foundation, Inc. * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public License @@ -303,6 +303,7 @@ SCM_API SCM scm_port_column (SCM port); SCM_API SCM scm_set_port_column_x (SCM port, SCM line); SCM_API SCM scm_port_filename (SCM port); SCM_API SCM scm_set_port_filename_x (SCM port, SCM filename); +SCM_API SCM scm_consume_byte_order_mark (SCM port); SCM_INTERNAL const char *scm_i_default_port_encoding (void); SCM_INTERNAL void scm_i_set_default_port_encoding (const char *); SCM_INTERNAL void scm_i_set_port_encoding_x (SCM port, const char *str); diff --git a/libguile/read.c b/libguile/read.c index 222891b..a8f7744 100644 --- a/libguile/read.c +++ b/libguile/read.c @@ -1,5 +1,5 @@ /* Copyright (C) 1995, 1996, 1997, 1999, 2000, 2001, 2003, 2004, 2006, - * 2007, 2008, 2009, 2010, 2011, 2012 Free Software Foundation, Inc. + * 2007, 2008, 2009, 2010, 2011, 2012, 2013 Free Software Foundation, Inc. * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public License @@ -1985,7 +1985,6 @@ scm_i_scan_for_encoding (SCM port) char header[SCM_ENCODING_SEARCH_SIZE+1]; size_t bytes_read, encoding_length, i; char *encoding = NULL; - int utf8_bom = 0; char *pos, *encoding_start; int in_comment; @@ -2027,13 +2026,9 @@ scm_i_scan_for_encoding (SCM port) bytes_read = scm_c_read (port, header, SCM_ENCODING_SEARCH_SIZE); header[bytes_read] = '\0'; - scm_seek (port, scm_from_int (0), scm_from_int (SEEK_SET)); + scm_seek (port, scm_from_int (-bytes_read), scm_from_int (SEEK_CUR)); } - if (bytes_read > 3 - && header[0] == '\xef' && header[1] == '\xbb' && header[2] == '\xbf') - utf8_bom = 1; - /* search past "coding[:=]" */ pos = header; while (1) @@ -2102,11 +2097,6 @@ scm_i_scan_for_encoding (SCM port) /* This wasn't in a comment */ return NULL; - if (utf8_bom && strcmp(encoding, "UTF-8")) - scm_misc_error (NULL, - "the port input declares the encoding ~s but is encoded as UTF-8", - scm_list_1 (scm_from_locale_string (encoding))); - return encoding; } diff --git a/test-suite/tests/filesys.test b/test-suite/tests/filesys.test index a6bfb6e..8bd974d 100644 --- a/test-suite/tests/filesys.test +++ b/test-suite/tests/filesys.test @@ -1,6 +1,6 @@ ;;;; filesys.test --- test file system functions -*- scheme -*- ;;;; -;;;; Copyright (C) 2004, 2006 Free Software Foundation, Inc. +;;;; Copyright (C) 2004, 2006, 2013 Free Software Foundation, Inc. ;;;; ;;;; This library is free software; you can redistribute it and/or ;;;; modify it under the terms of the GNU Lesser General Public @@ -17,6 +17,8 @@ ;;;; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA (define-module (test-suite test-filesys) + #:use-module (ice-9 rdelim) + #:use-module (ice-9 binary-ports) #:use-module (test-suite lib) #:use-module (test-suite guile-test)) @@ -127,3 +129,58 @@ (delete-file (test-file)) (delete-file (test-symlink)) + +(let ((s "\ufeffHello, world!")) + (define* (test-encoding encoding #:optional (ambient "ISO-8859-1")) + (with-fluids ((%default-port-encoding ambient)) + (let* ((bytes (catch 'misc-error + (lambda () + (call-with-values open-bytevector-output-port + (lambda (port get-bytevector) + (set-port-encoding! port encoding) + (display s port) + (get-bytevector)))) + (lambda args + (throw 'unresolved)))) + (name (string-copy "myfile-XXXXXX")) + (port (mkstemp! name))) + (put-bytevector port bytes) + (close-port port) + (let ((contents (call-with-input-file name read-string))) + (delete-file name) + contents)))) + + (pass-if "UTF-8" + (equal? (test-encoding "UTF-8") + "Hello, world!")) + + (pass-if "UTF-16BE" + (equal? (test-encoding "UTF-16BE") + "Hello, world!")) + + (pass-if "UTF-16LE" + (equal? (test-encoding "UTF-16LE") + "Hello, world!")) + + (pass-if "UTF-8 (ambient)" + (equal? (test-encoding "UTF-8" "UTF-8") + "Hello, world!")) + + (pass-if "UTF-8 (UTF-16 ambient)" + (equal? (test-encoding "UTF-8" "UTF-16") + "Hello, world!")) + + ;; Unicode 6.2 section 16.8: + ;; + ;; For compatibility with versions of the Unicode Standard prior to + ;; Version 3.2, the code point U+FEFF has the word-joining semantics + ;; of zero width no-break space when it is not used as a BOM. [...] + ;; + ;; Where the byte order is explicitly specified, such as in UTF-16BE + ;; or UTF-16LE, then all U+FEFF characters -- even at the very + ;; beginning of the text -- are to be interpreted as zero width + ;; no-break spaces. + ;; + (pass-if "UTF-16LE (ambient)" + (equal? (test-encoding "UTF-16LE" "UTF-16LE") + "\ufeffHello, world!"))) -- 1.7.10.4 [-- Attachment #3: Type: text/plain, Size: 26 bytes --] -- http://wingolog.org/ ^ permalink raw reply related [flat|nested] 22+ messages in thread
* Re: byte-order marks 2013-01-30 9:20 ` Andy Wingo @ 2013-01-30 21:18 ` Ludovic Courtès 2013-01-31 8:52 ` Andy Wingo 2013-01-31 4:40 ` [PATCHES] Discard BOMs at stream start for UTF-{8,16,32} encodings Mark H Weaver 1 sibling, 1 reply; 22+ messages in thread From: Ludovic Courtès @ 2013-01-30 21:18 UTC (permalink / raw) To: Andy Wingo; +Cc: Mark H Weaver, guile-devel Hello! Andy Wingo <wingo@pobox.com> skribis: > On Tue 29 Jan 2013 18:09, Mark H Weaver <mhw@netris.org> writes: [...] >> However, 'open-input-file' should not auto-detect the encoding by >> default, > > The ball is in your court now :) Can we discuss this one in the other thread, so my little brain and mailbox don’t get confused? :-) > From 5512fe4f93e4e583ab538ae02dd98e5825252dc9 Mon Sep 17 00:00:00 2001 > From: Andy Wingo <wingo@pobox.com> > Date: Wed, 30 Jan 2013 10:17:25 +0100 > Subject: [PATCH] detect and consume byte-order marks for textual ports > > * libguile/ports.h: > * libguile/ports.c (scm_consume_byte_order_mark): New procedure. > > * libguile/fports.c (scm_open_file): Call consume-byte-order-mark if we > are opening a file in "r" mode. > > * libguile/read.c (scm_i_scan_for_encoding): Don't do anything about > byte-order marks. > > * libguile/load.c (scm_primitive_load): Add a note about the duplicate > encoding scan. > > * test-suite/tests/filesys.test: Add tests for UTF-8, UTF-16BE, and > UTF-16LE BOM handling. Looks good to me. Thanks! Ludo’. ^ permalink raw reply [flat|nested] 22+ messages in thread
* Re: byte-order marks 2013-01-30 21:18 ` Ludovic Courtès @ 2013-01-31 8:52 ` Andy Wingo 0 siblings, 0 replies; 22+ messages in thread From: Andy Wingo @ 2013-01-31 8:52 UTC (permalink / raw) To: Ludovic Courtès; +Cc: Mark H Weaver, guile-devel On Wed 30 Jan 2013 22:18, ludo@gnu.org (Ludovic Courtès) writes: > Can we discuss this one in the other thread, so my little brain and > mailbox don’t get confused? :-) > >> From 5512fe4f93e4e583ab538ae02dd98e5825252dc9 Mon Sep 17 00:00:00 2001 >> From: Andy Wingo <wingo@pobox.com> >> Date: Wed, 30 Jan 2013 10:17:25 +0100 >> Subject: [PATCH] detect and consume byte-order marks for textual ports >> >> * libguile/ports.h: >> * libguile/ports.c (scm_consume_byte_order_mark): New procedure. >> >> * libguile/fports.c (scm_open_file): Call consume-byte-order-mark if we >> are opening a file in "r" mode. >> >> * libguile/read.c (scm_i_scan_for_encoding): Don't do anything about >> byte-order marks. >> >> * libguile/load.c (scm_primitive_load): Add a note about the duplicate >> encoding scan. >> >> * test-suite/tests/filesys.test: Add tests for UTF-8, UTF-16BE, and >> UTF-16LE BOM handling. > > Looks good to me. It's terribly confusing, I'm sorry. I accidentally pushed this patch, then later pushed a reversion. Apologies for the mess. I think Mark's patches are the state of the art, off to review them now... Andy -- http://wingolog.org/ ^ permalink raw reply [flat|nested] 22+ messages in thread
* [PATCHES] Discard BOMs at stream start for UTF-{8,16,32} encodings 2013-01-30 9:20 ` Andy Wingo 2013-01-30 21:18 ` Ludovic Courtès @ 2013-01-31 4:40 ` Mark H Weaver 2013-01-31 9:39 ` Andy Wingo 2013-01-31 21:42 ` Ludovic Courtès 1 sibling, 2 replies; 22+ messages in thread From: Mark H Weaver @ 2013-01-31 4:40 UTC (permalink / raw) To: Andy Wingo; +Cc: Ludovic Courtès, guile-devel [-- Attachment #1: Type: text/plain, Size: 2448 bytes --] Hello all, I researched this some more, and discovered that removal of byte-order marks (BOMs) is the responsibility of iconv, which discards BOMs from the beginning of streams when using the UTF-16 or UTF-32 encodings, but *not* for UTF-16LE, UTF-16GE, UTF-32LE, UTF-32GE or any other encoding. It uses the BOM to determine the endianness of the stream, but other than that does *not* use it to guess the encoding, so there's no guesswork involved. (Side note: iconv also inserts a BOM automatically when writing a stream using UTF-16 or UTF-32). Note that the discarding of BOMs does *not* happen when opening a port, but instead when reading from a port for the first time (or after calling 'set-port-encoding!', since that creates a fresh iconv descriptor). This is important for a few reasons: It solves the nasty complication of (open-file NAME "r+"), it works for any kind of stream (not just files), and it does not block waiting for input immediately after opening a port. So thanks to iconv, we get UTF-{16,32} BOM removal for free. Unfortunately we have a nasty bug in 'get_iconv_codepoint' that leads to a buffer overrun and assertion failure when 'iconv' discards a BOM. It can be triggered as follows: (use-modules (rnrs io ports)) (let ((port (open-bytevector-input-port #vu8(255 254 97 0)))) (set-port-encoding! port "UTF-16") (read-char port)) The first patch below fixes this problem. I ended up almost completely rewriting that function, partly because it was largely structured around a mistaken assumption that iconv will never consume input without producing output, and partly because it was quite inefficient (several unnecessary conditional branches in the loop) and IMO was rather difficult to read. So what about UTF-8? Since we handle UTF-8 internally, we have to handle UTF-8 BOM removal ourselves. The second patch implements this with the same semantics as iconv does for UTF-{16,32}. It was a bit tricky because we need to keep track of whether we're at the beginning of the stream, and we cannot add any more fields to scm_t_port in stable-2.0. I handled this by adding some more special values for the iconv descriptor pointers. As a bonus, we no longer call 'scm_i_set_port_encoding_x' once per character when using UTF-8 :) The third patch removes the byte-order mark check from scm_i_scan_for_encoding, following Andy's suggestion. Comments and suggestions solicited. Mark [-- Warning: decoded text below may be mangled, UTF-8 assumed --] [-- Attachment #2: [PATCH 1/3] Rewrite get_iconv_codepoint to fix a bug involving byte-order marks --] [-- Type: text/x-diff, Size: 9533 bytes --] From ceccaf59267bc98c86aae33809905f26b017ebc8 Mon Sep 17 00:00:00 2001 From: Mark H Weaver <mhw@netris.org> Date: Wed, 30 Jan 2013 10:16:37 -0500 Subject: [PATCH 1/3] Rewrite get_iconv_codepoint to fix a bug involving byte-order marks. * libguile/ports.c (get_iconv_codepoint): Rewrite to fix a bug and improve efficiency and clarity. Previously, it incorrectly assumed that iconv would never consume input without producing output, which led to a buffer overrun and subsequent assertion failure. This happens when a byte-order mark is consumed by iconv at the beginning of the stream when using the UTF-16 or UTF-32 encodings. * test-suite/tests/ports.test (unicode byte-order marks (BOMs)): Add tests. --- libguile/ports.c | 95 ++++++++++++++++++++++-------------------- test-suite/tests/ports.test | 96 ++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 145 insertions(+), 46 deletions(-) diff --git a/libguile/ports.c b/libguile/ports.c index 55808e2..a1d6c96 100644 --- a/libguile/ports.c +++ b/libguile/ports.c @@ -1,5 +1,5 @@ -/* Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2003, 2004, - * 2006, 2007, 2008, 2009, 2010, 2011, 2012 Free Software Foundation, Inc. +/* Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2003, 2004, 2006, + * 2007, 2008, 2009, 2010, 2011, 2012, 2013 Free Software Foundation, Inc. * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public License @@ -1292,66 +1292,73 @@ static int get_iconv_codepoint (SCM port, scm_t_wchar *codepoint, char buf[SCM_MBCHAR_BUF_SIZE], size_t *len) { - scm_t_port *pt; - int err, byte_read; - size_t bytes_consumed, output_size; - char *output; + scm_t_port *pt = SCM_PTAB_ENTRY (port); scm_t_uint8 utf8_buf[SCM_MBCHAR_BUF_SIZE]; + size_t input_size = 0; - pt = SCM_PTAB_ENTRY (port); - - for (output_size = 0, output = (char *) utf8_buf, - bytes_consumed = 0, err = 0; - err == 0 && output_size == 0 - && (bytes_consumed == 0 || byte_read != EOF); - bytes_consumed++) + for (;;) { - char *input; + int byte_read; + char *input, *output; size_t input_left, output_left, done; byte_read = scm_get_byte_or_eof (port); - if (byte_read == EOF) + if (SCM_UNLIKELY (byte_read == EOF)) { - if (bytes_consumed == 0) - { - *codepoint = (scm_t_wchar) EOF; - *len = 0; - return 0; - } - else - continue; + if (SCM_LIKELY (input_size == 0)) + { + *codepoint = (scm_t_wchar) EOF; + *len = input_size; + return 0; + } + else + /* EOF found in the middle of a multibyte character. */ + return EILSEQ; } - buf[bytes_consumed] = byte_read; + buf[input_size++] = byte_read; input = buf; - input_left = bytes_consumed + 1; + input_left = input_size; + output = (char *) utf8_buf; output_left = sizeof (utf8_buf); - done = iconv (pt->input_cd, &input, &input_left, - &output, &output_left); + done = iconv (pt->input_cd, &input, &input_left, &output, &output_left); + if (done == (size_t) -1) { - err = errno; - if (err == EINVAL) - /* Missing input: keep trying. */ - err = 0; + int err = errno; + if (SCM_LIKELY (err == EINVAL)) + /* The input byte sequence did not form a complete + character. Read another byte and try again. */ + continue; + else + return err; } else - output_size = sizeof (utf8_buf) - output_left; - } - - if (SCM_UNLIKELY (output_size == 0)) - /* An unterminated sequence. */ - err = EILSEQ; - else if (SCM_LIKELY (err == 0)) - { - /* Convert the UTF8_BUF sequence to a Unicode code point. */ - *codepoint = utf8_to_codepoint (utf8_buf, output_size); - *len = bytes_consumed; + { + size_t output_size = sizeof (utf8_buf) - output_left; + if (SCM_LIKELY (output_size > 0)) + { + /* iconv generated output. Convert the UTF8_BUF sequence + to a Unicode code point. */ + *codepoint = utf8_to_codepoint (utf8_buf, output_size); + *len = input_size; + return 0; + } + else + { + /* iconv consumed some bytes without producing any output. + Most likely this means that a Unicode byte-order mark + (BOM) was consumed, which should not be included in the + returned buf. Shift any remaining bytes to the beginning + of buf, and continue the loop. */ + memcpy (buf, input, input_left); + input_size = input_left; + continue; + } + } } - - return err; } /* Read a codepoint from PORT and return it in *CODEPOINT. Fill BUF diff --git a/test-suite/tests/ports.test b/test-suite/tests/ports.test index 613d269..38044c6 100644 --- a/test-suite/tests/ports.test +++ b/test-suite/tests/ports.test @@ -2,7 +2,7 @@ ;;;; Jim Blandy <jimb@red-bean.com> --- May 1999 ;;;; ;;;; Copyright (C) 1999, 2001, 2004, 2006, 2007, 2009, 2010, -;;;; 2011, 2012 Free Software Foundation, Inc. +;;;; 2011, 2012, 2013 Free Software Foundation, Inc. ;;;; ;;;; This library is free software; you can redistribute it and/or ;;;; modify it under the terms of the GNU Lesser General Public @@ -24,7 +24,7 @@ #:use-module (ice-9 popen) #:use-module (ice-9 rdelim) #:use-module (rnrs bytevectors) - #:use-module ((rnrs io ports) #:select (open-bytevector-input-port))) + #:use-module ((ice-9 binary-ports) #:select (open-bytevector-input-port))) (define (display-line . args) (for-each display args) @@ -1149,6 +1149,98 @@ \f +(define (bv-read-test encoding bv) + (let ((port (open-bytevector-input-port bv))) + (set-port-encoding! port encoding) + (read-string port))) + +(with-test-prefix "unicode byte-order marks (BOMs)" + + (pass-if-equal "BOM not discarded from Latin-1 stream" + "\xEF\xBB\xBF\x61" + (bv-read-test "ISO-8859-1" #vu8(#xEF #xBB #xBF #x61))) + + (pass-if-equal "BOM not discarded from Latin-2 stream" + "\u010F\u0165\u017C\x61" + (bv-read-test "ISO-8859-2" #vu8(#xEF #xBB #xBF #x61))) + + (pass-if-equal "BOM not discarded from UTF-16BE stream" + "\uFEFF\x61" + (bv-read-test "UTF-16BE" #vu8(#xFE #xFF #x00 #x61))) + + (pass-if-equal "BOM not discarded from UTF-16LE stream" + "\uFEFF\x61" + (bv-read-test "UTF-16LE" #vu8(#xFF #xFE #x61 #x00))) + + (pass-if-equal "BOM not discarded from UTF-32BE stream" + "\uFEFF\x61" + (bv-read-test "UTF-32BE" #vu8(#x00 #x00 #xFE #xFF + #x00 #x00 #x00 #x61))) + + (pass-if-equal "BOM not discarded from UTF-32LE stream" + "\uFEFF\x61" + (bv-read-test "UTF-32LE" #vu8(#xFF #xFE #x00 #x00 + #x61 #x00 #x00 #x00))) + + (pass-if-equal "BOM discarded from start of UTF-16 stream (BE)" + "a" + (bv-read-test "UTF-16" #vu8(#xFE #xFF #x00 #x61))) + + (pass-if-equal "Only one BOM discarded from start of UTF-16 stream (BE)" + "\uFEFFa" + (bv-read-test "UTF-16" #vu8(#xFE #xFF #xFE #xFF #x00 #x61))) + + (pass-if-equal "BOM not discarded unless at start of UTF-16 stream" + "a\uFEFFb" + (let ((be (bv-read-test "UTF-16" #vu8(#x00 #x61 #xFE #xFF #x00 #x62))) + (le (bv-read-test "UTF-16" #vu8(#x61 #x00 #xFF #xFE #x62 #x00)))) + (if (char=? #\a (string-ref be 1)) + be + le))) + + (pass-if-equal "BOM discarded from start of UTF-16 stream (LE)" + "a" + (bv-read-test "UTF-16" #vu8(#xFF #xFE #x61 #x00))) + + (pass-if-equal "Only one BOM discarded from start of UTF-16 stream (LE)" + "\uFEFFa" + (bv-read-test "UTF-16" #vu8(#xFF #xFE #xFF #xFE #x61 #x00))) + + (pass-if-equal "BOM discarded from start of UTF-32 stream (BE)" + "a" + (bv-read-test "UTF-32" #vu8(#x00 #x00 #xFE #xFF #x00 #x00 #x00 #x61))) + + (pass-if-equal "Only one BOM discarded from start of UTF-32 stream (BE)" + "\uFEFFa" + (bv-read-test "UTF-32" #vu8(#x00 #x00 #xFE #xFF + #x00 #x00 #xFE #xFF + #x00 #x00 #x00 #x61))) + + (pass-if-equal "BOM not discarded unless at start of UTF-32 stream" + "a\uFEFFb" + (let ((be (bv-read-test "UTF-32" #vu8(#x00 #x00 #x00 #x61 + #x00 #x00 #xFE #xFF + #x00 #x00 #x00 #x62))) + (le (bv-read-test "UTF-32" #vu8(#x61 #x00 #x00 #x00 + #xFF #xFE #x00 #x00 + #x62 #x00 #x00 #x00)))) + (if (char=? #\a (string-ref be 1)) + be + le))) + + (pass-if-equal "BOM discarded from start of UTF-32 stream (LE)" + "a" + (bv-read-test "UTF-32" #vu8(#xFF #xFE #x00 #x00 + #x61 #x00 #x00 #x00))) + + (pass-if-equal "Only one BOM discarded from start of UTF-32 stream (LE)" + "\uFEFFa" + (bv-read-test "UTF-32" #vu8(#xFF #xFE #x00 #x00 + #xFF #xFE #x00 #x00 + #x61 #x00 #x00 #x00)))) + +\f + (define-syntax-rule (with-load-path path body ...) (let ((new path) (old %load-path)) -- 1.7.10.4 [-- Warning: decoded text below may be mangled, UTF-8 assumed --] [-- Attachment #3: [PATCH 2/3] Discard UTF-8 byte-order marks at stream start, and improve efficiency --] [-- Type: text/x-diff, Size: 7618 bytes --] From 65e0cca752e005d75c8eade1c92f084a8518f209 Mon Sep 17 00:00:00 2001 From: Mark H Weaver <mhw@netris.org> Date: Wed, 30 Jan 2013 12:21:20 -0500 Subject: [PATCH 2/3] Discard UTF-8 byte-order marks at stream start, and improve efficiency. * libguile/ports.c (SCM_ICONV_UNINITIALIZED, SCM_ICONV_UTF8_AT_START, SCM_ICONV_UTF8_NOT_AT_START, SCM_ICONV_SPECIAL_P, SCM_UNICODE_BOM): New macros. (finalize_port): Use new macros, and black hole the iconv descriptor pointers after closing them. (scm_new_port_table_entry, scm_i_remove_port, get_codepoint): Use new macros. (get_utf8_codepoint): Discard unicode byte-order marks at stream start. After reading a code point, record the fact that we are no longer at the stream start. (scm_i_set_port_encoding_x): Use new macros. Eliminate extraneous test. * test-suite/tests/ports.test (unicode byte-order marks (BOMs)): Add tests. --- libguile/ports.c | 79 ++++++++++++++++++++++++++++++------------- test-suite/tests/ports.test | 12 +++++++ 2 files changed, 67 insertions(+), 24 deletions(-) diff --git a/libguile/ports.c b/libguile/ports.c index a1d6c96..438a15b 100644 --- a/libguile/ports.c +++ b/libguile/ports.c @@ -89,6 +89,16 @@ #define HAVE_FTRUNCATE 1 #endif +/* Special values for the input_cd and output_cd fields of the + scm_t_port structure. FIXME: Use a nicer representation for Guile + 2.1, when we can add more fields to the scm_t_port structure. */ +#define SCM_ICONV_UNINITIALIZED ((iconv_t) -1) +#define SCM_ICONV_UTF8_AT_START ((iconv_t) -3) +#define SCM_ICONV_UTF8_NOT_AT_START ((iconv_t) -4) +#define SCM_ICONV_SPECIAL_P(cd) ((cd) >= (iconv_t) -4) + +#define SCM_UNICODE_BOM 0xFEFF /* Unicode byte-order mark */ + \f /* The port kind table --- a dynamically resized array of port types. */ @@ -585,10 +595,13 @@ finalize_port (void *ptr, void *data) entry = SCM_PTAB_ENTRY (port); - if (entry->input_cd != (iconv_t) -1) + if (!SCM_ICONV_SPECIAL_P (entry->input_cd)) iconv_close (entry->input_cd); - if (entry->output_cd != (iconv_t) -1) + entry->input_cd = SCM_ICONV_UNINITIALIZED; + + if (!SCM_ICONV_SPECIAL_P (entry->output_cd)) iconv_close (entry->output_cd); + entry->output_cd = SCM_ICONV_UNINITIALIZED; SCM_SETSTREAM (port, 0); SCM_CLR_PORT_OPEN_FLAG (port); @@ -626,8 +639,8 @@ scm_new_port_table_entry (scm_t_bits tag) entry->encoding = enc ? scm_gc_strdup (enc, "port") : NULL; /* The conversion descriptors will be opened lazily. */ - entry->input_cd = (iconv_t) -1; - entry->output_cd = (iconv_t) -1; + entry->input_cd = SCM_ICONV_UNINITIALIZED; + entry->output_cd = SCM_ICONV_UNINITIALIZED; entry->ilseq_handler = scm_i_default_port_conversion_handler (); @@ -682,17 +695,13 @@ scm_i_remove_port (SCM port) p->putback_buf = NULL; p->putback_buf_size = 0; - if (p->input_cd != (iconv_t) -1) - { - iconv_close (p->input_cd); - p->input_cd = (iconv_t) -1; - } - - if (p->output_cd != (iconv_t) -1) - { - iconv_close (p->output_cd); - p->output_cd = (iconv_t) -1; - } + if (!SCM_ICONV_SPECIAL_P (p->input_cd)) + iconv_close (p->input_cd); + p->input_cd = SCM_ICONV_UNINITIALIZED; + + if (!SCM_ICONV_SPECIAL_P (p->output_cd)) + iconv_close (p->output_cd); + p->output_cd = SCM_ICONV_UNINITIALIZED; SCM_SETPTAB_ENTRY (port, 0); @@ -1202,6 +1211,8 @@ get_utf8_codepoint (SCM port, scm_t_wchar *codepoint, } else if ((buf[0] & 0xf0) == 0xe0) { + scm_t_wchar code_pt; + /* 3-byte form. */ byte = scm_peek_byte_or_eof (port); ASSERT_NOT_EOF (byte); @@ -1225,9 +1236,21 @@ get_utf8_codepoint (SCM port, scm_t_wchar *codepoint, buf[2] = (scm_t_uint8) byte; *len = 3; - *codepoint = ((scm_t_wchar) buf[0] & 0x0f) << 12UL + code_pt = ((scm_t_wchar) buf[0] & 0x0f) << 12UL | ((scm_t_wchar) buf[1] & 0x3f) << 6UL | (buf[2] & 0x3f); + + if (SCM_UNLIKELY (code_pt == SCM_UNICODE_BOM + && pt->input_cd == SCM_ICONV_UTF8_AT_START)) + { + /* Discard a Unicode byte-order mark (BOM) at the beginning of + the stream. Record the fact that we are no longer at the + beginning, and read another code point. */ + pt->input_cd = SCM_ICONV_UTF8_NOT_AT_START; + return get_utf8_codepoint (port, codepoint, buf, len); + } + + *codepoint = code_pt; } else if (buf[0] >= 0xf0 && buf[0] <= 0xf4) { @@ -1272,6 +1295,9 @@ get_utf8_codepoint (SCM port, scm_t_wchar *codepoint, else goto invalid_seq; + /* Record the fact that we are no longer at the beginning of the + stream, so that future byte-order marks will not be discarded. */ + pt->input_cd = SCM_ICONV_UTF8_NOT_AT_START; return 0; invalid_seq: @@ -1372,12 +1398,13 @@ get_codepoint (SCM port, scm_t_wchar *codepoint, int err; scm_t_port *pt = SCM_PTAB_ENTRY (port); - if (pt->input_cd == (iconv_t) -1) + if (pt->input_cd == SCM_ICONV_UNINITIALIZED) /* Initialize the conversion descriptors, if needed. */ scm_i_set_port_encoding_x (port, pt->encoding); - /* FIXME: In 2.1, add a flag to determine whether a port is UTF-8. */ - if (pt->input_cd == (iconv_t) -1) + /* NOTE: The following test assumes that the only special values + (other than SCM_ICONV_UNINITIALIZED) are for UTF-8. */ + if (SCM_ICONV_SPECIAL_P (pt->input_cd)) err = get_utf8_codepoint (port, codepoint, (scm_t_uint8 *) buf, len); else err = get_iconv_codepoint (port, codepoint, buf, len); @@ -2228,7 +2255,12 @@ scm_i_set_port_encoding_x (SCM port, const char *encoding) /* If ENCODING is UTF-8, then no conversion descriptor is opened because we do I/O ourselves. This saves 100+ KiB for each descriptor. */ - if (strcmp (encoding, "UTF-8")) + if (strcmp (encoding, "UTF-8") == 0) + { + new_input_cd = SCM_ICONV_UTF8_AT_START; + new_output_cd = SCM_ICONV_UNINITIALIZED; + } + else { if (SCM_CELL_WORD_0 (port) & SCM_RDNG) { @@ -2247,16 +2279,15 @@ scm_i_set_port_encoding_x (SCM port, const char *encoding) new_output_cd = iconv_open (encoding, "UTF-8"); if (new_output_cd == (iconv_t) -1) { - if (new_input_cd != (iconv_t) -1) - iconv_close (new_input_cd); + iconv_close (new_input_cd); goto invalid_encoding; } } } - if (pt->input_cd != (iconv_t) -1) + if (!SCM_ICONV_SPECIAL_P (pt->input_cd)) iconv_close (pt->input_cd); - if (pt->output_cd != (iconv_t) -1) + if (!SCM_ICONV_SPECIAL_P (pt->output_cd)) iconv_close (pt->output_cd); pt->input_cd = new_input_cd; diff --git a/test-suite/tests/ports.test b/test-suite/tests/ports.test index 38044c6..65c3b3f 100644 --- a/test-suite/tests/ports.test +++ b/test-suite/tests/ports.test @@ -1182,6 +1182,18 @@ (bv-read-test "UTF-32LE" #vu8(#xFF #xFE #x00 #x00 #x61 #x00 #x00 #x00))) + (pass-if-equal "BOM discarded from start of UTF-8 stream" + "a" + (bv-read-test "UTF-8" #vu8(#xEF #xBB #xBF #x61))) + + (pass-if-equal "Only one BOM discarded from start of UTF-8 stream" + "\uFEFFa" + (bv-read-test "UTF-8" #vu8(#xEF #xBB #xBF #xEF #xBB #xBF #x61))) + + (pass-if-equal "BOM not discarded unless at start of UTF-8 stream" + "a\uFEFFb" + (bv-read-test "UTF-8" #vu8(#x61 #xEF #xBB #xBF #x62))) + (pass-if-equal "BOM discarded from start of UTF-16 stream (BE)" "a" (bv-read-test "UTF-16" #vu8(#xFE #xFF #x00 #x61))) -- 1.7.10.4 [-- Warning: decoded text below may be mangled, UTF-8 assumed --] [-- Attachment #4: [PATCH 3/3] Remove byte-order mark check from scm_i_scan_for_encoding --] [-- Type: text/x-diff, Size: 1891 bytes --] From 1e4dde890b0a9a80b26f78d5c82b8ffac9e47689 Mon Sep 17 00:00:00 2001 From: Mark H Weaver <mhw@netris.org> Date: Wed, 30 Jan 2013 14:22:00 -0500 Subject: [PATCH 3/3] Remove byte-order mark check from scm_i_scan_for_encoding. * libguile/read.c (scm_i_scan_for_encoding): Remove byte-order mark check. --- libguile/read.c | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/libguile/read.c b/libguile/read.c index 222891b..75cf2cc 100644 --- a/libguile/read.c +++ b/libguile/read.c @@ -1,5 +1,5 @@ -/* Copyright (C) 1995, 1996, 1997, 1999, 2000, 2001, 2003, 2004, 2006, - * 2007, 2008, 2009, 2010, 2011, 2012 Free Software Foundation, Inc. +/* Copyright (C) 1995, 1996, 1997, 1999, 2000, 2001, 2003, 2004, 2006, 2007, + * 2008, 2009, 2010, 2011, 2012, 2013 Free Software Foundation, Inc. * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public License @@ -1985,7 +1985,6 @@ scm_i_scan_for_encoding (SCM port) char header[SCM_ENCODING_SEARCH_SIZE+1]; size_t bytes_read, encoding_length, i; char *encoding = NULL; - int utf8_bom = 0; char *pos, *encoding_start; int in_comment; @@ -2030,10 +2029,6 @@ scm_i_scan_for_encoding (SCM port) scm_seek (port, scm_from_int (0), scm_from_int (SEEK_SET)); } - if (bytes_read > 3 - && header[0] == '\xef' && header[1] == '\xbb' && header[2] == '\xbf') - utf8_bom = 1; - /* search past "coding[:=]" */ pos = header; while (1) @@ -2102,11 +2097,6 @@ scm_i_scan_for_encoding (SCM port) /* This wasn't in a comment */ return NULL; - if (utf8_bom && strcmp(encoding, "UTF-8")) - scm_misc_error (NULL, - "the port input declares the encoding ~s but is encoded as UTF-8", - scm_list_1 (scm_from_locale_string (encoding))); - return encoding; } -- 1.7.10.4 ^ permalink raw reply related [flat|nested] 22+ messages in thread
* Re: [PATCHES] Discard BOMs at stream start for UTF-{8,16,32} encodings 2013-01-31 4:40 ` [PATCHES] Discard BOMs at stream start for UTF-{8,16,32} encodings Mark H Weaver @ 2013-01-31 9:39 ` Andy Wingo 2013-01-31 10:33 ` Andy Wingo 2013-01-31 21:42 ` Ludovic Courtès 1 sibling, 1 reply; 22+ messages in thread From: Andy Wingo @ 2013-01-31 9:39 UTC (permalink / raw) To: Mark H Weaver; +Cc: Ludovic Courtès, guile-devel Hi Mark, On Thu 31 Jan 2013 05:40, Mark H Weaver <mhw@netris.org> writes: > From ceccaf59267bc98c86aae33809905f26b017ebc8 Mon Sep 17 00:00:00 2001 > From: Mark H Weaver <mhw@netris.org> > Date: Wed, 30 Jan 2013 10:16:37 -0500 > Subject: [PATCH 1/3] Rewrite get_iconv_codepoint to fix a bug involving > byte-order marks. LGTM, thanks! > From 65e0cca752e005d75c8eade1c92f084a8518f209 Mon Sep 17 00:00:00 2001 > From: Mark H Weaver <mhw@netris.org> > Date: Wed, 30 Jan 2013 12:21:20 -0500 > Subject: [PATCH 2/3] Discard UTF-8 byte-order marks at stream start, and > improve efficiency. > > * libguile/ports.c (SCM_ICONV_UNINITIALIZED, SCM_ICONV_UTF8_AT_START, > SCM_ICONV_UTF8_NOT_AT_START, SCM_ICONV_SPECIAL_P, SCM_UNICODE_BOM): > New macros. I would prefer a different name other than "special". Perhaps reverse the test to be SCM_ICONV_DESCRIPTOR_OPEN_P or something. > @@ -1202,6 +1211,8 @@ get_utf8_codepoint (SCM port, scm_t_wchar *codepoint, > } > else if ((buf[0] & 0xf0) == 0xe0) > { > + scm_t_wchar code_pt; > + > /* 3-byte form. */ > byte = scm_peek_byte_or_eof (port); > ASSERT_NOT_EOF (byte); Call it "codepoint_or_bom" perhaps; otherwise *codepoint = code_pt is too confusing. The patch looks good to me in general but there is one problem, related to seeks. If you seek back to 0, the iconv descriptors are not re-set. Perhaps seeks should flush the iconv descriptors, if any, and re-set the UTF-8 state to "at start". Thoughts? > From 1e4dde890b0a9a80b26f78d5c82b8ffac9e47689 Mon Sep 17 00:00:00 2001 > From: Mark H Weaver <mhw@netris.org> > Date: Wed, 30 Jan 2013 14:22:00 -0500 > Subject: [PATCH 3/3] Remove byte-order mark check from > scm_i_scan_for_encoding. Looks good to me. Thanks for working on this! Andy -- http://wingolog.org/ ^ permalink raw reply [flat|nested] 22+ messages in thread
* Re: [PATCHES] Discard BOMs at stream start for UTF-{8,16,32} encodings 2013-01-31 9:39 ` Andy Wingo @ 2013-01-31 10:33 ` Andy Wingo 2013-01-31 18:01 ` [PATCHES] Discard BOMs at stream start for UTF-{8, 16, 32} encodings Mark H Weaver 0 siblings, 1 reply; 22+ messages in thread From: Andy Wingo @ 2013-01-31 10:33 UTC (permalink / raw) To: Mark H Weaver; +Cc: Ludovic Courtès, guile-devel One more thing, Mark; would you mind handling merges to master? The port code got rearranged there and merges can be tricky. I'll merge stable-2.0 as it is first and then you can just deal with these patches. Andy -- http://wingolog.org/ ^ permalink raw reply [flat|nested] 22+ messages in thread
* Re: [PATCHES] Discard BOMs at stream start for UTF-{8, 16, 32} encodings 2013-01-31 10:33 ` Andy Wingo @ 2013-01-31 18:01 ` Mark H Weaver 0 siblings, 0 replies; 22+ messages in thread From: Mark H Weaver @ 2013-01-31 18:01 UTC (permalink / raw) To: Andy Wingo; +Cc: Ludovic Courtès, guile-devel Andy Wingo <wingo@pobox.com> writes: > One more thing, Mark; would you mind handling merges to master? The > port code got rearranged there and merges can be tricky. Sounds good, but first I need to think a bit more about seeking, and also about an additional complication regarding the possibility that the input and output iconv descriptors will internally choose different endiannesses for the UTF-16 or UTF-32 encodings (a problem which already exists today). Thanks! Mark ^ permalink raw reply [flat|nested] 22+ messages in thread
* Re: [PATCHES] Discard BOMs at stream start for UTF-{8, 16, 32} encodings 2013-01-31 4:40 ` [PATCHES] Discard BOMs at stream start for UTF-{8,16,32} encodings Mark H Weaver 2013-01-31 9:39 ` Andy Wingo @ 2013-01-31 21:42 ` Ludovic Courtès 1 sibling, 0 replies; 22+ messages in thread From: Ludovic Courtès @ 2013-01-31 21:42 UTC (permalink / raw) To: Mark H Weaver; +Cc: Andy Wingo, guile-devel Hi! Mark H Weaver <mhw@netris.org> skribis: > I researched this some more, and discovered that removal of byte-order > marks (BOMs) is the responsibility of iconv, which discards BOMs from > the beginning of streams when using the UTF-16 or UTF-32 encodings, but > *not* for UTF-16LE, UTF-16GE, UTF-32LE, UTF-32GE or any other encoding. > It uses the BOM to determine the endianness of the stream, but other > than that does *not* use it to guess the encoding, so there's no > guesswork involved. (Side note: iconv also inserts a BOM automatically > when writing a stream using UTF-16 or UTF-32). Are you talking about GNU iconv or iconv as specified by POSIX? I can’t see any occurrence of “BOM” at <http://pubs.opengroup.org/onlinepubs/9699919799/functions/iconv.html>. > So thanks to iconv, we get UTF-{16,32} BOM removal for free. > Unfortunately we have a nasty bug in 'get_iconv_codepoint' that leads to > a buffer overrun and assertion failure when 'iconv' discards a BOM. Good catch! > The first patch below fixes this problem. I ended up almost completely > rewriting that function, partly because it was largely structured around > a mistaken assumption that iconv will never consume input without > producing output, and partly because it was quite inefficient (several > unnecessary conditional branches in the loop) and IMO was rather > difficult to read. Great. (I think ‘iconv’ semantics lead to tricky code, no matter what.) > get_iconv_codepoint (SCM port, scm_t_wchar *codepoint, > char buf[SCM_MBCHAR_BUF_SIZE], size_t *len) > { [...] > - for (output_size = 0, output = (char *) utf8_buf, > - bytes_consumed = 0, err = 0; > - err == 0 && output_size == 0 > - && (bytes_consumed == 0 || byte_read != EOF); > - bytes_consumed++) > + for (;;) Clarity is in the eye of the beholder, but to me this is a step backwards. [...] > + /* NOTE: The following test assumes that the only special values > + (other than SCM_ICONV_UNINITIALIZED) are for UTF-8. */ > + if (SCM_ICONV_SPECIAL_P (pt->input_cd)) Probably an indication that a more descriptive name is needed, as Andy noted. > @@ -2247,16 +2279,15 @@ scm_i_set_port_encoding_x (SCM port, const char *encoding) > new_output_cd = iconv_open (encoding, "UTF-8"); > if (new_output_cd == (iconv_t) -1) Should be SCM_ICONV_UNINITIALIZED? Thanks again for the research and fixes! Ludo’. ^ permalink raw reply [flat|nested] 22+ messages in thread
* Re: byte-order marks 2013-01-28 21:42 byte-order marks Andy Wingo 2013-01-28 22:20 ` Mike Gran 2013-01-29 8:22 ` Mark H Weaver @ 2013-01-29 19:22 ` Neil Jerram 2013-01-29 21:09 ` Andy Wingo 2 siblings, 1 reply; 22+ messages in thread From: Neil Jerram @ 2013-01-29 19:22 UTC (permalink / raw) To: Andy Wingo; +Cc: guile-devel Andy Wingo <wingo@pobox.com> writes: > What do people think about this attached patch? > > Andy > > >>From 831c3418941f2d643f91e3076ef9458f700a2c59 Mon Sep 17 00:00:00 2001 > From: Andy Wingo <wingo@pobox.com> > Date: Mon, 28 Jan 2013 22:41:34 +0100 > Subject: [PATCH] detect and consume byte-order marks for textual ports In case an example is of any help for this discussion, here's some code that I wrote to consume a possible BOM on contacts data downloaded from Google: (define (read-csv file-name) (let ((s (utf16->string (get-bytevector-all (open-input-file file-name)) 'little))) ;; Discard possible byte order mark. (if (and (>= (string-length s) 1) (char=? (string-ref s 0) #\xfeff)) (set! s (substring s 1))) ...)) I wonder if I chose to use utf16->string because there wasn't - at that time - a way of handling the BOM without slurping into a string first? However it was a long time ago now so I really can't be sure what the context was. Regards, Neil ^ permalink raw reply [flat|nested] 22+ messages in thread
* Re: byte-order marks 2013-01-29 19:22 ` byte-order marks Neil Jerram @ 2013-01-29 21:09 ` Andy Wingo 2013-01-29 21:12 ` Neil Jerram 0 siblings, 1 reply; 22+ messages in thread From: Andy Wingo @ 2013-01-29 21:09 UTC (permalink / raw) To: Neil Jerram; +Cc: guile-devel On Tue 29 Jan 2013 20:22, Neil Jerram <neil@ossau.homelinux.net> writes: > (define (read-csv file-name) > (let ((s (utf16->string (get-bytevector-all (open-input-file file-name)) > 'little))) > > ;; Discard possible byte order mark. > (if (and (>= (string-length s) 1) > (char=? (string-ref s 0) #\xfeff)) > (set! s (substring s 1))) > > ...)) FWIW the procedure I had was: (define (consume-byte-order-mark port) (let ((enc (or (port-encoding port) "ISO-8859-1"))) (set-port-encoding! port "ISO-8859-1") (case (peek-char port) ((#\xEF) (read-char port) (case (peek-char port) ((#\xBB) (read-char port) (case (peek-char port) ((#\xBF) (read-char port) (set-port-encoding! port "UTF-8")) (else (unread-char #\xBB port) (unread-char #\xEF port) (set-port-encoding! port enc)))) (else (unread-char #\xEF port) (set-port-encoding! port enc)))) ((#\xFE) (read-char port) (case (peek-char port) ((#\xFF) (read-char port) (set-port-encoding! port "UTF-16BE")) (else (unread-char #\xFE port) (set-port-encoding! port enc)))) ((#\xFF) (read-char port) (case (peek-char port) ((#\xFE) (read-char port) (set-port-encoding! port "UTF-16LE")) (else (unread-char #\xFF port) (set-port-encoding! port enc)))) (else (set-port-encoding! port enc))))) The encoding dance is because there is no unread-u8 from Scheme, only unread-char. Andy -- http://wingolog.org/ ^ permalink raw reply [flat|nested] 22+ messages in thread
* Re: byte-order marks 2013-01-29 21:09 ` Andy Wingo @ 2013-01-29 21:12 ` Neil Jerram 0 siblings, 0 replies; 22+ messages in thread From: Neil Jerram @ 2013-01-29 21:12 UTC (permalink / raw) To: Andy Wingo; +Cc: guile-devel Andy Wingo <wingo@pobox.com> writes: > On Tue 29 Jan 2013 20:22, Neil Jerram <neil@ossau.homelinux.net> writes: > >> (define (read-csv file-name) >> (let ((s (utf16->string (get-bytevector-all (open-input-file file-name)) >> 'little))) >> >> ;; Discard possible byte order mark. >> (if (and (>= (string-length s) 1) >> (char=? (string-ref s 0) #\xfeff)) >> (set! s (substring s 1))) >> >> ...)) > > FWIW the procedure I had was: > > (define (consume-byte-order-mark port) > (let ((enc (or (port-encoding port) "ISO-8859-1"))) > (set-port-encoding! port "ISO-8859-1") > (case (peek-char port) > ((#\xEF) > (read-char port) > (case (peek-char port) > ((#\xBB) > (read-char port) > (case (peek-char port) > ((#\xBF) > (read-char port) > (set-port-encoding! port "UTF-8")) > (else > (unread-char #\xBB port) > (unread-char #\xEF port) > (set-port-encoding! port enc)))) > (else > (unread-char #\xEF port) > (set-port-encoding! port enc)))) > ((#\xFE) > (read-char port) > (case (peek-char port) > ((#\xFF) > (read-char port) > (set-port-encoding! port "UTF-16BE")) > (else > (unread-char #\xFE port) > (set-port-encoding! port enc)))) > ((#\xFF) > (read-char port) > (case (peek-char port) > ((#\xFE) > (read-char port) > (set-port-encoding! port "UTF-16LE")) > (else > (unread-char #\xFF port) > (set-port-encoding! port enc)))) > (else > (set-port-encoding! port enc))))) > > The encoding dance is because there is no unread-u8 from Scheme, only > unread-char. I can see why you'd want to do something about that! Regards, Neil ^ permalink raw reply [flat|nested] 22+ messages in thread
end of thread, other threads:[~2013-01-31 21:42 UTC | newest] Thread overview: 22+ messages (download: mbox.gz follow: Atom feed -- links below jump to the message on this page -- 2013-01-28 21:42 byte-order marks Andy Wingo 2013-01-28 22:20 ` Mike Gran 2013-01-29 9:03 ` Andy Wingo 2013-01-29 8:22 ` Mark H Weaver 2013-01-29 9:03 ` Andy Wingo 2013-01-29 13:27 ` Ludovic Courtès 2013-01-29 14:04 ` Andy Wingo 2013-01-29 17:09 ` Mark H Weaver 2013-01-29 19:09 ` Mark H Weaver 2013-01-29 20:52 ` Ludovic Courtès 2013-01-29 20:53 ` Ludovic Courtès 2013-01-30 9:20 ` Andy Wingo 2013-01-30 21:18 ` Ludovic Courtès 2013-01-31 8:52 ` Andy Wingo 2013-01-31 4:40 ` [PATCHES] Discard BOMs at stream start for UTF-{8,16,32} encodings Mark H Weaver 2013-01-31 9:39 ` Andy Wingo 2013-01-31 10:33 ` Andy Wingo 2013-01-31 18:01 ` [PATCHES] Discard BOMs at stream start for UTF-{8, 16, 32} encodings Mark H Weaver 2013-01-31 21:42 ` Ludovic Courtès 2013-01-29 19:22 ` byte-order marks Neil Jerram 2013-01-29 21:09 ` Andy Wingo 2013-01-29 21:12 ` Neil Jerram
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox; as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).