Re: Character literals for Unicode (control) characters

unofficial mirror of emacs-devel@gnu.org 
 help / color / mirror / code / Atom feed

From: Philipp Stephani <p.stephani2@gmail.com>
To: Paul Eggert <eggert@cs.ucla.edu>, Eli Zaretskii <eliz@gnu.org>
Cc: larsi@gnus.org, johnw@gnu.org, emacs-devel@gnu.org
Subject: Re: Character literals for Unicode (control) characters
Date: Fri, 25 Mar 2016 17:41:45 +0000	[thread overview]
Message-ID: <CAArVCkQ26+TN9BNv3ApPBfs=vsUycBt+rmd9ospLPVLWeEDK6Q@mail.gmail.com> (raw)
In-Reply-To: <CAArVCkQp=k3-NzQq3KuBNiGhzb9Wyqn6VuCDxy-fBEeRArEoFw@mail.gmail.com>


[-- Attachment #1.1: Type: text/plain, Size: 685 bytes --]

Philipp Stephani <p.stephani2@gmail.com> schrieb am So., 20. März 2016 um
14:25 Uhr:

> Paul Eggert <eggert@cs.ucla.edu> schrieb am So., 20. März 2016 um
> 13:58 Uhr:
>
>> Thanks, one thing I didn't notice earlier:
>>
>> +  xsignal1 (Qinvalid_read_syntax,
>> +            CALLN (Fformat, build_pure_c_string ("\\N{%s}"), name));
>>
>> This can run Emacs out of pure space unnecessarily.  Use AUTO_STRING
>> instead of
>> build_pure_c_string.
>>
>> Also, I've lost track of what this patch is building on. Perhaps send all
>> the
>> patches next time....
>>
>
> Done. Attached all patches.
>


Oops, forgot to actually commit the changes. New patch attached.

[-- Attachment #1.2: Type: text/html, Size: 1309 bytes --]

[-- Attachment #2: 0001-Use-ucs-names-for-character-name-escapes.patch --]
[-- Type: application/octet-stream, Size: 9269 bytes --]

From 808f28cde583e2aa05dffff65b40c684d7895eab Mon Sep 17 00:00:00 2001
From: Philipp Stephani <phst@google.com>
Date: Sun, 13 Mar 2016 21:27:30 +0100
Subject: [PATCH] Use `ucs-names' for character name escapes

* lread.c (invalid_character_name, check_scalar_value)
(parse_code_after_prefix, character_name_to_code): New helper
functions that use `ucs-names' and parsing for CJK ideographs.
(read_escape): Use helper functions.
(syms_of_lread): New symbol `ucs-names'.
* test/src/lread-tests.el: New tests; fix a couple of bugs in
existing tests.
---
 src/lread.c             | 137 +++++++++++++++++++++++++++++++-----------------
 test/src/lread-tests.el |  11 +++-
 2 files changed, 97 insertions(+), 51 deletions(-)

diff --git a/src/lread.c b/src/lread.c
index 4000637..fd5b363 100644
--- a/src/lread.c
+++ b/src/lread.c
@@ -44,6 +44,7 @@ along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.  */
 #include "termhooks.h"
 #include "blockinput.h"
 #include <c-ctype.h>
+#include <string.h>
 
 #ifdef MSDOS
 #include "msdos.h"
@@ -2151,36 +2152,90 @@ grow_read_buffer (void)
 			 MAX_MULTIBYTE_LENGTH, -1, 1);
 }
 
-/* Hash table that maps Unicode character names to code points.  */
-static Lisp_Object character_names;
+/* Signal an invalid-read-syntax error indicating that the character
+   name in an \N{…} literal is invalid.  */
+static _Noreturn void
+invalid_character_name (Lisp_Object name)
+{
+  AUTO_STRING (format, "\\N{%s}");
+  xsignal1 (Qinvalid_read_syntax, CALLN (Fformat, format, name));
+}
 
-/* Length of the longest Unicode character name, in bytes. */
-static ptrdiff_t max_character_name_length;
+/* Check that CODE is a valid Unicode scalar value, and return its
+   value.  CODE should be parsed from the character name given by
+   NAME.  NAME is used for error messages.  */
+static int
+check_scalar_value (Lisp_Object code, Lisp_Object name)
+{
+  /* Reject anything that is not an in-range integer before XINT.  */
+  if (! RANGED_INTEGERP (0, code, MAX_UNICODE_CHAR))
+    invalid_character_name (name);
+  EMACS_INT i = XINT (code);
+  /* Don't allow surrogates.  */
+  if (0xD800 <= i && i <= 0xDFFF)
+    invalid_character_name (name);
+  return i;
+}
 
-/* Initializes `character_names' and `max_character_name_length'.
-   Called by `read_escape'.  */
-void init_character_names (void)
+/* If NAME consists of PREFIX followed by one to eight hexadecimal
+   digits, return the number’s value.  Raise invalid-read-syntax if
+   the number is not a valid scalar value.  Return −1 if NAME doesn’t
+   have that form.  */
+static int
+parse_code_after_prefix (Lisp_Object name, const char *prefix)
 {
-  character_names = CALLN (Fmake_hash_table,
-                           QCtest, Qequal,
-                           /* Currently around 100,000 Unicode
-                              characters are defined.  */
-                           QCsize, make_natnum (100000));
-  Lisp_Object get_property =
-    Fsymbol_function (intern_c_string ("get-char-code-property"));
-  ptrdiff_t length = 0;
-  for (int i = 0; i <= MAX_UNICODE_CHAR; ++i)
+  ptrdiff_t name_len = SBYTES (name);
+  ptrdiff_t prefix_len = strlen (prefix);
+  /* Allow between one and eight hexadecimal digits after the
+     prefix.  */
+  if (prefix_len < name_len && name_len <= prefix_len + 8
+      && memcmp (SDATA (name), prefix, prefix_len) == 0)
     {
-      Lisp_Object code = make_natnum (i);
-      Lisp_Object name = call2 (get_property, code, Qname);
-      if (NILP (name)) continue;
-      CHECK_STRING (name);
-      length = max (length, SBYTES (name));
-      Fputhash (name, code, character_names);
+      Lisp_Object code = string_to_number (SDATA (name) + prefix_len, 16, false);
+      if (NUMBERP (code))
+        return check_scalar_value (code, name);
+    }
+  return -1;
+}
+
+/* Returns the scalar value that has the Unicode character name NAME.
+   Raises `invalid-read-syntax' if there is no such character.  */
+static int
+character_name_to_code (Lisp_Object name)
+{
+  /* Code point as U+N, where N is between 1 and 8 hexadecimal
+     digits.  */
+  int code = parse_code_after_prefix (name, "U+");
+  if (code >= 0)
+    return code;
+
+  /* CJK ideographs are not contained in the association list returned
+     by `ucs-names'.  But they follow a predictable naming pattern: a
+     fixed prefix plus the hexadecimal codepoint value.  */
+  code = parse_code_after_prefix (name, "CJK IDEOGRAPH-");
+  if (code >= 0)
+    {
+      /* Various ranges of CJK characters; see UnicodeData.txt.  */
+      if ((0x3400 <= code && code <= 0x4DB5)
+          || (0x4E00 <= code && code <= 0x9FD5)
+          || (0x20000 <= code && code <= 0x2A6D6)
+          || (0x2A700 <= code && code <= 0x2B734)
+          || (0x2B740 <= code && code <= 0x2B81D)
+          || (0x2B820 <= code && code <= 0x2CEA1))
+        return code;
+      else
+        invalid_character_name (name);
     }
-  max_character_name_length = length;
+
+  /* Look up the name in the table returned by `ucs-names'.  */
+  Lisp_Object names = call0 (Qucs_names);
+  return check_scalar_value (CDR (Fassoc (name, names)), name);
 }
 
+/* Bound on the length of a Unicode character name.  As of
+   Unicode 9.0.0 the maximum is 83, so this should be safe. */
+enum { UNICODE_CHARACTER_NAME_LENGTH_BOUND = 200 };
+
 /* Read a \-escape sequence, assuming we already read the `\'.
    If the escape sequence forces unibyte, return eight-bit char.  */
 
@@ -2394,10 +2449,7 @@ read_escape (Lisp_Object readcharfun, bool stringp)
         c = READCHAR;
         if (c != '{')
           invalid_syntax ("Expected opening brace after \\N");
-        if (NILP (character_names))
-          init_character_names ();
-        USE_SAFE_ALLOCA;
-        char *name = SAFE_ALLOCA (max_character_name_length + 1);
+        char name[UNICODE_CHARACTER_NAME_LENGTH_BOUND + 1];
         bool whitespace = false;
         ptrdiff_t length = 0;
         while (true)
@@ -2408,11 +2460,12 @@ read_escape (Lisp_Object readcharfun, bool stringp)
             if (c == '}')
               break;
             if (! c_isascii (c))
-              xsignal1 (Qinvalid_read_syntax,
-                        CALLN (Fformat,
-                               build_pure_c_string ("Non-ASCII character U+%04X"
-                                                    " in character name"),
-                               make_natnum (c)));
+              {
+                AUTO_STRING (format,
+                             "Non-ASCII character U+%04X in character name");
+                xsignal1 (Qinvalid_read_syntax,
+                          CALLN (Fformat, format, make_natnum (c)));
+              }
             /* We treat multiple adjacent whitespace characters as a
                single space character.  This makes it easier to use
                character names in e.g. multi-line strings.  */
@@ -2426,25 +2479,12 @@ read_escape (Lisp_Object readcharfun, bool stringp)
             else
               whitespace = false;
             name[length++] = c;
-            if (length >= max_character_name_length)
+            if (length >= sizeof name)
               invalid_syntax ("Character name too long");
           }
         if (length == 0)
           invalid_syntax ("Empty character name");
-        name[length] = 0;
-        Lisp_Object lisp_name = make_unibyte_string (name, length);
-        Lisp_Object code =
-          (length >= 3 && length <= 10 && name[0] == 'U' && name[1] == '+') ?
-          /* Code point as U+N, where N is between 1 and 8 hexadecimal
-             digits.  */
-          string_to_number (name + 2, 16, false) :
-          Fgethash (lisp_name, character_names, Qnil);
-        SAFE_FREE ();
-        if (! RANGED_INTEGERP (0, code, MAX_UNICODE_CHAR))
-          xsignal1 (Qinvalid_read_syntax,
-                    CALLN (Fformat,
-                           build_pure_c_string ("\\N{%s}"), lisp_name));
-        return XINT (code);
+        return character_name_to_code (make_unibyte_string (name, length));
       }
 
     default:
@@ -4836,6 +4876,5 @@ that are loaded before your customizations are read!  */);
   DEFSYM (Qrehash_size, "rehash-size");
   DEFSYM (Qrehash_threshold, "rehash-threshold");
 
-  character_names = Qnil;
-  staticpro (&character_names);
+  DEFSYM (Qucs_names, "ucs-names");
 }
diff --git a/test/src/lread-tests.el b/test/src/lread-tests.el
index 1f87334..ff5d0f6 100644
--- a/test/src/lread-tests.el
+++ b/test/src/lread-tests.el
@@ -40,10 +40,17 @@
   (should-error (read "?\\N{DOES NOT EXIST}")) :type 'invalid-read-syntax)
 
 (ert-deftest lread-char-non-ascii-name ()
-  (should-error (read "?\\N{LATIN CAPITAL LETTER Ø}")) 'invalid-read-syntax)
+  (should-error (read "?\\N{LATIN CAPITAL LETTER Ø}")
+                :type 'invalid-read-syntax))
 
 (ert-deftest lread-char-empty-name ()
-  (should-error (read "?\\N{}")) 'invalid-read-syntax)
+  (should-error (read "?\\N{}") :type 'invalid-read-syntax))
+
+(ert-deftest lread-char-cjk-name ()
+  (should (equal ?\N{CJK IDEOGRAPH-2B734} #x2B734)))
+
+(ert-deftest lread-char-invalid-cjk-name ()
+  (should-error (read "?\\N{CJK IDEOGRAPH-2B735}") :type 'invalid-read-syntax))
 
 (ert-deftest lread-string-char-number ()
   (should (equal "a\N{U+A817}b" "a\uA817b")))
-- 
2.7.0

next prev parent reply	other threads:[~2016-03-25 17:41 UTC|newest]

Thread overview: 47+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2016-03-03  5:47 Character literals for Unicode (control) characters Lars Ingebrigtsen
2016-03-03  6:20 ` John Wiegley
2016-03-03  6:25   ` Lars Ingebrigtsen
2016-03-03  6:34 ` Drew Adams
2016-03-03 16:11 ` Paul Eggert
2016-03-03 20:48   ` Eli Zaretskii
2016-03-03 23:58     ` Paul Eggert
2016-03-05 15:28   ` Philipp Stephani
2016-03-05 15:39     ` Marcin Borkowski
2016-03-05 16:51       ` Philipp Stephani
2016-03-06  2:27     ` John Wiegley
2016-03-06 15:24       ` Philipp Stephani
2016-03-06 15:54         ` Eli Zaretskii
2016-03-06 17:35           ` Philipp Stephani
2016-03-06 18:08             ` Paul Eggert
2016-03-06 18:28               ` Philipp Stephani
2016-03-06 19:03                 ` Paul Eggert
2016-03-06 19:16                   ` Philipp Stephani
2016-03-06 20:05                     ` Eli Zaretskii
2016-03-13 20:31                       ` Philipp Stephani
2016-03-14 20:03                         ` Paul Eggert
2016-03-14 20:30                           ` Eli Zaretskii
2016-03-15 11:09                             ` Nikolai Weibull
2016-03-15 17:10                               ` Eli Zaretskii
2016-03-16  8:16                                 ` Nikolai Weibull
2016-03-14 21:27                           ` Clément Pit--Claudel
2016-03-14 21:48                             ` Paul Eggert
2016-03-19 16:27                           ` Philipp Stephani
2016-03-20 12:58                             ` Paul Eggert
2016-03-20 13:25                               ` Philipp Stephani
2016-03-25 17:41                                 ` Philipp Stephani [this message]
2016-04-22  2:39                                   ` Paul Eggert
2016-04-22  7:57                                     ` Eli Zaretskii
2016-04-22  8:01                                       ` Eli Zaretskii
2016-04-22  9:39                                         ` Elias Mårtenson
2016-04-22 10:01                                           ` Eli Zaretskii
2016-04-25 17:48                                             ` Paul Eggert
2016-03-05 16:35   ` Clément Pit--Claudel
2016-03-05 17:12     ` Paul Eggert
2016-03-05 17:53       ` Clément Pit--Claudel
2016-03-05 18:16         ` Eli Zaretskii
2016-03-05 18:34           ` Clément Pit--Claudel
2016-03-05 18:56             ` Eli Zaretskii
2016-03-05 19:08               ` Drew Adams
2016-03-05 22:52                 ` Clément Pit--Claudel
2016-03-06 15:49           ` Joost Kremers
2016-03-06 16:55             ` Drew Adams

find likely ancestor, descendant, or conflicting patches for this message:
dfblob:4000637 dfblob:1f87334 dfblob:fd5b363 dfblob:ff5d0f6
	(help)

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

  List information: https://www.gnu.org/software/emacs/

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to='CAArVCkQ26+TN9BNv3ApPBfs=vsUycBt+rmd9ospLPVLWeEDK6Q@mail.gmail.com' \
    --to=p.stephani2@gmail.com \
    --cc=eggert@cs.ucla.edu \
    --cc=eliz@gnu.org \
    --cc=emacs-devel@gnu.org \
    --cc=johnw@gnu.org \
    --cc=larsi@gnus.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

Code repositories for project(s) associated with this public inbox

	https://git.savannah.gnu.org/cgit/emacs.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).