From: Philipp Stephani <p.stephani2@gmail.com>
To: Eli Zaretskii <eliz@gnu.org>
Cc: eggert@cs.ucla.edu, larsi@gnus.org, johnw@gnu.org, emacs-devel@gnu.org
Subject: Re: Character literals for Unicode (control) characters
Date: Sun, 06 Mar 2016 17:35:18 +0000 [thread overview]
Message-ID: <CAArVCkQtqYfyDNmWQ4qQL9QOh8HHrJTSWesG+pqgGdtYwVrc4g@mail.gmail.com> (raw)
In-Reply-To: <838u1vwqj9.fsf@gnu.org>
[-- Attachment #1.1: Type: text/plain, Size: 1105 bytes --]
Eli Zaretskii <eliz@gnu.org> schrieb am So., 6. März 2016 um 16:54 Uhr:
> > From: Philipp Stephani <p.stephani2@gmail.com>
> > Date: Sun, 06 Mar 2016 15:24:47 +0000
> >
> > I've attached a patch with an initial implementation.
>
> Thanks.
>
> > +/* Hash table that maps Unicode character names to code points. */
> > +static Lisp_Object character_names;
> > +
> > +/* Length of the longest Unicode character name, in bytes. */
> > +static ptrdiff_t max_character_name_length;
> > +
> > +/* Initializes `character_names' and `max_character_name_length'.
> > + Called by `read_escape'. */
>
> I wonder if there's a better way, in particular with a smaller memory
> footprint. Doesn't map-char-table work well enough to avoid
> generating all the names up front?
>
It doesn't seem to work: for some reason, when map-char-table is called
from C, the Unicode name table appears very small (only 136 code points)
and lacks most characters.
>
> > + if (! RANGED_INTEGERP (0, code, 0x10FFFF))
>
> This should use MAX_UNICODE_CHAR.
>
>
Done, attached a new patch.
[-- Attachment #1.2: Type: text/html, Size: 1701 bytes --]
[-- Attachment #2: 0001-Implement-named-character-escapes-similar-to-Perl.patch --]
[-- Type: application/octet-stream, Size: 7130 bytes --]
From 22e299cd23a72a072461befa30a04bf557aecac8 Mon Sep 17 00:00:00 2001
From: Philipp Stephani <phst@google.com>
Date: Sun, 6 Mar 2016 16:16:29 +0100
Subject: [PATCH] Implement named character escapes, similar to Perl
* src/lread.c (init_character_names): New function.
(read_escape): Read Perl-style named character escape sequences.
(syms_of_lread): Initialize new variable `character_names'.
* test/src/lread-tests.el (lread-char-empty-name): Add test file
for src/lread.c.
---
src/lread.c | 96 +++++++++++++++++++++++++++++++++++++++++++++++++
test/src/lread-tests.el | 54 ++++++++++++++++++++++++++++
2 files changed, 150 insertions(+)
create mode 100644 test/src/lread-tests.el
diff --git a/src/lread.c b/src/lread.c
index 25e3ff0..6e84fc8 100644
--- a/src/lread.c
+++ b/src/lread.c
@@ -43,6 +43,7 @@ along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>. */
#include "systime.h"
#include "termhooks.h"
#include "blockinput.h"
+#include <c-ctype.h>
#ifdef MSDOS
#include "msdos.h"
@@ -2150,6 +2151,36 @@ grow_read_buffer (void)
MAX_MULTIBYTE_LENGTH, -1, 1);
}
+/* Hash table that maps Unicode character names to code points.
+   Qnil until `init_character_names' has filled it.  */
+static Lisp_Object character_names;
+
+/* Length of the longest Unicode character name, in bytes.  */
+static ptrdiff_t max_character_name_length;
+
+/* Initialize `character_names' and `max_character_name_length' from the
+   `name' char-code property.  Called by `read_escape'.  */
+static void init_character_names (void)
+{
+  /* Currently around 100,000 Unicode characters are defined; use that
+     as the initial size of the table.  */
+  character_names = CALLN (Fmake_hash_table, QCtest, Qequal,
+                           QCsize, make_natnum (100000));
+  const Lisp_Object get_property =
+    Fsymbol_function (intern_c_string ("get-char-code-property"));
+  ptrdiff_t length = 0;
+  for (int i = 0; i <= MAX_UNICODE_CHAR; ++i)
+    {
+      const Lisp_Object code = make_natnum (i);
+      const Lisp_Object name = call2 (get_property, code, Qname);
+      if (NILP (name)) continue;  /* Code point has no name.  */
+      CHECK_STRING (name);
+      length = max (length, SBYTES (name));
+      Fputhash (name, code, character_names);
+    }
+  max_character_name_length = length;
+}
+
/* Read a \-escape sequence, assuming we already read the `\'.
If the escape sequence forces unibyte, return eight-bit char. */
@@ -2357,6 +2388,68 @@ read_escape (Lisp_Object readcharfun, bool stringp)
return i;
}
+    case 'N':
+      /* Named character: \N{NAME} or \N{U+XXXX}.  */
+      {
+        c = READCHAR;
+        if (c != '{')
+          invalid_syntax ("Expected opening brace after \\N");
+        if (NILP (character_names))
+          init_character_names ();
+        USE_SAFE_ALLOCA;
+        char *name = SAFE_ALLOCA (max_character_name_length + 1);
+        bool whitespace = false;
+        ptrdiff_t length = 0;
+        while (true)
+          {
+            c = READCHAR;
+            if (c < 0)
+              end_of_file_error ();
+            if (c == '}')
+              break;
+            if (! c_isascii (c))
+              xsignal1 (Qinvalid_read_syntax,
+                        CALLN (Fformat,
+                               build_string ("Non-ASCII character U+%04X"
+                                             " in character name"),
+                               make_natnum (c)));
+            /* Collapse each run of whitespace into a single space so
+               that names can span lines in multi-line strings.  */
+            if (c_isspace (c))
+              {
+                if (! whitespace)
+                  {
+                    whitespace = true;
+                    name[length++] = ' ';
+                  }
+              }
+            else
+              {
+                whitespace = false;
+                name[length++] = c;
+              }
+            /* NAME has room for max_character_name_length bytes plus
+               the terminator, so reject only names longer than that.  */
+            if (length > max_character_name_length)
+              invalid_syntax ("Character name too long");
+          }
+        if (length == 0)
+          invalid_syntax ("Empty character name");
+        name[length] = 0;
+        const Lisp_Object lisp_name = make_unibyte_string (name, length);
+        const Lisp_Object code =
+          (length >= 3 && length <= 10 && name[0] == 'U' && name[1] == '+') ?
+          /* Code point as U+N, with 1 to 8 hexadecimal digits.  */
+          string_to_number (name + 2, 16, false) :
+          Fgethash (lisp_name, character_names, Qnil);
+        SAFE_FREE ();
+        /* A failed lookup leaves CODE nil, which fails this check.  */
+        if (! RANGED_INTEGERP (0, code, MAX_UNICODE_CHAR))
+          xsignal1 (Qinvalid_read_syntax,
+                    CALLN (Fformat, build_string ("\\N{%s}"), lisp_name));
+        return XINT (code);
+      }
+
default:
return c;
}
@@ -4745,4 +4838,7 @@ that are loaded before your customizations are read! */);
DEFSYM (Qweakness, "weakness");
DEFSYM (Qrehash_size, "rehash-size");
DEFSYM (Qrehash_threshold, "rehash-threshold");
+
+ character_names = Qnil;
+ staticpro (&character_names);
}
diff --git a/test/src/lread-tests.el b/test/src/lread-tests.el
new file mode 100644
index 0000000..1f87334
--- /dev/null
+++ b/test/src/lread-tests.el
@@ -0,0 +1,54 @@
+;;; lread-tests.el --- tests for lread.c -*- lexical-binding: t; -*-
+
+;; Copyright (C) 2016 Google Inc.
+
+;; Author: Philipp Stephani <phst@google.com>
+
+;; This file is part of GNU Emacs.
+
+;; This program is free software; you can redistribute it and/or modify
+;; it under the terms of the GNU General Public License as published by
+;; the Free Software Foundation, either version 3 of the License, or
+;; (at your option) any later version.
+
+;; This program is distributed in the hope that it will be useful,
+;; but WITHOUT ANY WARRANTY; without even the implied warranty of
+;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+;; GNU General Public License for more details.
+
+;; You should have received a copy of the GNU General Public License
+;; along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+;;; Commentary:
+
+;; Unit tests for code in src/lread.c.
+
+;;; Code:
+
+(ert-deftest lread-char-number ()
+  (should (equal ?\N{U+A817} #xA817)))
+
+(ert-deftest lread-char-name ()
+  (should (equal ?\N{SYLOTI NAGRI LETTER
+                 DHO}
+                 #xA817)))
+
+(ert-deftest lread-char-invalid-number ()
+  (should-error (read "?\\N{U+110000}") :type 'invalid-read-syntax))
+
+(ert-deftest lread-char-invalid-name ()
+  (should-error (read "?\\N{DOES NOT EXIST}") :type 'invalid-read-syntax))
+
+(ert-deftest lread-char-non-ascii-name ()
+  (should-error (read "?\\N{LATIN CAPITAL LETTER Ø}") :type 'invalid-read-syntax))
+
+(ert-deftest lread-char-empty-name ()
+  (should-error (read "?\\N{}") :type 'invalid-read-syntax))
+
+(ert-deftest lread-string-char-number ()
+  (should (equal "a\N{U+A817}b" "a\uA817b")))
+
+(ert-deftest lread-string-char-name ()
+  (should (equal "a\N{SYLOTI NAGRI LETTER DHO}b" "a\uA817b")))
+
+;;; lread-tests.el ends here
--
2.7.0
next prev parent reply other threads:[~2016-03-06 17:35 UTC|newest]
Thread overview: 47+ messages / expand[flat|nested] mbox.gz Atom feed top
2016-03-03 5:47 Character literals for Unicode (control) characters Lars Ingebrigtsen
2016-03-03 6:20 ` John Wiegley
2016-03-03 6:25 ` Lars Ingebrigtsen
2016-03-03 6:34 ` Drew Adams
2016-03-03 16:11 ` Paul Eggert
2016-03-03 20:48 ` Eli Zaretskii
2016-03-03 23:58 ` Paul Eggert
2016-03-05 15:28 ` Philipp Stephani
2016-03-05 15:39 ` Marcin Borkowski
2016-03-05 16:51 ` Philipp Stephani
2016-03-06 2:27 ` John Wiegley
2016-03-06 15:24 ` Philipp Stephani
2016-03-06 15:54 ` Eli Zaretskii
2016-03-06 17:35 ` Philipp Stephani [this message]
2016-03-06 18:08 ` Paul Eggert
2016-03-06 18:28 ` Philipp Stephani
2016-03-06 19:03 ` Paul Eggert
2016-03-06 19:16 ` Philipp Stephani
2016-03-06 20:05 ` Eli Zaretskii
2016-03-13 20:31 ` Philipp Stephani
2016-03-14 20:03 ` Paul Eggert
2016-03-14 20:30 ` Eli Zaretskii
2016-03-15 11:09 ` Nikolai Weibull
2016-03-15 17:10 ` Eli Zaretskii
2016-03-16 8:16 ` Nikolai Weibull
2016-03-14 21:27 ` Clément Pit--Claudel
2016-03-14 21:48 ` Paul Eggert
2016-03-19 16:27 ` Philipp Stephani
2016-03-20 12:58 ` Paul Eggert
2016-03-20 13:25 ` Philipp Stephani
2016-03-25 17:41 ` Philipp Stephani
2016-04-22 2:39 ` Paul Eggert
2016-04-22 7:57 ` Eli Zaretskii
2016-04-22 8:01 ` Eli Zaretskii
2016-04-22 9:39 ` Elias Mårtenson
2016-04-22 10:01 ` Eli Zaretskii
2016-04-25 17:48 ` Paul Eggert
2016-03-05 16:35 ` Clément Pit--Claudel
2016-03-05 17:12 ` Paul Eggert
2016-03-05 17:53 ` Clément Pit--Claudel
2016-03-05 18:16 ` Eli Zaretskii
2016-03-05 18:34 ` Clément Pit--Claudel
2016-03-05 18:56 ` Eli Zaretskii
2016-03-05 19:08 ` Drew Adams
2016-03-05 22:52 ` Clément Pit--Claudel
2016-03-06 15:49 ` Joost Kremers
2016-03-06 16:55 ` Drew Adams
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
List information: https://www.gnu.org/software/emacs/
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=CAArVCkQtqYfyDNmWQ4qQL9QOh8HHrJTSWesG+pqgGdtYwVrc4g@mail.gmail.com \
--to=p.stephani2@gmail.com \
--cc=eggert@cs.ucla.edu \
--cc=eliz@gnu.org \
--cc=emacs-devel@gnu.org \
--cc=johnw@gnu.org \
--cc=larsi@gnus.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
Code repositories for project(s) associated with this public inbox
https://git.savannah.gnu.org/cgit/emacs.git
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).