unofficial mirror of bug-gnu-emacs@gnu.org 
 help / color / mirror / code / Atom feed
From: Michal Nazarewicz <mina86@mina86.com>
To: 24603@debbugs.gnu.org
Subject: bug#24603: [RFC 17/18] Optimise character class matching in regexes
Date: Tue,  4 Oct 2016 03:10:40 +0200	[thread overview]
Message-ID: <1475543441-10493-17-git-send-email-mina86@mina86.com> (raw)
In-Reply-To: <1475543441-10493-1-git-send-email-mina86@mina86.com>

Use lookup tables defined in src/character.h to bundle checks together
if possible.  For example, ‘[[:lower:][:digit:]]’ would perform an
equivalence of ‘lowercasep(ch) || numericp(ch)’ check.  Now, such checks
are done all at once with at most one Unicode general category lookup.

Similarly, do at most one syntax table lookup by unrolling macros
testing character properties.

* src/regex.c (execute_charset): Use category_char_bits and call SYNTAX
at most once.

* test/src/regex-tests.el (regex-tests--letter-character-classes): New
test case for various character classes relating to letters etc.
---
 src/regex.c             | 86 +++++++++++++++++++++++++++++++------------------
 test/src/regex-tests.el | 45 ++++++++++++++++++++++++++
 2 files changed, 100 insertions(+), 31 deletions(-)

diff --git a/src/regex.c b/src/regex.c
index 02da1fb..bfd04a1 100644
--- a/src/regex.c
+++ b/src/regex.c
@@ -1789,16 +1789,26 @@ struct range_table_work_area
 /* Bits used to implement the multibyte-part of the various character classes
    such as [:alnum:] in a charset's range table.  The code currently assumes
    that only the low 16 bits are used.  */
-#define BIT_WORD	0x1
-#define BIT_LOWER	0x2
-#define BIT_PUNCT	0x4
-#define BIT_SPACE	0x8
-#define BIT_UPPER	0x10
-#define BIT_MULTIBYTE	0x20
-#define BIT_ALPHA	0x40
-#define BIT_ALNUM	0x80
-#define BIT_GRAPH	0x100
-#define BIT_PRINT	0x200
+#ifdef emacs
+#  define BIT_ALNUM	CHAR_BIT_ALNUM
+#  define BIT_ALPHA	CHAR_BIT_ALPHA
+#  define BIT_UPPER	CHAR_BIT_UPPER
+#  define BIT_LOWER	CHAR_BIT_LOWER
+#  define BIT_GRAPH	CHAR_BIT_GRAPH
+#  define BIT_PRINT	CHAR_BIT_PRINT
+#else
+#  define BIT_ALNUM	(1 << 0)
+#  define BIT_ALPHA	(1 << 1)
+#  define BIT_UPPER	(1 << 2)
+#  define BIT_LOWER	(1 << 3)
+#  define BIT_GRAPH	(1 << 4)
+#  define BIT_PRINT	(1 << 5)
+#endif
+#define BIT_WORD	(BIT_PRINT << 1)
+#define BIT_PUNCT	(BIT_PRINT << 2)
+#define BIT_SPACE	(BIT_PRINT << 3)
+#define BIT_MULTIBYTE	(BIT_PRINT << 4)
+
 \f
 
 /* Set the bit for character C in a list.  */
@@ -1988,9 +1998,6 @@ re_wctype_parse (const unsigned char **strp, unsigned limit)
            2 [:print:]
            2 [:cntrl:]
            1 [:ff:]
-
-     If you update this list, consider also updating chain of or’ed conditions
-     in execute_charset function.
    */
 
   switch (it - beg) {
@@ -4657,28 +4664,45 @@ execute_charset (const_re_char **pp, unsigned c, unsigned corig, bool unibyte)
   else if (rtp)
     {
       int class_bits = CHARSET_RANGE_TABLE_BITS (p);
+      int bits;
       re_wchar_t range_start, range_end;
 
-  /* Sort tests by the most commonly used classes with some adjustment to which
-     tests are easiest to perform.  Take a look at comment in re_wctype_parse
-     for table with frequencies of character class names. */
-
-      if ((class_bits & BIT_MULTIBYTE) ||
-	  (class_bits & BIT_ALNUM && ISALNUM (c)) ||
-	  (class_bits & BIT_ALPHA && ISALPHA (c)) ||
-	  (class_bits & BIT_SPACE && ISSPACE (c)) ||
-	  (class_bits & BIT_WORD  && ISWORD  (c)) ||
-	  ((class_bits & BIT_UPPER) &&
-	   (ISUPPER (c) || (corig != c &&
-			    c == downcase (corig) && ISLOWER (c)))) ||
-	  ((class_bits & BIT_LOWER) &&
-	   (ISLOWER (c) || (corig != c &&
-			    c == upcase (corig) && ISUPPER(c)))) ||
-	  (class_bits & BIT_PUNCT && ISPUNCT (c)) ||
-	  (class_bits & BIT_GRAPH && ISGRAPH (c)) ||
-	  (class_bits & BIT_PRINT && ISPRINT (c)))
+      if (class_bits & BIT_MULTIBYTE)
 	return !not;
 
+      /* If we are at this point, the character is not an ASCII or single byte
+	 character.  This means that whenever ISFOO macros have special case for
+	 IS_REAL_ASCII (c), we can ignore that. */
+
+      bits = class_bits & (BIT_ALNUM | BIT_ALPHA | BIT_UPPER | BIT_LOWER |
+			   BIT_GRAPH | BIT_PRINT);
+      if (bits)
+	{
+	  int char_bits = category_char_bits[char_unicode_category (c)];
+	  if (bits & char_bits)
+	    return !not;
+
+	  /* Handle case folding. */
+	  if (corig != c)
+	    {
+	      if ((bits & BIT_UPPER) && (char_bits & BIT_LOWER) &&
+		  c == downcase (corig))
+		return !not;
+	      if ((bits & BIT_LOWER) && (char_bits & BIT_UPPER) &&
+		  c == upcase (corig))
+		return !not;
+	    }
+	}
+
+      if (class_bits & (BIT_SPACE | BIT_WORD | BIT_PUNCT))
+	{
+	  enum syntaxcode s = SYNTAX (c);
+	  if (((class_bits & BIT_SPACE) && s == Swhitespace) ||
+	      ((class_bits & BIT_WORD ) && s == Sword) ||
+	      ((class_bits & BIT_PUNCT) && s != Sword))
+	    return !not;
+	}
+
       for (p = *pp; rtp < p; rtp += 2 * 3)
 	{
 	  EXTRACT_CHARACTER (range_start, rtp);
diff --git a/test/src/regex-tests.el b/test/src/regex-tests.el
index fc50344..7617823 100644
--- a/test/src/regex-tests.el
+++ b/test/src/regex-tests.el
@@ -98,6 +98,51 @@ regex--test-cc
     (eval `(ert-deftest ,name () ,doc ,(cons 'regex--test-cc test)) t)))
 
 
+(ert-deftest regex-tests--letter-character-classes ()
+  "Test character classes against various letters types."
+  (should-not
+   (cl-remove-if
+    'not
+    (let ((check-ccs (lambda (ch fold)
+                       (mapconcat
+                        (lambda (str) str)
+                        (let ((case-fold-search fold))
+                          (cl-remove-if-not
+                           (lambda (cc)
+                             (string-match-p (concat "[[:" cc ":]]")
+                                             (string ch)))
+                           '("alnum" "alpha" "upper" "lower")))
+                        " "))))
+      (mapcar
+       (lambda (entry)
+         (let ((ch (car entry)) (expected (cdr entry)))
+           (setq entry
+                 (format "%s | %s | case-fold: %s"
+                         (get-char-code-property ch 'general-category)
+                         (funcall check-ccs ch nil) (funcall check-ccs ch t)))
+           (unless (string-equal expected entry)
+             (format "\n%c        expected: %s\nU+%06X  but got: %s"
+                     ch expected ch entry))))
+       '((?A . "Lu | alnum alpha upper | case-fold: alnum alpha upper lower")
+         (?ẞ . "Lu | alnum alpha upper | case-fold: alnum alpha upper lower")
+         (?DZ . "Lu | alnum alpha upper | case-fold: alnum alpha upper lower")
+         (?a . "Ll | alnum alpha lower | case-fold: alnum alpha upper lower")
+         ;; FIXME: Should match upper when case-fold case
+         ;; (?ł . "Ll | alnum alpha lower | case-fold: alnum alpha upper lower")
+         ;; (?ß . "Ll | alnum alpha lower | case-fold: alnum alpha upper lower")
+         ;; (?fi . "Ll | alnum alpha lower | case-fold: alnum alpha upper lower")
+         ;; (?ɕ . "Ll | alnum alpha lower | case-fold: alnum alpha upper lower")
+         ;; (?dz . "Ll | alnum alpha lower | case-fold: alnum alpha upper lower")
+         (?ł . "Ll | alnum alpha lower | case-fold: alnum alpha lower")
+         (?ß . "Ll | alnum alpha lower | case-fold: alnum alpha lower")
+         (?fi . "Ll | alnum alpha lower | case-fold: alnum alpha lower")
+         (?ɕ . "Ll | alnum alpha lower | case-fold: alnum alpha lower")
+         (?dz . "Ll | alnum alpha lower | case-fold: alnum alpha lower")
+         (?Dz . "Lt | alnum alpha | case-fold: alnum alpha upper lower")
+         (?ʰ . "Lm | alnum alpha | case-fold: alnum alpha")
+         (?º . "Lo | alnum alpha | case-fold: alnum alpha")))))))
+
+
 (defmacro regex-tests-generic-line (comment-char test-file whitelist &rest body)
   "Reads a line of the test file TEST-FILE, skipping
 comments (defined by COMMENT-CHAR), and evaluates the tests in
-- 
2.8.0.rc3.226.g39d4020






  parent reply	other threads:[~2016-10-04  1:10 UTC|newest]

Thread overview: 89+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2016-10-04  1:05 bug#24603: [RFC 00/18] Improvement to casing Michal Nazarewicz
2016-10-04  1:10 ` bug#24603: [RFC 01/18] Add tests for casefiddle.c Michal Nazarewicz
2016-10-04  1:10   ` bug#24603: [RFC 02/18] Generate upcase and downcase tables from Unicode data Michal Nazarewicz
2016-10-04  7:27     ` Eli Zaretskii
2016-10-04 14:54       ` Michal Nazarewicz
2016-10-04 15:06         ` Eli Zaretskii
2016-10-04 16:57           ` Michal Nazarewicz
2016-10-04 17:27             ` Eli Zaretskii
2016-10-04 17:44               ` Eli Zaretskii
2016-10-06 20:29                 ` Michal Nazarewicz
2016-10-07  6:52                   ` Eli Zaretskii
2016-10-04  1:10   ` bug#24603: [RFC 03/18] Don’t assume character can be either upper- or lower-case when casing Michal Nazarewicz
2016-10-04  1:10   ` bug#24603: [RFC 04/18] Split casify_object into multiple functions Michal Nazarewicz
2016-10-04  1:10   ` bug#24603: [RFC 05/18] Introduce case_character function Michal Nazarewicz
2016-10-04  1:10   ` bug#24603: [RFC 06/18] Add support for title-casing letters Michal Nazarewicz
2016-10-04  1:10   ` bug#24603: [RFC 07/18] Split up casify_region function Michal Nazarewicz
2016-10-04  7:17     ` Eli Zaretskii
2016-10-18  2:27       ` Michal Nazarewicz
2016-10-04  1:10   ` bug#24603: [RFC 08/18] Support casing characters which map into multiple code points Michal Nazarewicz
2016-10-04  7:38     ` Eli Zaretskii
2016-10-06 21:40       ` Michal Nazarewicz
2016-10-07  7:46         ` Eli Zaretskii
2017-01-28 23:48           ` Michal Nazarewicz
2017-02-10  9:12             ` Eli Zaretskii
2016-10-04  1:10   ` bug#24603: [RFC 09/18] Implement special sigma casing rule Michal Nazarewicz
2016-10-04  7:22     ` Eli Zaretskii
2016-10-04  1:10   ` bug#24603: [RFC 10/18] Implement Turkic dotless and dotted i handling when casing strings Michal Nazarewicz
2016-10-04  7:12     ` Eli Zaretskii
2016-10-04  1:10   ` bug#24603: [RFC 11/18] Implement casing rules for Lithuanian Michal Nazarewicz
2016-10-04  1:10   ` bug#24603: [RFC 12/18] Implement rules for title-casing Dutch ij ‘letter’ Michal Nazarewicz
2016-10-04  1:10   ` bug#24603: [RFC 13/18] Add some tricky Unicode characters to regex test Michal Nazarewicz
2016-10-04  1:10   ` bug#24603: [RFC 14/18] Factor out character category lookup to separate function Michal Nazarewicz
2016-10-04  1:10   ` bug#24603: [RFC 15/18] Base lower- and upper-case tests on Unicode properties Michal Nazarewicz
2016-10-04  6:54     ` Eli Zaretskii
2016-10-04  1:10   ` bug#24603: [RFC 16/18] Refactor character class checking; optimise ASCII case Michal Nazarewicz
2016-10-04  7:48     ` Eli Zaretskii
2016-10-17 13:22       ` Michal Nazarewicz
2016-11-06 19:26       ` Michal Nazarewicz
2016-11-06 19:44         ` Eli Zaretskii
2016-12-20 14:32           ` Michal Nazarewicz
2016-12-20 16:39             ` Eli Zaretskii
2016-12-22 14:02               ` Michal Nazarewicz
2016-10-04  1:10   ` Michal Nazarewicz [this message]
2016-10-04  1:10   ` bug#24603: [RFC 18/18] Fix case-fold-search character class matching Michal Nazarewicz
2016-10-17 22:03 ` bug#24603: [PATCH 0/3] Case table updates Michal Nazarewicz
2016-10-17 22:03   ` bug#24603: [PATCH 1/3] Add tests for casefiddle.c Michal Nazarewicz
2016-10-17 22:03   ` bug#24603: [PATCH 2/3] Generate upcase and downcase tables from Unicode data Michal Nazarewicz
2016-10-17 22:03   ` bug#24603: [PATCH 3/3] Don’t generate ‘X maps to X’ entries in case tables Michal Nazarewicz
2016-10-18  6:36   ` bug#24603: [PATCH 0/3] Case table updates Eli Zaretskii
2016-10-24 15:11     ` Michal Nazarewicz
2016-10-24 15:33       ` Eli Zaretskii
2017-03-09 21:51 ` bug#24603: [PATCHv5 00/11] Casing improvements Michal Nazarewicz
2017-03-09 21:51   ` bug#24603: [PATCHv5 01/11] Split casify_object into multiple functions Michal Nazarewicz
2017-03-10  9:00     ` Andreas Schwab
2017-03-09 21:51   ` bug#24603: [PATCHv5 02/11] Introduce case_character function Michal Nazarewicz
2017-03-09 21:51   ` bug#24603: [PATCHv5 03/11] Add support for title-casing letters (bug#24603) Michal Nazarewicz
2017-03-11  9:03     ` Eli Zaretskii
2017-03-09 21:51   ` bug#24603: [PATCHv5 04/11] Split up casify_region function (bug#24603) Michal Nazarewicz
2017-03-09 21:51   ` bug#24603: [PATCHv5 05/11] Support casing characters which map into multiple code points (bug#24603) Michal Nazarewicz
2017-03-11  9:14     ` Eli Zaretskii
2017-03-21  2:09       ` Michal Nazarewicz
2017-03-09 21:51   ` bug#24603: [PATCHv5 06/11] Implement special sigma casing rule (bug#24603) Michal Nazarewicz
2017-03-09 21:51   ` bug#24603: [PATCHv5 07/11] Introduce ‘buffer-language’ buffer-locar variable Michal Nazarewicz
2017-03-11  9:29     ` Eli Zaretskii
2017-03-09 21:51   ` bug#24603: [PATCHv5 08/11] Implement rules for title-casing Dutch ij ‘letter’ (bug#24603) Michal Nazarewicz
2017-03-11  9:40     ` Eli Zaretskii
2017-03-16 21:30       ` Michal Nazarewicz
2017-03-17 13:43         ` Eli Zaretskii
2017-03-09 21:51   ` bug#24603: [PATCHv5 09/11] Implement Turkic dotless and dotted i casing rules (bug#24603) Michal Nazarewicz
2017-03-09 21:51   ` bug#24603: [PATCHv5 10/11] Implement casing rules for Lithuanian (bug#24603) Michal Nazarewicz
2017-03-09 21:51   ` bug#24603: [PATCHv5 11/11] Implement Irish casing rules (bug#24603) Michal Nazarewicz
2017-03-11  9:44     ` Eli Zaretskii
2017-03-16 22:16       ` Michal Nazarewicz
2017-03-17  8:20         ` Eli Zaretskii
2017-03-11 10:00   ` bug#24603: [PATCHv5 00/11] Casing improvements Eli Zaretskii
2017-03-21  1:27   ` bug#24603: [PATCHv6 0/6] Casing improvements, language-independent part Michal Nazarewicz
2017-03-21  1:27     ` bug#24603: [PATCHv6 1/6] Split casify_object into multiple functions Michal Nazarewicz
2017-03-21  1:27     ` bug#24603: [PATCHv6 2/6] Introduce case_character function Michal Nazarewicz
2017-03-21  1:27     ` bug#24603: [PATCHv6 3/6] Add support for title-casing letters (bug#24603) Michal Nazarewicz
2017-03-21  1:27     ` bug#24603: [PATCHv6 4/6] Split up casify_region function (bug#24603) Michal Nazarewicz
2017-03-21  1:27     ` bug#24603: [PATCHv6 5/6] Support casing characters which map into multiple code points (bug#24603) Michal Nazarewicz
2017-03-22 16:06       ` Eli Zaretskii
2017-04-03  9:01         ` Michal Nazarewicz
2017-04-03 14:52           ` Eli Zaretskii
2019-06-25  0:09           ` Lars Ingebrigtsen
2019-06-25  0:29             ` Michał Nazarewicz
2020-08-11 13:46               ` Lars Ingebrigtsen
2021-05-10 11:51                 ` bug#24603: [RFC 00/18] Improvement to casing Lars Ingebrigtsen
2017-03-21  1:27     ` bug#24603: [PATCHv6 6/6] Implement special sigma casing rule (bug#24603) Michal Nazarewicz

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

  List information: https://www.gnu.org/software/emacs/

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1475543441-10493-17-git-send-email-mina86@mina86.com \
    --to=mina86@mina86.com \
    --cc=24603@debbugs.gnu.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
Code repositories for project(s) associated with this public inbox

	https://git.savannah.gnu.org/cgit/emacs.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).