unofficial mirror of emacs-devel@gnu.org 
 help / color / mirror / code / Atom feed
From: "Mattias Engdegård" <mattiase@acm.org>
To: philippe schnoebelen <schnoebelen.ph@gmail.com>, emacs-devel@gnu.org
Cc: acm@muc.de
Subject: Re: regular expressions that match nothing
Date: Tue, 14 May 2019 12:14:52 +0200	[thread overview]
Message-ID: <deadfeca7150dfc0a4d29558aed4d1437a3e758f.camel@acm.org> (raw)
In-Reply-To: <CADqJmR29MymVD96EznUsXRNaM9J6L7+s_Gj4pzuQMsCtVs_7RA@mail.gmail.com>

[-- Attachment #1: Type: text/plain, Size: 989 bytes --]

tis 2019-05-14 klockan 09:25 +0200 skrev philippe schnoebelen:
> I was wondering why (regexp-opt nil) uses a\` and not \'a or another
> option like \=a\= so I did some profiling (see attached code).

Thank you, and sorry about my bad initial attempt. I tried a few more,
like [z-a], \c* and \sq, but these were no better. The distribution is
decidedly bimodal; there seems to be no significant difference between
the 'fast' ones, so I went with \`a\` in the attached patch.

> Of course this may be dependent on the internals of the specific
> regexp library at hand. I do not know enough to judge. In fact I
> believe that a solid regular expression library should provide a
> specific regular expression that matches nothing with special but
> easy treatment that guarantees best response time.

We could add a standard constant for it, like unmatchable-regexp, so
that at least people don't keep reinventing it.
We could also make (rx (or)) work. (It does in my complete rx rewrite.)



[-- Attachment #2: 0001-Use-faster-unmatchable-regexp.patch --]
[-- Type: text/x-patch, Size: 14699 bytes --]

From 8c296cae6e0363485b1f24b76e7fb06984f3117e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mattias=20Engdeg=C3=A5rd?= <mattiase@acm.org>
Date: Tue, 14 May 2019 11:43:49 +0200
Subject: [PATCH] Use faster unmatchable regexp

The unmatchable regexp a\` used in many places is unnecessarily slow in
searching (unanchored) matches; replace it with the faster \`a\` as
suggested by Philippe Schnoebelen.

* lisp/emacs-lisp/regexp-opt.el (regexp-opt)
* lisp/progmodes/cc-defs.el (cc-conditional-require-after-load)
(c-make-keywords-re)
* lisp/progmodes/cc-engine.el (c-beginning-of-statement-1)
(c-forward-<>-arglist-recur, c-forward-decl-or-cast-1)
(c-looking-at-decl-block)
* lisp/progmodes/cc-fonts.el (c-doc-line-join-re)
(c-doc-bright-comment-start-re)
* lisp/progmodes/cc-langs.el (c-populate-syntax-table)
(c-assignment-op-regexp)
(c-block-comment-ender-regexp, c-font-lock-comment-end-skip)
(c-block-comment-start-regexp, c-line-comment-start-regexp)
(c-doc-comment-start-regexp, c-decl-start-colon-kwd-re)
(c-type-decl-prefix-key, c-type-decl-operator-prefix-key)
(c-pre-id-bracelist-key, c-enum-clause-introduction-re)
(c-nonlabel-token-2-key)
* lisp/progmodes/cc-mode.el (c-doc-fl-decl-start, c-doc-fl-decl-end)
* lisp/progmodes/cc-vars.el (c-noise-macro-with-parens-name-re)
(c-noise-macro-name-re, c-make-noise-macro-regexps)
* lisp/progmodes/octave.el (octave-help-mode):
Use faster unmatchable regexp.
* lisp/textmodes/ispell.el (ispell-non-empty-string):
Fix broken 'unmatchable' regexp.
---
 lisp/emacs-lisp/regexp-opt.el |  4 ++--
 lisp/progmodes/cc-defs.el     |  6 +++---
 lisp/progmodes/cc-engine.el   |  8 ++++----
 lisp/progmodes/cc-fonts.el    |  4 ++--
 lisp/progmodes/cc-langs.el    | 26 +++++++++++++-------------
 lisp/progmodes/cc-mode.el     |  4 ++--
 lisp/progmodes/cc-vars.el     |  8 ++++----
 lisp/progmodes/octave.el      |  2 +-
 lisp/textmodes/ispell.el      |  2 +-
 9 files changed, 32 insertions(+), 32 deletions(-)

diff --git a/lisp/emacs-lisp/regexp-opt.el b/lisp/emacs-lisp/regexp-opt.el
index d883752d71..220a4bfa7a 100644
--- a/lisp/emacs-lisp/regexp-opt.el
+++ b/lisp/emacs-lisp/regexp-opt.el
@@ -144,9 +144,9 @@ regexp-opt
 			    (sort (copy-sequence strings) 'string-lessp)))
 	   (re
             (cond
-             ;; No strings: return a\` which cannot match anything.
+             ;; No strings: return \`a\` which cannot match anything.
              ((null strings)
-              (concat (or open "\\(?:") "a\\`\\)"))
+              (concat (or open "\\(?:") "\\`a\\`\\)"))
              ;; If we cannot reorder, give up all attempts at
              ;; optimisation.  There is room for improvement (Bug#34641).
              ((and keep-order (regexp-opt--contains-prefix sorted-strings))
diff --git a/lisp/progmodes/cc-defs.el b/lisp/progmodes/cc-defs.el
index cd4ed6b352..1ae328d4ff 100644
--- a/lisp/progmodes/cc-defs.el
+++ b/lisp/progmodes/cc-defs.el
@@ -81,7 +81,7 @@
   (progn
     (require 'font-lock)
     (let (font-lock-keywords)
-      (font-lock-compile-keywords '("a\\`")) ; doesn't match anything.
+      (font-lock-compile-keywords '("\\`a\\`")) ; doesn't match anything.
       font-lock-keywords))))
 
 \f
@@ -1890,8 +1890,8 @@ c-make-keywords-re
 
     ;; Produce a regexp that doesn't match anything.
     (if adorn
-	"\\(a\\`\\)"
-      "a\\`")))
+	"\\(\\`a\\`\\)"
+      "\\`a\\`")))
 
 (put 'c-make-keywords-re 'lisp-indent-function 1)
 
diff --git a/lisp/progmodes/cc-engine.el b/lisp/progmodes/cc-engine.el
index a25d059553..648e7197f3 100644
--- a/lisp/progmodes/cc-engine.el
+++ b/lisp/progmodes/cc-engine.el
@@ -907,7 +907,7 @@ c-beginning-of-statement-1
 	stack
 	;; Regexp which matches "for", "if", etc.
 	(cond-key (or c-opt-block-stmt-key
-		      "a\\`"))	; Doesn't match anything.
+		      "\\`a\\`"))	; Doesn't match anything.
 	;; Return value.
 	(ret 'same)
 	;; Positions of the last three sexps or bounds we've stopped at.
@@ -7638,7 +7638,7 @@ c-forward-<>-arglist-recur
 		(progn
 		  (c-forward-syntactic-ws)
 		  (when (or (and c-record-type-identifiers all-types)
-			    (not (equal c-inside-<>-type-key "\\(a\\`\\)")))
+			    (not (equal c-inside-<>-type-key "\\(\\`a\\`\\)")))
 		    (c-forward-syntactic-ws)
 		    (cond
 		     ((eq (char-after) ??)
@@ -9245,7 +9245,7 @@ c-forward-decl-or-cast-1
       ;; Skip over type decl prefix operators.  (Note similar code in
       ;; `c-forward-declarator'.)
       (if (and c-recognize-typeless-decls
-	       (equal c-type-decl-prefix-key "a\\`")) ; Regexp which doesn't match
+	       (equal c-type-decl-prefix-key "\\`a\\`")) ; Regexp which doesn't match
 	  (when (eq (char-after) ?\()
 	    (progn
 	      (setq paren-depth (1+ paren-depth))
@@ -10878,7 +10878,7 @@ c-looking-at-decl-block
 	      ;; legal because it's part of a "compound keyword" like
 	      ;; "enum class".	Of course, if c-after-brace-list-key
 	      ;; is nil, we can skip the test.
-	      (or (equal c-after-brace-list-key "a\\`") ; Regexp which doesn't match
+	      (or (equal c-after-brace-list-key "\\`a\\`") ; Regexp which doesn't match
 		  (save-match-data
 		    (save-excursion
 		      (not
diff --git a/lisp/progmodes/cc-fonts.el b/lisp/progmodes/cc-fonts.el
index 5f09be60a6..bed9259e42 100644
--- a/lisp/progmodes/cc-fonts.el
+++ b/lisp/progmodes/cc-fonts.el
@@ -2580,14 +2580,14 @@ pike-font-lock-keywords
 \f
 ;;; Doc comments.
 
-(defvar c-doc-line-join-re "a\\`")
+(defvar c-doc-line-join-re "\\`a\\`")
 ;; Matches a join of two lines in a doc comment.
 ;; This should not be changed directly, but instead set by
 ;; `c-setup-doc-comment-style'.  This variable is used in `c-find-decl-spots'
 ;; in (e.g.) autodoc style comments to bridge the gap between a "@\n" at an
 ;; EOL and the token following "//!" on the next line.
 
-(defvar c-doc-bright-comment-start-re "a\\`")
+(defvar c-doc-bright-comment-start-re "\\`a\\`")
 ;; Matches the start of a "bright" comment, one whose contents may be
 ;; fontified by, e.g., `c-font-lock-declarations'.
 
diff --git a/lisp/progmodes/cc-langs.el b/lisp/progmodes/cc-langs.el
index 8b7e4ef7c0..6fdcd6cfc7 100644
--- a/lisp/progmodes/cc-langs.el
+++ b/lisp/progmodes/cc-langs.el
@@ -945,7 +945,7 @@ c-populate-syntax-table
 	 (c-make-keywords-re 'appendable
 	   (c-lang-const c-cpp-include-directives))
 	 "[ \t]*")
-      "a\\`"))				; Doesn't match anything
+      "\\`a\\`"))			; Doesn't match anything
 (c-lang-defvar c-cpp-include-key (c-lang-const c-cpp-include-key))
 
 (c-lang-defconst c-opt-cpp-macro-define
@@ -1331,7 +1331,7 @@ 'c-opt-op-identitier-prefix
 	   (c--set-difference (c-lang-const c-assignment-operators)
 			      '("=")
 			      :test 'string-equal)))
-      "a\\`"))				; Doesn't match anything.
+      "\\`a\\`"))			; Doesn't match anything.
 (c-lang-defvar c-assignment-op-regexp
   (c-lang-const c-assignment-op-regexp))
 
@@ -1554,7 +1554,7 @@ 'c-opt-op-identitier-prefix
   ;; language)
   t (if (c-lang-const c-block-comment-ender)
 	(regexp-quote (c-lang-const c-block-comment-ender))
-      "a\\`"))				; Doesn't match anything.
+      "\\`a\\`"))			; Doesn't match anything.
 (c-lang-defvar c-block-comment-ender-regexp
 	       (c-lang-const c-block-comment-ender-regexp))
 
@@ -1565,7 +1565,7 @@ 'c-opt-op-identitier-prefix
   ;; `font-lock-comment-delimiter-face'.
   t (if (c-lang-const c-block-comment-ender)
 	(concat "[ \t]*" (c-lang-const c-block-comment-ender-regexp))
-      "a\\`"))				; Doesn't match anything.
+      "\\`a\\`"))			; Doesn't match anything.
 (c-lang-setvar font-lock-comment-end-skip
 	       (c-lang-const c-font-lock-comment-end-skip))
 
@@ -1584,7 +1584,7 @@ 'c-opt-op-identitier-prefix
   ;; language)
   t (if (c-lang-const c-block-comment-starter)
 	(regexp-quote (c-lang-const c-block-comment-starter))
-      "a\\`"))				; Doesn't match anything.
+      "\\`a\\`"))			; Doesn't match anything.
 (c-lang-defvar c-block-comment-start-regexp
   (c-lang-const c-block-comment-start-regexp))
 
@@ -1593,7 +1593,7 @@ 'c-opt-op-identitier-prefix
   ;; language; it does in all 7 CC Mode languages).
   t (if (c-lang-const c-line-comment-starter)
 	(regexp-quote (c-lang-const c-line-comment-starter))
-      "a\\`"))				; Doesn't match anything.
+      "\\`a\\`"))			; Doesn't match anything.
 (c-lang-defvar c-line-comment-start-regexp
 	       (c-lang-const c-line-comment-start-regexp))
 
@@ -1628,7 +1628,7 @@ 'c-opt-op-identitier-prefix
 
 (c-lang-defconst c-doc-comment-start-regexp
   "Regexp to match the start of documentation comments."
-  t    "a\\`"	; Doesn't match anything.
+  t    "\\`a\\`"	; Doesn't match anything.
   ;; From font-lock.el: `doxygen' uses /*! while others use /**.
   (c c++ objc) "/\\*[*!]"
   java "/\\*\\*"
@@ -3112,7 +3112,7 @@ 'c-opt-op-identitier-prefix
   "Regexp matching a keyword that is followed by a colon, where
   the whole construct can precede a declaration.
   E.g. \"public:\" in C++."
-  t "a\\`"				; Doesn't match anything.
+  t "\\`a\\`"				; Doesn't match anything.
   c++ (c-make-keywords-re t (c-lang-const c-protection-kwds)))
 (c-lang-defvar c-decl-start-colon-kwd-re
   (c-lang-const c-decl-start-colon-kwd-re))
@@ -3309,7 +3309,7 @@ 'c-opt-op-identitier-prefix
   t (if (c-lang-const c-type-modifier-kwds)
 	(concat (regexp-opt (c-lang-const c-type-modifier-kwds) t) "\\>")
       ;; Default to a regexp that never matches.
-      "a\\`")
+      "\\`a\\`")
   ;; Check that there's no "=" afterwards to avoid matching tokens
   ;; like "*=".
   (c objc) (concat "\\("
@@ -3347,7 +3347,7 @@ 'c-opt-op-identitier-prefix
 as the end of the operator.  Identifier syntax is in effect when
 this is matched \(see `c-identifier-syntax-table')."
   t ;; Default to a regexp that never matches.
-    "a\\`"
+    "\\`a\\`"
   ;; Check that there's no "=" afterwards to avoid matching tokens
   ;; like "*=".
   (c objc) (concat "\\(\\*\\)"
@@ -3506,7 +3506,7 @@ 'c-opt-op-identitier-prefix
 (c-lang-defconst c-pre-id-bracelist-key
   "A regexp matching tokens which, preceding an identifier, signify a bracelist.
 "
-  t "a\\`"				; Doesn't match anything.
+  t "\\`a\\`"				; Doesn't match anything.
   c++ "new\\([^[:alnum:]_$]\\|$\\)\\|&&?\\(\\S.\\|$\\)")
 (c-lang-defvar c-pre-id-bracelist-key (c-lang-const c-pre-id-bracelist-key))
 
@@ -3562,7 +3562,7 @@ 'c-opt-op-identitier-prefix
 	 ;; before the '{' of the enum list, to avoid searching too far.
 	 "[^][{};/#=]*"
 	 "{")
-      "a\\`"))				; Doesn't match anything.
+      "\\`a\\`"))			; Doesn't match anything.
 (c-lang-defvar c-enum-clause-introduction-re
 	       (c-lang-const c-enum-clause-introduction-re))
 
@@ -3678,7 +3678,7 @@ 'c-opt-op-identitier-prefix
   "Regexp matching things that can't occur two symbols before a colon in
 a label construct.  This catches C++'s inheritance construct \"class foo
 : bar\".  Only used if `c-recognize-colon-labels' is set."
-  t "a\\`"				; Doesn't match anything.
+  t "\\`a\\`"				; Doesn't match anything.
   c++ (c-make-keywords-re t '("class")))
 (c-lang-defvar c-nonlabel-token-2-key (c-lang-const c-nonlabel-token-2-key))
 
diff --git a/lisp/progmodes/cc-mode.el b/lisp/progmodes/cc-mode.el
index bd62fc754a..57cfdb96b3 100644
--- a/lisp/progmodes/cc-mode.el
+++ b/lisp/progmodes/cc-mode.el
@@ -1825,7 +1825,7 @@ c-doc-fl-decl-start
   ;; by `c-doc-line-join-re'), return the position of the first line of the
   ;; sequence.  Otherwise, return nil.  Point has no significance at entry to
   ;; and exit from this function.
-  (when (not (equal c-doc-line-join-re "a\\`"))
+  (when (not (equal c-doc-line-join-re "\\`a\\`"))
     (goto-char pos)
     (back-to-indentation)
     (and (or (looking-at c-comment-start-regexp)
@@ -1842,7 +1842,7 @@ c-doc-fl-decl-end
   ;; marker (as defined by `c-doc-line-join-re), return the position of
   ;; the BOL at the end of the sequence.  Otherwise, return nil.  Point has no
   ;; significance at entry to and exit from this function.
-  (when (not (equal c-doc-line-join-re "a\\`"))
+  (when (not (equal c-doc-line-join-re "\\`a\\`"))
     (goto-char pos)
     (back-to-indentation)
     (let ((here (point)))
diff --git a/lisp/progmodes/cc-vars.el b/lisp/progmodes/cc-vars.el
index 6e8acd4c0d..3b30d18872 100644
--- a/lisp/progmodes/cc-vars.el
+++ b/lisp/progmodes/cc-vars.el
@@ -1648,9 +1648,9 @@ c-asymmetry-fontification-flag
   :group 'c)
 
 ;; Initialize the next two to a regexp which never matches.
-(defvar c-noise-macro-with-parens-name-re "a\\`")
+(defvar c-noise-macro-with-parens-name-re "\\`a\\`")
 (make-variable-buffer-local 'c-noise-macro-with-parens-name-re)
-(defvar c-noise-macro-name-re "a\\`")
+(defvar c-noise-macro-name-re "\\`a\\`")
 (make-variable-buffer-local 'c-noise-macro-name-re)
 
 (defcustom c-noise-macro-names nil
@@ -1682,7 +1682,7 @@ c-make-noise-macro-regexps
   ;; Convert `c-noise-macro-names' and `c-noise-macro-with-parens-names' into
   ;; `c-noise-macro-name-re' and `c-noise-macro-with-parens-name-re'.
   (setq c-noise-macro-with-parens-name-re
-	(cond ((null c-noise-macro-with-parens-names) "a\\`") ; Never matches.
+	(cond ((null c-noise-macro-with-parens-names) "\\`a\\`") ;Never matches.
 	      ((consp c-noise-macro-with-parens-names)
 	       (concat (regexp-opt c-noise-macro-with-parens-names t)
 		       "\\([^[:alnum:]_$]\\|$\\)"))
@@ -1691,7 +1691,7 @@ c-make-noise-macro-regexps
 	      (t (error "c-make-noise-macro-regexps: \
 c-noise-macro-with-parens-names is invalid: %s" c-noise-macro-with-parens-names))))
   (setq c-noise-macro-name-re
-	(cond ((null c-noise-macro-names) "a\\`") ; Never matches anything.
+	(cond ((null c-noise-macro-names) "\\`a\\`") ; Never matches anything.
 	      ((consp c-noise-macro-names)
 	       (concat (regexp-opt c-noise-macro-names t)
 		       "\\([^[:alnum:]_$]\\|$\\)"))
diff --git a/lisp/progmodes/octave.el b/lisp/progmodes/octave.el
index 52e5fd477f..96058bca14 100644
--- a/lisp/progmodes/octave.el
+++ b/lisp/progmodes/octave.el
@@ -1691,7 +1691,7 @@ octave-help-mode
   (eval-and-compile (require 'help-mode))
   ;; Don't highlight `EXAMPLE' as elisp symbols by using a regexp that
   ;; can never match.
-  (setq-local help-xref-symbol-regexp "x\\`"))
+  (setq-local help-xref-symbol-regexp "\\`x\\`"))
 
 (defun octave-help (fn)
   "Display the documentation of FN."
diff --git a/lisp/textmodes/ispell.el b/lisp/textmodes/ispell.el
index 6553a2799b..49698dd120 100644
--- a/lisp/textmodes/ispell.el
+++ b/lisp/textmodes/ispell.el
@@ -4016,7 +4016,7 @@ ispell-message
 
 (defun ispell-non-empty-string (string)
   (if (or (not string) (string-equal string ""))
-      "\\'\\`" ; An unmatchable string if string is null.
+      "\\`a\\`" ; An unmatchable string if string is null.
     (regexp-quote string)))
 
 
-- 
2.20.1


  reply	other threads:[~2019-05-14 10:14 UTC|newest]

Thread overview: 37+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2019-05-14  7:25 regular expressions that match nothing philippe schnoebelen
2019-05-14 10:14 ` Mattias Engdegård [this message]
2019-05-14 19:41   ` Stefan Monnier
2019-05-15 16:21     ` Mattias Engdegård
2019-05-15 19:41       ` Alan Mackenzie
2019-05-16 10:54         ` Mattias Engdegård
2019-05-16 23:18           ` Phil Sainty
2019-05-17  9:43             ` Alan Mackenzie
2019-05-17 10:17               ` Mattias Engdegård
2019-05-17 12:53               ` Stefan Monnier
2019-05-15 20:17       ` Michael Heerdegen
2019-05-15 21:06         ` Stefan Monnier
2019-05-15 21:07         ` Mattias Engdegård
2019-05-15 21:38           ` Michael Heerdegen
2019-05-16  6:57           ` More re odditie [Was: regular expressions that match nothing] phs
2019-05-16  9:29             ` Mattias Engdegård
2019-05-16 10:59               ` phs
2019-05-16 12:31                 ` Stefan Monnier
2019-05-16 18:35             ` Michael Heerdegen
2019-05-16 20:31               ` Mattias Engdegård
2019-05-16 21:01                 ` Global and local definitions of non-functions/variable (was: More re odditie [Was: regular expressions that match nothing]) Stefan Monnier
2019-05-20 16:26           ` Bootstrap/autoload policy (was Re: regular expressions that match nothing) Mattias Engdegård
2019-05-22 14:02             ` Stefan Monnier
2019-05-22 14:07               ` Mattias Engdegård
2019-05-22 14:24                 ` Stefan Monnier
2019-05-22 15:06                   ` Mattias Engdegård
2019-05-22 15:53                     ` Stefan Monnier
2019-05-22 16:40                       ` Mattias Engdegård
2019-05-22 19:08                         ` Stefan Monnier
2019-05-26 12:05                         ` Basil L. Contovounesios
2019-05-16 18:12       ` regular expressions that match nothing Eric Abrahamsen
2019-05-19  4:30         ` 回复: " net june
2019-05-19  5:00           ` HaiJun Zhang
2019-05-19  7:32             ` Mattias Engdegård
2019-05-20  7:56               ` philippe schnoebelen
2019-05-20 23:19                 ` Richard Stallman
2019-05-19 14:12           ` 回复: " Drew Adams

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

  List information: https://www.gnu.org/software/emacs/

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=deadfeca7150dfc0a4d29558aed4d1437a3e758f.camel@acm.org \
    --to=mattiase@acm.org \
    --cc=acm@muc.de \
    --cc=emacs-devel@gnu.org \
    --cc=schnoebelen.ph@gmail.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
Code repositories for project(s) associated with this public inbox

	https://git.savannah.gnu.org/cgit/emacs.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).