all messages for Emacs-related lists mirrored at yhetil.org
 help / color / mirror / code / Atom feed
From: "Mattias Engdegård" <mattiase@acm.org>
To: Alan Mackenzie <acm@muc.de>
Cc: Lars Ingebrigtsen <larsi@gnus.org>, 25706@debbugs.gnu.org
Subject: bug#25706: 26.0.50; Slow C file fontification
Date: Wed, 9 Dec 2020 18:00:30 +0100	[thread overview]
Message-ID: <FF2C8BEC-A227-4533-8ADC-93080A5BB5DF@acm.org> (raw)
In-Reply-To: <X8/JG7eD7SfkEimH@ACM>

[-- Attachment #1: Type: text/plain, Size: 1804 bytes --]

First, some Emacs regexp basics:

1. If A and B match single characters, then A\|B should be written [AB] whenever possible. The reason is that A\|B adds a backtrack record which uses stack space and wastes time if matching fails later on. The cost can be quite noticeable, as we have seen.

2. Syntax-class constructs are usually better written as character alternatives when possible.

The \sX construct, for some X, is typically somewhat slower to match than explicitly listing the characters to match. For example, if all you care about are space and tab, then "\\s *" should be written "[ \t]*".

3. Unicode character classes are slower to match than ASCII-only ones. For example, [[:alpha:]] is slower than [A-Za-z], assuming only those characters are of interest.

4. [^...] will match \n unless \n is included in the set. For example, "[^a]\\|$" will almost never match the $ (end-of-line) branch, because a newline will be matched by the first branch. The only exception is at the very end of the buffer if it is not newline-terminated, but that is rarely worth considering for source code.

5. \r (carriage return) normally doesn't appear in buffers even if the file uses DOS line endings. Line endings are converted into a single \n (newline) when the buffer is read. In particular, $ does NOT match at \r, only before \n.

When \r appears it is usually because the file contains a mixture of line-ending styles, typically from being edited using broken tools. Whether you want to take such files into account is a matter of judgement; most modes don't bother.

6. Capturing groups cost more than non-capturing groups, but you already know that.

On to specifics: here are annotations for possible improvements in cc-langs.el. (I didn't bother about capturing groups here.)


[-- Attachment #2: cc-regexp-annot.diff --]
[-- Type: application/octet-stream, Size: 7058 bytes --]

diff --git a/lisp/progmodes/cc-langs.el b/lisp/progmodes/cc-langs.el
index d6089ea295..695c41fce6 100644
--- a/lisp/progmodes/cc-langs.el
+++ b/lisp/progmodes/cc-langs.el
@@ -903,6 +903,7 @@ c-opt-cpp-prefix
   ;; TODO (ACM, 2005-04-01).  Amend the following to recognize escaped NLs;
   ;; amend all uses of c-opt-cpp-prefix which count regexp-depth.
   t "\\s *#\\s *"
+;;; XXX replace "\\s " with char alt, presumably [ \t] (2x)
   (java awk) nil)
 (c-lang-defvar c-opt-cpp-prefix (c-lang-const c-opt-cpp-prefix))
 
@@ -910,6 +911,7 @@ c-anchored-cpp-prefix
   "Regexp matching the prefix of a cpp directive anchored to BOL,
 in the languages that have a macro preprocessor."
   t "^\\s *\\(#\\)\\s *"
+;;; XXX replace "\\s " with char alt, presumably [ \t] (2x)
   (java awk) nil)
 (c-lang-defvar c-anchored-cpp-prefix (c-lang-const c-anchored-cpp-prefix))
 
@@ -920,6 +922,7 @@ c-opt-cpp-start
   t    (if (c-lang-const c-opt-cpp-prefix)
 	   (concat (c-lang-const c-opt-cpp-prefix)
 		   "\\([" c-alnum "]+\\)"))
+;;; XXX all cpp directives are lower-case ASCII letters; should be [a-z]+
   ;; Pike, being a scripting language, recognizes hash-bangs too.
   pike (concat (c-lang-const c-opt-cpp-prefix)
 	       "\\([" c-alnum "]+\\|!\\)"))
@@ -968,6 +971,8 @@ c-opt-cpp-macro-define-start
 	(concat (c-lang-const c-opt-cpp-prefix)
 		(c-lang-const c-opt-cpp-macro-define)
 		"[ \t]+\\(\\(\\sw\\|_\\)+\\)\\(([^)]*)\\)?"
+;;; XXX \\(\\sw\\|_\\)+ should be [[:word:]_]+,
+;;; XXX or more likely [[:alpha:]_][[:alnum:]_]*
 		;;       ^                 ^ #defined name
 		"\\([ \t]\\|\\\\\n\\)*")))
 (c-lang-defvar c-opt-cpp-macro-define-start
@@ -980,6 +985,8 @@ c-opt-cpp-macro-define-id
 	(concat (c-lang-const c-opt-cpp-prefix)	; #
 		(c-lang-const c-opt-cpp-macro-define) ; define
 		"[ \t]+\\(\\sw\\|_\\)+")))
+;;; XXX \\(\\sw\\|_\\)+ should be [[:word:]_]+,
+;;; XXX or more likely [[:alpha:]_][[:alnum:]_]*
 (c-lang-defvar c-opt-cpp-macro-define-id
   (c-lang-const c-opt-cpp-macro-define-id))
 
@@ -990,6 +997,10 @@ c-anchored-hash-define-no-parens
 	(concat (c-lang-const c-anchored-cpp-prefix)
 		(c-lang-const c-opt-cpp-macro-define)
 		"[ \t]+\\(\\sw\\|_\\)+\\([^(a-zA-Z0-9_]\\|$\\)")))
+;;; XXX \\(\\sw\\|_\\)+ should be [[:word:]_]+,
+;;; XXX or more likely [[:alpha:]_][[:alnum:]_]*
+;;; XXX but what about the ASCII-only tail? Besides, [^(a-zA-Z0-9_] will
+;;; XXX always match \n so the $ is almost never useful!
 
 (c-lang-defconst c-cpp-expr-directives
   "List of cpp directives (without the prefix) that are followed by an
@@ -1353,6 +1364,7 @@ c-assignment-op-regexp
 	(concat
 	 ;; Need special case for "=" since it's a prefix of "==".
 	 "=\\([^=]\\|$\\)"
+;;; XXX [^=] matches \n so the $ is almost never useful
 	 "\\|"
 	 (c-make-keywords-re nil
 	   (c--set-difference (c-lang-const c-assignment-operators)
@@ -1412,6 +1424,7 @@ c-<-pseudo-digraph-cont-regexp
 template opener followed by the \"::\" operator - usually."
   t regexp-unmatchable
   c++ "::\\([^:>]\\|$\\)")
+;;; XXX [^:>] matches \n so the $ is almost never useful
 (c-lang-defvar c-<-pseudo-digraph-cont-regexp
 	       (c-lang-const c-<-pseudo-digraph-cont-regexp))
 
@@ -1599,6 +1612,7 @@ c-simple-ws
 Does not contain a \\| operator at the top level."
   ;; "\\s " is not enough since it doesn't match line breaks.
   t "\\(\\s \\|[\n\r]\\)")
+;;; XXX replace with single char alt: [ \t\n\r\f]
 
 (c-lang-defconst c-simple-ws-depth
   ;; Number of regexp grouping parens in `c-simple-ws'.
@@ -1702,6 +1716,7 @@ c-last-c-comment-end-on-line-re
 comments.  When a match is found, submatch 1 contains the comment
 ender."
   t "\\(\\*/\\)\\([^*]\\|\\*+\\([^*/]\\|$\\)\\)*$"
+;;; XXX [^*/] matches \n so the $ is almost never useful
   awk nil)
 (c-lang-defvar c-last-c-comment-end-on-line-re
 	       (c-lang-const c-last-c-comment-end-on-line-re))
@@ -1778,6 +1793,7 @@ comment-start-skip
 			   (c-lang-const c-block-comment-starter)))
 	     "\\|")
 	    "\\)\\s *"))
+;;; XXX replace "\\s " with char alt, presumably [ \t]
 (c-lang-setvar comment-start-skip (c-lang-const comment-start-skip))
 
 (c-lang-defconst comment-end-can-be-escaped
@@ -1792,6 +1808,7 @@ c-syntactic-ws-start
   ;; Regexp matching any sequence that can start syntactic whitespace.
   ;; The only uncertain case is '#' when there are cpp directives.
   t (concat "\\s \\|"
+;;; XXX replace "\\s " with char alt, presumably [ \t]
 	    (c-make-keywords-re nil
 	      (append (list (c-lang-const c-line-comment-starter)
 			    (c-lang-const c-block-comment-starter)
@@ -1799,6 +1816,7 @@ c-syntactic-ws-start
 			      "#"))
 		      '("\n" "\r")))
 	    "\\|\\\\[\n\r]"
+;;; XXX unclear if \r is ever relevant here (2x)
 	    (when (memq 'gen-comment-delim c-emacs-features)
 	      "\\|\\s!")))
 (c-lang-defvar c-syntactic-ws-start (c-lang-const c-syntactic-ws-start))
@@ -1847,6 +1865,8 @@ c-unterminated-block-comment-regexp
 			"]"
 			"[^" (substring end 0 1) "\n\r]*"
 			"\\)*"))
+;;; XXX this is baroque, since c-block-comment-ender is either nil or "*/",
+;;; XXX so why not special case those and be done with it?
 	       (t
 		(error "Can't handle a block comment ender of length %s"
 		       (length end))))))))
@@ -1868,6 +1888,7 @@ c-block-comment-regexp
 	       ((= (length end) 2)
 		(concat (regexp-quote (substring end 0 1)) "+"
 			(regexp-quote (substring end 1 2))))
+;;; XXX see above; c-block-comment-ender is nil or "*/"
 	       (t
 		(error "Can't handle a block comment ender of length %s"
 		       (length end))))))))
@@ -1883,6 +1904,7 @@ c-nonwhite-syntactic-ws
 		     "[^\n\r]*[\n\r]"))
 	   (c-lang-const c-block-comment-regexp)
 	   "\\\\[\n\r]"
+;;; XXX \r here is probably unnecessary (3x)
 	   (when (memq 'gen-comment-delim c-emacs-features)
 	     "\\s!\\S!*\\s!"))
      "\\|"))
@@ -1927,6 +1949,7 @@ c-single-line-syntactic-ws
 		(c-lang-const c-block-comment-regexp)
 		"\\s *\\)*")
       "\\s *"))
+;;; XXX replace "\\s " with char alt, presumably [ \t] (3x)
 
 (c-lang-defconst c-single-line-syntactic-ws-depth
   ;; Number of regexp grouping parens in `c-single-line-syntactic-ws'.
@@ -3476,6 +3499,7 @@ c-type-decl-prefix-key
 	       "\\)"
 	       "\\([^=]\\|$\\)")
   pike "\\(\\*\\)\\([^=]\\|$\\)")
+;;; XXX [^=] matches \n so the $ is almost never useful (3x)
 (c-lang-defvar c-type-decl-prefix-key (c-lang-const c-type-decl-prefix-key)
   'dont-doc)
 
@@ -3498,6 +3522,7 @@ c-type-decl-operator-prefix-key
 	       "\\)"
 	       "\\([^=]\\|$\\)")
   pike "\\(\\*\\)\\([^=]\\|$\\)")
+;;; XXX [^=] matches \n so the $ is almost never useful (3x)
 (c-lang-defvar c-type-decl-operator-prefix-key
   (c-lang-const c-type-decl-operator-prefix-key))
 
@@ -3647,6 +3672,8 @@ c-pre-id-bracelist-key
 "
   t regexp-unmatchable
   c++ "new\\([^[:alnum:]_$]\\|$\\)\\|&&?\\(\\S.\\|$\\)")
+;;; XXX [^[:alnum:]_$] matches \n so the $ is almost never useful
+;;; XXX \\S. matches \n so the $ is almost never useful
 (c-lang-defvar c-pre-id-bracelist-key (c-lang-const c-pre-id-bracelist-key))
 
 (c-lang-defconst c-recognize-typeless-decls

  parent reply	other threads:[~2020-12-09 17:00 UTC|newest]

Thread overview: 45+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2017-02-13 18:20 bug#25706: 26.0.50; Slow C file fontification Sujith
2020-11-30 11:26 ` Lars Ingebrigtsen
2020-11-30 11:37   ` Lars Ingebrigtsen
2020-11-30 12:46 ` Mattias Engdegård
2020-11-30 12:49   ` Lars Ingebrigtsen
2020-11-30 16:27   ` Eli Zaretskii
2020-11-30 16:38   ` Alan Mackenzie
2020-11-30 16:53     ` Mattias Engdegård
2020-11-30 17:04       ` Mattias Engdegård
2020-12-01  5:48         ` Ravine Var
2020-12-01 13:34           ` Mattias Engdegård
2020-12-01  9:29         ` Alan Mackenzie
2020-12-01  9:44           ` martin rudalics
2020-12-01 10:07             ` Alan Mackenzie
2020-12-01  9:21       ` Alan Mackenzie
2020-12-01 12:03         ` Mattias Engdegård
2020-12-01 12:57           ` Alan Mackenzie
2020-12-01 14:07             ` Mattias Engdegård
2020-12-01 15:27               ` Alan Mackenzie
2020-12-01 18:59                 ` Mattias Engdegård
2020-12-02 10:15                   ` Alan Mackenzie
     [not found]                   ` <X8dpQeGaDD1w3kXX@ACM>
2020-12-02 15:06                     ` Mattias Engdegård
2020-12-03 10:48                       ` Alan Mackenzie
2020-12-03 14:03                         ` Mattias Engdegård
2020-12-04 21:04                           ` Alan Mackenzie
     [not found]                           ` <X8qkcokfZGbaK5A2@ACM>
2020-12-05 15:20                             ` Mattias Engdegård
2020-12-08 18:42                               ` Alan Mackenzie
     [not found]                               ` <X8/JG7eD7SfkEimH@ACM>
2020-12-08 19:32                                 ` Mattias Engdegård
2020-12-09  7:31                                 ` Ravine Var
2020-12-09  7:47                                   ` Ravine Var
2020-12-10  8:08                                     ` Alan Mackenzie
2020-12-09 18:46                                   ` Alan Mackenzie
     [not found]                                   ` <X9Ebn7hKnG/vpDcZ@ACM>
2020-12-09 20:04                                     ` Eli Zaretskii
2020-12-09 20:32                                       ` Alan Mackenzie
2020-12-10 17:02                                     ` Ravine Var
2020-12-10 20:02                                       ` Alan Mackenzie
2020-12-11 10:55                                         ` Ravine Var
2020-12-12 15:34                                           ` Alan Mackenzie
     [not found]                                           ` <X9TjCeydJaE2mpK8@ACM>
2020-12-14  7:20                                             ` Ravine Var
2020-12-14 11:44                                               ` Alan Mackenzie
2020-12-15  4:01                                                 ` Ravine Var
2020-12-15 12:27                                                   ` Alan Mackenzie
2020-12-09 17:00                                 ` Mattias Engdegård [this message]
2020-12-10 12:26                                   ` Alan Mackenzie
2020-11-30 18:30   ` Alan Mackenzie

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=FF2C8BEC-A227-4533-8ADC-93080A5BB5DF@acm.org \
    --to=mattiase@acm.org \
    --cc=25706@debbugs.gnu.org \
    --cc=acm@muc.de \
    --cc=larsi@gnus.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
Code repositories for project(s) associated with this external index

	https://git.savannah.gnu.org/cgit/emacs.git
	https://git.savannah.gnu.org/cgit/emacs/org-mode.git

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.