unofficial mirror of bug-gnu-emacs@gnu.org 
 help / color / mirror / code / Atom feed
* bug#36444: [PATCH] Improved regexp-opt KEEP-ORDER check
@ 2019-06-30 12:28 Mattias Engdegård
  2019-07-03 19:29 ` Noam Postavsky
  0 siblings, 1 reply; 6+ messages in thread
From: Mattias Engdegård @ 2019-06-30 12:28 UTC (permalink / raw)
  To: 36444

[-- Attachment #1: Type: text/plain, Size: 473 bytes --]

Currently, regexp-opt does not attempt optimisation with KEEP-ORDER set if the input list contains a proper prefix of another element, like

 ("ab" "abcd")

on the grounds that the optimised string would be

 "ab\\(?:cd\\)?"

which would not preserve the match order. However, this also prevents

 ("abcd" "ab")

from being optimised, even though doing so would be harmless.

The attached patch strengthens the check, allowing more inputs to be optimised.


[-- Attachment #2: 0001-Optimise-more-inputs-to-regexp-opt.patch --]
[-- Type: application/octet-stream, Size: 7400 bytes --]

From ee29b82719c6ca40b3ff9054fcbf4e964ed18ad3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mattias=20Engdeg=C3=A5rd?= <mattiase@acm.org>
Date: Sun, 30 Jun 2019 12:53:52 +0200
Subject: [PATCH] Optimise more inputs to `regexp-opt'

Use a more precise test to determine whether the input to `regexp-opt'
is safe to optimise when KEEP-ORDER is non-nil, permitting more inputs
to be optimised than before.  For example, ("good" "goal" "go") is now
accepted.

* lisp/emacs-lisp/regexp-opt.el (regexp-opt):
More precise test for whether the list is safe w.r.t. KEEP-ORDER.
(regexp-opt--contains-prefix): Remove.

* test/lisp/emacs-lisp/regexp-opt-tests.el: Use lexical-binding.
(regexp-opt-test--permutation, regexp-opt-test--factorial)
(regexp-opt-test--permutations, regexp-opt-test--match-all)
(regexp-opt-test--check-perm, regexp-opt-test--explain-perm)
(regexp-opt-keep-order): Test KEEP-ORDER.
---
 lisp/emacs-lisp/regexp-opt.el            | 45 ++++++++---------
 test/lisp/emacs-lisp/regexp-opt-tests.el | 62 +++++++++++++++++++++++-
 2 files changed, 82 insertions(+), 25 deletions(-)

diff --git a/lisp/emacs-lisp/regexp-opt.el b/lisp/emacs-lisp/regexp-opt.el
index b6104f22e7..8afb0c08db 100644
--- a/lisp/emacs-lisp/regexp-opt.el
+++ b/lisp/emacs-lisp/regexp-opt.el
@@ -140,21 +140,33 @@ regexp-opt
 	   (completion-ignore-case nil)
 	   (completion-regexp-list nil)
 	   (open (cond ((stringp paren) paren) (paren "\\(")))
-	   (sorted-strings (delete-dups
-			    (sort (copy-sequence strings) 'string-lessp)))
 	   (re
             (cond
              ;; No strings: return an unmatchable regexp.
              ((null strings)
               (concat (or open "\\(?:") regexp-unmatchable "\\)"))
-             ;; If we cannot reorder, give up all attempts at
-             ;; optimisation.  There is room for improvement (Bug#34641).
-             ((and keep-order (regexp-opt--contains-prefix sorted-strings))
-              (concat (or open "\\(?:")
-                      (mapconcat #'regexp-quote strings "\\|")
-                      "\\)"))
+
+             ;; The algorithm will generate a pattern that matches
+             ;; longer strings in the list before shorter.  If the
+             ;; list order matters, then no string must come after a
+             ;; proper prefix of that string.  To check this, verify
+             ;; that a straight or-pattern matches each string
+             ;; entirely.
+             ((and keep-order
+                   (let* ((case-fold-search nil)
+                          (alts (mapconcat #'regexp-quote strings "\\|")))
+                     (and (save-match-data
+                            (let ((s strings))
+                              (while (and s
+                                          (string-match alts (car s))
+                                          (= (match-end 0) (length (car s))))
+                                (setq s (cdr s)))
+                              s))
+                          (concat (or open "\\(?:") alts "\\)")))))
              (t
-              (regexp-opt-group sorted-strings (or open t) (not open))))))
+              (regexp-opt-group
+               (delete-dups (sort (copy-sequence strings) 'string-lessp))
+               (or open t) (not open))))))
       (cond ((eq paren 'words)
 	     (concat "\\<" re "\\>"))
 	    ((eq paren 'symbols)
@@ -339,21 +351,6 @@ regexp-opt-charset
              (concat "[" all "]")))))))
 
 
-(defun regexp-opt--contains-prefix (strings)
-  "Whether STRINGS contains a proper prefix of one of its other elements.
-STRINGS must be a list of sorted strings without duplicates."
-  (let ((s strings))
-    ;; In a lexicographically sorted list, a string always immediately
-    ;; succeeds one of its prefixes.
-    (while (and (cdr s)
-                (not (string-equal
-                      (car s)
-                      (substring (cadr s) 0 (min (length (car s))
-                                                 (length (cadr s)))))))
-      (setq s (cdr s)))
-    (cdr s)))
-
-
 (provide 'regexp-opt)
 
 ;;; regexp-opt.el ends here
diff --git a/test/lisp/emacs-lisp/regexp-opt-tests.el b/test/lisp/emacs-lisp/regexp-opt-tests.el
index 927de8c6a5..3658964faa 100644
--- a/test/lisp/emacs-lisp/regexp-opt-tests.el
+++ b/test/lisp/emacs-lisp/regexp-opt-tests.el
@@ -1,4 +1,4 @@
-;;; regexp-opt-tests.el --- Tests for regexp-opt.el
+;;; regexp-opt-tests.el --- Tests for regexp-opt.el  -*- lexical-binding: t -*-
 
 ;; Copyright (C) 2013-2019 Free Software Foundation, Inc.
 
@@ -25,6 +25,66 @@
 
 (require 'regexp-opt)
 
+(defun regexp-opt-test--permutation (n list)
+  "The Nth permutation of LIST, 0 ≤ N < (length LIST)!."
+  (let ((len (length list))
+        (perm-list nil))
+    (dotimes (i len)
+      (let* ((d (- len i))
+             (k (mod n d)))
+        (push (nth k list) perm-list)
+        (setq list (append (butlast list (- (length list) k))
+                           (nthcdr (1+ k) list)))
+        (setq n (/ n d))))
+    (nreverse perm-list)))
+
+(defun regexp-opt-test--factorial (n)
+  "N!"
+  (apply #'* (number-sequence 1 n)))
+
+(defun regexp-opt-test--permutations (list)
+  "All permutations of LIST."
+  (mapcar (lambda (i) (regexp-opt-test--permutation i list))
+          (number-sequence 0 (1- (regexp-opt-test--factorial (length list))))))
+
+(defun regexp-opt-test--match-all (words re)
+  (mapcar (lambda (w) (and (string-match re w)
+                           (match-string 0 w)))
+          words))
+
+(defun regexp-opt-test--check-perm (perm)
+  (let* ((ref-re (mapconcat #'regexp-quote perm "\\|"))
+         (opt-re (regexp-opt perm nil t))
+         (ref (regexp-opt-test--match-all perm ref-re))
+         (opt (regexp-opt-test--match-all perm opt-re)))
+    (equal opt ref)))
+
+(defun regexp-opt-test--explain-perm (perm)
+  (let* ((ref-re (mapconcat #'regexp-quote perm "\\|"))
+         (opt-re (regexp-opt perm nil t))
+         (ref (regexp-opt-test--match-all perm ref-re))
+         (opt (regexp-opt-test--match-all perm opt-re)))
+    (concat "\n"
+            (format "Naïve regexp:     %s\n" ref-re)
+            (format "Optimised regexp: %s\n" opt-re)
+            (format "Got:      %s\n" opt)
+            (format "Expected: %s\n" ref))))
+
+(put 'regexp-opt-test--check-perm 'ert-explainer 'regexp-opt-test--explain-perm)
+
+(ert-deftest regexp-opt-keep-order ()
+  "Check that KEEP-ORDER works."
+  (dolist (perm (regexp-opt-test--permutations '("abc" "bca" "cab")))
+    (should (regexp-opt-test--check-perm perm)))
+  (dolist (perm (regexp-opt-test--permutations '("abc" "ab" "bca" "bc")))
+    (should (regexp-opt-test--check-perm perm)))
+  (dolist (perm (regexp-opt-test--permutations '("abxy" "cdxy")))
+    (should (regexp-opt-test--check-perm perm)))
+  (dolist (perm (regexp-opt-test--permutations '("afgx" "bfgx" "afgy" "bfgy")))
+    (should (regexp-opt-test--check-perm perm)))
+  (dolist (perm (regexp-opt-test--permutations '("a" "ab" "ac" "abc")))
+    (should (regexp-opt-test--check-perm perm))))
+
 (ert-deftest regexp-opt-charset ()
   (should (equal (regexp-opt-charset '(?a ?b ?a)) "[ab]"))
   (should (equal (regexp-opt-charset '(?D ?d ?B ?a ?b ?C ?7 ?a ?c ?A))
-- 
2.20.1 (Apple Git-117)


^ permalink raw reply related	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2019-07-04 16:01 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2019-06-30 12:28 bug#36444: [PATCH] Improved regexp-opt KEEP-ORDER check Mattias Engdegård
2019-07-03 19:29 ` Noam Postavsky
2019-07-04 11:52   ` Mattias Engdegård
2019-07-04 14:18     ` Noam Postavsky
2019-07-04 15:18       ` Mattias Engdegård
2019-07-04 16:01         ` Drew Adams

Code repositories for project(s) associated with this public inbox

	https://git.savannah.gnu.org/cgit/emacs.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).