;;; regex-tests.el --- tests for regex.c functions -*- lexical-binding: t -*- ;; Copyright (C) 2015-2016 Free Software Foundation, Inc. ;; This file is part of GNU Emacs. ;; GNU Emacs is free software: you can redistribute it and/or modify ;; it under the terms of the GNU General Public License as published by ;; the Free Software Foundation, either version 3 of the License, or ;; (at your option) any later version. ;; GNU Emacs is distributed in the hope that it will be useful, ;; but WITHOUT ANY WARRANTY; without even the implied warranty of ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ;; GNU General Public License for more details. ;; You should have received a copy of the GNU General Public License ;; along with GNU Emacs. If not, see . ;;; Code: (require 'ert) (defvar regex-tests--resources-dir (concat (concat (file-name-directory (or load-file-name buffer-file-name)) "/regex-resources/")) "Path to regex-resources directory next to the \"regex-tests.el\" file.") (ert-deftest regex-word-cc-fallback-test () "Test that ‘[[:cc:]]*x’ matches ‘x’ (bug#24020). Test that a regex of the form \"[[:cc:]]*x\" where CC is a character class which matches a multibyte character X, matches string \"x\". For example, ‘[[:word:]]*\u2620’ regex (note: \u2620 is a word character) must match a string \"\u2420\"." (dolist (class '("[[:word:]]" "\\sw")) (dolist (repeat '("*" "+")) (dolist (suffix '("" "b" "bar" "\u2620")) (dolist (string '("" "foo")) (when (not (and (string-equal repeat "+") (string-equal string ""))) (should (string-match (concat "^" class repeat suffix "$") (concat string suffix))))))))) (defun regex--test-cc (name matching not-matching) (let (case-fold-search) (should (string-match-p (concat "^[[:" name ":]]*$") matching)) (should (string-match-p (concat "^[[:" name ":]]*?\u2622$") (concat matching "\u2622"))) (should (string-match-p (concat "^[^[:" name ":]]*$") not-matching)) (should (string-match-p (concat "^[^[:" name ":]]*\u2622$") (concat not-matching "\u2622"))) (with-temp-buffer (insert matching) (let ((p (point))) (insert not-matching) (goto-char (point-min)) (skip-chars-forward (concat "[:" name ":]")) (should (equal (point) p)) (skip-chars-forward (concat "^[:" name ":]")) (should (equal (point) (point-max))) (goto-char (point-min)) (skip-chars-forward (concat "[:" name ":]\u2622")) (should (or (equal (point) p) (equal (point) (1+ p)))))))) (dolist (test '(("alnum" "abcABC012łąkaDZßIJfiDzΣσςɕ" "-, \t\n") ("alpha" "abcABCłąkaDZßIJfiDzΣσςɕ" "-,012 \t\n") ("digit" "012" "abcABCłąka-, \t\n") ("xdigit" "0123aBc" "łąk-, \t\n") ("upper" "ABCŁĄKADZIJΣ" "abcß0fiσςɕ12-, \t\n") ("lower" "abcłąkaßfiσς" "ABC012DZIJΣ-, \t\n") ("word" "abcABC012\u2620DZßIJfiDzΣσςɕ" "-, \t\n") ("punct" ".,-" "abcABC012\u2620 \t\n") ("cntrl" "\1\2\t\n" ".,-abcABC012\u2620 ") ("graph" "abcłąka\u2620-,DZßIJfiDzΣσςɕ" " \t\n\1") ("print" "abcłąka\u2620-,DZßIJfiDzΣσςɕ " "\t\n\1") ("space" " \t\n\u2001" "abcABCł0123") ("blank" " \t" "\n\u2001") ("ascii" "abcABC012 \t\n\1" "łą\u2620DZßIJfiDzΣσςɕ") ("nonascii" "łą\u2622DZßIJfiDzΣσςɕ" "abcABC012 \t\n\1") ;; Note: sharp s is unibyte since it’s code point is below 256. ("unibyte" "abcABC012ß \t\n\1" "łą\u2622DZIJfiDzΣσςɕ") ("multibyte" "łą\u2622DZIJfiDzΣσςɕ" "abcABC012ß \t\n\1"))) (let ((name (intern (concat "regex-tests-" (car test) "-character-class"))) (doc (concat "Perform sanity test of regexes using " (car test) " character class. Go over all the supported character classes and test whether the classes and their inversions match what they are supposed to match. The test is done using `string-match-p' as well as `skip-chars-forward'."))) (eval `(ert-deftest ,name () ,doc ,(cons 'regex--test-cc test)) t))) (defun regex-tests-benchmark-cc-match () "Benchmark regex character class matching." (interactive) (let* ((prn (if (called-interactively-p) 'insert (lambda (&rest args) (mapc 'princ args)))) (strings (nconc (list (apply 'string (number-sequence 32 126)) (apply 'string (number-sequence 0 127)) (apply 'unibyte-string (number-sequence 128 255)) (concat (apply 'string (number-sequence 0 255)) (apply 'unibyte-string (number-sequence 128 255))) (make-string 10000 #x3FFF80) (make-string 10000 #x3FFFFF)) (mapcar (lambda (ch) (make-string 10000 ch)) (number-sequence 0 256)))) (ccs '("alnum" "alpha" "digit" "xdigit" "upper" "lower" "word" "punct" "cntrl" "graph" "print" "space" "blank" "ascii" "nonascii" "unibyte" "multibyte")) (benchmark-re (lambda (re) (dolist (cf '(nil t)) ;; Compile the regex so it ends up in cache. (string-match re "") (let ((res (benchmark-run 10 (dolist (str strings) (string-match re str))))) (funcall prn (format " %10.3f" (* (- (nth 0 res) (nth 2 res)) 100)))))))) (when (called-interactively-p) (switch-to-buffer (get-buffer-create "*Regex Benchmark*")) (delete-region (point-min) (point-max))) (funcall prn (format "%-9s %-9s %-9s %-9s %-9s\n" "Class" "[[:cc:]]" "no-case" "[^[:cc:]]" "no-case") (make-string 9 ?-) " " (make-string 9 ?-) " " (make-string 9 ?-) " " (make-string 9 ?-) " " (make-string 9 ?-) "\n") (dolist (cc ccs) (funcall prn (format "%-9s" cc)) (dolist (re (list (format "[[:%s:]]" cc) (format "[^[:%s:]]" cc))) (funcall benchmark-re re)) (funcall prn "\n")) (funcall prn (format "%-9s" "...all...")) (let ((all-ccs (mapconcat (lambda (cc) (format "[:%s:]" cc)) ccs ""))) (funcall benchmark-re (concat "[" all-ccs "]")) (funcall benchmark-re (concat "[^" all-ccs "]"))) (funcall prn "\n" (make-string 53 ?-) "\nAll times in ms; lower is better.\n"))) (defmacro regex-tests-generic-line (comment-char test-file whitelist &rest body) "Reads a line of the test file TEST-FILE, skipping comments (defined by COMMENT-CHAR), and evaluates the tests in this line as defined in the BODY. Line numbers in the WHITELIST are known failures, and are skipped." `(with-temp-buffer (modify-syntax-entry ?_ "w;; ") ; tests expect _ to be a word (insert-file-contents (concat regex-tests--resources-dir ,test-file)) (let ((case-fold-search nil) (line-number 1) (whitelist-idx 0)) (goto-char (point-min)) (while (not (eobp)) (let ((start (point))) (end-of-line) (narrow-to-region start (point)) (goto-char (point-min)) (when (and ;; ignore comments (save-excursion (re-search-forward ,(concat "^[^" (string comment-char) "]") nil t)) ;; skip lines in the whitelist (let ((whitelist-next (condition-case nil (aref ,whitelist whitelist-idx) (args-out-of-range nil)))) (cond ;; whitelist exhausted. do process this line ((null whitelist-next) t) ;; we're not yet at the next whitelist element. do ;; process this line ((< line-number whitelist-next) t) ;; we're past the next whitelist element. This ;; shouldn't happen ((> line-number whitelist-next) (error (format "We somehow skipped the next whitelist element: line %d" whitelist-next))) ;; we're at the next whitelist element. Skip this ;; line, and advance the whitelist index (t (setq whitelist-idx (1+ whitelist-idx)) nil)))) ,@body) (widen) (forward-line) (beginning-of-line) (setq line-number (1+ line-number))))))) (defun regex-tests-compare (string what-failed bounds-ref &optional substring-ref) "I just ran a search, looking at STRING. WHAT-FAILED describes what failed, if anything; valid values are 'search-failed, 'compilation-failed and nil. I compare the beginning/end of each group with their expected values. This is done with either BOUNDS-REF or SUBSTRING-REF; one of those should be non-nil. BOUNDS-REF is a sequence \[start-ref0 end-ref0 start-ref1 end-ref1 ....] while SUBSTRING-REF is the expected substring obtained by indexing the input string by start/end-ref. If the search was supposed to fail then start-ref0/substring-ref0 is 'search-failed. If the search wasn't even supposed to compile successfully, then start-ref0/substring-ref0 is 'compilation-failed. If I only care about a match succeeding, this can be set to t. This function returns a string that describes the failure, or nil on success" (when (or (and bounds-ref substring-ref) (not (or bounds-ref substring-ref))) (error "Exactly one of bounds-ref and bounds-ref should be non-nil")) (let ((what-failed-ref (car (or bounds-ref substring-ref)))) (cond ((eq what-failed 'search-failed) (cond ((eq what-failed-ref 'search-failed) nil) ((eq what-failed-ref 'compilation-failed) "Expected pattern failure; but no match") (t "Expected match; but no match"))) ((eq what-failed 'compilation-failed) (cond ((eq what-failed-ref 'search-failed) "Expected no match; but pattern failure") ((eq what-failed-ref 'compilation-failed) nil) (t "Expected match; but pattern failure"))) ;; The regex match succeeded ((eq what-failed-ref 'search-failed) "Expected no match; but match") ((eq what-failed-ref 'compilation-failed) "Expected pattern failure; but match") ;; The regex match succeeded, as expected. I now check all the ;; bounds (t (let ((idx 0) msg ref next-ref-function compare-ref-function mismatched-ref-function) (if bounds-ref (setq ref bounds-ref next-ref-function (lambda (x) (cddr x)) compare-ref-function (lambda (ref start-pos end-pos) (or (eq (car ref) t) (and (eq start-pos (car ref)) (eq end-pos (cadr ref))))) mismatched-ref-function (lambda (ref start-pos end-pos) (format "beginning/end positions: %d/%s and %d/%s" start-pos (car ref) end-pos (cadr ref)))) (setq ref substring-ref next-ref-function (lambda (x) (cdr x)) compare-ref-function (lambda (ref start-pos end-pos) (or (eq (car ref) t) (string= (substring string start-pos end-pos) (car ref)))) mismatched-ref-function (lambda (ref start-pos end-pos) (format "beginning/end positions: %d/%s and %d/%s" start-pos (car ref) end-pos (cadr ref))))) (while (not (or (null ref) msg)) (let ((start (match-beginning idx)) (end (match-end idx))) (when (not (funcall compare-ref-function ref start end)) (setq msg (format "Have expected match, but mismatch in group %d: %s" idx (funcall mismatched-ref-function ref start end)))) (setq ref (funcall next-ref-function ref) idx (1+ idx)))) (or msg nil)))))) (defun regex-tests-match (pattern string bounds-ref &optional substring-ref) "I match the given STRING against PATTERN. I compare the beginning/end of each group with their expected values. BOUNDS-REF is a sequence [start-ref0 end-ref0 start-ref1 end-ref1 ....]. If the search was supposed to fail then start-ref0 is 'search-failed. If the search wasn't even supposed to compile successfully, then start-ref0 is 'compilation-failed. This function returns a string that describes the failure, or nil on success" (if (string-match "\\[\\([\\.=]\\)..?\\1\\]" pattern) ;; Skipping test: [.x.] and [=x=] forms not supported by emacs nil (regex-tests-compare string (condition-case nil (if (string-match pattern string) nil 'search-failed) ('invalid-regexp 'compilation-failed)) bounds-ref substring-ref))) (defconst regex-tests-re-even-escapes "\\(?:^\\|[^\\\\]\\)\\(?:\\\\\\\\\\)*" "Regex that matches an even number of \\ characters") (defconst regex-tests-re-odd-escapes (concat regex-tests-re-even-escapes "\\\\") "Regex that matches an odd number of \\ characters") (defun regex-tests-unextend (pattern) "Basic conversion from extended regexes to emacs ones. This is mostly a hack that adds \\ to () and | and {}, and removes it if it already exists. We also change \\S (and \\s) to \\S- (and \\s-) because extended regexes see the former as whitespace, but emacs requires an extra symbol character" (with-temp-buffer (insert pattern) (goto-char (point-min)) (while (re-search-forward "[()|{}]" nil t) ;; point is past special character. If it is escaped, unescape ;; it (if (save-excursion (re-search-backward (concat regex-tests-re-odd-escapes ".\\=") nil t)) ;; This special character is preceded by an odd number of \, ;; so I unescape it by removing the last one (progn (forward-char -2) (delete-char 1) (forward-char 1)) ;; This special character is preceded by an even (possibly 0) ;; number of \. I add an escape (forward-char -1) (insert "\\") (forward-char 1))) ;; convert \s to \s- (goto-char (point-min)) (while (re-search-forward (concat regex-tests-re-odd-escapes "[Ss]") nil t) (insert "-")) (buffer-string))) (defun regex-tests-BOOST-frob-escapes (s ispattern) "Mangle \\ the way it is done in frob_escapes() in regex-tests-BOOST.c in glibc: \\t, \\n, \\r are interpreted; \\\\, \\^, \{, \\|, \} are unescaped for the string (not pattern)" ;; this is all similar to (regex-tests-unextend) (with-temp-buffer (insert s) (let ((interpret-list (list "t" "n" "r"))) (while interpret-list (goto-char (point-min)) (while (re-search-forward (concat "\\(" regex-tests-re-even-escapes "\\)" "\\\\" (car interpret-list)) nil t) (replace-match (concat "\\1" (car (read-from-string (concat "\"\\" (car interpret-list) "\"")))))) (setq interpret-list (cdr interpret-list)))) (when (not ispattern) ;; unescape \\, \^, \{, \|, \} (let ((unescape-list (list "\\\\" "^" "{" "|" "}"))) (while unescape-list (goto-char (point-min)) (while (re-search-forward (concat "\\(" regex-tests-re-even-escapes "\\)" "\\\\" (car unescape-list)) nil t) (replace-match (concat "\\1" (car unescape-list)))) (setq unescape-list (cdr unescape-list)))) ) (buffer-string))) (defconst regex-tests-BOOST-whitelist [ ;; emacs is more stringent with regexes involving unbalanced ) 63 65 69 ;; in emacs, regex . doesn't match \n 91 ;; emacs is more forgiving with * and ? that don't apply to ;; characters 107 108 109 122 123 124 140 141 142 ;; emacs accepts regexes with {} 161 ;; emacs doesn't fail on bogus ranges such as [3-1] or [1-3-5] 222 223 ;; emacs doesn't match (ab*)[ab]*\1 greedily: only 4 chars of ;; ababaaa match 284 294 ;; ambiguous groupings are ambiguous 443 444 445 446 448 449 450 ;; emacs doesn't know how to handle weird ranges such as [a-Z] and ;; [[:alpha:]-a] 539 580 581 ;; emacs matches non-greedy regex ab.*? non-greedily 639 677 712 ] "Line numbers in the boost test that should be skipped. These are false-positive test failures that represent known/benign differences in behavior.") ;; - Format ;; - Comments are lines starting with ; ;; - Lines starting with - set options passed to regcomp() and regexec(): ;; - if no "REG_BASIC" is found, with have an extended regex ;; - These set a flag: ;; - REG_ICASE ;; - REG_NEWLINE (ignored by this function) ;; - REG_NOTBOL ;; - REG_NOTEOL ;; ;; - Test lines are ;; pattern string start0 end0 start1 end1 ... ;; ;; - pattern, string can have escapes ;; - string can have whitespace if enclosed in "" ;; - if string is "!", then the pattern is supposed to fail compilation ;; - start/end are of group0, group1, etc. group 0 is the full match ;; - start<0 indicates "no match" ;; - start is the 0-based index of the first character ;; - end is the 0-based index of the first character past the group (defun regex-tests-BOOST () (let (failures basic icase notbol noteol) (regex-tests-generic-line ?; "BOOST.tests" regex-tests-BOOST-whitelist (if (save-excursion (re-search-forward "^-" nil t)) (setq basic (save-excursion (re-search-forward "REG_BASIC" nil t)) icase (save-excursion (re-search-forward "REG_ICASE" nil t)) notbol (save-excursion (re-search-forward "REG_NOTBOL" nil t)) noteol (save-excursion (re-search-forward "REG_NOTEOL" nil t))) (save-excursion (or (re-search-forward "\\(\\S-+\\)\\s-+\"\\(.*\\)\"\\s-+?\\(.+\\)" nil t) (re-search-forward "\\(\\S-+\\)\\s-+\\(\\S-+\\)\\s-+?\\(.+\\)" nil t) (re-search-forward "\\(\\S-+\\)\\s-+\\(!\\)" nil t))) (let* ((pattern-raw (match-string 1)) (string-raw (match-string 2)) (positions-raw (match-string 3)) (pattern (regex-tests-BOOST-frob-escapes pattern-raw t)) (string (regex-tests-BOOST-frob-escapes string-raw nil)) (positions (if (string= string "!") (list 'compilation-failed 0) (mapcar (lambda (x) (let ((x (string-to-number x))) (if (< x 0) nil x))) (split-string positions-raw))))) (when (null (car positions)) (setcar positions 'search-failed)) (when (not basic) (setq pattern (regex-tests-unextend pattern))) ;; great. I now have all the data parsed. Let's use it to do ;; stuff (let* ((case-fold-search icase) (msg (regex-tests-match pattern string positions))) (if (and ;; Skipping test: notbol/noteol not supported (not notbol) (not noteol) msg) ;; store failure (setq failures (cons (format "line number %d: Regex '%s': %s" line-number pattern msg) failures))))))) failures)) (defconst regex-tests-PCRE-whitelist [ ;; ambiguous groupings are ambiguous 610 611 1154 1157 1160 1168 1171 1176 1179 1182 1185 1188 1193 1196 1203 ] "Line numbers in the PCRE test that should be skipped. These are false-positive test failures that represent known/benign differences in behavior.") ;; - Format ;; ;; regex ;; input_string ;; group_num: group_match | "No match" ;; input_string ;; group_num: group_match | "No match" ;; input_string ;; group_num: group_match | "No match" ;; input_string ;; group_num: group_match | "No match" ;; ... (defun regex-tests-PCRE () (let (failures pattern icase string what-failed matches-observed) (regex-tests-generic-line ?# "PCRE.tests" regex-tests-PCRE-whitelist (cond ;; pattern ((save-excursion (re-search-forward "^/\\(.*\\)/\\(.*i?\\)$" nil t)) (setq icase (string= "i" (match-string 2)) pattern (regex-tests-unextend (match-string 1)))) ;; string. read it in, match against pattern, and save all the results ((save-excursion (re-search-forward "^ \\(.*\\)" nil t)) (let ((case-fold-search icase)) (setq string (match-string 1) ;; the regex match under test what-failed (condition-case nil (if (string-match pattern string) nil 'search-failed) ('invalid-regexp 'compilation-failed)) matches-observed (cl-loop for x from 0 to 20 collect (and (not what-failed) (or (match-string x string) ""))))) nil) ;; verification line: failed match ((save-excursion (re-search-forward "^No match" nil t)) (unless what-failed (setq failures (cons (format "line number %d: Regex '%s': Expected no match; but match" line-number pattern) failures)))) ;; verification line: succeeded match ((save-excursion (re-search-forward "^ *\\([0-9]+\\): \\(.*\\)" nil t)) (let* ((match-ref (match-string 2)) (idx (string-to-number (match-string 1)))) (if what-failed "Expected match; but no match" (unless (string= match-ref (elt matches-observed idx)) (setq failures (cons (format "line number %d: Regex '%s': Have expected match, but group %d is wrong: '%s'/'%s'" line-number pattern idx match-ref (elt matches-observed idx)) failures)))))) ;; reset (t (setq pattern nil) nil))) failures)) (defconst regex-tests-PTESTS-whitelist [ ;; emacs doesn't barf on weird ranges such as [b-a], but simply ;; fails to match 138 ;; emacs doesn't see DEL (0x78) as a [:cntrl:] character 168 ] "Line numbers in the PTESTS test that should be skipped. These are false-positive test failures that represent known/benign differences in behavior.") ;; - Format ;; - fields separated by ¦ (note: this is not a |) ;; - start¦end¦pattern¦string ;; - start is the 1-based index of the first character ;; - end is the 1-based index of the last character (defun regex-tests-PTESTS () (let (failures) (regex-tests-generic-line ?# "PTESTS" regex-tests-PTESTS-whitelist (let* ((fields (split-string (buffer-string) "¦")) ;; string has 1-based index of first char in the ;; match. -1 means "no match". -2 means "invalid ;; regex". ;; ;; start-ref is 0-based index of first char in the ;; match ;; ;; string==0 is a special case, and I have to treat ;; it as start-ref = 0 (start-ref (let ((raw (string-to-number (elt fields 0)))) (cond ((= raw -2) 'compilation-failed) ((= raw -1) 'search-failed) ((= raw 0) 0) (t (1- raw))))) ;; string has 1-based index of last char in the ;; match. end-ref is 0-based index of first char past ;; the match (end-ref (string-to-number (elt fields 1))) (pattern (elt fields 2)) (string (elt fields 3))) (let ((msg (regex-tests-match pattern string (list start-ref end-ref)))) (when msg (setq failures (cons (format "line number %d: Regex '%s': %s" line-number pattern msg) failures)))))) failures)) (defconst regex-tests-TESTS-whitelist [ ;; emacs doesn't barf on weird ranges such as [b-a], but simply ;; fails to match 42 ;; emacs is more forgiving with * and ? that don't apply to ;; characters 57 58 59 60 ;; emacs is more stringent with regexes involving unbalanced ) 67 ] "Line numbers in the TESTS test that should be skipped. These are false-positive test failures that represent known/benign differences in behavior.") ;; - Format ;; - fields separated by :. Watch for [\[:xxx:]] ;; - expected:pattern:string ;; ;; expected: ;; | 0 | successful match | ;; | 1 | failed match | ;; | 2 | regcomp() should fail | (defun regex-tests-TESTS () (let (failures) (regex-tests-generic-line ?# "TESTS" regex-tests-TESTS-whitelist (if (save-excursion (re-search-forward "^\\([^:]+\\):\\(.*\\):\\([^:]*\\)$" nil t)) (let* ((what-failed (let ((raw (string-to-number (match-string 1)))) (cond ((= raw 2) 'compilation-failed) ((= raw 1) 'search-failed) (t t)))) (string (match-string 3)) (pattern (regex-tests-unextend (match-string 2)))) (let ((msg (regex-tests-match pattern string nil (list what-failed)))) (when msg (setq failures (cons (format "line number %d: Regex '%s': %s" line-number pattern msg) failures))))) (error "Error parsing TESTS file line: '%s'" (buffer-string)))) failures)) (ert-deftest regex-tests-BOOST () "Tests of the regular expression engine. This evaluates the BOOST test cases from glibc." (should-not (regex-tests-BOOST))) (ert-deftest regex-tests-PCRE () "Tests of the regular expression engine. This evaluates the PCRE test cases from glibc." (should-not (regex-tests-PCRE))) (ert-deftest regex-tests-PTESTS () "Tests of the regular expression engine. This evaluates the PTESTS test cases from glibc." (should-not (regex-tests-PTESTS))) (ert-deftest regex-tests-TESTS () "Tests of the regular expression engine. This evaluates the TESTS test cases from glibc." (should-not (regex-tests-TESTS))) ;;; regex-tests.el ends here