From c5687df16cd4cb73fc593556a68a124354f273db Mon Sep 17 00:00:00 2001 From: Dima Kogan Date: Sat, 27 Feb 2016 18:06:35 -0800 Subject: [PATCH 2/2] Added driver for the regex tests * test/src/regex/regex-tests.el: regex test driver --- test/src/regex/regex-tests.el | 590 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 590 insertions(+) create mode 100644 test/src/regex/regex-tests.el diff --git a/test/src/regex/regex-tests.el b/test/src/regex/regex-tests.el new file mode 100644 index 0000000..8709f90 --- /dev/null +++ b/test/src/regex/regex-tests.el @@ -0,0 +1,590 @@ +;;; regex-tests.el --- tests for regex.c -*- lexical-binding: t; -*- + +;; Copyright (C) 2016 Free Software Foundation, Inc. + +;; This file is part of GNU Emacs. + +;; GNU Emacs is free software: you can redistribute it and/or modify +;; it under the terms of the GNU General Public License as published by +;; the Free Software Foundation, either version 3 of the License, or +;; (at your option) any later version. + +;; GNU Emacs is distributed in the hope that it will be useful, +;; but WITHOUT ANY WARRANTY; without even the implied warranty of +;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;; GNU General Public License for more details. + +;; You should have received a copy of the GNU General Public License +;; along with GNU Emacs. If not, see . + +(require 'ert) + +(defmacro regex-tests-generic-line (comment-char test-file whitelist &rest body) + "Reads a line of the test file TEST-FILE, skipping +comments (defined by COMMENT-CHAR), and evaluates the tests in +this line as defined in the BODY. Line numbers in the WHITELIST +are known failures, and are skipped." + + `(with-temp-buffer + (modify-syntax-entry ?_ "w;; ") ; tests expect _ to be a word + (insert-file-contents ,(concat (file-name-directory (buffer-file-name)) test-file)) + + (let ((case-fold-search nil) + (line-number 1) + (whitelist-idx 0)) + + (goto-char (point-min)) + + (while (not (eobp)) + (let ((start (point))) + (end-of-line) + (narrow-to-region start (point)) + + (goto-char (point-min)) + + (when + (and + ;; ignore comments + (save-excursion + (re-search-forward ,(concat "^[^" (string comment-char) "]") nil t)) + + ;; skip lines in the whitelist + (let ((whitelist-next + (condition-case nil + (aref ,whitelist whitelist-idx) (args-out-of-range nil)))) + (cond + ;; whitelist exhausted. do process this line + ((null whitelist-next) t) + + ;; we're not yet at the next whitelist element. do + ;; process this line + ((< line-number whitelist-next) t) + + ;; we're past the next whitelist element. This + ;; shouldn't happen + ((> line-number whitelist-next) + (error + (format + "We somehow skipped the next whitelist element: line %d" whitelist-next))) + + ;; we're at the next whitelist element. Skip this + ;; line, and advance the whitelist index + (t + (setq whitelist-idx (1+ whitelist-idx)) nil)))) + ,@body) + + (widen) + (forward-line) + (beginning-of-line) + (setq line-number (1+ line-number))))))) + +(defun regex-tests-compare (string what-failed bounds-ref &optional substring-ref) + "I just ran a search, looking at STRING. WHAT-FAILED describes +what failed, if anything; valid values are 'search-failed, +'compilation-failed and nil. I compare the beginning/end of each +group with their expected values. This is done with either +BOUNDS-REF or SUBSTRING-REF; one of those should be non-nil. +BOUNDS-REF is a sequence \[start-ref0 end-ref0 start-ref1 +end-ref1 ....] while SUBSTRING-REF is the expected substring +obtained by indexing the input string by start/end-ref. + +If the search was supposed to fail then start-ref0/substring-ref0 +is 'search-failed. If the search wasn't even supposed to compile +successfully, then start-ref0/substring-ref0 is +'compilation-failed. If I only care about a match succeeding, +this can be set to t. + +This function returns a string that describes the failure, or nil +on success" + + (when (or + (and bounds-ref substring-ref) + (not (or bounds-ref substring-ref))) + (error "Exactly one of bounds-ref and bounds-ref should be non-nil")) + + (let ((what-failed-ref (car (or bounds-ref substring-ref)))) + + (cond + ((eq what-failed 'search-failed) + (cond + ((eq what-failed-ref 'search-failed) + nil) + ((eq what-failed-ref 'compilation-failed) + "Expected pattern failure; but no match") + (t + "Expected match; but no match"))) + + ((eq what-failed 'compilation-failed) + (cond + ((eq what-failed-ref 'search-failed) + "Expected no match; but pattern failure") + ((eq what-failed-ref 'compilation-failed) + nil) + (t + "Expected match; but pattern failure"))) + + ;; The regex match succeeded + ((eq what-failed-ref 'search-failed) + "Expected no match; but match") + ((eq what-failed-ref 'compilation-failed) + "Expected pattern failure; but match") + + ;; The regex match succeeded, as expected. I now check all the + ;; bounds + (t + (let ((idx 0) + msg + ref next-ref-function compare-ref-function mismatched-ref-function) + + (if bounds-ref + (setq ref bounds-ref + next-ref-function (lambda (x) (cddr x)) + compare-ref-function (lambda (ref start-pos end-pos) + (or (eq (car ref) t) + (and (eq start-pos (car ref)) + (eq end-pos (cadr ref))))) + mismatched-ref-function (lambda (ref start-pos end-pos) + (format + "beginning/end positions: %d/%s and %d/%s" + start-pos (car ref) end-pos (cadr ref)))) + (setq ref substring-ref + next-ref-function (lambda (x) (cdr x)) + compare-ref-function (lambda (ref start-pos end-pos) + (or (eq (car ref) t) + (string= (substring string start-pos end-pos) (car ref)))) + mismatched-ref-function (lambda (ref start-pos end-pos) + (format + "beginning/end positions: %d/%s and %d/%s" + start-pos (car ref) end-pos (cadr ref))))) + + (while (not (or (null ref) msg)) + + (let ((start (match-beginning idx)) + (end (match-end idx))) + + (when (not (funcall compare-ref-function ref start end)) + (setq msg + (format + "Have expected match, but mismatch in group %d: %s" idx (funcall mismatched-ref-function ref start end)))) + + (setq ref (funcall next-ref-function ref) + idx (1+ idx)))) + + (or msg + nil)))))) + + + +(defun regex-tests-match (pattern string bounds-ref &optional substring-ref) + "I match the given STRING against PATTERN. I compare the +beginning/end of each group with their expected values. +BOUNDS-REF is a sequence [start-ref0 end-ref0 start-ref1 end-ref1 +....]. + +If the search was supposed to fail then start-ref0 is +'search-failed. If the search wasn't even supposed to compile +successfully, then start-ref0 is 'compilation-failed. + +This function returns a string that describes the failure, or nil +on success" + + (if (string-match "\\[\\([\\.=]\\)..?\\1\\]" pattern) + ;; Skipping test: [.x.] and [=x=] forms not supported by emacs + nil + + (regex-tests-compare + string + (condition-case nil + (if (string-match pattern string) nil 'search-failed) + ('invalid-regexp 'compilation-failed)) + bounds-ref substring-ref))) + + +(defconst regex-tests-re-even-escapes + "\\(?:^\\|[^\\\\]\\)\\(?:\\\\\\\\\\)*" + "Regex that matches an even number of \\ characters") + +(defconst regex-tests-re-odd-escapes + (concat regex-tests-re-even-escapes "\\\\") + "Regex that matches an odd number of \\ characters") + + +(defun regex-tests-unextend (pattern) + "Basic conversion from extended regexen to emacs ones. This is +mostly a hack that adds \\ to () and | and {}, and removes it if +it already exists. We also change \\S (and \\s) to \\S- (and +\\s-) because extended regexen see the former as whitespace, but +emacs requires an extra symbol character" + + (with-temp-buffer + (insert pattern) + (goto-char (point-min)) + + (while (re-search-forward "[()|{}]" nil t) + ;; point is past special character. If it is escaped, unescape + ;; it + + (if (save-excursion + (re-search-backward (concat regex-tests-re-odd-escapes ".\\=") nil t)) + + ;; This special character is preceded by an odd number of \, + ;; so I unescape it by removing the last one + (progn + (forward-char -2) + (delete-char 1) + (forward-char 1)) + + ;; This special character is preceded by an even (possibly 0) + ;; number of \. I add an escape + (forward-char -1) + (insert "\\") + (forward-char 1))) + + ;; convert \s to \s- + (goto-char (point-min)) + (while (re-search-forward (concat regex-tests-re-odd-escapes "[Ss]") nil t) + (insert "-")) + + (buffer-string))) + +(defun regex-tests-BOOST-frob-escapes (s ispattern) + "Mangle \\ the way it is done in frob_escapes() in +regex-tests-BOOST.c in glibc: \\t, \\n, \\r are interpreted; +\\\\, \\^, \{, \\|, \} are unescaped for the string (not +pattern)" + + ;; this is all similar to (regex-tests-unextend) + (with-temp-buffer + (insert s) + + (let ((interpret-list (list "t" "n" "r"))) + (while interpret-list + (goto-char (point-min)) + (while (re-search-forward + (concat "\\(" regex-tests-re-even-escapes "\\)" + "\\\\" (car interpret-list)) + nil t) + (replace-match (concat "\\1" (car (read-from-string + (concat "\"\\" (car interpret-list) "\"")))))) + + (setq interpret-list (cdr interpret-list)))) + + (when (not ispattern) + ;; unescape \\, \^, \{, \|, \} + (let ((unescape-list (list "\\\\" "^" "{" "|" "}"))) + (while unescape-list + (goto-char (point-min)) + (while (re-search-forward + (concat "\\(" regex-tests-re-even-escapes "\\)" + "\\\\" (car unescape-list)) + nil t) + (replace-match (concat "\\1" (car unescape-list)))) + + (setq unescape-list (cdr unescape-list)))) + ) + (buffer-string))) + + + + +(defconst regex-tests-BOOST-whitelist + [ + ;; emacs is more stringent with regexen involving unbalanced ) + 63 65 69 + + ;; in emacs, regex . doesn't match \n + 91 + + ;; emacs is more forgiving with * and ? that don't apply to + ;; characters + 107 108 109 122 123 124 140 141 142 + + ;; emacs accepts regexen with {} + 161 + + ;; emacs doesn't fail on bogus ranges such as [3-1] or [1-3-5] + 222 223 + + ;; emacs doesn't match (ab*)[ab]*\1 greedily: only 4 chars of + ;; ababaaa match + 284 294 + + ;; ambiguous groupings are ambiguous + 443 444 445 446 448 449 450 + + ;; emacs doesn't know how to handle weird ranges such as [a-Z] and + ;; [[:alpha:]-a] + 539 580 581 + + ;; emacs matches non-greedy regex ab.*? non-greedily + 639 677 712 + ] + "Line numbers in the boost test that should be skipped. These +are false-positive test failures that represent known/benign +differences in behavior.") + +;; - Format +;; - Comments are lines starting with ; +;; - Lines starting with - set options passed to regcomp() and regexec(): +;; - if no "REG_BASIC" is found, with have an extended regex +;; - These set a flag: +;; - REG_ICASE +;; - REG_NEWLINE +;; - REG_NOTBOL +;; - REG_NOTEOL +;; +;; - Test lines are +;; pattern string start0 end0 start1 end1 ... +;; +;; - pattern, string can have escapes +;; - string can have whitespace if enclosed in "" +;; - if string is "!", then the pattern is supposed to fail compilation +;; - start/end are of group0, group1, etc. group 0 is the full match +;; - start<0 indicates "no match" +;; - start is the 0-based index of the first character +;; - end is the 0-based index of the first character past the group +(defun regex-tests-BOOST () + (let (failures + basic icase newline notbol noteol) + (regex-tests-generic-line + ?; "regex-resources/BOOST.tests" regex-tests-BOOST-whitelist + (if (save-excursion (re-search-forward "^-" nil t)) + (setq basic (save-excursion (re-search-forward "REG_BASIC" nil t)) + icase (save-excursion (re-search-forward "REG_ICASE" nil t)) + newline (save-excursion (re-search-forward "REG_NEWLINE" nil t)) + notbol (save-excursion (re-search-forward "REG_NOTBOL" nil t)) + noteol (save-excursion (re-search-forward "REG_NOTEOL" nil t))) + + (save-excursion + (or (re-search-forward "\\(\\S-+\\)\\s-+\"\\(.*\\)\"\\s-+?\\(.+\\)" nil t) + (re-search-forward "\\(\\S-+\\)\\s-+\\(\\S-+\\)\\s-+?\\(.+\\)" nil t) + (re-search-forward "\\(\\S-+\\)\\s-+\\(!\\)" nil t))) + + (let* ((pattern-raw (match-string 1)) + (string-raw (match-string 2)) + (positions-raw (match-string 3)) + (pattern (regex-tests-BOOST-frob-escapes pattern-raw t)) + (string (regex-tests-BOOST-frob-escapes string-raw nil)) + (positions + (if (string= string "!") + (list 'compilation-failed 0) + (mapcar + (lambda (x) + (let ((x (string-to-number x))) + (if (< x 0) nil x))) + (split-string positions-raw))))) + + (when (null (car positions)) + (setcar positions 'search-failed)) + + (when (not basic) + (setq pattern (regex-tests-unextend pattern))) + + ;; great. I now have all the data parsed. Let's use it to do + ;; stuff + (let* ((case-fold-search icase) + (msg (regex-tests-match pattern string positions))) + + (if (and + ;; Skipping test: notbol/noteol not supported + (not notbol) (not noteol) + + msg) + + ;; store failure + (setq failures + (cons (format "line number %d: Regex '%s': %s" + line-number pattern msg) + failures))))))) + + failures)) + +(defconst regex-tests-PCRE-whitelist + [ + ;; ambiguous groupings are ambiguous + 610 611 1154 1157 1160 1168 1171 1176 1179 1182 1185 1188 1193 1196 1203 + ] + "Line numbers in the PCRE test that should be skipped. These +are false-positive test failures that represent known/benign +differences in behavior.") + +;; - Format +;; +;; regex +;; input_string +;; group_num: group_match | "No match" +;; input_string +;; group_num: group_match | "No match" +;; input_string +;; group_num: group_match | "No match" +;; input_string +;; group_num: group_match | "No match" +;; ... +(defun regex-tests-PCRE () + (let (failures + pattern icase string what-failed matches-observed) + (regex-tests-generic-line + ?# "regex-resources/PCRE.tests" regex-tests-PCRE-whitelist + + (cond + + ;; pattern + ((save-excursion (re-search-forward "^/\\(.*\\)/\\(.*i?\\)$" nil t)) + (setq icase (string= "i" (match-string 2)) + pattern (regex-tests-unextend (match-string 1)))) + + ;; string. read it in, match against pattern, and save all the results + ((save-excursion (re-search-forward "^ \\(.*\\)" nil t)) + (let ((case-fold-search icase)) + (setq string (match-string 1) + + ;; the regex match under test + what-failed + (condition-case nil + (if (string-match pattern string) nil 'search-failed) + ('invalid-regexp 'compilation-failed)) + + matches-observed + (loop for x from 0 to 20 + collect (and (not what-failed) + (or (match-string x string) ""))))) + nil) + + ;; verification line: failed match + ((save-excursion (re-search-forward "^No match" nil t)) + (unless what-failed + (setq failures + (cons (format "line number %d: Regex '%s': Expected no match; but match" + line-number pattern) + failures)))) + + ;; verification line: succeeded match + ((save-excursion (re-search-forward "^ *\\([0-9]+\\): \\(.*\\)" nil t)) + (let* ((match-ref (match-string 2)) + (idx (string-to-number (match-string 1)))) + + (if what-failed + "Expected match; but no match" + (unless (string= match-ref (elt matches-observed idx)) + (setq failures + (cons (format "line number %d: Regex '%s': Have expected match, but group %d is wrong: '%s'/'%s'" + line-number pattern + idx match-ref (elt matches-observed idx)) + failures)))))) + + ;; reset + (t (setq pattern nil) nil))) + + failures)) + +(defconst regex-tests-PTESTS-whitelist + [ + ;; emacs doesn't barf on weird ranges such as [b-a], but simply + ;; fails to match + 138 + + ;; emacs doesn't see DEL (0x78) as a [:cntrl:] character + 168 + ] + "Line numbers in the PTESTS test that should be skipped. These +are false-positive test failures that represent known/benign +differences in behavior.") + +;; - Format +;; - fields separated by ¦ (note: this is not a |) +;; - start¦end¦pattern¦string +;; - start is the 1-based index of the first character +;; - end is the 1-based index of the last character +(defun regex-tests-PTESTS () + (let (failures) + (regex-tests-generic-line + ?# "regex-resources/PTESTS" regex-tests-PTESTS-whitelist + (let* ((fields (split-string (buffer-string) "¦")) + + ;; string has 1-based index of first char in the + ;; match. -1 means "no match". -2 means "invalid + ;; regex". + ;; + ;; start-ref is 0-based index of first char in the + ;; match + ;; + ;; string==0 is a special case, and I have to treat + ;; it as start-ref = 0 + (start-ref (let ((raw (string-to-number (elt fields 0)))) + (cond + ((= raw -2) 'compilation-failed) + ((= raw -1) 'search-failed) + ((= raw 0) 0) + (t (1- raw))))) + + ;; string has 1-based index of last char in the + ;; match. end-ref is 0-based index of first char past + ;; the match + (end-ref (string-to-number (elt fields 1))) + (pattern (elt fields 2)) + (string (elt fields 3))) + + (let ((msg (regex-tests-match pattern string (list start-ref end-ref)))) + (when msg + (setq failures + (cons (format "line number %d: Regex '%s': %s" + line-number pattern msg) + failures)))))) + failures)) + +(defconst regex-tests-TESTS-whitelist + [ + ;; emacs doesn't barf on weird ranges such as [b-a], but simply + ;; fails to match + 42 + + ;; emacs is more forgiving with * and ? that don't apply to + ;; characters + 57 58 59 60 + + ;; emacs is more stringent with regexen involving unbalanced ) + 67 + ] + "Line numbers in the TESTS test that should be skipped. These +are false-positive test failures that represent known/benign +differences in behavior.") + +;; - Format +;; - fields separated by :. Watch for [\[:xxx:]] +;; - expected:pattern:string +;; +;; expected: +;; | 0 | successful match | +;; | 1 | failed match | +;; | 2 | regcomp() should fail | +(defun regex-tests-TESTS () + (let (failures) + (regex-tests-generic-line + ?# "regex-resources/TESTS" regex-tests-TESTS-whitelist + (if (save-excursion (re-search-forward "^\\([^:]+\\):\\(.*\\):\\([^:]*\\)$" nil t)) + (let* ((what-failed + (let ((raw (string-to-number (match-string 1)))) + (cond + ((= raw 2) 'compilation-failed) + ((= raw 1) 'search-failed) + (t t)))) + (string (match-string 3)) + (pattern (regex-tests-unextend (match-string 2)))) + + (let ((msg (regex-tests-match pattern string nil (list what-failed)))) + (when msg + (setq failures + (cons (format "line number %d: Regex '%s': %s" + line-number pattern msg) + failures))))) + + (error "Error parsing TESTS file line: '%s'" (buffer-string)))) + failures)) + +(ert-deftest regex-tests () + "Tests of the email regular expression engine. This evaluates +the BOOST, PCRE, PTESTS and TESTS test cases from glibc." + (should-not (regex-tests-BOOST)) + (should-not (regex-tests-PCRE)) + (should-not (regex-tests-PTESTS)) + (should-not (regex-tests-TESTS))) -- 2.1.4