* RFC: Parsing a Regexp to a `rx' for Introspection
@ 2009-10-26 14:34 Nordlöw
0 siblings, 0 replies; only message in thread
From: Nordlöw @ 2009-10-26 14:34 UTC (permalink / raw)
To: help-gnu-emacs
This is a preliminary code for parsing a regexp string to an rx
expression.
Tested on Emacs 23.1.
I have chosen a character-serial approach to parsing the string
inserted into a buffer, that is I have used char-after() plus forward-
char() rather than the use of looking-at(). This makes it easier to
create high-performing ports into other languages such as C.
TODO: alternatives and syntax specific charcters.
I have a question regarding evaluation and rx().
We could very conveniently test this package by automatically doing a
round-trip of regexp-to-rx-to-regex in a single evaluation and check
that we return the string we put in.
This is works
(rx-to-string (regexp-parse-string "a") t)
but
(rx-to-string (regexp-parse-string "a+") t)
gives error:
Symbol's function definition is void: x
What on earth have I missed?
Thanks in advance,
Nordlöw
;;; regexp-utils.el --- Extensions and add-ons to package `rx'.
;;
;; Filename: regexp-utils.el
;; Description:
;; Author: Per Nordlöw
;; Maintainer:
;; Created: tor okt 22 12:02:46 2009 (+0200)
;; Version:
;; Last-Updated:
;; By:
;; Update #: 428
;; URL:
;; Keywords:
;; Compatibility:
;;
;; Features that might be required by this library:
;;
;; `rx'.
;;
;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;
;;; Commentary:
;;
;;
;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;
;;; Change log:
;;
;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;
;; This program is free software; you can redistribute it and/or
;; modify it under the terms of the GNU General Public License as
;; published by the Free Software Foundation; either version 3, or
;; (at your option) any later version.
;;
;; This program is distributed in the hope that it will be useful,
;; but WITHOUT ANY WARRANTY; without even the implied warranty of
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;; General Public License for more details.
;;
;; You should have received a copy of the GNU General Public License
;; along with this program; see the file COPYING. If not, write to
;; the Free Software Foundation, Inc., 51 Franklin Street, Fifth
;; Floor, Boston, MA 02110-1301, USA.
;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;
;;; Code:
(require 'rx)
;; Use:
;; - `char-before', `char-after'
;; - `looking-at', `looking-back'
;; - `search-forward', `search-backward'
;; - `re-search-forward', `re-search-backward'
;; - `skip-chars-forward',`skip-chars-backward'
;; - `bolp', `eolp'
;; - `string-equal'
;; - `defun'
;; - `make-string', `string', `char-to-string'
;; - `make-symbol'
(defun regexp-parse-char-alt ()
"Parse Emacs-Style Regular Expression Character Alternative to
`rx' representation."
(let (tree)
(while (not (eq (char-after) ?\]))
(let ((c0 (char-after)))
(cond ((eobp)
(error "Incomplete Character Alternative!")
)
((eq c0 ?-)
(forward-char)
(push (if tree
(let ((c1 (char-after)))
(if (or (eq c1 ?\]) ;`c0' is last char
(consp (car tree))) ;if previous is
a completed range
c0 ;as is
(forward-char)
(cons (pop tree) c1))) ;ranges are given
as (cons LOW HIGH)
c0) ;if first character
tree) ;push ?- as is
)
((looking-at "\\[:\\([a-zA-Z-_]+\\):]") ;for example
[:alpha:]
(goto-char (match-end 1)) ;skip whole class
specification
(push (make-symbol (match-string-no-properties 1))
tree) ;push class as is
)
(t
(push c0 tree) (forward-char) ;push character as is
)
)))
(unless (eobp) (forward-char))
;;(uniquify-list-members tree) ;uniquify list
(setq tree (nreverse tree))
(when tree (push 'any tree))
tree))
(defun rx-simplify (tree)
"Simplify the `rx'-expression tree TREE. See `rx` for details."
regexp)
(defconst regexp-special-chars-list
'(?$ ?^ ?. ?* ?+ ?? ?\[ ?\\)
"List of characters that have a special meaning in Emacs
Regular Expression.")
(defun regexp-parse-string (regexp &optional format)
"Parse the regular expression REGEXP into a lisp expression.
Expression syntax is given by FORMAT, defaulting to `rx'. See
the package `sregex' and `rx' for details returned
structure. Note: This function is written as a state machine with
code clarity in mind, so easily can transform it optimcal. Each
state transition is typically triggered by character reads."
(let (tree ;expression tree (stack)
groups ;group arguments counts into `tree'
)
(with-temp-buffer
(insert regexp)
(goto-char (point-min))
(display-buffer (current-buffer))
(while (not (eobp))
(let ((c0 (char-after)))
(cond ((memq c0 '(?? ?* ?+)) ;special regexp operator
(let ((op (char-to-string c0))) ;operator string
(forward-char) ;skip char
(when (eq (char-after) ??) ;if greedy operator
(setq op (concat op (char-to-string ??))) ;append
to operator
(forward-char)) ;skip greedy operator
(push (if tree ;if postfix operator has
argument
(list (make-symbol op) (pop tree)) ;use
it as operator
c0) ;otherwise use as a plain
characterx
tree))
)
((eq c0 ?.) ;any single character except a
newline
(push 'nonl tree) (forward-char)
)
((eq c0 ?^) ;beginning of line
(push 'bol tree) (forward-char)
)
((eq c0 ?$) ;end of line
(push 'eol tree) (forward-char)
)
((eq c0 ?\[) ;opening hook: start
alternative
(forward-char)
(let ((alt-tree (regexp-parse-char-alt)))
(when alt-tree (push alt-tree tree)))
)
((eq c0 ?\\) ;backquoting
(forward-char)
(let ((c1 (char-after)))
(cond ((memq c1 regexp-special-chars-list)
(push c1 tree) (forward-char) ;as is
)
((eq c1 ?_) ;backqoute underscore
(forward-char)
(let ((c2 (char-after)))
(cond ((eq c2 ?<)
(push 'symbol-start tree) (forward-
char)
)
((eq c2 ?>)
(push 'symbol-end tree) (forward-
char)
)
(t
(push (string c0 c1)
tree) ;backqouted character as is
)))
)
((memq c1 '(0 1 2 3 4 5 6 7 8 9)) ;same text
that matched the digitth occurrence of a grouping (‘\( ... \)’)
construct.
(push `(backref (- c1 ?0)) tree) (forward-
char)
)
((eq c1 ?\() ;beginning of group
(forward-char)
(push 0 groups)
(if (looking-at "?:")
(progn (forward-char 2)
(push 'shy-start tree))
(push 'group-start tree))
)
((eq c1 ?\)) ;end of word
(forward-char)
(if groups (pop groups) (error "Unbalanced
Group End!"))
(let (group-tree)
;; push all arguments until we find `group-
start'
(while (and tree ;args left
(not (memq (car tree) '(shy-
start group-start))))
(push (pop tree) group-tree))
(if tree
(let ((g-sym (pop tree)))
(cond ((eq 'group-start g-sym)
(push `(group ,@group-tree)
tree))
((eq 'shy-start g-sym)
(push `(: ,@group-tree)
tree))
(t
(error "Unbalanced Group
End!"))
))
(error "Unbalanced Group End!"))
)
)
((eq c1 ?<) ;beginning of word
(push 'bow tree) (forward-char)
)
((eq c1 ?>) ;end of word
(push 'eow tree) (forward-char)
)
((eq c1 ?`) ;beginning of buffer/string/
text
(push 'bot tree) (forward-char)
)
((eq c1 ?') ;end of buffer/string/text
(push 'eot tree) (forward-char)
)
((eq c1 ?w) ;any word-constituent character.
The editor syntax table determines which characters these are. See
Syntax Tables.
(push 'wordchar tree) (forward-char)
)
((eq c1 ?W) ;any character that is not a
word constituent.
(push 'not-wordchar tree) (forward-char)
)
(t ;any other backquoted
character
(push (string c0 c1) tree) (forward-
char) ;backqouted character as is
)
)
)
)
(c0 ;if we have character
(when nil
(if (and (looking-at (concat regexp-ordinary-char-
regexp "+")) ;one or more number of ordinary chars
(> (match-length 0) 0))
(progn
;; TODO: if next char c1 fullfils (memq c1
'(?? ?* ?+)) push all but last to string and push last on stack
(goto-char (match-end 0)) ;goto end of string
(push (match-string-no-properties 0)
tree) ;list as is
)
(error "Unhandled regexp special character %s!"
c0)))
(forward-char) (push c0 tree) ;regexp as is
)
))))
(setq tree (nreverse tree))
(push ': tree)
tree))
(defalias 're-parse 'regexp-parse-string)
(defalias 'make-rx 'regexp-parse-string)
(when nil
(regexp-parse-string (concat "\\(ab\\)"))
(regexp-parse-string (concat "\\(?:ab?\\)"))
(let ((str (concat "\\([]\\)"
"\\([-a]\\)"
"\\([a-]\\)"
"\\([a-zA-Z-]\\)"
"\\`" "^" "\\_<" "\\<" "\\w" "\\W" "a\\*" "a"
"\b" "\0" "\\>" "\\_>" "aa*bb*" "$" "\\'")))
(regexp-parse-string str))
)
;; Use: (eval `(rx-to-string ',(regexp-parse-string "ab+") t))
;; (rx-to-string '(: "f" "g"))
(when nil
(let* ((re "aa.*bb.?cc.+dd")
(rt (regexp-parse-string re)))
;;(eval `(rx ,re)))
(equal re
(when rt (eval `(rx ,rt))))))
(provide 'regexp-utils)
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; rx-utils.el ends here
^ permalink raw reply [flat|nested] only message in thread
only message in thread, other threads:[~2009-10-26 14:34 UTC | newest]
Thread overview: (only message) (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2009-10-26 14:34 RFC: Parsing a Regexp to a `rx' for Introspection Nordlöw
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).