all messages for Emacs-related lists mirrored at yhetil.org
 help / color / mirror / code / Atom feed
blob aee26db381727e75dfe4e45ae008ca7ff7f66b72 12840 bytes (raw)
name: lisp/emacs-lisp/regexp-disasm.el 	 # note: path name is non-authoritative(*)

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
 
;;; regexp-disasm.el --- disassemble regexp bytecode  -*- lexical-binding: t -*-

;; Copyright (C) 2020 Free Software Foundation, Inc.

;; This file is part of GNU Emacs.

;; GNU Emacs is free software: you can redistribute it and/or modify
;; it under the terms of the GNU General Public License as published by
;; the Free Software Foundation, either version 3 of the License, or
;; (at your option) any later version.

;; GNU Emacs is distributed in the hope that it will be useful,
;; but WITHOUT ANY WARRANTY; without even the implied warranty of
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;; GNU General Public License for more details.

;; You should have received a copy of the GNU General Public License
;; along with GNU Emacs.  If not, see <https://www.gnu.org/licenses/>.

;;; Commentary:

;; Decode compiled Emacs regexp bytecode and pretty-print.

(defconst regexp-disasm--classes
  [word lower punct space
   upper multibyte alpha alnum
   graph print blank]
  "Character class names, indexed by bit number in the class flags.
Element N is the class denoted by bit N of the class flags word of a
charset instruction.  The order corresponds to the BIT_* definitions
in regex-emacs.c.")

(defconst regexp-disasm--syntax-codes
  [whitespace
   punctuation
   word
   symbol
   open-parenthesis
   close-parenthesis
   expression-prefix
   string-quote
   paired-delimiter
   escape
   character-quote
   comment-start
   comment-end
   inherit
   comment-delimiter
   string-delimiter]
  "Syntax class names, indexed by syntax code.
The order corresponds to enum syntaxcode in syntax.h, but the names
are those used by `rx'.")

;;;###autoload
(defun regexp-disasm (regexp)
  "Return the compiled bytecode of REGEXP as a list of instructions.
Every element has the form (ADDRESS . INSTR), where ADDRESS is the
byte offset of the instruction within the bytecode and INSTR is an
S-expression describing it: a bare symbol for an instruction without
operands, otherwise a list (NAME OPERAND...).
Signal an error if an unknown opcode is encountered."
  (let* ((code (regexp-bytecode regexp))
         (multibyte (multibyte-string-p regexp))
         (limit (length code))
         ;; Instruction names, indexed by opcode.  Opcode 3 is called
         ;; `anychar' in the C source, which is a misnomer; it is
         ;; reported as `nonl' here.
         (names [no-op succeed exactn nonl
                 charset notcharset
                 start-memory stop-memory duplicate
                 begline endline begbuf endbuf
                 jump on-failure-jump on-failure-keep-string-jump
                 on-failure-jump-loop on-failure-jump-nastyloop
                 on-failure-jump-smart
                 succeed-n jump-n set-number-at
                 wordbeg wordend wordbound notwordbound
                 symbeg symend
                 syntaxspec notsyntaxspec
                 at-dot
                 categoryspec notcategoryspec])
         ;; Operand readers.  Multi-byte operands are little-endian.
         (u16-at (lambda (pos)
                   (+ (aref code pos)
                      (* 256 (aref code (1+ pos))))))
         (u24-at (lambda (pos)
                   (+ (aref code pos)
                      (* 256 (aref code (+ pos 1)))
                      (* 65536 (aref code (+ pos 2))))))
         ;; Destination of the jump-like instruction at POS: a signed
         ;; 16-bit displacement counted from the end of the 3-byte
         ;; opcode+displacement prefix.
         (target-at (lambda (pos)
                      (let ((disp (funcall u16-at (1+ pos))))
                        (when (>= disp #x8000)
                          (setq disp (- disp #x10000)))
                        (+ disp pos 3))))
         (pc 0)
         (result nil))
    (while (< pc limit)
      (let* ((opcode (aref code pc))
             (name (and (< opcode (length names))
                        (aref names opcode)))
             instr size)
        (cond
         ;; Instructions without operands.
         ((memql opcode '(0 1 3 9 10 11 12 22 23 24 25 26 27 30))
          (setq instr name
                size 1))

         ;; Instructions with a single raw byte operand (group number
         ;; or category character).
         ((memql opcode '(6 7 8 31 32))
          (setq instr (list name (aref code (1+ pc)))
                size 2))

         ;; `syntaxspec', `notsyntaxspec': one byte syntax code,
         ;; reported by name.
         ((memql opcode '(28 29))
          (setq instr (list name
                            (aref regexp-disasm--syntax-codes
                                  (aref code (1+ pc))))
                size 2))

         ;; Plain and on-failure jumps: 16-bit relative destination.
         ((<= 13 opcode 18)
          (setq instr (list name (funcall target-at pc))
                size 3))

         ;; `succeed-n', `jump-n', `set-number-at': relative address
         ;; followed by an unsigned 16-bit number.
         ((<= 19 opcode 21)
          (setq instr (list name
                            (funcall target-at pc)
                            (funcall u16-at (+ pc 3)))
                size 5))

         ;; `exactn': count byte followed by that many literal bytes.
         ((= opcode 2)
          (let* ((count (aref code (1+ pc)))
                 (start (+ pc 2))
                 (bytes (substring code start (+ start count))))
            (setq instr (list name
                              (if multibyte
                                  (decode-coding-string bytes 'utf-8-emacs)
                                bytes))
                  size (+ 2 count))))

         ;; `charset', `notcharset': length byte, bitmap, and an
         ;; optional trailer holding class flags and a range table.
         ((memql opcode '(4 5))
          (let* ((len-byte (aref code (1+ pc)))
                 (bitmap-bytes (logand len-byte #x7f))
                 ;; The top bit of the length byte flags the trailer.
                 (range-table-p (/= 0 (logand len-byte #x80)))
                 ;; Address of the first byte after the bitmap.
                 (trailer (+ pc 2 bitmap-bytes))
                 (nranges (if range-table-p
                              (funcall u16-at (+ trailer 2))
                            0))
                 (nbits (* 8 bitmap-bytes))
                 (bit 0)
                 (run-start nil)
                 (runs nil)
                 (classes nil)
                 (ranges nil))

            ;; Collapse consecutive set bits of the bitmap into
            ;; (FIRST . LAST) runs.
            (while (< bit nbits)
              (let ((set (/= 0 (logand (aref code (+ pc 2 (/ bit 8)))
                                       (ash 1 (% bit 8))))))
                (cond ((and set (not run-start))
                       (setq run-start bit))
                      ((and run-start (not set))
                       (push (cons run-start (1- bit)) runs)
                       (setq run-start nil))))
              (setq bit (1+ bit)))
            (when run-start
              (push (cons run-start (1- nbits)) runs))

            (when range-table-p
              ;; Class flags: walk the bits from high to low so that
              ;; pushing leaves the names in ascending bit order.
              (let ((flags (funcall u16-at trailer))
                    (k (length regexp-disasm--classes)))
                (while (> k 0)
                  (setq k (1- k))
                  (unless (zerop (logand flags (ash 1 k)))
                    (push (aref regexp-disasm--classes k) classes))))

              ;; Range table: NRANGES pairs of 24-bit characters.
              (dotimes (k nranges)
                (let ((base (+ trailer 4 (* 6 k))))
                  (push (cons (funcall u24-at base)
                              (funcall u24-at (+ base 3)))
                        ranges)))
              (setq ranges (nreverse ranges)))

            (setq instr (list name (nreverse runs) classes ranges)
                  size (+ 2 bitmap-bytes
                          (if range-table-p 4 0)
                          (* 6 nranges)))))

         (t (error "bad opcode at ofs %d: 0x%02x" pc opcode)))
        (push (cons pc instr) result)
        (setq pc (+ pc size))))
    (nreverse result)))

;;;###autoload
(defun regexp-disassemble (regexp)
  "Compile REGEXP and print the disassembled bytecode.
The listing is written to the buffer \"*Regexp-disassemble*\", one
instruction per line, each preceded by its byte offset.
Interactively, read a Lisp expression, evaluate it, and use the
result as REGEXP."
  (interactive "XRegexp (evaluated): ")
  (let* ((instructions (regexp-disasm regexp))
         ;; Control characters that have a mnemonic backslash escape.
         (control-chars '((?\b . ?b)
                          (?\t . ?t)
                          (?\n . ?n)
                          (?\v . ?v)
                          (?\f . ?f)
                          (?\r . ?r)
                          (?\e . ?e)))
         ;; Render one byte value taken from a charset bitmap.
         (quote-byte (lambda (c)
                       (let ((esc (assq c control-chars)))
                         (cond (esc (string ?\\ (cdr esc)))
                               ((or (<= c 31) (<= #x7f c #xff))
                                (format "\\%03o" c))
                               (t (string c))))))
         ;; Render one element C of a string literal.  UNIBYTE is
         ;; non-nil when C comes from a unibyte string.
         (quote-string-char
          (lambda (c unibyte)
            (let ((esc (assq c control-chars)))
              (cond (esc (string ?\\ (cdr esc)))
                    ((memq c '(?\\ ?\"))
                     (string ?\\ c))
                    ((or (<= c 31) (= c 127)
                         ;; In a unibyte string, 128..255 are raw
                         ;; bytes, not Latin-1 characters: print them
                         ;; as octal escapes so they cannot be
                         ;; mistaken for U+0080..U+00FF.
                         (and unibyte (>= c #x80))
                         ;; Raw bytes in a multibyte string.
                         (>= c #x3fff80))
                     (format "\\%03o" (logand c #xff)))
                    (t (string c))))))
         ;; Render the string S as a double-quoted literal.
         (quote-string
          (lambda (s)
            (let ((unibyte (not (multibyte-string-p s))))
              (concat "\""
                      (mapconcat (lambda (c)
                                   (funcall quote-string-char c unibyte))
                                 s "")
                      "\""))))
         ;; Render RANGE, a cons (FROM . TO), as "FROM-TO", or as just
         ;; "FROM" when both ends coincide; QUOTE-CHAR renders each end.
         (quote-range (lambda (range quote-char)
                        (if (eq (car range) (cdr range))
                            (funcall quote-char (car range))
                          (format "%s-%s"
                                  (funcall quote-char (car range))
                                  (funcall quote-char (cdr range))))))
         (quote-range-uni
          (lambda (range) (funcall quote-range range quote-byte)))
         (quote-range-multi
          (lambda (range) (funcall quote-range range #'string))))
    (with-output-to-temp-buffer "*Regexp-disassemble*"
      (with-current-buffer standard-output
        (insert (format "Disassembly of regexp %s\n\n"
                        (funcall quote-string regexp)))
        (dolist (instr instructions)
          (let* ((addr (car instr))
                 (op (cdr instr))
                 (line
                  (pcase op
                    ;; Instructions without operands are bare symbols.
                    ((pred symbolp) (symbol-name op))
                    (`(exactn ,s) (format "exactn %s" (funcall quote-string s)))
                    (`(,(or 'charset 'notcharset)
                       ,bitmap-pairs ,classes ,pairs)
                     ;; FIXME: Maybe use a less ambiguous charset syntax.
                     ;; Avoid ranges when endpoints are adjacent.
                     ;; What to do about metachars like `]' and `-'?
                     (concat (format "%s [%s]"
                                     (car op)
                                     (mapconcat quote-range-uni
                                                bitmap-pairs ""))
                             (and classes
                                  (concat " [:"
                                          (mapconcat
                                           #'symbol-name classes ",")
                                          ":]"))
                             (and pairs
                                  (concat " ["
                                          (mapconcat quote-range-multi pairs "")
                                          "]"))))
                    (`(,(or 'start-memory 'stop-memory 'duplicate) ,n)
                     (format "%s group %d" (car op) n))
                    (`(,(or 'jump 'on-failure-jump 'on-failure-keep-string-jump
                            'on-failure-jump-loop 'on-failure-jump-nastyloop
                            'on-failure-jump-smart)
                       ,dest)
                     (format "%s to %d" (car op) dest))
                    (`(,(or 'succeed-n 'jump-n 'set-number-at)
                       ,dest ,val)
                     (format "%s addr %d, value %d" (car op) dest val))
                    (`(,(or 'syntaxspec 'notsyntaxspec) ,syn)
                     (format "%s %s" (car op) syn))
                    (`(,(or 'categoryspec 'notcategoryspec) ,ch)
                     (format "%s '%c'" (car op) ch))
                    (_ (error "unrecognised opcode: %S" op)))))
            (insert (format "%5d  %s\n" addr line))))))))

(provide 'regexp-disasm)

;;; regexp-disasm.el ends here

debug log:

solving aee26db381 ...
found aee26db381 in https://yhetil.org/emacs/4201DF24-BCC4-4C08-9857-38207B7C10B4@acm.org/

applying [1/1] https://yhetil.org/emacs/4201DF24-BCC4-4C08-9857-38207B7C10B4@acm.org/
diff --git a/lisp/emacs-lisp/regexp-disasm.el b/lisp/emacs-lisp/regexp-disasm.el
new file mode 100644
index 0000000000..aee26db381

Checking patch lisp/emacs-lisp/regexp-disasm.el...
Applied patch lisp/emacs-lisp/regexp-disasm.el cleanly.

index at:
100644 aee26db381727e75dfe4e45ae008ca7ff7f66b72	lisp/emacs-lisp/regexp-disasm.el

(*) Git path names are given by the tree(s) the blob belongs to.
    Blobs themselves have no identifier aside from the hash of their contents.

Code repositories for project(s) associated with this external index

	https://git.savannah.gnu.org/cgit/emacs.git
	https://git.savannah.gnu.org/cgit/emacs/org-mode.git

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.