From: Ramin Honary <ramin.honary@gmail.com>
To: 74454@debbugs.gnu.org
Subject: bug#74454: Bug fix: Emacs Lisp parser
Date: Thu, 21 Nov 2024 07:34:41 +0000 [thread overview]
Message-ID: <CAFhnQTTy-bVzm-NGR4LtHFQP2nOHLcB+C6E4F-tpwAC8AKLNpg@mail.gmail.com> (raw)
[-- Attachment #1: Type: text/plain, Size: 1539 bytes --]
Hello Guile developers:
I originally sent this to the guile-devel list but someone kindly
explained that I should send it here.
I have made a small bug fix for the Emacs Lisp parser in the (language
elisp) module.
- I fixed a simple bug in the lexer that decodes the carat encoding
for control characters (for example "^@" translates to #\null),
- and I implemented octal digit decoding in the lexer.
I have tested these changes against Emacs Lisp code from the GNU Emacs
source code. I can confirm that files like "subr.el" can be fully
parsed to list data structures in Scheme now. Here is a simple test
program that parses "subr.el" from GNU Emacs:
(use-modules ((language elisp parser) #:select (read-elisp)))
(define (file-read-all-forms filepath)
(call-with-port (open-input-file filepath)
(lambda (port)
(let loop ((forms-list '()))
(let ((form (read-elisp port)))
(cond
((eof-object? form) (reverse forms-list))
(else (loop (cons form forms-list)))
))))))
(define path-to-test-el "/home/ramin/src/emacs-29.4/li
sp/subr.el")
(define (main)
(for-each
(lambda (form) (display form) (newline))
(file-read-all-forms path-to-test-el)))
Before applying my patch, the above test program fails with:
ice-9/boot-9.scm:1685:16: In procedure raise-exception:
In procedure integer->char: Argument 1 out of range: 67108955
After applying my patch, the whole file parses successfully.
Regards,
-- Ramin Honary
(ActivityPub: @ramin_hal9001@fe.disroot.org)
[-- Attachment #2: 2024-11-20_ramin-honary_improve-elisp-parser.patch --]
[-- Type: text/x-patch, Size: 11916 bytes --]
From be34c3eda87e7b2a369de6412b0a42a0a53819a3 Mon Sep 17 00:00:00 2001
From: Ramin Honary <ramin@tropic-isle.honary.home>
Date: Wed, 20 Nov 2024 13:32:11 +0900
Subject: [PATCH] change to the Emacs Lisp parser in the (language elisp) module
- I fixed a simple bug in the lexer that decodes the carat encoding
for control characters (for example "^@" translates to #\null),
- and I implemented octal digit decoding in the lexer.
I have tested these changes against Emacs Lisp code from the GNU Emacs
source code. I can confirm that files like "subr.el" can be
fully parsed to list data structures in Scheme now.
---
module/language/elisp/lexer.scm | 133 ++++++++++++++++++++++++++-----
module/language/elisp/parser.scm | 21 +++--
2 files changed, 128 insertions(+), 26 deletions(-)
diff --git a/module/language/elisp/lexer.scm b/module/language/elisp/lexer.scm
index 5a0e6b3..baa0fec 100644
--- a/module/language/elisp/lexer.scm
+++ b/module/language/elisp/lexer.scm
@@ -20,6 +20,8 @@
(define-module (language elisp lexer)
#:use-module (ice-9 regex)
+ #:use-module ((scheme base)
+ #:select (exact eof-object))
#:export (get-lexer get-lexer/1))
;;; This is the lexical analyzer for the elisp reader. It is
@@ -37,6 +39,17 @@
;;; Report an error from the lexer (that is, invalid input given).
+
+;; Re-defining some symbols here to make this code more R7RS compliant
+;; and easier to port to other Scheme implementations. (ice-9 regex) are
+;; the only non-standard feature used here now.
+
+(define make-symbol string->symbol)
+ ;; ^ not sure if these do the exact same thing, so renaming the
+ ;; "make-symbol" procedure used in this module so it can be easily
+ ;; restored later by commenting out this definition.
+
+
(define (lexer-error port msg . args)
(apply error msg args))
@@ -68,14 +81,14 @@
;;; handled as such, and in elisp C-? is the delete character for
;;; historical reasons. Otherwise, we set bit 26.
+(define |int?| (char->integer #\?))
+(define |int@| (char->integer #\@))
+
(define (add-control chr)
- (let ((real (real-character chr)))
- (if (char-alphabetic? real)
- (- (char->integer (char-upcase real)) (char->integer #\@))
- (case real
- ((#\?) 127)
- ((#\@) 0)
- (else (set-char-bit chr 26))))))
+ (cond
+ ((= chr |int?|) 127)
+ ((>= chr |int@|)
+ (- (char->integer (char-upcase (integer->char chr))) |int@|))))
;;; Parse a charcode given in some base, basically octal or hexadecimal
;;; are needed. A requested number of digits can be given (#f means it
@@ -110,7 +123,7 @@
"invalid digit in escape-code"
base
cur))
- (iterate (+ (* result base) value) (1+ procdigs)))))))
+ (iterate (+ (* result base) value) (+ 1 procdigs)))))))
;;; Read a character and process escape-sequences when necessary. The
;;; special in-string argument defines if this character is part of a
@@ -197,6 +210,67 @@
(define no-escape-punctuation (string->char-set "-+=*/_~!@$%^&:<>{}?."))
+(define (signdigit? c)
+ (or (char=? c #\+) (char=? c #\-)))
+
+(define (octdigit? c)
+ ;; assumes you already checked that c is greater/equal to #\0.
+ (char<=? c #\7))
+
+(define (hexdigit? c)
+ ;; assumes you already checked that c is not an octdigit
+ (or (char<=? c #\9)
+ (and (char>=? c #\A) (char<=? c #\F))
+ (and (char>=? c #\a) (char<=? c #\f))))
+
+(define (get-number-other-base init-char port)
+ (let*((sign (read-char port))
+ (init-stack (list sign init-char #\#))
+ (bad-token
+ (lambda (msg chars)
+ (error (if msg msg "invalid integer literal")
+ (list->string (reverse chars))
+ #:line (port-line port)
+ #:column (port-column port))))
+ (finish
+ (lambda (stack has-hexdigit)
+ (cond
+ ((and (char=? init-char #\o) has-hexdigit)
+ (bad-token "invalid octal integer literal" stack))
+ (else
+ (let*((str (list->string (reverse stack)))
+ (int (string->number str)))
+ (cond
+ (int (cons 'integer int))
+ (else (error "invalid integer literal" str)))))
+ )))
+ )
+ (define (iterate stack has-hexdigit)
+ (let ((c (read-char port)))
+ (cond
+ ((eof-object? c) (finish stack has-hexdigit))
+ (else
+ (cond
+ ((char>=? c #\0)
+ (cond
+ ((octdigit? c) (iterate (cons c stack) has-hexdigit))
+ ((hexdigit? c) (iterate (cons c stack) #t))
+ ((char-alphabetic? c) (bad-token #f (reverse stack)))
+ (else (unread-char c port) (finish stack has-hexdigit))))
+ (else (unread-char c port) (finish stack has-hexdigit))))
+ )))
+ (cond
+ ((eof-object? sign) (bad-token #f init-stack))
+ ((signdigit? sign) (iterate init-stack #f))
+ ((char>=? sign #\0)
+ (cond
+ ((eof-object? sign) (bad-token #f init-stack))
+ ((octdigit? sign) (iterate init-stack #f))
+ ((hexdigit? sign) (iterate init-stack #t))
+ (else (bad-token #f init-stack))))
+ (else (bad-token #f init-stack)))
+ ))
+
(define (get-symbol-or-number port)
(let iterate ((result-chars '())
(had-escape #f))
@@ -264,8 +338,8 @@
(let* ((return (let ((file (if (file-port? port)
(port-filename port)
#f))
- (line (1+ (port-line port)))
- (column (1+ (port-column port))))
+ (line (+ 1 (port-line port)))
+ (column (+ 1 (port-column port))))
(lambda (token value)
(let ((obj (cons token value)))
(set-source-property! obj 'filename file)
@@ -325,9 +399,21 @@
(else
(unread-char escaped port)
(unread-char cur port)
- (iterate
- (cons (integer->char (get-character port #t))
- result-chars))))))
+ (let ((c (get-character port #t)))
+ (cond
+ ((> c #x10FFFF)
+ (error "bad character"
+ #:char c
+ #:cur cur
+ #:escaped escaped
+ #:file (port-filename port)
+ #:line (port-line port)
+ #:column (port-column port)
+ #:after (list->string (reverse result-chars))))
+ (else
+ (iterate
+ (cons (integer->char c)
+ result-chars)))))))))
(else (iterate (cons cur result-chars)))))))
((#\#)
(let ((c (read-char port)))
@@ -342,7 +428,18 @@
(call-with-values
(lambda () (get-symbol-or-number port))
(lambda (type str)
- (return 'symbol (make-symbol str))))))))
+ (return 'symbol (make-symbol str)))))
+ ((#\o #\O) ;;octal literal
+ (let ((token (get-number-other-base #\o port)))
+ (return (car token) (cdr token))))
+ ((#\x #\X) ;;hexadecimal literal
+ (let ((token (get-number-other-base #\x port)))
+ (return (car token) (cdr token))))
+ (else
+ (let ((line (port-line port))
+ (col (port-column port)))
+ (error "unknown token" c #:line line #:column col)))
+ )))
;; Parentheses and other special-meaning single characters.
((#\() (return 'paren-open #f))
((#\)) (return 'paren-close #f))
@@ -386,7 +483,7 @@
;; integer!
(return
'integer
- (let ((num (inexact->exact (string->number str))))
+ (let ((num (exact (string->number str))))
(if (not (integer? num))
(error "expected integer" str num))
num)))
@@ -415,14 +512,14 @@
(paren-level 0))
(lambda ()
(if finished
- (cons 'eof ((@ (ice-9 binary-ports) eof-object)))
+ (cons 'eof eof-object)
(let ((next (lex))
(quotation #f))
(case (car next)
((paren-open square-open)
- (set! paren-level (1+ paren-level)))
+ (set! paren-level (+ 1 paren-level)))
((paren-close square-close)
- (set! paren-level (1- paren-level)))
+ (set! paren-level (- paren-level 1)))
((quote backquote unquote unquote-splicing circular-def)
(set! quotation #t)))
(if (and (not quotation) (<= paren-level 0))
diff --git a/module/language/elisp/parser.scm b/module/language/elisp/parser.scm
index a7aeff0..4e4f770 100644
--- a/module/language/elisp/parser.scm
+++ b/module/language/elisp/parser.scm
@@ -20,8 +20,11 @@
(define-module (language elisp parser)
#:use-module (language elisp lexer)
+ #:use-module ((srfi srfi-69)
+ #:select (hash-table-ref hash-table-set!))
#:export (read-elisp))
+
;;; The parser (reader) for elisp expressions.
;;;
;;; It is hand-written (just as the lexer is) instead of using some
@@ -47,7 +50,7 @@
;;; one call to read-elisp (but not only the currently parsed
;;; expression!).
-(define circular-definitions (make-fluid))
+(define circular-definitions (make-parameter #f))
(define (make-circular-definitions)
(make-hash-table))
@@ -56,7 +59,7 @@
(if (not (eq? (car token) 'circular-ref))
(error "invalid token for circular-ref" token))
(let* ((id (cdr token))
- (value (hashq-ref (fluid-ref circular-definitions) id)))
+ (value (hash-table-ref (circular-definitions) id)))
(if value
value
(parse-error token "undefined circular reference" id))))
@@ -69,12 +72,12 @@
(if (not (eq? (car token) 'circular-def))
(error "invalid token for circular-define!" token))
(let ((value #f)
- (table (fluid-ref circular-definitions))
+ (table (circular-definitions))
(id (cdr token)))
- (hashq-set! table id (delay value))
+ (hash-table-set! table id (delay value))
(lambda (real-value)
(set! value real-value)
- (hashq-set! table id real-value))))
+ (hash-table-set! table id real-value))))
;;; Work through a parsed data structure and force the promises there.
;;; After a promise is forced, the resulting value must not be recursed
@@ -101,7 +104,7 @@
(if (promise? el)
(vector-set! data i (force el))
(force-promises! el))
- (iterate (1+ i)))))))
+ (iterate (+ 1 i)))))))
;; Else nothing needs to be done.
))
@@ -121,7 +124,9 @@
#f)
(begin
(if (not look-ahead)
- (set! look-ahead (lex)))
+ (let ((next (lex)))
+ (set! look-ahead next)
+ next))
(case action
((peek) look-ahead)
((get)
@@ -211,7 +216,7 @@
;;; define a circular-definitions data structure to use.
(define (read-elisp port)
- (with-fluids ((circular-definitions (make-circular-definitions)))
+ (parameterize ((circular-definitions (make-circular-definitions)))
(let* ((lexer (get-lexer port))
(lexbuf (make-lexer-buffer lexer))
(next (lexbuf 'peek)))
--
2.39.5
reply other threads:[~2024-11-21 7:34 UTC|newest]
Thread overview: [no followups] expand[flat|nested] mbox.gz Atom feed
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
List information: https://www.gnu.org/software/guile/
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=CAFhnQTTy-bVzm-NGR4LtHFQP2nOHLcB+C6E4F-tpwAC8AKLNpg@mail.gmail.com \
--to=ramin.honary@gmail.com \
--cc=74454@debbugs.gnu.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).