From be34c3eda87e7b2a369de6412b0a42a0a53819a3 Mon Sep 17 00:00:00 2001 From: Ramin Honary Date: Wed, 20 Nov 2024 13:32:11 +0900 Subject: [PATCH] change to the Emacs Lisp parser in the (language elisp) module - I fixed a simple bug in the lexer that decodes the carat encoding for control characters (for example "^@" translates to #\null), - and I implemented octal digit decoding in the lexer. I have tested these changes against Emacs Lisp code from the GNU Emacs source code. I can confirm that files like "subr.el" can be fully parsed to list data structures in Scheme now. --- module/language/elisp/lexer.scm | 133 ++++++++++++++++++++++++++----- module/language/elisp/parser.scm | 21 +++-- 2 files changed, 128 insertions(+), 26 deletions(-) diff --git a/module/language/elisp/lexer.scm b/module/language/elisp/lexer.scm index 5a0e6b3..baa0fec 100644 --- a/module/language/elisp/lexer.scm +++ b/module/language/elisp/lexer.scm @@ -20,6 +20,8 @@ (define-module (language elisp lexer) #:use-module (ice-9 regex) + #:use-module ((scheme base) + #:select (exact eof-object)) #:export (get-lexer get-lexer/1)) ;;; This is the lexical analyzer for the elisp reader. It is @@ -37,6 +39,17 @@ ;;; Report an error from the lexer (that is, invalid input given). + +;; Re-defining some symbols here to make this code more R7RS compliant +;; and easier to port to other Scheme implementations. (ice-9 regex) are +;; the only non-standard feature used here now. + +(define make-symbol string->symbol) + ;; ^ not sure if these do the exact same thing, so renaming the + ;; "make-symbol" procedure used in this module so it can be easily + ;; restored later by commenting out this definition. + + (define (lexer-error port msg . args) (apply error msg args)) @@ -68,14 +81,14 @@ ;;; handled as such, and in elisp C-? is the delete character for ;;; historical reasons. Otherwise, we set bit 26. +(define |int?| (char->integer #\?)) +(define |int@| (char->integer #\@)) + (define (add-control chr) - (let ((real (real-character chr))) - (if (char-alphabetic? real) - (- (char->integer (char-upcase real)) (char->integer #\@)) - (case real - ((#\?) 127) - ((#\@) 0) - (else (set-char-bit chr 26)))))) + (cond + ((= chr |int?|) 127) + ((>= chr |int@|) + (- (char->integer (char-upcase (integer->char chr))) |int@|)))) ;;; Parse a charcode given in some base, basically octal or hexadecimal ;;; are needed. A requested number of digits can be given (#f means it @@ -110,7 +123,7 @@ "invalid digit in escape-code" base cur)) - (iterate (+ (* result base) value) (1+ procdigs))))))) + (iterate (+ (* result base) value) (+ 1 procdigs))))))) ;;; Read a character and process escape-sequences when necessary. The ;;; special in-string argument defines if this character is part of a @@ -197,6 +210,67 @@ (define no-escape-punctuation (string->char-set "-+=*/_~!@$%^&:<>{}?.")) +(define (signdigit? c) + (or (char=? c #\+) (char=? c #\-))) + +(define (octdigit? c) + ;; assumes you already checked that c is greater/equal to #\0. + (char<=? c #\7)) + +(define (hexdigit? c) + ;; assumes you already checked that c is not an octdigit + (or (char<=? c #\9) + (and (char>=? c #\A) (char<=? c #\F)) + (and (char>=? c #\a) (char<=? c #\f)))) + +(define (get-number-other-base init-char port) + (let*((sign (read-char port)) + (init-stack (list sign init-char #\#)) + (bad-token + (lambda (msg chars) + (error (if msg msg "invalid integer literal") + (list->string (reverse chars)) + #:line (port-line port) + #:column (port-column port)))) + (finish + (lambda (stack has-hexdigit) + (cond + ((and (char=? init-char #\o) has-hexdigit) + (bad-token "invalid octal integer literal" stack)) + (else + (let*((str (list->string (reverse stack))) + (int (string->number str))) + (cond + (int (cons 'integer int)) + (else (error "invalid integer literal" str))))) + ))) + ) + (define (iterate stack has-hexdigit) + (let ((c (read-char port))) + (cond + ((eof-object? c) (finish stack has-hexdigit)) + (else + (cond + ((char>=? c #\0) + (cond + ((octdigit? c) (iterate (cons c stack) has-hexdigit)) + ((hexdigit? c) (iterate (cons c stack) #t)) + ((char-alphabetic? c) (bad-token #f (reverse stack))) + (else (unread-char c port) (finish stack has-hexdigit)))) + (else (unread-char c port) (finish stack has-hexdigit)))) + ))) + (cond + ((eof-object? sign) (bad-token #f init-stack)) + ((signdigit? sign) (iterate init-stack #f)) + ((char>=? sign #\0) + (cond + ((eof-object? sign) (bad-token #f init-stack)) + ((octdigit? sign) (iterate init-stack #f)) + ((hexdigit? sign) (iterate init-stack #t)) + (else (bad-token #f init-stack)))) + (else (bad-token #f init-stack))) + )) + (define (get-symbol-or-number port) (let iterate ((result-chars '()) (had-escape #f)) @@ -264,8 +338,8 @@ (let* ((return (let ((file (if (file-port? port) (port-filename port) #f)) - (line (1+ (port-line port))) - (column (1+ (port-column port)))) + (line (+ 1 (port-line port))) + (column (+ 1 (port-column port)))) (lambda (token value) (let ((obj (cons token value))) (set-source-property! obj 'filename file) @@ -325,9 +399,21 @@ (else (unread-char escaped port) (unread-char cur port) - (iterate - (cons (integer->char (get-character port #t)) - result-chars)))))) + (let ((c (get-character port #t))) + (cond + ((> c #x10FFFF) + (error "bad character" + #:char c + #:cur cur + #:escaped escaped + #:file (port-filename port) + #:line (port-line port) + #:column (port-column port) + #:after (list->string (reverse result-chars)))) + (else + (iterate + (cons (integer->char c) + result-chars))))))))) (else (iterate (cons cur result-chars))))))) ((#\#) (let ((c (read-char port))) @@ -342,7 +428,18 @@ (call-with-values (lambda () (get-symbol-or-number port)) (lambda (type str) - (return 'symbol (make-symbol str)))))))) + (return 'symbol (make-symbol str))))) + ((#\o #\O) ;;octal literal + (let ((token (get-number-other-base #\o port))) + (return (car token) (cdr token)))) + ((#\x #\X) ;;hexadecimal literal + (let ((token (get-number-other-base #\x port))) + (return (car token) (cdr token)))) + (else + (let ((line (port-line port)) + (col (port-column port))) + (error "unknown token" c #:line line #:column col))) + ))) ;; Parentheses and other special-meaning single characters. ((#\() (return 'paren-open #f)) ((#\)) (return 'paren-close #f)) @@ -386,7 +483,7 @@ ;; integer! (return 'integer - (let ((num (inexact->exact (string->number str)))) + (let ((num (exact (string->number str)))) (if (not (integer? num)) (error "expected integer" str num)) num))) @@ -415,14 +512,14 @@ (paren-level 0)) (lambda () (if finished - (cons 'eof ((@ (ice-9 binary-ports) eof-object))) + (cons 'eof eof-object) (let ((next (lex)) (quotation #f)) (case (car next) ((paren-open square-open) - (set! paren-level (1+ paren-level))) + (set! paren-level (+ 1 paren-level))) ((paren-close square-close) - (set! paren-level (1- paren-level))) + (set! paren-level (- paren-level 1))) ((quote backquote unquote unquote-splicing circular-def) (set! quotation #t))) (if (and (not quotation) (<= paren-level 0)) diff --git a/module/language/elisp/parser.scm b/module/language/elisp/parser.scm index a7aeff0..4e4f770 100644 --- a/module/language/elisp/parser.scm +++ b/module/language/elisp/parser.scm @@ -20,8 +20,11 @@ (define-module (language elisp parser) #:use-module (language elisp lexer) + #:use-module ((srfi srfi-69) + #:select (hash-table-ref hash-table-set!)) #:export (read-elisp)) + ;;; The parser (reader) for elisp expressions. ;;; ;;; It is hand-written (just as the lexer is) instead of using some @@ -47,7 +50,7 @@ ;;; one call to read-elisp (but not only the currently parsed ;;; expression!). -(define circular-definitions (make-fluid)) +(define circular-definitions (make-parameter #f)) (define (make-circular-definitions) (make-hash-table)) @@ -56,7 +59,7 @@ (if (not (eq? (car token) 'circular-ref)) (error "invalid token for circular-ref" token)) (let* ((id (cdr token)) - (value (hashq-ref (fluid-ref circular-definitions) id))) + (value (hash-table-ref (circular-definitions) id))) (if value value (parse-error token "undefined circular reference" id)))) @@ -69,12 +72,12 @@ (if (not (eq? (car token) 'circular-def)) (error "invalid token for circular-define!" token)) (let ((value #f) - (table (fluid-ref circular-definitions)) + (table (circular-definitions)) (id (cdr token))) - (hashq-set! table id (delay value)) + (hash-table-set! table id (delay value)) (lambda (real-value) (set! value real-value) - (hashq-set! table id real-value)))) + (hash-table-set! table id real-value)))) ;;; Work through a parsed data structure and force the promises there. ;;; After a promise is forced, the resulting value must not be recursed @@ -101,7 +104,7 @@ (if (promise? el) (vector-set! data i (force el)) (force-promises! el)) - (iterate (1+ i))))))) + (iterate (+ 1 i))))))) ;; Else nothing needs to be done. )) @@ -121,7 +124,9 @@ #f) (begin (if (not look-ahead) - (set! look-ahead (lex))) + (let ((next (lex))) + (set! look-ahead next) + next)) (case action ((peek) look-ahead) ((get) @@ -211,7 +216,7 @@ ;;; define a circular-definitions data structure to use. (define (read-elisp port) - (with-fluids ((circular-definitions (make-circular-definitions))) + (parameterize ((circular-definitions (make-circular-definitions))) (let* ((lexer (get-lexer port)) (lexbuf (make-lexer-buffer lexer)) (next (lexbuf 'peek))) -- 2.39.5