unofficial mirror of bug-guile@gnu.org 
 help / color / mirror / Atom feed
* bug#74454: Bug fix: Emacs Lisp parser
@ 2024-11-21  7:34 Ramin Honary
  0 siblings, 0 replies; only message in thread
From: Ramin Honary @ 2024-11-21  7:34 UTC (permalink / raw)
  To: 74454

[-- Attachment #1: Type: text/plain, Size: 1539 bytes --]

Hello Guile developers:

I originally sent this to the guile-devel list but someone kindly
explained that I should send it here.

I have made a small bug fix for the Emacs Lisp parser in the (language
elisp) module.

  - I fixed a simple bug in the lexer that decodes the carat encoding
for control characters (for example "^@" translates to #\null),

  - and I implemented octal digit decoding in the lexer.

I have tested these changes against Emacs Lisp code from the GNU Emacs
source code. I can confirm that files like "subr.el" can be fully
parsed to list data structures in Scheme now. Here is a simple test
program that parses "subr.el" from GNU Emacs:

(use-modules ((language elisp parser) #:select (read-elisp)))

(define (file-read-all-forms filepath)
  (call-with-port (open-input-file filepath)
    (lambda (port)
      (let loop ((forms-list '()))
        (let ((form (read-elisp port)))
          (cond
           ((eof-object? form) (reverse forms-list))
           (else (loop (cons form forms-list)))
           ))))))

(define path-to-test-el "/home/ramin/src/emacs-29.4/li
sp/subr.el")

(define (main)
  (for-each
   (lambda (form) (display form) (newline))
   (file-read-all-forms path-to-test-el)))

Before applying my patch, the above test program fails with:

ice-9/boot-9.scm:1685:16: In procedure raise-exception:
In procedure integer->char: Argument 1 out of range: 67108955

After applying my patch, the whole file parses successfully.

Regards,

-- Ramin Honary


(ActivityPub: @ramin_hal9001@fe.disroot.org)

[-- Attachment #2: 2024-11-20_ramin-honary_improve-elisp-parser.patch --]
[-- Type: text/x-patch, Size: 11916 bytes --]

From be34c3eda87e7b2a369de6412b0a42a0a53819a3 Mon Sep 17 00:00:00 2001
From: Ramin Honary <ramin@tropic-isle.honary.home>
Date: Wed, 20 Nov 2024 13:32:11 +0900
Subject: [PATCH] change to the Emacs Lisp parser in the (language elisp) module

  - I fixed a simple bug in the lexer that decodes the carat encoding
    for control characters (for example "^@" translates to #\null),

  - and I implemented octal digit decoding in the lexer.

  I have tested these changes against Emacs Lisp code from the GNU Emacs
  source code. I can confirm that files like "subr.el" can be
  fully parsed to list data structures in Scheme now.
---
 module/language/elisp/lexer.scm  | 133 ++++++++++++++++++++++++++-----
 module/language/elisp/parser.scm |  21 +++--
 2 files changed, 128 insertions(+), 26 deletions(-)

diff --git a/module/language/elisp/lexer.scm b/module/language/elisp/lexer.scm
index 5a0e6b3..baa0fec 100644
--- a/module/language/elisp/lexer.scm
+++ b/module/language/elisp/lexer.scm
@@ -20,6 +20,8 @@
 
 (define-module (language elisp lexer)
   #:use-module (ice-9 regex)
+  #:use-module ((scheme base)
+                #:select (exact eof-object))
   #:export (get-lexer get-lexer/1))
 
 ;;; This is the lexical analyzer for the elisp reader.  It is
@@ -37,6 +39,17 @@
 
 ;;; Report an error from the lexer (that is, invalid input given).
 
+
+;; Re-defining some symbols here to make this code more R7RS compliant
+;; and easier to port to other Scheme implementations. (ice-9 regex) are
+;; the only non-standard feature used here now.
+
+(define make-symbol string->symbol)
+   ;; ^ not sure if these do the exact same thing, so renaming the
+   ;;   "make-symbol" procedure used in this module so it can be easily
+   ;;   restored later by commenting out this definition.
+
+
 (define (lexer-error port msg . args)
   (apply error msg args))
 
@@ -68,14 +81,14 @@
 ;;; handled as such, and in elisp C-? is the delete character for
 ;;; historical reasons.  Otherwise, we set bit 26.
 
+(define |int?| (char->integer #\?))
+(define |int@| (char->integer #\@))
+
 (define (add-control chr)
-  (let ((real (real-character chr)))
-    (if (char-alphabetic? real)
-        (- (char->integer (char-upcase real)) (char->integer #\@))
-        (case real
-          ((#\?) 127)
-          ((#\@) 0)
-          (else (set-char-bit chr 26))))))
+  (cond
+   ((=  chr |int?|) 127)
+   ((>= chr |int@|)
+    (- (char->integer (char-upcase (integer->char chr))) |int@|))))
 
 ;;; Parse a charcode given in some base, basically octal or hexadecimal
 ;;; are needed.  A requested number of digits can be given (#f means it
@@ -110,7 +123,7 @@
                                "invalid digit in escape-code"
                                base
                                cur))
-              (iterate (+ (* result base) value) (1+ procdigs)))))))
+              (iterate (+ (* result base) value) (+ 1 procdigs)))))))
 
 ;;; Read a character and process escape-sequences when necessary.  The
 ;;; special in-string argument defines if this character is part of a
@@ -197,6 +210,67 @@
 
 (define no-escape-punctuation (string->char-set "-+=*/_~!@$%^&:<>{}?."))
 
+(define (signdigit? c)
+  (or (char=? c #\+) (char=? c #\-)))
+
+(define (octdigit? c)
+  ;; assumes you already checked that c is greater/equal to #\0.
+  (char<=? c #\7))
+
+(define (hexdigit? c)
+  ;; assumes you already checked that c is not an octdigit
+  (or (char<=? c #\9)
+      (and (char>=? c #\A) (char<=? c #\F))
+      (and (char>=? c #\a) (char<=? c #\f))))
+
+(define (get-number-other-base init-char port)
+  (let*((sign (read-char port))
+        (init-stack (list sign init-char #\#))
+        (bad-token
+         (lambda (msg chars)
+           (error (if msg msg "invalid integer literal")
+            (list->string (reverse chars))
+            #:line (port-line port)
+            #:column (port-column port))))
+        (finish
+         (lambda (stack has-hexdigit)
+           (cond
+            ((and (char=? init-char #\o) has-hexdigit)
+             (bad-token "invalid octal integer literal" stack))
+            (else
+             (let*((str (list->string (reverse stack)))
+                   (int (string->number str)))
+               (cond
+                (int (cons 'integer int))
+                (else (error "invalid integer literal" str)))))
+            )))
+        )
+    (define (iterate stack has-hexdigit)
+      (let ((c (read-char port)))
+        (cond
+         ((eof-object? c) (finish stack has-hexdigit))
+         (else
+          (cond
+           ((char>=? c #\0)
+            (cond
+             ((octdigit? c) (iterate (cons c stack) has-hexdigit))
+             ((hexdigit? c) (iterate (cons c stack) #t))
+             ((char-alphabetic? c) (bad-token #f (reverse stack)))
+             (else (unread-char c port) (finish stack has-hexdigit))))
+           (else (unread-char c port) (finish stack has-hexdigit))))
+         )))
+    (cond
+     ((eof-object? sign) (bad-token #f init-stack))
+     ((signdigit? sign) (iterate init-stack #f))
+     ((char>=? sign #\0)
+      (cond
+       ((eof-object? sign) (bad-token #f init-stack))
+       ((octdigit? sign) (iterate init-stack #f))
+       ((hexdigit? sign) (iterate init-stack #t))
+       (else (bad-token #f init-stack))))
+     (else (bad-token #f init-stack)))
+     ))
+
 (define (get-symbol-or-number port)
   (let iterate ((result-chars '())
                 (had-escape #f))
@@ -264,8 +338,8 @@
   (let* ((return (let ((file (if (file-port? port)
                                  (port-filename port)
                                  #f))
-                       (line (1+ (port-line port)))
-                       (column (1+ (port-column port))))
+                       (line (+ 1 (port-line port)))
+                       (column (+ 1 (port-column port))))
                    (lambda (token value)
                      (let ((obj (cons token value)))
                        (set-source-property! obj 'filename file)
@@ -325,9 +399,21 @@
                     (else
                      (unread-char escaped port)
                      (unread-char cur port)
-                     (iterate
-                      (cons (integer->char (get-character port #t))
-                            result-chars))))))
+                     (let ((c (get-character port #t)))
+                       (cond
+                        ((> c #x10FFFF)
+                         (error "bad character"
+                          #:char c
+                          #:cur cur
+                          #:escaped escaped
+                          #:file (port-filename port)
+                          #:line (port-line port)
+                          #:column (port-column port)
+                          #:after (list->string (reverse result-chars))))
+                        (else
+                         (iterate
+                          (cons (integer->char c)
+                                result-chars)))))))))
                (else (iterate (cons cur result-chars)))))))
         ((#\#)
          (let ((c (read-char port)))
@@ -342,7 +428,18 @@
              (call-with-values
                  (lambda () (get-symbol-or-number port))
                (lambda (type str)
-                 (return 'symbol (make-symbol str))))))))
+                 (return 'symbol (make-symbol str)))))
+            ((#\o #\O) ;;octal literal
+             (let ((token (get-number-other-base #\o port)))
+               (return (car token) (cdr token))))
+            ((#\x #\X) ;;hexadecimal literal
+             (let ((token (get-number-other-base #\x port)))
+               (return (car token) (cdr token))))
+            (else
+             (let ((line (port-line port))
+                   (col (port-column port)))
+               (error "unknown token" c #:line line #:column col)))
+            )))
         ;; Parentheses and other special-meaning single characters.
         ((#\() (return 'paren-open #f))
         ((#\)) (return 'paren-close #f))
@@ -386,7 +483,7 @@
                 ;; integer!
                 (return
                  'integer
-                 (let ((num (inexact->exact (string->number str))))
+                 (let ((num (exact (string->number str))))
                    (if (not (integer? num))
                        (error "expected integer" str num))
                    num)))
@@ -415,14 +512,14 @@
         (paren-level 0))
     (lambda ()
       (if finished
-          (cons 'eof ((@ (ice-9 binary-ports) eof-object)))
+          (cons 'eof eof-object)
           (let ((next (lex))
                 (quotation #f))
             (case (car next)
               ((paren-open square-open)
-               (set! paren-level (1+ paren-level)))
+               (set! paren-level (+ 1 paren-level)))
               ((paren-close square-close)
-               (set! paren-level (1- paren-level)))
+               (set! paren-level (- paren-level 1)))
               ((quote backquote unquote unquote-splicing circular-def)
                (set! quotation #t)))
             (if (and (not quotation) (<= paren-level 0))
diff --git a/module/language/elisp/parser.scm b/module/language/elisp/parser.scm
index a7aeff0..4e4f770 100644
--- a/module/language/elisp/parser.scm
+++ b/module/language/elisp/parser.scm
@@ -20,8 +20,11 @@
 
 (define-module (language elisp parser)
   #:use-module (language elisp lexer)
+  #:use-module ((srfi srfi-69)
+                #:select (hash-table-ref hash-table-set!))
   #:export (read-elisp))
 
+
 ;;; The parser (reader) for elisp expressions.
 ;;;
 ;;; It is hand-written (just as the lexer is) instead of using some
@@ -47,7 +50,7 @@
 ;;; one call to read-elisp (but not only the currently parsed
 ;;; expression!).
 
-(define circular-definitions (make-fluid))
+(define circular-definitions (make-parameter #f))
 
 (define (make-circular-definitions)
   (make-hash-table))
@@ -56,7 +59,7 @@
   (if (not (eq? (car token) 'circular-ref))
       (error "invalid token for circular-ref" token))
   (let* ((id (cdr token))
-         (value (hashq-ref (fluid-ref circular-definitions) id)))
+         (value (hash-table-ref (circular-definitions) id)))
     (if value
         value
         (parse-error token "undefined circular reference" id))))
@@ -69,12 +72,12 @@
   (if (not (eq? (car token) 'circular-def))
       (error "invalid token for circular-define!" token))
   (let ((value #f)
-        (table (fluid-ref circular-definitions))
+        (table (circular-definitions))
         (id (cdr token)))
-    (hashq-set! table id (delay value))
+    (hash-table-set! table id (delay value))
     (lambda (real-value)
       (set! value real-value)
-      (hashq-set! table id real-value))))
+      (hash-table-set! table id real-value))))
 
 ;;; Work through a parsed data structure and force the promises there.
 ;;; After a promise is forced, the resulting value must not be recursed
@@ -101,7 +104,7 @@
               (if (promise? el)
                   (vector-set! data i (force el))
                   (force-promises! el))
-              (iterate (1+ i)))))))
+              (iterate (+ 1 i)))))))
    ;; Else nothing needs to be done.
    ))
 
@@ -121,7 +124,9 @@
               #f)
           (begin
             (if (not look-ahead)
-                (set! look-ahead (lex)))
+                (let ((next (lex)))
+                  (set! look-ahead next)
+                  next))
             (case action
               ((peek) look-ahead)
               ((get)
@@ -211,7 +216,7 @@
 ;;; define a circular-definitions data structure to use.
 
 (define (read-elisp port)
-  (with-fluids ((circular-definitions (make-circular-definitions)))
+  (parameterize ((circular-definitions (make-circular-definitions)))
     (let* ((lexer (get-lexer port))
            (lexbuf (make-lexer-buffer lexer))
            (next (lexbuf 'peek)))
-- 
2.39.5


^ permalink raw reply related	[flat|nested] only message in thread

only message in thread, other threads:[~2024-11-21  7:34 UTC | newest]

Thread overview: (only message) (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2024-11-21  7:34 bug#74454: Bug fix: Emacs Lisp parser Ramin Honary

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).