unofficial mirror of bug-guix@gnu.org 
 help / color / mirror / code / Atom feed
From: Mark H Weaver <mhw@netris.org>
To: Pierre Neidhardt <mail@ambrevar.xyz>,
	Guillaume Le Vaillant <glv@posteo.net>
Cc: 33848@debbugs.gnu.org
Subject: bug#33848: Store references in SBCL-compiled code are "invisible"
Date: Fri, 02 Apr 2021 18:46:41 -0400	[thread overview]
Message-ID: <87r1js9udv.fsf@netris.org> (raw)
In-Reply-To: <871rbtc3j5.fsf@netris.org>

[-- Attachment #1: Type: text/plain, Size: 269 bytes --]

Here's a preliminary draft patch to add support for UTF-32 and UTF-16
references to our grafting code.  I haven't yet measured the efficiency
impact of these changes, but I suspect it's not too bad.

I'd be curious to know whether it fixes the Nyxt graft.

      Mark


[-- Attachment #2: [PATCH] DRAFT: grafts: Add support for UTF-16 and UTF-32 store references --]
[-- Type: text/x-patch, Size: 15392 bytes --]

From 0fcfd804570fd1c07ffb1f6c176d6ec3430907df Mon Sep 17 00:00:00 2001
From: Mark H Weaver <mhw@netris.org>
Date: Fri, 2 Apr 2021 18:36:23 -0400
Subject: [PATCH] DRAFT: grafts: Add support for UTF-16 and UTF-32 store
 references.

---
 guix/build/graft.scm | 138 +++++++++++++++++++++++++++++--------------
 tests/grafts.scm     |  68 +++++++++++++++++++++
 2 files changed, 162 insertions(+), 44 deletions(-)

diff --git a/guix/build/graft.scm b/guix/build/graft.scm
index c119ee71d1..6e7f3859cb 100644
--- a/guix/build/graft.scm
+++ b/guix/build/graft.scm
@@ -1,6 +1,6 @@
 ;;; GNU Guix --- Functional package management for GNU
 ;;; Copyright © 2014, 2015, 2016, 2018 Ludovic Courtès <ludo@gnu.org>
-;;; Copyright © 2016 Mark H Weaver <mhw@netris.org>
+;;; Copyright © 2016, 2021 Mark H Weaver <mhw@netris.org>
 ;;;
 ;;; This file is part of GNU Guix.
 ;;;
@@ -55,6 +55,36 @@
         (string->char-set "0123456789abcdfghijklmnpqrsvwxyz")
         <>))
 
+(define (nix-base32-char-or-nul? byte)
+  (or (nix-base32-char? byte)
+      (char=? byte #\nul)))
+
+(define (has-utf16-zeroes? buffer i)
+  (let loop ((j (+ 1 (- i (* 2 hash-length)))))
+    (or (>= j i)
+        (and (zero? (bytevector-u8-ref buffer j))
+             (loop (+ j 2))))))
+
+(define (has-utf32-zeroes? buffer i)
+  (let loop ((j (+ 1 (- i (* 4 hash-length)))))
+    (or (>= j i)
+        (and (zero? (bytevector-u8-ref buffer j))
+             (zero? (bytevector-u8-ref buffer (+ j 1)))
+             (zero? (bytevector-u8-ref buffer (+ j 2)))
+             (loop (+ j 4))))))
+
+(define (expand-bytevector bv char-size)
+  (let* ((len (bytevector-length bv))
+         (bv* (make-bytevector (+ 1 (* char-size
+                                       (- len 1)))
+                               0)))
+    (let loop ((i 0))
+      (when (< i len)
+        (bytevector-u8-set! bv* (* i char-size)
+                            (bytevector-u8-ref bv i))
+        (loop (+ i 1))))
+    bv*))
+
 (define* (replace-store-references input output replacement-table
                                    #:optional (store (%store-directory)))
   "Read data from INPUT, replacing store references according to
@@ -76,15 +106,16 @@ bytevectors to the same value."
           (list->vector (map pred (iota 256)))
           <>))
 
-  (define nix-base32-byte?
+  (define nix-base32-byte-or-nul?
     (optimize-u8-predicate
-     (compose nix-base32-char?
+     (compose nix-base32-char-or-nul?
               integer->char)))
 
   (define (dash? byte) (= byte 45))
 
   (define request-size (expt 2 20))  ; 1 MiB
 
+  ;; XXX This comment is no longer accurate!
   ;; We scan the file for the following 33-byte pattern: 32 bytes of
   ;; nix-base32 characters followed by a dash.  To accommodate large files,
   ;; we do not read the entire file, but instead work on buffers of up to
@@ -116,43 +147,61 @@ bytevectors to the same value."
            ;; written.
            (if (< i end)
                (let ((byte (bytevector-u8-ref buffer i)))
-                 (cond ((and (dash? byte)
-                             ;; We've found a dash.  Note that we do not know
-                             ;; whether the preceeding 32 bytes are nix-base32
-                             ;; characters, but we do not need to know.  If
-                             ;; they are not, the following lookup will fail.
-                             (lookup-replacement
-                              (string-tabulate (lambda (j)
-                                                 (integer->char
-                                                  (bytevector-u8-ref buffer
-                                                   (+ j (- i hash-length)))))
-                                               hash-length)))
-                        => (lambda (replacement)
-                             ;; We've found a hash that needs to be replaced.
-                             ;; First, write out all bytes preceding the hash
-                             ;; that have not yet been written.
-                             (put-bytevector output buffer written
-                                             (- i hash-length written))
-                             ;; Now write the replacement string.
-                             (put-bytevector output replacement)
-                             ;; Since the byte at position 'i' is a dash,
-                             ;; which is not a nix-base32 char, the earliest
-                             ;; position where the next hash might start is
-                             ;; i+1, and the earliest position where the
-                             ;; following dash might start is (+ i 1
-                             ;; hash-length).  Also, increase the write
-                             ;; position to account for REPLACEMENT.
-                             (let ((len (bytevector-length replacement)))
-                               (scan-from (+ i 1 len)
-                                          (+ i (- len hash-length))))))
-                       ;; If the byte at position 'i' is a nix-base32 char,
+                 (cond ((dash? byte)
+                        (let* ((char-size
+                                (if (zero? (bytevector-u8-ref buffer (- i 1)))
+                                    (if (zero? (bytevector-u8-ref buffer (- i 2)))
+                                        (if (and (<= (* 4 hash-length)
+                                                     (- i written))
+                                                 (has-utf32-zeroes? buffer i))
+                                            4
+                                            1)
+                                        (if (and (<= (* 2 hash-length)
+                                                     (- i written))
+                                                 (has-utf16-zeroes? buffer i))
+                                            2
+                                            1))
+                                    1))
+                               (replacement*
+                                (lookup-replacement
+                                 (string-tabulate (lambda (j)
+                                                    (integer->char
+                                                     (bytevector-u8-ref buffer
+                                                      (- i (* char-size
+                                                              (- hash-length j))))))
+                                                  hash-length)))
+                               (replacement
+                                (and replacement*
+                                     (expand-bytevector replacement*
+                                                        char-size))))
+                          (if replacement
+                              (begin
+                                ;; We've found a hash that needs to be replaced.
+                                ;; First, write out all bytes preceding the hash
+                                ;; that have not yet been written.
+                                (put-bytevector output buffer written
+                                                (- i (* char-size hash-length) written))
+                                ;; Now write the replacement string.
+                                (put-bytevector output replacement)
+                                ;; Now compute the new value of 'written' and
+                                ;; the new value of 'i', and iterate.
+                                (let ((written (+ (- i (* char-size hash-length))
+                                                  (bytevector-length replacement))))
+                                  (scan-from (+ written hash-length) written)))
+                              ;; The byte at position 'i' is a dash, which is
+                              ;; not a nix-base32 char, so the earliest
+                              ;; position where the next hash might start is
+                              ;; i+1, with the following dash at position (+ i
+                              ;; 1 hash-length).
+                              (scan-from (+ i 1 hash-length) written))))
+                       ;; If the byte at position 'i' is a nix-base32 char or nul,
                        ;; then the dash we're looking for might be as early as
                        ;; the following byte, so we can only advance by 1.
-                       ((nix-base32-byte? byte)
+                       ((nix-base32-byte-or-nul? byte)
                         (scan-from (+ i 1) written))
-                       ;; If the byte at position 'i' is NOT a nix-base32
-                       ;; char, then the earliest position where the next hash
-                       ;; might start is i+1, with the following dash at
+                       ;; If the byte at position 'i' is NOT a nix-base32 char
+                       ;; or nul, then the earliest position where the next
+                       ;; hash might start is i+1, with the following dash at
                        ;; position (+ i 1 hash-length).
                        (else
                         (scan-from (+ i 1 hash-length) written))))
@@ -162,18 +211,19 @@ bytevectors to the same value."
                ;; "unget".  If 'end' is less than 'request-size' then we read
                ;; less than we asked for, which indicates that we are at EOF,
                ;; so we needn't unget anything.  Otherwise, we unget up to
-               ;; 'hash-length' bytes (32 bytes).  However, we must be careful
-               ;; not to unget bytes that have already been written, because
-               ;; that would cause them to be written again from the next
-               ;; buffer.  In practice, this case occurs when a replacement is
-               ;; made near or beyond the end of the buffer.  When REPLACEMENT
-               ;; went beyond END, we consume the extra bytes from INPUT.
+               ;; (* 4 hash-length) bytes.  However, we must be careful not to
+               ;; unget bytes that have already been written, because that
+               ;; would cause them to be written again from the next buffer.
+               ;; In practice, this case occurs when a replacement is made
+               ;; near or beyond the end of the buffer.  When REPLACEMENT went
+               ;; beyond END, we consume the extra bytes from INPUT.
                (begin
                  (if (> written end)
                      (get-bytevector-n! input buffer 0 (- written end))
                      (let* ((unwritten  (- end written))
                             (unget-size (if (= end request-size)
-                                            (min hash-length unwritten)
+                                            (min (* 4 hash-length)
+                                                 unwritten)
                                             0))
                             (write-size (- unwritten unget-size)))
                        (put-bytevector output buffer written write-size)
diff --git a/tests/grafts.scm b/tests/grafts.scm
index a12c6a5911..0e1c7355b1 100644
--- a/tests/grafts.scm
+++ b/tests/grafts.scm
@@ -1,5 +1,6 @@
 ;;; GNU Guix --- Functional package management for GNU
 ;;; Copyright © 2014, 2015, 2016, 2017, 2018, 2019 Ludovic Courtès <ludo@gnu.org>
+;;; Copyright © 2021 Mark H Weaver <mhw@netris.org>
 ;;;
 ;;; This file is part of GNU Guix.
 ;;;
@@ -468,4 +469,71 @@
          replacement
          "/gnu/store")))))
 
+(define (nul-expand str char-size)
+  (string-join (map string (string->list str))
+               (make-string (- char-size 1) #\nul)))
+
+(for-each
+ (lambda (char-size1)
+   (for-each
+    (lambda (char-size2)
+      (for-each
+       (lambda (gap)
+        (for-each
+         (lambda (offset)
+           (test-equal (format #f "replace-store-references, char-sizes ~a ~a, gap ~s, offset ~a"
+                               char-size1 char-size2 gap offset)
+             (string-append (make-string offset #\=)
+                            (nul-expand (string-append "/gnu/store/"
+                                                       (make-string 32 #\6)
+                                                       "-BlahBlaH")
+                                        char-size1)
+                            gap
+                            (nul-expand (string-append "/gnu/store/"
+                                                       (make-string 32 #\8)
+                                                       "-SoMeTHiNG")
+                                        char-size2)
+                            (list->string (map integer->char (iota 77 33))))
+
+             ;; Create input data where the right-hand-size of the dash ("-something"
+             ;; here) goes beyond the end of the internal buffer of
+             ;; 'replace-store-references'.
+             (let* ((content     (string-append (make-string offset #\=)
+                                                (nul-expand (string-append "/gnu/store/"
+                                                                           (make-string 32 #\5)
+                                                                           "-blahblah")
+                                                            char-size1)
+                                                gap
+                                                (nul-expand (string-append "/gnu/store/"
+                                                                           (make-string 32 #\7)
+                                                                           "-something")
+                                                            char-size2)
+                                                (list->string
+                                                 (map integer->char (iota 77 33)))))
+                    (replacement (alist->vhash
+                                  `((,(make-string 32 #\5)
+                                     . ,(string->utf8 (string-append
+                                                       (make-string 32 #\6)
+                                                       "-BlahBlaH")))
+                                    (,(make-string 32 #\7)
+                                     . ,(string->utf8 (string-append
+                                                       (make-string 32 #\8)
+                                                       "-SoMeTHiNG")))))))
+               (call-with-output-string
+                 (lambda (output)
+                   ((@@ (guix build graft) replace-store-references)
+                    (open-input-string content) output
+                    replacement
+                    "/gnu/store"))))))
+         ;; offsets to test
+         (map (lambda (i) (- buffer-size (* 40 char-size1) i))
+              (iota 30))))
+       ;; gaps
+       '("" "-" " " "a")))
+    ;; char-size2 values to test
+    '(1 2)))
+ ;; char-size1 values to test
+ '(1 2 4))
+
+
 (test-end)
-- 
2.31.1


  reply	other threads:[~2021-04-02 22:49 UTC|newest]

Thread overview: 63+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2018-12-23 14:19 bug#33848: Store references in SBCL-compiled code are "invisible" Ludovic Courtès
2018-12-23 15:05 ` Pierre Neidhardt
2018-12-24 14:57   ` Ludovic Courtès
2018-12-23 16:45 ` Mark H Weaver
2018-12-23 17:32   ` Ludovic Courtès
2018-12-23 22:01     ` Pierre Neidhardt
2018-12-24 15:06       ` Ludovic Courtès
2018-12-24 17:08         ` Pierre Neidhardt
2018-12-26 16:07           ` Ludovic Courtès
2018-12-24 18:12         ` Mark H Weaver
2018-12-24 23:58           ` Pierre Neidhardt
2018-12-26 16:14           ` Ludovic Courtès
2018-12-27 10:37             ` Pierre Neidhardt
2018-12-27 14:03               ` Mark H Weaver
2018-12-27 14:45                 ` Ludovic Courtès
2018-12-27 15:02                   ` Pierre Neidhardt
2018-12-27 16:15                     ` Pierre Neidhardt
2018-12-27 17:03                     ` Ludovic Courtès
2018-12-27 18:57                       ` Pierre Neidhardt
2018-12-27 21:54                         ` Ludovic Courtès
2018-12-27 22:05                           ` Pierre Neidhardt
2018-12-27 22:59                             ` Ludovic Courtès
2018-12-28  7:47                               ` Pierre Neidhardt
2021-03-30 10:15                                 ` Pierre Neidhardt
2021-03-30 20:09                                   ` Ludovic Courtès
2021-03-31  7:10                                     ` Pierre Neidhardt
2021-03-31 16:12                                     ` Pierre Neidhardt
2021-03-31 20:42                                       ` Ludovic Courtès
2021-03-31 20:57                                         ` Pierre Neidhardt
2021-04-01 17:23                                         ` Mark H Weaver
2021-04-02 15:13                                           ` Ludovic Courtès
2021-04-01  6:03                                       ` Mark H Weaver
2021-04-01  7:13                                         ` Pierre Neidhardt
2021-04-01  7:57                                         ` Ludovic Courtès
2021-04-01  8:48                                           ` Pierre Neidhardt
2021-04-01  9:07                                           ` Guillaume Le Vaillant
2021-04-01  9:13                                             ` Pierre Neidhardt
2021-04-01  9:52                                               ` Guillaume Le Vaillant
2021-04-01 10:06                                                 ` Pierre Neidhardt
2021-04-01 10:07                                                 ` Pierre Neidhardt
2021-04-01 15:24                                                   ` Ludovic Courtès
2021-04-01 17:33                                                   ` Mark H Weaver
2021-04-02 22:46                                                     ` Mark H Weaver [this message]
2021-04-03  6:51                                                       ` Pierre Neidhardt
2021-04-03 20:10                                                         ` Mark H Weaver
2021-04-05 19:45                                                           ` Ludovic Courtès
2021-04-05 23:04                                                             ` Mark H Weaver
2021-04-06  8:16                                                               ` Ludovic Courtès
2021-04-06  8:23                                                                 ` Pierre Neidhardt
2021-04-30 20:03                                                                   ` Mark H Weaver
2021-05-01  9:22                                                                     ` Pierre Neidhardt
2021-05-11  8:46                                                                     ` Ludovic Courtès
2021-04-06 17:23                                                             ` Leo Famulari
2021-04-06 23:21                                                               ` Mark H Weaver
2021-04-06 11:19                                                       ` Mark H Weaver
2021-04-08 10:13                                                         ` Ludovic Courtès
2021-04-13 20:06                                                           ` Mark H Weaver
2021-04-14 10:55                                                             ` Ludovic Courtès
2021-04-14 22:37                                                               ` Leo Famulari
2021-04-15  7:26                                                               ` Mark H Weaver
2021-04-16  9:44                                                                 ` Pierre Neidhardt
2018-12-27 13:52           ` Danny Milosavljevic
2018-12-27 14:29             ` Mark H Weaver

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

  List information: https://guix.gnu.org/

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=87r1js9udv.fsf@netris.org \
    --to=mhw@netris.org \
    --cc=33848@debbugs.gnu.org \
    --cc=glv@posteo.net \
    --cc=mail@ambrevar.xyz \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
Code repositories for project(s) associated with this public inbox

	https://git.savannah.gnu.org/cgit/guix.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).