From 6eec36e66d20d82fe02c6de793422875477b90d8 Mon Sep 17 00:00:00 2001
From: Mark H Weaver <mhw@netris.org>
Date: Fri, 2 Apr 2021 18:36:23 -0400
Subject: [PATCH] DRAFT: grafts: Support rewriting UTF-16 and UTF-32 store
 references.

* guix/build/graft.scm (replace-store-references): Add support for
finding and rewriting UTF-16 and UTF-32 store references.
* tests/grafts.scm: Add tests.
---
 guix/build/graft.scm | 247 +++++++++++++++++++++++++++----------------
 tests/grafts.scm     |  68 ++++++++++++
 2 files changed, 224 insertions(+), 91 deletions(-)

diff --git a/guix/build/graft.scm b/guix/build/graft.scm
index c119ee71d1..23fca8f29c 100644
--- a/guix/build/graft.scm
+++ b/guix/build/graft.scm
@@ -1,6 +1,6 @@
 ;;; GNU Guix --- Functional package management for GNU
 ;;; Copyright © 2014, 2015, 2016, 2018 Ludovic Courtès <ludo@gnu.org>
-;;; Copyright © 2016 Mark H Weaver <mhw@netris.org>
+;;; Copyright © 2016, 2021 Mark H Weaver <mhw@netris.org>
 ;;;
 ;;; This file is part of GNU Guix.
 ;;;
@@ -55,6 +55,40 @@
         (string->char-set "0123456789abcdfghijklmnpqrsvwxyz")
         <>))
 
+(define (nix-base32-char-or-nul? byte)
+  (or (nix-base32-char? byte)
+      (char=? byte #\nul)))
+
+(define (possible-utf16-hash? buffer i w)
+  (and (<= (* 2 hash-length) (- i w))
+       (let loop ((j (+ 1 (- i (* 2 hash-length)))))
+         (or (>= j i)
+             (and (zero? (bytevector-u8-ref buffer j))
+                  (loop (+ j 2)))))))
+
+(define (possible-utf32-hash? buffer i w)
+  (and (<= (* 4 hash-length) (- i w))
+       (let loop ((j (+ 1 (- i (* 4 hash-length)))))
+         (or (>= j i)
+             (and (zero? (bytevector-u8-ref buffer j))
+                  (zero? (bytevector-u8-ref buffer (+ j 1)))
+                  (zero? (bytevector-u8-ref buffer (+ j 2)))
+                  (loop (+ j 4)))))))
+
+(define (insert-nuls char-size bv)
+  (if (or (not bv) (= char-size 1))
+      bv
+      (let* ((len (bytevector-length bv))
+             (bv* (make-bytevector (+ 1 (* char-size
+                                           (- len 1)))
+                                   0)))
+        (let loop ((i 0))
+          (when (< i len)
+            (bytevector-u8-set! bv* (* i char-size)
+                                (bytevector-u8-ref bv i))
+            (loop (+ i 1))))
+        bv*)))
+
 (define* (replace-store-references input output replacement-table
                                    #:optional (store (%store-directory)))
   "Read data from INPUT, replacing store references according to
@@ -76,9 +110,9 @@ bytevectors to the same value."
           (list->vector (map pred (iota 256)))
           <>))
 
-  (define nix-base32-byte?
+  (define nix-base32-byte-or-nul?
     (optimize-u8-predicate
-     (compose nix-base32-char?
+     (compose nix-base32-char-or-nul?
               integer->char)))
 
   (define (dash? byte) (= byte 45))
@@ -86,100 +120,131 @@ bytevectors to the same value."
   (define request-size (expt 2 20))  ; 1 MiB
 
   ;; We scan the file for the following 33-byte pattern: 32 bytes of
-  ;; nix-base32 characters followed by a dash.  To accommodate large files,
-  ;; we do not read the entire file, but instead work on buffers of up to
-  ;; 'request-size' bytes.  To ensure that every 33-byte sequence appears
-  ;; entirely within exactly one buffer, adjacent buffers must overlap,
-  ;; i.e. they must share 32 byte positions.  We accomplish this by
-  ;; "ungetting" the last 32 bytes of each buffer before reading the next
-  ;; buffer, unless we know that we've reached the end-of-file.
+  ;; nix-base32 characters followed by a dash.  When we find such a pattern
+  ;; whose hash is in REPLACEMENT-TABLE, we perform the required rewrite and
+  ;; continue scanning.
+  ;;
+  ;; To support UTF-16 and UTF-32 store references, the 33 bytes comprising
+  ;; this hash+dash pattern may optionally be interspersed by extra NUL bytes.
+  ;; This simple approach works because the characters we are looking for are
+  ;; restricted to ASCII.  UTF-16 hashes are interspersed with single NUL
+  ;; bytes ("\0"), and UTF-32 hashes are interspersed with triplets of NULs
+  ;; ("\0\0\0").  Note that we require NULs to be present only *between* the
+  ;; other bytes, and not at either end, in order to be insensitive to byte
+  ;; order.
+  ;;
+  ;; To accommodate large files, we do not read the entire file at once, but
+  ;; instead work on buffers of up to 'request-size' bytes.  To ensure that
+  ;; every hash+dash pattern appears in its entirety in at least one buffer,
+  ;; adjacent buffers must overlap by one byte less than the maximum size of a
+  ;; hash+dash pattern.  We accomplish this by "ungetting" a suffix of each
+  ;; buffer before reading the next buffer, unless we know that we've reached
+  ;; the end-of-file.
   (let ((buffer (make-bytevector request-size)))
-    (let loop ()
-      ;; Note: We avoid 'get-bytevector-n' to work around
-      ;; <http://bugs.gnu.org/17466>.
+    (define-syntax-rule (byte-at i)
+      (bytevector-u8-ref buffer i))
+    (let outer-loop ()
       (match (get-bytevector-n! input buffer 0 request-size)
         ((? eof-object?) 'done)
         (end
-         ;; We scan the buffer for dashes that might be preceded by a
-         ;; nix-base32 hash.  The key optimization here is that whenever we
-         ;; find a NON-nix-base32 character at position 'i', we know that it
-         ;; cannot be part of a hash, so the earliest position where the next
-         ;; hash could start is i+1 with the following dash at position i+33.
-         ;;
-         ;; Since nix-base32 characters comprise only 1/8 of the 256 possible
-         ;; byte values, and exclude some of the most common letters in
-         ;; English text (e t o u), in practice we can advance by 33 positions
-         ;; most of the time.
-         (let scan-from ((i hash-length) (written 0))
-           ;; 'i' is the first position where we look for a dash.  'written'
-           ;; is the number of bytes in the buffer that have already been
-           ;; written.
+         (define (scan-from i w)
+           ;; Scan the buffer for dashes that might be preceded by nix hashes,
+           ;; where 'i' is the minimum position where such a dash might be
+           ;; found, and 'w' is the number of bytes in the buffer that have
+           ;; been written so far.
+           ;;
+           ;; The key optimization here is that whenever we find a byte at
+           ;; position 'i' that cannot occur within a nix hash (because it's
+           ;; neither a nix-base32 character nor NUL), we can infer that the
+           ;; earliest position where the next hash could start is at i+1,
+           ;; and therefore the earliest position for the following dash is
+           ;; (+ i 1 hash-length), which is i+33.
+           ;;
+           ;; Since nix-base32-or-nul characters comprise only about 1/8 of
+           ;; the 256 possible byte values, and exclude some of the most
+           ;; common letters in English text (e t o u), we can advance 33
+           ;; positions most of the time.
            (if (< i end)
-               (let ((byte (bytevector-u8-ref buffer i)))
-                 (cond ((and (dash? byte)
-                             ;; We've found a dash.  Note that we do not know
-                             ;; whether the preceeding 32 bytes are nix-base32
-                             ;; characters, but we do not need to know.  If
-                             ;; they are not, the following lookup will fail.
-                             (lookup-replacement
-                              (string-tabulate (lambda (j)
-                                                 (integer->char
-                                                  (bytevector-u8-ref buffer
-                                                   (+ j (- i hash-length)))))
-                                               hash-length)))
-                        => (lambda (replacement)
-                             ;; We've found a hash that needs to be replaced.
-                             ;; First, write out all bytes preceding the hash
-                             ;; that have not yet been written.
-                             (put-bytevector output buffer written
-                                             (- i hash-length written))
-                             ;; Now write the replacement string.
-                             (put-bytevector output replacement)
-                             ;; Since the byte at position 'i' is a dash,
-                             ;; which is not a nix-base32 char, the earliest
-                             ;; position where the next hash might start is
-                             ;; i+1, and the earliest position where the
-                             ;; following dash might start is (+ i 1
-                             ;; hash-length).  Also, increase the write
-                             ;; position to account for REPLACEMENT.
-                             (let ((len (bytevector-length replacement)))
-                               (scan-from (+ i 1 len)
-                                          (+ i (- len hash-length))))))
-                       ;; If the byte at position 'i' is a nix-base32 char,
-                       ;; then the dash we're looking for might be as early as
-                       ;; the following byte, so we can only advance by 1.
-                       ((nix-base32-byte? byte)
-                        (scan-from (+ i 1) written))
-                       ;; If the byte at position 'i' is NOT a nix-base32
-                       ;; char, then the earliest position where the next hash
-                       ;; might start is i+1, with the following dash at
-                       ;; position (+ i 1 hash-length).
+               (let ((byte (byte-at i)))
+                 (cond ((dash? byte)
+                        (found-dash i w))
+                       ((nix-base32-byte-or-nul? byte)
+                        (scan-from (+ i 1) w))
                        (else
-                        (scan-from (+ i 1 hash-length) written))))
-
-               ;; We have finished scanning the buffer.  Now we determine how
-               ;; many bytes have not yet been written, and how many bytes to
-               ;; "unget".  If 'end' is less than 'request-size' then we read
-               ;; less than we asked for, which indicates that we are at EOF,
-               ;; so we needn't unget anything.  Otherwise, we unget up to
-               ;; 'hash-length' bytes (32 bytes).  However, we must be careful
-               ;; not to unget bytes that have already been written, because
-               ;; that would cause them to be written again from the next
-               ;; buffer.  In practice, this case occurs when a replacement is
-               ;; made near or beyond the end of the buffer.  When REPLACEMENT
-               ;; went beyond END, we consume the extra bytes from INPUT.
-               (begin
-                 (if (> written end)
-                     (get-bytevector-n! input buffer 0 (- written end))
-                     (let* ((unwritten  (- end written))
-                            (unget-size (if (= end request-size)
-                                            (min hash-length unwritten)
-                                            0))
-                            (write-size (- unwritten unget-size)))
-                       (put-bytevector output buffer written write-size)
-                       (unget-bytevector input buffer (+ written write-size)
-                                         unget-size)))
-                 (loop)))))))))
+                        (not-part-of-hash i w))))
+               (finish-buffer i w)))
+
+         (define (not-part-of-hash i w)
+           ;; Position 'i' is known to not be within a nix hash.  Therefore,
+           ;; the earliest position where the next hash might start is i+1,
+           ;; with the following dash at position (+ i 1 hash-length).
+           (scan-from (+ i 1 hash-length) w))
+
+         (define (found-dash i w)
+           (cond ((not (zero? (byte-at (- i 1))))
+                  (found-possible-hash 1 i w))
+                 ((not (zero? (byte-at (- i 2))))
+                  (if (possible-utf16-hash? buffer i w)
+                      (found-possible-hash 2 i w)
+                      (not-part-of-hash i w)))
+                 (else
+                  (if (possible-utf32-hash? buffer i w)
+                      (found-possible-hash 4 i w)
+                      (not-part-of-hash i w)))))
+
+         (define (found-possible-hash char-size i w)
+           (let* ((hash (string-tabulate
+                         (lambda (j)
+                           (integer->char
+                            (byte-at (- i (* char-size
+                                             (- hash-length j))))))
+                         hash-length))
+                  (replacement* (lookup-replacement hash))
+                  (replacement (and replacement*
+                                    (insert-nuls char-size replacement*))))
+             (cond
+              ((not replacement)
+               (not-part-of-hash i w))
+              (else
+               ;; We've found a hash that needs to be replaced.
+               ;; First, write out all bytes preceding the hash
+               ;; that have not yet been written.
+               (put-bytevector output buffer w
+                               (- i (* char-size hash-length) w))
+               ;; Now write the replacement string.
+               (put-bytevector output replacement)
+               ;; Now compute the new value of 'w' and
+               ;; the new value of 'i', and iterate.
+               (let ((w (+ (- i (* char-size hash-length))
+                           (bytevector-length replacement))))
+                 (scan-from (+ w hash-length) w))))))
+
+         (define (finish-buffer i w)
+           ;; We have finished scanning the buffer.  Now we determine how
+           ;; many bytes have not yet been written, and how many bytes to
+           ;; "unget".  If 'end' is less than 'request-size' then we read
+           ;; less than we asked for, which indicates that we are at EOF,
+           ;; so we needn't unget anything.  Otherwise, we unget up to
+           ;; (* 4 hash-length) bytes.  However, we must be careful not to
+           ;; unget bytes that have already been written, because that
+           ;; would cause them to be written again from the next buffer.
+           ;; In practice, this case occurs when a replacement is made
+           ;; near or beyond the end of the buffer.  When REPLACEMENT went
+           ;; beyond END, we consume the extra bytes from INPUT.
+           (if (> w end)
+               (get-bytevector-n! input buffer 0 (- w end))
+               (let* ((unwritten  (- end w))
+                      (unget-size (if (= end request-size)
+                                      (min (* 4 hash-length)
+                                           unwritten)
+                                      0))
+                      (write-size (- unwritten unget-size)))
+                 (put-bytevector output buffer w write-size)
+                 (unget-bytevector input buffer (+ w write-size)
+                                   unget-size)))
+           (outer-loop))
+
+         (scan-from hash-length 0))))))
 
 (define (rename-matching-files directory mapping)
   "Apply MAPPING to the names of all the files in DIRECTORY, where MAPPING is
diff --git a/tests/grafts.scm b/tests/grafts.scm
index a12c6a5911..0e1c7355b1 100644
--- a/tests/grafts.scm
+++ b/tests/grafts.scm
@@ -1,5 +1,6 @@
 ;;; GNU Guix --- Functional package management for GNU
 ;;; Copyright © 2014, 2015, 2016, 2017, 2018, 2019 Ludovic Courtès <ludo@gnu.org>
+;;; Copyright © 2021 Mark H Weaver <mhw@netris.org>
 ;;;
 ;;; This file is part of GNU Guix.
 ;;;
@@ -468,4 +469,71 @@
          replacement
          "/gnu/store")))))
 
+(define (nul-expand str char-size)
+  (string-join (map string (string->list str))
+               (make-string (- char-size 1) #\nul)))
+
+(for-each
+ (lambda (char-size1)
+   (for-each
+    (lambda (char-size2)
+      (for-each
+       (lambda (gap)
+        (for-each
+         (lambda (offset)
+           (test-equal (format #f "replace-store-references, char-sizes ~a ~a, gap ~s, offset ~a"
+                               char-size1 char-size2 gap offset)
+             (string-append (make-string offset #\=)
+                            (nul-expand (string-append "/gnu/store/"
+                                                       (make-string 32 #\6)
+                                                       "-BlahBlaH")
+                                        char-size1)
+                            gap
+                            (nul-expand (string-append "/gnu/store/"
+                                                       (make-string 32 #\8)
+                                                       "-SoMeTHiNG")
+                                        char-size2)
+                            (list->string (map integer->char (iota 77 33))))
+
+             ;; Create input data where the right-hand-size of the dash ("-something"
+             ;; here) goes beyond the end of the internal buffer of
+             ;; 'replace-store-references'.
+             (let* ((content     (string-append (make-string offset #\=)
+                                                (nul-expand (string-append "/gnu/store/"
+                                                                           (make-string 32 #\5)
+                                                                           "-blahblah")
+                                                            char-size1)
+                                                gap
+                                                (nul-expand (string-append "/gnu/store/"
+                                                                           (make-string 32 #\7)
+                                                                           "-something")
+                                                            char-size2)
+                                                (list->string
+                                                 (map integer->char (iota 77 33)))))
+                    (replacement (alist->vhash
+                                  `((,(make-string 32 #\5)
+                                     . ,(string->utf8 (string-append
+                                                       (make-string 32 #\6)
+                                                       "-BlahBlaH")))
+                                    (,(make-string 32 #\7)
+                                     . ,(string->utf8 (string-append
+                                                       (make-string 32 #\8)
+                                                       "-SoMeTHiNG")))))))
+               (call-with-output-string
+                 (lambda (output)
+                   ((@@ (guix build graft) replace-store-references)
+                    (open-input-string content) output
+                    replacement
+                    "/gnu/store"))))))
+         ;; offsets to test
+         (map (lambda (i) (- buffer-size (* 40 char-size1) i))
+              (iota 30))))
+       ;; gaps
+       '("" "-" " " "a")))
+    ;; char-size2 values to test
+    '(1 2)))
+ ;; char-size1 values to test
+ '(1 2 4))
+
+
 (test-end)
-- 
2.31.1