From 0fcfd804570fd1c07ffb1f6c176d6ec3430907df Mon Sep 17 00:00:00 2001 From: Mark H Weaver Date: Fri, 2 Apr 2021 18:36:23 -0400 Subject: [PATCH] DRAFT: grafts: Add support for UTF-16 and UTF-32 store references. --- guix/build/graft.scm | 138 +++++++++++++++++++++++++++++-------------- tests/grafts.scm | 68 +++++++++++++++++++++ 2 files changed, 162 insertions(+), 44 deletions(-) diff --git a/guix/build/graft.scm b/guix/build/graft.scm index c119ee71d1..6e7f3859cb 100644 --- a/guix/build/graft.scm +++ b/guix/build/graft.scm @@ -1,6 +1,6 @@ ;;; GNU Guix --- Functional package management for GNU ;;; Copyright © 2014, 2015, 2016, 2018 Ludovic Courtès -;;; Copyright © 2016 Mark H Weaver +;;; Copyright © 2016, 2021 Mark H Weaver ;;; ;;; This file is part of GNU Guix. ;;; @@ -55,6 +55,36 @@ (string->char-set "0123456789abcdfghijklmnpqrsvwxyz") <>)) +(define (nix-base32-char-or-nul? byte) + (or (nix-base32-char? byte) + (char=? byte #\nul))) + +(define (has-utf16-zeroes? buffer i) + (let loop ((j (+ 1 (- i (* 2 hash-length))))) + (or (>= j i) + (and (zero? (bytevector-u8-ref buffer j)) + (loop (+ j 2)))))) + +(define (has-utf32-zeroes? buffer i) + (let loop ((j (+ 1 (- i (* 4 hash-length))))) + (or (>= j i) + (and (zero? (bytevector-u8-ref buffer j)) + (zero? (bytevector-u8-ref buffer (+ j 1))) + (zero? (bytevector-u8-ref buffer (+ j 2))) + (loop (+ j 4)))))) + +(define (expand-bytevector bv char-size) + (let* ((len (bytevector-length bv)) + (bv* (make-bytevector (+ 1 (* char-size + (- len 1))) + 0))) + (let loop ((i 0)) + (when (< i len) + (bytevector-u8-set! bv* (* i char-size) + (bytevector-u8-ref bv i)) + (loop (+ i 1)))) + bv*)) + (define* (replace-store-references input output replacement-table #:optional (store (%store-directory))) "Read data from INPUT, replacing store references according to @@ -76,15 +106,16 @@ bytevectors to the same value." (list->vector (map pred (iota 256))) <>)) - (define nix-base32-byte? + (define nix-base32-byte-or-nul? (optimize-u8-predicate - (compose nix-base32-char? + (compose nix-base32-char-or-nul? integer->char))) (define (dash? byte) (= byte 45)) (define request-size (expt 2 20)) ; 1 MiB + ;; XXX This comment is no longer accurate! ;; We scan the file for the following 33-byte pattern: 32 bytes of ;; nix-base32 characters followed by a dash. To accommodate large files, ;; we do not read the entire file, but instead work on buffers of up to @@ -116,43 +147,61 @@ bytevectors to the same value." ;; written. (if (< i end) (let ((byte (bytevector-u8-ref buffer i))) - (cond ((and (dash? byte) - ;; We've found a dash. Note that we do not know - ;; whether the preceeding 32 bytes are nix-base32 - ;; characters, but we do not need to know. If - ;; they are not, the following lookup will fail. - (lookup-replacement - (string-tabulate (lambda (j) - (integer->char - (bytevector-u8-ref buffer - (+ j (- i hash-length))))) - hash-length))) - => (lambda (replacement) - ;; We've found a hash that needs to be replaced. - ;; First, write out all bytes preceding the hash - ;; that have not yet been written. - (put-bytevector output buffer written - (- i hash-length written)) - ;; Now write the replacement string. - (put-bytevector output replacement) - ;; Since the byte at position 'i' is a dash, - ;; which is not a nix-base32 char, the earliest - ;; position where the next hash might start is - ;; i+1, and the earliest position where the - ;; following dash might start is (+ i 1 - ;; hash-length). Also, increase the write - ;; position to account for REPLACEMENT. - (let ((len (bytevector-length replacement))) - (scan-from (+ i 1 len) - (+ i (- len hash-length)))))) - ;; If the byte at position 'i' is a nix-base32 char, + (cond ((dash? byte) + (let* ((char-size + (if (zero? (bytevector-u8-ref buffer (- i 1))) + (if (zero? (bytevector-u8-ref buffer (- i 2))) + (if (and (<= (* 4 hash-length) + (- i written)) + (has-utf32-zeroes? buffer i)) + 4 + 1) + (if (and (<= (* 2 hash-length) + (- i written)) + (has-utf16-zeroes? buffer i)) + 2 + 1)) + 1)) + (replacement* + (lookup-replacement + (string-tabulate (lambda (j) + (integer->char + (bytevector-u8-ref buffer + (- i (* char-size + (- hash-length j)))))) + hash-length))) + (replacement + (and replacement* + (expand-bytevector replacement* + char-size)))) + (if replacement + (begin + ;; We've found a hash that needs to be replaced. + ;; First, write out all bytes preceding the hash + ;; that have not yet been written. + (put-bytevector output buffer written + (- i (* char-size hash-length) written)) + ;; Now write the replacement string. + (put-bytevector output replacement) + ;; Now compute the new value of 'written' and + ;; the new value of 'i', and iterate. + (let ((written (+ (- i (* char-size hash-length)) + (bytevector-length replacement)))) + (scan-from (+ written hash-length) written))) + ;; The byte at position 'i' is a dash, which is + ;; not a nix-base32 char, so the earliest + ;; position where the next hash might start is + ;; i+1, with the following dash at position (+ i + ;; 1 hash-length). + (scan-from (+ i 1 hash-length) written)))) + ;; If the byte at position 'i' is a nix-base32 char or nul, ;; then the dash we're looking for might be as early as ;; the following byte, so we can only advance by 1. - ((nix-base32-byte? byte) + ((nix-base32-byte-or-nul? byte) (scan-from (+ i 1) written)) - ;; If the byte at position 'i' is NOT a nix-base32 - ;; char, then the earliest position where the next hash - ;; might start is i+1, with the following dash at + ;; If the byte at position 'i' is NOT a nix-base32 char + ;; or nul, then the earliest position where the next + ;; hash might start is i+1, with the following dash at ;; position (+ i 1 hash-length). (else (scan-from (+ i 1 hash-length) written)))) @@ -162,18 +211,19 @@ bytevectors to the same value." ;; "unget". If 'end' is less than 'request-size' then we read ;; less than we asked for, which indicates that we are at EOF, ;; so we needn't unget anything. Otherwise, we unget up to - ;; 'hash-length' bytes (32 bytes). However, we must be careful - ;; not to unget bytes that have already been written, because - ;; that would cause them to be written again from the next - ;; buffer. In practice, this case occurs when a replacement is - ;; made near or beyond the end of the buffer. When REPLACEMENT - ;; went beyond END, we consume the extra bytes from INPUT. + ;; (* 4 hash-length) bytes. However, we must be careful not to + ;; unget bytes that have already been written, because that + ;; would cause them to be written again from the next buffer. + ;; In practice, this case occurs when a replacement is made + ;; near or beyond the end of the buffer. When REPLACEMENT went + ;; beyond END, we consume the extra bytes from INPUT. (begin (if (> written end) (get-bytevector-n! input buffer 0 (- written end)) (let* ((unwritten (- end written)) (unget-size (if (= end request-size) - (min hash-length unwritten) + (min (* 4 hash-length) + unwritten) 0)) (write-size (- unwritten unget-size))) (put-bytevector output buffer written write-size) diff --git a/tests/grafts.scm b/tests/grafts.scm index a12c6a5911..0e1c7355b1 100644 --- a/tests/grafts.scm +++ b/tests/grafts.scm @@ -1,5 +1,6 @@ ;;; GNU Guix --- Functional package management for GNU ;;; Copyright © 2014, 2015, 2016, 2017, 2018, 2019 Ludovic Courtès +;;; Copyright © 2021 Mark H Weaver ;;; ;;; This file is part of GNU Guix. ;;; @@ -468,4 +469,71 @@ replacement "/gnu/store"))))) +(define (nul-expand str char-size) + (string-join (map string (string->list str)) + (make-string (- char-size 1) #\nul))) + +(for-each + (lambda (char-size1) + (for-each + (lambda (char-size2) + (for-each + (lambda (gap) + (for-each + (lambda (offset) + (test-equal (format #f "replace-store-references, char-sizes ~a ~a, gap ~s, offset ~a" + char-size1 char-size2 gap offset) + (string-append (make-string offset #\=) + (nul-expand (string-append "/gnu/store/" + (make-string 32 #\6) + "-BlahBlaH") + char-size1) + gap + (nul-expand (string-append "/gnu/store/" + (make-string 32 #\8) + "-SoMeTHiNG") + char-size2) + (list->string (map integer->char (iota 77 33)))) + + ;; Create input data where the right-hand-size of the dash ("-something" + ;; here) goes beyond the end of the internal buffer of + ;; 'replace-store-references'. + (let* ((content (string-append (make-string offset #\=) + (nul-expand (string-append "/gnu/store/" + (make-string 32 #\5) + "-blahblah") + char-size1) + gap + (nul-expand (string-append "/gnu/store/" + (make-string 32 #\7) + "-something") + char-size2) + (list->string + (map integer->char (iota 77 33))))) + (replacement (alist->vhash + `((,(make-string 32 #\5) + . ,(string->utf8 (string-append + (make-string 32 #\6) + "-BlahBlaH"))) + (,(make-string 32 #\7) + . ,(string->utf8 (string-append + (make-string 32 #\8) + "-SoMeTHiNG"))))))) + (call-with-output-string + (lambda (output) + ((@@ (guix build graft) replace-store-references) + (open-input-string content) output + replacement + "/gnu/store")))))) + ;; offsets to test + (map (lambda (i) (- buffer-size (* 40 char-size1) i)) + (iota 30)))) + ;; gaps + '("" "-" " " "a"))) + ;; char-size2 values to test + '(1 2))) + ;; char-size1 values to test + '(1 2 4)) + + (test-end) -- 2.31.1