;;; Copyright © 2020 Ludovic Courtès ;;; Hereby placed under the GNU General Public License, version 3 or later. (define-module (similarities) #:use-module (json) #:use-module ((gcrypt hash) #:select (open-sha256-port)) #:use-module ((guix store) #:select (%default-substitute-urls)) #:use-module ((guix progress) #:hide (dump-port*)) #:use-module (guix http-client) #:use-module (guix serialization) #:use-module (guix scripts substitute) #:use-module (srfi srfi-1) #:use-module (srfi srfi-9) #:use-module (srfi srfi-11) #:use-module (rnrs bytevectors) #:use-module (charting) #:use-module (ice-9 match)) ;;; ;;; Data Service client. ;;; (define-json-mapping make-package-instance package-instance? json->package-instance (version package-instance-version) (output package-instance-output "derivation")) (define %data-service-base-url (make-parameter "https://data.guix.gnu.org")) (define* (package-instances package #:key (branch "master")) "Return a list of representing instances of PACKAGE over time known to the Data Service." (match (assoc-ref (json->scm (http-fetch (string-append (%data-service-base-url) "/repository/1/branch/" branch "/package/" package "/output-history.json"))) "derivations") (#(lst ...) (map json->package-instance lst)) (#f #f))) ;;; ;;; Similarity measurement. ;;; (define (port-sha256* port size) ;from (guix scripts challenge) ;; Like 'port-sha256', but limited to SIZE bytes. (let-values (((out get) (open-sha256-port))) (dump-port* port out size) (close-port out) (get))) (define-record-type (file-info name type size sha256) file-info? (name file-info-name) (type file-info-type) (size file-info-size) (sha256 file-info-sha256)) (define (archive-contents port) "Return a list of records from the nar read from PORT." ;; As in (guix scripts challenge), but return records that ;; include file size and ignore symlinks. (fold-archive (lambda (file type contents result) (match type ((or 'regular 'executable) (match contents ((port . size) (cons (file-info file type size (port-sha256* port size)) result)))) ('directory result) ('directory-complete result) ('symlink result))) '() port "")) (define (narinfo-contents narinfo) ;from (guix scripts challenge) "Fetch the nar described by NARINFO and return a list representing the file it contains." ((@@ (guix scripts challenge) call-with-nar) narinfo archive-contents)) (define (at-most max-length lst) ;from (guix scripts substitute) "If LST is shorter than MAX-LENGTH, return it and the empty list; otherwise return its MAX-LENGTH first elements and its tail." (let loop ((len 0) (lst lst) (result '())) (match lst (() (values (reverse result) '())) ((head . tail) (if (>= len max-length) (values (reverse result) lst) (loop (+ 1 len) tail (cons head result))))))) (define* (package-archive-contents package #:key (max 10) (substitute-urls %default-substitute-urls)) "Look at the MAX latest instances of PACKAGE, fetch them, and return a summary of their contents as returned by 'narinfo-contents'." (let ((instances (at-most max (package-instances package)))) (map narinfo-contents (lookup-narinfos (first substitute-urls) (map package-instance-output instances))))) (define (similarity contents1 contents2) "Return two values: the ratio of identical bytes between CONTENTS2 and CONTENTS2, and the ratio of identical files." (define (matches name) (lambda (info) (string=? (file-info-name info) name))) (let ((files (delete-duplicates (append (map file-info-name contents1) (map file-info-name contents2))))) (let loop ((files files) (seen 0) (identical 0) (seen-bytes 0) (identical-bytes 0)) (match files (() (values (/ identical-bytes seen-bytes) (/ identical seen))) ((head . tail) (let ((file1 (find (matches head) contents1)) (file2 (find (matches head) contents2))) (cond ((not file1) (loop tail (+ seen 1) identical (+ seen-bytes (file-info-size file2)) identical-bytes)) ((not file2) (loop tail (+ seen 1) identical (+ seen-bytes (file-info-size file1)) identical-bytes)) (else (let ((identical? (and (= (file-info-size file1) (file-info-size file2)) (bytevector=? (file-info-sha256 file1) (file-info-sha256 file2))))) (loop tail (+ seen 1) (if identical? (+ identical 1) identical) (+ seen-bytes (max (file-info-size file1) (file-info-size file2))) (if identical? (+ identical-bytes (file-info-size file1)) identical-bytes))))))))))) (define (pairwise-similarities contents) (let loop ((contents contents) (similarities '())) (match contents ((or () (_)) (reverse similarities)) ((a b . tail) (loop (cons b tail) (cons (similarity a b) similarities)))))) (define* (similarity-chart packages file #:key (max 20)) (make-bar-chart "Similarity across subsequent package revisions" (map (lambda (package) (let* ((contents (package-archive-contents package #:max max)) (similarities (pairwise-similarities contents)) (labels (iota (length similarities)))) `(,package ,@(map (lambda (label ratio) `(,(* ratio 100.) ,(number->string label))) labels similarities)))) packages) #:chart-params '(#:x-axis-label "package" #:y-axis-label "similarity ratio (%)") #:write-to-png file))