unofficial mirror of guix-devel@gnu.org 
 help / color / mirror / code / Atom feed
From: Timothy Sample <samplet@ngyro.com>
To: zimoun <zimon.toutoune@gmail.com>
Cc: Guix Devel <guix-devel@gnu.org>
Subject: Re: ImageMagick from 2020?
Date: Sat, 22 Jan 2022 11:48:17 -0500	[thread overview]
Message-ID: <87pmoj7m3y.fsf@ngyro.com> (raw)
In-Reply-To: <CAJ3okZ3gbi-SxPBWUkVk3G8_332BdZ=ohpngUbWCOmF2V2ZXmw@mail.gmail.com> (zimoun's message of "Wed, 19 Jan 2022 12:30:52 +0100")

[-- Attachment #1: Type: text/plain, Size: 2074 bytes --]

Hey,

zimoun <zimon.toutoune@gmail.com> writes:

> On Wed, 19 Jan 2022 at 11:36, Ludovic Courtès <ludo@gnu.org> wrote:
>
>> Oh right, so we’ll need to feed them historical ‘sources.json’ files
>> eventually, I think Timothy was planning to do that eventually.
>
> From my side, what I would like to achieve soon:
>
> [...]
>
> Then what is also missing is:
>
>  3- have a collection of sources.json per Guix revision -- at least
> some determined by us;

I’ve attached a script that makes a “sources.json” per commit from the
PoG database.  It only lists regularly downloaded sources (no VCS
sources), since that’s all the SWH loader supports so far.

I also played around with it and came up with

    https://ngyro.com/pog-reports/2022-01-16/missing-sources.json

This is a “sources.json” that only lists the “missing” and “unknown”
sources from the PoG report.  It lists sources across all commits (since
1.0.0).  This might be the easiest thing for SWH to handle, since it
omits nearly 20k sources that they definitely already have.  Since they
don’t have the tarball hashes, they have no way to skip downloading and
processing tarballs that they already have by hash.  Hence, filtering it
with the extra data we have through the PoG projects should be something
that they welcome!

If they want, they could point a loader task at

   https://ngyro.com/pog-reports/latest/missing-sources.json

and I could publish updates when I publish new PoG reports.

There’s one other thing to think about.  Some of our sources are
arguably unsuitable for SWH.  For instance, our bootstrap binaries.  I
bet we have a bunch of other borderline things, too, like game assets.
Of course, if they are indiscriminately ingesting Github, I’m sure
they’ve loaded plenty of garbage.  Mostly, I think about these things
because I believe it’s important to maintain the Guix-SWH relationship.

Here’s the per-commit script.  You can run it like this:

    $ guile sources.scm pog.db output-directory


[-- Attachment #2: sources.scm --]
[-- Type: text/plain, Size: 3721 bytes --]

(use-modules (gcrypt base64)
             (guix base32)
             (guix build download)
             ((guix download) #:select (%mirrors))
             (ice-9 match)
             (json)
             (sqlite3)
             (srfi srfi-1)
             (srfi srfi-9 gnu)
             (srfi srfi-19)
             (web uri))

(define-immutable-record-type <commit>
  (make-commit push-time hash)
  commit?
  (push-time commit-push-time)
  (hash commit-hash))

(define lookup-commits-query "\
SELECT c.push_time,
    c.hash
FROM commits c
ORDER BY c.push_time DESC")

(define (lookup-commits db)
  (define (record->commit rec)
    (match-let ((#(push-time hash) rec))
      (make-commit (and push-time (make-time time-utc 0 push-time))
                   hash)))
  (define (kons rec acc)
    (cons (record->commit rec) acc))
  (let* ((stmt (sqlite-prepare db lookup-commits-query))
         (commits (sqlite-fold kons '() stmt)))
    (sqlite-finalize stmt)
    commits))

(define lookup-sources-query "\
SELECT f.hash,
    fr.reference
FROM commits c
    JOIN fod_commit_links fcl USING (commit_id)
    JOIN fods f USING (fod_id)
    JOIN fod_references fr USING (fod_id)
WHERE c.hash = ?
    AND f.algorithm = 'sha256'
    AND (fr.reference LIKE '\"%'
        OR fr.reference LIKE '(\"%')
    AND NOT fr.is_error")

(define (nix-base32-sha256->subresource-integrity digest)
  "Convert the Nix-style base32-encoded SHA-256 hash DIGEST into a
Subresource Integrity metadata value."
  (define bv (nix-base32-string->bytevector digest))
  (define b64 (base64-encode bv))
  (string-append "sha256-" b64))

(define (web-reference-urls reference)
  (define uris
    (match (call-with-input-string reference read)
      ((urls ...) (map string->uri urls))
      (url (list (string->uri url)))))
  (append-map (lambda (uri)
                (map uri->string
                     (maybe-expand-mirrors uri %mirrors)))
              uris))

(define (lookup-sources db commit)
  (define (record->url-source rec)
    (match-let ((#(digest reference) rec))
      (let ((urls (web-reference-urls reference))
            (integrity (nix-base32-sha256->subresource-integrity digest)))
        `(("type" . "url")
          ("urls" . ,(list->vector urls))
          ("integrity" . ,integrity)))))
  (define (kons rec acc)
    (cons (record->url-source rec) acc))
  (let* ((stmt (sqlite-prepare db lookup-sources-query))
         (_ (sqlite-bind-arguments stmt commit))
         (sources (sqlite-fold kons '() stmt)))
    (sqlite-finalize stmt)
    sources))

(define (commit-sources-name directory commit)
  (string-append directory
                 "/"
                 (date->string (time-utc->date (commit-push-time commit))
                               "~Y-~m-~d")
                 "-"
                 (string-take (commit-hash commit) 7)
                 "-sources.json"))

(match (program-arguments)
  ((_ db-file directory)
   (mkdir directory)
   (let* ((db (sqlite-open db-file))
          (commits (lookup-commits db)))
     (for-each (lambda (commit)
                 (call-with-output-file (commit-sources-name directory commit)
                   (lambda (out)
                     (let* ((hash (commit-hash commit))
                            (sources (lookup-sources db hash)))
                       (scm->json `(("version" . "1")
                                    ("revision" . ,hash)
                                    ("sources" . ,(list->vector sources)))
                                  out)
                       (newline out)))))
               (list (car commits)))
     (sqlite-close db)
     (exit EXIT_SUCCESS)))
  (_ (display "usage: sources.scm DB-FILE\n" (current-error-port))
     (exit EXIT_FAILURE)))

[-- Attachment #3: Type: text/plain, Size: 9 bytes --]



-- Tim

  reply	other threads:[~2022-01-22 16:48 UTC|newest]

Thread overview: 10+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2022-01-10 17:20 ImageMagick from 2020? zimoun
2022-01-10 18:01 ` Timothy Sample
2022-01-10 18:36   ` Maxime Devos
2022-01-10 18:59     ` zimoun
2022-01-18 14:38     ` Ludovic Courtès
2022-01-18 14:49       ` zimoun
2022-01-19 10:36         ` Ludovic Courtès
2022-01-19 11:30           ` zimoun
2022-01-22 16:48             ` Timothy Sample [this message]
2022-01-24 15:00               ` Ludovic Courtès

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

  List information: https://guix.gnu.org/

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=87pmoj7m3y.fsf@ngyro.com \
    --to=samplet@ngyro.com \
    --cc=guix-devel@gnu.org \
    --cc=zimon.toutoune@gmail.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
Code repositories for project(s) associated with this public inbox

	https://git.savannah.gnu.org/cgit/guix.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).