unofficial mirror of gwl-devel@gnu.org
* support for containers
@ 2019-01-28 23:03 Ricardo Wurmus
  2019-01-29  9:38 ` Ricardo Wurmus
  2019-01-29 10:22 ` zimoun
  0 siblings, 2 replies; 12+ messages in thread
From: Ricardo Wurmus @ 2019-01-28 23:03 UTC (permalink / raw)
  To: gwl-devel

Hi,

the GWL could already support execution in containers with this patch:

--8<---------------cut here---------------start------------->8---
diff --git a/gwl/processes.scm b/gwl/processes.scm
index b7251db..9ec5925 100644
--- a/gwl/processes.scm
+++ b/gwl/processes.scm
@@ -19,13 +19,20 @@
   #:use-module ((guix derivations)
                 #:select (derivation->output-path
                           build-derivations))
+  #:use-module ((guix packages)
+                #:select (package-file))
   #:use-module (guix gexp)
-  #:use-module ((guix monads) #:select (mlet return))
+  #:use-module ((guix monads) #:select (mlet mapm return))
   #:use-module (guix records)
   #:use-module ((guix store)
                 #:select (open-connection
                           run-with-store
+                          with-store
                           %store-monad))
+  #:use-module ((guix modules)
+                #:select (source-module-closure))
+  #:use-module (gnu system file-systems)
+  #:use-module (gnu build linux-container)
   #:use-module (ice-9 format)
   #:use-module (ice-9 match)
   #:use-module (srfi srfi-1)
@@ -232,34 +239,82 @@ of PROCESS."
   (arguments code-snippet-arguments)
   (code      code-snippet-code))

-(define (procedure->gexp process)
+(define* (procedure->gexp process #:key (container? #t))
   "Transform the procedure of PROCESS to a G-expression or return the
 plain S-expression."
   (define (sanitize-path path)
     (string-join (delete ".." (string-split path #\/))
                  "/"))
-  (match (process-procedure process)
-    ((? gexp? g) g)
-    ((? list? s) s)
-    (($ <code-snippet> name arguments code)
-     (let ((call (or (and=> (find (lambda (lang)
-                                    (eq? name (language-name lang)))
-                                  languages)
-                            language-call)
-                     ;; There is no pre-defined way to execute the
-                     ;; snippet.  Use generic approach.
-                     (lambda (process code)
-                       #~(begin
-                           (for-each (lambda (pair)
-                                       (setenv (car pair) (cdr pair)))
-                                     '#$(process->env process))
-                           (apply system*
-                                  (string-append (getenv "_GWL_PROFILE")
-                                                 #$(sanitize-path (symbol->string name)))
-                                  '#$(append arguments
-                                             (list code))))))))
-       (call process code)))
-    (whatever (error (format #f "unsupported procedure: ~a\n" whatever)))))
+  (define contents
+    (match (process-procedure process)
+      ((? gexp? g) g)
+      ((? list? s) s)
+      (($ <code-snippet> name arguments code)
+       (let ((call (or (and=> (find (lambda (lang)
+                                      (eq? name (language-name lang)))
+                                    languages)
+                              language-call)
+                       ;; There is no pre-defined way to execute the
+                       ;; snippet.  Use generic approach.
+                       (lambda (process code)
+                         #~(begin
+                             (for-each (lambda (pair)
+                                         (setenv (car pair) (cdr pair)))
+                                       '#$(process->env process))
+                             (apply system*
+                                    (string-append (getenv "_GWL_PROFILE")
+                                                   #$(sanitize-path (symbol->string name)))
+                                    '#$(append arguments
+                                               (list code))))))))
+         (call process code)))
+      (whatever (error (format #f "unsupported procedure: ~a\n" whatever)))))
+
+  (if container?
+      (let* ((package-dirs
+              (with-store store
+                (run-with-store store
+                  (mapm %store-monad package-file
+                        (process-package-inputs process)))))
+             (data-input-dirs
+              (delete-duplicates
+               (map dirname (process-data-inputs process))))
+             (output-dirs
+              (delete-duplicates
+               (map dirname (process-outputs process))))
+             (input-mappings
+              (map (lambda (dir)
+                     (file-system-mapping
+                      (source dir)
+                      (target dir)
+                      (writable? #f)))
+                   (lset-difference string=?
+                                    (append package-dirs
+                                            data-input-dirs)
+                                    output-dirs)))
+             (output-mappings
+              (map (lambda (dir)
+                     (file-system-mapping
+                      (source dir)
+                      (target dir)
+                      (writable? #t)))
+                   output-dirs))
+             (specs
+              (map (compose file-system->spec
+                            file-system-mapping->bind-mount)
+                   (append input-mappings
+                           output-mappings))))
+        (with-imported-modules (source-module-closure
+                                '((gnu build linux-container)
+                                  (gnu system file-systems)))
+          #~(begin
+              (use-modules (gnu build linux-container)
+                           (gnu system file-systems))
+              (call-with-container (append %container-file-systems
+                                           (map spec->file-system
+                                                '#$specs))
+                (lambda ()
+                  #$contents)))))
+      contents))

 ;;; ---------------------------------------------------------------------------
 ;;; ADDITIONAL FUNCTIONS
--8<---------------cut here---------------end--------------->8---

The directories to be mounted in the container are derived from the
declared inputs and outputs.  The only problem is that inputs are
read-only in this implementation.  I like it this way, actually, but it
means that the extended example workflow won’t work as it tries to
delete its inputs.

Should data inputs be declared as (mutable-file …) or (file …) instead
of being plain strings?
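
For illustration, here is one way such a declaration could look.  This is a
purely hypothetical sketch: `file' and `mutable-file' are not existing GWL
constructs, and it assumes (guix records) and (gnu system file-systems).

--8<---------------cut here---------------start------------->8---
(use-modules (guix records)
             (gnu system file-systems))

;; Hypothetical sketch: a <data-file> record distinguishing read-only
;; inputs from mutable ones.
(define-record-type* <data-file>
  data-file make-data-file data-file?
  (name      data-file-name)
  (writable? data-file-writable? (default #f)))

(define (file name)         (data-file (name name)))
(define (mutable-file name) (data-file (name name) (writable? #t)))

;; Each declared file would translate directly into a container mapping.
(define (data-file->mapping f)
  (file-system-mapping
   (source (data-file-name f))
   (target (data-file-name f))
   (writable? (data-file-writable? f))))
--8<---------------cut here---------------end--------------->8---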

--
Ricardo

* Re: support for containers
  2019-01-28 23:03 support for containers Ricardo Wurmus
@ 2019-01-29  9:38 ` Ricardo Wurmus
  2019-01-29 10:39   ` zimoun
  2019-01-29 10:22 ` zimoun
  1 sibling, 1 reply; 12+ messages in thread
From: Ricardo Wurmus @ 2019-01-29  9:38 UTC (permalink / raw)
  To: gwl-devel


Ricardo Wurmus <rekado@elephly.net> writes:

> the GWL could already support execution in containers with this patch:
[…]
> The directories to be mounted in the container are derived from the
> declared inputs and outputs.  The only problem is that inputs are
> read-only in this implementation.  I like it this way, actually, but it
> means that the extended example workflow won’t work as it tries to
> delete its inputs.
>
> Should data inputs be declared as (mutable-file …) or (file …) instead
> of being plain strings?

Taking a step back I think it is worth remembering that ultimately we’d
like to have output caching via an immutable data store.  I think
declaring files as mutable would be a mistake.  Garbage collection is
better than modifying output files.

Some thoughts on how the data store should work: it’s easy to add stuff
to the store (we just hash the inputs leading up to the output,
excluding the output itself); but how do we make store items available
to a process?  When using containers we can bind mount the file to the
declared input location; when not using containers we need to link or
copy the file from the store to that location.
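
A very rough sketch of what that could mean in code; the procedure names
here are made up, only `file-system-mapping' and `symlink' are real.

--8<---------------cut here---------------start------------->8---
(use-modules (gnu system file-systems))

;; With containers: expose the store item at the declared location,
;; read-only.
(define (store-item->mapping store-item declared-location)
  (file-system-mapping
   (source store-item)
   (target declared-location)
   (writable? #f)))

;; Without containers: make the store item available by linking it to
;; the declared location.
(define (store-item->link store-item declared-location)
  (symlink store-item declared-location))
--8<---------------cut here---------------end--------------->8---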

I also want to have finer control over inputs.  Only declared input
*files* should be available in the container, not whole *directories*.
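
Compared with the patch above, which maps the result of
(map dirname (process-data-inputs process)), per-file control could look
roughly like this — an untested sketch in the same module context as
gwl/processes.scm:

--8<---------------cut here---------------start------------->8---
;; Map each declared data input file individually, instead of its
;; parent directory, so that only declared *files* are visible.
(define (input-file-mappings process)
  (map (lambda (file)
         (file-system-mapping
          (source file)
          (target file)
          (writable? #f)))
       (delete-duplicates (process-data-inputs process))))
--8<---------------cut here---------------end--------------->8---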

--
Ricardo

* Re: support for containers
  2019-01-28 23:03 support for containers Ricardo Wurmus
  2019-01-29  9:38 ` Ricardo Wurmus
@ 2019-01-29 10:22 ` zimoun
  2019-01-29 11:44   ` Ricardo Wurmus
  1 sibling, 1 reply; 12+ messages in thread
From: zimoun @ 2019-01-29 10:22 UTC (permalink / raw)
  To: Ricardo Wurmus; +Cc: gwl-devel

Hi Ricardo,

On Tue, 29 Jan 2019 at 00:08, Ricardo Wurmus <rekado@elephly.net> wrote:

> the GWL could already support execution in containers with this patch:

Nice!


> --8<---------------cut here---------------start------------->8---
> diff --git a/gwl/processes.scm b/gwl/processes.scm
> index b7251db..9ec5925 100644
> --- a/gwl/processes.scm
> +++ b/gwl/processes.scm
> @@ -19,13 +19,20 @@

[...]

> -(define (procedure->gexp process)
> +(define* (procedure->gexp process #:key (container? #t))
>    "Transform the procedure of PROCESS to a G-expression or return the
>  plain S-expression."

Why is the default #t?


> The directories to be mounted in the container are derived from the
> declared inputs and outputs.  The only problem is that inputs are
> read-only in this implementation.  I like it this way, actually, but it
> means that the extended example workflow won’t work as it tries to
> delete its inputs.
>
> Should data inputs be declared as (mutable-file …) or (file …) instead
> of being plain strings?

Hmm, I am not sure it would be a good design to declare inputs as mutable.


All the best,
simon

* Re: support for containers
  2019-01-29  9:38 ` Ricardo Wurmus
@ 2019-01-29 10:39   ` zimoun
  2019-01-29 11:46     ` Ricardo Wurmus
  0 siblings, 1 reply; 12+ messages in thread
From: zimoun @ 2019-01-29 10:39 UTC (permalink / raw)
  To: Ricardo Wurmus; +Cc: gwl-devel

Hi Ricardo,

On Tue, 29 Jan 2019 at 10:54, Ricardo Wurmus <rekado@elephly.net> wrote:
> Ricardo Wurmus <rekado@elephly.net> writes:
>
> > the GWL could already support execution in containers with this patch:
> […]
> > The directories to be mounted in the container are derived from the
> > declared inputs and outputs.  The only problem is that inputs are
> > read-only in this implementation.  I like it this way, actually, but it
> > means that the extended example workflow won’t work as it tries to
> > delete its inputs.
> >
> > Should data inputs be declared as (mutable-file …) or (file …) instead
> > of being plain strings?
>
> Taking a step back I think it is worth remembering that ultimately we’d
> like to have output caching via an immutable data store.  I think
> declaring files as mutable would be a mistake.  Garbage collection is
> better than modifying output files.

I agree that garbage collection is the better design, in my opinion.
However, as a user I would like to garbage collect only the
inputs/outputs of a specific workflow, not everything in the store.


> Some thoughts on how the data store should work: it’s easy to add stuff
> to the store (we just hash the inputs leading up to the output,
> excluding the output itself); but how do we make store items available
> to a process?  When using containers we can bind mount the file to the
> declared input location; when not using containers we need to link or
> copy the file from the store to that location.

By store, do you mean the Guix store, or another store, such as a GWL store?

By inputs, do you mean data-inputs and package-inputs?


I do not know if the following makes sense.
What happens if a kind of record managing the data-inputs is added?
I mean, there are already package, process, and workflow; we would add data.
Then the data would be managed like the other symbols, with define-public, etc.
However, these data would need to live in a special store, not in the store of packages.

>
> I also want to have finer control over inputs.  Only declared input
> *files* should be available in the container, not whole *directories*.

Hmm, that makes sense, especially for references used by aligners,
even if the size of the container will grow significantly. :-)


All the best,
simon

* Re: support for containers
  2019-01-29 10:22 ` zimoun
@ 2019-01-29 11:44   ` Ricardo Wurmus
  0 siblings, 0 replies; 12+ messages in thread
From: Ricardo Wurmus @ 2019-01-29 11:44 UTC (permalink / raw)
  To: zimoun; +Cc: gwl-devel


zimoun <zimon.toutoune@gmail.com> writes:

>> -(define (procedure->gexp process)
>> +(define* (procedure->gexp process #:key (container? #t))
>>    "Transform the procedure of PROCESS to a G-expression or return the
>>  plain S-expression."
>
> Why the default is #t ?

Just for testing.  This is work in progress.

-- 
Ricardo

* Re: support for containers
  2019-01-29 10:39   ` zimoun
@ 2019-01-29 11:46     ` Ricardo Wurmus
  2019-01-29 14:29       ` zimoun
  0 siblings, 1 reply; 12+ messages in thread
From: Ricardo Wurmus @ 2019-01-29 11:46 UTC (permalink / raw)
  To: zimoun; +Cc: gwl-devel


zimoun <zimon.toutoune@gmail.com> writes:

>> Some thoughts on how the data store should work: it’s easy to add stuff
>> to the store (we just hash the inputs leading up to the output,
>> excluding the output itself); but how do we make store items available
>> to a process?  When using containers we can bind mount the file to the
>> declared input location; when not using containers we need to link or
>> copy the file from the store to that location.
>
> By store, do you mean the Guix store, or another store, such as a GWL store?

A GWL-specific data store.  We don’t reuse the Guix store.

> By inputs, do you mean data-inputs and package-inputs?

Data inputs only.

>> I also want to have finer control over inputs.  Only declared input
>> *files* should be available in the container, not whole *directories*.
>
> Hmm, that makes sense, especially for references used by aligners,
> even if the size of the container will grow significantly. :-)

If users need directories they should declare directories as inputs /
outputs.

--
Ricardo

* Re: support for containers
  2019-01-29 11:46     ` Ricardo Wurmus
@ 2019-01-29 14:29       ` zimoun
  2019-01-29 17:19         ` Ricardo Wurmus
  0 siblings, 1 reply; 12+ messages in thread
From: zimoun @ 2019-01-29 14:29 UTC (permalink / raw)
  To: Ricardo Wurmus; +Cc: gwl-devel

On Tue, 29 Jan 2019 at 12:46, Ricardo Wurmus <rekado@elephly.net> wrote:
> zimoun <zimon.toutoune@gmail.com> writes:

> > By inputs, do you mean data-inputs and package-inputs?
>
> Data inputs only.

I understand that only the input files need to be part of the input hash.
However, I do not know whether the output hash also needs to contain the
tool hash and the input hash.
Therefore, when chaining (an input is another output), the store
should somehow track the tools used.

Maybe that is what you are explaining with the quote: "we just hash the
inputs leading up to the output, excluding the output itself".

Right?

--
simon

* Re: support for containers
  2019-01-29 14:29       ` zimoun
@ 2019-01-29 17:19         ` Ricardo Wurmus
  2019-01-29 21:52           ` zimoun
  0 siblings, 1 reply; 12+ messages in thread
From: Ricardo Wurmus @ 2019-01-29 17:19 UTC (permalink / raw)
  To: zimoun; +Cc: gwl-devel


zimoun <zimon.toutoune@gmail.com> writes:

> On Tue, 29 Jan 2019 at 12:46, Ricardo Wurmus <rekado@elephly.net> wrote:
>> zimoun <zimon.toutoune@gmail.com> writes:
>
>> > By inputs, do you mean data-inputs and package-inputs?
>>
>> Data inputs only.
>
> I understand that only the input files need to be part of the input hash.
> However, I do not know whether the output hash also needs to contain the
> tool hash and the input hash.
> Therefore, when chaining (an input is another output), the store
> should somehow track the tools used.

Ah, I understand now.

> Maybe that is what you are explaining with the quote: "we just hash the
> inputs leading up to the output, excluding the output itself".

Right, “inputs” is very vague at this point.  It should be derived from
the process and all of its data inputs (which are the result of other
processes).

As you can see this is analogous to Guix.  I’m not clear on how exactly
this should be accomplished, to be honest.  In Guix we have packages
referencing other packages directly (through inputs or through the build
system); this is all compiled down to derivations, which have clear
inputs and outputs.

In the GWL we have processes, which are linked through workflows.
Processes compile to derivations that build scripts and the inputs to
these derivations are only package derivations — no data inputs, because
they only become important when the generated scripts are executed.

We could start by assuming that the generated *script* of the current
process (which references a particular package profile) is an input to
the process’s output, as are the other data inputs, which have other
processes’ scripts as inputs.

We could thus cheaply hash the scripts — since they contain references
to the tools they use.
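
A minimal sketch of that cheap hashing, assuming (gcrypt hash) and the
binary I/O modules are available; this is not final GWL code.

--8<---------------cut here---------------start------------->8---
(use-modules (gcrypt hash)
             (rnrs io ports))

;; The generated script embeds the store paths of all tools it uses,
;; so its hash can stand in for "tools + process" without hashing any
;; data files.
(define (script-hash script-file)
  (sha256 (call-with-input-file script-file get-bytevector-all)))
--8<---------------cut here---------------end--------------->8---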

--
Ricardo

* Re: support for containers
  2019-01-29 17:19         ` Ricardo Wurmus
@ 2019-01-29 21:52           ` zimoun
  2019-01-29 23:16             ` Ricardo Wurmus
  0 siblings, 1 reply; 12+ messages in thread
From: zimoun @ 2019-01-29 21:52 UTC (permalink / raw)
  To: Ricardo Wurmus; +Cc: gwl-devel

Hi Ricardo,

I follow your explanations, I think.

What is not clear to me is: how many stores are we considering?
 1. one managing the packages; everything is already there, nothing more to say :-)
 2. one for the scripts
 3. one for the data
Or do the scripts and the data live in the same store, with the
script just considered as a kind of derivation for the data?

Hmm, this is outside the scope of containers, isn't it?
Well, thank you for the container support. :-)

All the best,
simon

* Re: support for containers
  2019-01-29 21:52           ` zimoun
@ 2019-01-29 23:16             ` Ricardo Wurmus
  2019-01-30 10:17               ` zimoun
  0 siblings, 1 reply; 12+ messages in thread
From: Ricardo Wurmus @ 2019-01-29 23:16 UTC (permalink / raw)
  To: zimoun; +Cc: gwl-devel


zimoun <zimon.toutoune@gmail.com> writes:

> What is not clear to me is: how many stores are we considering?
>  1. one managing the packages; everything is already there, nothing more to say :-)
>  2. one for the scripts
>  3. one for the data
> Or do the scripts and the data live in the same store, with the
> script just considered as a kind of derivation for the data?

Scripts and data are closely related.

Since we don’t hash the data (because it’s expensive) the scripts are
“proxies” for the data files.  We compute the hashes over the dependent
scripts and assume that this is enough to decide whether to recompute
data files or to serve them from the cache/store.

--
Ricardo

* Re: support for containers
  2019-01-29 23:16             ` Ricardo Wurmus
@ 2019-01-30 10:17               ` zimoun
  2019-01-30 12:46                 ` Ricardo Wurmus
  0 siblings, 1 reply; 12+ messages in thread
From: zimoun @ 2019-01-30 10:17 UTC (permalink / raw)
  To: Ricardo Wurmus; +Cc: gwl-devel

Hi Ricardo,

On Wed, 30 Jan 2019 at 00:16, Ricardo Wurmus <rekado@elephly.net> wrote:

> Since we don’t hash the data (because it’s expensive) the scripts are
> “proxies” for the data files.  We compute the hashes over the dependent
> scripts and assume that this is enough to decide whether to recompute
> data files or to serve them from the cache/store.

Just to be sure I understand your point, let's pick a simple
example from a genomics pipeline:
 FASTQ -align-> BAM -variant-> VCF
So, do you intend to hash:
 - the FASTQ data
 - the align and variant scripts
or only the scripts containing references to the inputs (here FASTQ),
where the reference is a location fixed by the user?

Well, hashing the scripts and assuming they "mirror" the data files
appears to me an efficient design for the CAS.
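
For concreteness, such a pipeline might be declared roughly like this in
the GWL — a sketch only; the exact field names, package variables, and
commands are assumptions:

--8<---------------cut here---------------start------------->8---
(define-public align
  (process
   (name "align")
   (package-inputs (list bwa samtools))        ;assumed package variables
   (data-inputs (list "/data/sample.fastq"))
   (outputs (list "/data/sample.bam"))
   (procedure
    '(system* "align-reads" "/data/sample.fastq" "/data/sample.bam"))))

(define-public call-variants
  (process
   (name "call-variants")
   (package-inputs (list samtools))
   (data-inputs (list "/data/sample.bam"))     ;output of ALIGN
   (outputs (list "/data/sample.vcf"))
   (procedure
    '(system* "call-variants" "/data/sample.bam" "/data/sample.vcf"))))
--8<---------------cut here---------------end--------------->8---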

--
simon

* Re: support for containers
  2019-01-30 10:17               ` zimoun
@ 2019-01-30 12:46                 ` Ricardo Wurmus
  0 siblings, 0 replies; 12+ messages in thread
From: Ricardo Wurmus @ 2019-01-30 12:46 UTC (permalink / raw)
  To: zimoun; +Cc: gwl-devel


Hi Simon,

> On Wed, 30 Jan 2019 at 00:16, Ricardo Wurmus <rekado@elephly.net> wrote:
>
>> Since we don’t hash the data (because it’s expensive) the scripts are
>> “proxies” for the data files.  We compute the hashes over the dependent
>> scripts and assume that this is enough to decide whether to recompute
>> data files or to serve them from the cache/store.
>
> Just to be sure I understand your point, let's pick a simple
> example from a genomics pipeline:
>  FASTQ -align-> BAM -variant-> VCF
> So, do you intend to hash:
>  - the FASTQ data
>  - the align and variant scripts
> or only the scripts containing references to the inputs (here FASTQ),
> where the reference is a location fixed by the user?

Currently, there is no good way for a user to pass inputs to a workflow,
so I haven’t yet thought about how to handle the user’s input files.
This still needs to be done.  At the moment, the only way a user can provide
files as inputs is by writing a process that “generates” the file (even
if it does so by merely accessing the impure file system).  That’s
rather inconvenient, and it wouldn’t work in a container where only
declared files are available.

Users should be able to map files to any process input from the command
line (or through a configuration file).  For a provided input we should
take into account the hash of some file property: the timestamp and the
name (cheap), or the contents (expensive).
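
As a sketch of both options — neither procedure exists in the GWL; the
names are placeholders:

--8<---------------cut here---------------start------------->8---
(use-modules (gcrypt hash)
             (rnrs bytevectors)
             (rnrs io ports))

;; Cheap: hash only the file name and modification time.
(define (cheap-input-hash file)
  (sha256 (string->utf8
           (string-append file ":"
                          (number->string (stat:mtime (stat file)))))))

;; Expensive: hash the full file contents.
(define (full-input-hash file)
  (sha256 (call-with-input-file file get-bytevector-all)))
--8<---------------cut here---------------end--------------->8---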

As regards hashing the scripts here’s what I have so far:

--8<---------------cut here---------------start------------->8---
(define (workflow->data-hashes workflow engine)
  "Return an alist associating each of the WORKFLOW's processes with
the hash of all the process scripts used to generate their outputs."
  (define make-script (process->script engine))
  (define graph (workflow-restrictions workflow))

  ;; Compute hashes for chains of scripts.
  (define (kons process acc)
    (let* ((script (make-script process #:workflow workflow))
           (hash   (bytevector->u8-list
                    (sha256 (call-with-input-file script get-bytevector-all)))))
      (cons
       (cons process
             (append hash
                     ;; Hashes of processes this one depends on.
                     (append-map (cut assoc-ref acc <>)
                                 (or (assoc-ref graph process) '()))))
       acc)))
  (map (match-lambda
         ((process . hashes)
          (cons process
                (bytevector->base32-string
                 (sha256
                  (u8-list->bytevector hashes))))))
       (fold kons '()
             (workflow-run-order workflow #:parallel? #f))))
--8<---------------cut here---------------end--------------->8---

That is, for any process we want the hash over the script used for the
current process and over the scripts of all processes that lead up to
the current one.

This gives us a hash string for every process.  We can then look up
“${GWL_STORE}/${hash}/output-file-name” — if it exists we use it.  The
workflow runner would now also need to ensure that process outputs are
linked to the appropriate GWL_STORE location upon successful execution.
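
A sketch of that lookup and linking step; the GWL_STORE handling and the
procedure names are assumptions, not existing code.

--8<---------------cut here---------------start------------->8---
;; Look up a cached output for a process hash, or register a freshly
;; produced output file under the store.
(define (cached-output process-hash file-name)
  (let ((cached (string-append (getenv "GWL_STORE") "/"
                               process-hash "/" (basename file-name))))
    (and (file-exists? cached) cached)))

(define (register-output! process-hash file-name)
  (let ((directory (string-append (getenv "GWL_STORE") "/" process-hash)))
    (unless (file-exists? directory)
      (mkdir directory))
    (link file-name
          (string-append directory "/" (basename file-name)))))
--8<---------------cut here---------------end--------------->8---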

--
Ricardo
