From mboxrd@z Thu Jan 1 00:00:00 1970 From: David Thompson Subject: [PATCH 08/15] gnu: build: Add Linux container module. Date: Mon, 6 Jul 2015 09:16:37 -0400 Message-ID: <1436188604-2813-8-git-send-email-dthompson2@worcester.edu> References: <1436188604-2813-1-git-send-email-dthompson2@worcester.edu> Mime-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Return-path: Received: from eggs.gnu.org ([2001:4830:134:3::10]:47567) by lists.gnu.org with esmtp (Exim 4.71) (envelope-from ) id 1ZC6H9-0001VN-Id for guix-devel@gnu.org; Mon, 06 Jul 2015 09:17:41 -0400 Received: from Debian-exim by eggs.gnu.org with spam-scanned (Exim 4.71) (envelope-from ) id 1ZC6H2-0006vO-6V for guix-devel@gnu.org; Mon, 06 Jul 2015 09:17:35 -0400 Received: from mail-qg0-f54.google.com ([209.85.192.54]:34779) by eggs.gnu.org with esmtp (Exim 4.71) (envelope-from ) id 1ZC6H2-0006uw-0V for guix-devel@gnu.org; Mon, 06 Jul 2015 09:17:28 -0400 Received: by qgii30 with SMTP id i30so68590756qgi.1 for ; Mon, 06 Jul 2015 06:17:27 -0700 (PDT) In-Reply-To: <1436188604-2813-1-git-send-email-dthompson2@worcester.edu> List-Id: "Development of GNU Guix and the GNU System distribution." List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Errors-To: guix-devel-bounces+gcggd-guix-devel=m.gmane.org@gnu.org Sender: guix-devel-bounces+gcggd-guix-devel=m.gmane.org@gnu.org To: guix-devel@gnu.org Cc: David Thompson From: David Thompson * gnu/build/linux-container.scm: New file. * gnu-system.am (GNU_SYSTEM_MODULES): Add it. * .dir-locals.el: Add Scheme indent rules for 'call-with-clone', 'with-clone', 'call-with-container', and 'container-excursion'. * tests/containers.scm: New file. * Makefile.am (SCM_TESTS): Add it. --- .dir-locals.el | 5 + Makefile.am | 3 +- gnu-system.am | 1 + gnu/build/linux-container.scm | 299 ++++++++++++++++++++++++++++++++++++++++++ tests/containers.scm | 111 ++++++++++++++++ 5 files changed, 418 insertions(+), 1 deletion(-) create mode 100644 gnu/build/linux-container.scm create mode 100644 tests/containers.scm diff --git a/.dir-locals.el b/.dir-locals.el index cbcb120..65e1c6d 100644 --- a/.dir-locals.el +++ b/.dir-locals.el @@ -59,6 +59,11 @@ (eval . (put 'run-with-state 'scheme-indent-function 1)) (eval . (put 'wrap-program 'scheme-indent-function 1)) + (eval . (put 'call-with-clone 'scheme-indent-function 1)) + (eval . (put 'with-clone 'scheme-indent-function 1)) + (eval . (put 'call-with-container 'scheme-indent-function 1)) + (eval . (put 'container-excursion 'scheme-indent-function 1)) + ;; Recognize '~', '+', and '$', as used for gexps, as quotation symbols. ;; This notably allows '(' in Paredit to not insert a space when the ;; preceding symbol is one of these. diff --git a/Makefile.am b/Makefile.am index cc0b135..569ea6f 100644 --- a/Makefile.am +++ b/Makefile.am @@ -196,7 +196,8 @@ SCM_TESTS = \ tests/gremlin.scm \ tests/lint.scm \ tests/publish.scm \ - tests/size.scm + tests/size.scm \ + tests/containers.scm if HAVE_GUILE_JSON diff --git a/gnu-system.am b/gnu-system.am index 7e25f6f..48dbc5f 100644 --- a/gnu-system.am +++ b/gnu-system.am @@ -356,6 +356,7 @@ GNU_SYSTEM_MODULES = \ gnu/build/file-systems.scm \ gnu/build/install.scm \ gnu/build/linux-boot.scm \ + gnu/build/linux-container.scm \ gnu/build/linux-initrd.scm \ gnu/build/linux-modules.scm \ gnu/build/vm.scm diff --git a/gnu/build/linux-container.scm b/gnu/build/linux-container.scm new file mode 100644 index 0000000..60deca2 --- /dev/null +++ b/gnu/build/linux-container.scm @@ -0,0 +1,299 @@ +;;; GNU Guix --- Functional package management for GNU +;;; Copyright © 2015 David Thompson +;;; +;;; This file is part of GNU Guix. +;;; +;;; GNU Guix is free software; you can redistribute it and/or modify it +;;; under the terms of the GNU General Public License as published by +;;; the Free Software Foundation; either version 3 of the License, or (at +;;; your option) any later version. +;;; +;;; GNU Guix is distributed in the hope that it will be useful, but +;;; WITHOUT ANY WARRANTY; without even the implied warranty of +;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;;; GNU General Public License for more details. +;;; +;;; You should have received a copy of the GNU General Public License +;;; along with GNU Guix. If not, see . + +(define-module (gnu build linux-container) + #:use-module (ice-9 format) + #:use-module (ice-9 match) + #:use-module (srfi srfi-98) + #:use-module (guix utils) + #:use-module (guix build utils) + #:use-module (guix build syscalls) + #:export (%namespaces + run-container + call-with-container + container-excursion)) + +(define %namespaces + '(mnt pid ipc uts user net)) + +(define (call-with-clean-exit thunk) + "Apply THUNK, but exit with a status code of 1 if it fails." + (dynamic-wind + (const #t) + thunk + (lambda () + (primitive-exit 1)))) + +(define (mount-flags->bit-mask flags) + "Return the number suitable for the 'flags' argument of 'mount' that +corresponds to the symbols listed in FLAGS." + (let loop ((flags flags)) + (match flags + (('read-only rest ...) + (logior MS_RDONLY (loop rest))) + (('bind-mount rest ...) + (logior MS_BIND (loop rest))) + (('no-suid rest ...) + (logior MS_NOSUID (loop rest))) + (('no-dev rest ...) + (logior MS_NODEV (loop rest))) + (('no-exec rest ...) + (logior MS_NOEXEC (loop rest))) + (() + 0)))) + +(define* (mount-file-system spec root) + "Mount the file system described by SPEC under ROOT. SPEC must have the +form: + + (DEVICE TITLE MOUNT-POINT TYPE (FLAGS ...) OPTIONS CHECK?) + +DEVICE, MOUNT-POINT, and TYPE must be strings; OPTIONS can be a string or #f; +FLAGS must be a list of symbols. CHECK? is ignored." + (match spec + ((source title mount-point type (flags ...) options _) + (let ((mount-point (string-append root mount-point)) + (flags (mount-flags->bit-mask flags))) + (mkdir-p mount-point) + (mount source mount-point type flags options) + + ;; For read-only bind mounts, an extra remount is needed, as per + ;; , which still applies to Linux 4.0. + (when (and (= MS_BIND (logand flags MS_BIND)) + (= MS_RDONLY (logand flags MS_RDONLY))) + (let ((flags (logior MS_BIND MS_REMOUNT MS_RDONLY))) + (mount source mount-point type flags #f))))))) + +(define (purify-environment) + "Unset all environment variables." + (for-each unsetenv + (match (get-environment-variables) + (((names . _) ...) names)))) + +;; The container setup procedure closely resembles that of the Docker +;; specification: +;; https://raw.githubusercontent.com/docker/libcontainer/master/SPEC.md +(define* (mount-file-systems root mounts #:key mount-/sys? mount-/proc?) + "Mount the essential file systems and the those in the MOUNTS list relative +to ROOT, then make ROOT the new root directory for the process." + (define (scope dir) + (string-append root dir)) + + (define (bind-mount src dest) + (mount src dest "none" MS_BIND)) + + ;; Like mount, but creates the mount point if it doesn't exist. + (define* (mount* source target type #:optional (flags 0) options + #:key (update-mtab? #f)) + (mkdir-p target) + (mount source target type flags options #:update-mtab? update-mtab?)) + + ;; The container's file system is completely ephemeral, sans directories + ;; bind-mounted from the host. + (mount "none" root "tmpfs") + + ;; A proc mount requires a new pid namespace. + (when mount-/proc? + (mount* "none" (scope "/proc") "proc" + (logior MS_NOEXEC MS_NOSUID MS_NODEV))) + + ;; A sysfs mount requires the user to have the CAP_SYS_ADMIN capability in + ;; the current network namespace. + (when mount-/sys? + (mount* "none" (scope "/sys") "sysfs" + (logior MS_NOEXEC MS_NOSUID MS_NODEV MS_RDONLY))) + + (mount* "none" (scope "/dev") "tmpfs" + (logior MS_NOEXEC MS_STRICTATIME) + "mode=755") + + ;; Create essential device nodes via bind-mounting them from the + ;; host, because a process within a user namespace cannot create + ;; device nodes. + (for-each (lambda (device) + (when (file-exists? device) + ;; Create the mount point file. + (call-with-output-file (scope device) + (const #t)) + (bind-mount device (scope device)))) + '("/dev/null" + "/dev/zero" + "/dev/full" + "/dev/random" + "/dev/urandom" + "/dev/tty" + "/dev/ptmx" + "/dev/fuse")) + + ;; Setup standard input/output/error. + (symlink "/proc/self/fd" (scope "/dev/fd")) + (symlink "/proc/self/fd/0" (scope "/dev/stdin")) + (symlink "/proc/self/fd/1" (scope "/dev/stdout")) + (symlink "/proc/self/fd/2" (scope "/dev/stderr")) + + ;; Mount user-specified file systems. + (for-each (lambda (spec) + (mount-file-system spec root)) + mounts) + + ;; Jail the process inside the container's root file system. + (let ((put-old (string-append root "/real-root"))) + (mkdir put-old) + (pivot-root root put-old) + (chdir "/") + (umount "real-root" MNT_DETACH) + (rmdir "real-root"))) + +(define (initialize-user-namespace pid) + "Configure the user namespace for PID." + (define proc-dir + (string-append "/proc/" (number->string pid))) + + (define (scope file) + (string-append proc-dir file)) + + ;; Only root can map more than a single uid/gid. A range of 65536 uid/gids + ;; is used to cover 16 bits worth of users and groups, which is sufficient + ;; for most cases. + ;; + ;; See also: http://www.freedesktop.org/software/systemd/man/systemd-nspawn.html#--private-users= + (let* ((uid (getuid)) + (gid (getgid)) + (uid-range (if (zero? uid) 65536 1)) + (gid-range (if (zero? gid) 65536 1))) + + ;; Only root can write to the gid map without first disabling the + ;; setgroups syscall. + (unless (and (zero? uid) (zero? gid)) + (call-with-output-file (scope "/setgroups") + (lambda (port) + (display "deny" port)))) + + ;; Map the user/group that created the container to the root user + ;; within the container. + (call-with-output-file (scope "/uid_map") + (lambda (port) + (format port "0 ~d ~d" uid uid-range))) + (call-with-output-file (scope "/gid_map") + (lambda (port) + (format port "0 ~d ~d" gid gid-range))))) + +(define (namespaces->bit-mask namespaces) + "Return the number suitable for the 'flags' argument of 'clone' that +corresponds to the symbols in NAMESPACES." + (apply logior SIGCHLD + (map (match-lambda + ('mnt CLONE_NEWNS) + ('uts CLONE_NEWUTS) + ('ipc CLONE_NEWIPC) + ('user CLONE_NEWUSER) + ('pid CLONE_NEWPID) + ('net CLONE_NEWNET)) + namespaces))) + +(define (run-container root mounts namespaces thunk) + "Run THUNK in a new container process and return its PID. ROOT specifies +the root directory for the container. MOUNTS is a list of file system specs +that specify the mapping of host file systems into the container. NAMESPACES +is a list of symbols that correspond to the possible Linux namespaces: mnt, +ipc, uts, user, and net." + ;; The parent process must initialize the user namespace for the child + ;; before it can boot. To negotiate this, a pipe is used such that the + ;; child process blocks until the parent writes to it. + (match (pipe) + ((in . out) + (let ((flags (namespaces->bit-mask namespaces))) + (match (clone flags) + (0 + (call-with-clean-exit + (lambda () + (close out) + ;; Wait for parent to set things up. + (read in) + (close in) + (purify-environment) + (when (memq 'mnt namespaces) + (mount-file-systems root mounts + #:mount-/proc? (memq 'pid namespaces) + #:mount-/sys? (memq 'net namespaces))) + ;; TODO: Manage capabilities. + (thunk)))) + (pid + (when (memq 'user namespaces) + (initialize-user-namespace pid)) + ;; TODO: Initialize cgroups. + (close in) + (write 'ready out) + (close out) + pid)))))) + +(define* (call-with-container mounts thunk #:key (namespaces %namespaces)) + "Run THUNK in a new container process and return its exit status. +MOUNTS is a list of file system specs that specify the mapping of host file +systems into the container. NAMESPACES is a list of symbols corresponding to +the identifiers for Linux namespaces: mnt, ipc, uts, pid, user, and net. By +default, all namespaces are used." + (call-with-temporary-directory + (lambda (root) + (let ((pid (run-container root mounts namespaces thunk))) + ;; Catch SIGINT and kill the container process. + (sigaction SIGINT + (lambda (signum) + (false-if-exception + (kill pid SIGKILL)))) + + (match (waitpid pid) + ((_ . status) status)))))) + +(define (container-excursion pid thunk) + "Run THUNK as a child process within the namespaces of process PID and +return the exit status." + (define (namespace-file pid namespace) + (string-append "/proc/" (number->string pid) "/ns/" namespace)) + + (define (fork+waitpid thunk) + (match (primitive-fork) + (0 (call-with-clean-exit thunk)) + (pid + (match (waitpid pid) + ((_ . status) + (status:exit-val status)))))) + + (fork+waitpid + (lambda () + (for-each (lambda (ns) + (call-with-input-file (namespace-file (getpid) ns) + (lambda (current-ns-port) + (call-with-input-file (namespace-file pid ns) + (lambda (new-ns-port) + ;; Joining the namespace that the process + ;; already belongs to would throw an error. + (unless (= (port->fdes current-ns-port) + (port->fdes new-ns-port)) + (setns (port->fdes new-ns-port) 0))))))) + ;; It's important that the user namespace is joined first, + ;; so that the user will have the privileges to join the + ;; other namespaces. Furthermore, it's important that the + ;; mount namespace is joined last, otherwise the /proc mount + ;; point would no longer be accessible. + '("user" "ipc" "uts" "net" "pid" "mnt")) + (purify-environment) + (chdir "/") + ;; Fork again so that the pid is within the context of the joined pid + ;; namespace instead of the original pid namespace. + (primitive-exit (fork+waitpid thunk))))) diff --git a/tests/containers.scm b/tests/containers.scm new file mode 100644 index 0000000..329f300 --- /dev/null +++ b/tests/containers.scm @@ -0,0 +1,111 @@ +;;; GNU Guix --- Functional package management for GNU +;;; Copyright © 2015 David Thompson +;;; +;;; This file is part of GNU Guix. +;;; +;;; GNU Guix is free software; you can redistribute it and/or modify it +;;; under the terms of the GNU General Public License as published by +;;; the Free Software Foundation; either version 3 of the License, or (at +;;; your option) any later version. +;;; +;;; GNU Guix is distributed in the hope that it will be useful, but +;;; WITHOUT ANY WARRANTY; without even the implied warranty of +;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;;; GNU General Public License for more details. +;;; +;;; You should have received a copy of the GNU General Public License +;;; along with GNU Guix. If not, see . + +(define-module (test-containers) + #:use-module (guix utils) + #:use-module (guix build syscalls) + #:use-module (gnu build linux-container) + #:use-module (srfi srfi-64) + #:use-module (ice-9 match)) + +(define (assert-exit x) + (primitive-exit (if x 0 1))) + +(test-begin "containers") + +(test-assert "call-with-container, user namespace" + (zero? + (call-with-container '() + (lambda () + ;; The user is root within the new user namespace. + (assert-exit (and (zero? (getuid)) (zero? (getgid))))) + #:namespaces '(user)))) + +(test-assert "call-with-container, uts namespace" + (zero? + (call-with-container '() + (lambda () + ;; The user is root within the container and should be able to change + ;; the hostname of that container. + (sethostname "test-container") + (primitive-exit 0)) + #:namespaces '(user uts)))) + +(test-assert "call-with-container, pid namespace" + (zero? + (call-with-container '() + (lambda () + (match (primitive-fork) + (0 + ;; The first forked process in the new pid namespace is pid 2. + (assert-exit (= 2 (getpid)))) + (pid + (primitive-exit + (match (waitpid pid) + ((_ . status) + (status:exit-val status))))))) + #:namespaces '(user pid)))) + +(test-assert "call-with-container, mnt namespace" + (zero? + (call-with-container '(("none" "" "/testing" "tmpfs" () "" #f)) + (lambda () + (assert-exit (file-exists? "/testing"))) + #:namespaces '(user mnt)))) + +(test-assert "call-with-container, all namespaces" + (zero? + (call-with-container '() + (lambda () + (primitive-exit 0))))) + +(test-assert "container-excursion" + (call-with-temporary-directory + (lambda (root) + (match (pipe) + ((in . out) + (define (container) + (close out) + ;; Wait for test completion. + (read in) + (close in)) + + (define (namespaces pid) + (let ((pid (number->string pid))) + (map (lambda (ns) + (readlink (string-append "/proc/" pid "/ns/" ns))) + '("user" "ipc" "uts" "net" "pid" "mnt")))) + + (let* ((pid (run-container root '() %namespaces container)) + (container-namespaces (namespaces pid)) + ;; Check that all of the namespace identifiers are the same as + ;; the container process. + (status (container-excursion pid + (lambda () + (assert-exit + (equal? container-namespaces + (namespaces (getpid)))))))) + ;; Stop the container. + (write 'done out) + (close out) + (zero? status))))))) + +(test-end) + + +(exit (= (test-runner-fail-count (test-runner-current)) 0)) -- 2.4.3