From c52172502749a4d194dc51db9d2c394cb15e8d07 Mon Sep 17 00:00:00 2001 From: Maxim Cournoyer Date: Tue, 25 May 2021 08:42:06 -0400 Subject: [PATCH] offload: Handle a possible EOF response from read-repl-response. Fixes . * guix/scripts/offload.scm (check-machine-availability): Refactor so that it takes a single machine object, to allow for retrying a single machine. Handle the case where the checks raised an exception due to the connection to the build machine having been lost, and retry up to 3 times. Ensure the cleanup code is run in all situations. (check-machines-availability): New procedure. Call CHECK-MACHINES-AVAILABILITY in parallel, which improves performance (about twice as fast with 4 build machines, from ~30 s to ~15 s). * guix/inferior.scm (&inferior-connection-lost): New condition type. (read-repl-response): Raise a condition of the above type when reading EOF from the build machine's port. --- guix/inferior.scm | 9 ++++++++ guix/scripts/offload.scm | 50 +++++++++++++++++++++++++++++----------- 2 files changed, 45 insertions(+), 14 deletions(-) diff --git a/guix/inferior.scm b/guix/inferior.scm index 7c8e478f2a..45d9d843e3 100644 --- a/guix/inferior.scm +++ b/guix/inferior.scm @@ -1,5 +1,6 @@ ;;; GNU Guix --- Functional package management for GNU ;;; Copyright © 2018, 2019, 2020, 2021 Ludovic Courtès +;;; Copyright © 2021 Maxim Cournoyer ;;; ;;; This file is part of GNU Guix. ;;; @@ -70,6 +71,7 @@ inferior-exception-arguments inferior-exception-inferior inferior-exception-stack + inferior-connection-lost? read-repl-response inferior-packages @@ -228,6 +230,9 @@ equivalent. Return #f if the inferior could not be launched." (inferior inferior-exception-inferior) ; | #f (stack inferior-exception-stack)) ;list of (FILE COLUMN LINE) +(define-condition-type &inferior-connection-lost &error + inferior-connection-lost?) + (define* (read-repl-response port #:optional inferior) "Read a (guix repl) response from PORT and return it as a Scheme object. Raise '&inferior-exception' when an exception is read from PORT." @@ -241,6 +246,10 @@ Raise '&inferior-exception' when an exception is read from PORT." (match (read port) (('values objects ...) (apply values (map sexp->object objects))) + ;; Unexpectedly read EOF from the port. This can happen for example when + ;; the underlying connection for PORT was lost with Guile-SSH. + ((? eof-object?) + (raise (condition (&inferior-connection-lost)))) (('exception ('arguments key objects ...) ('stack frames ...)) ;; Protocol (0 1 1) and later. diff --git a/guix/scripts/offload.scm b/guix/scripts/offload.scm index 835078cb97..0271874f6a 100644 --- a/guix/scripts/offload.scm +++ b/guix/scripts/offload.scm @@ -1,7 +1,7 @@ ;;; GNU Guix --- Functional package management for GNU ;;; Copyright © 2014, 2015, 2016, 2017, 2018, 2019, 2020 Ludovic Courtès ;;; Copyright © 2017 Ricardo Wurmus -;;; Copyright © 2020 Maxim Cournoyer +;;; Copyright © 2020, 2021 Maxim Cournoyer ;;; Copyright © 2020 Julien Lepiller ;;; ;;; This file is part of GNU Guix. @@ -53,6 +53,7 @@ #:use-module (ice-9 regex) #:use-module (ice-9 format) #:use-module (ice-9 binary-ports) + #:use-module (ice-9 threads) #:export (build-machine build-machine? build-machine-name @@ -684,7 +685,7 @@ daemon is not running." (leave (G_ "failed to import '~a' from '~a'~%") item name))))) -(define (check-machine-availability machine-file pred) +(define (check-machines-availability machine-file pred) "Check that each machine matching PRED in MACHINE-FILE is usable as a build machine." (define (build-machine=? m1 m2) @@ -696,18 +697,39 @@ machine." (let ((machines (filter pred (delete-duplicates (build-machines machine-file) build-machine=?)))) - (info (G_ "testing ~a build machines defined in '~a'...~%") + (info (G_ "Testing ~a build machines defined in '~a'...~%") (length machines) machine-file) - (let* ((names (map build-machine-name machines)) - (sockets (map build-machine-daemon-socket machines)) - (sessions (map (cut open-ssh-session <> %short-timeout) machines)) - (nodes (map remote-inferior sessions))) - (for-each assert-node-has-guix nodes names) - (for-each assert-node-repl nodes names) - (for-each assert-node-can-import sessions nodes names sockets) - (for-each assert-node-can-export sessions nodes names sockets) - (for-each close-inferior nodes) - (for-each disconnect! sessions)))) + (par-for-each check-machine-availability machines))) + +(define (check-machine-availability machine) + "Check whether MACHINE is available. Exit with an error upon failure." + ;; Sometimes, the machine remote port may return EOF, presumably because the + ;; connection was lost. Retry up to 3 times. + (let loop ((retries 3)) + (guard (c ((inferior-connection-lost? c) + (let ((retries-left (1- retries))) + (if (> retries-left 0) + (begin + (format (current-error-port) + (G_ "connection to machine '~a' lost; retrying~%") + (build-machine-name machine)) + (loop (retries-left))) + (leave (G_ "connection repeatedly lost with machine '~a'~%") + (build-machine-name machine)))))) + (let* ((name (build-machine-name machine)) + (socket (build-machine-daemon-socket machine)) + (session (open-ssh-session machine %short-timeout)) + (node (remote-inferior session))) + (dynamic-wind + (lambda () #t) + (lambda () + (assert-node-has-guix node name) + (assert-node-repl node name) + (assert-node-can-import session node name socket) + (assert-node-can-export session node name socket)) + (lambda () + (close-inferior node) + (disconnect! session))))))) (define (check-machine-status machine-file pred) "Print the load of each machine matching PRED in MACHINE-FILE." @@ -824,7 +846,7 @@ machine." ((file) (values file (const #t))) (() (values %machine-file (const #t))) (x (leave (G_ "wrong number of arguments~%")))))) - (check-machine-availability (or file %machine-file) pred)))) + (check-machines-availability (or file %machine-file) pred)))) (("status" rest ...) (with-error-handling (let-values (((file pred) -- 2.31.1