From b5558777617e4674a150895458d57d202de56120 Mon Sep 17 00:00:00 2001
From: Maxim Cournoyer
Date: Tue, 25 May 2021 08:42:06 -0400
Subject: [PATCH 2/2] offload: Handle a possible EOF response from read-repl-response.

Partially fixes .

* guix/scripts/offload.scm (check-machine-availability): Handle the case where
the checks raised an exception due to receiving EOF prematurely, and retry up
to 3 times.
* guix/inferior.scm (&inferior-premature-eof): New condition type.
(read-repl-response): Raise a condition of the above type when reading EOF
from the build machine's port.
---
Review note: in check-machine-availability the retry call is written
(loop retries-left).  RETRIES-LEFT is bound to an integer by the enclosing
'let', so wrapping it in an extra pair of parentheses would try to apply that
integer as a procedure and fail on the first retry instead of reconnecting.

 guix/inferior.scm        | 15 ++++++++++++++
 guix/scripts/offload.scm | 42 ++++++++++++++++++++++++++--------------
 2 files changed, 43 insertions(+), 14 deletions(-)

diff --git a/guix/inferior.scm b/guix/inferior.scm
index 7c8e478f2a..e63b37a7dd 100644
--- a/guix/inferior.scm
+++ b/guix/inferior.scm
@@ -1,5 +1,6 @@
 ;;; GNU Guix --- Functional package management for GNU
 ;;; Copyright © 2018, 2019, 2020, 2021 Ludovic Courtès
+;;; Copyright © 2021 Maxim Cournoyer
 ;;;
 ;;; This file is part of GNU Guix.
 ;;;
@@ -70,6 +71,9 @@
             inferior-exception-arguments
             inferior-exception-inferior
             inferior-exception-stack
+            inferior-premature-eof?
+            inferior-premature-eof-port
+            inferior-premature-eof-inferior
             read-repl-response
 
             inferior-packages
@@ -228,6 +232,11 @@ equivalent. Return #f if the inferior could not be launched."
   (inferior inferior-exception-inferior) ; | #f
   (stack inferior-exception-stack)) ;list of (FILE COLUMN LINE)
 
+(define-condition-type &inferior-premature-eof &error
+  inferior-premature-eof?
+  (port inferior-premature-eof-port)
+  (inferior inferior-premature-eof-inferior))
+
 (define* (read-repl-response port #:optional inferior)
   "Read a (guix repl) response from PORT and return it as a Scheme object.
 Raise '&inferior-exception' when an exception is read from PORT."
@@ -241,6 +250,12 @@ Raise '&inferior-exception' when an exception is read from PORT."
   (match (read port)
     (('values objects ...)
      (apply values (map sexp->object objects)))
+    ((? eof-object?)
+     ;; Unexpectedly read EOF from the port. This can happen for example when
+     ;; the underlying connection for PORT was lost with Guile-SSH.
+     (raise (condition (&inferior-premature-eof
+                        (inferior inferior)
+                        (port port)))))
     (('exception ('arguments key objects ...)
                  ('stack frames ...))
      ;; Protocol (0 1 1) and later.
diff --git a/guix/scripts/offload.scm b/guix/scripts/offload.scm
index b0fd20e158..4312eb4e22 100644
--- a/guix/scripts/offload.scm
+++ b/guix/scripts/offload.scm
@@ -705,20 +705,34 @@ machine."
   "Check whether MACHINE is available. Exit with an error upon failure."
   ;; Sometimes, the machine remote port may return EOF, presumably because the
   ;; connection was lost. Retry up to 3 times.
-  (let* ((name (build-machine-name machine))
-         (socket (build-machine-daemon-socket machine))
-         (session (open-ssh-session machine %short-timeout))
-         (node (remote-inferior session)))
-    (dynamic-wind
-      (lambda () #t)
-      (lambda ()
-        (assert-node-has-guix node name)
-        (assert-node-repl node name)
-        (assert-node-can-import session node name socket)
-        (assert-node-can-export session node name socket))
-      (lambda ()
-        (close-inferior node)
-        (disconnect! session)))))
+  (let loop ((retries 3))
+    (guard (c ((inferior-premature-eof? c)
+               (let ((retries-left (1- retries))
+                     (inferior (inferior-premature-eof-inferior c)))
+                 (if (> retries-left 0)
+                     (begin
+                       (info (G_ "got premature EOF from machine '~a' from \
+inferior '~a' on port '~a'; retrying connection~%")
+                             (build-machine-name machine)
+                             inferior
+                             (inferior-premature-eof-port c))
+                       (loop retries-left))
+                     (leave (G_ "connection repeatedly lost with machine '~a'~%")
+                            (build-machine-name machine))))))
+      (let* ((name (build-machine-name machine))
+             (socket (build-machine-daemon-socket machine))
+             (session (open-ssh-session machine %short-timeout))
+             (node (remote-inferior session)))
+        (dynamic-wind
+          (lambda () #t)
+          (lambda ()
+            (assert-node-has-guix node name)
+            (assert-node-repl node name)
+            (assert-node-can-import session node name socket)
+            (assert-node-can-export session node name socket))
+          (lambda ()
+            (close-inferior node)
+            (disconnect! session)))))))
 
 (define (check-machine-status machine-file pred)
   "Print the load of each machine matching PRED in MACHINE-FILE."
-- 
2.31.1