From be442ea64e4fd8e235378a5f04d38296c0af9cf3 Mon Sep 17 00:00:00 2001 From: Carlo Zancanaro Date: Wed, 21 Feb 2018 22:57:59 +1100 Subject: [PATCH] Poll every 0.5s to find dead forked services if prctl fails. * modules/shepherd.scm (open-server-socket): Set socket to be non-blocking. (main): If we are unable to use prctl/PR_SET_CHILD_SUBREAPER, then poll for service processes between client connections, or every 0.5 seconds. * modules/shepherd/service.scm (fork+exec-command): Install handle-SIGCHLD as signal handler. (respawn-service): Separate logic for respawning services from handling SIGCHLD. (handle-SIGCHLD, check-for-dead-services): New exported procedures. * tests/basic.sh, tests/status-sexp.sh: Replace constant integers with symbols. * doc/shepherd.texi (Slots of services): Add note about service running slot being a process id. --- doc/shepherd.texi | 4 ++- modules/shepherd.scm | 46 ++++++++++++++++++------- modules/shepherd/service.scm | 82 ++++++++++++++++++++++++++++---------------- tests/basic.sh | 4 +-- tests/status-sexp.sh | 4 +-- 5 files changed, 94 insertions(+), 46 deletions(-) diff --git a/doc/shepherd.texi b/doc/shepherd.texi index 815091f..47005d5 100644 --- a/doc/shepherd.texi +++ b/doc/shepherd.texi @@ -608,7 +608,9 @@ way. The default value is @code{#f}, which indicates that the service is not running. When an attempt is made to start the service, it will be set to the return value of the procedure in the @code{start} slot. It will also be passed as an argument to the procedure in the -@code{stop} slot. This slot can not be initialized with a keyword. +@code{stop} slot. If it is set a value that is an integer, it is +assumed to be a process id, and shepherd will monitor the process for +unexpected exits. This slot can not be initialized with a keyword. @item @vindex respawn? (slot of ) diff --git a/modules/shepherd.scm b/modules/shepherd.scm index faa1e47..9d94881 100644 --- a/modules/shepherd.scm +++ b/modules/shepherd.scm @@ -42,6 +42,8 @@ (with-fluids ((%default-port-encoding "UTF-8")) (let ((sock (socket PF_UNIX SOCK_STREAM 0)) (address (make-socket-address AF_UNIX file-name))) + (fcntl sock F_SETFL (logior O_NONBLOCK + (fcntl sock F_GETFL))) (bind sock address) (listen sock 10) sock))) @@ -49,14 +51,26 @@ ;; Main program. (define (main . args) - (initialize-cli) + (define poll-services? + (and (not (= 1 (getpid))) ;; if we're pid 1 we don't need to do anything + (catch 'system-error + (lambda () + ;; Register for orphaned processes to be reparented onto us when + ;; their original parent dies. This lets us handle SIGCHLD from + ;; daemon processes that would otherwise have been reparented + ;; under pid 1. Obviously this is unnecessary when we are pid 1. + (prctl PR_SET_CHILD_SUBREAPER 1) + #f) ;; don't poll + (lambda args + ;; We fall back to polling for services on systems that don't + ;; support prctl/PR_SET_CHILD_SUBREAPER + (let ((errno (system-error-errno args))) + (if (or (= ENOSYS errno) ;; prctl not available + (= EINVAL errno)) ;; PR_SET_CHILD_SUBREAPER not available + #t ;; poll + (apply throw args))))))) - (when (not (= 1 (getpid))) - ;; Register for orphaned processes to be reparented onto us when their - ;; original parent dies. This lets us handle SIGCHLD from daemon processes - ;; that would otherwise have been reparented under pid 1. This is - ;; unnecessary when we are pid 1. - (catch-system-error (prctl PR_SET_CHILD_SUBREAPER 1))) + (initialize-cli) (let ((config-file #f) (socket-file default-socket-file) @@ -225,11 +239,19 @@ (_ #t)) (let next-command () - (match (accept sock) - ((command-source . client-address) - (setvbuf command-source _IOFBF 1024) - (process-connection command-source)) - (_ #f)) + (define (read-from sock) + (match (accept sock) + ((command-source . client-address) + (setvbuf command-source _IOFBF 1024) + (process-connection command-source)) + (_ #f))) + (match (select (list sock) (list) (list) (if poll-services? 0.5 #f)) + (((sock) _ _) + (read-from sock)) + (_ + #f)) + (when poll-services? + (check-for-dead-services)) (next-command)))))) (define (process-connection sock) diff --git a/modules/shepherd/service.scm b/modules/shepherd/service.scm index b6394f2..056483a 100644 --- a/modules/shepherd/service.scm +++ b/modules/shepherd/service.scm @@ -3,6 +3,7 @@ ;; Copyright (C) 2002, 2003 Wolfgang Järling ;; Copyright (C) 2014 Alex Sassmannshausen ;; Copyright (C) 2016 Alex Kost +;; Copyright (C) 2018 Carlo Zancanaro ;; ;; This file is part of the GNU Shepherd. ;; @@ -64,6 +65,7 @@ for-each-service lookup-services respawn-service + handle-SIGCHLD register-services provided-by required-by @@ -77,6 +79,7 @@ make-system-destructor make-init.d-service + check-for-dead-services root-service make-actions @@ -800,7 +803,7 @@ false." its PID." ;; Install the SIGCHLD handler if this is the first fork+exec-command call (unless %sigchld-handler-installed? - (sigaction SIGCHLD respawn-service SA_NOCLDSTOP) + (sigaction SIGCHLD handle-SIGCHLD SA_NOCLDSTOP) (set! %sigchld-handler-installed? #t)) (let ((pid (primitive-fork))) (if (zero? pid) @@ -991,7 +994,7 @@ child left." what (strerror errno)) '(0 . #f))))))) -(define (respawn-service signum) +(define (handle-SIGCHLD signum) "Handle SIGCHLD, possibly by respawning the service that just died, or otherwise by updating its state." (let loop () @@ -1010,38 +1013,44 @@ otherwise by updating its state." ;; SERV can be #f for instance when this code runs just after a ;; service's 'stop' method killed its process and completed. (when serv - (slot-set! serv 'running #f) - (if (and (respawn? serv) - (not (respawn-limit-hit? (slot-ref serv 'last-respawns) - (car respawn-limit) - (cdr respawn-limit)))) - (if (not (slot-ref serv 'waiting-for-termination?)) - (begin - ;; Everything is okay, start it. - (local-output "Respawning ~a." - (canonical-name serv)) - (slot-set! serv 'last-respawns - (cons (current-time) - (slot-ref serv 'last-respawns))) - (start serv)) - ;; We have just been waiting for the - ;; termination. The `running' slot has already - ;; been set to `#f' by `stop'. - (begin - (local-output "Service ~a terminated." - (canonical-name serv)) - (slot-set! serv 'waiting-for-termination? #f))) - (begin - (local-output "Service ~a has been disabled." - (canonical-name serv)) - (when (respawn? serv) - (local-output " (Respawning too fast.)")) - (slot-set! serv 'enabled? #f)))) + (respawn-service serv)) ;; As noted in libc's manual (info "(libc) Process Completion"), ;; loop so we don't miss any terminated child process. (loop)))))) +(define (respawn-service serv) + "Respawn a service that has stopped running unexpectedly. If we have +attempted to respawn the service a number of times already and it keeps dying, +then disable it." + (slot-set! serv 'running #f) + (if (and (respawn? serv) + (not (respawn-limit-hit? (slot-ref serv 'last-respawns) + (car respawn-limit) + (cdr respawn-limit)))) + (if (not (slot-ref serv 'waiting-for-termination?)) + (begin + ;; Everything is okay, start it. + (local-output "Respawning ~a." + (canonical-name serv)) + (slot-set! serv 'last-respawns + (cons (current-time) + (slot-ref serv 'last-respawns))) + (start serv)) + ;; We have just been waiting for the + ;; termination. The `running' slot has already + ;; been set to `#f' by `stop'. + (begin + (local-output "Service ~a terminated." + (canonical-name serv)) + (slot-set! serv 'waiting-for-termination? #f))) + (begin + (local-output "Service ~a has been disabled." + (canonical-name serv)) + (when (respawn? serv) + (local-output " (Respawning too fast.)")) + (slot-set! serv 'enabled? #f)))) + ;; Add NEW-SERVICES to the list of known services. (define (register-services . new-services) (define (register-single-service new) @@ -1171,6 +1180,21 @@ file when persistence is enabled." (lambda (p) (format p "~{~a ~}~%" running-services)))))) +(define (check-for-dead-services) + "Poll each process that we expect to be running, and respawn any which have +unexpectedly stopped running. This procedure is used as a fallback on systems +where prctl/PR_SET_CHILD_SUBREAPER is unsupported." + (define (process-exists? pid) + (catch #t + (lambda () (kill pid 0) #t) + (lambda _ #f))) + (for-each-service (lambda (service) + (let ((running (slot-ref service 'running))) + (when (and (integer? running) + (not (process-exists? running))) + (local-output "PID ~a (~a) is dead!" running (canonical-name service)) + (respawn-service service)))))) + (define root-service (make #:docstring "The root service is used to operate on shepherd itself." diff --git a/tests/basic.sh b/tests/basic.sh index 1ddb334..2ecd8fb 100644 --- a/tests/basic.sh +++ b/tests/basic.sh @@ -150,7 +150,7 @@ cat > "$confdir/some-conf.scm" < #:provides '(test-loaded) - #:start (const 42) + #:start (const 'abc) #:stop (const #f))) EOF @@ -166,7 +166,7 @@ $herd status test-loaded $herd status test-loaded | grep stopped $herd start test-loaded -$herd status test-loaded | grep -i 'running.*42' +$herd status test-loaded | grep -i 'running.*abc' $herd stop test-loaded $herd unload root test-loaded diff --git a/tests/status-sexp.sh b/tests/status-sexp.sh index b7c8cb4..11b967e 100644 --- a/tests/status-sexp.sh +++ b/tests/status-sexp.sh @@ -33,7 +33,7 @@ cat > "$conf"< #:provides '(foo) - #:start (const 42) + #:start (const 'abc) #:stop (const #f) #:docstring "Foo!" #:respawn? #t) @@ -85,7 +85,7 @@ root_service_sexp=" (service (version 0) (provides (foo)) (requires ()) (respawn? #t) (docstring \"Foo!\") - (enabled? #t) (running 42) (conflicts ()) + (enabled? #t) (running abc) (conflicts ()) (last-respawns ())) (service (version 0) (provides (bar)) (requires (foo)) -- 2.16.1