From ec47fa189c7d47f1d9444d939b084850f0a7186c Mon Sep 17 00:00:00 2001 From: Carlo Zancanaro Date: Wed, 21 Feb 2018 22:57:59 +1100 Subject: [PATCH 2/2] Poll every 0.5s to find dead forked services * modules/shepherd.scm (open-server-socket): Set socket to be non-blocking. (main): Use select with a timeout. If prctl failed when shepherd started then call check-for-dead-services between connections/timeouts. * modules/shepherd/service.scm (fork+exec-command): Install handle-SIGCHLD as signal handler. (respawn-service): Separate logic for respawning services from handling SIGCHLD. (handle-SIGCHLD, check-for-dead-services): New exported procedures. * tests/basic.sh, tests/status-sexp.sh: Replace constant integers with symbols. --- modules/shepherd.scm | 31 ++++++++++++++---- modules/shepherd/service.scm | 78 +++++++++++++++++++++++++++----------------- tests/basic.sh | 4 +-- tests/status-sexp.sh | 4 +-- 4 files changed, 76 insertions(+), 41 deletions(-) diff --git a/modules/shepherd.scm b/modules/shepherd.scm index ab59e08..b824546 100644 --- a/modules/shepherd.scm +++ b/modules/shepherd.scm @@ -42,6 +42,8 @@ (with-fluids ((%default-port-encoding "UTF-8")) (let ((sock (socket PF_UNIX SOCK_STREAM 0)) (address (make-socket-address AF_UNIX file-name))) + (fcntl sock F_SETFL (logior O_NONBLOCK + (fcntl sock F_GETFL))) (bind sock address) (listen sock 10) sock))) @@ -49,9 +51,17 @@ ;; Main program. (define (main . args) + (define poll-services + (if (= 1 (getpid)) + (lambda () #f) + (catch 'system-error + (lambda () + (prctl PR_SET_CHILD_SUBREAPER 1) + (lambda () #f)) + (lambda (key . args) + check-for-dead-services)))) + (initialize-cli) - (when (not (= 1 (getpid))) - (catch-system-error (prctl PR_SET_CHILD_SUBREAPER 1))) (let ((config-file #f) (socket-file default-socket-file) @@ -220,11 +230,18 @@ (_ #t)) (let next-command () - (match (accept sock) - ((command-source . client-address) - (setvbuf command-source _IOFBF 1024) - (process-connection command-source)) - (_ #f)) + (define (read-from sock) + (match (accept sock) + ((command-source . client-address) + (setvbuf command-source _IOFBF 1024) + (process-connection command-source)) + (_ #f))) + (match (select (list sock) (list) (list) 0.5) + (((sock) _ _) + (read-from sock)) + (_ + #f)) + (poll-services) (next-command)))))) (define (process-connection sock) diff --git a/modules/shepherd/service.scm b/modules/shepherd/service.scm index b6394f2..fc53d76 100644 --- a/modules/shepherd/service.scm +++ b/modules/shepherd/service.scm @@ -3,6 +3,7 @@ ;; Copyright (C) 2002, 2003 Wolfgang Järling ;; Copyright (C) 2014 Alex Sassmannshausen ;; Copyright (C) 2016 Alex Kost +;; Copyright (C) 2018 Carlo Zancanaro ;; ;; This file is part of the GNU Shepherd. ;; @@ -64,6 +65,7 @@ for-each-service lookup-services respawn-service + handle-SIGCHLD register-services provided-by required-by @@ -77,6 +79,7 @@ make-system-destructor make-init.d-service + check-for-dead-services root-service make-actions @@ -800,7 +803,7 @@ false." its PID." ;; Install the SIGCHLD handler if this is the first fork+exec-command call (unless %sigchld-handler-installed? - (sigaction SIGCHLD respawn-service SA_NOCLDSTOP) + (sigaction SIGCHLD handle-SIGCHLD SA_NOCLDSTOP) (set! %sigchld-handler-installed? #t)) (let ((pid (primitive-fork))) (if (zero? pid) @@ -991,7 +994,7 @@ child left." what (strerror errno)) '(0 . #f))))))) -(define (respawn-service signum) +(define (handle-SIGCHLD signum) "Handle SIGCHLD, possibly by respawning the service that just died, or otherwise by updating its state." (let loop () @@ -1009,39 +1012,42 @@ otherwise by updating its state." ;; SERV can be #f for instance when this code runs just after a ;; service's 'stop' method killed its process and completed. - (when serv - (slot-set! serv 'running #f) - (if (and (respawn? serv) - (not (respawn-limit-hit? (slot-ref serv 'last-respawns) - (car respawn-limit) - (cdr respawn-limit)))) - (if (not (slot-ref serv 'waiting-for-termination?)) - (begin - ;; Everything is okay, start it. - (local-output "Respawning ~a." - (canonical-name serv)) - (slot-set! serv 'last-respawns - (cons (current-time) - (slot-ref serv 'last-respawns))) - (start serv)) - ;; We have just been waiting for the - ;; termination. The `running' slot has already - ;; been set to `#f' by `stop'. - (begin - (local-output "Service ~a terminated." - (canonical-name serv)) - (slot-set! serv 'waiting-for-termination? #f))) - (begin - (local-output "Service ~a has been disabled." - (canonical-name serv)) - (when (respawn? serv) - (local-output " (Respawning too fast.)")) - (slot-set! serv 'enabled? #f)))) + (respawn-service serv) ;; As noted in libc's manual (info "(libc) Process Completion"), ;; loop so we don't miss any terminated child process. (loop)))))) +(define (respawn-service serv) + (when serv + (slot-set! serv 'running #f) + (if (and (respawn? serv) + (not (respawn-limit-hit? (slot-ref serv 'last-respawns) + (car respawn-limit) + (cdr respawn-limit)))) + (if (not (slot-ref serv 'waiting-for-termination?)) + (begin + ;; Everything is okay, start it. + (local-output "Respawning ~a." + (canonical-name serv)) + (slot-set! serv 'last-respawns + (cons (current-time) + (slot-ref serv 'last-respawns))) + (start serv)) + ;; We have just been waiting for the + ;; termination. The `running' slot has already + ;; been set to `#f' by `stop'. + (begin + (local-output "Service ~a terminated." + (canonical-name serv)) + (slot-set! serv 'waiting-for-termination? #f))) + (begin + (local-output "Service ~a has been disabled." + (canonical-name serv)) + (when (respawn? serv) + (local-output " (Respawning too fast.)")) + (slot-set! serv 'enabled? #f))))) + ;; Add NEW-SERVICES to the list of known services. (define (register-services . new-services) (define (register-single-service new) @@ -1171,6 +1177,18 @@ file when persistence is enabled." (lambda (p) (format p "~{~a ~}~%" running-services)))))) +(define (check-for-dead-services) + (define (process-exists? pid) + (catch #t + (lambda () (kill pid 0) #t) + (lambda _ #f))) + (for-each-service (lambda (service) + (let ((running (slot-ref service 'running))) + (when (and (integer? running) + (not (process-exists? running))) + (local-output "PID ~a (~a) is dead!" running (canonical-name service)) + (respawn-service service)))))) + (define root-service (make #:docstring "The root service is used to operate on shepherd itself." diff --git a/tests/basic.sh b/tests/basic.sh index 1ddb334..2ecd8fb 100644 --- a/tests/basic.sh +++ b/tests/basic.sh @@ -150,7 +150,7 @@ cat > "$confdir/some-conf.scm" < #:provides '(test-loaded) - #:start (const 42) + #:start (const 'abc) #:stop (const #f))) EOF @@ -166,7 +166,7 @@ $herd status test-loaded $herd status test-loaded | grep stopped $herd start test-loaded -$herd status test-loaded | grep -i 'running.*42' +$herd status test-loaded | grep -i 'running.*abc' $herd stop test-loaded $herd unload root test-loaded diff --git a/tests/status-sexp.sh b/tests/status-sexp.sh index b7c8cb4..11b967e 100644 --- a/tests/status-sexp.sh +++ b/tests/status-sexp.sh @@ -33,7 +33,7 @@ cat > "$conf"< #:provides '(foo) - #:start (const 42) + #:start (const 'abc) #:stop (const #f) #:docstring "Foo!" #:respawn? #t) @@ -85,7 +85,7 @@ root_service_sexp=" (service (version 0) (provides (foo)) (requires ()) (respawn? #t) (docstring \"Foo!\") - (enabled? #t) (running 42) (conflicts ()) + (enabled? #t) (running abc) (conflicts ()) (last-respawns ())) (service (version 0) (provides (bar)) (requires (foo)) -- 2.16.1