unofficial mirror of guix-patches@gnu.org 
 help / color / mirror / code / Atom feed
From: Mathieu Othacehe <othacehe@gnu.org>
To: 43552@debbugs.gnu.org
Cc: Mathieu Othacehe <othacehe@gnu.org>
Subject: [bug#43552] [PATCH] Add watchdog support.
Date: Mon, 21 Sep 2020 18:49:08 +0200	[thread overview]
Message-ID: <20200921164908.1396570-1-othacehe@gnu.org> (raw)

* src/cuirass/watchdog.scm: New file.
* Makefile.am (dist_pkgmodule_DATA): Add it.
* src/cuirass/utils.scm (with-timeout, get-message-with-timeout): Export them.
* bin/cuirass.in (main): Start the watchdog.
---

Hello,

I've noticed that during several hours all Cuirass fibers seem to be
stucked. I suspect that this is because a fiber is doing a blocking action,
hence preventing its scheduler to run other fibers.

This patch adds a watchdog that print warning messages if some Fibers
scheduler is hanging for more than 5 seconds. The next step will be to
identify the faulty fiber.

Thanks,

Mathieu

 Makefile.am              |  5 ++-
 bin/cuirass.in           |  3 +-
 src/cuirass/utils.scm    |  3 ++
 src/cuirass/watchdog.scm | 81 ++++++++++++++++++++++++++++++++++++++++
 4 files changed, 89 insertions(+), 3 deletions(-)
 create mode 100644 src/cuirass/watchdog.scm

diff --git a/Makefile.am b/Makefile.am
index 0e4d3c8..2c7e6e2 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -50,8 +50,9 @@ dist_pkgmodule_DATA =				\
   src/cuirass/metrics.scm			\
   src/cuirass/send-events.scm			\
   src/cuirass/ui.scm				\
-  src/cuirass/utils.scm             \
-  src/cuirass/templates.scm
+  src/cuirass/utils.scm		                \
+  src/cuirass/templates.scm                     \
+  src/cuirass/watchdog.scm
 
 nodist_pkgmodule_DATA = \
   src/cuirass/config.scm
diff --git a/bin/cuirass.in b/bin/cuirass.in
index ed21ed7..7f4793e 100644
--- a/bin/cuirass.in
+++ b/bin/cuirass.in
@@ -31,6 +31,7 @@ exec ${GUILE:-@GUILE@} --no-auto-compile -e main -s "$0" "$@"
              (cuirass logging)
              (cuirass metrics)
              (cuirass utils)
+             (cuirass watchdog)
              (guix ui)
              ((guix build utils) #:select (mkdir-p))
              (fibers)
@@ -142,7 +143,7 @@ exec ${GUILE:-@GUILE@} --no-auto-compile -e main -s "$0" "$@"
                  (if one-shot?
                      (process-specs (db-get-specifications))
                      (let ((exit-channel (make-channel)))
-
+                       (start-watchdog)
                        (if (option-ref opts 'web #f)
                            (begin
                              (spawn-fiber
diff --git a/src/cuirass/utils.scm b/src/cuirass/utils.scm
index 00cfef6..7ce4b83 100644
--- a/src/cuirass/utils.scm
+++ b/src/cuirass/utils.scm
@@ -37,6 +37,9 @@
             define-enumeration
             unwind-protect
 
+            with-timeout
+            get-message-with-timeout
+
             make-worker-thread-channel
             call-with-worker-thread
             with-worker-thread
diff --git a/src/cuirass/watchdog.scm b/src/cuirass/watchdog.scm
new file mode 100644
index 0000000..b912a9f
--- /dev/null
+++ b/src/cuirass/watchdog.scm
@@ -0,0 +1,81 @@
+;;; watchdog.scm -- Monitor fibers scheduling.
+;;; This file is part of Cuirass.
+;;;
+;;; Cuirass is free software: you can redistribute it and/or modify
+;;; it under the terms of the GNU General Public License as published by
+;;; the Free Software Foundation, either version 3 of the License, or
+;;; (at your option) any later version.
+;;;
+;;; Cuirass is distributed in the hope that it will be useful,
+;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
+;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+;;; GNU General Public License for more details.
+;;;
+;;; You should have received a copy of the GNU General Public License
+;;; along with Cuirass.  If not, see <http://www.gnu.org/licenses/>.
+
+(define-module (cuirass watchdog)
+  #:use-module (cuirass logging)
+  #:use-module (cuirass utils)
+  #:use-module (fibers)
+  #:use-module (fibers channels)
+  #:use-module (fibers internal)
+  #:use-module (fibers operations)
+  #:use-module (ice-9 match)
+  #:use-module (ice-9 threads)
+  #:export (start-watchdog))
+
+(define* (watchdog-fiber scheduler channel
+                         #:key
+                         timeout)
+  "Spawn a fiber running on SCHEDULER that sends over CHANNEL, every TIMEOUT
+seconds, the scheduler name and the current time."
+  (spawn-fiber
+   (lambda ()
+     (while #t
+       (put-message channel (list (scheduler-name scheduler)
+                                  (current-time)))
+       (sleep timeout)))
+   scheduler))
+
+(define* (start-watchdog #:key (timeout 5))
+  "Start a watchdog checking that each Fibers scheduler is not blocked for
+more than TIMEOUT seconds.
+
+The watchdog mechanism consists in spawning a dedicated fiber per running
+Fiber scheduler, using the above watchdog-fiber method.  Those fibers send a
+ping signal periodically to a separate thread.  If no signal is received from
+one of the schedulers for more than TIMEOUT seconds, a warning message is
+printed."
+  (define (check-timeouts pings)
+    (for-each
+     (match-lambda
+       ((scheduler . time)
+        (let* ((cur-time (current-time))
+               (diff-ping (- cur-time time)))
+          (when (> diff-ping timeout)
+            (log-message "Scheduler ~a blocked since ~a seconds."
+                         scheduler diff-ping)))))
+     pings))
+
+  (let ((watchdog-channel (make-channel)))
+    (parameterize (((@@ (fibers internal) current-fiber) #f))
+      (call-with-new-thread
+       (lambda ()
+         (let loop ((pings '()))
+           (let ((operation-timeout 2))
+             (match (perform-operation
+                     (with-timeout
+                      (get-operation watchdog-channel)
+                      #:seconds operation-timeout
+                      #:wrap (const 'timeout)))
+               ((scheduler ping)
+                (loop (assq-set! pings scheduler ping)))
+               ('timeout
+                (check-timeouts pings)
+                (loop pings))))))))
+    (fold-all-schedulers
+     (lambda (name scheduler seed)
+       (watchdog-fiber scheduler watchdog-channel
+                       #:timeout timeout))
+     '())))
-- 
2.28.0





             reply	other threads:[~2020-09-21 16:50 UTC|newest]

Thread overview: 3+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2020-09-21 16:49 Mathieu Othacehe [this message]
2020-09-22  8:34 ` [bug#43552] [PATCH] Add watchdog support Mathieu Othacehe
2020-09-29 15:05 ` bug#43552: " Mathieu Othacehe

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

  List information: https://guix.gnu.org/

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20200921164908.1396570-1-othacehe@gnu.org \
    --to=othacehe@gnu.org \
    --cc=43552@debbugs.gnu.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
Code repositories for project(s) associated with this public inbox

	https://git.savannah.gnu.org/cgit/guix.git

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).