all messages for Guix-related lists mirrored at yhetil.org
 help / color / mirror / code / Atom feed
* bug#31785: Multiple client 'build-paths' RPCs can lead to daemon deadlock
@ 2018-06-11 14:06 Ludovic Courtès
  2019-09-06  9:04 ` Ludovic Courtès
                   ` (2 more replies)
  0 siblings, 3 replies; 7+ messages in thread
From: Ludovic Courtès @ 2018-06-11 14:06 UTC (permalink / raw)
  To: 31785

Hello,

I tried running this:

  guix build --max-jobs=200 $(guix gc -R $(guix build -d inkscape --no-grafts) | sort) & \
  guix build --max-jobs=200 $(guix gc -R $(guix build -d inkscape --no-grafts) | sort -r)

… also in parallel with this (for good measure):

  guix build --max-jobs=200 $(guix gc -R $(guix build -d inkscape --no-grafts) | sort -R)

Since we have 3 clients, that leads to 3 guix-daemon processes, and
those are stuck in a deadlock:

--8<---------------cut here---------------start------------->8---
$ ps aux | grep guix-daemon
[…]
root     20501  0.6  0.0  39292 12012 ?        Ss   15:51   0:01 guix-daemon 20455
root     20503  4.3  0.0  39420 12096 ?        Ss   15:51   0:12 guix-daemon 20491
root     22154  0.0  0.0  39028 11016 ?        Ss   15:52   0:00 guix-daemon 22148
$ sudo strace -p 22154 -t
strace: Process 22154 attached
15:57:03 select(0, 0x7ffdca30b610, NULL, NULL, {tv_sec=1, tv_usec=314189}) = 0 (Timeout)
15:57:04 openat(AT_FDCWD, "/gnu/store/ghc9vcc8ydv305jz0spd73qrrf30gxik-binutils-bootstrap-0.lock", O_RDWR|O_CREAT, 0600) = 8
15:57:04 fcntl(8, F_GETFD)              = 0
15:57:04 fcntl(8, F_SETFD, FD_CLOEXEC)  = 0
15:57:04 fcntl(8, F_SETLK, {l_type=F_WRLCK, l_whence=SEEK_SET, l_start=0, l_len=0}) = -1 EAGAIN (Risurco dumtempe ne disponeblas)
15:57:04 close(8)                       = 0
15:57:04 openat(AT_FDCWD, "/gnu/store/5hdpffkgkfszpsin2ir50zscz06x0mb5-coreutils-8.29-debug.lock", O_RDWR|O_CREAT, 0600) = 8
15:57:04 fcntl(8, F_GETFD)              = 0
15:57:04 fcntl(8, F_SETFD, FD_CLOEXEC)  = 0
15:57:04 fcntl(8, F_SETLK, {l_type=F_WRLCK, l_whence=SEEK_SET, l_start=0, l_len=0}) = -1 EAGAIN (Risurco dumtempe ne disponeblas)
15:57:04 close(8)                       = 0
15:57:04 openat(AT_FDCWD, "/gnu/store/aasfcliv85frq3aawpa4rw18nvlvl735-gettext-minimal-0.19.8.1.lock", O_RDWR|O_CREAT, 0600) = 8
15:57:04 fcntl(8, F_GETFD)              = 0
15:57:04 fcntl(8, F_SETFD, FD_CLOEXEC)  = 0
15:57:04 fcntl(8, F_SETLK, {l_type=F_WRLCK, l_whence=SEEK_SET, l_start=0, l_len=0}) = -1 EAGAIN (Risurco dumtempe ne disponeblas)
15:57:04 close(8)                       = 0
15:57:04 openat(AT_FDCWD, "/gnu/store/j1kpzcpi1m1ihzwyqb60zd4s74yp85g8-glibc-bootstrap-0.lock", O_RDWR|O_CREAT, 0600) = 8
15:57:04 fcntl(8, F_GETFD)              = 0
15:57:04 fcntl(8, F_SETFD, FD_CLOEXEC)  = 0
15:57:04 fcntl(8, F_SETLK, {l_type=F_WRLCK, l_whence=SEEK_SET, l_start=0, l_len=0}) = -1 EAGAIN (Risurco dumtempe ne disponeblas)
15:57:04 close(8)                       = 0
15:57:04 openat(AT_FDCWD, "/gnu/store/4llnxsgr6mxxj6pk21d9vbwaxnkbn99k-gnutls-3.5.18-doc.lock", O_RDWR|O_CREAT, 0600) = 8
15:57:04 fcntl(8, F_GETFD)              = 0
15:57:04 fcntl(8, F_SETFD, FD_CLOEXEC)  = 0
15:57:04 fcntl(8, F_SETLK, {l_type=F_WRLCK, l_whence=SEEK_SET, l_start=0, l_len=0}) = -1 EAGAIN (Risurco dumtempe ne disponeblas)
15:57:04 close(8)                       = 0
15:57:04 openat(AT_FDCWD, "/gnu/store/7rs9qx86f2w4hjbf9wjm10k4m2vvlbyv-groff-1.22.3-doc.lock", O_RDWR|O_CREAT, 0600) = 8
15:57:04 fcntl(8, F_GETFD)              = 0
15:57:04 fcntl(8, F_SETFD, FD_CLOEXEC)  = 0
15:57:04 fcntl(8, F_SETLK, {l_type=F_WRLCK, l_whence=SEEK_SET, l_start=0, l_len=0}) = -1 EAGAIN (Risurco dumtempe ne disponeblas)
15:57:04 close(8)                       = 0
15:57:04 openat(AT_FDCWD, "/gnu/store/d645d919gp1zk8v65qa2r4ia5x1c5n6k-libstdc++-5.5.0.lock", O_RDWR|O_CREAT, 0600) = 8
15:57:04 fcntl(8, F_GETFD)              = 0
15:57:04 fcntl(8, F_SETFD, FD_CLOEXEC)  = 0
15:57:04 fcntl(8, F_SETLK, {l_type=F_WRLCK, l_whence=SEEK_SET, l_start=0, l_len=0}) = -1 EAGAIN (Risurco dumtempe ne disponeblas)
15:57:04 close(8)                       = 0
15:57:04 openat(AT_FDCWD, "/gnu/store/amcs7l0ynj1qg6fp9ll3asiamd4zsq75-m4-1.4.18.lock", O_RDWR|O_CREAT, 0600) = 8
15:57:04 fcntl(8, F_GETFD)              = 0
15:57:04 fcntl(8, F_SETFD, FD_CLOEXEC)  = 0
15:57:04 fcntl(8, F_SETLK, {l_type=F_WRLCK, l_whence=SEEK_SET, l_start=0, l_len=0}) = -1 EAGAIN (Risurco dumtempe ne disponeblas)
15:57:04 close(8)                       = 0
15:57:04 openat(AT_FDCWD, "/gnu/store/p262qk7zsd2wnlal6x1jax2xbqvdddqq-m4-1.4.18.lock", O_RDWR|O_CREAT, 0600) = 8
15:57:04 fcntl(8, F_GETFD)              = 0
15:57:04 fcntl(8, F_SETFD, FD_CLOEXEC)  = 0
15:57:04 fcntl(8, F_SETLK, {l_type=F_WRLCK, l_whence=SEEK_SET, l_start=0, l_len=0}) = -1 EAGAIN (Risurco dumtempe ne disponeblas)
15:57:04 close(8)                       = 0
15:57:04 openat(AT_FDCWD, "/gnu/store/zvvgr301hz95ac15rp0wsjy5j4cdgkg4-module-import-compiled.lock", O_RDWR|O_CREAT, 0600) = 8
15:57:04 fcntl(8, F_GETFD)              = 0
15:57:04 fcntl(8, F_SETFD, FD_CLOEXEC)  = 0
15:57:04 fcntl(8, F_SETLK, {l_type=F_WRLCK, l_whence=SEEK_SET, l_start=0, l_len=0}) = -1 EAGAIN (Risurco dumtempe ne disponeblas)
15:57:04 close(8)                       = 0
15:57:04 openat(AT_FDCWD, "/gnu/store/760kj698c9x1vcmlzk1qr8i9hqw961n3-openssl-1.0.2o-doc.lock", O_RDWR|O_CREAT, 0600) = 8
^C
$ sudo strace -p 20503 -t
strace: Process 20503 attached
15:58:23 select(445, [15 54 159 177 321 411 414 444], NULL, NULL, {tv_sec=1, tv_usec=595302}) = 0 (Timeout)
15:58:25 openat(AT_FDCWD, "/gnu/store/amcs7l0ynj1qg6fp9ll3asiamd4zsq75-m4-1.4.18.lock", O_RDWR|O_CREAT, 0600) = 14
15:58:25 fcntl(14, F_GETFD)             = 0
15:58:25 fcntl(14, F_SETFD, FD_CLOEXEC) = 0
15:58:25 fcntl(14, F_SETLK, {l_type=F_WRLCK, l_whence=SEEK_SET, l_start=0, l_len=0}) = -1 EAGAIN (Risurco dumtempe ne disponeblas)
15:58:25 close(14)                      = 0
15:58:25 openat(AT_FDCWD, "/gnu/store/zvvgr301hz95ac15rp0wsjy5j4cdgkg4-module-import-compiled.lock", O_RDWR|O_CREAT, 0600) = 14
15:58:25 fcntl(14, F_GETFD)             = 0
15:58:25 fcntl(14, F_SETFD, FD_CLOEXEC) = 0
15:58:25 fcntl(14, F_SETLK, {l_type=F_WRLCK, l_whence=SEEK_SET, l_start=0, l_len=0}) = -1 EAGAIN (Risurco dumtempe ne disponeblas)
15:58:25 close(14)                      = 0
15:58:25 openat(AT_FDCWD, "/gnu/store/760kj698c9x1vcmlzk1qr8i9hqw961n3-openssl-1.0.2o-doc.lock", O_RDWR|O_CREAT, 0600) = 14
15:58:25 fcntl(14, F_GETFD)             = 0
15:58:25 fcntl(14, F_SETFD, FD_CLOEXEC) = 0
15:58:25 fcntl(14, F_SETLK, {l_type=F_WRLCK, l_whence=SEEK_SET, l_start=0, l_len=0}) = -1 EAGAIN (Risurco dumtempe ne disponeblas)
15:58:25 close(14)                      = 0
15:58:25 select(445, [15 54 159 177 321 411 414 444], NULL, NULL, {tv_sec=5, tv_usec=0}) = 0 (Timeout)
15:58:30 openat(AT_FDCWD, "/gnu/store/amcs7l0ynj1qg6fp9ll3asiamd4zsq75-m4-1.4.18.lock", O_RDWR|O_CREAT, 0600) = 14
15:58:30 fcntl(14, F_GETFD)             = 0
15:58:30 fcntl(14, F_SETFD, FD_CLOEXEC) = 0
15:58:30 fcntl(14, F_SETLK, {l_type=F_WRLCK, l_whence=SEEK_SET, l_start=0, l_len=0}) = -1 EAGAIN (Risurco dumtempe ne disponeblas)
15:58:30 close(14)                      = 0
15:58:30 openat(AT_FDCWD, "/gnu/store/zvvgr301hz95ac15rp0wsjy5j4cdgkg4-module-import-compiled.lock", O_RDWR|O_CREAT, 0600) = 14
15:58:30 fcntl(14, F_GETFD)             = 0
15:58:30 fcntl(14, F_SETFD, FD_CLOEXEC) = 0
15:58:30 fcntl(14, F_SETLK, {l_type=F_WRLCK, l_whence=SEEK_SET, l_start=0, l_len=0}) = -1 EAGAIN (Risurco dumtempe ne disponeblas)
15:58:30 close(14)                      = 0
15:58:30 openat(AT_FDCWD, "/gnu/store/760kj698c9x1vcmlzk1qr8i9hqw961n3-openssl-1.0.2o-doc.lock", O_RDWR|O_CREAT, 0600) = 14
15:58:30 fcntl(14, F_GETFD)             = 0
15:58:30 fcntl(14, F_SETFD, FD_CLOEXEC) = 0
15:58:30 fcntl(14, F_SETLK, {l_type=F_WRLCK, l_whence=SEEK_SET, l_start=0, l_len=0}) = -1 EAGAIN (Risurco dumtempe ne disponeblas)
15:58:30 close(14)                      = 0
15:58:30 select(445, [15 54 159 177 321 411 414 444], NULL, NULL, {tv_sec=5, tv_usec=0}^Cstrace: Process 20503 detached
$ sudo strace -p 20501 -t
strace: Process 20501 attached
15:59:09 select(121, [20 30 120], NULL, NULL, {tv_sec=0, tv_usec=535316}) = 0 (Timeout)
15:59:10 openat(AT_FDCWD, "/gnu/store/ghc9vcc8ydv305jz0spd73qrrf30gxik-binutils-bootstrap-0.lock", O_RDWR|O_CREAT, 0600) = 8
15:59:10 fcntl(8, F_GETFD)              = 0
15:59:10 fcntl(8, F_SETFD, FD_CLOEXEC)  = 0
15:59:10 fcntl(8, F_SETLK, {l_type=F_WRLCK, l_whence=SEEK_SET, l_start=0, l_len=0}) = -1 EAGAIN (Risurco dumtempe ne disponeblas)
15:59:10 close(8)                       = 0
15:59:10 openat(AT_FDCWD, "/gnu/store/5hdpffkgkfszpsin2ir50zscz06x0mb5-coreutils-8.29-debug.lock", O_RDWR|O_CREAT, 0600) = 8
15:59:10 fcntl(8, F_GETFD)              = 0
15:59:10 fcntl(8, F_SETFD, FD_CLOEXEC)  = 0
15:59:10 fcntl(8, F_SETLK, {l_type=F_WRLCK, l_whence=SEEK_SET, l_start=0, l_len=0}) = -1 EAGAIN (Risurco dumtempe ne disponeblas)
15:59:10 close(8)                       = 0
15:59:10 openat(AT_FDCWD, "/gnu/store/aasfcliv85frq3aawpa4rw18nvlvl735-gettext-minimal-0.19.8.1.lock", O_RDWR|O_CREAT, 0600) = 8
15:59:10 fcntl(8, F_GETFD)              = 0
15:59:10 fcntl(8, F_SETFD, FD_CLOEXEC)  = 0
15:59:10 fcntl(8, F_SETLK, {l_type=F_WRLCK, l_whence=SEEK_SET, l_start=0, l_len=0}) = -1 EAGAIN (Risurco dumtempe ne disponeblas)
15:59:10 close(8)                       = 0
15:59:10 openat(AT_FDCWD, "/gnu/store/j1kpzcpi1m1ihzwyqb60zd4s74yp85g8-glibc-bootstrap-0.lock", O_RDWR|O_CREAT, 0600) = 8
15:59:10 fcntl(8, F_GETFD)              = 0
15:59:10 fcntl(8, F_SETFD, FD_CLOEXEC)  = 0
15:59:10 fcntl(8, F_SETLK, {l_type=F_WRLCK, l_whence=SEEK_SET, l_start=0, l_len=0}) = -1 EAGAIN (Risurco dumtempe ne disponeblas)
15:59:10 close(8)                       = 0
15:59:10 openat(AT_FDCWD, "/gnu/store/4llnxsgr6mxxj6pk21d9vbwaxnkbn99k-gnutls-3.5.18-doc.lock", O_RDWR|O_CREAT, 0600) = 8
15:59:10 fcntl(8, F_GETFD)              = 0
15:59:10 fcntl(8, F_SETFD, FD_CLOEXEC)  = 0
15:59:10 fcntl(8, F_SETLK, {l_type=F_WRLCK, l_whence=SEEK_SET, l_start=0, l_len=0}) = -1 EAGAIN (Risurco dumtempe ne disponeblas)
15:59:10 close(8)                       = 0
15:59:10 openat(AT_FDCWD, "/gnu/store/7rs9qx86f2w4hjbf9wjm10k4m2vvlbyv-groff-1.22.3-doc.lock", O_RDWR|O_CREAT, 0600) = 8
15:59:10 fcntl(8, F_GETFD)              = 0
15:59:10 fcntl(8, F_SETFD, FD_CLOEXEC)  = 0
15:59:10 fcntl(8, F_SETLK, {l_type=F_WRLCK, l_whence=SEEK_SET, l_start=0, l_len=0}) = -1 EAGAIN (Risurco dumtempe ne disponeblas)
15:59:10 close(8)                       = 0
15:59:10 openat(AT_FDCWD, "/gnu/store/d645d919gp1zk8v65qa2r4ia5x1c5n6k-libstdc++-5.5.0.lock", O_RDWR|O_CREAT, 0600) = 8
15:59:10 fcntl(8, F_GETFD)              = 0
15:59:10 fcntl(8, F_SETFD, FD_CLOEXEC)  = 0
15:59:10 fcntl(8, F_SETLK, {l_type=F_WRLCK, l_whence=SEEK_SET, l_start=0, l_len=0}) = -1 EAGAIN (Risurco dumtempe ne disponeblas)
15:59:10 close(8)                       = 0
15:59:10 openat(AT_FDCWD, "/gnu/store/p262qk7zsd2wnlal6x1jax2xbqvdddqq-m4-1.4.18.lock", O_RDWR|O_CREAT, 0600) = 8
15:59:10 fcntl(8, F_GETFD)              = 0
15:59:10 fcntl(8, F_SETFD, FD_CLOEXEC)  = 0
15:59:10 fcntl(8, F_SETLK, {l_type=F_WRLCK, l_whence=SEEK_SET, l_start=0, l_len=0}) = -1 EAGAIN (Risurco dumtempe ne disponeblas)
15:59:10 close(8)                       = 0
15:59:10 select(121, [20 30 120], NULL, NULL, {tv_sec=5, tv_usec=0}^Cstrace: Process 20501 detached
$ sudo lsof | grep '/gnu/store/.*\.lock'
guix-daem 20501             root   14uW     REG              253,0          0    3150879 /gnu/store/zvvgr301hz95ac15rp0wsjy5j4cdgkg4-module-import-compiled.lock
guix-daem 20501             root   26uW     REG              253,0          0    3212618 /gnu/store/760kj698c9x1vcmlzk1qr8i9hqw961n3-openssl-1.0.2o-doc.lock
guix-daem 20501             root  116uW     REG              253,0          0    3212684 /gnu/store/amcs7l0ynj1qg6fp9ll3asiamd4zsq75-m4-1.4.18.lock
guix-daem 20503             root    8uW     REG              253,0          0    3212520 /gnu/store/7rs9qx86f2w4hjbf9wjm10k4m2vvlbyv-groff-1.22.3-doc.lock
guix-daem 20503             root   50uW     REG              253,0          0    3212514 /gnu/store/ghc9vcc8ydv305jz0spd73qrrf30gxik-binutils-bootstrap-0.lock
guix-daem 20503             root  155uW     REG              253,0          0    3212547 /gnu/store/j1kpzcpi1m1ihzwyqb60zd4s74yp85g8-glibc-bootstrap-0.lock
guix-daem 20503             root  173uW     REG              253,0          0    3212552 /gnu/store/4llnxsgr6mxxj6pk21d9vbwaxnkbn99k-gnutls-3.5.18-doc.lock
guix-daem 20503             root  317uW     REG              253,0          0    3212602 /gnu/store/d645d919gp1zk8v65qa2r4ia5x1c5n6k-libstdc++-5.5.0.lock
guix-daem 20503             root  407uW     REG              253,0          0    3212655 /gnu/store/5hdpffkgkfszpsin2ir50zscz06x0mb5-coreutils-8.29-debug.lock
guix-daem 20503             root  410uW     REG              253,0          0    3212659 /gnu/store/aasfcliv85frq3aawpa4rw18nvlvl735-gettext-minimal-0.19.8.1.lock
guix-daem 20503             root  440uW     REG              253,0          0    3212686 /gnu/store/p262qk7zsd2wnlal6x1jax2xbqvdddqq-m4-1.4.18.lock
--8<---------------cut here---------------end--------------->8---

So for example:

  • 20501 tries to acquire
    aasfcliv85frq3aawpa4rw18nvlvl735-gettext-minimal-0.19.8.1.lock,
    which is held by 20503;

  • 20503 tries to acquire
    amcs7l0ynj1qg6fp9ll3asiamd4zsq75-m4-1.4.18.lock, which is held by
    20501.

This comes from the fact that ‘LocalStore::buildPaths’ takes the
user-supplied derivation list as is, without sorting it, and then
acquires locks in that order in ‘Worker::run’.

A topological sort (or maybe an alphanumeric sort?) should allow us to
guarantee that guix-daemon processes take locks in the same order, and
then don’t end up in a deadlock.

I discovered this bug while monitoring Cuirass on berlin: several
sessions submit batches of 200 derivations in ‘build-paths’ RPCs, and
sometimes most of the corresponding guix-daemon processes would end up
being stuck in a lock-acquiring loop.

Ludo’.

^ permalink raw reply	[flat|nested] 7+ messages in thread

* bug#31785: Multiple client 'build-paths' RPCs can lead to daemon deadlock
  2018-06-11 14:06 bug#31785: Multiple client 'build-paths' RPCs can lead to daemon deadlock Ludovic Courtès
@ 2019-09-06  9:04 ` Ludovic Courtès
  2020-11-04 15:21 ` Ludovic Courtès
  2024-12-21 16:22 ` Ludovic Courtès
  2 siblings, 0 replies; 7+ messages in thread
From: Ludovic Courtès @ 2019-09-06  9:04 UTC (permalink / raw)
  To: 31785

Here’s another example from berlin today:

--8<---------------cut here---------------start------------->8---
ludo@berlin$ sudo guix processes
SessionPID: 38649
ClientPID: 38611
ClientCommand: /gnu/store/7x9lrzs2l00mfcr3ya7hg1i06i7lq9pd-profile/bin/guile \ /home/mbakke/guix/master/scripts/guix build --no-grafts --keep-going --max-silent-time=36000 --timeout=216000 -s armhf-linux nmoldyn r-msnid r-yamss netcdf-fortran python-netcdf4
LockHeld: /gnu/store/v6hcm1gvv3gdfbnq6cd4kn1i6ip3y74s-netcdf-fortran-4.4.4.lock
LockHeld: /gnu/store/ydxl0ws9kk6ix58m99ib7qhi157yq49h-r-msnid-1.18.1.lock
ChildProcess: 33181: /gnu/store/9alic3caqhay3h8mx4iihpmyj6ymqpcx-guile-2.2.4/bin/guile \ /gnu/store/vyhfp0gj4rx74yw0ybalzj8bvmqgp9pr-guix-1.0.1-4.c902458/bin/.guix-real offload x86_64-linux 36000 1 216000

[...]

SessionPID: 44043
ClientPID: 44023
ClientCommand: /gnu/store/7x9lrzs2l00mfcr3ya7hg1i06i7lq9pd-profile/bin/guile \ /home/mbakke/guix/master/scripts/guix build --no-grafts --keep-going --max-silent-time=36000 --timeout=216000 -s armhf-linux netcdf-fortran r-msnid nmoldyn r-yamss
LockHeld: /gnu/store/69jjabypcpx83482rh3yp4a9ncjimw6b-r-yamss-1.10.0.lock
LockHeld: /gnu/store/ibdbwxb1i8yal8ilqxxgnzh6bxpinic1-nmoldyn-3.0.11.lock
ChildProcess: 48997: /gnu/store/9alic3caqhay3h8mx4iihpmyj6ymqpcx-guile-2.2.4/bin/guile \ /gnu/store/vyhfp0gj4rx74yw0ybalzj8bvmqgp9pr-guix-1.0.1-4.c902458/bin/.guix-real offload x86_64-linux 36000 1 216000

ludo@berlin ~$ sudo strace -p 38649 -e openat,fcntl
strace: Process 38649 attached
openat(AT_FDCWD, "/gnu/store/ibdbwxb1i8yal8ilqxxgnzh6bxpinic1-nmoldyn-3.0.11.lock", O_RDWR|O_CREAT, 0600) = 10
fcntl(10, F_GETFD)                      = 0
fcntl(10, F_SETFD, FD_CLOEXEC)          = 0
fcntl(10, F_SETLK, {l_type=F_WRLCK, l_whence=SEEK_SET, l_start=0, l_len=0}) = -1 EAGAIN (Resource temporarily unavailable)
openat(AT_FDCWD, "/gnu/store/69jjabypcpx83482rh3yp4a9ncjimw6b-r-yamss-1.10.0.lock", O_RDWR|O_CREAT, 0600) = 10
fcntl(10, F_GETFD)                      = 0
fcntl(10, F_SETFD, FD_CLOEXEC)          = 0
fcntl(10, F_SETLK, {l_type=F_WRLCK, l_whence=SEEK_SET, l_start=0, l_len=0}) = -1 EAGAIN (Resource temporarily unavailable)
^Cstrace: Process 38649 detached

ludo@berlin ~$ sudo strace -p 44043 -e openat,fcntl
strace: Process 44043 attached
openat(AT_FDCWD, "/gnu/store/ydxl0ws9kk6ix58m99ib7qhi157yq49h-r-msnid-1.18.1.lock", O_RDWR|O_CREAT, 0600) = 10
fcntl(10, F_GETFD)                      = 0
fcntl(10, F_SETFD, FD_CLOEXEC)          = 0
fcntl(10, F_SETLK, {l_type=F_WRLCK, l_whence=SEEK_SET, l_start=0, l_len=0}) = -1 EAGAIN (Resource temporarily unavailable)
^Cstrace: Process 44043 detached
--8<---------------cut here---------------end--------------->8---

A very palpable deadlock.

Ludo’.

^ permalink raw reply	[flat|nested] 7+ messages in thread

* bug#31785: Multiple client 'build-paths' RPCs can lead to daemon deadlock
  2018-06-11 14:06 bug#31785: Multiple client 'build-paths' RPCs can lead to daemon deadlock Ludovic Courtès
  2019-09-06  9:04 ` Ludovic Courtès
@ 2020-11-04 15:21 ` Ludovic Courtès
  2024-12-21 16:22 ` Ludovic Courtès
  2 siblings, 0 replies; 7+ messages in thread
From: Ludovic Courtès @ 2020-11-04 15:21 UTC (permalink / raw)
  To: 31785; +Cc: Mathieu Othacehe

Hi,

ludo@gnu.org (Ludovic Courtès) skribis:

> This comes from the fact that ‘LocalStore::buildPaths’ takes the
> user-supplied derivation list as is, without sorting it, and then
> acquires locks in that order in ‘Worker::run’.

This diagnostic is incorrect: ‘Goals’ is a set sorted according to
‘CompareGoalPtrs’, which is lexical sort that arranges so substitution
goals come before derivation goals.  Thus, ‘_topGoals’ and ‘awake2’ in
Worker::run are sorted in a deterministic fashion.


The problem is that ‘Worker::waitForAWhile’ reshuffles the order of
goals by temporarily moving goals out of the way.  This can happen when
offloading replies “postpone”, which is inherently non-deterministic
(which goals are put to sleep will vary from one session to another
session.)

When those goals are eventually woken up from ‘Worker::waitForInput’,
they’re reprocessed, in sorted order, but potentially with “holes”
compared to other ‘guix-daemon’ processes.

That’s only a partial explanation; we need to go further to come up with
an actual deadlock scenario.

Ludo’.




^ permalink raw reply	[flat|nested] 7+ messages in thread

* bug#31785: Multiple client 'build-paths' RPCs can lead to daemon deadlock
  2018-06-11 14:06 bug#31785: Multiple client 'build-paths' RPCs can lead to daemon deadlock Ludovic Courtès
  2019-09-06  9:04 ` Ludovic Courtès
  2020-11-04 15:21 ` Ludovic Courtès
@ 2024-12-21 16:22 ` Ludovic Courtès
  2024-12-21 17:08   ` Ludovic Courtès
  2 siblings, 1 reply; 7+ messages in thread
From: Ludovic Courtès @ 2024-12-21 16:22 UTC (permalink / raw)
  To: 31785

ludo@gnu.org (Ludovic Courtès) skribis:

> I tried running this:
>
>   guix build --max-jobs=200 $(guix gc -R $(guix build -d inkscape --no-grafts) | sort) & \
>   guix build --max-jobs=200 $(guix gc -R $(guix build -d inkscape --no-grafts) | sort -r)
>
> … also in parallel with this (for good measure):
>
>   guix build --max-jobs=200 $(guix gc -R $(guix build -d inkscape --no-grafts) | sort -R)
>
> Since we have 3 clients, that leads to 3 guix-daemon processes, and
> those are stuck in a deadlock:

This strikes again: ‘cuirass remote-worker’ processes on berlin
occasionally end up deadlocking in the exact same way.

When running ‘current remote-worker --workers=4’, 4 sessions (4 clients)
are used, which can lead to that situation, as in this example:

--8<---------------cut here---------------start------------->8---
root@hydra-guix-126 ~# guix processes |guix shell recutils -- recsel -p 'SessionPID,ClientCommand,LockHeld'
SessionPID: 27250
ClientCommand: /gnu/store/mfkz7fvlfpv3ppwbkv0imb19nrf95akf-guile-3.0.9/bin/guile --no-auto-compile -e main -s /gnu/store/ll18sc406b5cqapmvz17v22gh4sryb24-cuirass-1.2.0-11.e96f088/bin/.cuirass-real remote-worker --user=cuirass-worker --workers=4 --systems=x86_64-linux,i686-linux --publish-port=5558 --substitute-urls=http://141.80.167.131

SessionPID: 27269
ClientCommand: /gnu/store/mfkz7fvlfpv3ppwbkv0imb19nrf95akf-guile-3.0.9/bin/guile --no-auto-compile -e main -s /gnu/store/ll18sc406b5cqapmvz17v22gh4sryb24-cuirass-1.2.0-11.e96f088/bin/.cuirass-real remote-worker --user=cuirass-worker --workers=4 --systems=x86_64-linux,i686-linux --publish-port=5558 --substitute-urls=http://141.80.167.131
LockHeld: /gnu/store/72s7500g3zg2p6fjdc1paazvm1w2xdr2-libva-2.19.0.lock
LockHeld: /gnu/store/0bbnhq7bagn6sbj2lmapmdiiw50v3dgz-rav1e-0.7.1.lock

SessionPID: 27308
ClientCommand: /gnu/store/mfkz7fvlfpv3ppwbkv0imb19nrf95akf-guile-3.0.9/bin/guile --no-auto-compile -e main -s /gnu/store/ll18sc406b5cqapmvz17v22gh4sryb24-cuirass-1.2.0-11.e96f088/bin/.cuirass-real remote-worker --user=cuirass-worker --workers=4 --systems=x86_64-linux,i686-linux --publish-port=5558 --substitute-urls=http://141.80.167.131
LockHeld: /gnu/store/zf5w9ypk8il0i9y22n81aamypr2qgsmm-dav1d-1.5.0.lock

SessionPID: 27345
ClientCommand: /gnu/store/mfkz7fvlfpv3ppwbkv0imb19nrf95akf-guile-3.0.9/bin/guile --no-auto-compile -e main -s /gnu/store/ll18sc406b5cqapmvz17v22gh4sryb24-cuirass-1.2.0-11.e96f088/bin/.cuirass-real remote-worker --user=cuirass-worker --workers=4 --systems=x86_64-linux,i686-linux --publish-port=5558 --substitute-urls=http://141.80.167.131
LockHeld: /gnu/store/0xbi2bgq34yyx2fqjjwpgdv4gkfyaf60-gst-plugins-bad-minimal-1.22.3.lock
LockHeld: /gnu/store/ij5igi5xrp4sx6c78nbvg24lb4ma2f4l-libcbor-0.11.0.lock
LockHeld: /gnu/store/czfvm14yy517vb8w2hpp46nyrdrymqyp-libfido2-1.12.0.lock
LockHeld: /gnu/store/1ldcq0p20nqy7d3mxdy4yra1ax5ik3xc-mpg123-1.31.2.lock
LockHeld: /gnu/store/sadbf1fmb0n9k754x5jbbdklcxbjqlhx-openssh-9.9p1.lock
LockHeld: /gnu/store/86rl29llmb7s4sl3bx0vl465mmq7nk6f-gcr-3.41.2.lock

SessionPID: 27382
ClientCommand: /gnu/store/mfkz7fvlfpv3ppwbkv0imb19nrf95akf-guile-3.0.9/bin/guile --no-auto-compile -e main -s /gnu/store/ll18sc406b5cqapmvz17v22gh4sryb24-cuirass-1.2.0-11.e96f088/bin/.cuirass-real remote-worker --user=cuirass-worker --workers=4 --systems=x86_64-linux,i686-linux --publish-port=5558 --substitute-urls=http://141.80.167.131
--8<---------------cut here---------------end--------------->8---

Here process 27269 holds locks on libva and rav1e and waits forever
trying to get the dav1d lock, held by 27308; process 27308 tries to get
the rav1e lock; process 27345 tries to get the libva lock.

FWIW, each of them is trying to substitute (not build) those things, via
the ‘build-things’ call made after the “substituting ~a inputs for ~a”
message in remote-worker.

Ludo’.




^ permalink raw reply	[flat|nested] 7+ messages in thread

* bug#31785: Multiple client 'build-paths' RPCs can lead to daemon deadlock
  2024-12-21 16:22 ` Ludovic Courtès
@ 2024-12-21 17:08   ` Ludovic Courtès
  2024-12-21 22:43     ` Ludovic Courtès
  2024-12-24 13:08     ` Reepca Russelstein via Bug reports for GNU Guix
  0 siblings, 2 replies; 7+ messages in thread
From: Ludovic Courtès @ 2024-12-21 17:08 UTC (permalink / raw)
  To: 31785; +Cc: Christopher Baines, Reepca Russelstein

(Cc’ing Reepca + Chris for insight!  See
<https://issues.guix.gnu.org/31785> for context.)

Ludovic Courtès <ludo@gnu.org> skribis:

> Here process 27269 holds locks on libva and rav1e and waits forever
> trying to get the dav1d lock, held by 27308; process 27308 tries to get
> the rav1e lock; process 27345 tries to get the libva lock.

Additional data points:

  • All three store items are valid, yet client sessions are still stuck
    with locks held and trying to acquire the other locks;

  • The associated timestamp of these three store items in
    /var/guix/db/db.sqlite is the same (one-second accuracy);

  • The timestamps corresponds exactly to that of the “fetching path”
    messages in the log:

--8<---------------cut here---------------start------------->8---
root@hydra-guix-126 ~# grep -C3 -E "fetching.*(rav1e-0.7.1|libva-2.19.0|dav1d-1.5.0)'" /var/log/cuirass-remote-worker.log
2024-12-21 13:27:21  libgdata-0.18.1.tar.xz  832KiB    133.9MiB/s 00:00 ???????????????????? 100.0%
2024-12-21 13:27:21
2024-12-21 13:27:21 @ substituter-succeeded /gnu/store/9zbfmr41v0g4a5wm5s4yzwn8hg8l50b0-libgdata-0.18.1.tar.xz
2024-12-21 13:27:21 fetching path `/gnu/store/zf5w9ypk8il0i9y22n81aamypr2qgsmm-dav1d-1.5.0'...
2024-12-21 13:27:21 @ substituter-started /gnu/store/zf5w9ypk8il0i9y22n81aamypr2qgsmm-dav1d-1.5.0 substitute
2024-12-21 13:27:21 GmMxDR0c: substituting 14 inputs for /gnu/store/21m6i110abpzdjsb2kbz2c6sm8zy3zpx-sugar-cellgame-activity-5-1.4a22fd1.drv
2024-12-21 13:27:21 Downloading http://141.80.167.131/nar/lzip/zf5w9ypk8il0i9y22n81aamypr2qgsmm-dav1d-1.5.0...
--
2024-12-21 13:27:21  guile_gi-0.3.2.tar.gz  876KiB     124.3MiB/s 00:00 ???????????????????? 100.0%
2024-12-21 13:27:21
2024-12-21 13:27:21 @ substituter-succeeded /gnu/store/db700nxijpqn34a22nxpajbw2pwffkpv-guile_gi-0.3.2.tar.gz
2024-12-21 13:27:21 fetching path `/gnu/store/72s7500g3zg2p6fjdc1paazvm1w2xdr2-libva-2.19.0'...
2024-12-21 13:27:21 @ substituter-started /gnu/store/72s7500g3zg2p6fjdc1paazvm1w2xdr2-libva-2.19.0 substitute
2024-12-21 13:27:21 Downloading http://141.80.167.131/nar/zstd/ij5igi5xrp4sx6c78nbvg24lb4ma2f4l-libcbor-0.11.0...
2024-12-21 13:27:21
--
2024-12-21 13:27:21
2024-12-21 13:27:21 @ substituter-succeeded /gnu/store/ij5igi5xrp4sx6c78nbvg24lb4ma2f4l-libcbor-0.11.0
2024-12-21 13:27:21 Downloading http://141.80.167.131/nar/zstd/72s7500g3zg2p6fjdc1paazvm1w2xdr2-libva-2.19.0...
2024-12-21 13:27:21 fetching path `/gnu/store/0bbnhq7bagn6sbj2lmapmdiiw50v3dgz-rav1e-0.7.1'...
2024-12-21 13:27:21 @ substituter-started /gnu/store/0bbnhq7bagn6sbj2lmapmdiiw50v3dgz-rav1e-0.7.1 substitute
2024-12-21 13:27:21 waiting for locks or build slots...
2024-12-21 13:27:21
--8<---------------cut here---------------end--------------->8---

It would seem that the root cause is that locks aren’t released even
though substitution succeeded:

--8<---------------cut here---------------start------------->8---
root@hydra-guix-126 ~# grep -E '(zf5w9ypk8il0i9y22n81aamypr2qgsmm|72s7500g3zg2p6fjdc1paazvm1w2xdr2|0bbnhq7bagn6sbj2lmapmdiiw50v3dgz)' /var/log/cuirass-remote-worker.log
2024-12-21 13:27:21 fetching path `/gnu/store/zf5w9ypk8il0i9y22n81aamypr2qgsmm-dav1d-1.5.0'...
2024-12-21 13:27:21 @ substituter-started /gnu/store/zf5w9ypk8il0i9y22n81aamypr2qgsmm-dav1d-1.5.0 substitute
2024-12-21 13:27:21 Downloading http://141.80.167.131/nar/lzip/zf5w9ypk8il0i9y22n81aamypr2qgsm -dav1d-1.5.0...
2024-12-21 13:27:21 @ substituter-succeeded /gnu/store/zf5w9ypk8il0i9y22n81aamypr2qgsmm-dav1d-1.5.0
2024-12-21 13:27:21 fetching path `/gnu/store/72s7500g3zg2p6fjdc1paazvm1w2xdr2-libva-2.19.0'...
2024-12-21 13:27:21 @ substituter-started /gnu/store/72s7500g3zg2p6fjdc1paazvm1w2xdr2-libva-2.19.0 substitute
2024-12-21 13:27:21 Downloading http://141.80.167.131/nar/zstd/72s7500g3zg2p6fjdc1paazvm1w2xdr -libva-2.19.0...
2024-12-21 13:27:21 fetching path `/gnu/store/0bbnhq7bagn6sbj2lmapmdiiw50v3dgz-rav1e-0.7.1'...
2024-12-21 13:27:21 @ substituter-started /gnu/store/0bbnhq7bagn6sbj2lmapmdiiw50v3dgz-rav1e-0.7.1 substitute
2024-12-21 13:27:21 @ substituter-succeeded /gnu/store/72s7500g3zg2p6fjdc1paazvm1w2xdr2-libva-2.19.0
2024-12-21 13:27:21  gst-plugins-espeak-0.5.0-1.7f6e412  19KiB 4.5MiB/s 00:00 ???????????????????? 100.0%Downloading http://141.80.167.131/nar/zstd/0bbnhq7bagn6sbj2lmapmdiiw50v3dgz-rav1e-0.7.1...
2024-12-21 13:27:21 @ substituter-succeeded /gnu/store/0bbnhq7bagn6sbj2lmapmdiiw50v3dgz-rav1e-0.7.1
--8<---------------cut here---------------end--------------->8---

Thoughts?

Ludo’.




^ permalink raw reply	[flat|nested] 7+ messages in thread

* bug#31785: Multiple client 'build-paths' RPCs can lead to daemon deadlock
  2024-12-21 17:08   ` Ludovic Courtès
@ 2024-12-21 22:43     ` Ludovic Courtès
  2024-12-24 13:08     ` Reepca Russelstein via Bug reports for GNU Guix
  1 sibling, 0 replies; 7+ messages in thread
From: Ludovic Courtès @ 2024-12-21 22:43 UTC (permalink / raw)
  To: 31785; +Cc: Christopher Baines, Reepca Russelstein

Ludovic Courtès <ludo@gnu.org> skribis:

> It would seem that the root cause is that locks aren’t released even
> though substitution succeeded:

<https://github.com/NixOS/nix/issues/178> was about an issue that looks
exactly the same, but the fix is already included in our copy of
‘build.cc’.

Ludo’.




^ permalink raw reply	[flat|nested] 7+ messages in thread

* bug#31785: Multiple client 'build-paths' RPCs can lead to daemon deadlock
  2024-12-21 17:08   ` Ludovic Courtès
  2024-12-21 22:43     ` Ludovic Courtès
@ 2024-12-24 13:08     ` Reepca Russelstein via Bug reports for GNU Guix
  1 sibling, 0 replies; 7+ messages in thread
From: Reepca Russelstein via Bug reports for GNU Guix @ 2024-12-24 13:08 UTC (permalink / raw)
  To: Ludovic Courtès; +Cc: Christopher Baines, 31785


[-- Attachment #1.1: Type: text/plain, Size: 7370 bytes --]


(note: the following includes some "thinking out loud", skip to the
patch if you're in a hurry)

I'm a bit puzzled how exactly a true "forward-progress-not-possible"
deadlock can even occur, especially in the manner you describe.  The
derivation graph is inherently a directed acyclic graph, so as long as
locks are only acquired for a given derivation after all of its inputs
(or references, for substitutions) are present, and are released once
the desired outputs are present, it should only even theoretically be
possible to have a deadlock between different outputs of the same
derivation.

These requirements /seem/ to be satisfied by build.cc.  Path locks are
only acquired in DerivationGoal::tryToBuild and
SubstitutionGoal::tryToRun, and the former is only reachable through
DerivationGoal::inputsRealised (and DerivationGoal::buildDone in case of
multiple rounds), and the latter is only reachable through
SubstitutionGoal::referencesValid.  As their names imply, having made it
through them should suffice to prove that the inputs and references are
present.

That leaves the requirement that the locks are released once the desired
outputs are present.  It /is/ possible for a lock to stick around longer
than desired (along with many other bad things) in
DerivationGoal::tryToBuild, but only if a cached build failure is
encountered, to my knowledge (notice how there is no call to done,
amDone, or any rescheduling procedure in that case; the running
DerivationGoal just drops off the face of the earth and tells nobody,
including any goal waiting on it).  This also happens in
DerivationGoal::haveDerivation.

I assume you're not using cached build failures, right?  It defaults to
off, so it should only be in use if you explicitly passed
--cache-failures to guix-daemon (or had your client pass a value of
"true" for "build-cache-failure", but I don't see that string anywhere
in the guile side of guix).

One detail that worries me is that SubstitutionGoal.outputLock is a
std::shared_ptr<PathLocks> instead of a PathLocks.  I cannot for the
life of me figure out why this is the case, since no sharing of
ownership of the underlying PathLocks seems to ever be done.
outputLock.reset() seems to be used as if it were a synonym for
outputLock.unlock() in many places, so whoever wrote those uses probably
thought the same.  As far as I can tell, it *should* always cause
PathLocks::unlock to be called, so it shouldn't be an issue, it just
worries me that I don't understand why it's done that way.

Ahh, I think I may have just found another place in
DerivationGoal::tryToBuild that might be the source of your issues.
First, note the comment:

/* Obtain locks on all output paths.  The locks are automatically
   released when we exit this function or the client crashes. ... */

Well, as I noticed earlier in the cached build failures case, this isn't
exactly accurate, and there's probably a reason why.  The comment seems
to believe specifically that leaving the scope of tryToBuild will
automatically release the locks, which suggests to me that at one point
outputLocks was a local variable, rather than a field of DerivationGoal.
This theorized older version would be consistent with what we see in
this (current) snippet:

    if (buildMode != bmCheck && validPaths.size() == drv.outputs.size()) {
        debug(format("skipping build of derivation `%1%', someone beat us to it") % drvPath);
        outputLocks.setDeletion(true);
        done(BuildResult::AlreadyValid);
        return;
    }

If leaving the scope of tryToBuild would cause outputLocks.unlock to
run, then outputLocks.setDeletion is all that would need to be called.
But since that isn't the case, this will cause the lock to be held for
as long as the DerivationGoal exists.

(note that I haven't actually checked the commit history to see whether
outputLocks was or wasn't at some point a local variable - I have no
idea why the commenter thought returning would automatically release it)

Perhaps this same error was made elsewhere.  C-s to see where else
setDeletion is used without an accompanying unlock and... sure enough,
in SubstitutionGoal::tryToRun, we see:

    /* Check again whether the path is invalid. */
    if (!repair && worker.store.isValidPath(storePath)) {
        debug(format("store path `%1%' has become valid") % storePath);
        outputLock->setDeletion(true);
        amDone(ecSuccess);
        return;
    }

Now outputLock will be held for as long as this SubstitutionGoal exists.

The Nix issue that you identified as closely resembling what you were
encountering was resolved with this commit message
(https://github.com/NixOS/nix/commit/29cde917fe6b8f2e669c8bf10b38f640045c83b8):

-------------------------------------------------------------------
Fix deadlock in SubstitutionGoal

We were relying on SubstitutionGoal's destructor releasing the lock,
but if a goal is a top-level goal, the destructor won't run in a
timely manner since its reference count won't drop to zero.  So
release it explicitly.
-------------------------------------------------------------------

Hmmm.  Destructors not running in a timely manner, and in this
particular case - which is guaranteed to occur when one guix-daemon
process gets blocked by another holding a lock and the one initially
holding the lock produces the output path - the only way the locks get
released is when the destructors are run.  I do believe this may be
related!

One other possible reason for destructors not running in as timely a
manner as one might hope (though probably not as severely as for
top-level goals) would be that Worker::run uses a ready queue, awake2,
of type Goals, which contains strong references.  This ensures that any
Goal on this scheduler turn will not be destroyed, even if it has
already run, until the end of this scheduler turn is reached.  This
probably isn't the cause for what you're seeing, but this kind of detail
matters a lot when so much is implicitly tied to the lifetime of
references.  This could be corrected by popping off Goals one at a time
from awake2.

Patch attached for DerivationGoal::tryToBuild and
SubstitutionGoal::tryToRun.  I haven't tested it beyond verifying that
it compiles, but I've got a pretty good feeling™ about it.  The commit
message includes discussion of a concrete scenario in which this could
cause a deadlock.  Interestingly enough, it seems to require at least 3
guix-daemon processes running during the "inciting event", though now
that I think about it, the third doesn't need to be a persistent part of
the deadlock.

I did a brief glance over what I could find of Nix's corresponding code
(of course much has changed), and it looks like there is a chance that
at least their DerivationGoal::tryToBuild in
src/libstore/build/derivation-goal.cc is affected by the same issue (you
see the same pattern of calling outputLocks.setDeletion but not
outputLocks.unlock in the same case).  I couldn't quickly determine
whether it was possible their substitutes were affected, as that code
has more significantly diverged and I see no mention of locks in it.
You may want to give them a nudge and see if this information is of use
to them.

Merry Christmas.

- reepca

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #1.2: 0001-nix-build.cc-explicitly-unlock-in-the-has-become-val.patch --]
[-- Type: text/x-patch, Size: 3298 bytes --]

From 2030f198892f972ee6bc2fb8529cbd6afb6bc9ad Mon Sep 17 00:00:00 2001
From: Reepca Russelstein <reepca@russelstein.xyz>
Date: Tue, 24 Dec 2024 05:40:58 -0600
Subject: [PATCH] nix: build.cc: explicitly unlock in the has-become-valid
 case.

(hopefully) fixes #31785.  Similar to Nix issue #178, fixed in
https://github.com/NixOS/nix/commit/29cde917fe6b8f2e669c8bf10b38f640045c83b8.

We can't rely on Goal deletion to release our locks in a timely manner.  In
the case in which multiple guix-daemon processes simultaneously try producing
an output path path1, the one that gets there first (P1) will get the lock,
and the second one (P2) will continue trying to acquire the lock until it is
released.  Once it has acquired the lock, it checks to see whether the path
has already become valid in the meantime, and if so it reports success to
those Goals waiting on its completion and finishes.  Unfortunately, it fails
to release the locks it holds first, so those stay held until that Goal gets
deleted.

Suppose we have the following store path dependency graph:

          path4
      /     |     \
   path1   path2   path3

P2 is now sitting on path1's lock, and will continue to do so until path4 is
completed.  Suppose there is also a P3, and it has been blocked while P1
builds path2.  Now P3 is sitting on path2's lock, and can't acquire path1's
lock to determine that it has been completed.  Likewise P2 is sitting on
path1's lock, and now can't acquire path2's lock to determine that it has been
completed.  Finally, P3 completes path3 while P2 is blocked.

Now:

- P1 knows that path1 and path2 are complete, and holds no locks, but can't
  determine that path3 is complete
- P2 knows that path1 and path3 are complete, and holds locks on path1 and
  path3, but can't determine that path2 is complete
- P3 knows that path2 and path3 are complete, and holds a lock on path2, but
  can't determine that path1 is complete

And none of these locks will be released until path4 is complete.  Thus, we
have a deadlock.

To resolve this, we should explicitly release these locks as soon as they
should be released.

* nix/libstore/build.cc
  (DerivationGoal::tryToBuild, SubstitutionGoal::tryToRun):
  Explicitly release locks in the has-become-valid case.

Change-Id: Ie510f84828892315fe6776c830db33d0f70bcef8
---
 nix/libstore/build.cc | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/nix/libstore/build.cc b/nix/libstore/build.cc
index 43a8a371846..edd01bab34d 100644
--- a/nix/libstore/build.cc
+++ b/nix/libstore/build.cc
@@ -1200,6 +1200,7 @@ void DerivationGoal::tryToBuild()
     if (buildMode != bmCheck && validPaths.size() == drv.outputs.size()) {
         debug(format("skipping build of derivation `%1%', someone beat us to it") % drvPath);
         outputLocks.setDeletion(true);
+        outputLocks.unlock();
         done(BuildResult::AlreadyValid);
         return;
     }
@@ -3070,6 +3071,7 @@ void SubstitutionGoal::tryToRun()
     if (!repair && worker.store.isValidPath(storePath)) {
         debug(format("store path `%1%' has become valid") % storePath);
         outputLock->setDeletion(true);
+        outputLock.reset();
         amDone(ecSuccess);
         return;
     }
-- 
2.45.2


[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 519 bytes --]

^ permalink raw reply related	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2024-12-24 13:15 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2018-06-11 14:06 bug#31785: Multiple client 'build-paths' RPCs can lead to daemon deadlock Ludovic Courtès
2019-09-06  9:04 ` Ludovic Courtès
2020-11-04 15:21 ` Ludovic Courtès
2024-12-21 16:22 ` Ludovic Courtès
2024-12-21 17:08   ` Ludovic Courtès
2024-12-21 22:43     ` Ludovic Courtès
2024-12-24 13:08     ` Reepca Russelstein via Bug reports for GNU Guix

Code repositories for project(s) associated with this external index

	https://git.savannah.gnu.org/cgit/guix.git

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.