I've been seeing all sorts of deadlocks in guile, and so I wrote a small debugging utility to try to track down the problems. I'd like to see this patch included in future versions of guile. I found one bug with this tool, and have submitted a patch for that already. It looks like there's another bug involving signals -- there is a probably deadlock in garbage collection if a signal is sent at just the wrong time. The bug can be seen by enabling this patch, and then running 'make check'. I'm going to ignore this, as I'm not worried about signals right now. This is my "final" version of this patch, I'd sent a beta version a few days ago. Its "final" because I'm not anticipating any further changes. --linas Add a deadlock debugging facility to guile. Signed-off-by: Linas Vepstas --- config.h.in | 3 configure.in | 11 +++ libguile/Makefile.am | 2 libguile/debug-locks.c | 159 +++++++++++++++++++++++++++++++++++++++++++++ libguile/pthread-threads.h | 15 ++++ libguile/threads.c | 53 +++++++++++++++ libguile/threads.h | 8 ++ 7 files changed, 250 insertions(+), 1 deletion(-) Index: guile-1.8.5/libguile/pthread-threads.h =================================================================== --- guile-1.8.5.orig/libguile/pthread-threads.h 2008-11-16 18:57:19.000000000 -0600 +++ guile-1.8.5/libguile/pthread-threads.h 2008-11-16 18:57:48.000000000 -0600 @@ -91,6 +91,21 @@ extern pthread_mutexattr_t scm_i_pthread #define scm_i_scm_pthread_cond_wait scm_pthread_cond_wait #define scm_i_scm_pthread_cond_timedwait scm_pthread_cond_timedwait +#ifdef GUILE_DEBUG_LOCKS +#undef scm_i_pthread_mutex_lock +#define scm_i_pthread_mutex_lock(ARG) scm_i_pthread_mutex_lock_dbg(ARG, #ARG) + +#undef scm_i_pthread_mutex_unlock +#define scm_i_pthread_mutex_unlock(ARG) scm_i_pthread_mutex_unlock_dbg(ARG, #ARG) + +int scm_i_pthread_mutex_lock_dbg(pthread_mutex_t *, const char *); +int scm_i_pthread_mutex_unlock_dbg(pthread_mutex_t *, const char *); + +void prt_lockholders(void); +void prt_this_lockholder(void); + +#endif + #endif /* SCM_PTHREADS_THREADS_H */ /* Index: guile-1.8.5/libguile/threads.c =================================================================== --- guile-1.8.5.orig/libguile/threads.c 2008-11-16 18:57:19.000000000 -0600 +++ guile-1.8.5/libguile/threads.c 2008-11-16 18:57:48.000000000 -0600 @@ -441,6 +441,24 @@ guilify_self_1 (SCM_STACKITEM *base) SCM_SET_FREELIST_LOC (scm_i_freelist, &t->freelist); SCM_SET_FREELIST_LOC (scm_i_freelist2, &t->freelist2); +#ifdef GUILE_DEBUG_LOCKS + int i, j; + for(i=0; ilockname[i] = NULL; + t->lockmutex[i] = NULL; + for(j=0; jlockholder[i][j] = NULL; + } + } + if (scm_initialized_p == 0) + { + t->lockname[0] = "&scm_i_init_mutex"; + t->lockmutex[0] = &scm_i_init_mutex; + } +#endif + scm_i_pthread_setspecific (scm_i_thread_key, t); scm_i_pthread_mutex_lock (&t->heap_mutex); @@ -1624,8 +1642,21 @@ scm_i_thread_wake_up () scm_i_thread *t; scm_i_pthread_cond_broadcast (&wake_up_cond); +#ifndef GUILE_DEBUG_LOCKS for (t = all_threads; t; t = t->next_thread) scm_i_pthread_mutex_unlock (&t->heap_mutex); +#else + /* Unlock in reverse order from locking */ + scm_i_thread *tt = NULL; + while (tt != all_threads) + { + scm_i_thread *tp = NULL; + for (t = all_threads; t != tt; t = t->next_thread) + tp = t; + scm_i_pthread_mutex_unlock (&tp->heap_mutex); + tt = tp; + } +#endif scm_i_pthread_mutex_unlock (&thread_admin_mutex); scm_enter_guile ((scm_t_guile_ticket) SCM_I_CURRENT_THREAD); } @@ -1721,6 +1752,28 @@ scm_init_threads_default_dynamic_state ( scm_i_default_dynamic_state = scm_permanent_object (state); } +#ifdef GUILE_DEBUG_LOCKS +extern int guile_do_abort_on_badlock; + +void prt_one_lockholder(scm_i_thread *); +void prt_lockholders(void) +{ + scm_i_thread *t; + + if (!guile_do_abort_on_badlock) return; + + for (t = all_threads; t; t = t->next_thread) + { + prt_one_lockholder(t); + } +} + +void prt_this_lockholder(void) +{ + prt_one_lockholder(SCM_I_CURRENT_THREAD); +} +#endif + void scm_init_thread_procs () { Index: guile-1.8.5/libguile/threads.h =================================================================== --- guile-1.8.5.orig/libguile/threads.h 2008-11-16 18:57:19.000000000 -0600 +++ guile-1.8.5/libguile/threads.h 2008-11-16 18:57:48.000000000 -0600 @@ -108,6 +108,14 @@ typedef struct scm_i_thread { SCM_STACKITEM *top; jmp_buf regs; +#ifdef GUILE_DEBUG_LOCKS +#define LOCK_STACK_DEPTH 250 +#define TRACE_STACK_DEPTH 4 + const char *lockname[LOCK_STACK_DEPTH]; + char *lockholder[LOCK_STACK_DEPTH][TRACE_STACK_DEPTH]; + pthread_mutex_t *lockmutex[LOCK_STACK_DEPTH]; +#endif + } scm_i_thread; #define SCM_I_IS_THREAD(x) SCM_SMOB_PREDICATE (scm_tc16_thread, x) Index: guile-1.8.5/configure.in =================================================================== --- guile-1.8.5.orig/configure.in 2008-11-16 18:57:19.000000000 -0600 +++ guile-1.8.5/configure.in 2008-11-16 18:57:48.000000000 -0600 @@ -122,6 +122,13 @@ AC_ARG_ENABLE(debug-malloc, [Define this if you want to debug scm_must_malloc/realloc/free calls.]) fi) +AC_ARG_ENABLE(debug-locks, + [ --enable-debug-locks include thread lock debugging code], + if test "$enable_debug_locks" = y || test "$enable_debug_locks" = yes; then + AC_DEFINE(GUILE_DEBUG_LOCKS, 1, + [Define this if you want to debug pthread lock nesting and deadlock trouble.]) + fi) + SCM_I_GSC_GUILE_DEBUG=0 AC_ARG_ENABLE(guile-debug, [AC_HELP_STRING([--enable-guile-debug], @@ -263,6 +270,10 @@ if test "$enable_debug_malloc" = yes; th AC_LIBOBJ([debug-malloc]) fi +if test "$enable_debug_locks" = yes; then + AC_LIBOBJ([debug-locks]) +fi + if test "$enable_elisp" = yes; then SCM_I_GSC_ENABLE_ELISP=1 else Index: guile-1.8.5/config.h.in =================================================================== --- guile-1.8.5.orig/config.h.in 2008-11-16 18:57:19.000000000 -0600 +++ guile-1.8.5/config.h.in 2008-11-16 18:57:48.000000000 -0600 @@ -42,6 +42,9 @@ Boston, MA 02110-1301, USA. /* Define this if you want to debug scm_must_malloc/realloc/free calls. */ #undef GUILE_DEBUG_MALLOC +/* Define this if you want to debug thread locking and deadlocks. */ +#undef GUILE_DEBUG_LOCKS + /* The imaginary unit (positive square root of -1). */ #undef GUILE_I Index: guile-1.8.5/libguile/Makefile.am =================================================================== --- guile-1.8.5.orig/libguile/Makefile.am 2008-11-16 18:57:19.000000000 -0600 +++ guile-1.8.5/libguile/Makefile.am 2008-11-16 18:57:48.000000000 -0600 @@ -159,7 +159,7 @@ EXTRA_libguile_la_SOURCES = _scm.h \ inet_aton.c memmove.c putenv.c strerror.c \ dynl.c regex-posix.c \ filesys.c posix.c net_db.c socket.c \ - debug-malloc.c mkstemp.c \ + debug-locks.c debug-malloc.c mkstemp.c \ win32-uname.c win32-dirent.c win32-socket.c ## delete guile-snarf.awk from the installation bindir, in case it's Index: guile-1.8.5/libguile/debug-locks.c =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ guile-1.8.5/libguile/debug-locks.c 2008-11-16 20:36:22.000000000 -0600 @@ -0,0 +1,159 @@ +/* Copyright (C) 2008 Free Software Foundation, Inc. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ +/* + * Utilities for tracing/debugging deadlocks. Conditionally compiled by + * requesting ./configure --enable-debug-locks. The functions here + * replace the pthred mutex lock and unlock routines to record where + * a lock is taken, by recording a short snippet of the stack. The + * logs of held locks are on a thread-by-thread basis, so that deadlocks + * across different threads can be debugged. + * + * The primary check is to make sure that locks are always unlocked in + * reverse order (nested order), as out-of-sequence locks typically + * result in deadlocks. One exception is made: if the lock is the + * thread heap_mutex lock (i.e. the lock that defines "guile mode"), + * its allowed to be unlocked in reversed sequence, as a special case. + * If a bad unlock sequence is detected, then abort is called, to put + * the system into the debugger. + * + * If guile still deadlocks, without triggering an error, you might + * find the prt_lockholders() function to be useful: from within gdb, + * just say "call prt_lockholders()" to list all locks held by all + * threads. + * + * CAUTION: Turning this on leads to a *severe* performance degradation. + */ + +#include +#include +#include "_scm.h" + +extern void prt_lockholders(void); + +int guile_do_abort_on_badlock = 0; + +void prt_one_lockholder(scm_i_thread *); +void prt_one_lockholder(scm_i_thread *t) +{ + int i, j; + + fprintf (stderr, "\nThread %p\n", (void *) t->pthread); + for (i=0; ilockname[i]) break; + fprintf(stderr, "%d: %s (%p) in:\n", i, t->lockname[i], t->lockmutex[i]); + for (j=0; jlockholder[i][j]) break; + fprintf(stderr, "\t%s\n", t->lockholder[i][j]); + } + } +} + + +int scm_i_pthread_mutex_lock_dbg(pthread_mutex_t *mtx, const char *lockstr) +{ + int i,j; + int rc = pthread_mutex_lock(mtx); + scm_i_thread *tp = SCM_I_CURRENT_THREAD; + + if (NULL == tp) + return rc; + + for (i=0; ilockname[i]) + { + void * b[TRACE_STACK_DEPTH+1]; + int sz = backtrace(b, TRACE_STACK_DEPTH+1); + char **s = backtrace_symbols(b, sz); + for (j=0; jlockholder[i][j] = strdup(s[j+1]); + } + tp->lockname[i] = lockstr; + tp->lockmutex[i] = mtx; + free (s); + break; + } + } + + if (LOCK_STACK_DEPTH <= i) + { + fprintf(stderr, "Error: thread is holding too many locks\n"); + prt_lockholders(); + if (guile_do_abort_on_badlock) abort(); + } + + return rc; +} + +int scm_i_pthread_mutex_unlock_dbg(pthread_mutex_t *mtx, const char * lockstr) +{ + int i,j; + + scm_i_thread *tp = SCM_I_CURRENT_THREAD; + if (NULL == tp) + return pthread_mutex_unlock(mtx); + + for(i=LOCK_STACK_DEPTH-1; i>=0; i--) + { + if (0x0 == tp->lockname[i]) + continue; + + /* Allows the crazy nested two-step invloving the heap_mutex */ + if ((tp->lockmutex[i] != mtx) && + ((tp->lockmutex[i] != &tp->heap_mutex) || (0 == i) || + (tp->lockmutex[i-1] != mtx))) + { + fprintf(stderr, "Error: unlocking is badly nested: " + "Attempting to unlock %s (%p)\n", + lockstr, mtx); + prt_one_lockholder(tp); + // prt_lockholders(); + if (guile_do_abort_on_badlock) abort(); + } + if (tp->lockmutex[i-1] == mtx) + { + tp->lockname[i-1] = tp->lockname[i]; + tp->lockmutex[i-1] = tp->lockmutex[i]; + for (j=0; jlockholder[i-1][j]; + tp->lockholder[i-1][j] = tp->lockholder[i][j]; + tp->lockholder[i][j] = tmp; + } + } + tp->lockname[i] = NULL; + tp->lockmutex[i] = NULL; + for (j=0; jlockholder[i][j]) free(tp->lockholder[i][j]); + tp->lockholder[i][j] = NULL; + } + break; + } + + if (0 > i) + { + fprintf(stderr, "Error: unlocking a lock that's not held\n"); + prt_lockholders(); + if (guile_do_abort_on_badlock) abort(); + } + + return pthread_mutex_unlock(mtx); +}