From 6ddc5744fe7402d33df363b7a88e77b57e601b5f Mon Sep 17 00:00:00 2001 From: K Pamnany Date: Mon, 26 Aug 2024 12:42:34 -0400 Subject: [PATCH 1/6] Add heartbeat pause/resume capability --- src/threading.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 46 insertions(+), 2 deletions(-) diff --git a/src/threading.c b/src/threading.c index 8f350d41f64b1..95202a9e2ceac 100644 --- a/src/threading.c +++ b/src/threading.c @@ -1008,6 +1008,40 @@ JL_DLLEXPORT int jl_heartbeat_enable(int heartbeat_s, int show_tasks_after_n, return 0; } +// temporarily pause the heartbeat thread +JL_DLLEXPORT int jl_heartbeat_pause(void) +{ + if (!heartbeat_enabled) { + return -1; + } + heartbeat_enabled = 0; + return 0; +} + +// resume the paused heartbeat thread +JL_DLLEXPORT int jl_heartbeat_resume(void) +{ + // cannot resume if the heartbeat thread is already running + if (heartbeat_enabled) { + return -1; + } + + // cannot resume if we weren't paused (disabled != paused) + if (heartbeat_interval_s == 0) { + return -1; + } + + // heartbeat thread must be ready + if (uv_sem_trywait(&heartbeat_off_sem) != 0) { + return -1; + } + n_hbs_missed = 0; + n_hbs_recvd = 0; + heartbeat_enabled = 1; + uv_sem_post(&heartbeat_on_sem); // wake the heartbeat thread + return 0; +} + // heartbeat JL_DLLEXPORT void jl_heartbeat(void) { @@ -1099,7 +1133,7 @@ void jl_heartbeat_threadfun(void *arg) uv_sem_post(&heartbeat_off_sem); // sleep the thread here; this semaphore is posted in - // jl_heartbeat_enable() + // jl_heartbeat_enable() or jl_heartbeat_resume() uv_sem_wait(&heartbeat_on_sem); // Set the sleep duration. @@ -1111,7 +1145,7 @@ void jl_heartbeat_threadfun(void *arg) // heartbeat is enabled; sleep, waiting for the desired interval sleep_for(s, ns); - // if heartbeats were turned off while we were sleeping, reset + // if heartbeats were turned off/paused while we were sleeping, reset if (!heartbeat_enabled) { continue; } @@ -1150,6 +1184,16 @@ JL_DLLEXPORT int jl_heartbeat_enable(int heartbeat_s, int show_tasks_after_n, return -1; } +JL_DLLEXPORT int jl_heartbeat_pause(void) +{ + return -1; +} + +JL_DLLEXPORT int jl_heartbeat_resume(void) +{ + return -1; +} + JL_DLLEXPORT void jl_heartbeat(void) { } From fb47e0e5bbaab9b8340ba8c7299900238cd02a34 Mon Sep 17 00:00:00 2001 From: K Pamnany Date: Wed, 18 Sep 2024 15:47:56 -0400 Subject: [PATCH 2/6] Add check to avoid negative sleep duration --- src/threading.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/threading.c b/src/threading.c index 95202a9e2ceac..45161e32c5ed9 100644 --- a/src/threading.c +++ b/src/threading.c @@ -1156,13 +1156,15 @@ void jl_heartbeat_threadfun(void *arg) tchb = jl_hrtime() - t0; // adjust the next sleep duration based on how long the heartbeat - // check took + // check took, but if it took too long then use the normal duration rs = 1; while (tchb > 1e9) { rs++; tchb -= 1e9; } - s = heartbeat_interval_s - rs; + if (rs < heartbeat_interval_s) { + s = heartbeat_interval_s - rs; + } ns = 1e9 - tchb; } } From 6fe40c5feb2286ecd1554a5f05d4d87b01caed15 Mon Sep 17 00:00:00 2001 From: K Pamnany Date: Wed, 18 Sep 2024 15:51:45 -0400 Subject: [PATCH 3/6] Disable heartbeats in `jl_print_task_backtraces()` `jl_print_task_backtraces()` can take long enough that there can be heartbeat loss, which can trigger printing task backtraces again, unless it is called from the heartbeat thread which takes care of that possible problem. --- src/stackwalk.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/src/stackwalk.c b/src/stackwalk.c index 8a12fb2a28143..a7b0cdf4d0264 100644 --- a/src/stackwalk.c +++ b/src/stackwalk.c @@ -1166,10 +1166,20 @@ JL_DLLEXPORT void jl_print_backtrace(void) JL_NOTSAFEPOINT } extern int gc_first_tid; +extern int jl_inside_heartbeat_thread(void); +extern int jl_heartbeat_pause(void); +extern int jl_heartbeat_resume(void); -// Print backtraces for all live tasks, for all threads, to jl_safe_printf stderr +// Print backtraces for all live tasks, for all threads, to jl_safe_printf +// stderr. This can take a _long_ time! JL_DLLEXPORT void jl_print_task_backtraces(int show_done) JL_NOTSAFEPOINT { + // disable heartbeats to prevent heartbeat loss while running this, + // unless this is called from the heartbeat thread + if (!jl_inside_heartbeat_thread()) { + jl_heartbeat_pause(); + } + size_t nthreads = jl_atomic_load_acquire(&jl_n_threads); jl_ptls_t *allstates = jl_atomic_load_relaxed(&jl_all_tls_states); int ctid = -1; @@ -1232,6 +1242,10 @@ JL_DLLEXPORT void jl_print_task_backtraces(int show_done) JL_NOTSAFEPOINT jl_safe_printf("thread (%d) ==== End thread %d\n", ctid, ptls2->tid + 1); } jl_safe_printf("thread (%d) ++++ Done\n", ctid); + + if (!jl_inside_heartbeat_thread()) { + jl_heartbeat_resume(); + } } #ifdef __cplusplus From 0602dcc2ff4e02abdcfcea48fe9a0e2bce3534ff Mon Sep 17 00:00:00 2001 From: K Pamnany Date: Wed, 18 Sep 2024 15:58:40 -0400 Subject: [PATCH 4/6] Pause heartbeats for GC --- src/gc.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/gc.c b/src/gc.c index dad5768732545..ccafc05e02621 100644 --- a/src/gc.c +++ b/src/gc.c @@ -3734,6 +3734,9 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection) return recollect; } +extern int jl_heartbeat_pause(void); +extern int jl_heartbeat_resume(void); + JL_DLLEXPORT void jl_gc_collect(jl_gc_collection_t collection) { JL_PROBE_GC_BEGIN(collection); @@ -3775,6 +3778,7 @@ JL_DLLEXPORT void jl_gc_collect(jl_gc_collection_t collection) // existence of the thread in the jl_n_threads count. // // TODO: concurrently queue objects + jl_heartbeat_pause(); jl_fence(); gc_n_threads = jl_atomic_load_acquire(&jl_n_threads); gc_all_tls_states = jl_atomic_load_relaxed(&jl_all_tls_states); @@ -3806,6 +3810,7 @@ JL_DLLEXPORT void jl_gc_collect(jl_gc_collection_t collection) gc_n_threads = 0; gc_all_tls_states = NULL; + jl_heartbeat_resume(); jl_safepoint_end_gc(); jl_gc_state_set(ptls, old_state, JL_GC_STATE_WAITING); JL_PROBE_GC_END(); From 3fe564d3af1072a85e82d7c6eb39011abe55ea9d Mon Sep 17 00:00:00 2001 From: K Pamnany Date: Wed, 18 Sep 2024 17:28:43 -0400 Subject: [PATCH 5/6] Address review comment --- src/threading.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/threading.c b/src/threading.c index 45161e32c5ed9..d89e8d09745fc 100644 --- a/src/threading.c +++ b/src/threading.c @@ -1035,8 +1035,13 @@ JL_DLLEXPORT int jl_heartbeat_resume(void) if (uv_sem_trywait(&heartbeat_off_sem) != 0) { return -1; } + + // reset state as we've been paused n_hbs_missed = 0; n_hbs_recvd = 0; + tasks_showed = 0; + + // resume heartbeat_enabled = 1; uv_sem_post(&heartbeat_on_sem); // wake the heartbeat thread return 0; From fd8eedf307a1131f4d69c406a46b0e865eb986bd Mon Sep 17 00:00:00 2001 From: K Pamnany Date: Thu, 19 Sep 2024 13:30:03 -0400 Subject: [PATCH 6/6] Address review comment --- src/stackwalk.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/stackwalk.c b/src/stackwalk.c index a7b0cdf4d0264..ab3f13c8e9746 100644 --- a/src/stackwalk.c +++ b/src/stackwalk.c @@ -1175,7 +1175,9 @@ extern int jl_heartbeat_resume(void); JL_DLLEXPORT void jl_print_task_backtraces(int show_done) JL_NOTSAFEPOINT { // disable heartbeats to prevent heartbeat loss while running this, - // unless this is called from the heartbeat thread + // unless this is called from the heartbeat thread itself; in that + // situation, the thread is busy running this and it will not be + // updating the missed heartbeats counter if (!jl_inside_heartbeat_thread()) { jl_heartbeat_pause(); }