From aef2ba041b7d71d663599d90b1b837e95ad9e8a6 Mon Sep 17 00:00:00 2001 From: gbaraldi Date: Fri, 30 Aug 2024 15:17:54 -0300 Subject: [PATCH 1/7] Implement parallel sweeping of stack pools + use a round robin to only return stacks one thread at a time to avoid contention on munmap syscalls --- src/gc-stacks.c | 53 +++++++++++++++++++++++++++++-------------------- src/gc-stock.c | 47 ++++++++++++++++++++++++++++++++++++++----- 2 files changed, 74 insertions(+), 26 deletions(-) diff --git a/src/gc-stacks.c b/src/gc-stacks.c index 783129ea97693..0d17eafbcfd1e 100644 --- a/src/gc-stacks.c +++ b/src/gc-stacks.c @@ -202,6 +202,10 @@ JL_DLLEXPORT void *jl_malloc_stack(size_t *bufsz, jl_task_t *owner) JL_NOTSAFEPO return stk; } +extern _Atomic(int) gc_ptls_sweep_idx; +extern _Atomic(int) gc_n_threads_sweeping; +extern _Atomic(int) gc_stack_free_idx; + void sweep_stack_pools(void) JL_NOTSAFEPOINT { // Stack sweeping algorithm: @@ -216,32 +220,38 @@ void sweep_stack_pools(void) JL_NOTSAFEPOINT // if (stkbuf) // push(free_stacks[sz], stkbuf) assert(gc_n_threads); - for (int i = 0; i < gc_n_threads; i++) { + jl_atomic_fetch_add(&gc_n_threads_sweeping, 1); + while (1) { + int i = jl_atomic_fetch_add_relaxed(&gc_ptls_sweep_idx, -1); + if (i < 0) + break; jl_ptls_t ptls2 = gc_all_tls_states[i]; if (ptls2 == NULL) continue; // free half of stacks that remain unused since last sweep - for (int p = 0; p < JL_N_STACK_POOLS; p++) { - small_arraylist_t *al = &ptls2->gc_tls.heap.free_stacks[p]; - size_t n_to_free; - if (jl_atomic_load_relaxed(&ptls2->current_task) == NULL) { - n_to_free = al->len; // not alive yet or dead, so it does not need these anymore - } - else if (al->len > MIN_STACK_MAPPINGS_PER_POOL) { - n_to_free = al->len / 2; - if (n_to_free > (al->len - MIN_STACK_MAPPINGS_PER_POOL)) - n_to_free = al->len - MIN_STACK_MAPPINGS_PER_POOL; - } - else { - n_to_free = 0; - } - for (int n = 0; n < n_to_free; n++) { - void *stk = small_arraylist_pop(al); - free_stack(stk, 
pool_sizes[p]); - } - if (jl_atomic_load_relaxed(&ptls2->current_task) == NULL) { - small_arraylist_free(al); + if (i == jl_atomic_load_relaxed(&gc_stack_free_idx)) { + for (int p = 0; p < JL_N_STACK_POOLS; p++) { + small_arraylist_t *al = &ptls2->gc_tls.heap.free_stacks[p]; + size_t n_to_free; + if (jl_atomic_load_relaxed(&ptls2->current_task) == NULL) { + n_to_free = al->len; // not alive yet or dead, so it does not need these anymore + } + else if (al->len > MIN_STACK_MAPPINGS_PER_POOL) { + n_to_free = al->len / 2; + if (n_to_free > (al->len - MIN_STACK_MAPPINGS_PER_POOL)) + n_to_free = al->len - MIN_STACK_MAPPINGS_PER_POOL; + } + else { + n_to_free = 0; + } + for (int n = 0; n < n_to_free; n++) { + void *stk = small_arraylist_pop(al); + free_stack(stk, pool_sizes[p]); + } + if (jl_atomic_load_relaxed(&ptls2->current_task) == NULL) { + small_arraylist_free(al); + } } } if (jl_atomic_load_relaxed(&ptls2->current_task) == NULL) { @@ -287,6 +297,7 @@ void sweep_stack_pools(void) JL_NOTSAFEPOINT } live_tasks->len -= ndel; } + jl_atomic_fetch_add(&gc_n_threads_sweeping, -1); } JL_DLLEXPORT jl_array_t *jl_live_tasks(void) diff --git a/src/gc-stock.c b/src/gc-stock.c index d25f8917f302d..ba0ef74515e8f 100644 --- a/src/gc-stock.c +++ b/src/gc-stock.c @@ -29,6 +29,10 @@ _Atomic(int) gc_n_threads_sweeping; _Atomic(jl_gc_padded_page_stack_t *) gc_allocd_scratch; // `tid` of mutator thread that triggered GC _Atomic(int) gc_master_tid; +// counter for sharing work when sweeping stacks +_Atomic(int) gc_ptls_sweep_idx; +// counter for round robin of giving back stack pages to the OS +_Atomic(int) gc_stack_free_idx = 0; // `tid` of first GC thread int gc_first_tid; // Mutex/cond used to synchronize wakeup of GC threads on parallel marking @@ -994,13 +998,35 @@ STATIC_INLINE void gc_sweep_pool_page(gc_page_profiler_serializer_t *s, jl_gc_pa // sweep over all memory that is being used and not in a pool static void gc_sweep_other(jl_ptls_t ptls, int sweep_full) JL_NOTSAFEPOINT { - 
sweep_stack_pools(); gc_sweep_foreign_objs(); sweep_malloced_memory(); sweep_big(ptls); jl_engine_sweep(gc_all_tls_states); } +// wake up all threads to sweep the stacks +void gc_sweep_wake_all_stacks(jl_ptls_t ptls) +{ + uv_mutex_lock(&gc_threads_lock); + int first = gc_first_parallel_collector_thread_id(); + int last = gc_last_parallel_collector_thread_id(); + for (int i = first; i <= last; i++) { + jl_ptls_t ptls2 = gc_all_tls_states[i]; + gc_check_ptls_of_parallel_collector_thread(ptls2); + jl_atomic_fetch_add(&ptls2->gc_tls.gc_sweeps_requested, 1); + } + uv_cond_broadcast(&gc_threads_cond); + uv_mutex_unlock(&gc_threads_lock); + return; +} + +void gc_sweep_wait_for_all_stacks(void) +{ + while ((jl_atomic_load_acquire(&gc_ptls_sweep_idx)>= 0 ) || jl_atomic_load_acquire(&gc_n_threads_sweeping) != 0) { + jl_cpu_pause(); + } +} + static void gc_pool_sync_nfree(jl_gc_pagemeta_t *pg, jl_taggedvalue_t *last) JL_NOTSAFEPOINT { assert(pg->fl_begin_offset != UINT16_MAX); @@ -1076,7 +1102,7 @@ int gc_sweep_prescan(jl_ptls_t ptls, jl_gc_padded_page_stack_t *new_gc_allocd_sc } // wake up all threads to sweep the pages -void gc_sweep_wake_all(jl_ptls_t ptls, jl_gc_padded_page_stack_t *new_gc_allocd_scratch) +void gc_sweep_wake_all_pages(jl_ptls_t ptls, jl_gc_padded_page_stack_t *new_gc_allocd_scratch) { int parallel_sweep_worthwhile = gc_sweep_prescan(ptls, new_gc_allocd_scratch); if (parallel_sweep_worthwhile && !page_profile_enabled) { @@ -1112,7 +1138,7 @@ void gc_sweep_wake_all(jl_ptls_t ptls, jl_gc_padded_page_stack_t *new_gc_allocd_ } // wait for all threads to finish sweeping -void gc_sweep_wait_for_all(void) +void gc_sweep_wait_for_all_pages(void) { jl_atomic_store(&gc_allocd_scratch, NULL); while (jl_atomic_load_acquire(&gc_n_threads_sweeping) != 0) { @@ -1258,9 +1284,9 @@ static void gc_sweep_pool(void) // the actual sweeping jl_gc_padded_page_stack_t *new_gc_allocd_scratch = (jl_gc_padded_page_stack_t *) calloc_s(n_threads * sizeof(jl_gc_padded_page_stack_t)); 
jl_ptls_t ptls = jl_current_task->ptls; - gc_sweep_wake_all(ptls, new_gc_allocd_scratch); + gc_sweep_wake_all_pages(ptls, new_gc_allocd_scratch); gc_sweep_pool_parallel(ptls); - gc_sweep_wait_for_all(); + gc_sweep_wait_for_all_pages(); // reset half-pages pointers for (int t_i = 0; t_i < n_threads; t_i++) { @@ -3069,6 +3095,16 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection) #endif current_sweep_full = sweep_full; sweep_weak_refs(); + // initialize ptls index for parallel sweeping of stack pools + int stack_free_idx = jl_atomic_load_relaxed(&gc_stack_free_idx); + if (stack_free_idx + 1 == gc_n_threads) + jl_atomic_store_relaxed(&gc_stack_free_idx, 0); + else + jl_atomic_store_relaxed(&gc_stack_free_idx, stack_free_idx + 1); + jl_atomic_store_release(&gc_ptls_sweep_idx, gc_n_threads - 1); // idx == gc_n_threads = release stacks to the OS so it's serial + gc_sweep_wake_all_stacks(ptls); + sweep_stack_pools(); + gc_sweep_wait_for_all_stacks(); gc_sweep_other(ptls, sweep_full); gc_scrub(); gc_verify_tags(); @@ -3511,6 +3547,7 @@ void jl_parallel_gc_threadfun(void *arg) if (may_sweep(ptls)) { assert(jl_atomic_load_relaxed(&ptls->gc_state) == JL_GC_PARALLEL_COLLECTOR_THREAD); gc_sweep_pool_parallel(ptls); + sweep_stack_pools(); jl_atomic_fetch_add(&ptls->gc_tls.gc_sweeps_requested, -1); } } From b7e838644d78b5cfc85e3b59ff55a2fb2f3436ce Mon Sep 17 00:00:00 2001 From: gbaraldi Date: Fri, 6 Sep 2024 13:43:31 -0300 Subject: [PATCH 2/7] Apply suggestions from code review --- src/Makefile | 1 + src/gc-stacks.c | 7 ++----- src/gc-stock.c | 35 ++++++++++++++++++++++++----------- src/gc-stock.h | 4 +++- src/gc-tls.h | 1 + 5 files changed, 31 insertions(+), 17 deletions(-) diff --git a/src/Makefile b/src/Makefile index 52e673aa6cc1a..7fcc2cfa2ffeb 100644 --- a/src/Makefile +++ b/src/Makefile @@ -318,6 +318,7 @@ $(BUILDDIR)/debuginfo.o $(BUILDDIR)/debuginfo.dbg.obj: $(addprefix $(SRCDIR)/,de $(BUILDDIR)/disasm.o $(BUILDDIR)/disasm.dbg.obj: 
$(SRCDIR)/debuginfo.h $(SRCDIR)/processor.h $(BUILDDIR)/gc-debug.o $(BUILDDIR)/gc-debug.dbg.obj: $(SRCDIR)/gc-common.h $(SRCDIR)/gc-stock.h $(BUILDDIR)/gc-pages.o $(BUILDDIR)/gc-pages.dbg.obj: $(SRCDIR)/gc-common.h $(SRCDIR)/gc-stock.h +$(BUILDDIR)/gc-stacks.o $(BUILDDIR)/gc-stacks.dbg.obj: $(SRCDIR)/gc-common.h $(SRCDIR)/gc-stock.h $(BUILDDIR)/gc-stock.o $(BUILDDIR)/gc.dbg.obj: $(SRCDIR)/gc-common.h $(SRCDIR)/gc-stock.h $(SRCDIR)/gc-heap-snapshot.h $(SRCDIR)/gc-alloc-profiler.h $(SRCDIR)/gc-page-profiler.h $(BUILDDIR)/gc-heap-snapshot.o $(BUILDDIR)/gc-heap-snapshot.dbg.obj: $(SRCDIR)/gc-heap-snapshot.h $(BUILDDIR)/gc-alloc-profiler.o $(BUILDDIR)/gc-alloc-profiler.dbg.obj: $(SRCDIR)/gc-alloc-profiler.h diff --git a/src/gc-stacks.c b/src/gc-stacks.c index 0d17eafbcfd1e..10441a8f02c2e 100644 --- a/src/gc-stacks.c +++ b/src/gc-stacks.c @@ -1,6 +1,7 @@ // This file is a part of Julia. License is MIT: https://julialang.org/license #include "gc-common.h" +#include "gc-stock.h" #include "threading.h" #ifndef _OS_WINDOWS_ # include @@ -202,11 +203,7 @@ JL_DLLEXPORT void *jl_malloc_stack(size_t *bufsz, jl_task_t *owner) JL_NOTSAFEPO return stk; } -extern _Atomic(int) gc_ptls_sweep_idx; -extern _Atomic(int) gc_n_threads_sweeping; -extern _Atomic(int) gc_stack_free_idx; - -void sweep_stack_pools(void) JL_NOTSAFEPOINT +void sweep_stack_pool_loop(void) JL_NOTSAFEPOINT { // Stack sweeping algorithm: // // deallocate stacks if we have too many sitting around unused diff --git a/src/gc-stock.c b/src/gc-stock.c index ba0ef74515e8f..00c936be5f601 100644 --- a/src/gc-stock.c +++ b/src/gc-stock.c @@ -1027,6 +1027,20 @@ void gc_sweep_wait_for_all_stacks(void) } } +void sweep_stack_pools(jl_ptls_t ptls) JL_NOTSAFEPOINT +{ + // initialize ptls index for parallel sweeping of stack pools + int stack_free_idx = jl_atomic_load_relaxed(&gc_stack_free_idx); + if (stack_free_idx + 1 == gc_n_threads) + jl_atomic_store_relaxed(&gc_stack_free_idx, 0); + else + 
jl_atomic_store_relaxed(&gc_stack_free_idx, stack_free_idx + 1); + jl_atomic_store_release(&gc_ptls_sweep_idx, gc_n_threads - 1); // sweepers claim ptls indices from gc_n_threads - 1 down to 0; only the ptls at gc_stack_free_idx returns stacks to the OS, so that part stays serial + gc_sweep_wake_all_stacks(ptls); + sweep_stack_pool_loop(); + gc_sweep_wait_for_all_stacks(); +} + static void gc_pool_sync_nfree(jl_gc_pagemeta_t *pg, jl_taggedvalue_t *last) JL_NOTSAFEPOINT { assert(pg->fl_begin_offset != UINT16_MAX); @@ -3095,16 +3109,7 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection) #endif current_sweep_full = sweep_full; sweep_weak_refs(); - // initialize ptls index for parallel sweeping of stack pools - int stack_free_idx = jl_atomic_load_relaxed(&gc_stack_free_idx); - if (stack_free_idx + 1 == gc_n_threads) - jl_atomic_store_relaxed(&gc_stack_free_idx, 0); - else - jl_atomic_store_relaxed(&gc_stack_free_idx, stack_free_idx + 1); - jl_atomic_store_release(&gc_ptls_sweep_idx, gc_n_threads - 1); // idx == gc_n_threads = release stacks to the OS so it's serial - gc_sweep_wake_all_stacks(ptls); - sweep_stack_pools(); - gc_sweep_wait_for_all_stacks(); + sweep_stack_pools(ptls); gc_sweep_other(ptls, sweep_full); gc_scrub(); gc_verify_tags(); @@ -3516,6 +3521,10 @@ STATIC_INLINE int may_sweep(jl_ptls_t ptls) JL_NOTSAFEPOINT return (jl_atomic_load(&ptls->gc_tls.gc_sweeps_requested) > 0); } +STATIC_INLINE int may_sweep_stack(jl_ptls_t ptls) JL_NOTSAFEPOINT +{ + return (jl_atomic_load(&ptls->gc_tls.gc_stack_sweep_requested) > 0); +} // parallel gc thread function void jl_parallel_gc_threadfun(void *arg) { @@ -3544,10 +3553,14 @@ void jl_parallel_gc_threadfun(void *arg) uv_mutex_unlock(&gc_threads_lock); assert(jl_atomic_load_relaxed(&ptls->gc_state) == JL_GC_PARALLEL_COLLECTOR_THREAD); gc_mark_loop_parallel(ptls, 0); + if (may_sweep_stack(ptls)) { + assert(jl_atomic_load_relaxed(&ptls->gc_state) == JL_GC_PARALLEL_COLLECTOR_THREAD); + sweep_stack_pool_loop(); + jl_atomic_fetch_add(&ptls->gc_tls.gc_stack_sweep_requested, -1); + } if 
(may_sweep(ptls)) { assert(jl_atomic_load_relaxed(&ptls->gc_state) == JL_GC_PARALLEL_COLLECTOR_THREAD); gc_sweep_pool_parallel(ptls); - sweep_stack_pools(); jl_atomic_fetch_add(&ptls->gc_tls.gc_sweeps_requested, -1); } } diff --git a/src/gc-stock.h b/src/gc-stock.h index 45c93bf4289ae..afed1d99c8d91 100644 --- a/src/gc-stock.h +++ b/src/gc-stock.h @@ -511,6 +511,8 @@ extern uv_cond_t gc_threads_cond; extern uv_sem_t gc_sweep_assists_needed; extern _Atomic(int) gc_n_threads_marking; extern _Atomic(int) gc_n_threads_sweeping; +extern _Atomic(int) gc_ptls_sweep_idx; +extern _Atomic(int) gc_stack_free_idx; extern _Atomic(int) n_threads_running; extern uv_barrier_t thread_init_done; void gc_mark_queue_all_roots(jl_ptls_t ptls, jl_gc_markqueue_t *mq); @@ -521,7 +523,7 @@ void gc_mark_loop_serial(jl_ptls_t ptls); void gc_mark_loop_parallel(jl_ptls_t ptls, int master); void gc_sweep_pool_parallel(jl_ptls_t ptls); void gc_free_pages(void); -void sweep_stack_pools(void) JL_NOTSAFEPOINT; +void sweep_stack_pool_loop(void) JL_NOTSAFEPOINT; void jl_gc_debug_init(void); // GC pages diff --git a/src/gc-tls.h b/src/gc-tls.h index 9e4b09404db84..183016cd91515 100644 --- a/src/gc-tls.h +++ b/src/gc-tls.h @@ -82,6 +82,7 @@ typedef struct { jl_gc_markqueue_t mark_queue; jl_gc_mark_cache_t gc_cache; _Atomic(size_t) gc_sweeps_requested; + _Atomic(uint8_t) gc_stack_sweep_requested; arraylist_t sweep_objs; } jl_gc_tls_states_t; From ebb39c1c6389bacd859030812046a73af42b4788 Mon Sep 17 00:00:00 2001 From: gbaraldi Date: Fri, 6 Sep 2024 16:19:16 -0300 Subject: [PATCH 3/7] Make analyzegc happier --- src/gc-stock.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/gc-stock.c b/src/gc-stock.c index 00c936be5f601..90ded41bd7eba 100644 --- a/src/gc-stock.c +++ b/src/gc-stock.c @@ -1005,7 +1005,7 @@ static void gc_sweep_other(jl_ptls_t ptls, int sweep_full) JL_NOTSAFEPOINT } // wake up all threads to sweep the stacks -void gc_sweep_wake_all_stacks(jl_ptls_t ptls) +void 
gc_sweep_wake_all_stacks(jl_ptls_t ptls) JL_NOTSAFEPOINT { uv_mutex_lock(&gc_threads_lock); int first = gc_first_parallel_collector_thread_id(); @@ -1020,7 +1020,7 @@ void gc_sweep_wake_all_stacks(jl_ptls_t ptls) return; } -void gc_sweep_wait_for_all_stacks(void) +void gc_sweep_wait_for_all_stacks(void) JL_NOTSAFEPOINT { while ((jl_atomic_load_acquire(&gc_ptls_sweep_idx)>= 0 ) || jl_atomic_load_acquire(&gc_n_threads_sweeping) != 0) { jl_cpu_pause(); From 31b5c0b3d487126b8fa3680c9fba9fc4ea154420 Mon Sep 17 00:00:00 2001 From: Gabriel Baraldi Date: Mon, 9 Sep 2024 12:18:24 -0300 Subject: [PATCH 4/7] Address suggestions from code review --- src/gc-stock.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/gc-stock.c b/src/gc-stock.c index 90ded41bd7eba..b7e114819d0c9 100644 --- a/src/gc-stock.c +++ b/src/gc-stock.c @@ -1013,7 +1013,7 @@ void gc_sweep_wake_all_stacks(jl_ptls_t ptls) JL_NOTSAFEPOINT for (int i = first; i <= last; i++) { jl_ptls_t ptls2 = gc_all_tls_states[i]; gc_check_ptls_of_parallel_collector_thread(ptls2); - jl_atomic_fetch_add(&ptls2->gc_tls.gc_sweeps_requested, 1); + jl_atomic_fetch_add(&ptls2->gc_tls.gc_stack_sweep_requested, 1); } uv_cond_broadcast(&gc_threads_cond); uv_mutex_unlock(&gc_threads_lock); @@ -3547,7 +3547,7 @@ void jl_parallel_gc_threadfun(void *arg) while (1) { uv_mutex_lock(&gc_threads_lock); - while (!may_mark() && !may_sweep(ptls)) { + while (!may_mark() && !may_sweep(ptls) && !may_sweep_stack(ptls)) { uv_cond_wait(&gc_threads_cond, &gc_threads_lock); } uv_mutex_unlock(&gc_threads_lock); From a6d03919239f7e078a660d45898fa21e00629452 Mon Sep 17 00:00:00 2001 From: Gabriel Baraldi Date: Tue, 10 Sep 2024 11:19:31 -0300 Subject: [PATCH 5/7] Move assertion to correct place. 
--- src/gc-stacks.c | 1 - src/gc-stock.c | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gc-stacks.c b/src/gc-stacks.c index 10441a8f02c2e..9ae5b8d83519c 100644 --- a/src/gc-stacks.c +++ b/src/gc-stacks.c @@ -216,7 +216,6 @@ void sweep_stack_pool_loop(void) JL_NOTSAFEPOINT // bufsz = t->bufsz // if (stkbuf) // push(free_stacks[sz], stkbuf) - assert(gc_n_threads); jl_atomic_fetch_add(&gc_n_threads_sweeping, 1); while (1) { int i = jl_atomic_fetch_add_relaxed(&gc_ptls_sweep_idx, -1); diff --git a/src/gc-stock.c b/src/gc-stock.c index b7e114819d0c9..2d9557661f552 100644 --- a/src/gc-stock.c +++ b/src/gc-stock.c @@ -1030,6 +1030,7 @@ void gc_sweep_wait_for_all_stacks(void) JL_NOTSAFEPOINT void sweep_stack_pools(jl_ptls_t ptls) JL_NOTSAFEPOINT { // initialize ptls index for parallel sweeping of stack pools + assert(gc_n_threads); int stack_free_idx = jl_atomic_load_relaxed(&gc_stack_free_idx); if (stack_free_idx + 1 == gc_n_threads) jl_atomic_store_relaxed(&gc_stack_free_idx, 0); From ad14f6d9043595c20b20696b09310eb3f7590c0e Mon Sep 17 00:00:00 2001 From: gbaraldi Date: Wed, 11 Sep 2024 16:06:29 -0300 Subject: [PATCH 6/7] Address suggestions from code review --- src/gc-stacks.c | 6 +++--- src/gc-stock.c | 12 +++++++----- src/gc-stock.h | 3 ++- src/gc-tls.h | 2 +- 4 files changed, 13 insertions(+), 10 deletions(-) diff --git a/src/gc-stacks.c b/src/gc-stacks.c index 9ae5b8d83519c..f6e787a4c1d2d 100644 --- a/src/gc-stacks.c +++ b/src/gc-stacks.c @@ -216,7 +216,7 @@ void sweep_stack_pool_loop(void) JL_NOTSAFEPOINT // bufsz = t->bufsz // if (stkbuf) // push(free_stacks[sz], stkbuf) - jl_atomic_fetch_add(&gc_n_threads_sweeping, 1); + jl_atomic_fetch_add(&gc_n_threads_sweeping_stacks, 1); while (1) { int i = jl_atomic_fetch_add_relaxed(&gc_ptls_sweep_idx, -1); if (i < 0) @@ -224,7 +224,7 @@ void sweep_stack_pool_loop(void) JL_NOTSAFEPOINT jl_ptls_t ptls2 = gc_all_tls_states[i]; if (ptls2 == NULL) continue; - + assert(gc_n_threads); // free half of stacks 
that remain unused since last sweep if (i == jl_atomic_load_relaxed(&gc_stack_free_idx)) { for (int p = 0; p < JL_N_STACK_POOLS; p++) { @@ -293,7 +293,7 @@ void sweep_stack_pool_loop(void) JL_NOTSAFEPOINT } live_tasks->len -= ndel; } - jl_atomic_fetch_add(&gc_n_threads_sweeping, -1); + jl_atomic_fetch_add(&gc_n_threads_sweeping_stacks, -1); } JL_DLLEXPORT jl_array_t *jl_live_tasks(void) diff --git a/src/gc-stock.c b/src/gc-stock.c index 2d9557661f552..b31b093f1c446 100644 --- a/src/gc-stock.c +++ b/src/gc-stock.c @@ -24,7 +24,9 @@ int jl_n_sweepthreads; // Number of threads currently running the GC mark-loop _Atomic(int) gc_n_threads_marking; // Number of threads sweeping -_Atomic(int) gc_n_threads_sweeping; +_Atomic(int) gc_n_threads_sweeping_pools; +// Number of threads sweeping stacks +_Atomic(int) gc_n_threads_sweeping_stacks; // Temporary for the `ptls->gc_tls.page_metadata_allocd` used during parallel sweeping (padded to avoid false sharing) _Atomic(jl_gc_padded_page_stack_t *) gc_allocd_scratch; // `tid` of mutator thread that triggered GC @@ -1022,7 +1024,7 @@ void gc_sweep_wake_all_stacks(jl_ptls_t ptls) JL_NOTSAFEPOINT void gc_sweep_wait_for_all_stacks(void) JL_NOTSAFEPOINT { - while ((jl_atomic_load_acquire(&gc_ptls_sweep_idx)>= 0 ) || jl_atomic_load_acquire(&gc_n_threads_sweeping) != 0) { + while ((jl_atomic_load_acquire(&gc_ptls_sweep_idx) >= 0 ) || jl_atomic_load_acquire(&gc_n_threads_sweeping_stacks) != 0) { jl_cpu_pause(); } } @@ -1156,7 +1158,7 @@ void gc_sweep_wake_all_pages(jl_ptls_t ptls, jl_gc_padded_page_stack_t *new_gc_a void gc_sweep_wait_for_all_pages(void) { jl_atomic_store(&gc_allocd_scratch, NULL); - while (jl_atomic_load_acquire(&gc_n_threads_sweeping) != 0) { + while (jl_atomic_load_acquire(&gc_n_threads_sweeping_pools) != 0) { jl_cpu_pause(); } } @@ -1164,7 +1166,7 @@ void gc_sweep_wait_for_all_pages(void) // sweep all pools void gc_sweep_pool_parallel(jl_ptls_t ptls) { - jl_atomic_fetch_add(&gc_n_threads_sweeping, 1); + 
jl_atomic_fetch_add(&gc_n_threads_sweeping_pools, 1); jl_gc_padded_page_stack_t *allocd_scratch = jl_atomic_load(&gc_allocd_scratch); if (allocd_scratch != NULL) { gc_page_profiler_serializer_t serializer = gc_page_serializer_create(); @@ -1209,7 +1211,7 @@ void gc_sweep_pool_parallel(jl_ptls_t ptls) } gc_page_serializer_destroy(&serializer); } - jl_atomic_fetch_add(&gc_n_threads_sweeping, -1); + jl_atomic_fetch_add(&gc_n_threads_sweeping_pools, -1); } // free all pages (i.e. through `madvise` on Linux) that were lazily freed diff --git a/src/gc-stock.h b/src/gc-stock.h index afed1d99c8d91..86fa58fa28440 100644 --- a/src/gc-stock.h +++ b/src/gc-stock.h @@ -510,7 +510,8 @@ extern uv_mutex_t gc_threads_lock; extern uv_cond_t gc_threads_cond; extern uv_sem_t gc_sweep_assists_needed; extern _Atomic(int) gc_n_threads_marking; -extern _Atomic(int) gc_n_threads_sweeping; +extern _Atomic(int) gc_n_threads_sweeping_pools; +extern _Atomic(int) gc_n_threads_sweeping_stacks; extern _Atomic(int) gc_ptls_sweep_idx; extern _Atomic(int) gc_stack_free_idx; extern _Atomic(int) n_threads_running; diff --git a/src/gc-tls.h b/src/gc-tls.h index 183016cd91515..3c2cc029a6183 100644 --- a/src/gc-tls.h +++ b/src/gc-tls.h @@ -82,7 +82,7 @@ typedef struct { jl_gc_markqueue_t mark_queue; jl_gc_mark_cache_t gc_cache; _Atomic(size_t) gc_sweeps_requested; - _Atomic(uint8_t) gc_stack_sweep_requested; + _Atomic(size_t) gc_stack_sweep_requested; arraylist_t sweep_objs; } jl_gc_tls_states_t; From 065d982cf88fc14082f0bdd3f5b9057eab246c6f Mon Sep 17 00:00:00 2001 From: Gabriel Baraldi Date: Wed, 25 Sep 2024 10:36:37 -0300 Subject: [PATCH 7/7] Add statistic for sweeping of stack pools --- base/timing.jl | 2 ++ src/gc-interface.h | 2 ++ src/gc-stock.c | 4 ++++ 3 files changed, 8 insertions(+) diff --git a/base/timing.jl b/base/timing.jl index 80ebb74abee26..b9950b576a200 100644 --- a/base/timing.jl +++ b/base/timing.jl @@ -22,8 +22,10 @@ struct GC_Num total_time_to_safepoint ::Int64 sweep_time ::Int64 
mark_time ::Int64 + stack_pool_sweep_time ::Int64 total_sweep_time ::Int64 total_mark_time ::Int64 + total_stack_pool_sweep_time::Int64 last_full_sweep ::Int64 last_incremental_sweep ::Int64 end diff --git a/src/gc-interface.h b/src/gc-interface.h index e543b4b5879f1..bb2abbe2d36ac 100644 --- a/src/gc-interface.h +++ b/src/gc-interface.h @@ -44,8 +44,10 @@ typedef struct { uint64_t total_time_to_safepoint; uint64_t sweep_time; uint64_t mark_time; + uint64_t stack_pool_sweep_time; uint64_t total_sweep_time; uint64_t total_mark_time; + uint64_t total_stack_pool_sweep_time; uint64_t last_full_sweep; uint64_t last_incremental_sweep; } jl_gc_num_t; diff --git a/src/gc-stock.c b/src/gc-stock.c index b31b093f1c446..64758bc2fde8a 100644 --- a/src/gc-stock.c +++ b/src/gc-stock.c @@ -3112,7 +3112,11 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection) #endif current_sweep_full = sweep_full; sweep_weak_refs(); + uint64_t stack_pool_time = jl_hrtime(); sweep_stack_pools(ptls); + stack_pool_time = jl_hrtime() - stack_pool_time; + gc_num.total_stack_pool_sweep_time += stack_pool_time; + gc_num.stack_pool_sweep_time = stack_pool_time; gc_sweep_other(ptls, sweep_full); gc_scrub(); gc_verify_tags();