From 3d458df4382f62163264951ba44308a3b557b277 Mon Sep 17 00:00:00 2001 From: Diogo Netto <61364108+d-netto@users.noreply.github.com> Date: Mon, 30 Sep 2024 15:30:52 -0300 Subject: [PATCH] expose metric to report reasons why full GCs were triggered (#55826) (#189) Additional GC observability tool. This will help us to diagnose why some of our servers are triggering so many full GCs in certain circumstances. --- base/timing.jl | 33 +++++++++++++++++++++++++++++++++ src/gc.c | 7 +++++++ src/gc.h | 15 +++++++++++++++ src/threading.c | 2 +- test/gc.jl | 11 +++++++++++ 5 files changed, 67 insertions(+), 1 deletion(-) diff --git a/base/timing.jl b/base/timing.jl index 73a3c5dc7d5e5..64d2a0a6f054e 100644 --- a/base/timing.jl +++ b/base/timing.jl @@ -107,6 +107,39 @@ function gc_page_utilization_data() return Base.unsafe_wrap(Array, page_utilization_raw, JL_GC_N_MAX_POOLS, own=false) end +# must be kept in sync with `src/gc.h`` +const FULL_SWEEP_REASONS = [:FULL_SWEEP_REASON_SWEEP_ALWAYS_FULL, :FULL_SWEEP_REASON_FORCED_FULL_SWEEP, + :FULL_SWEEP_REASON_ALLOCATION_INTERVAL_ABOVE_MAXMEM, :FULL_SWEEP_REASON_LIVE_BYTES_ABOVE_MAX_TOTAL_MEMORY, + :FULL_SWEEP_REASON_LARGE_INTERGEN_FRONTIER] + +""" + Base.full_sweep_reasons() + +Return a dictionary of the number of times each full sweep reason has occurred. + +The reasons are: +- `:FULL_SWEEP_REASON_SWEEP_ALWAYS_FULL`: Full sweep was caused due to `always_full` being set in the GC debug environment +- `:FULL_SWEEP_REASON_FORCED_FULL_SWEEP`: Full sweep was forced by `GC.gc(true)` +- `:FULL_SWEEP_REASON_ALLOCATION_INTERVAL_ABOVE_MAXMEM`: Full sweep was forced by the allocation interval being above the total + memory in the machine (as returned by LibUV) divided by the number of mutator threads +- `:FULL_SWEEP_REASON_LIVE_BYTES_ABOVE_MAX_TOTAL_MEMORY`: Full sweep was caused due to live bytes being above the + soft heap limit size (which is either automatically computed at initialization based on the total memory provided by LibUV, + or set by the user via `--heap-size-hint`) +- `:FULL_SWEEP_REASON_LARGE_INTERGEN_FRONTIER`: Full sweep was forced by the intergenerational frontier being too large + (i.e. too many pointers in the remembered set) + +Note that the set of reasons is not guaranteed to be stable across minor versions of Julia. +""" +function full_sweep_reasons() + reason = cglobal(:jl_full_sweep_reasons, UInt64) + reasons_as_array = Base.unsafe_wrap(Vector{UInt64}, reason, length(FULL_SWEEP_REASONS), own=false) + d = Dict{Symbol, Int64}() + for (i, r) in enumerate(FULL_SWEEP_REASONS) + d[r] = reasons_as_array[i] + end + return d +end + """ Base.jit_total_bytes() diff --git a/src/gc.c b/src/gc.c index ccafc05e02621..334bd8f83ada7 100644 --- a/src/gc.c +++ b/src/gc.c @@ -37,6 +37,8 @@ uv_sem_t gc_sweep_assists_needed; uv_mutex_t gc_queue_observer_lock; // Tag for sentinel nodes in bigval list uintptr_t gc_bigval_sentinel_tag; +// Table recording number of full GCs due to each reason +JL_DLLEXPORT uint64_t jl_full_sweep_reasons[FULL_SWEEP_NUM_REASONS]; // Linked list of callback functions @@ -3551,6 +3553,7 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection) if (large_frontier) { sweep_full = 1; gc_num.interval = last_long_collect_interval; + gc_count_full_sweep_reason(FULL_SWEEP_REASON_LARGE_INTERGEN_FRONTIER); } if (not_freed_enough || large_frontier) { gc_num.interval = gc_num.interval * 2; @@ -3566,6 +3569,7 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection) if (gc_num.interval > maxmem) { sweep_full = 1; gc_num.interval = maxmem; + gc_count_full_sweep_reason(FULL_SWEEP_REASON_ALLOCATION_INTERVAL_ABOVE_MAXMEM); } } @@ -3574,13 +3578,16 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection) if (live_bytes > max_total_memory) { under_memory_pressure = 1; sweep_full = 1; + gc_count_full_sweep_reason(FULL_SWEEP_REASON_LIVE_BYTES_ABOVE_MAX_TOTAL_MEMORY); } if (gc_sweep_always_full) { sweep_full = 1; + gc_count_full_sweep_reason(FULL_SWEEP_REASON_SWEEP_ALWAYS_FULL); } if (collection == JL_GC_FULL && !prev_sweep_full) { sweep_full = 1; recollect = 1; + gc_count_full_sweep_reason(FULL_SWEEP_REASON_FORCED_FULL_SWEEP); } if (sweep_full) { // these are the difference between the number of gc-perm bytes scanned diff --git a/src/gc.h b/src/gc.h index b06deec9d7238..61c7c835db34a 100644 --- a/src/gc.h +++ b/src/gc.h @@ -560,6 +560,21 @@ FORCE_INLINE void gc_big_object_link(bigval_t *sentinel_node, bigval_t *node) JL sentinel_node->next = node; } +// Must be kept in sync with `base/timing.jl` +#define FULL_SWEEP_REASON_SWEEP_ALWAYS_FULL (0) +#define FULL_SWEEP_REASON_FORCED_FULL_SWEEP (1) +#define FULL_SWEEP_REASON_ALLOCATION_INTERVAL_ABOVE_MAXMEM (2) +#define FULL_SWEEP_REASON_LIVE_BYTES_ABOVE_MAX_TOTAL_MEMORY (3) +#define FULL_SWEEP_REASON_LARGE_INTERGEN_FRONTIER (4) +#define FULL_SWEEP_NUM_REASONS (5) + +extern JL_DLLEXPORT uint64_t jl_full_sweep_reasons[FULL_SWEEP_NUM_REASONS]; +STATIC_INLINE void gc_count_full_sweep_reason(int reason) JL_NOTSAFEPOINT +{ + assert(reason >= 0 && reason < FULL_SWEEP_NUM_REASONS); + jl_full_sweep_reasons[reason]++; +} + extern uv_mutex_t gc_threads_lock; extern uv_cond_t gc_threads_cond; extern uv_sem_t gc_sweep_assists_needed; diff --git a/src/threading.c b/src/threading.c index d89e8d09745fc..d0f25de374462 100644 --- a/src/threading.c +++ b/src/threading.c @@ -1035,7 +1035,7 @@ JL_DLLEXPORT int jl_heartbeat_resume(void) if (uv_sem_trywait(&heartbeat_off_sem) != 0) { return -1; } - + // reset state as we've been paused n_hbs_missed = 0; n_hbs_recvd = 0; diff --git a/test/gc.jl b/test/gc.jl index 3b8677568df45..a07abd48c0152 100644 --- a/test/gc.jl +++ b/test/gc.jl @@ -28,6 +28,13 @@ function run_pg_size_test() @test page_size == (1 << 12) || page_size == (1 << 14) end +function full_sweep_reasons_test() + GC.gc() + reasons = Base.full_sweep_reasons() + @test reasons[:FULL_SWEEP_REASON_FORCED_FULL_SWEEP] >= 1 + @test keys(reasons) == Set(Base.FULL_SWEEP_REASONS) +end + # !!! note: # Since we run our tests on 32bit OS as well we confine ourselves # to parameters that allocate about 512MB of objects. Max RSS is lower @@ -43,3 +50,7 @@ end run_nonzero_page_utilization_test() run_pg_size_test() end + +@testset "Full GC reasons" begin + full_sweep_reasons_test() +end