From 6046c7da0b7951f8324f3dd9247d798b318df05b Mon Sep 17 00:00:00 2001 From: d-netto Date: Thu, 7 Dec 2023 09:26:48 -0300 Subject: [PATCH] WIP: create separate object pools for long-lived types --- src/gc.c | 155 ++++++++++++++++++++++++++++++++++++------- src/jlapi.c | 4 ++ src/julia_internal.h | 14 +++- src/julia_threads.h | 8 ++- 4 files changed, 155 insertions(+), 26 deletions(-) diff --git a/src/gc.c b/src/gc.c index ee0f9828e5a1f..35ff41e6c452e 100644 --- a/src/gc.c +++ b/src/gc.c @@ -24,6 +24,7 @@ int gc_first_tid; // Mutex/cond used to synchronize sleep/wakeup of GC threads uv_mutex_t gc_threads_lock; uv_cond_t gc_threads_cond; +FILE *page_profiling_file; // Linked list of callback functions @@ -1444,6 +1445,19 @@ JL_DLLEXPORT jl_value_t *jl_gc_pool_alloc(jl_ptls_t ptls, int pool_offset, JL_DLLEXPORT jl_value_t *jl_gc_pool_alloc_instrumented(jl_ptls_t ptls, int pool_offset, int osize, jl_value_t* type) { + jl_datatype_t *ty = (jl_datatype_t*)type; + if (ty == jl_simplevector_type) { + pool_offset += JL_GC_N_MAX_POOLS; + } + else if (ty == jl_code_instance_type) { + pool_offset = 2 * JL_GC_N_MAX_POOLS; + } + else if (ty == jl_method_instance_type) { + pool_offset = 2 * JL_GC_N_MAX_POOLS + 1; + } + else if (ty == jl_typemap_entry_type) { + pool_offset = 2 * JL_GC_N_MAX_POOLS + 2; + } jl_value_t *val = jl_gc_pool_alloc_inner(ptls, pool_offset, osize); maybe_record_alloc_to_profile(val, osize, (jl_datatype_t*)type); return val; @@ -1473,25 +1487,71 @@ JL_DLLEXPORT double jl_gc_page_utilization_stats[JL_GC_N_MAX_POOLS]; STATIC_INLINE void gc_update_page_fragmentation_data(jl_gc_pagemeta_t *pg) JL_NOTSAFEPOINT { - gc_fragmentation_stat_t *stats = &gc_page_fragmentation_stats[pg->pool_n]; - jl_atomic_fetch_add(&stats->n_freed_objs, pg->nfree); - jl_atomic_fetch_add(&stats->n_pages_allocd, 1); + // gc_fragmentation_stat_t *stats = &gc_page_fragmentation_stats[pg->pool_n]; + // jl_atomic_fetch_add(&stats->n_freed_objs, pg->nfree); + // jl_atomic_fetch_add(&stats->n_pages_allocd, 1); } STATIC_INLINE void gc_dump_page_utilization_data(void) JL_NOTSAFEPOINT { - for (int i = 0; i < JL_GC_N_POOLS; i++) { - gc_fragmentation_stat_t *stats = &gc_page_fragmentation_stats[i]; - double utilization = 1.0; - size_t n_freed_objs = jl_atomic_load_relaxed(&stats->n_freed_objs); - size_t n_pages_allocd = jl_atomic_load_relaxed(&stats->n_pages_allocd); - if (n_pages_allocd != 0) { - utilization -= ((double)n_freed_objs * (double)jl_gc_sizeclasses[i]) / (double)n_pages_allocd / (double)GC_PAGE_SZ; - } - jl_gc_page_utilization_stats[i] = utilization; - jl_atomic_store_relaxed(&stats->n_freed_objs, 0); - jl_atomic_store_relaxed(&stats->n_pages_allocd, 0); + // for (int i = 0; i < JL_GC_N_POOLS; i++) { + // gc_fragmentation_stat_t *stats = &gc_page_fragmentation_stats[i]; + // double utilization = 1.0; + // size_t n_freed_objs = jl_atomic_load_relaxed(&stats->n_freed_objs); + // size_t n_pages_allocd = jl_atomic_load_relaxed(&stats->n_pages_allocd); + // if (n_pages_allocd != 0) { + // utilization -= ((double)n_freed_objs * (double)jl_gc_sizeclasses[i]) / (double)n_pages_allocd / (double)GC_PAGE_SZ; + // } + // jl_gc_page_utilization_stats[i] = utilization; + // jl_atomic_store_relaxed(&stats->n_freed_objs, 0); + // jl_atomic_store_relaxed(&stats->n_pages_allocd, 0); + // } +} + +// #define GC_HEAP_DUMP + +int heap_dump_enabled; + +void gc_enable_heap_dump(void) +{ + heap_dump_enabled = 1; +} + +static void gc_heap_dump_write_preamble(char *data, int osize) JL_NOTSAFEPOINT +{ +#ifdef GC_HEAP_DUMP + if (heap_dump_enabled) { + fprintf(page_profiling_file, "{data: %p, osize: %d}\n", data, osize); } +#endif +} + +static void gc_heap_dump_write_empty_page(void) JL_NOTSAFEPOINT +{ +#ifdef GC_HEAP_DUMP + if (heap_dump_enabled) { + fprintf(page_profiling_file, "[empty]\n"); + } +#endif +} + +static void gc_heap_dump_write_garbage(void) JL_NOTSAFEPOINT +{ +#ifdef GC_HEAP_DUMP + if (heap_dump_enabled) { + fprintf(page_profiling_file, "[garbage]\n"); + } +#endif +} + +static void gc_heap_dump_write_live_obj(jl_taggedvalue_t *v) JL_NOTSAFEPOINT +{ +#ifdef GC_HEAP_DUMP + if (heap_dump_enabled) { + const char *name = jl_typeof_str(jl_valueof(v)); + fprintf(page_profiling_file, "[%s]\n", name); + } +#endif } int64_t buffered_pages = 0; @@ -1509,6 +1569,7 @@ static void gc_sweep_page(jl_gc_pool_t *p, jl_gc_page_stack_t *allocd, jl_gc_pag } size_t old_nfree = pg->nfree; size_t nfree; + gc_heap_dump_write_preamble(data, osize); int re_use_page = 1; int keep_as_local_buffer = 0; @@ -1526,6 +1587,7 @@ static void gc_sweep_page(jl_gc_pool_t *p, jl_gc_page_stack_t *allocd, jl_gc_pag keep_as_local_buffer = 1; } nfree = (GC_PAGE_SZ - GC_PAGE_OFFSET) / osize; + gc_heap_dump_write_empty_page(); goto done; } // For quick sweep, we might be able to skip the page if the page doesn't @@ -1535,6 +1597,7 @@ static void gc_sweep_page(jl_gc_pool_t *p, jl_gc_page_stack_t *allocd, jl_gc_pag if (!prev_sweep_full || pg->prev_nold == pg->nold) { freedall = 0; nfree = pg->nfree; + gc_heap_dump_write_empty_page(); goto done; } } @@ -1552,18 +1615,22 @@ static void gc_sweep_page(jl_gc_pool_t *p, jl_gc_page_stack_t *allocd, jl_gc_pag int bits = v->bits.gc; // if an object is past `lim_newpages` then we can guarantee it's garbage if (!gc_marked(bits) || (char*)v >= lim_newpages) { + gc_heap_dump_write_garbage(); *pfl = v; pfl = &v->next; pfl_begin = (pfl_begin != NULL) ? pfl_begin : pfl; pg_nfree++; + gc_heap_dump_write_garbage(); } else { // marked young or old + gc_heap_dump_write_live_obj(v); if (current_sweep_full || bits == GC_MARKED) { // old enough bits = v->bits.gc = GC_OLD; // promote } prev_nold++; has_marked |= gc_marked(bits); freedall = 0; + gc_heap_dump_write_live_obj(v); } v = (jl_taggedvalue_t*)((char*)v + osize); } @@ -1704,6 +1771,7 @@ void gc_free_pages(void) // setup the data-structures for a sweep over all memory pools static void gc_sweep_pool(void) { + // jl_safe_printf("GC sweep pools...\n"); gc_time_pool_start(); buffered_pages = 0; @@ -1713,20 +1781,21 @@ static void gc_sweep_pool(void) // allocate enough space to hold the end of the free list chain // for every thread and pool size - jl_taggedvalue_t ***pfl = (jl_taggedvalue_t ***) alloca(n_threads * JL_GC_N_POOLS * sizeof(jl_taggedvalue_t**)); + const int jl_gc_n_tot_pools = 2 * JL_GC_N_MAX_POOLS + 3; + jl_taggedvalue_t ***pfl = (jl_taggedvalue_t ***) alloca(n_threads * jl_gc_n_tot_pools * sizeof(jl_taggedvalue_t**)); // update metadata of pages that were pointed to by freelist or newpages from a pool // i.e. pages being the current allocation target for (int t_i = 0; t_i < n_threads; t_i++) { jl_ptls_t ptls2 = gc_all_tls_states[t_i]; if (ptls2 == NULL) { - for (int i = 0; i < JL_GC_N_POOLS; i++) { - pfl[t_i * JL_GC_N_POOLS + i] = NULL; + for (int i = 0; i < jl_gc_n_tot_pools; i++) { + pfl[t_i * jl_gc_n_tot_pools + i] = NULL; } continue; } jl_atomic_store_relaxed(&ptls2->gc_num.pool_live_bytes, 0); - for (int i = 0; i < JL_GC_N_POOLS; i++) { + for (int i = 0; i < jl_gc_n_tot_pools; i++) { jl_gc_pool_t *p = &ptls2->heap.norm_pools[i]; jl_taggedvalue_t *last = p->freelist; if (last != NULL) { @@ -1735,7 +1804,7 @@ static void gc_sweep_pool(void) pg->has_young = 1; } p->freelist = NULL; - pfl[t_i * JL_GC_N_POOLS + i] = &p->freelist; + pfl[t_i * jl_gc_n_tot_pools + i] = &p->freelist; last = p->newpages; if (last != NULL) { @@ -1755,6 +1824,7 @@ static void gc_sweep_pool(void) } // the actual sweeping + // jl_safe_printf("GC sweep pages... the actual sweeping\n"); jl_gc_page_stack_t *tmp = (jl_gc_page_stack_t *)alloca(n_threads * sizeof(jl_gc_page_stack_t)); memset(tmp, 0, n_threads * sizeof(jl_gc_page_stack_t)); jl_atomic_store(&gc_allocd_scratch, tmp); @@ -1766,7 +1836,7 @@ static void gc_sweep_pool(void) jl_ptls_t ptls2 = gc_all_tls_states[t_i]; if (ptls2 != NULL) { ptls2->page_metadata_allocd = tmp[t_i]; - for (int i = 0; i < JL_GC_N_POOLS; i++) { + for (int i = 0; i < jl_gc_n_tot_pools; i++) { jl_gc_pool_t *p = &ptls2->heap.norm_pools[i]; p->newpages = NULL; } @@ -1774,6 +1844,7 @@ static void gc_sweep_pool(void) } // merge free lists + // jl_safe_printf("GC sweep pages... merge free lists\n"); for (int t_i = 0; t_i < n_threads; t_i++) { jl_ptls_t ptls2 = gc_all_tls_states[t_i]; if (ptls2 == NULL) { @@ -1786,19 +1857,20 @@ static void gc_sweep_pool(void) char *cur_pg = pg->data; jl_taggedvalue_t *fl_beg = (jl_taggedvalue_t*)(cur_pg + pg->fl_begin_offset); jl_taggedvalue_t *fl_end = (jl_taggedvalue_t*)(cur_pg + pg->fl_end_offset); - *pfl[t_i * JL_GC_N_POOLS + pg->pool_n] = fl_beg; - pfl[t_i * JL_GC_N_POOLS + pg->pool_n] = &fl_end->next; + *pfl[t_i * jl_gc_n_tot_pools + pg->pool_n] = fl_beg; + pfl[t_i * jl_gc_n_tot_pools + pg->pool_n] = &fl_end->next; } pg = pg2; } } // null out terminal pointers of free lists + // jl_safe_printf("GC sweep pages... null out terminal pointers of free lists\n"); for (int t_i = 0; t_i < n_threads; t_i++) { jl_ptls_t ptls2 = gc_all_tls_states[t_i]; if (ptls2 != NULL) { - for (int i = 0; i < JL_GC_N_POOLS; i++) { - *pfl[t_i * JL_GC_N_POOLS + i] = NULL; + for (int i = 0; i < jl_gc_n_tot_pools; i++) { + *pfl[t_i * jl_gc_n_tot_pools + i] = NULL; } } } @@ -3676,11 +3748,39 @@ void jl_init_thread_heap(jl_ptls_t ptls) { jl_thread_heap_t *heap = &ptls->heap; jl_gc_pool_t *p = heap->norm_pools; + // initialize pools for objects which are not special cased for (int i = 0; i < JL_GC_N_POOLS; i++) { p[i].osize = jl_gc_sizeclasses[i]; p[i].freelist = NULL; p[i].newpages = NULL; } + // initialize pools for SimpleVector + for (int i = 0; i < JL_GC_N_POOLS; i++) { + p[i + JL_GC_N_MAX_POOLS].osize = jl_gc_sizeclasses[i]; + p[i + JL_GC_N_MAX_POOLS].freelist = NULL; + p[i + JL_GC_N_MAX_POOLS].newpages = NULL; + } + // // initialize pool for CodeInstance + size_t ci_size = sizeof(jl_code_instance_t) + sizeof(jl_taggedvalue_t); + size_t sz = jl_gc_sizeclasses[jl_gc_szclass(ci_size)]; + int i = 2 * JL_GC_N_MAX_POOLS; + p[i].osize = sz; + p[i].freelist = NULL; + p[i].newpages = NULL; + // initialize pool for MethodInstance + size_t mi_size = sizeof(jl_method_instance_t) + sizeof(jl_taggedvalue_t); + sz = jl_gc_sizeclasses[jl_gc_szclass(mi_size)]; + i = 2 * JL_GC_N_MAX_POOLS + 1; + p[i].osize = sz; + p[i].freelist = NULL; + p[i].newpages = NULL; + // initialize pool for TypeMapEntry + size_t tme_size = sizeof(jl_typemap_entry_t) + sizeof(jl_taggedvalue_t); + sz = jl_gc_sizeclasses[jl_gc_szclass(tme_size)]; + i = 2 * JL_GC_N_MAX_POOLS + 2; + p[i].osize = sz; + p[i].freelist = NULL; + p[i].newpages = NULL; small_arraylist_new(&heap->weak_refs, 0); small_arraylist_new(&heap->live_tasks, 0); for (int i = 0; i < JL_N_STACK_POOLS; i++) @@ -3728,6 +3828,13 @@ void jl_gc_init(void) uv_mutex_init(&gc_perm_lock); uv_mutex_init(&gc_threads_lock); uv_cond_init(&gc_threads_cond); +#ifdef GC_HEAP_DUMP + page_profiling_file = fopen("julia_page_profile.out", "w"); + if (page_profiling_file == NULL) { + fprintf(stderr, "could not open \"julia_page_profile.out\" for writing\n"); + abort(); + } +#endif jl_gc_init_page(); jl_gc_debug_init(); diff --git a/src/jlapi.c b/src/jlapi.c index 5c6f01ab86a88..398d6b2618daf 100644 --- a/src/jlapi.c +++ b/src/jlapi.c @@ -678,6 +678,8 @@ static void rr_detach_teleport(void) { } #endif +void gc_enable_heap_dump(void); + JL_DLLEXPORT int jl_repl_entrypoint(int argc, char *argv[]) { // no-op on Windows, note that the caller must have already converted @@ -716,6 +718,8 @@ JL_DLLEXPORT int jl_repl_entrypoint(int argc, char *argv[]) } int ret = true_main(argc, (char**)new_argv); jl_atexit_hook(ret); + gc_enable_heap_dump(); + jl_gc_collect(JL_GC_FULL); return ret; } diff --git a/src/julia_internal.h b/src/julia_internal.h index e06a834a80752..950e2afdb7b2d 100644 --- a/src/julia_internal.h +++ b/src/julia_internal.h @@ -483,8 +483,20 @@ STATIC_INLINE jl_value_t *jl_gc_alloc_(jl_ptls_t ptls, size_t sz, void *ty) const size_t allocsz = sz + sizeof(jl_taggedvalue_t); if (sz <= GC_MAX_SZCLASS) { int pool_id = jl_gc_szclass(allocsz); - jl_gc_pool_t *p = &ptls->heap.norm_pools[pool_id]; int osize = jl_gc_sizeclasses[pool_id]; + jl_gc_pool_t *p = &ptls->heap.norm_pools[pool_id]; + if (ty == jl_simplevector_type) { + p += JL_GC_N_MAX_POOLS; + } + else if (ty == jl_code_instance_type) { + p = &ptls->heap.norm_pools[2 * JL_GC_N_MAX_POOLS]; + } + else if (ty == jl_method_instance_type) { + p = &ptls->heap.norm_pools[2 * JL_GC_N_MAX_POOLS + 1]; + } + else if (ty == jl_typemap_entry_type) { + p = &ptls->heap.norm_pools[2 * JL_GC_N_MAX_POOLS + 2]; + } // We call `jl_gc_pool_alloc_noinline` instead of `jl_gc_pool_alloc` to avoid double-counting in // the Allocations Profiler. (See https://github.com/JuliaLang/julia/pull/43868 for more details.) v = jl_gc_pool_alloc_noinline(ptls, (char*)p - (char*)ptls, osize); diff --git a/src/julia_threads.h b/src/julia_threads.h index f69f9dd4baacf..84bf896639b69 100644 --- a/src/julia_threads.h +++ b/src/julia_threads.h @@ -161,7 +161,13 @@ typedef struct { // variables for allocating objects from pools #define JL_GC_N_MAX_POOLS 51 // conservative. must be kept in sync with `src/julia_internal.h` - jl_gc_pool_t norm_pools[JL_GC_N_MAX_POOLS]; + // pool layout: + // - [0, JL_GC_N_MAX_POOLS): small objects which are not special cased (see below) + // - [JL_GC_N_MAX_POOLS, 2 * JL_GC_N_MAX_POOLS): SimpleVector + // - 2 * JL_GC_N_MAX_POOLS: CodeInstance + // - 2 * JL_GC_N_MAX_POOLS + 1: MethodInstance + // - 2 * JL_GC_N_MAX_POOLS + 2: TypeMapEntry + jl_gc_pool_t norm_pools[2 * JL_GC_N_MAX_POOLS + 3]; #define JL_N_STACK_POOLS 16 small_arraylist_t free_stacks[JL_N_STACK_POOLS];