diff --git a/src/atomics.h b/src/atomics.h
index ebfc66bbd83f4..a43ebed83a745 100644
--- a/src/atomics.h
+++ b/src/atomics.h
@@ -49,6 +49,8 @@
     __atomic_fetch_add(obj, arg, __ATOMIC_RELAXED)
 #  define jl_atomic_fetch_add(obj, arg) \
     __atomic_fetch_add(obj, arg, __ATOMIC_SEQ_CST)
+#  define jl_atomic_add_fetch(obj, arg) \
+    __atomic_add_fetch(obj, arg, __ATOMIC_SEQ_CST)
 #  define jl_atomic_fetch_and_relaxed(obj, arg) \
     __atomic_fetch_and(obj, arg, __ATOMIC_RELAXED)
 #  define jl_atomic_fetch_and(obj, arg) \
@@ -91,6 +93,8 @@
     __atomic_load_n(obj, __ATOMIC_SEQ_CST)
 #  define jl_atomic_load_acquire(obj) \
     __atomic_load_n(obj, __ATOMIC_ACQUIRE)
+#  define jl_atomic_load_relaxed(obj) \
+    __atomic_load_n(obj, __ATOMIC_RELAXED)
 
 #elif defined(_COMPILER_MICROSOFT_)
 #  define jl_signal_fence() _ReadWriteBarrier()
diff --git a/src/gc.c b/src/gc.c
index 0fe4b1fdc460a..ea5631844f925 100644
--- a/src/gc.c
+++ b/src/gc.c
@@ -491,6 +491,19 @@ static size_t max_collect_interval = 1250000000UL;
 static size_t max_collect_interval =  500000000UL;
 #endif
 
+// determine how often the given thread should atomically update
+// the global allocation counter.
+// NOTE: currently the same for all threads.
+static int64_t per_thread_counter_interval(jl_ptls_t ptls)
+{
+    if (jl_n_threads == 1)
+        return gc_num.interval;
+    size_t intvl = gc_num.interval / jl_n_threads / 2;
+    if (intvl < 1048576)
+        return 1048576;
+    return intvl;
+}
+
 // global variables for GC stats
 
 // Resetting the object to a young object, this is used when marking the
@@ -802,16 +815,21 @@ void jl_gc_force_mark_old(jl_ptls_t ptls, jl_value_t *v) JL_NOTSAFEPOINT
     jl_gc_queue_root(v);
 }
 
-#define should_collect() (__unlikely(gc_num.allocd>0))
-
-static inline int maybe_collect(jl_ptls_t ptls)
+static inline void maybe_collect(jl_ptls_t ptls)
 {
-    if (should_collect() || gc_debug_check_other()) {
+    int should_collect = 0;
+    if (ptls->gc_num.allocd >= 0) {
+        int64_t intvl = per_thread_counter_interval(ptls);
+        size_t localbytes = ptls->gc_num.allocd + intvl;
+        ptls->gc_num.allocd = -intvl;
+        should_collect = (jl_atomic_add_fetch(&gc_num.allocd, localbytes) >= 0);
+    }
+    if (should_collect || gc_debug_check_other()) {
         jl_gc_collect(0);
-        return 1;
     }
-    jl_gc_safepoint_(ptls);
-    return 0;
+    else {
+        jl_gc_safepoint_(ptls);
+    }
 }
 
 // weak references
@@ -876,12 +894,8 @@ JL_DLLEXPORT jl_value_t *jl_gc_big_alloc(jl_ptls_t ptls, size_t sz)
         jl_throw(jl_memory_exception);
     gc_invoke_callbacks(jl_gc_cb_notify_external_alloc_t,
         gc_cblist_notify_external_alloc, (v, allocsz));
-#ifdef JULIA_ENABLE_THREADING
-    jl_atomic_fetch_add(&gc_num.allocd, allocsz);
-#else
-    gc_num.allocd += allocsz;
-#endif
-    gc_num.bigalloc++;
+    ptls->gc_num.allocd += allocsz;
+    ptls->gc_num.bigalloc++;
 #ifdef MEMDEBUG
     memset(v, 0xee, allocsz);
 #endif
@@ -973,14 +987,44 @@ void jl_gc_track_malloced_array(jl_ptls_t ptls, jl_array_t *a) JL_NOTSAFEPOINT
 
 void jl_gc_count_allocd(size_t sz) JL_NOTSAFEPOINT
 {
-    gc_num.allocd += sz;
+    jl_ptls_t ptls = jl_get_ptls_states();
+    ptls->gc_num.allocd += sz;
+}
+
+static void combine_thread_gc_counts(jl_gc_num_t *dest)
+{
+    for (int i = 0; i < jl_n_threads; i++) {
+        jl_ptls_t ptls = jl_all_tls_states[i];
+        if (ptls) {
+            dest->allocd += (jl_atomic_load_relaxed(&ptls->gc_num.allocd) + per_thread_counter_interval(ptls));
+            dest->freed += jl_atomic_load_relaxed(&ptls->gc_num.freed);
+            dest->malloc += jl_atomic_load_relaxed(&ptls->gc_num.malloc);
+            dest->realloc += jl_atomic_load_relaxed(&ptls->gc_num.realloc);
+            dest->poolalloc += jl_atomic_load_relaxed(&ptls->gc_num.poolalloc);
+            dest->bigalloc += jl_atomic_load_relaxed(&ptls->gc_num.bigalloc);
+            dest->freecall += jl_atomic_load_relaxed(&ptls->gc_num.freecall);
+        }
+    }
+}
+
+static void reset_thread_gc_counts(void)
+{
+    for (int i = 0; i < jl_n_threads; i++) {
+        jl_ptls_t ptls = jl_all_tls_states[i];
+        if (ptls) {
+            memset(&ptls->gc_num, 0, sizeof(jl_thread_gc_num_t));
+            ptls->gc_num.allocd = -per_thread_counter_interval(ptls);
+        }
+    }
 }
 
 void jl_gc_reset_alloc_count(void) JL_NOTSAFEPOINT
 {
+    combine_thread_gc_counts(&gc_num);
     live_bytes += (gc_num.deferred_alloc + (gc_num.allocd + gc_num.interval));
     gc_num.allocd = -(int64_t)gc_num.interval;
     gc_num.deferred_alloc = 0;
+    reset_thread_gc_counts();
 }
 
 static size_t array_nbytes(jl_array_t *a) JL_NOTSAFEPOINT
@@ -1098,16 +1142,9 @@ JL_DLLEXPORT jl_value_t *jl_gc_pool_alloc(jl_ptls_t ptls, int pool_offset,
 #ifdef MEMDEBUG
     return jl_gc_big_alloc(ptls, osize);
 #endif
-    // FIXME - need JL_ATOMIC_FETCH_AND_ADD here
-    if (__unlikely((gc_num.allocd += osize) >= 0) || gc_debug_check_pool()) {
-        //gc_num.allocd -= osize;
-        jl_gc_collect(0);
-        //gc_num.allocd += osize;
-    }
-    else {
-        jl_gc_safepoint_(ptls);
-    }
-    gc_num.poolalloc++;
+    maybe_collect(ptls);
+    ptls->gc_num.allocd += osize;
+    ptls->gc_num.poolalloc++;
     // first try to use the freelist
     jl_taggedvalue_t *v = p->freelist;
     if (v) {
@@ -2603,9 +2640,11 @@ JL_DLLEXPORT int jl_gc_is_enabled(void)
 
 JL_DLLEXPORT int64_t jl_gc_total_bytes(void)
 {
+    jl_gc_num_t num = gc_num;
+    combine_thread_gc_counts(&num);
     // Sync this logic with `base/util.jl:GC_Diff`
-    return (gc_num.total_allocd + gc_num.deferred_alloc +
-            gc_num.allocd + gc_num.interval);
+    return (num.total_allocd + num.deferred_alloc +
+            num.allocd + num.interval);
 }
 
 JL_DLLEXPORT uint64_t jl_gc_total_hrtime(void)
 {
@@ -2613,7 +2652,9 @@ JL_DLLEXPORT uint64_t jl_gc_total_hrtime(void)
 }
 
 JL_DLLEXPORT jl_gc_num_t jl_gc_num(void)
 {
-    return gc_num;
+    jl_gc_num_t num = gc_num;
+    combine_thread_gc_counts(&num);
+    return num;
 }
 
 JL_DLLEXPORT int64_t jl_gc_diff_total_bytes(void)
@@ -2687,6 +2728,8 @@ static void jl_gc_queue_bt_buf(jl_gc_mark_cache_t *gc_cache, jl_gc_mark_sp_t *sp
 // Only one thread should be running in this function
 static int _jl_gc_collect(jl_ptls_t ptls, int full)
 {
+    combine_thread_gc_counts(&gc_num);
+
     jl_gc_mark_cache_t *gc_cache = &ptls->gc_cache;
     jl_gc_mark_sp_t sp;
     gc_mark_sp_init(gc_cache, &sp);
@@ -2853,6 +2896,7 @@ static int _jl_gc_collect(jl_ptls_t ptls, int full)
     gc_num.total_time += pause;
     gc_num.since_sweep = 0;
     gc_num.freed = 0;
+    reset_thread_gc_counts();
 
     return recollect;
 }
@@ -2962,6 +3006,10 @@ void jl_init_thread_heap(jl_ptls_t ptls)
     gc_cache->pc_stack = (void**)malloc(init_size * sizeof(void*));
     gc_cache->pc_stack_end = gc_cache->pc_stack + init_size;
     gc_cache->data_stack = (jl_gc_mark_data_t *)malloc(init_size * sizeof(jl_gc_mark_data_t));
+
+    memset(&ptls->gc_num, 0, sizeof(jl_thread_gc_num_t));
+    assert(gc_num.interval == default_collect_interval);
+    ptls->gc_num.allocd = -per_thread_counter_interval(ptls);
 }
 
 // System-wide initializations
@@ -2999,8 +3047,8 @@ JL_DLLEXPORT void *jl_gc_counted_malloc(size_t sz)
 {
     jl_ptls_t ptls = jl_get_ptls_states();
     maybe_collect(ptls);
-    gc_num.allocd += sz;
-    gc_num.malloc++;
+    ptls->gc_num.allocd += sz;
+    ptls->gc_num.malloc++;
     void *b = malloc(sz);
     if (b == NULL)
         jl_throw(jl_memory_exception);
@@ -3011,8 +3059,8 @@ JL_DLLEXPORT void *jl_gc_counted_calloc(size_t nm, size_t sz)
 {
     jl_ptls_t ptls = jl_get_ptls_states();
     maybe_collect(ptls);
-    gc_num.allocd += nm*sz;
-    gc_num.malloc++;
+    ptls->gc_num.allocd += nm*sz;
+    ptls->gc_num.malloc++;
     void *b = calloc(nm, sz);
     if (b == NULL)
         jl_throw(jl_memory_exception);
@@ -3021,9 +3069,10 @@ JL_DLLEXPORT void *jl_gc_counted_calloc(size_t nm, size_t sz)
 
 JL_DLLEXPORT void jl_gc_counted_free_with_size(void *p, size_t sz)
 {
+    jl_ptls_t ptls = jl_get_ptls_states();
     free(p);
-    gc_num.freed += sz;
-    gc_num.freecall++;
+    ptls->gc_num.freed += sz;
+    ptls->gc_num.freecall++;
 }
 
 // older name for jl_gc_counted_free_with_size
@@ -3037,10 +3086,10 @@ JL_DLLEXPORT void *jl_gc_counted_realloc_with_old_size(void *p, size_t old, size
     jl_ptls_t ptls = jl_get_ptls_states();
     maybe_collect(ptls);
     if (sz < old)
-        gc_num.freed += (old - sz);
+        ptls->gc_num.freed += (old - sz);
     else
-        gc_num.allocd += (sz - old);
-    gc_num.realloc++;
+        ptls->gc_num.allocd += (sz - old);
+    ptls->gc_num.realloc++;
     void *b = realloc(p, sz);
     if (b == NULL)
         jl_throw(jl_memory_exception);
@@ -3100,8 +3149,8 @@ JL_DLLEXPORT void *jl_gc_managed_malloc(size_t sz)
     size_t allocsz = LLT_ALIGN(sz, JL_CACHE_BYTE_ALIGNMENT);
     if (allocsz < sz) // overflow in adding offs, size was "negative"
         jl_throw(jl_memory_exception);
-    gc_num.allocd += allocsz;
-    gc_num.malloc++;
+    ptls->gc_num.allocd += allocsz;
+    ptls->gc_num.malloc++;
     void *b = malloc_cache_align(allocsz);
     if (b == NULL)
         jl_throw(jl_memory_exception);
@@ -3123,10 +3172,10 @@ static void *gc_managed_realloc_(jl_ptls_t ptls, void *d, size_t sz, size_t olds
         live_bytes += allocsz - oldsz;
     }
     else if (allocsz < oldsz)
-        gc_num.freed += (oldsz - allocsz);
+        ptls->gc_num.freed += (oldsz - allocsz);
     else
-        gc_num.allocd += (allocsz - oldsz);
-    gc_num.realloc++;
+        ptls->gc_num.allocd += (allocsz - oldsz);
+    ptls->gc_num.realloc++;
 
     void *b;
     if (isaligned)
diff --git a/src/init.c b/src/init.c
index 1c0f6564e5ff4..42eddbdd0544c 100644
--- a/src/init.c
+++ b/src/init.c
@@ -742,10 +742,11 @@ void _julia_init(JL_IMAGE_SEARCH rel)
         jl_error("cannot generate code-coverage or track allocation information while generating a .o or .bc output file");
     }
 
+    jl_gc_init();
+
     jl_init_threading();
     jl_init_intrinsic_properties();
-    jl_gc_init();
     jl_gc_enable(0);
 
     jl_resolve_sysimg_location(rel);
diff --git a/src/julia_threads.h b/src/julia_threads.h
index 06088d52f48fb..f856c5a5b3007 100644
--- a/src/julia_threads.h
+++ b/src/julia_threads.h
@@ -65,6 +65,16 @@ typedef struct {
     uint16_t osize; // size of objects in this pool
 } jl_gc_pool_t;
 
+typedef struct {
+    int64_t allocd;
+    int64_t freed;
+    uint64_t malloc;
+    uint64_t realloc;
+    uint64_t poolalloc;
+    uint64_t bigalloc;
+    uint64_t freecall;
+} jl_thread_gc_num_t;
+
 typedef struct {
     // variable for tracking weak references
     arraylist_t weak_refs;
@@ -156,6 +166,7 @@ struct _jl_tls_states_t {
     volatile int8_t in_finalizer;
     int8_t disable_gc;
     jl_thread_heap_t heap;
+    jl_thread_gc_num_t gc_num;
     uv_mutex_t sleep_lock;
     uv_cond_t wake_signal;
     volatile sig_atomic_t defer_signal;