diff --git a/src/atomics.h b/src/atomics.h
index ebfc66bbd83f4..a43ebed83a745 100644
--- a/src/atomics.h
+++ b/src/atomics.h
@@ -49,6 +49,8 @@
     __atomic_fetch_add(obj, arg, __ATOMIC_RELAXED)
 #  define jl_atomic_fetch_add(obj, arg) \
     __atomic_fetch_add(obj, arg, __ATOMIC_SEQ_CST)
+#  define jl_atomic_add_fetch(obj, arg) \
+    __atomic_add_fetch(obj, arg, __ATOMIC_SEQ_CST)
 #  define jl_atomic_fetch_and_relaxed(obj, arg) \
     __atomic_fetch_and(obj, arg, __ATOMIC_RELAXED)
 #  define jl_atomic_fetch_and(obj, arg) \
@@ -91,6 +93,8 @@
     __atomic_load_n(obj, __ATOMIC_SEQ_CST)
 #  define jl_atomic_load_acquire(obj) \
     __atomic_load_n(obj, __ATOMIC_ACQUIRE)
+#  define jl_atomic_load_relaxed(obj) \
+    __atomic_load_n(obj, __ATOMIC_RELAXED)
 
 #elif defined(_COMPILER_MICROSOFT_)
 #  define jl_signal_fence() _ReadWriteBarrier()
diff --git a/src/gc.c b/src/gc.c
index 0fe4b1fdc460a..ea5631844f925 100644
--- a/src/gc.c
+++ b/src/gc.c
@@ -491,6 +491,19 @@ static size_t max_collect_interval = 1250000000UL;
 static size_t max_collect_interval =  500000000UL;
 #endif
 
+// determine how often the given thread should atomically update
+// the global allocation counter.
+// NOTE: currently the same for all threads.
+static int64_t per_thread_counter_interval(jl_ptls_t ptls)
+{
+    if (jl_n_threads == 1)
+        return gc_num.interval;
+    size_t intvl = gc_num.interval / jl_n_threads / 2;
+    if (intvl < 1048576)
+        return 1048576;
+    return intvl;
+}
+
 // global variables for GC stats
 
 // Resetting the object to a young object, this is used when marking the
@@ -802,16 +815,21 @@ void jl_gc_force_mark_old(jl_ptls_t ptls, jl_value_t *v) JL_NOTSAFEPOINT
     jl_gc_queue_root(v);
 }
 
-#define should_collect() (__unlikely(gc_num.allocd>0))
-
-static inline int maybe_collect(jl_ptls_t ptls)
+static inline void maybe_collect(jl_ptls_t ptls)
 {
-    if (should_collect() || gc_debug_check_other()) {
+    int should_collect = 0;
+    if (ptls->gc_num.allocd >= 0) {
+        int64_t intvl = per_thread_counter_interval(ptls);
+        size_t localbytes = ptls->gc_num.allocd + intvl;
+        ptls->gc_num.allocd = -intvl;
+        should_collect = (jl_atomic_add_fetch(&gc_num.allocd, localbytes) >= 0);
+    }
+    if (should_collect || gc_debug_check_other()) {
         jl_gc_collect(0);
-        return 1;
     }
-    jl_gc_safepoint_(ptls);
-    return 0;
+    else {
+        jl_gc_safepoint_(ptls);
+    }
 }
 
 // weak references
@@ -876,12 +894,8 @@ JL_DLLEXPORT jl_value_t *jl_gc_big_alloc(jl_ptls_t ptls, size_t sz)
         jl_throw(jl_memory_exception);
     gc_invoke_callbacks(jl_gc_cb_notify_external_alloc_t,
         gc_cblist_notify_external_alloc, (v, allocsz));
-#ifdef JULIA_ENABLE_THREADING
-    jl_atomic_fetch_add(&gc_num.allocd, allocsz);
-#else
-    gc_num.allocd += allocsz;
-#endif
-    gc_num.bigalloc++;
+    ptls->gc_num.allocd += allocsz;
+    ptls->gc_num.bigalloc++;
 #ifdef MEMDEBUG
     memset(v, 0xee, allocsz);
 #endif
@@ -973,14 +987,44 @@ void jl_gc_track_malloced_array(jl_ptls_t ptls, jl_array_t *a) JL_NOTSAFEPOINT
 
 void jl_gc_count_allocd(size_t sz) JL_NOTSAFEPOINT
 {
-    gc_num.allocd += sz;
+    jl_ptls_t ptls = jl_get_ptls_states();
+    ptls->gc_num.allocd += sz;
+}
+
+static void combine_thread_gc_counts(jl_gc_num_t *dest)
+{
+    for (int i = 0; i < jl_n_threads; i++) {
+        jl_ptls_t ptls = jl_all_tls_states[i];
+        if (ptls) {
+            dest->allocd += (jl_atomic_load_relaxed(&ptls->gc_num.allocd) + per_thread_counter_interval(ptls));
+            dest->freed += jl_atomic_load_relaxed(&ptls->gc_num.freed);
+            dest->malloc += jl_atomic_load_relaxed(&ptls->gc_num.malloc);
+            dest->realloc += jl_atomic_load_relaxed(&ptls->gc_num.realloc);
+            dest->poolalloc += jl_atomic_load_relaxed(&ptls->gc_num.poolalloc);
+            dest->bigalloc += jl_atomic_load_relaxed(&ptls->gc_num.bigalloc);
+            dest->freecall += jl_atomic_load_relaxed(&ptls->gc_num.freecall);
+        }
+    }
+}
+
+static void reset_thread_gc_counts(void)
+{
+    for (int i = 0; i < jl_n_threads; i++) {
+        jl_ptls_t ptls = jl_all_tls_states[i];
+        if (ptls) {
+            memset(&ptls->gc_num, 0, sizeof(jl_thread_gc_num_t));
+            ptls->gc_num.allocd = -per_thread_counter_interval(ptls);
+        }
+    }
 }
 
 void jl_gc_reset_alloc_count(void) JL_NOTSAFEPOINT
 {
+    combine_thread_gc_counts(&gc_num);
     live_bytes += (gc_num.deferred_alloc + (gc_num.allocd + gc_num.interval));
     gc_num.allocd = -(int64_t)gc_num.interval;
     gc_num.deferred_alloc = 0;
+    reset_thread_gc_counts();
 }
 
 static size_t array_nbytes(jl_array_t *a) JL_NOTSAFEPOINT
@@ -1098,16 +1142,9 @@ JL_DLLEXPORT jl_value_t *jl_gc_pool_alloc(jl_ptls_t ptls, int pool_offset,
 #ifdef MEMDEBUG
     return jl_gc_big_alloc(ptls, osize);
 #endif
-    // FIXME - need JL_ATOMIC_FETCH_AND_ADD here
-    if (__unlikely((gc_num.allocd += osize) >= 0) || gc_debug_check_pool()) {
-        //gc_num.allocd -= osize;
-        jl_gc_collect(0);
-        //gc_num.allocd += osize;
-    }
-    else {
-        jl_gc_safepoint_(ptls);
-    }
-    gc_num.poolalloc++;
+    maybe_collect(ptls);
+    ptls->gc_num.allocd += osize;
+    ptls->gc_num.poolalloc++;
     // first try to use the freelist
     jl_taggedvalue_t *v = p->freelist;
     if (v) {
@@ -2603,9 +2640,11 @@ JL_DLLEXPORT int jl_gc_is_enabled(void)
 
 JL_DLLEXPORT int64_t jl_gc_total_bytes(void)
 {
+    jl_gc_num_t num = gc_num;
+    combine_thread_gc_counts(&num);
     // Sync this logic with `base/util.jl:GC_Diff`
-    return (gc_num.total_allocd + gc_num.deferred_alloc +
-            gc_num.allocd + gc_num.interval);
+    return (num.total_allocd + num.deferred_alloc +
+            num.allocd + num.interval);
 }
 
 JL_DLLEXPORT uint64_t jl_gc_total_hrtime(void)
 {
@@ -2613,7 +2652,9 @@ JL_DLLEXPORT uint64_t jl_gc_total_hrtime(void)
 }
 
 JL_DLLEXPORT jl_gc_num_t jl_gc_num(void)
 {
-    return gc_num;
+    jl_gc_num_t num = gc_num;
+    combine_thread_gc_counts(&num);
+    return num;
 }
 
 JL_DLLEXPORT int64_t jl_gc_diff_total_bytes(void)
@@ -2687,6 +2728,8 @@ static void jl_gc_queue_bt_buf(jl_gc_mark_cache_t *gc_cache, jl_gc_mark_sp_t *sp
 // Only one thread should be running in this function
 static int _jl_gc_collect(jl_ptls_t ptls, int full)
 {
+    combine_thread_gc_counts(&gc_num);
+
     jl_gc_mark_cache_t *gc_cache = &ptls->gc_cache;
     jl_gc_mark_sp_t sp;
     gc_mark_sp_init(gc_cache, &sp);
@@ -2853,6 +2896,7 @@ static int _jl_gc_collect(jl_ptls_t ptls, int full)
     gc_num.total_time += pause;
     gc_num.since_sweep = 0;
     gc_num.freed = 0;
+    reset_thread_gc_counts();
 
     return recollect;
 }
@@ -2962,6 +3006,10 @@ void jl_init_thread_heap(jl_ptls_t ptls)
     gc_cache->pc_stack = (void**)malloc(init_size * sizeof(void*));
     gc_cache->pc_stack_end = gc_cache->pc_stack + init_size;
     gc_cache->data_stack = (jl_gc_mark_data_t *)malloc(init_size * sizeof(jl_gc_mark_data_t));
+
+    memset(&ptls->gc_num, 0, sizeof(jl_thread_gc_num_t));
+    assert(gc_num.interval == default_collect_interval);
+    ptls->gc_num.allocd = -per_thread_counter_interval(ptls);
 }
 
 // System-wide initializations
@@ -2999,8 +3047,8 @@ JL_DLLEXPORT void *jl_gc_counted_malloc(size_t sz)
 {
     jl_ptls_t ptls = jl_get_ptls_states();
     maybe_collect(ptls);
-    gc_num.allocd += sz;
-    gc_num.malloc++;
+    ptls->gc_num.allocd += sz;
+    ptls->gc_num.malloc++;
     void *b = malloc(sz);
     if (b == NULL)
         jl_throw(jl_memory_exception);
@@ -3011,8 +3059,8 @@ JL_DLLEXPORT void *jl_gc_counted_calloc(size_t nm, size_t sz)
 {
     jl_ptls_t ptls = jl_get_ptls_states();
     maybe_collect(ptls);
-    gc_num.allocd += nm*sz;
-    gc_num.malloc++;
+    ptls->gc_num.allocd += nm*sz;
+    ptls->gc_num.malloc++;
     void *b = calloc(nm, sz);
     if (b == NULL)
         jl_throw(jl_memory_exception);
@@ -3021,9 +3069,10 @@ JL_DLLEXPORT void *jl_gc_counted_calloc(size_t nm, size_t sz)
 
 JL_DLLEXPORT void jl_gc_counted_free_with_size(void *p, size_t sz)
 {
+    jl_ptls_t ptls = jl_get_ptls_states();
     free(p);
-    gc_num.freed += sz;
-    gc_num.freecall++;
+    ptls->gc_num.freed += sz;
+    ptls->gc_num.freecall++;
 }
 
 // older name for jl_gc_counted_free_with_size
@@ -3037,10 +3086,10 @@ JL_DLLEXPORT void *jl_gc_counted_realloc_with_old_size(void *p, size_t old, size
     jl_ptls_t ptls = jl_get_ptls_states();
     maybe_collect(ptls);
     if (sz < old)
-        gc_num.freed += (old - sz);
+        ptls->gc_num.freed += (old - sz);
     else
-        gc_num.allocd += (sz - old);
-    gc_num.realloc++;
+        ptls->gc_num.allocd += (sz - old);
+    ptls->gc_num.realloc++;
     void *b = realloc(p, sz);
     if (b == NULL)
         jl_throw(jl_memory_exception);
@@ -3100,8 +3149,8 @@ JL_DLLEXPORT void *jl_gc_managed_malloc(size_t sz)
     size_t allocsz = LLT_ALIGN(sz, JL_CACHE_BYTE_ALIGNMENT);
     if (allocsz < sz) // overflow in adding offs, size was "negative"
         jl_throw(jl_memory_exception);
-    gc_num.allocd += allocsz;
-    gc_num.malloc++;
+    ptls->gc_num.allocd += allocsz;
+    ptls->gc_num.malloc++;
     void *b = malloc_cache_align(allocsz);
     if (b == NULL)
         jl_throw(jl_memory_exception);
@@ -3123,10 +3172,10 @@ static void *gc_managed_realloc_(jl_ptls_t ptls, void *d, size_t sz, size_t olds
         live_bytes += allocsz - oldsz;
     }
     else if (allocsz < oldsz)
-        gc_num.freed += (oldsz - allocsz);
+        ptls->gc_num.freed += (oldsz - allocsz);
     else
-        gc_num.allocd += (allocsz - oldsz);
-    gc_num.realloc++;
+        ptls->gc_num.allocd += (allocsz - oldsz);
+    ptls->gc_num.realloc++;
 
     void *b;
     if (isaligned)
diff --git a/src/init.c b/src/init.c
index 1c0f6564e5ff4..42eddbdd0544c 100644
--- a/src/init.c
+++ b/src/init.c
@@ -742,10 +742,11 @@ void _julia_init(JL_IMAGE_SEARCH rel)
         jl_error("cannot generate code-coverage or track allocation information while generating a .o or .bc output file");
     }
 
+    jl_gc_init();
+
     jl_init_threading();
     jl_init_intrinsic_properties();
-    jl_gc_init();
     jl_gc_enable(0);
 
     jl_resolve_sysimg_location(rel);
diff --git a/src/julia_threads.h b/src/julia_threads.h
index 06088d52f48fb..f856c5a5b3007 100644
--- a/src/julia_threads.h
+++ b/src/julia_threads.h
@@ -65,6 +65,16 @@ typedef struct {
     uint16_t osize; // size of objects in this pool
 } jl_gc_pool_t;
 
+typedef struct {
+    int64_t allocd;
+    int64_t freed;
+    uint64_t malloc;
+    uint64_t realloc;
+    uint64_t poolalloc;
+    uint64_t bigalloc;
+    uint64_t freecall;
+} jl_thread_gc_num_t;
+
 typedef struct {
     // variable for tracking weak references
     arraylist_t weak_refs;
@@ -156,6 +166,7 @@ struct _jl_tls_states_t {
     volatile int8_t in_finalizer;
     int8_t disable_gc;
     jl_thread_heap_t heap;
+    jl_thread_gc_num_t gc_num;
     uv_mutex_t sleep_lock;
     uv_cond_t wake_signal;
     volatile sig_atomic_t defer_signal;