parallelize sweeping of object pools
d-netto committed Sep 12, 2023
1 parent 5d82d80 commit a03e7fe
Showing 8 changed files with 158 additions and 76 deletions.
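The change in a nutshell: pool sweeping used to run on a single thread inside gc_sweep_pool(), walking each thread's page_metadata_allocd list while threading one shared freelist tail pointer (pfl) through every page. After this commit, each per-thread page list becomes a lock-free stack (jl_gc_page_stack_t), GC worker threads are woken via gc_sweep_wake_all(), and every participating thread pops and sweeps pages independently in gc_sweep_pool_parallel(); the per-page free lists are stitched together afterwards on one thread. A condensed sketch of the new driver, simplified from the gc_sweep_pool() hunks below (not the literal code):

    // publish per-thread scratch stacks, fan out, then wait
    jl_gc_page_stack_t *tmp = (jl_gc_page_stack_t *)alloca(n_threads * sizeof(jl_gc_page_stack_t));
    memset(tmp, 0, n_threads * sizeof(jl_gc_page_stack_t));
    jl_atomic_store(&gc_allocd_scratch, tmp); // make scratch stacks visible to all sweepers
    gc_sweep_wake_all();      // bump gc_sweeps_requested and broadcast the condvar
    gc_sweep_pool_parallel(); // this thread helps too: pop a page, sweep it, repeat
    gc_sweep_wait_for_all();  // clear gc_allocd_scratch, spin until gc_n_threads_sweeping == 0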
4 changes: 2 additions & 2 deletions src/gc-debug.c
@@ -115,7 +115,7 @@ static void gc_clear_mark_outer(int bits)
 {
     for (int i = 0; i < gc_n_threads; i++) {
         jl_ptls_t ptls2 = gc_all_tls_states[i];
-        jl_gc_pagemeta_t *pg = ptls2->page_metadata_allocd;
+        jl_gc_pagemeta_t *pg = jl_atomic_load_relaxed(&ptls2->page_metadata_allocd.bottom);
         while (pg != NULL) {
             gc_clear_mark_page(pg, bits);
             pg = pg->next;
@@ -1153,7 +1153,7 @@ static void gc_count_pool_pagetable(void)
 {
     for (int i = 0; i < gc_n_threads; i++) {
         jl_ptls_t ptls2 = gc_all_tls_states[i];
-        jl_gc_pagemeta_t *pg = ptls2->page_metadata_allocd;
+        jl_gc_pagemeta_t *pg = jl_atomic_load_relaxed(&ptls2->page_metadata_allocd.bottom);
         while (pg != NULL) {
             if (gc_alloc_map_is_set(pg->data)) {
                 gc_count_pool_page(pg);
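Both hunks above change only how the traversal starts: page_metadata_allocd is now a lock-free stack, so its head must be read with an atomic load of .bottom. A minimal sketch of the traversal pattern these debug walkers now share (walk_pages is a hypothetical name, not part of the commit; the plain walk of the next links is safe because these routines run while the world is stopped):

    static void walk_pages(jl_gc_page_stack_t *stack, void (*f)(jl_gc_pagemeta_t *))
    {
        jl_gc_pagemeta_t *pg = jl_atomic_load_relaxed(&stack->bottom);
        while (pg != NULL) {
            f(pg); // e.g. gc_clear_mark_page or gc_count_pool_page
            pg = pg->next;
        }
    }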
12 changes: 6 additions & 6 deletions src/gc-pages.c
@@ -100,22 +100,22 @@ NOINLINE jl_gc_pagemeta_t *jl_gc_alloc_page(void) JL_NOTSAFEPOINT
     jl_gc_pagemeta_t *meta = NULL;
 
     // try to get page from `pool_lazily_freed`
-    meta = pop_lf_page_metadata_back(&global_page_pool_lazily_freed);
+    meta = pop_lf_back(&global_page_pool_lazily_freed);
     if (meta != NULL) {
         gc_alloc_map_set(meta->data, GC_PAGE_ALLOCATED);
         // page is already mapped
         return meta;
     }
 
     // try to get page from `pool_clean`
-    meta = pop_lf_page_metadata_back(&global_page_pool_clean);
+    meta = pop_lf_back(&global_page_pool_clean);
     if (meta != NULL) {
         gc_alloc_map_set(meta->data, GC_PAGE_ALLOCATED);
         goto exit;
     }
 
     // try to get page from `pool_freed`
-    meta = pop_lf_page_metadata_back(&global_page_pool_freed);
+    meta = pop_lf_back(&global_page_pool_freed);
     if (meta != NULL) {
         jl_atomic_fetch_add_relaxed(&gc_heap_stats.bytes_resident, GC_PAGE_SZ);
         gc_alloc_map_set(meta->data, GC_PAGE_ALLOCATED);
@@ -124,7 +124,7 @@ NOINLINE jl_gc_pagemeta_t *jl_gc_alloc_page(void) JL_NOTSAFEPOINT
 
     uv_mutex_lock(&gc_perm_lock);
     // another thread may have allocated a large block while we were waiting...
-    meta = pop_lf_page_metadata_back(&global_page_pool_clean);
+    meta = pop_lf_back(&global_page_pool_clean);
     if (meta != NULL) {
         uv_mutex_unlock(&gc_perm_lock);
         gc_alloc_map_set(meta->data, GC_PAGE_ALLOCATED);
@@ -138,10 +138,10 @@ NOINLINE jl_gc_pagemeta_t *jl_gc_alloc_page(void) JL_NOTSAFEPOINT
         pg->data = data + GC_PAGE_SZ * i;
         gc_alloc_map_maybe_create(pg->data);
         if (i == 0) {
-            gc_alloc_map_set(pg->data, 1);
+            gc_alloc_map_set(pg->data, GC_PAGE_ALLOCATED);
         }
         else {
-            push_lf_page_metadata_back(&global_page_pool_clean, pg);
+            push_lf_back(&global_page_pool_clean, pg);
         }
     }
     uv_mutex_unlock(&gc_perm_lock);
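push_lf_back and pop_lf_back replace the old page-metadata list helpers throughout this file. Their definitions live in src/gc.h, which is among the changed files not rendered on this page; the sketch below reconstructs their likely shape from the .bottom accesses visible elsewhere in the diff — a Treiber-style lock-free stack threaded through the pagemeta next links. Treat it as an approximation, not the exact code:

    typedef struct {
        _Atomic(jl_gc_pagemeta_t *) bottom;
    } jl_gc_page_stack_t;

    STATIC_INLINE void push_lf_back(jl_gc_page_stack_t *pool, jl_gc_pagemeta_t *pg) JL_NOTSAFEPOINT
    {
        while (1) {
            jl_gc_pagemeta_t *old_back = jl_atomic_load_relaxed(&pool->bottom);
            pg->next = old_back; // link before publishing
            if (jl_atomic_cmpswap(&pool->bottom, &old_back, pg))
                break;
            jl_cpu_pause();
        }
    }

    STATIC_INLINE jl_gc_pagemeta_t *pop_lf_back(jl_gc_page_stack_t *pool) JL_NOTSAFEPOINT
    {
        while (1) {
            jl_gc_pagemeta_t *old_back = jl_atomic_load_relaxed(&pool->bottom);
            if (old_back == NULL)
                return NULL; // stack is empty
            if (jl_atomic_cmpswap(&pool->bottom, &old_back, old_back->next))
                return old_back;
            jl_cpu_pause();
        }
    }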
146 changes: 107 additions & 39 deletions src/gc.c
@@ -18,6 +18,10 @@ int jl_n_markthreads;
 int jl_n_sweepthreads;
 // Number of threads currently running the GC mark-loop
 _Atomic(int) gc_n_threads_marking;
+// Number of threads sweeping
+_Atomic(int) gc_n_threads_sweeping;
+// Temporary for the `ptls->page_metadata_allocd` used during parallel sweeping
+_Atomic(jl_gc_page_stack_t *) gc_allocd_scratch;
 // `tid` of mutator thread that triggered GC
 _Atomic(int) gc_master_tid;
 // `tid` of first GC thread
@@ -750,6 +754,7 @@ static int mark_reset_age = 0;
 static int64_t scanned_bytes; // young bytes scanned while marking
 static int64_t perm_scanned_bytes; // old bytes scanned while marking
 int prev_sweep_full = 1;
+int current_sweep_full = 0;
 int under_pressure = 0;
 
 // Full collection heuristics
@@ -1285,9 +1290,9 @@ STATIC_INLINE jl_taggedvalue_t *gc_reset_page(jl_ptls_t ptls2, const jl_gc_pool_
     return beg;
 }
 
-jl_gc_global_page_pool_t global_page_pool_lazily_freed;
-jl_gc_global_page_pool_t global_page_pool_clean;
-jl_gc_global_page_pool_t global_page_pool_freed;
+jl_gc_page_stack_t global_page_pool_lazily_freed;
+jl_gc_page_stack_t global_page_pool_clean;
+jl_gc_page_stack_t global_page_pool_freed;
 pagetable_t alloc_map;
 
 // Add a new page to the pool. Discards any pages in `p->newpages` before.
@@ -1296,7 +1301,7 @@ static NOINLINE jl_taggedvalue_t *gc_add_page(jl_gc_pool_t *p) JL_NOTSAFEPOINT
     // Do not pass in `ptls` as argument. This slows down the fast path
     // in pool_alloc significantly
     jl_ptls_t ptls = jl_current_task->ptls;
-    jl_gc_pagemeta_t *pg = pop_page_metadata_back(&ptls->page_metadata_lazily_freed);
+    jl_gc_pagemeta_t *pg = pop_lf_back(&ptls->page_metadata_lazily_freed);
     if (pg != NULL) {
         gc_alloc_map_set(pg->data, GC_PAGE_ALLOCATED);
     }
@@ -1306,7 +1311,7 @@ static NOINLINE jl_taggedvalue_t *gc_add_page(jl_gc_pool_t *p) JL_NOTSAFEPOINT
     pg->osize = p->osize;
     pg->thread_n = ptls->tid;
     set_page_metadata(pg);
-    push_page_metadata_back(&ptls->page_metadata_allocd, pg);
+    push_lf_back(&ptls->page_metadata_allocd, pg);
     jl_taggedvalue_t *fl = gc_reset_page(ptls, p, pg);
     jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, GC_PAGE_SZ);
     p->newpages = fl;
@@ -1408,8 +1413,8 @@ int jl_gc_classify_pools(size_t sz, int *osize)
 int64_t lazy_freed_pages = 0;
 
 // Returns pointer to terminal pointer of list rooted at *pfl.
-static jl_taggedvalue_t **gc_sweep_page(jl_gc_pool_t *p, jl_gc_pagemeta_t **allocd,
-                                        jl_gc_pagemeta_t **lazily_freed, jl_gc_pagemeta_t *pg, jl_taggedvalue_t **pfl, int sweep_full, int osize) JL_NOTSAFEPOINT
+static void gc_sweep_page(jl_gc_pool_t *p, jl_gc_page_stack_t *allocd, jl_gc_page_stack_t *lazily_freed,
+                          jl_gc_pagemeta_t *pg, int osize) JL_NOTSAFEPOINT
 {
     char *data = pg->data;
     jl_taggedvalue_t *v = (jl_taggedvalue_t*)(data + GC_PAGE_OFFSET);
@@ -1433,7 +1438,7 @@ static jl_taggedvalue_t **gc_sweep_page(jl_gc_pool_t *p, jl_gc_pagemeta_t **allo
             // the eager one uses less memory.
             // FIXME - need to do accounting on a per-thread basis
             // on quick sweeps, keep a few pages empty but allocated for performance
-            if (!sweep_full && lazy_freed_pages <= default_collect_interval / GC_PAGE_SZ) {
+            if (!current_sweep_full && lazy_freed_pages <= default_collect_interval / GC_PAGE_SZ) {
                 lazy_freed_pages++;
                 freed_lazily = 1;
             }
@@ -1443,15 +1448,9 @@ static jl_taggedvalue_t **gc_sweep_page(jl_gc_pool_t *p, jl_gc_pagemeta_t **allo
         }
         // For quick sweep, we might be able to skip the page if the page doesn't
         // have any young live cell before marking.
-        if (!sweep_full && !pg->has_young) {
+        if (!current_sweep_full && !pg->has_young) {
             assert(!prev_sweep_full || pg->prev_nold >= pg->nold);
             if (!prev_sweep_full || pg->prev_nold == pg->nold) {
-                // the position of the freelist begin/end in this page
-                // is stored in its metadata
-                if (pg->fl_begin_offset != (uint16_t)-1) {
-                    *pfl = page_pfl_beg(pg);
-                    pfl = (jl_taggedvalue_t**)page_pfl_end(pg);
-                }
                 freedall = 0;
                 nfree = pg->nfree;
                 goto done;
@@ -1464,6 +1463,8 @@ static jl_taggedvalue_t **gc_sweep_page(jl_gc_pool_t *p, jl_gc_pagemeta_t **allo
     int has_young = 0;
     int16_t prev_nold = 0;
     int pg_nfree = 0;
+    jl_taggedvalue_t *fl = NULL;
+    jl_taggedvalue_t **pfl = &fl;
     jl_taggedvalue_t **pfl_begin = NULL;
     while ((char*)v <= lim) {
         int bits = v->bits.gc;
@@ -1475,7 +1476,7 @@ static jl_taggedvalue_t **gc_sweep_page(jl_gc_pool_t *p, jl_gc_pagemeta_t **allo
             pg_nfree++;
         }
         else { // marked young or old
-            if (sweep_full || bits == GC_MARKED) { // old enough
+            if (current_sweep_full || bits == GC_MARKED) { // old enough
                 bits = v->bits.gc = GC_OLD; // promote
             }
             prev_nold++;
@@ -1497,7 +1498,7 @@ static jl_taggedvalue_t **gc_sweep_page(jl_gc_pool_t *p, jl_gc_pagemeta_t **allo
     }
 
     pg->nfree = pg_nfree;
-    if (sweep_full) {
+    if (current_sweep_full) {
         pg->nold = 0;
         pg->prev_nold = prev_nold;
     }
@@ -1506,45 +1507,44 @@ static jl_taggedvalue_t **gc_sweep_page(jl_gc_pool_t *p, jl_gc_pagemeta_t **allo
 
 done:
     if (re_use_page) {
-        push_page_metadata_back(allocd, pg);
+        push_lf_back(allocd, pg);
     }
     else if (freed_lazily) {
         gc_alloc_map_set(pg->data, GC_PAGE_LAZILY_FREED);
-        push_page_metadata_back(lazily_freed, pg);
+        push_lf_back(lazily_freed, pg);
         jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, -GC_PAGE_SZ);
     }
     else {
         jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, -GC_PAGE_SZ);
 #ifdef _P64 // only enable concurrent sweeping on 64bit
         if (jl_n_sweepthreads == 0) {
             jl_gc_free_page(pg);
-            push_lf_page_metadata_back(&global_page_pool_freed, pg);
+            push_lf_back(&global_page_pool_freed, pg);
         }
         else {
             gc_alloc_map_set(pg->data, GC_PAGE_LAZILY_FREED);
-            push_lf_page_metadata_back(&global_page_pool_lazily_freed, pg);
+            push_lf_back(&global_page_pool_lazily_freed, pg);
         }
 #else
         jl_gc_free_page(pg);
-        push_lf_page_metadata_back(&global_page_pool_freed, pg);
+        push_lf_back(&global_page_pool_freed, pg);
 #endif
     }
     gc_time_count_page(freedall, pg_skpd);
-    gc_num.freed += (nfree - old_nfree) * osize;
+    jl_atomic_fetch_add((_Atomic(int64_t) *)&gc_num.freed, (nfree - old_nfree) * osize);
     pool_live_bytes += GC_PAGE_SZ - GC_PAGE_OFFSET - nfree * osize;
-    return pfl;
 }
 
 // the actual sweeping over all allocated pages in a memory pool
-STATIC_INLINE void gc_sweep_pool_page(jl_taggedvalue_t ***pfl, jl_gc_pagemeta_t **allocd,
-                                      jl_gc_pagemeta_t **lazily_freed, jl_gc_pagemeta_t *pg, int sweep_full) JL_NOTSAFEPOINT
+STATIC_INLINE void gc_sweep_pool_page(jl_gc_page_stack_t *allocd, jl_gc_page_stack_t *lazily_freed,
+                                      jl_gc_pagemeta_t *pg) JL_NOTSAFEPOINT
 {
     int p_n = pg->pool_n;
     int t_n = pg->thread_n;
     jl_ptls_t ptls2 = gc_all_tls_states[t_n];
     jl_gc_pool_t *p = &ptls2->heap.norm_pools[p_n];
     int osize = pg->osize;
-    pfl[t_n * JL_GC_N_POOLS + p_n] = gc_sweep_page(p, allocd, lazily_freed, pg, pfl[t_n * JL_GC_N_POOLS + p_n], sweep_full, osize);
+    gc_sweep_page(p, allocd, lazily_freed, pg, osize);
 }
 
 // sweep over all memory that is being used and not in a pool
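Two contract changes in the hunk above are easy to miss. First, gc_sweep_page no longer takes or returns a freelist tail: it builds the page's free list locally (the fl/pfl locals added earlier) and leaves its bounds in pg->fl_begin_offset / pg->fl_end_offset, which is what lets any thread sweep any page in any order. Second, gc_num.freed is now updated with an atomic add because several sweepers can finish pages concurrently. A hypothetical helper showing that a swept page's free list is recoverable from its metadata alone (page_freelist_head is a made-up name for illustration):

    STATIC_INLINE jl_taggedvalue_t *page_freelist_head(jl_gc_pagemeta_t *pg)
    {
        if (pg->fl_begin_offset == UINT16_MAX)
            return NULL; // page had no free cells after the sweep
        return (jl_taggedvalue_t *)(pg->data + pg->fl_begin_offset);
    }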
@@ -1570,8 +1570,55 @@ static void gc_pool_sync_nfree(jl_gc_pagemeta_t *pg, jl_taggedvalue_t *last) JL_
     pg->nfree = nfree;
 }
 
+void gc_sweep_wake_all(void)
+{
+    uv_mutex_lock(&gc_threads_lock);
+    for (int i = gc_first_tid; i < gc_first_tid + jl_n_gcthreads; i++) {
+        jl_ptls_t ptls2 = gc_all_tls_states[i];
+        jl_atomic_fetch_add(&ptls2->gc_sweeps_requested, 1);
+    }
+    uv_cond_broadcast(&gc_threads_cond);
+    uv_mutex_unlock(&gc_threads_lock);
+}
+
+void gc_sweep_pool_parallel(void)
+{
+    jl_atomic_fetch_add(&gc_n_threads_sweeping, 1);
+    jl_gc_page_stack_t *allocd_scratch = jl_atomic_load(&gc_allocd_scratch);
+    if (allocd_scratch != NULL) {
+        while (1) {
+            int found_pg = 0;
+            for (int t_i = 0; t_i < gc_n_threads; t_i++) {
+                jl_ptls_t ptls2 = gc_all_tls_states[t_i];
+                if (ptls2 == NULL) {
+                    continue;
+                }
+                jl_gc_page_stack_t *allocd = &allocd_scratch[t_i];
+                jl_gc_pagemeta_t *pg = pop_lf_back(&ptls2->page_metadata_allocd);
+                if (pg == NULL) {
+                    continue;
+                }
+                gc_sweep_pool_page(allocd, &ptls2->page_metadata_lazily_freed, pg);
+                found_pg = 1;
+            }
+            if (!found_pg) {
+                break;
+            }
+        }
+    }
+    jl_atomic_fetch_add(&gc_n_threads_sweeping, -1);
+}
+
+void gc_sweep_wait_for_all(void)
+{
+    jl_atomic_store(&gc_allocd_scratch, NULL);
+    while (jl_atomic_load_relaxed(&gc_n_threads_sweeping) != 0) {
+        jl_cpu_pause();
+    }
+}
+
 // setup the data-structures for a sweep over all memory pools
-static void gc_sweep_pool(int sweep_full)
+static void gc_sweep_pool(void)
 {
     gc_time_pool_start();
     lazy_freed_pages = 0;
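gc_sweep_pool_parallel is entered both by the thread driving the collection and by the dedicated GC threads; gc_allocd_scratch doubles as the work descriptor and as the shutdown flag that gc_sweep_wait_for_all clears. The consuming side of gc_sweeps_requested lives in src/partr.c, changed in this commit but not rendered on this page; a hedged sketch of what that worker loop plausibly looks like:

    // Hedged sketch of the GC worker side (the real loop is in src/partr.c).
    // ptls is this worker's own thread-local state.
    while (1) {
        uv_mutex_lock(&gc_threads_lock);
        while (jl_atomic_load(&ptls->gc_sweeps_requested) == 0)
            uv_cond_wait(&gc_threads_cond, &gc_threads_lock);
        jl_atomic_fetch_add(&ptls->gc_sweeps_requested, -1);
        uv_mutex_unlock(&gc_threads_lock);
        gc_sweep_pool_parallel(); // returns once no per-thread page stack has work left
    }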
@@ -1614,7 +1661,7 @@ static void gc_sweep_pool(int sweep_full)
                 pg->has_young = 1;
             }
         }
-        jl_gc_pagemeta_t *pg = ptls2->page_metadata_lazily_freed;
+        jl_gc_pagemeta_t *pg = jl_atomic_load_relaxed(&ptls2->page_metadata_lazily_freed.bottom);
         while (pg != NULL) {
             jl_gc_pagemeta_t *pg2 = pg->next;
             lazy_freed_pages++;
@@ -1623,24 +1670,44 @@ static void gc_sweep_pool(int sweep_full)
             pg = pg2;
         }
 
     // the actual sweeping
+    jl_gc_page_stack_t *tmp = (jl_gc_page_stack_t *)alloca(n_threads * sizeof(jl_gc_page_stack_t));
+    memset(tmp, 0, n_threads * sizeof(jl_gc_page_stack_t));
+    jl_atomic_store(&gc_allocd_scratch, tmp);
+    gc_sweep_wake_all();
+    gc_sweep_pool_parallel();
+    gc_sweep_wait_for_all();
+
     for (int t_i = 0; t_i < n_threads; t_i++) {
         jl_ptls_t ptls2 = gc_all_tls_states[t_i];
         if (ptls2 != NULL) {
-            jl_gc_pagemeta_t *allocd = NULL;
-            jl_gc_pagemeta_t *pg = ptls2->page_metadata_allocd;
-            while (pg != NULL) {
-                jl_gc_pagemeta_t *pg2 = pg->next;
-                gc_sweep_pool_page(pfl, &allocd, &ptls2->page_metadata_lazily_freed, pg, sweep_full);
-                pg = pg2;
-            }
-            ptls2->page_metadata_allocd = allocd;
+            ptls2->page_metadata_allocd = tmp[t_i];
             for (int i = 0; i < JL_GC_N_POOLS; i++) {
                 jl_gc_pool_t *p = &ptls2->heap.norm_pools[i];
                 p->newpages = NULL;
             }
         }
     }
 
+    // merge free lists
+    for (int t_i = 0; t_i < n_threads; t_i++) {
+        jl_ptls_t ptls2 = gc_all_tls_states[t_i];
+        if (ptls2 == NULL) {
+            continue;
+        }
+        jl_gc_pagemeta_t *pg = jl_atomic_load_relaxed(&ptls2->page_metadata_allocd.bottom);
+        while (pg != NULL) {
+            jl_gc_pagemeta_t *pg2 = pg->next;
+            if (pg->fl_begin_offset != UINT16_MAX) {
+                char *cur_pg = pg->data;
+                jl_taggedvalue_t *fl_beg = (jl_taggedvalue_t*)(cur_pg + pg->fl_begin_offset);
+                jl_taggedvalue_t *fl_end = (jl_taggedvalue_t*)(cur_pg + pg->fl_end_offset);
+                *pfl[t_i * JL_GC_N_POOLS + pg->pool_n] = fl_beg;
+                pfl[t_i * JL_GC_N_POOLS + pg->pool_n] = &fl_end->next;
+            }
+            pg = pg2;
+        }
+    }
+
     // null out terminal pointers of free lists
     for (int t_i = 0; t_i < n_threads; t_i++) {
         jl_ptls_t ptls2 = gc_all_tls_states[t_i];
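The merge loop above runs on one thread by design: only after gc_sweep_wait_for_all() can the per-page free lists be chained, in deterministic order, onto the per-(thread, pool) freelist tails. For context, pfl is declared in the earlier part of gc_sweep_pool() that this diff does not display; a hedged reconstruction of its shape:

    // One freelist tail pointer per (thread, pool) pair, indexed as
    // pfl[t_i * JL_GC_N_POOLS + pool_n] by the merge loop above.
    jl_taggedvalue_t ***pfl = (jl_taggedvalue_t ***)alloca(n_threads * JL_GC_N_POOLS * sizeof(jl_taggedvalue_t **));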
@@ -1658,7 +1725,7 @@ static void gc_sweep_pool(int sweep_full)
     }
 #endif
 
-    gc_time_pool_end(sweep_full);
+    gc_time_pool_end(current_sweep_full);
 }

static void gc_sweep_perm_alloc(void)
@@ -3289,13 +3356,14 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection)
 #ifdef USE_TRACY
         TracyCZoneColor(full_timing_block.tracy_ctx, 0xFFA500);
 #endif
+        current_sweep_full = sweep_full;
         sweep_weak_refs();
         sweep_stack_pools();
         gc_sweep_foreign_objs();
         gc_sweep_other(ptls, sweep_full);
         gc_scrub();
         gc_verify_tags();
-        gc_sweep_pool(sweep_full);
+        gc_sweep_pool();
         if (sweep_full)
             gc_sweep_perm_alloc();
     }
[The diffs for the remaining five changed files did not load in this capture.]
