Skip to content

Commit

Permalink
GC mark-loop rewrite (#47292)
Browse files Browse the repository at this point in the history
## Previous work

Since #21590, the GC mark-loop was implemented by keeping two manually managed stacks: one of which contained iterator states used to keep track of the object currently being marked. As an example, to mark arrays, we would pop the corresponding iterator state from the stack, iterate over the array until we found an unmarked reference, and if so, we would update the iterator state (to reflect the index we left off), "repush" it into the stack and  proceed with marking the reference we just found.

## This PR

This PR eliminates the need of keeping the iterator states by modifying the object graph traversal code. We keep a single stack of `jl_value_t *` currently being processed. To mark an object, we first pop it from the stack, push all unmarked references into the stack and proceed with marking.

I believe this doesn't break any invariant from the generational GC. Indeed, the age bits are set after marking (if the object survived one GC cycle it's moved to the old generation), so this new traversal scheme wouldn't change the fact of whether an object had references to old objects or not. Furthermore, we must not update GC metadata for objects in the `remset`, and we ensure this by calling `gc_mark_outrefs` in `gc_queue_remset` with `meta_updated` set to 1.

## Additional advantages

1. There are no recursive function calls in the GC mark-loop code (one of the reasons why #21590 was implemented).
2. Keeping a single GC queue will **greatly** simplify work-stealing in the multi-threaded GC we are working on (c.f. #45639).
3. Arrays of references, for example, are now marked on a regular stride fashion, which could help with hardware prefetching.
4. We can easily modify the traversal mode (to breadth first, for example) by only changing the `jl_gc_markqueue_t`(from LIFO to FIFO, for example) methods without touching the mark-loop itself, which could enable further exploration on the GC in the future.

Since this PR changes the mark-loop graph traversal code, there are some changes in the heap-snapshot, though I'm not familiar with that PR.

Some benchmark results are here: https://hackmd.io/@Idnmfpb3SxK98-OsBtRD5A/H1V6QSzvs.
  • Loading branch information
d-netto authored Jan 24, 2023
1 parent f9d0473 commit dfab7be
Show file tree
Hide file tree
Showing 7 changed files with 865 additions and 1,495 deletions.
175 changes: 35 additions & 140 deletions src/gc-debug.c
Original file line number Diff line number Diff line change
Expand Up @@ -198,21 +198,23 @@ static void restore(void)

static void gc_verify_track(jl_ptls_t ptls)
{
jl_gc_mark_cache_t *gc_cache = &ptls->gc_cache;
do {
jl_gc_mark_sp_t sp;
gc_mark_sp_init(gc_cache, &sp);
jl_gc_markqueue_t mq;
mq.current = mq.start = ptls->mark_queue.start;
mq.end = ptls->mark_queue.end;
mq.current_chunk = mq.chunk_start = ptls->mark_queue.chunk_start;
mq.chunk_end = ptls->mark_queue.chunk_end;
arraylist_push(&lostval_parents_done, lostval);
jl_safe_printf("Now looking for %p =======\n", lostval);
clear_mark(GC_CLEAN);
gc_mark_queue_all_roots(ptls, &sp);
gc_mark_queue_finlist(gc_cache, &sp, &to_finalize, 0);
for (int i = 0; i < gc_n_threads; i++) {
gc_mark_queue_all_roots(ptls, &mq);
gc_mark_finlist(&mq, &to_finalize, 0);
for (int i = 0; i < gc_n_threads;i++) {
jl_ptls_t ptls2 = gc_all_tls_states[i];
gc_mark_queue_finlist(gc_cache, &sp, &ptls2->finalizers, 0);
gc_mark_finlist(&mq, &ptls2->finalizers, 0);
}
gc_mark_queue_finlist(gc_cache, &sp, &finalizer_list_marked, 0);
gc_mark_loop(ptls, sp);
gc_mark_finlist(&mq, &finalizer_list_marked, 0);
gc_mark_loop_(ptls, &mq);
if (lostval_parents.len == 0) {
jl_safe_printf("Could not find the missing link. We missed a toplevel root. This is odd.\n");
break;
Expand Down Expand Up @@ -246,22 +248,24 @@ static void gc_verify_track(jl_ptls_t ptls)

void gc_verify(jl_ptls_t ptls)
{
jl_gc_mark_cache_t *gc_cache = &ptls->gc_cache;
jl_gc_mark_sp_t sp;
gc_mark_sp_init(gc_cache, &sp);
jl_gc_markqueue_t mq;
mq.current = mq.start = ptls->mark_queue.start;
mq.end = ptls->mark_queue.end;
mq.current_chunk = mq.chunk_start = ptls->mark_queue.chunk_start;
mq.chunk_end = ptls->mark_queue.chunk_end;
lostval = NULL;
lostval_parents.len = 0;
lostval_parents_done.len = 0;
clear_mark(GC_CLEAN);
gc_verifying = 1;
gc_mark_queue_all_roots(ptls, &sp);
gc_mark_queue_finlist(gc_cache, &sp, &to_finalize, 0);
for (int i = 0; i < gc_n_threads; i++) {
gc_mark_queue_all_roots(ptls, &mq);
gc_mark_finlist(&mq, &to_finalize, 0);
for (int i = 0; i < gc_n_threads;i++) {
jl_ptls_t ptls2 = gc_all_tls_states[i];
gc_mark_queue_finlist(gc_cache, &sp, &ptls2->finalizers, 0);
gc_mark_finlist(&mq, &ptls2->finalizers, 0);
}
gc_mark_queue_finlist(gc_cache, &sp, &finalizer_list_marked, 0);
gc_mark_loop(ptls, sp);
gc_mark_finlist(&mq, &finalizer_list_marked, 0);
gc_mark_loop_(ptls, &mq);
int clean_len = bits_save[GC_CLEAN].len;
for(int i = 0; i < clean_len + bits_save[GC_OLD].len; i++) {
jl_taggedvalue_t *v = (jl_taggedvalue_t*)bits_save[i >= clean_len ? GC_OLD : GC_CLEAN].items[i >= clean_len ? i - clean_len : i];
Expand Down Expand Up @@ -500,7 +504,7 @@ int jl_gc_debug_check_other(void)
return gc_debug_alloc_check(&jl_gc_debug_env.other);
}

void jl_gc_debug_print_status(void)
void jl_gc_debug_print_status(void) JL_NOTSAFEPOINT
{
uint64_t pool_count = jl_gc_debug_env.pool.num;
uint64_t other_count = jl_gc_debug_env.other.num;
Expand All @@ -509,7 +513,7 @@ void jl_gc_debug_print_status(void)
pool_count + other_count, pool_count, other_count, gc_num.pause);
}

void jl_gc_debug_critical_error(void)
void jl_gc_debug_critical_error(void) JL_NOTSAFEPOINT
{
jl_gc_debug_print_status();
if (!jl_gc_debug_env.wait_for_debugger)
Expand Down Expand Up @@ -1264,9 +1268,9 @@ int gc_slot_to_arrayidx(void *obj, void *_slot) JL_NOTSAFEPOINT
return (slot - start) / elsize;
}

// Print a backtrace from the bottom (start) of the mark stack up to `sp`
// `pc_offset` will be added to `sp` for convenience in the debugger.
NOINLINE void gc_mark_loop_unwind(jl_ptls_t ptls, jl_gc_mark_sp_t sp, int pc_offset)
// Print a backtrace from the `mq->start` of the mark queue up to `mq->current`
// `offset` will be added to `mq->current` for convenience in the debugger.
NOINLINE void gc_mark_loop_unwind(jl_ptls_t ptls, jl_gc_markqueue_t *mq, int offset)
{
jl_jmp_buf *old_buf = jl_get_safe_restore();
jl_jmp_buf buf;
Expand All @@ -1276,123 +1280,14 @@ NOINLINE void gc_mark_loop_unwind(jl_ptls_t ptls, jl_gc_mark_sp_t sp, int pc_off
jl_set_safe_restore(old_buf);
return;
}
void **top = sp.pc + pc_offset;
jl_gc_mark_data_t *data_top = sp.data;
sp.data = ptls->gc_cache.data_stack;
sp.pc = ptls->gc_cache.pc_stack;
int isroot = 1;
while (sp.pc < top) {
void *pc = *sp.pc;
const char *prefix = isroot ? "r--" : " `-";
isroot = 0;
if (pc == gc_mark_label_addrs[GC_MARK_L_marked_obj]) {
gc_mark_marked_obj_t *data = gc_repush_markdata(&sp, gc_mark_marked_obj_t);
if ((jl_gc_mark_data_t *)data > data_top) {
jl_safe_printf("Mark stack unwind overflow -- ABORTING !!!\n");
break;
}
jl_safe_printf("%p: Root object: %p :: %p (bits: %d)\n of type ",
(void*)data, (void*)data->obj, (void*)data->tag, (int)data->bits);
jl_((void*)data->tag);
isroot = 1;
}
else if (pc == gc_mark_label_addrs[GC_MARK_L_scan_only]) {
gc_mark_marked_obj_t *data = gc_repush_markdata(&sp, gc_mark_marked_obj_t);
if ((jl_gc_mark_data_t *)data > data_top) {
jl_safe_printf("Mark stack unwind overflow -- ABORTING !!!\n");
break;
}
jl_safe_printf("%p: Queued root: %p :: %p (bits: %d)\n of type ",
(void*)data, (void*)data->obj, (void*)data->tag, (int)data->bits);
jl_((void*)data->tag);
isroot = 1;
}
else if (pc == gc_mark_label_addrs[GC_MARK_L_finlist]) {
gc_mark_finlist_t *data = gc_repush_markdata(&sp, gc_mark_finlist_t);
if ((jl_gc_mark_data_t *)data > data_top) {
jl_safe_printf("Mark stack unwind overflow -- ABORTING !!!\n");
break;
}
jl_safe_printf("%p: Finalizer list from %p to %p\n",
(void*)data, (void*)data->begin, (void*)data->end);
isroot = 1;
}
else if (pc == gc_mark_label_addrs[GC_MARK_L_objarray]) {
gc_mark_objarray_t *data = gc_repush_markdata(&sp, gc_mark_objarray_t);
if ((jl_gc_mark_data_t *)data > data_top) {
jl_safe_printf("Mark stack unwind overflow -- ABORTING !!!\n");
break;
}
jl_safe_printf("%p: %s Array in object %p :: %p -- [%p, %p)\n of type ",
(void*)data, prefix, (void*)data->parent, ((void**)data->parent)[-1],
(void*)data->begin, (void*)data->end);
jl_(jl_typeof(data->parent));
}
else if (pc == gc_mark_label_addrs[GC_MARK_L_obj8]) {
gc_mark_obj8_t *data = gc_repush_markdata(&sp, gc_mark_obj8_t);
if ((jl_gc_mark_data_t *)data > data_top) {
jl_safe_printf("Mark stack unwind overflow -- ABORTING !!!\n");
break;
}
jl_datatype_t *vt = (jl_datatype_t*)jl_typeof(data->parent);
uint8_t *desc = (uint8_t*)jl_dt_layout_ptrs(vt->layout);
jl_safe_printf("%p: %s Object (8bit) %p :: %p -- [%d, %d)\n of type ",
(void*)data, prefix, (void*)data->parent, ((void**)data->parent)[-1],
(int)(data->begin - desc), (int)(data->end - desc));
jl_(jl_typeof(data->parent));
}
else if (pc == gc_mark_label_addrs[GC_MARK_L_obj16]) {
gc_mark_obj16_t *data = gc_repush_markdata(&sp, gc_mark_obj16_t);
if ((jl_gc_mark_data_t *)data > data_top) {
jl_safe_printf("Mark stack unwind overflow -- ABORTING !!!\n");
break;
}
jl_datatype_t *vt = (jl_datatype_t*)jl_typeof(data->parent);
uint16_t *desc = (uint16_t*)jl_dt_layout_ptrs(vt->layout);
jl_safe_printf("%p: %s Object (16bit) %p :: %p -- [%d, %d)\n of type ",
(void*)data, prefix, (void*)data->parent, ((void**)data->parent)[-1],
(int)(data->begin - desc), (int)(data->end - desc));
jl_(jl_typeof(data->parent));
}
else if (pc == gc_mark_label_addrs[GC_MARK_L_obj32]) {
gc_mark_obj32_t *data = gc_repush_markdata(&sp, gc_mark_obj32_t);
if ((jl_gc_mark_data_t *)data > data_top) {
jl_safe_printf("Mark stack unwind overflow -- ABORTING !!!\n");
break;
}
jl_datatype_t *vt = (jl_datatype_t*)jl_typeof(data->parent);
uint32_t *desc = (uint32_t*)jl_dt_layout_ptrs(vt->layout);
jl_safe_printf("%p: %s Object (32bit) %p :: %p -- [%d, %d)\n of type ",
(void*)data, prefix, (void*)data->parent, ((void**)data->parent)[-1],
(int)(data->begin - desc), (int)(data->end - desc));
jl_(jl_typeof(data->parent));
}
else if (pc == gc_mark_label_addrs[GC_MARK_L_stack]) {
gc_mark_stackframe_t *data = gc_repush_markdata(&sp, gc_mark_stackframe_t);
if ((jl_gc_mark_data_t *)data > data_top) {
jl_safe_printf("Mark stack unwind overflow -- ABORTING !!!\n");
break;
}
jl_safe_printf("%p: %s Stack frame %p -- %d of %d (%s)\n",
(void*)data, prefix, (void*)data->s, (int)data->i,
(int)data->nroots >> 1,
(data->nroots & 1) ? "indirect" : "direct");
}
else if (pc == gc_mark_label_addrs[GC_MARK_L_module_binding]) {
// module_binding
gc_mark_binding_t *data = gc_repush_markdata(&sp, gc_mark_binding_t);
if ((jl_gc_mark_data_t *)data > data_top) {
jl_safe_printf("Mark stack unwind overflow -- ABORTING !!!\n");
break;
}
jl_safe_printf("%p: %s Module (bindings) %p -- [%p, %p)\n",
(void*)data, prefix, (void*)data->parent,
(void*)data->begin, (void*)data->end);
}
else {
jl_safe_printf("Unknown pc %p --- ABORTING !!!\n", pc);
break;
}
jl_value_t **start = mq->start;
jl_value_t **end = mq->current + offset;
for (; start < end; start++) {
jl_value_t *obj = *start;
jl_taggedvalue_t *o = jl_astaggedvalue(obj);
jl_safe_printf("Queued object: %p :: (tag: %zu) (bits: %zu)\n", obj,
(uintptr_t)o->header, ((uintptr_t)o->header & 3));
jl_((void*)(jl_datatype_t *)(o->header & ~(uintptr_t)0xf));
}
jl_set_safe_restore(old_buf);
}
Expand Down
Loading

0 comments on commit dfab7be

Please sign in to comment.