Skip to content

Commit

Permalink
Add Mono EventPipe Sample Profiler support. (#47858)
Browse files Browse the repository at this point in the history
Implement support for the EventPipe Sample Profiler on Mono, in line with CoreClr Sample Profiler behaviour. By default, the CoreClr sample profiler tries to run at 1000 hertz (every ms), and on each sample it stops the runtime, snaps callstacks for all managed threads, submits EventPipe sample profile events and resumes the runtime. Doing a full stop/restart of the runtime on every sample can be quite invasive, but it is probably the most portable way of implementing it, since it works on most platforms.

This PR implements the same logic on the Mono runtime: stop the runtime, record callstacks for all managed threads that should be sampled, restart the runtime and then write all sample profile events into EventPipe. Note that events are written after the runtime has resumed, since all code executed while the runtime is stopped needs to be async safe (needed when running in preemptive mode), so we can't call into EventPipe at that point.

Going forward we should investigate alternative ways to do sample profiling depending on underlying platform and OS support. The current implementation will work on all supported platforms, but it will not be as accurate as it could be (especially when using safe points and a coop enabled runtime) and it impacts the measured target. Mono's profiler uses Signals/SuspendThread, and for platforms supporting these APIs that could be an alternative implementation. It could also be worth looking into CPU hardware counters using an ETW kernel log session on Windows and perf_event_open on Linux.
  • Loading branch information
lateralusX authored Feb 12, 2021
1 parent fb41bd6 commit 86835b8
Show file tree
Hide file tree
Showing 9 changed files with 231 additions and 94 deletions.
9 changes: 4 additions & 5 deletions src/mono/mono/eventpipe/ep-rt-mono.h
Original file line number Diff line number Diff line change
Expand Up @@ -426,6 +426,7 @@ typedef char* (*ep_rt_mono_get_os_cmd_line_func)(void);
typedef char* (*ep_rt_mono_get_managed_cmd_line_func)(void);
typedef gboolean (*ep_rt_mono_execute_rundown_func)(ep_rt_mono_fire_domain_rundown_events_func domain_events_func, ep_rt_mono_fire_assembly_rundown_events_func assembly_events_func, ep_rt_mono_fire_method_rundown_events_func methods_events_func);
typedef gboolean (*ep_rt_mono_walk_managed_stack_for_thread_func)(ep_rt_thread_handle_t thread, EventPipeStackContents *stack_contents);
typedef gboolean (*ep_rt_mono_sample_profiler_write_sampling_event_for_threads_func)(ep_rt_thread_handle_t sampling_thread, EventPipeEvent *sampling_event);
typedef gboolean (*ep_rt_mono_method_get_simple_assembly_name_func)(ep_rt_method_desc_t *method, ep_char8_t *name, size_t name_len);
typedef gboolean (*ep_rt_mono_method_get_full_name_func)(ep_rt_method_desc_t *method, ep_char8_t *name, size_t name_len);

Expand Down Expand Up @@ -458,6 +459,7 @@ typedef struct _EventPipeMonoFuncTable {
ep_rt_mono_get_managed_cmd_line_func ep_rt_mono_get_managed_cmd_line;
ep_rt_mono_execute_rundown_func ep_rt_mono_execute_rundown;
ep_rt_mono_walk_managed_stack_for_thread_func ep_rt_mono_walk_managed_stack_for_thread;
ep_rt_mono_sample_profiler_write_sampling_event_for_threads_func ep_rt_mono_sample_profiler_write_sampling_event_for_threads;
ep_rt_mono_method_get_simple_assembly_name_func ep_rt_mono_method_get_simple_assembly_name;
ep_rt_mono_method_get_full_name_func ep_rt_mono_method_get_full_name;
} EventPipeMonoFuncTable;
Expand Down Expand Up @@ -1079,17 +1081,14 @@ static
void
ep_rt_sample_profiler_write_sampling_event_for_threads (ep_rt_thread_handle_t sampling_thread, EventPipeEvent *sampling_event)
{
// Forwards both arguments unchanged to the implementation registered in the Mono function table.
// The table entry returns gboolean; that result is discarded here since this function returns void.
ep_rt_mono_func_table_get ()->ep_rt_mono_sample_profiler_write_sampling_event_for_threads (sampling_thread, sampling_event);
}

// Notification hook for a created profiler provider. The body is an empty statement,
// so the provider argument is ignored and nothing is done.
static
void
ep_rt_notify_profiler_provider_created (EventPipeProvider *provider)
{
// TODO: Not supported.
;
}

/*
Expand Down
8 changes: 4 additions & 4 deletions src/mono/mono/metadata/boehm-gc.c
Original file line number Diff line number Diff line change
Expand Up @@ -290,15 +290,15 @@ mono_gc_collection_count (int generation)
}

void
mono_gc_stop_world ()
mono_stop_world (MonoThreadInfoFlags flags)
{
g_assert ("mono_gc_stop_world is not supported in Boehm");
g_assert ("mono_stop_world is not supported in Boehm");
}

void
mono_gc_restart_world ()
mono_restart_world (MonoThreadInfoFlags flags)
{
g_assert ("mono_gc_restart_world is not supported in Boehm");
g_assert ("mono_restart_world is not supported in Boehm");
}

/**
Expand Down
7 changes: 7 additions & 0 deletions src/mono/mono/metadata/gc-internals.h
Original file line number Diff line number Diff line change
Expand Up @@ -430,4 +430,11 @@ extern gboolean mono_do_not_finalize;
/* List of names of classes not to finalize. */
extern gchar **mono_do_not_finalize_class_names;

/*
* Unified runtime stop/restart world, SGEN Only.
* Will take and release the LOCK_GC.
*/
void mono_stop_world (MonoThreadInfoFlags flags);
void mono_restart_world (MonoThreadInfoFlags flags);

#endif /* __MONO_METADATA_GC_INTERNAL_H__ */
125 changes: 121 additions & 4 deletions src/mono/mono/metadata/icall-eventpipe.c
Original file line number Diff line number Diff line change
Expand Up @@ -80,13 +80,23 @@ typedef struct _EventPipeFireMethodEventsData{
ep_rt_mono_fire_method_rundown_events_func method_events_func;
} EventPipeFireMethodEventsData;

// One sampled thread: filled in while the runtime is stopped and turned into a
// sample profile event after the runtime has been restarted.
typedef struct _EventPipeSampleProfileData {
// Frames collected by the stack walk callback for this thread.
EventPipeStackContents stack_contents;
// Id of the sampled thread, converted to uint64_t.
uint64_t thread_id;
// Instruction pointer read from the thread's suspend state context.
uintptr_t thread_ip;
// Sample type (EP_SAMPLE_PROFILER_SAMPLE_TYPE_*); written as the event payload.
uint32_t payload_data;
} EventPipeSampleProfileData;

gboolean ep_rt_mono_initialized;
MonoNativeTlsKey ep_rt_mono_thread_holder_tls_id;
gpointer ep_rt_mono_rand_provider;

static ep_rt_thread_holder_alloc_func thread_holder_alloc_callback_func;
static ep_rt_thread_holder_free_func thread_holder_free_callback_func;

// Reusable array of EventPipeSampleProfileData, one element per sampled thread.
// Allocated on the first sample and freed in mono_eventpipe_fini.
static GArray * _ep_rt_mono_sampled_thread_callstacks = NULL;
// Number of elements made available for the next sample. Starts at 32 and is
// replaced after every sample by the number of threads iterated in that sample.
static uint32_t _ep_rt_mono_max_sampled_thread_count = 32;

/*
* Forward declares of all static functions.
*/
Expand Down Expand Up @@ -128,6 +138,7 @@ static
void
eventpipe_fire_method_events (
MonoJitInfo *ji,
MonoMethod *method,
EventPipeFireMethodEventsData *events_data);

static
Expand Down Expand Up @@ -163,6 +174,19 @@ eventpipe_walk_managed_stack_for_thread (
ep_rt_thread_handle_t thread,
EventPipeStackContents *stack_contents);

static
gboolean
eventpipe_sample_profiler_walk_managed_stack_for_thread_func (
MonoStackFrameInfo *frame,
MonoContext *ctx,
gpointer data);

static
gboolean
eventpipe_sample_profiler_write_sampling_event_for_threads (
ep_rt_thread_handle_t sampling_thread,
EventPipeEvent *sampling_event);

static
gboolean
eventpipe_method_get_simple_assembly_name (
Expand Down Expand Up @@ -277,6 +301,7 @@ static
void
eventpipe_fire_method_events (
MonoJitInfo *ji,
MonoMethod *method,
EventPipeFireMethodEventsData *events_data)
{
g_assert_checked (ji != NULL);
Expand All @@ -296,7 +321,6 @@ eventpipe_fire_method_events (

//TODO: Optimize string formatting into functions accepting GString to reduce heap alloc.

MonoMethod *method = jinfo_get_method (ji);
if (method) {
method_id = (uint64_t)method;
method_token = method->token;
Expand Down Expand Up @@ -393,8 +417,11 @@ eventpipe_fire_method_events_func (
EventPipeFireMethodEventsData *events_data = (EventPipeFireMethodEventsData *)user_data;
g_assert_checked (events_data != NULL);

if (ji && !ji->is_trampoline && !ji->async)
eventpipe_fire_method_events (ji, events_data);
if (ji && !ji->is_trampoline && !ji->async) {
MonoMethod *method = jinfo_get_method (ji);
if (method && !m_method_is_wrapper (method))
eventpipe_fire_method_events (ji, method, events_data);
}
}

static
Expand Down Expand Up @@ -543,7 +570,8 @@ eventpipe_walk_managed_stack_for_thread_func (
if (!frame->ji)
return FALSE;
MonoMethod *method = frame->ji->async ? NULL : frame->actual_method;
ep_stack_contents_append ((EventPipeStackContents *)data, (uintptr_t)((uint8_t*)frame->ji->code_start + frame->native_offset), method);
if (method && !m_method_is_wrapper (method))
ep_stack_contents_append ((EventPipeStackContents *)data, (uintptr_t)((uint8_t*)frame->ji->code_start + frame->native_offset), method);
return ep_stack_contents_get_length ((EventPipeStackContents *)data) >= EP_MAX_STACK_DEPTH;
default:
g_assert_not_reached ();
Expand All @@ -567,6 +595,90 @@ eventpipe_walk_managed_stack_for_thread (
return TRUE;
}

/*
 * Stack walk callback used by the sample profiler.
 *
 * data points at the EventPipeSampleProfileData of the thread being walked.
 * While payload_data still holds EP_SAMPLE_PROFILER_SAMPLE_TYPE_ERROR, the
 * current frame decides the sample type: a managed-to-native frame yields
 * EXTERNAL, any other frame yields MANAGED. The frame itself is then handed
 * to the regular managed stack walk callback, whose result is returned as is.
 */
static
gboolean
eventpipe_sample_profiler_walk_managed_stack_for_thread_func (
	MonoStackFrameInfo *frame,
	MonoContext *ctx,
	gpointer data)
{
	g_assert_checked (frame != NULL);
	g_assert_checked (data != NULL);

	EventPipeSampleProfileData *profile_data = (EventPipeSampleProfileData *)data;

	// Classify the sample only while it is still unclassified.
	if (profile_data->payload_data == EP_SAMPLE_PROFILER_SAMPLE_TYPE_ERROR) {
		gboolean in_native_transition = (frame->type == FRAME_TYPE_MANAGED_TO_NATIVE);
		profile_data->payload_data = in_native_transition
			? EP_SAMPLE_PROFILER_SAMPLE_TYPE_EXTERNAL
			: EP_SAMPLE_PROFILER_SAMPLE_TYPE_MANAGED;
	}

	// Frame recording is shared with the non-sampling stack walk.
	return eventpipe_walk_managed_stack_for_thread_func (frame, ctx, &profile_data->stack_contents);
}

/*
 * Takes one sample of the threads visited by FOREACH_THREAD_SAFE_EXCLUDE and
 * writes a sample profile event for each thread that produced a callstack.
 *
 * Phases, in order:
 *  1. Make sure the reusable sample array exists and has room for the thread
 *     count observed by the previous sample.
 *  2. Stop the world and record thread id, instruction pointer and callstack
 *     for every non-running thread with a valid suspend state.
 *  3. Restart the world.
 *  4. Write one event per recorded thread whose sample type was set and whose
 *     callstack is not empty.
 *  5. Remember the number of iterated threads as capacity for the next sample.
 *
 * sampling_thread and sampling_event are passed through to
 * ep_write_sample_profile_event. Always returns TRUE.
 */
static
gboolean
eventpipe_sample_profiler_write_sampling_event_for_threads (
ep_rt_thread_handle_t sampling_thread,
EventPipeEvent *sampling_event)
{
// Follows CoreClr implementation of sample profiler. Generic invasive/expensive way to do CPU sample profiling relying on STW and stackwalks.
// TODO: Investigate alternatives on platforms supporting Signals/SuspendThread (see Mono profiler) or CPU PMU's (see ETW/perf_event_open).

// Sample profiler only runs on one thread, no need to synchronize.
if (!_ep_rt_mono_sampled_thread_callstacks)
_ep_rt_mono_sampled_thread_callstacks = g_array_sized_new (FALSE, FALSE, sizeof (EventPipeSampleProfileData), _ep_rt_mono_max_sampled_thread_count);

// Make sure there is room based on previous max number of sampled threads.
// NOTE, there is a chance there are more threads than max, if that's the case we will
// miss those threads in this sample, but will be included in next when max has been adjusted.
g_array_set_size (_ep_rt_mono_sampled_thread_callstacks, _ep_rt_mono_max_sampled_thread_count);

// filtered_thread_count: every thread visited by the loop below.
// sampled_thread_count: threads actually recorded into the array.
uint32_t filtered_thread_count = 0;
uint32_t sampled_thread_count = 0;

mono_stop_world (MONO_THREAD_INFO_FLAGS_NO_GC | MONO_THREAD_INFO_FLAGS_NO_SAMPLE);

// Record all info needed in sample events while runtime is suspended, must be async safe.
FOREACH_THREAD_SAFE_EXCLUDE (thread_info, MONO_THREAD_INFO_FLAGS_NO_GC | MONO_THREAD_INFO_FLAGS_NO_SAMPLE) {
if (!mono_thread_info_is_running (thread_info)) {
MonoThreadUnwindState *thread_state = mono_thread_info_get_suspend_state (thread_info);
if (thread_state->valid) {
// Threads beyond the current capacity are skipped for this sample.
if (sampled_thread_count < _ep_rt_mono_max_sampled_thread_count) {
EventPipeSampleProfileData *data = &g_array_index (_ep_rt_mono_sampled_thread_callstacks, EventPipeSampleProfileData, sampled_thread_count);
data->thread_id = ep_rt_thread_id_t_to_uint64_t (mono_thread_info_get_tid (thread_info));
data->thread_ip = (uintptr_t)MONO_CONTEXT_GET_IP (&thread_state->ctx);
// Start as ERROR; the stack walk callback replaces it on the first frame it sees.
data->payload_data = EP_SAMPLE_PROFILER_SAMPLE_TYPE_ERROR;
// Array elements are reused between samples, so drop the previous callstack first.
ep_stack_contents_reset (&data->stack_contents);
mono_get_eh_callbacks ()->mono_walk_stack_with_state (eventpipe_sample_profiler_walk_managed_stack_for_thread_func, thread_state, MONO_UNWIND_SIGNAL_SAFE, data);
sampled_thread_count++;
}
}
}
// Counted for every visited thread, sampled or not, to size the next sample.
filtered_thread_count++;
} FOREACH_THREAD_SAFE_END

mono_restart_world (MONO_THREAD_INFO_FLAGS_NO_GC | MONO_THREAD_INFO_FLAGS_NO_SAMPLE);

// Fire sample event for threads. Must be done after runtime is resumed since it's not async safe.
// Since we can't keep thread info around after the suspended runtime has been restarted, use an empty
// adapter instance and only set recorded tid as parameter inside adapter.
THREAD_INFO_TYPE adapter = { 0 };
for (uint32_t i = 0; i < sampled_thread_count; ++i) {
EventPipeSampleProfileData *data = &g_array_index (_ep_rt_mono_sampled_thread_callstacks, EventPipeSampleProfileData, i);
// Skip threads whose walk visited no frame (type still ERROR) or recorded no frames.
if (data->payload_data != EP_SAMPLE_PROFILER_SAMPLE_TYPE_ERROR && ep_stack_contents_get_length(&data->stack_contents) > 0) {
mono_thread_info_set_tid (&adapter, ep_rt_uint64_t_to_thread_id_t (data->thread_id));
ep_write_sample_profile_event (sampling_thread, sampling_event, &adapter, &data->stack_contents, (uint8_t *)&data->payload_data, sizeof (data->payload_data));
}
}

// Current thread count will be our next maximum sampled threads.
_ep_rt_mono_max_sampled_thread_count = filtered_thread_count;

return TRUE;
}

static
gboolean
eventpipe_method_get_simple_assembly_name (
Expand Down Expand Up @@ -643,6 +755,7 @@ mono_eventpipe_init (
table->ep_rt_mono_get_managed_cmd_line = mono_runtime_get_managed_cmd_line;
table->ep_rt_mono_execute_rundown = eventpipe_execute_rundown;
table->ep_rt_mono_walk_managed_stack_for_thread = eventpipe_walk_managed_stack_for_thread;
table->ep_rt_mono_sample_profiler_write_sampling_event_for_threads = eventpipe_sample_profiler_write_sampling_event_for_threads;
table->ep_rt_mono_method_get_simple_assembly_name = eventpipe_method_get_simple_assembly_name;
table->ep_rt_mono_method_get_full_name = evetpipe_method_get_full_name;
}
Expand All @@ -664,9 +777,13 @@ mono_eventpipe_init (
void
mono_eventpipe_fini (void)
{
if (_ep_rt_mono_sampled_thread_callstacks)
g_array_free (_ep_rt_mono_sampled_thread_callstacks, TRUE);

if (ep_rt_mono_initialized)
mono_rand_close (ep_rt_mono_rand_provider);

_ep_rt_mono_sampled_thread_callstacks = NULL;
ep_rt_mono_rand_provider = NULL;
thread_holder_alloc_callback_func = NULL;
thread_holder_free_callback_func = NULL;
Expand Down
6 changes: 0 additions & 6 deletions src/mono/mono/metadata/mono-gc.h
Original file line number Diff line number Diff line change
Expand Up @@ -126,12 +126,6 @@ MONO_API int mono_gc_walk_heap (int flags, MonoGCReferences callback,
MONO_API MONO_RT_EXTERNAL_ONLY void
mono_gc_init_finalizer_thread (void);

/*
* Only supported under SGen. These two with Sgen will take and release the LOCK_GC
*/
void mono_gc_stop_world (void);
void mono_gc_restart_world (void);

MONO_END_DECLS

#endif /* __METADATA_MONO_GC_H__ */
Expand Down
8 changes: 4 additions & 4 deletions src/mono/mono/metadata/null-gc.c
Original file line number Diff line number Diff line change
Expand Up @@ -86,15 +86,15 @@ mono_gc_collection_count (int generation)
}

void
mono_gc_stop_world ()
mono_stop_world (MonoThreadInfoFlags flags)
{
g_assert ("mono_gc_stop_world is not supported in null GC");
g_assert ("mono_stop_world is not supported in null GC");
}

void
mono_gc_restart_world ()
mono_restart_world (MonoThreadInfoFlags flags)
{
g_assert ("mono_gc_restart_world is not supported in null GC");
g_assert ("mono_restart_world is not supported in null GC");
}

void
Expand Down
14 changes: 0 additions & 14 deletions src/mono/mono/metadata/sgen-mono.c
Original file line number Diff line number Diff line change
Expand Up @@ -840,20 +840,6 @@ sgen_finish_concurrent_work (const char *reason, gboolean stw)
sgen_major_collector.finish_sweeping ();
}

/*
 * Takes LOCK_GC and then stops the world through SGen.
 * The lock is not released here; mono_gc_restart_world releases it.
 * Declared with an explicit (void) parameter list so the definition is a
 * prototype and matches the declaration in mono-gc.h.
 */
void
mono_gc_stop_world (void)
{
	LOCK_GC;
	sgen_stop_world (0, FALSE);
}

/*
 * Restarts the world through SGen and then releases LOCK_GC, which
 * mono_gc_stop_world left held.
 * Declared with an explicit (void) parameter list so the definition is a
 * prototype and matches the declaration in mono-gc.h.
 */
void
mono_gc_restart_world (void)
{
	sgen_restart_world (0, FALSE);
	UNLOCK_GC;
}

/*
* When appdomains are unloaded we can easily remove objects that have finalizers,
* but all the others could still be present in random places on the heap.
Expand Down
Loading

0 comments on commit 86835b8

Please sign in to comment.