From 5c7bd5f22df26aeb6edf3c0dcf34a52075cfd8ad Mon Sep 17 00:00:00 2001 From: Ivo Anjo Date: Thu, 18 Jul 2024 09:41:30 +0100 Subject: [PATCH] Fix placeholder for skipped samples in allocation profiler breaking heap profiler, try 2 Adding the "skipped samples" placeholder breaks the heap profiler, as it starts failing with: > WARN -- datadog: [datadog] CpuAndWallTimeWorker thread error. Cause: > RuntimeError Ended a heap recording that was not started > WARN -- datadog: [datadog] Detected issue with profiler (worker > component), stopping profiling. This is because the stack recorder up until now used the check for `alloc_samples > 0` as a proxy for "this is an allocation sample, and someone already called `track_object(...)` with it". The addition of the skipped samples placeholder broke this assumption, as it uses `alloc_samples > 0` but explicitly there's no object being tracked. To fix this issue, I've added a new argument to the `sample_values` that explicitly marks a sample as also being heap (or not). Thus, we can now explicitly tell the stack recorder which situation we are in using that flag, instead of trying to derive it from `alloc_samples > 0`. This is actually the second attempt at fixing this. The first attempt got reverted along with https://github.com/DataDog/dd-trace-rb/pull/3775 because it tried to detect placeholders (and thus used a placeholder as a proxy for "someone called `track_object(...)`". This was incorrect because there's other placeholders than just the "skipped samples" (case in point: the "In native code" placeholder), which can happen together with an object allocation. --- .../collectors_stack.c | 3 +++ .../collectors_thread_context.c | 2 +- .../stack_recorder.c | 2 +- .../stack_recorder.h | 1 + spec/datadog/profiling/stack_recorder_spec.rb | 16 +++++++++++++--- 5 files changed, 19 insertions(+), 5 deletions(-) diff --git a/ext/datadog_profiling_native_extension/collectors_stack.c b/ext/datadog_profiling_native_extension/collectors_stack.c index 733aeba4104..58ad5199cfb 100644 --- a/ext/datadog_profiling_native_extension/collectors_stack.c +++ b/ext/datadog_profiling_native_extension/collectors_stack.c @@ -65,6 +65,8 @@ static VALUE _native_sample( ENFORCE_TYPE(numeric_labels_array, T_ARRAY); VALUE zero = INT2NUM(0); + VALUE heap_sample = rb_hash_lookup2(metric_values_hash, rb_str_new_cstr("heap_sample"), Qfalse); + ENFORCE_BOOLEAN(heap_sample); sample_values values = { .cpu_time_ns = NUM2UINT(rb_hash_lookup2(metric_values_hash, rb_str_new_cstr("cpu-time"), zero)), .cpu_or_wall_samples = NUM2UINT(rb_hash_lookup2(metric_values_hash, rb_str_new_cstr("cpu-samples"), zero)), @@ -72,6 +74,7 @@ static VALUE _native_sample( .alloc_samples = NUM2UINT(rb_hash_lookup2(metric_values_hash, rb_str_new_cstr("alloc-samples"), zero)), .alloc_samples_unscaled = NUM2UINT(rb_hash_lookup2(metric_values_hash, rb_str_new_cstr("alloc-samples-unscaled"), zero)), .timeline_wall_time_ns = NUM2UINT(rb_hash_lookup2(metric_values_hash, rb_str_new_cstr("timeline"), zero)), + .heap_sample = heap_sample == Qtrue, }; long labels_count = RARRAY_LEN(labels_array) + RARRAY_LEN(numeric_labels_array); diff --git a/ext/datadog_profiling_native_extension/collectors_thread_context.c b/ext/datadog_profiling_native_extension/collectors_thread_context.c index 4116be91904..a7de125f62d 100644 --- a/ext/datadog_profiling_native_extension/collectors_thread_context.c +++ b/ext/datadog_profiling_native_extension/collectors_thread_context.c @@ -1303,7 +1303,7 @@ void thread_context_collector_sample_allocation(VALUE self_instance, unsigned in /* thread: */ current_thread, /* stack_from_thread: */ current_thread, get_or_create_context_for(current_thread, state), - (sample_values) {.alloc_samples = sample_weight, .alloc_samples_unscaled = 1}, + (sample_values) {.alloc_samples = sample_weight, .alloc_samples_unscaled = 1, .heap_sample = true}, INVALID_TIME, // For now we're not collecting timestamps for allocation events, as per profiling team internal discussions &ruby_vm_type, optional_class_name diff --git a/ext/datadog_profiling_native_extension/stack_recorder.c b/ext/datadog_profiling_native_extension/stack_recorder.c index f8970e9ba9d..83235019477 100644 --- a/ext/datadog_profiling_native_extension/stack_recorder.c +++ b/ext/datadog_profiling_native_extension/stack_recorder.c @@ -616,7 +616,7 @@ void record_sample(VALUE recorder_instance, ddog_prof_Slice_Location locations, metric_values[position_for[ALLOC_SAMPLES_UNSCALED_VALUE_ID]] = values.alloc_samples_unscaled; metric_values[position_for[TIMELINE_VALUE_ID]] = values.timeline_wall_time_ns; - if (values.alloc_samples != 0) { + if (values.heap_sample) { // If we got an allocation sample end the heap allocation recording to commit the heap sample. // FIXME: Heap sampling currently has to be done in 2 parts because the construction of locations is happening // very late in the allocation-sampling path (which is shared with the cpu sampling path). This can diff --git a/ext/datadog_profiling_native_extension/stack_recorder.h b/ext/datadog_profiling_native_extension/stack_recorder.h index 30b56746364..dbf104f33b0 100644 --- a/ext/datadog_profiling_native_extension/stack_recorder.h +++ b/ext/datadog_profiling_native_extension/stack_recorder.h @@ -9,6 +9,7 @@ typedef struct { uint32_t cpu_or_wall_samples; uint32_t alloc_samples; uint32_t alloc_samples_unscaled; + bool heap_sample; int64_t timeline_wall_time_ns; } sample_values; diff --git a/spec/datadog/profiling/stack_recorder_spec.rb b/spec/datadog/profiling/stack_recorder_spec.rb index bf1d2463493..997a695b0c3 100644 --- a/spec/datadog/profiling/stack_recorder_spec.rb +++ b/spec/datadog/profiling/stack_recorder_spec.rb @@ -394,7 +394,14 @@ def sample_types_from(decoded_profile) describe 'heap samples and sizes' do let(:sample_rate) { 50 } let(:metric_values) do - { 'cpu-time' => 101, 'cpu-samples' => 1, 'wall-time' => 789, 'alloc-samples' => sample_rate, 'timeline' => 42 } + { + 'cpu-time' => 101, + 'cpu-samples' => 1, + 'wall-time' => 789, + 'alloc-samples' => sample_rate, + 'timeline' => 42, + 'heap_sample' => true, + } end let(:labels) { { 'label_a' => 'value_a', 'label_b' => 'value_b', 'state' => 'unknown' }.to_a } @@ -540,6 +547,8 @@ def sample_allocation(obj) # for each profile-type there in. expected_summed_values = { 'heap-live-samples': 0, 'heap-live-size': 0, 'alloc-samples-unscaled': 0 } metric_values.each_pair do |k, v| + next if k == 'heap_sample' # This is not a metric, ignore it + expected_summed_values[k.to_sym] = v * @num_allocations end @@ -997,8 +1006,9 @@ def create_obj_in_recycled_slot(should_sample_original: false) def sample_allocation(obj) # Heap sampling currently requires this 2-step process to first pass data about the allocated object... described_class::Testing._native_track_object(stack_recorder, obj, 1, obj.class.name) - Datadog::Profiling::Collectors::Stack::Testing - ._native_sample(Thread.current, stack_recorder, { 'alloc-samples' => 1 }, [], [], 400, false) + Datadog::Profiling::Collectors::Stack::Testing._native_sample( + Thread.current, stack_recorder, { 'alloc-samples' => 1, 'heap_sample' => true }, [], [], 400, false + ) end it 'includes heap recorder snapshot' do