Skip to content

Commit

Permalink
Attempt to fix flaky spec due to timing issues
Browse files Browse the repository at this point in the history
This test failed in CI when running inside valgrind:

```
Failures:

  1) Datadog::Profiling::Collectors::ThreadContext#sample_after_gc when there is gc information to record when timeline is enabled creates a Garbage Collection sample using the timestamp set by on_gc_finish, converted to epoch ns
    Failure/Error: expect(gc_sample.labels.fetch(:end_timestamp_ns)).to be_between(@time_before, @time_after)
      expected 1724250478003178125 to be between 1724250478003215154 and 1724250478003295874 (inclusive)
    # ./spec/datadog/profiling/collectors/thread_context_spec.rb:1024:in 'block (5 levels) in <top (required)>'
    # ./spec/spec_helper.rb:231:in 'block (2 levels) in <top (required)>'
    # ./spec/spec_helper.rb:116:in 'block (2 levels) in <top (required)>'
    # ./vendor/bundle/ruby/3.4.0+0/gems/webmock-3.23.1/lib/webmock/rspec.rb:39:in 'block (2 levels) in <top (required)>'
    # ./vendor/bundle/ruby/3.4.0+0/gems/rspec-wait-0.0.10/lib/rspec/wait.rb:47:in 'block (2 levels) in <top (required)>'

Finished in 3 minutes 54.8 seconds (files took 25.77 seconds to load)
701 examples, 1 failure, 20 pending

Failed examples:

rspec ./spec/datadog/profiling/collectors/thread_context_spec.rb:1021 # Datadog::Profiling::Collectors::ThreadContext#sample_after_gc when there is gc information to record when timeline is enabled creates a Garbage Collection sample using the timestamp set by on_gc_finish, converted to epoch ns

Randomized with seed 49098
```

I couldn't quite reproduce it, but given the really small delta
between the expected time and target time (37029 nanoseconds before
start time), I'm strongly suspecting this failure may be due to our
monotonic to system timestamp conversion code.

In particular, we use monotonic timestamps in the profiler, and
then we convert them to epoch timestamps by comparing the difference
between the monotonic clock and the system clock.

There can be some drift in this comparison because we cache the
difference rather than compute it every time (see comments on
`time_helpers.c:monotonic_to_system_epoch_ns`).

So by resetting the cache before this test, hopefully any
flakyness due to the drifting will go away.
  • Loading branch information
ivoanjo committed Aug 22, 2024
1 parent 168452e commit c8f2338
Show file tree
Hide file tree
Showing 2 changed files with 15 additions and 6 deletions.
15 changes: 12 additions & 3 deletions ext/datadog_profiling_native_extension/collectors_thread_context.c
Original file line number Diff line number Diff line change
Expand Up @@ -182,7 +182,7 @@ static VALUE _native_initialize(
static VALUE _native_sample(VALUE self, VALUE collector_instance, VALUE profiler_overhead_stack_thread);
static VALUE _native_on_gc_start(VALUE self, VALUE collector_instance);
static VALUE _native_on_gc_finish(VALUE self, VALUE collector_instance);
static VALUE _native_sample_after_gc(DDTRACE_UNUSED VALUE self, VALUE collector_instance);
static VALUE _native_sample_after_gc(DDTRACE_UNUSED VALUE self, VALUE collector_instance, VALUE reset_monotonic_to_system_state);
void update_metrics_and_sample(
struct thread_context_collector_state *state,
VALUE thread_being_sampled,
Expand Down Expand Up @@ -261,7 +261,7 @@ void collectors_thread_context_init(VALUE profiling_module) {
rb_define_singleton_method(testing_module, "_native_sample_allocation", _native_sample_allocation, 3);
rb_define_singleton_method(testing_module, "_native_on_gc_start", _native_on_gc_start, 1);
rb_define_singleton_method(testing_module, "_native_on_gc_finish", _native_on_gc_finish, 1);
rb_define_singleton_method(testing_module, "_native_sample_after_gc", _native_sample_after_gc, 1);
rb_define_singleton_method(testing_module, "_native_sample_after_gc", _native_sample_after_gc, 2);
rb_define_singleton_method(testing_module, "_native_thread_list", _native_thread_list, 0);
rb_define_singleton_method(testing_module, "_native_per_thread_context", _native_per_thread_context, 1);
rb_define_singleton_method(testing_module, "_native_stats", _native_stats, 1);
Expand Down Expand Up @@ -439,7 +439,16 @@ static VALUE _native_on_gc_finish(DDTRACE_UNUSED VALUE self, VALUE collector_ins

// This method exists only to enable testing Datadog::Profiling::Collectors::ThreadContext behavior using RSpec.
// It SHOULD NOT be used for other purposes.
static VALUE _native_sample_after_gc(DDTRACE_UNUSED VALUE self, VALUE collector_instance) {
static VALUE _native_sample_after_gc(DDTRACE_UNUSED VALUE self, VALUE collector_instance, VALUE reset_monotonic_to_system_state) {
ENFORCE_BOOLEAN(reset_monotonic_to_system_state);

struct thread_context_collector_state *state;
TypedData_Get_Struct(collector_instance, struct thread_context_collector_state, &thread_context_collector_typed_data, state);

if (reset_monotonic_to_system_state == Qtrue) {
state->time_converter_state = (monotonic_to_system_epoch_state) MONOTONIC_TO_SYSTEM_EPOCH_INITIALIZER;
}

thread_context_collector_sample_after_gc(collector_instance);
return Qtrue;
}
Expand Down
6 changes: 3 additions & 3 deletions spec/datadog/profiling/collectors/thread_context_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -71,8 +71,8 @@ def on_gc_finish
described_class::Testing._native_on_gc_finish(cpu_and_wall_time_collector)
end

def sample_after_gc
described_class::Testing._native_sample_after_gc(cpu_and_wall_time_collector)
def sample_after_gc(reset_monotonic_to_system_state: false)
described_class::Testing._native_sample_after_gc(cpu_and_wall_time_collector, reset_monotonic_to_system_state)
end

def sample_allocation(weight:, new_object: Object.new)
Expand Down Expand Up @@ -1019,7 +1019,7 @@ def self.otel_sdk_available?
end

it "creates a Garbage Collection sample using the timestamp set by on_gc_finish, converted to epoch ns" do
sample_after_gc
sample_after_gc(reset_monotonic_to_system_state: true)

expect(gc_sample.labels.fetch(:end_timestamp_ns)).to be_between(@time_before, @time_after)
end
Expand Down

0 comments on commit c8f2338

Please sign in to comment.