Skip to content

Commit

Permalink
Merge pull request #3720 from DataDog/ivoanjo/prof-9342-dir-interrupt…
Browse files Browse the repository at this point in the history
…ion-workaround

[PROF-9342] Introduce profiler workaround for Ruby Dir interruption bug
  • Loading branch information
ivoanjo committed Jul 2, 2024
2 parents 240dcbc + 259e134 commit dc2739d
Show file tree
Hide file tree
Showing 16 changed files with 988 additions and 7 deletions.
44 changes: 44 additions & 0 deletions benchmarks/profiler_hold_resume_interruptions.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
# Used to quickly run benchmark under RSpec as part of the usual test suite, to validate it didn't bitrot
VALIDATE_BENCHMARK_MODE = ENV['VALIDATE_BENCHMARK'] == 'true'

return unless __FILE__ == $PROGRAM_NAME || VALIDATE_BENCHMARK_MODE

require 'benchmark/ips'
require 'datadog'
require 'pry'
require_relative 'dogstatsd_reporter'

# This benchmark measures the performance of the hold/resume interruptions used by the DirMonkeyPatches
class ProfilerHoldResumeInterruptions
def create_profiler
Datadog.configure do |c|
c.profiling.enabled = true
end
Datadog::Profiling.wait_until_running
end

def run_benchmark
Benchmark.ips do |x|
benchmark_time = VALIDATE_BENCHMARK_MODE ? { time: 0.01, warmup: 0 } : { time: 10, warmup: 2 }
x.config(
**benchmark_time,
suite: report_to_dogstatsd_if_enabled_via_environment_variable(benchmark_name: 'profiler_hold_resume_interruptions')
)

x.report("hold / resume") do
Datadog::Profiling::Collectors::CpuAndWallTimeWorker._native_hold_signals
Datadog::Profiling::Collectors::CpuAndWallTimeWorker._native_resume_signals
end

x.save! 'profiler_hold_resume_interruptions-results.json' unless VALIDATE_BENCHMARK_MODE
x.compare!
end
end
end

puts "Current pid is #{Process.pid}"

ProfilerHoldResumeInterruptions.new.instance_exec do
create_profiler
run_benchmark
end
Original file line number Diff line number Diff line change
Expand Up @@ -222,6 +222,8 @@ static VALUE _native_with_blocked_sigprof(DDTRACE_UNUSED VALUE self);
static VALUE rescued_sample_allocation(VALUE tracepoint_data);
static void delayed_error(struct cpu_and_wall_time_worker_state *state, const char *error);
static VALUE _native_delayed_error(DDTRACE_UNUSED VALUE self, VALUE instance, VALUE error_msg);
static VALUE _native_hold_signals(DDTRACE_UNUSED VALUE self);
static VALUE _native_resume_signals(DDTRACE_UNUSED VALUE self);

// Note on sampler global state safety:
//
Expand Down Expand Up @@ -285,7 +287,9 @@ void collectors_cpu_and_wall_time_worker_init(VALUE profiling_module) {
rb_define_singleton_method(collectors_cpu_and_wall_time_worker_class, "_native_allocation_count", _native_allocation_count, 0);
rb_define_singleton_method(collectors_cpu_and_wall_time_worker_class, "_native_is_running?", _native_is_running, 1);
rb_define_singleton_method(testing_module, "_native_current_sigprof_signal_handler", _native_current_sigprof_signal_handler, 0);
// TODO: Remove `_native_is_running` from `testing_module` once `prof-correctness` has been updated to not need it
rb_define_singleton_method(collectors_cpu_and_wall_time_worker_class, "_native_hold_signals", _native_hold_signals, 0);
rb_define_singleton_method(collectors_cpu_and_wall_time_worker_class, "_native_resume_signals", _native_resume_signals, 0);
// TODO: Remove `_native_is_running` from `testing_module` (should be in class) once `prof-correctness` has been updated to not need it
rb_define_singleton_method(testing_module, "_native_is_running?", _native_is_running, 1);
rb_define_singleton_method(testing_module, "_native_install_testing_signal_handler", _native_install_testing_signal_handler, 0);
rb_define_singleton_method(testing_module, "_native_remove_testing_signal_handler", _native_remove_testing_signal_handler, 0);
Expand Down Expand Up @@ -1159,3 +1163,17 @@ static VALUE _native_delayed_error(DDTRACE_UNUSED VALUE self, VALUE instance, VA

return Qnil;
}

// Masks SIGPROF interruptions for the current thread. Please don't use this -- you may end up with incomplete
// profiling data.
static VALUE _native_hold_signals(DDTRACE_UNUSED VALUE self) {
block_sigprof_signal_handler_from_running_in_current_thread();
return Qtrue;
}

// Unmasks SIGPROF interruptions for the current thread. If there's a pending sample, it'll be triggered inside this
// method.
static VALUE _native_resume_signals(DDTRACE_UNUSED VALUE self) {
unblock_sigprof_signal_handler_from_running_in_current_thread();
return Qtrue;
}
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ void remove_sigprof_signal_handler(void) {
if (sigaction(SIGPROF, &signal_handler_config, NULL) != 0) rb_sys_fail("Failure while removing the signal handler");
}

static void toggle_sigprof_signal_handler_for_current_thread(int action) {
static inline void toggle_sigprof_signal_handler_for_current_thread(int action) {
sigset_t signals_to_toggle;
sigemptyset(&signals_to_toggle);
sigaddset(&signals_to_toggle, SIGPROF);
Expand Down
16 changes: 16 additions & 0 deletions lib/datadog/core/configuration/settings.rb
Original file line number Diff line number Diff line change
Expand Up @@ -397,6 +397,22 @@ def initialize(*_)
end
end

# The profiler gathers data by sending `SIGPROF` unix signals to Ruby application threads.
#
# We've discovered that this can trigger a bug in a number of Ruby APIs in the `Dir` class, as
# described in https://github.com/DataDog/dd-trace-rb/issues/3450 . This workaround prevents the issue
# from happening by monkey patching the affected APIs.
#
# (In the future, once a fix lands upstream, we'll disable this workaround for Rubies that don't need it)
#
# @default `DD_PROFILING_DIR_INTERRUPTION_WORKAROUND_ENABLED` environment variable as a boolean,
# otherwise `true`
option :dir_interruption_workaround_enabled do |o|
o.env 'DD_PROFILING_DIR_INTERRUPTION_WORKAROUND_ENABLED'
o.type :bool
o.default true
end

# Configures how much wall-time overhead the profiler targets. The profiler will dynamically adjust the
# interval between samples it takes so as to try and maintain the property that it spends no longer than
# this amount of wall-clock time profiling. For example, with the default value of 2%, the profiler will
Expand Down
1 change: 1 addition & 0 deletions lib/datadog/profiling.rb
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,7 @@ def self.allocation_count # rubocop:disable Lint/NestedMethodDefinition (On purp
return false unless supported?

require_relative 'profiling/ext/forking'
require_relative 'profiling/ext/dir_monkey_patches'
require_relative 'profiling/collectors/info'
require_relative 'profiling/collectors/code_provenance'
require_relative 'profiling/collectors/cpu_and_wall_time_worker'
Expand Down
13 changes: 13 additions & 0 deletions lib/datadog/profiling/component.rb
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,10 @@ def self.build_profiler_component(settings:, agent_settings:, optional_tracer:)
crashtracker = build_crashtracker(settings, transport)
profiler = Profiling::Profiler.new(worker: worker, scheduler: scheduler, optional_crashtracker: crashtracker)

if dir_interruption_workaround_enabled?(settings, no_signals_workaround_enabled)
Datadog::Profiling::Ext::DirMonkeyPatches.apply!
end

[profiler, { profiling_enabled: true }]
end

Expand Down Expand Up @@ -445,6 +449,15 @@ def self.build_profiler_component(settings:, agent_settings:, optional_tracer:)
libmysqlclient_version < Gem::Version.new('5.0.0') &&
header_version >= Gem::Version.new('10.0.0'))
end

private_class_method def self.dir_interruption_workaround_enabled?(settings, no_signals_workaround_enabled)
return false if no_signals_workaround_enabled

# NOTE: In the future this method will evolve to check for Ruby versions affected and not apply the workaround
# when it's not needed but currently all known Ruby versions are affected.

settings.profiling.advanced.dir_interruption_workaround_enabled
end
end
end
end
Loading

0 comments on commit dc2739d

Please sign in to comment.