diff --git a/benchmarks/profiler_hold_resume_interruptions.rb b/benchmarks/profiler_hold_resume_interruptions.rb new file mode 100644 index 00000000000..02f22dfca06 --- /dev/null +++ b/benchmarks/profiler_hold_resume_interruptions.rb @@ -0,0 +1,44 @@ +# Used to quickly run benchmark under RSpec as part of the usual test suite, to validate it didn't bitrot +VALIDATE_BENCHMARK_MODE = ENV['VALIDATE_BENCHMARK'] == 'true' + +return unless __FILE__ == $PROGRAM_NAME || VALIDATE_BENCHMARK_MODE + +require 'benchmark/ips' +require 'datadog' +require 'pry' +require_relative 'dogstatsd_reporter' + +# This benchmark measures the performance of the hold/resume interruptions used by the DirMonkeyPatches +class ProfilerHoldResumeInterruptions + def create_profiler + Datadog.configure do |c| + c.profiling.enabled = true + end + Datadog::Profiling.wait_until_running + end + + def run_benchmark + Benchmark.ips do |x| + benchmark_time = VALIDATE_BENCHMARK_MODE ? { time: 0.01, warmup: 0 } : { time: 10, warmup: 2 } + x.config( + **benchmark_time, + suite: report_to_dogstatsd_if_enabled_via_environment_variable(benchmark_name: 'profiler_hold_resume_interruptions') + ) + + x.report("hold / resume") do + Datadog::Profiling::Collectors::CpuAndWallTimeWorker._native_hold_signals + Datadog::Profiling::Collectors::CpuAndWallTimeWorker._native_resume_signals + end + + x.save! 'profiler_hold_resume_interruptions-results.json' unless VALIDATE_BENCHMARK_MODE + x.compare! + end + end +end + +puts "Current pid is #{Process.pid}" + +ProfilerHoldResumeInterruptions.new.instance_exec do + create_profiler + run_benchmark +end diff --git a/ext/datadog_profiling_native_extension/collectors_cpu_and_wall_time_worker.c b/ext/datadog_profiling_native_extension/collectors_cpu_and_wall_time_worker.c index 749b416c9fd..dff70397c16 100644 --- a/ext/datadog_profiling_native_extension/collectors_cpu_and_wall_time_worker.c +++ b/ext/datadog_profiling_native_extension/collectors_cpu_and_wall_time_worker.c @@ -222,6 +222,8 @@ static VALUE _native_with_blocked_sigprof(DDTRACE_UNUSED VALUE self); static VALUE rescued_sample_allocation(VALUE tracepoint_data); static void delayed_error(struct cpu_and_wall_time_worker_state *state, const char *error); static VALUE _native_delayed_error(DDTRACE_UNUSED VALUE self, VALUE instance, VALUE error_msg); +static VALUE _native_hold_signals(DDTRACE_UNUSED VALUE self); +static VALUE _native_resume_signals(DDTRACE_UNUSED VALUE self); // Note on sampler global state safety: // @@ -285,7 +287,9 @@ void collectors_cpu_and_wall_time_worker_init(VALUE profiling_module) { rb_define_singleton_method(collectors_cpu_and_wall_time_worker_class, "_native_allocation_count", _native_allocation_count, 0); rb_define_singleton_method(collectors_cpu_and_wall_time_worker_class, "_native_is_running?", _native_is_running, 1); rb_define_singleton_method(testing_module, "_native_current_sigprof_signal_handler", _native_current_sigprof_signal_handler, 0); - // TODO: Remove `_native_is_running` from `testing_module` once `prof-correctness` has been updated to not need it + rb_define_singleton_method(collectors_cpu_and_wall_time_worker_class, "_native_hold_signals", _native_hold_signals, 0); + rb_define_singleton_method(collectors_cpu_and_wall_time_worker_class, "_native_resume_signals", _native_resume_signals, 0); + // TODO: Remove `_native_is_running` from `testing_module` (should be in class) once `prof-correctness` has been updated to not need it rb_define_singleton_method(testing_module, "_native_is_running?", _native_is_running, 1); rb_define_singleton_method(testing_module, "_native_install_testing_signal_handler", _native_install_testing_signal_handler, 0); rb_define_singleton_method(testing_module, "_native_remove_testing_signal_handler", _native_remove_testing_signal_handler, 0); @@ -1159,3 +1163,17 @@ static VALUE _native_delayed_error(DDTRACE_UNUSED VALUE self, VALUE instance, VA return Qnil; } + +// Masks SIGPROF interruptions for the current thread. Please don't use this -- you may end up with incomplete +// profiling data. +static VALUE _native_hold_signals(DDTRACE_UNUSED VALUE self) { + block_sigprof_signal_handler_from_running_in_current_thread(); + return Qtrue; +} + +// Unmasks SIGPROF interruptions for the current thread. If there's a pending sample, it'll be triggered inside this +// method. +static VALUE _native_resume_signals(DDTRACE_UNUSED VALUE self) { + unblock_sigprof_signal_handler_from_running_in_current_thread(); + return Qtrue; +} diff --git a/ext/datadog_profiling_native_extension/setup_signal_handler.c b/ext/datadog_profiling_native_extension/setup_signal_handler.c index 93047b9bfee..fa4bb27e95c 100644 --- a/ext/datadog_profiling_native_extension/setup_signal_handler.c +++ b/ext/datadog_profiling_native_extension/setup_signal_handler.c @@ -91,7 +91,7 @@ void remove_sigprof_signal_handler(void) { if (sigaction(SIGPROF, &signal_handler_config, NULL) != 0) rb_sys_fail("Failure while removing the signal handler"); } -static void toggle_sigprof_signal_handler_for_current_thread(int action) { +static inline void toggle_sigprof_signal_handler_for_current_thread(int action) { sigset_t signals_to_toggle; sigemptyset(&signals_to_toggle); sigaddset(&signals_to_toggle, SIGPROF); diff --git a/lib/datadog/core/configuration/settings.rb b/lib/datadog/core/configuration/settings.rb index dd91ed4651a..71e43d89f8d 100644 --- a/lib/datadog/core/configuration/settings.rb +++ b/lib/datadog/core/configuration/settings.rb @@ -397,6 +397,22 @@ def initialize(*_) end end + # The profiler gathers data by sending `SIGPROF` unix signals to Ruby application threads. + # + # We've discovered that this can trigger a bug in a number of Ruby APIs in the `Dir` class, as + # described in https://github.com/DataDog/dd-trace-rb/issues/3450 . This workaround prevents the issue + # from happening by monkey patching the affected APIs. + # + # (In the future, once a fix lands upstream, we'll disable this workaround for Rubies that don't need it) + # + # @default `DD_PROFILING_DIR_INTERRUPTION_WORKAROUND_ENABLED` environment variable as a boolean, + # otherwise `true` + option :dir_interruption_workaround_enabled do |o| + o.env 'DD_PROFILING_DIR_INTERRUPTION_WORKAROUND_ENABLED' + o.type :bool + o.default true + end + # Configures how much wall-time overhead the profiler targets. The profiler will dynamically adjust the # interval between samples it takes so as to try and maintain the property that it spends no longer than # this amount of wall-clock time profiling. For example, with the default value of 2%, the profiler will diff --git a/lib/datadog/profiling.rb b/lib/datadog/profiling.rb index 604e9ac4462..367ce829415 100644 --- a/lib/datadog/profiling.rb +++ b/lib/datadog/profiling.rb @@ -136,6 +136,7 @@ def self.allocation_count # rubocop:disable Lint/NestedMethodDefinition (On purp return false unless supported? require_relative 'profiling/ext/forking' + require_relative 'profiling/ext/dir_monkey_patches' require_relative 'profiling/collectors/info' require_relative 'profiling/collectors/code_provenance' require_relative 'profiling/collectors/cpu_and_wall_time_worker' diff --git a/lib/datadog/profiling/component.rb b/lib/datadog/profiling/component.rb index cd6acf551b2..ed753e2dab4 100644 --- a/lib/datadog/profiling/component.rb +++ b/lib/datadog/profiling/component.rb @@ -75,6 +75,10 @@ def self.build_profiler_component(settings:, agent_settings:, optional_tracer:) crashtracker = build_crashtracker(settings, transport) profiler = Profiling::Profiler.new(worker: worker, scheduler: scheduler, optional_crashtracker: crashtracker) + if dir_interruption_workaround_enabled?(settings, no_signals_workaround_enabled) + Datadog::Profiling::Ext::DirMonkeyPatches.apply! + end + [profiler, { profiling_enabled: true }] end @@ -445,6 +449,15 @@ def self.build_profiler_component(settings:, agent_settings:, optional_tracer:) libmysqlclient_version < Gem::Version.new('5.0.0') && header_version >= Gem::Version.new('10.0.0')) end + + private_class_method def self.dir_interruption_workaround_enabled?(settings, no_signals_workaround_enabled) + return false if no_signals_workaround_enabled + + # NOTE: In the future this method will evolve to check for Ruby versions affected and not apply the workaround + # when it's not needed but currently all known Ruby versions are affected. + + settings.profiling.advanced.dir_interruption_workaround_enabled + end end end end diff --git a/lib/datadog/profiling/ext/dir_monkey_patches.rb b/lib/datadog/profiling/ext/dir_monkey_patches.rb new file mode 100644 index 00000000000..efc34194055 --- /dev/null +++ b/lib/datadog/profiling/ext/dir_monkey_patches.rb @@ -0,0 +1,410 @@ +# frozen_string_literal: true + +module Datadog + module Profiling + # Monkey patches needed for profiler features and compatibility + module Ext + # All Ruby versions as of this writing have bugs in the dir class implementation, causing issues such as + # https://github.com/DataDog/dd-trace-rb/issues/3450 . + # See also https://bugs.ruby-lang.org/issues/20586 for more details. + # + # This monkey patch for the Ruby `Dir` class works around these bugs for affected Ruby versions by temporarily + # blocking the profiler from interrupting system calls. + # + # A lot of these APIs do very similar things -- they're provided by Ruby as helpers so users don't need to keep + # reimplementing them but share the same underlying buggy code. And so our monkey patches are a bit repetitive + # as well. + # We don't DRY out this file to have minimal overhead. + # + # These monkey patches are applied by the profiler when the "dir_interruption_workaround_enabled" setting is + # enabled. See the profiling settings for more detail. + module DirMonkeyPatches + def self.apply! + ::Dir.singleton_class.prepend(Datadog::Profiling::Ext::DirClassMonkeyPatches) + ::Dir.prepend(Datadog::Profiling::Ext::DirInstanceMonkeyPatches) + + true + end + end + + if RUBY_VERSION.start_with?('2.') + # Monkey patches for Dir.singleton_class (Ruby 2 version). See DirMonkeyPatches above for more details. + module DirClassMonkeyPatches + def [](*args, &block) + Datadog::Profiling::Collectors::CpuAndWallTimeWorker._native_hold_signals + super + ensure + Datadog::Profiling::Collectors::CpuAndWallTimeWorker._native_resume_signals + end + + def children(*args, &block) + Datadog::Profiling::Collectors::CpuAndWallTimeWorker._native_hold_signals + super + ensure + Datadog::Profiling::Collectors::CpuAndWallTimeWorker._native_resume_signals + end + + # NOTE: When wrapping methods that yield, it's OK if the `yield` raises an exception while signals are + # enabled. This is because: + # * We can call `_native_resume_signals` many times in a row, both because it's idempotent, as well as it's + # very low overhead (see benchmarks/profiler_hold_resume_interruptions.rb) + # * When an exception is being raised, the iteration will stop anyway, so there's no longer a concern of a + # signal causing Ruby to return an incorrect value + def each_child(*args, &block) + if block + begin + # <-- Begin critical region + Datadog::Profiling::Collectors::CpuAndWallTimeWorker._native_hold_signals + super do |entry_name| + Datadog::Profiling::Collectors::CpuAndWallTimeWorker._native_resume_signals + # <-- We're safe now while running customer code + yield entry_name + # <-- We'll go back to the Dir internals, critical region again + Datadog::Profiling::Collectors::CpuAndWallTimeWorker._native_hold_signals + end + ensure + # <-- End critical region + Datadog::Profiling::Collectors::CpuAndWallTimeWorker._native_resume_signals + end + else + # This returns an enumerator. We don't want/need to intercede here, the enumerator will eventually call the + # other branch once it gets going. + super + end + end + + def empty?(*args, &block) + Datadog::Profiling::Collectors::CpuAndWallTimeWorker._native_hold_signals + super + ensure + Datadog::Profiling::Collectors::CpuAndWallTimeWorker._native_resume_signals + end + + def entries(*args, &block) + Datadog::Profiling::Collectors::CpuAndWallTimeWorker._native_hold_signals + super + ensure + Datadog::Profiling::Collectors::CpuAndWallTimeWorker._native_resume_signals + end + + # See note on methods that yield above. + def foreach(*args, &block) + if block + begin + # <-- Begin critical region + Datadog::Profiling::Collectors::CpuAndWallTimeWorker._native_hold_signals + super do |entry_name| + Datadog::Profiling::Collectors::CpuAndWallTimeWorker._native_resume_signals + # <-- We're safe now while running customer code + yield entry_name + # <-- We'll go back to the Dir internals, critical region again + Datadog::Profiling::Collectors::CpuAndWallTimeWorker._native_hold_signals + end + ensure + # <-- End critical region + Datadog::Profiling::Collectors::CpuAndWallTimeWorker._native_resume_signals + end + else + # This returns an enumerator. We don't want/need to intercede here, the enumerator will eventually call the + # other branch once it gets going. + super + end + end + + # See note on methods that yield above. + def glob(*args, &block) + if block + begin + # <-- Begin critical region + Datadog::Profiling::Collectors::CpuAndWallTimeWorker._native_hold_signals + super do |entry_name| + Datadog::Profiling::Collectors::CpuAndWallTimeWorker._native_resume_signals + # <-- We're safe now while running customer code + yield entry_name + # <-- We'll go back to the Dir internals, critical region again + Datadog::Profiling::Collectors::CpuAndWallTimeWorker._native_hold_signals + end + ensure + # <-- End critical region + Datadog::Profiling::Collectors::CpuAndWallTimeWorker._native_resume_signals + end + else + begin + Datadog::Profiling::Collectors::CpuAndWallTimeWorker._native_hold_signals + super + ensure + Datadog::Profiling::Collectors::CpuAndWallTimeWorker._native_resume_signals + end + end + end + + def home(*args, &block) + Datadog::Profiling::Collectors::CpuAndWallTimeWorker._native_hold_signals + super + ensure + Datadog::Profiling::Collectors::CpuAndWallTimeWorker._native_resume_signals + end + end + else + # Monkey patches for Dir.singleton_class (Ruby 3 version). See DirMonkeyPatches above for more details. + module DirClassMonkeyPatches + def [](*args, **kwargs, &block) + Datadog::Profiling::Collectors::CpuAndWallTimeWorker._native_hold_signals + super + ensure + Datadog::Profiling::Collectors::CpuAndWallTimeWorker._native_resume_signals + end + + def children(*args, **kwargs, &block) + Datadog::Profiling::Collectors::CpuAndWallTimeWorker._native_hold_signals + super + ensure + Datadog::Profiling::Collectors::CpuAndWallTimeWorker._native_resume_signals + end + + # See note on methods that yield above. + def each_child(*args, **kwargs, &block) + if block + begin + # <-- Begin critical region + Datadog::Profiling::Collectors::CpuAndWallTimeWorker._native_hold_signals + super do |entry_name| + Datadog::Profiling::Collectors::CpuAndWallTimeWorker._native_resume_signals + # <-- We're safe now while running customer code + yield entry_name + # <-- We'll go back to the Dir internals, critical region again + Datadog::Profiling::Collectors::CpuAndWallTimeWorker._native_hold_signals + end + ensure + # <-- End critical region + Datadog::Profiling::Collectors::CpuAndWallTimeWorker._native_resume_signals + end + else + # This returns an enumerator. We don't want/need to intercede here, the enumerator will eventually call the + # other branch once it gets going. + super + end + end + + def empty?(*args, **kwargs, &block) + Datadog::Profiling::Collectors::CpuAndWallTimeWorker._native_hold_signals + super + ensure + Datadog::Profiling::Collectors::CpuAndWallTimeWorker._native_resume_signals + end + + def entries(*args, **kwargs, &block) + Datadog::Profiling::Collectors::CpuAndWallTimeWorker._native_hold_signals + super + ensure + Datadog::Profiling::Collectors::CpuAndWallTimeWorker._native_resume_signals + end + + # See note on methods that yield above. + def foreach(*args, **kwargs, &block) + if block + begin + # <-- Begin critical region + Datadog::Profiling::Collectors::CpuAndWallTimeWorker._native_hold_signals + super do |entry_name| + Datadog::Profiling::Collectors::CpuAndWallTimeWorker._native_resume_signals + # <-- We're safe now while running customer code + yield entry_name + # <-- We'll go back to the Dir internals, critical region again + Datadog::Profiling::Collectors::CpuAndWallTimeWorker._native_hold_signals + end + ensure + # <-- End critical region + Datadog::Profiling::Collectors::CpuAndWallTimeWorker._native_resume_signals + end + else + # This returns an enumerator. We don't want/need to intercede here, the enumerator will eventually call the + # other branch once it gets going. + super + end + end + + # See note on methods that yield above. + def glob(*args, **kwargs, &block) + if block + begin + # <-- Begin critical region + Datadog::Profiling::Collectors::CpuAndWallTimeWorker._native_hold_signals + super do |entry_name| + Datadog::Profiling::Collectors::CpuAndWallTimeWorker._native_resume_signals + # <-- We're safe now while running customer code + yield entry_name + # <-- We'll go back to the Dir internals, critical region again + Datadog::Profiling::Collectors::CpuAndWallTimeWorker._native_hold_signals + end + ensure + # <-- End critical region + Datadog::Profiling::Collectors::CpuAndWallTimeWorker._native_resume_signals + end + else + begin + Datadog::Profiling::Collectors::CpuAndWallTimeWorker._native_hold_signals + super + ensure + Datadog::Profiling::Collectors::CpuAndWallTimeWorker._native_resume_signals + end + end + end + + def home(*args, **kwargs, &block) + Datadog::Profiling::Collectors::CpuAndWallTimeWorker._native_hold_signals + super + ensure + Datadog::Profiling::Collectors::CpuAndWallTimeWorker._native_resume_signals + end + end + end + + if RUBY_VERSION.start_with?('2.') + # Monkey patches for Dir (Ruby 2 version). See DirMonkeyPatches above for more details. + module DirInstanceMonkeyPatches + # See note on methods that yield above. + def each(*args, &block) + if block + begin + # <-- Begin critical region + Datadog::Profiling::Collectors::CpuAndWallTimeWorker._native_hold_signals + super do |entry_name| + Datadog::Profiling::Collectors::CpuAndWallTimeWorker._native_resume_signals + # <-- We're safe now while running customer code + yield entry_name + # <-- We'll go back to the Dir internals, critical region again + Datadog::Profiling::Collectors::CpuAndWallTimeWorker._native_hold_signals + end + ensure + Datadog::Profiling::Collectors::CpuAndWallTimeWorker._native_resume_signals # <-- End critical region + end + else + # This returns an enumerator. We don't want/need to intercede here, the enumerator will eventually call the + # other branch once it gets going. + super + end + end + + unless RUBY_VERSION.start_with?('2.5.') # This is Ruby 2.6+ + # See note on methods that yield above. + def each_child(*args, &block) + if block + begin + # <-- Begin critical region + Datadog::Profiling::Collectors::CpuAndWallTimeWorker._native_hold_signals + super do |entry_name| + Datadog::Profiling::Collectors::CpuAndWallTimeWorker._native_resume_signals + # <-- We're safe now while running customer code + yield entry_name + # <-- We'll go back to the Dir internals, critical region again + Datadog::Profiling::Collectors::CpuAndWallTimeWorker._native_hold_signals + end + ensure + # <-- End critical region + Datadog::Profiling::Collectors::CpuAndWallTimeWorker._native_resume_signals + end + else + # This returns an enumerator. We don't want/need to intercede here, the enumerator will eventually call the + # other branch once it gets going. + super + end + end + + def children(*args, &block) + Datadog::Profiling::Collectors::CpuAndWallTimeWorker._native_hold_signals + super + ensure + Datadog::Profiling::Collectors::CpuAndWallTimeWorker._native_resume_signals + end + end + + def tell(*args, &block) + Datadog::Profiling::Collectors::CpuAndWallTimeWorker._native_hold_signals + super + ensure + Datadog::Profiling::Collectors::CpuAndWallTimeWorker._native_resume_signals + end + + def pos(*args, &block) + Datadog::Profiling::Collectors::CpuAndWallTimeWorker._native_hold_signals + super + ensure + Datadog::Profiling::Collectors::CpuAndWallTimeWorker._native_resume_signals + end + end + else + # Monkey patches for Dir (Ruby 3 version). See DirMonkeyPatches above for more details. + module DirInstanceMonkeyPatches + # See note on methods that yield above. + def each(*args, **kwargs, &block) + if block + begin + # <-- Begin critical region + Datadog::Profiling::Collectors::CpuAndWallTimeWorker._native_hold_signals + super do |entry_name| + Datadog::Profiling::Collectors::CpuAndWallTimeWorker._native_resume_signals + # <-- We're safe now while running customer code + yield entry_name + # <-- We'll go back to the Dir internals, critical region again + Datadog::Profiling::Collectors::CpuAndWallTimeWorker._native_hold_signals + end + ensure + Datadog::Profiling::Collectors::CpuAndWallTimeWorker._native_resume_signals # <-- End critical region + end + else + # This returns an enumerator. We don't want/need to intercede here, the enumerator will eventually call the + # other branch once it gets going. + super + end + end + + # See note on methods that yield above. + def each_child(*args, **kwargs, &block) + if block + begin + # <-- Begin critical region + Datadog::Profiling::Collectors::CpuAndWallTimeWorker._native_hold_signals + super do |entry_name| + Datadog::Profiling::Collectors::CpuAndWallTimeWorker._native_resume_signals + # <-- We're safe now while running customer code + yield entry_name + # <-- We'll go back to the Dir internals, critical region again + Datadog::Profiling::Collectors::CpuAndWallTimeWorker._native_hold_signals + end + ensure + # <-- End critical region + Datadog::Profiling::Collectors::CpuAndWallTimeWorker._native_resume_signals + end + else + # This returns an enumerator. We don't want/need to intercede here, the enumerator will eventually call the + # other branch once it gets going. + super + end + end + + def children(*args, **kwargs, &block) + Datadog::Profiling::Collectors::CpuAndWallTimeWorker._native_hold_signals + super + ensure + Datadog::Profiling::Collectors::CpuAndWallTimeWorker._native_resume_signals + end + + def tell(*args, **kwargs, &block) + Datadog::Profiling::Collectors::CpuAndWallTimeWorker._native_hold_signals + super + ensure + Datadog::Profiling::Collectors::CpuAndWallTimeWorker._native_resume_signals + end + + def pos(*args, **kwargs, &block) + Datadog::Profiling::Collectors::CpuAndWallTimeWorker._native_hold_signals + super + ensure + Datadog::Profiling::Collectors::CpuAndWallTimeWorker._native_resume_signals + end + end + end + end + end +end diff --git a/sig/datadog/profiling.rbs b/sig/datadog/profiling.rbs index 35902c8d523..386bfc7aa5c 100644 --- a/sig/datadog/profiling.rbs +++ b/sig/datadog/profiling.rbs @@ -6,6 +6,9 @@ module Datadog def self.allocation_count: () -> ::Integer? def self.enabled?: () -> bool def self.wait_until_running: () -> true + + private + def self.replace_noop_allocation_count: () -> void def self.native_library_compilation_skipped?: () -> ::String? def self.try_reading_skipped_reason_file: (?untyped file_api) -> ::String? diff --git a/sig/datadog/profiling/collectors/cpu_and_wall_time_worker.rbs b/sig/datadog/profiling/collectors/cpu_and_wall_time_worker.rbs index 917eab40ff1..bea60e8971b 100644 --- a/sig/datadog/profiling/collectors/cpu_and_wall_time_worker.rbs +++ b/sig/datadog/profiling/collectors/cpu_and_wall_time_worker.rbs @@ -44,6 +44,8 @@ module Datadog def self._native_is_running?: (CpuAndWallTimeWorker self_instance) -> bool def self._native_allocation_count: () -> ::Integer? def self._native_sampling_loop: (CpuAndWallTimeWorker self_instance) -> void + def self._native_hold_signals: () -> void + def self._native_resume_signals: () -> void def wait_until_running: (timeout_seconds: ::Integer?) -> true end diff --git a/sig/datadog/profiling/component.rbs b/sig/datadog/profiling/component.rbs index 9e74c5c7ea0..bda16b8f78e 100644 --- a/sig/datadog/profiling/component.rbs +++ b/sig/datadog/profiling/component.rbs @@ -38,12 +38,12 @@ module Datadog def self.enable_heap_size_profiling?: (untyped settings, bool heap_profiling_enabled) -> bool def self.no_signals_workaround_enabled?: (untyped settings) -> bool - def self.incompatible_libmysqlclient_version?: (untyped settings) -> bool def self.incompatible_passenger_version?: () -> bool def self.flush_interval: (untyped settings) -> ::Numeric def self.valid_overhead_target: (::Float overhead_target_percentage) -> ::Float def self.looks_like_mariadb?: ({ header_version: ::String? }, ::Gem::Version) -> bool + def self.dir_interruption_workaround_enabled?: (untyped settings, bool no_signals_workaround_enabled) -> bool end end end diff --git a/sig/datadog/profiling/ext/dir_monkey_patches.rbs b/sig/datadog/profiling/ext/dir_monkey_patches.rbs new file mode 100644 index 00000000000..e6cb306dbc0 --- /dev/null +++ b/sig/datadog/profiling/ext/dir_monkey_patches.rbs @@ -0,0 +1,34 @@ +module Datadog + module Profiling + module Ext + module DirMonkeyPatches + def self.apply!: () -> true + end + + # The signatures below are somehow inspired on + # https://github.com/ruby/rbs/blob/28849d4522ba18f45d4f6edfca89a515a4816373/core/dir.rbs + + # NOTE: Because we have variants for Ruby 2.x and 3.x of the code, there's a bunch of warnings from steep about + # that ("Unknown variable: (kwrestarg :kwargs)"). Any suggestions on how to clean that up are welcome :/ + + module DirClassMonkeyPatches + def []: (*untyped) -> Array[String] + def children: (*untyped) -> Array[String] + def each_child: (*untyped) -> Enumerator[String, nil] | (*untyped) { (String filename) -> void } -> nil + def empty?: (path path_name) -> bool + def entries: (*untyped) -> Array[String] + def foreach: (*untyped) -> Enumerator[String, nil] | (*untyped) { (String filename) -> void } -> nil + def glob: (*untyped) -> Array[String] | (*untyped) { (String pathname) -> void } -> nil + def home: (?string? user) -> String + end + + module DirInstanceMonkeyPatches + def each: () { (String) -> void } -> self | () -> Enumerator[String, self] + def each_child: () { (String) -> void } -> self | () -> Enumerator[String, self] + def children: () -> Array[String] + def tell: () -> Integer + def pos: () -> Integer + end + end + end +end diff --git a/spec/datadog/core/configuration/settings_spec.rb b/spec/datadog/core/configuration/settings_spec.rb index 4cc46cad087..33fe290f315 100644 --- a/spec/datadog/core/configuration/settings_spec.rb +++ b/spec/datadog/core/configuration/settings_spec.rb @@ -834,6 +834,41 @@ .to(true) end end + + describe '#dir_interruption_workaround_enabled' do + subject(:dir_interruption_workaround_enabled) { settings.profiling.advanced.dir_interruption_workaround_enabled } + + context 'when DD_PROFILING_DIR_INTERRUPTION_WORKAROUND_ENABLED' do + around do |example| + ClimateControl.modify('DD_PROFILING_DIR_INTERRUPTION_WORKAROUND_ENABLED' => environment) do + example.run + end + end + + context 'is not defined' do + let(:environment) { nil } + + it { is_expected.to be true } + end + + [true, false].each do |value| + context "is defined as #{value}" do + let(:environment) { value.to_s } + + it { is_expected.to be value } + end + end + end + end + + describe '#dir_interruption_workaround_enabled=' do + it 'updates the #dir_interruption_workaround_enabled setting from its default of true' do + expect { settings.profiling.advanced.dir_interruption_workaround_enabled = false } + .to change { settings.profiling.advanced.dir_interruption_workaround_enabled } + .from(true) + .to(false) + end + end end describe '#upload' do diff --git a/spec/datadog/profiling/collectors/cpu_and_wall_time_worker_spec.rb b/spec/datadog/profiling/collectors/cpu_and_wall_time_worker_spec.rb index 7660468be15..633e1271ac6 100644 --- a/spec/datadog/profiling/collectors/cpu_and_wall_time_worker_spec.rb +++ b/spec/datadog/profiling/collectors/cpu_and_wall_time_worker_spec.rb @@ -1176,6 +1176,20 @@ end end + describe '._native_hold_signals and _native_resume_signals' do + it 'blocks/unblocks interruptions for the current thread' do + expect(described_class::Testing._native_is_sigprof_blocked_in_current_thread).to be false + + described_class._native_hold_signals + + expect(described_class::Testing._native_is_sigprof_blocked_in_current_thread).to be true + + described_class._native_resume_signals + + expect(described_class::Testing._native_is_sigprof_blocked_in_current_thread).to be false + end + end + def wait_until_running cpu_and_wall_time_worker.wait_until_running end diff --git a/spec/datadog/profiling/component_spec.rb b/spec/datadog/profiling/component_spec.rb index 3ff73da8c4f..4523dfd4082 100644 --- a/spec/datadog/profiling/component_spec.rb +++ b/spec/datadog/profiling/component_spec.rb @@ -7,9 +7,10 @@ let(:profiler_setup_task) { instance_double(Datadog::Profiling::Tasks::Setup) if Datadog::Profiling.supported? } before do - # Ensure the real task never gets run (so it doesn't apply our thread patches and other extensions to our test env) + # Make sure these never get run so they doesn't apply our monkey patches to the RSpec process if Datadog::Profiling.supported? allow(Datadog::Profiling::Tasks::Setup).to receive(:new).and_return(profiler_setup_task) + allow(Datadog::Profiling::Ext::DirMonkeyPatches).to receive(:apply!).and_return(true) end end @@ -586,6 +587,40 @@ build_profiler_component end end + + describe 'dir interruption workaround' do + let(:no_signals_workaround_enabled) { false } + + before do + expect(described_class).to receive(:no_signals_workaround_enabled?).and_return(no_signals_workaround_enabled) + end + + it 'is enabled by default' do + expect(Datadog::Profiling::Ext::DirMonkeyPatches).to receive(:apply!) + + build_profiler_component + end + + context 'when the no signals workaround is enabled' do + let(:no_signals_workaround_enabled) { true } + + it 'is not applied' do + expect(Datadog::Profiling::Ext::DirMonkeyPatches).to_not receive(:apply!) + + build_profiler_component + end + end + + context 'when the dir interruption workaround is disabled via configuration' do + before { settings.profiling.advanced.dir_interruption_workaround_enabled = false } + + it 'is not applied' do + expect(Datadog::Profiling::Ext::DirMonkeyPatches).to_not receive(:apply!) + + build_profiler_component + end + end + end end end diff --git a/spec/datadog/profiling/ext/dir_monkey_patches_spec.rb b/spec/datadog/profiling/ext/dir_monkey_patches_spec.rb new file mode 100644 index 00000000000..4af8c55f73c --- /dev/null +++ b/spec/datadog/profiling/ext/dir_monkey_patches_spec.rb @@ -0,0 +1,346 @@ +require 'datadog/profiling/spec_helper' + +require 'datadog/profiling/collectors/cpu_and_wall_time_worker' +require 'datadog/profiling/ext/dir_monkey_patches' + +# NOTE: Specs in this file are written so as to not leave the DirMonkeyPatches loaded into the Ruby VM after this +# test executes. They do this by only applying these monkey patches in a separate process. +RSpec.describe Datadog::Profiling::Ext::DirMonkeyPatches do + before do + skip_if_profiling_not_supported(self) + + File.open("#{temporary_directory}/file1", 'w') { |f| f.write('file1') } + File.open("#{temporary_directory}/file2", 'w') { |f| f.write('file2') } + File.open("#{temporary_directory}/file3", 'w') { |f| f.write('file3') } + + expect(Datadog::Profiling::Collectors::CpuAndWallTimeWorker).to_not receive(:_native_hold_signals) + expect(Datadog::Profiling::Collectors::CpuAndWallTimeWorker).to_not receive(:_native_resume_signals) + end + + let(:temporary_directory) { Dir.mktmpdir } + let(:temporary_files_count) { 3 } + let(:expected_hold_resume_calls_count) { 1 } + + after do + begin + FileUtils.remove_dir(temporary_directory) + rescue Errno::ENOENT => _e + # Do nothing, it's ok + end + end + + describe 'DirClassMonkeyPatches' do + describe '.[]' do + it 'matches the ruby behavior without monkey patching' do + test_with_and_without_monkey_patch do + expect(Dir['*1', '*2', base: temporary_directory]).to contain_exactly('file1', 'file2') + end + end + end + + describe '.children' do + it 'matches the ruby behavior without monkey patching' do + test_with_and_without_monkey_patch do + result = Dir.children(temporary_directory, encoding: 'US-ASCII').sort + expect(result.first.encoding).to be Encoding::US_ASCII + expect(result.first).to eq 'file1' + end + end + end + + describe '.each_child' do + let(:expected_hold_resume_calls_count) { 1 + temporary_files_count } + + context 'with a block' do + it 'matches the ruby behavior without monkey patching' do + test_with_and_without_monkey_patch do + files = [] + + Dir.each_child(temporary_directory, encoding: 'UTF-8') { |it| files << it } + + expect(files).to contain_exactly('file1', 'file2', 'file3') + end + end + + it 'allows signals to arrive inside the user block' do + test_with_monkey_patch do + ran_assertion = false + + Dir.each_child(temporary_directory, encoding: 'UTF-8') do + expect_sigprof_to_be(:unblocked) + ran_assertion = true + end + + expect(ran_assertion).to be true + end + end + end + + context 'without a block' do + it 'matches the ruby behavior without monkey patching' do + test_with_and_without_monkey_patch do + expect(Dir.each_child(temporary_directory, encoding: 'UTF-8').to_a).to include('file1', 'file2', 'file3') + end + end + end + end + + describe '.empty?' do + it 'matches the ruby behavior without monkey patching' do + test_with_and_without_monkey_patch do + expect(Dir.empty?(temporary_directory)).to be false + end + end + end + + describe '.entries' do + it 'matches the ruby behavior without monkey patching' do + test_with_and_without_monkey_patch do + expect(Dir.entries(temporary_directory)).to contain_exactly('.', '..', 'file1', 'file2', 'file3') + end + end + end + + describe '.foreach' do + let(:expected_hold_resume_calls_count) { 1 + temporary_files_count + ['.', '..'].size } + + context 'with a block' do + it 'matches the ruby behavior without monkey patching' do + test_with_and_without_monkey_patch do + files = [] + + Dir.foreach(temporary_directory, encoding: 'UTF-8') { |it| files << it } + + expect(files).to contain_exactly('file1', 'file2', 'file3', '.', '..') + end + end + + it 'allows signals to arrive inside the user block' do + test_with_monkey_patch do + ran_assertion = false + + Dir.foreach(temporary_directory, encoding: 'UTF-8') do + expect_sigprof_to_be(:unblocked) + ran_assertion = true + end + + expect(ran_assertion).to be true + end + end + end + + context 'without a block' do + it 'matches the ruby behavior without monkey patching' do + test_with_and_without_monkey_patch do + expect(Dir.foreach(temporary_directory, encoding: 'UTF-8').to_a) + .to include('file1', 'file2', 'file3', '.', '..') + end + end + end + end + + describe '.glob' do + before do + File.open("#{temporary_directory}/.hidden_file1", 'w') { |f| f.write('.hidden_file1') } + end + + let(:expected_files_result) { ['.hidden_file1', 'file1', 'file2'] } + + context 'with a block' do + let(:expected_hold_resume_calls_count) { 1 + expected_files_result.size } + + it 'matches the ruby behavior without monkey patching' do + test_with_and_without_monkey_patch do + files = [] + + Dir.glob(['*1', '*2'], base: temporary_directory, flags: File::FNM_DOTMATCH) { |it| files << it } + + expect(files).to contain_exactly(*expected_files_result) + end + end + + it 'allows signals to arrive inside the user block' do + test_with_monkey_patch do + ran_assertion = false + + Dir.glob(['*1', '*2'], base: temporary_directory, flags: File::FNM_DOTMATCH) do + expect_sigprof_to_be(:unblocked) + ran_assertion = true + end + + expect(ran_assertion).to be true + end + end + end + + context 'without a block' do + # You may be wondering why this one has a call count of 1 when for instance .foreach and each_child have a call + # count of > 1. The difference is the "without a block" versions of those calls **return an enumerator** and + # the enumerator then just calls the block version when executed. + # + # This is not what happens with glob -- glob never returns an enumerator, so the "without a block" version + # does not get turned into a "with a block" call. + let(:expected_hold_resume_calls_count) { 1 } + + it 'matches the ruby behavior without monkey patching' do + test_with_and_without_monkey_patch do + expect(Dir.glob(['*1', '*2'], base: temporary_directory, flags: File::FNM_DOTMATCH)) + .to contain_exactly(*expected_files_result) + end + end + end + end + + describe '.home' do + it 'matches the ruby behavior without monkey patching' do + test_with_and_without_monkey_patch do + expect(Dir.home).to start_with('/') + end + end + end + end + + describe 'DirInstanceMonkeyPatches' do + let(:dir) { Dir.new(temporary_directory) } + + describe '#each' do + let(:expected_hold_resume_calls_count) { 1 + temporary_files_count + ['.', '..'].size } + + context 'with a block' do + it 'matches the ruby behavior without monkey patching' do + test_with_and_without_monkey_patch do + files = [] + + dir.each { |it| files << it } + + expect(files).to contain_exactly('file1', 'file2', 'file3', '.', '..') + end + end + + it 'allows signals to arrive inside the user block' do + test_with_monkey_patch do + ran_assertion = false + + dir.each do + expect_sigprof_to_be(:unblocked) + ran_assertion = true + end + + expect(ran_assertion).to be true + end + end + end + + context 'without a block' do + it 'matches the ruby behavior without monkey patching' do + test_with_and_without_monkey_patch do + expect(dir.each.to_a).to contain_exactly('file1', 'file2', 'file3', '.', '..') + end + end + end + end + + describe '#each_child' do + before { skip('API not available on Ruby 2.5') if RUBY_VERSION.start_with?('2.5.') } + + let(:expected_hold_resume_calls_count) { 1 + temporary_files_count } + + context 'with a block' do + it 'matches the ruby behavior without monkey patching' do + test_with_and_without_monkey_patch do + files = [] + + dir.each_child { |it| files << it } + + expect(files).to contain_exactly('file1', 'file2', 'file3') + end + end + + it 'allows signals to arrive inside the user block' do + test_with_monkey_patch do + ran_assertion = false + + dir.each_child do + expect_sigprof_to_be(:unblocked) + ran_assertion = true + end + + expect(ran_assertion).to be true + end + end + end + + context 'without a block' do + it 'matches the ruby behavior without monkey patching' do + test_with_and_without_monkey_patch do + expect(dir.each_child.to_a).to include('file1', 'file2', 'file3') + end + end + end + end + + describe '#children' do + before { skip('API not available on Ruby 2.5') if RUBY_VERSION.start_with?('2.5.') } + + it 'matches the ruby behavior without monkey patching' do + test_with_and_without_monkey_patch do + expect(dir.children).to contain_exactly('file1', 'file2', 'file3') + end + end + end + + describe '#tell' do + it 'matches the ruby behavior without monkey patching' do + test_with_and_without_monkey_patch do + expect(dir.tell).to be_a_kind_of(Integer) + end + end + end + + describe '#pos' do + it 'matches the ruby behavior without monkey patching' do + test_with_and_without_monkey_patch do + expect(dir.pos).to be_a_kind_of(Integer) + end + end + end + end + + def test_with_and_without_monkey_patch(&testcase) + yield + test_with_monkey_patch(&testcase) + end + + def test_with_monkey_patch(in_fork: true, &testcase) + wrapped_testcase = proc do + RSpec::Mocks.space.proxy_for(Datadog::Profiling::Collectors::CpuAndWallTimeWorker).reset + + allow(Datadog::Profiling::Collectors::CpuAndWallTimeWorker) + .to receive(:_native_hold_signals).and_call_original + allow(Datadog::Profiling::Collectors::CpuAndWallTimeWorker) + .to receive(:_native_resume_signals).and_call_original + + Datadog::Profiling::Ext::DirMonkeyPatches.apply! + yield + + expect(Datadog::Profiling::Collectors::CpuAndWallTimeWorker) + .to have_received(:_native_hold_signals).exactly(expected_hold_resume_calls_count).times + expect(Datadog::Profiling::Collectors::CpuAndWallTimeWorker) + .to have_received(:_native_resume_signals).exactly(expected_hold_resume_calls_count).times + end + + if in_fork + expect_in_fork(&wrapped_testcase) + else + wrapped_testcase.call + end + end + + def expect_sigprof_to_be(state) + raise ArgumentError unless [:blocked, :unblocked].include?(state) + + expect( + Datadog::Profiling::Collectors::CpuAndWallTimeWorker::Testing._native_is_sigprof_blocked_in_current_thread + ).to be(state == :blocked), "Sigprof was expected to be #{state}, but it's actually not" + end +end diff --git a/spec/datadog/profiling/validate_benchmarks_spec.rb b/spec/datadog/profiling/validate_benchmarks_spec.rb index bec1576c986..74cad9583bb 100644 --- a/spec/datadog/profiling/validate_benchmarks_spec.rb +++ b/spec/datadog/profiling/validate_benchmarks_spec.rb @@ -9,15 +9,25 @@ end end - [ + benchmarks_to_validate = [ 'profiler_sample_loop_v2', 'profiler_http_transport', 'profiler_sample_serialize', 'profiler_memory_sample_serialize', - 'profiler_gc' - ].each do |benchmark| + 'profiler_gc', + 'profiler_hold_resume_interruptions', + ].freeze + + benchmarks_to_validate.each do |benchmark| describe benchmark do it('runs without raising errors') { expect_in_fork { load "./benchmarks/#{benchmark}.rb" } } end end + + # This test validates that we don't forget to add new benchmarks to benchmarks_to_validate + it 'tests all expected benchmarks in the benchmarks folder' do + all_benchmarks = Dir['./benchmarks/profiler_*'].map { |it| it.gsub('./benchmarks/', '').gsub('.rb', '') } + + expect(benchmarks_to_validate).to contain_exactly(*all_benchmarks) + end end