From bcf39917d098ad2248f84c016c65c9c27d7d46a7 Mon Sep 17 00:00:00 2001 From: Oleg Pudeyev Date: Mon, 23 Sep 2024 11:14:01 -0400 Subject: [PATCH] DEBUG-2334 Dynamic Instrumentation code tracker component --- lib/datadog/di/code_tracker.rb | 144 ++++++++++++++++++ sig/datadog/di/code_tracker.rbs | 21 +++ spec/datadog/di/code_tracker_spec.rb | 119 +++++++++++++++ spec/datadog/di/code_tracker_test_class_1.rb | 2 + spec/datadog/di/code_tracker_test_class_2.rb | 2 + spec/datadog/di/code_tracker_test_class_3.rb | 2 + .../code_tracker_test_class_1.rb | 4 + 7 files changed, 294 insertions(+) create mode 100644 lib/datadog/di/code_tracker.rb create mode 100644 sig/datadog/di/code_tracker.rbs create mode 100644 spec/datadog/di/code_tracker_spec.rb create mode 100644 spec/datadog/di/code_tracker_test_class_1.rb create mode 100644 spec/datadog/di/code_tracker_test_class_2.rb create mode 100644 spec/datadog/di/code_tracker_test_class_3.rb create mode 100644 spec/datadog/di/code_tracker_test_classes/code_tracker_test_class_1.rb diff --git a/lib/datadog/di/code_tracker.rb b/lib/datadog/di/code_tracker.rb new file mode 100644 index 00000000000..1fb007b108f --- /dev/null +++ b/lib/datadog/di/code_tracker.rb @@ -0,0 +1,144 @@ +# frozen_string_literal: true + +require "concurrent/map" + +module Datadog + module DI + # Tracks loaded Ruby code by source file and maintains a map from + # source file to the loaded code (instruction sequences). + # Also arranges for code in the loaded files to be instrumented by + # line probes that have already been received by the library. + # + # The loaded code is used to target line trace points when installing + # line probes which dramatically improves efficiency of line trace points. + # + # Note that, since most files will only be loaded one time (via the + # "require" mechanism), the code tracker needs to be global and not be + # recreated when the DI component is created. 
+ # + # @api private + class CodeTracker + def initialize + @registry = Concurrent::Map.new + @lock = Mutex.new + end + + def start + # If this code tracker is already running, we can do nothing or + # restart it (by disabling the trace point and recreating it). + # It is likely that some applications will attempt to activate + # DI more than once where the intention is to just activate DI; + # do not break such applications by clearing out the registry. + # For now, until there is a use case for recreating the trace point, + # do nothing if the code tracker has already started. + return if active? + + compiled_trace_point = TracePoint.trace(:script_compiled) do |tp| + # Useful attributes of the trace point object here: + # .instruction_sequence + # .method_id + # .path (refers to the code location that called the require/eval/etc., + # not where the loaded code is; use .path on the instruction sequence + # to obtain the location of the compiled code) + # .eval_script + # + # For now just map the path to the instruction sequence. + path = tp.instruction_sequence.path + registry[path] = tp.instruction_sequence + end + + @lock.synchronize do + # Since trace point creation itself is not under a lock, see if + # another thread created the trace point, in which case we can + # disable our trace point and do nothing. + if @compiled_trace_point + # Disable the local trace point, leave instance variable as it is. + compiled_trace_point.disable + return + end + + @compiled_trace_point = compiled_trace_point + end + end + + # Returns whether this code tracker has been activated and is + # tracking. + def active? + @lock.synchronize do + !!@compiled_trace_point + end + end + + # Returns an array of RubyVM::InstructionSequence (i.e. the compiled code) + # for the provided path. + # + # The argument can be a full path to a Ruby source code file or a + # suffix (basename + one or more directories preceding the basename). 
+ # The idea with suffix matches is that file paths are likely to + # be different between development and production environments and + # the source control system uses relative paths and doesn't have + # absolute paths at all. + # + # Suffix matches are not guaranteed to be correct, meaning there may + # be multiple files with the same basename and they may all match a + # given suffix. In such cases, this method will return all matching + # paths (and upstream code will attempt to instrument all of these + # paths). + # + # If the suffix matches one of the paths completely (which requires it + # to be an absolute path), only the exactly matching path is returned. + # Otherwise all known paths that end in the suffix are returned. + # If no paths match, an empty array is returned. + def iseqs_for_path(suffix) + exact = registry[suffix] + if exact + return [exact] + end + inexact = [] + registry.each do |path, iseq| + # Exact match is not possible here, meaning any matching path + # has to be longer than the suffix. Require full component matches, + # meaning either the first character of the suffix is a slash + # or the previous character in the path is a slash. + # For now only check for forward slashes for Unix-like OSes; + # backslash is a legitimate character of a file name in Unix + # therefore simply permitting forward or back slash is not + # sufficient; we would need to perform an OS check to know which + # path separator to use. + if path.length > suffix.length && ( + path[path.length - suffix.length - 1] == "/" || + suffix[0] == "/" + ) && path.end_with?(suffix) + inexact << iseq + end + end + inexact + end + + # Stops tracking code that is being loaded and clears the registry. + # + # This method should ordinarily never be called - if a file is loaded + # when code tracking is not active, this file will not be instrumentable + # by line probes. 
+ # + # This method is intended for test suite use only, where multiple + # code tracker instances are created, to fully clean up the old instances. + def stop + # Permit multiple stop calls. + @lock.synchronize do + @compiled_trace_point&.disable + # Clear the instance variable so that the trace point may be + # reinstated in the future. + @compiled_trace_point = nil + end + registry.clear + end + + private + + # Mapping from paths of loaded files to RubyVM::InstructionSequence + # objects representing compiled code of those files. + attr_reader :registry + end + end +end diff --git a/sig/datadog/di/code_tracker.rbs b/sig/datadog/di/code_tracker.rbs new file mode 100644 index 00000000000..0dafdc605b1 --- /dev/null +++ b/sig/datadog/di/code_tracker.rbs @@ -0,0 +1,21 @@ +module Datadog + module DI + class CodeTracker + @registry: Hash[String,RubyVM::InstructionSequence] + + @lock: Thread::Mutex + + @compiled_trace_point: TracePoint? + + def initialize: () -> void + + def start: () -> void + def active?: () -> bool + def iseqs_for_path: (String suffix) -> (::Array[RubyVM::InstructionSequence]) + def stop: () -> void + + private + attr_reader registry: Hash[String,RubyVM::InstructionSequence] + end + end +end diff --git a/spec/datadog/di/code_tracker_spec.rb b/spec/datadog/di/code_tracker_spec.rb new file mode 100644 index 00000000000..b14a20e3d78 --- /dev/null +++ b/spec/datadog/di/code_tracker_spec.rb @@ -0,0 +1,119 @@ +require "datadog/di/code_tracker" + +RSpec.describe Datadog::DI::CodeTracker do + let(:tracker) do + described_class.new + end + + describe ".new" do + it "creates an instance" do + expect(tracker).to be_a(described_class) + end + end + + describe "#start" do + after do + tracker.stop + end + + it "tracks loaded files" do + # The expectations appear to be lazy-loaded, therefore + # we need to invoke the same expectation before starting + # code tracking as we'll be using later in the test. 
+ expect(tracker.send(:registry)).to be_empty + tracker.start + # Should still be empty here. + expect(tracker.send(:registry)).to be_empty + load File.join(File.dirname(__FILE__), "code_tracker_test_class_1.rb") + expect(tracker.send(:registry).each.to_a.length).to eq(1) + + path = tracker.send(:registry).each.to_a.first.first + # The path in the registry should be absolute. + expect(path[0]).to eq "/" + # The full path is dependent on the environment/system + # running the tests, but we can assert on the basename + # which will be the same. + expect(File.basename(path)).to eq("code_tracker_test_class_1.rb") + # And, we should in fact have a full path. + expect(path).to start_with("/") + end + end + + describe "#active?" do + context "when started" do + before do + tracker.start + end + + after do + tracker.stop + end + + it "is true" do + expect(tracker.active?).to be true + end + end + + context "when stopped" do + before do + tracker.start + tracker.stop + end + + it "is false" do + expect(tracker.active?).to be false + end + end + end + + describe "#iseqs_for_path" do + around do |example| + tracker.start + + load File.join(File.dirname(__FILE__), "code_tracker_test_class_1.rb") + load File.join(File.dirname(__FILE__), "code_tracker_test_class_2.rb") + load File.join(File.dirname(__FILE__), "code_tracker_test_class_3.rb") + load File.join(File.dirname(__FILE__), "code_tracker_test_classes", "code_tracker_test_class_1.rb") + expect(tracker.send(:registry).each.to_a.length).to eq(4) + + # To be able to assert on the registry, replace values (iseqs) + # with the keys. 
+ (registry = tracker.send(:registry)).each do |k, v| + registry[k] = k + end + + example.run + + tracker.stop + end + + context "exact match for full path" do + let(:path) do + File.join(File.dirname(__FILE__), "code_tracker_test_class_1.rb") + end + + it "returns the exact match only" do + expect(tracker.iseqs_for_path(path)).to eq([path]) + end + end + + context "basename match" do + let(:expected) do + [ + File.join(File.dirname(__FILE__), "code_tracker_test_class_1.rb"), + File.join(File.dirname(__FILE__), "code_tracker_test_classes", "code_tracker_test_class_1.rb"), + ] + end + + it "returns the exact match only" do + expect(tracker.iseqs_for_path("code_tracker_test_class_1.rb")).to eq(expected) + end + end + + context "match not on path component boundary" do + it "returns no matches" do + expect(tracker.iseqs_for_path("1.rb")).to eq([]) + end + end + end +end diff --git a/spec/datadog/di/code_tracker_test_class_1.rb b/spec/datadog/di/code_tracker_test_class_1.rb new file mode 100644 index 00000000000..60f4b30f80c --- /dev/null +++ b/spec/datadog/di/code_tracker_test_class_1.rb @@ -0,0 +1,2 @@ +class CodeTrackerTestClass1 +end diff --git a/spec/datadog/di/code_tracker_test_class_2.rb b/spec/datadog/di/code_tracker_test_class_2.rb new file mode 100644 index 00000000000..c99b1367fd5 --- /dev/null +++ b/spec/datadog/di/code_tracker_test_class_2.rb @@ -0,0 +1,2 @@ +class CodeTrackerTestClass2 +end diff --git a/spec/datadog/di/code_tracker_test_class_3.rb b/spec/datadog/di/code_tracker_test_class_3.rb new file mode 100644 index 00000000000..eaa42bbe766 --- /dev/null +++ b/spec/datadog/di/code_tracker_test_class_3.rb @@ -0,0 +1,2 @@ +class CodeTrackerTestClass3 +end diff --git a/spec/datadog/di/code_tracker_test_classes/code_tracker_test_class_1.rb b/spec/datadog/di/code_tracker_test_classes/code_tracker_test_class_1.rb new file mode 100644 index 00000000000..27075513397 --- /dev/null +++ b/spec/datadog/di/code_tracker_test_classes/code_tracker_test_class_1.rb @@ 
-0,0 +1,4 @@ +# Different name to not conflict with the upper-level class definition. +# Note that file basenames need to be identical for some of the test cases. +class SubdirCodeTrackerTestClass1 +end