From bcf39917d098ad2248f84c016c65c9c27d7d46a7 Mon Sep 17 00:00:00 2001
From: Oleg Pudeyev <code@olegp.name>
Date: Mon, 23 Sep 2024 11:14:01 -0400
Subject: [PATCH] DEBUG-2334 Dynamic Instrumentation code tracker component

---
 lib/datadog/di/code_tracker.rb                | 144 ++++++++++++++++++
 sig/datadog/di/code_tracker.rbs               |  21 +++
 spec/datadog/di/code_tracker_spec.rb          | 119 +++++++++++++++
 spec/datadog/di/code_tracker_test_class_1.rb  |   2 +
 spec/datadog/di/code_tracker_test_class_2.rb  |   2 +
 spec/datadog/di/code_tracker_test_class_3.rb  |   2 +
 .../code_tracker_test_class_1.rb              |   4 +
 7 files changed, 294 insertions(+)
 create mode 100644 lib/datadog/di/code_tracker.rb
 create mode 100644 sig/datadog/di/code_tracker.rbs
 create mode 100644 spec/datadog/di/code_tracker_spec.rb
 create mode 100644 spec/datadog/di/code_tracker_test_class_1.rb
 create mode 100644 spec/datadog/di/code_tracker_test_class_2.rb
 create mode 100644 spec/datadog/di/code_tracker_test_class_3.rb
 create mode 100644 spec/datadog/di/code_tracker_test_classes/code_tracker_test_class_1.rb

diff --git a/lib/datadog/di/code_tracker.rb b/lib/datadog/di/code_tracker.rb
new file mode 100644
index 00000000000..1fb007b108f
--- /dev/null
+++ b/lib/datadog/di/code_tracker.rb
@@ -0,0 +1,144 @@
+# frozen_string_literal: true
+
+require "concurrent/map"
+
+module Datadog
+  module DI
+    # Tracks loaded Ruby code by source file and maintains a map from
+    # source file to the loaded code (instruction sequences).
+    # Also arranges for code in the loaded files to be instrumented by
+    # line probes that have already been received by the library.
+    #
+    # The loaded code is used to target line trace points when installing
+    # line probes, which dramatically improves efficiency of those trace points.
+    #
+    # Note that, since most files will only be loaded one time (via the
+    # "require" mechanism), the code tracker needs to be global and not be
+    # recreated when the DI component is created.
+    #
+    # @api private
+    class CodeTracker
+      def initialize
+        @registry = Concurrent::Map.new
+        @lock = Mutex.new
+      end
+
+      def start
+        # If this code tracker is already running, we can do nothing or
+        # restart it (by disabling the trace point and recreating it).
+        # It is likely that some applications will attempt to activate
+        # DI more than once where the intention is to just activate DI;
+        # do not break such applications by clearing out the registry.
+        # For now, until there is a use case for recreating the trace point,
+        # do nothing if the code tracker has already started.
+        return if active?
+
+        compiled_trace_point = TracePoint.trace(:script_compiled) do |tp|
+          # Useful attributes of the trace point object here:
+          # .instruction_sequence
+          # .method_id
+          # .path (refers to the code location that called the require/eval/etc.,
+          #   not where the loaded code is; use .path on the instruction sequence
+          #   to obtain the location of the compiled code)
+          # .eval_script
+          #
+          # Map the path to the instruction sequence, skipping eval'd code
+          # (non-nil eval_script), which may claim a loaded file's path.
+          next if tp.eval_script
+          path = tp.instruction_sequence.path
+          registry[path] = tp.instruction_sequence
+        end
+
+        @lock.synchronize do
+          # Since trace point creation itself is not under a lock, see if
+          # another thread created the trace point, in which case we can
+          # disable our trace point and do nothing.
+          if @compiled_trace_point
+            # Disable the local variable, leave instance variable as it is.
+            compiled_trace_point.disable
+            return
+          end
+
+          @compiled_trace_point = compiled_trace_point
+        end
+      end
+
+      # Returns whether this code tracker has been activated and is
+      # tracking.
+      def active?
+        @lock.synchronize do
+          !!@compiled_trace_point
+        end
+      end
+
+      # Returns an array of RubyVM::InstructionSequence (i.e. the compiled code)
+      # for the provided path.
+      #
+      # The argument can be a full path to a Ruby source code file or a
+      # suffix (basename + one or more directories preceding the basename).
+      # The idea with suffix matches is that file paths are likely to
+      # be different between development and production environments and
+      # the source control system uses relative paths and doesn't have
+      # absolute paths at all.
+      #
+      # Suffix matches are not guaranteed to be correct, meaning there may
+      # be multiple files with the same basename and they may all match a
+      # given suffix. In such cases, this method will return all matching
+      # paths (and all of these paths will be attempted to be instrumented
+      # by upstream code).
+      #
+      # If the suffix matches one of the paths completely (which requires it
+      # to be an absolute path), only the exactly matching path is returned.
+      # @param suffix [String] full path, or trailing path components, of a file
+      # @return [Array] iseqs of all known paths ending in suffix, or empty array
+      def iseqs_for_path(suffix)
+        exact = registry[suffix]
+        return [exact] if exact
+        inexact = []
+        registry.each do |path, iseq|
+          # Exact match is not possible here, meaning any matching path
+          # has to be longer than the suffix. Require full component matches,
+          # meaning either the first character of the suffix is a slash
+          # or the previous character in the path is a slash.
+          # For now only check for forward slashes for Unix-like OSes;
+          # backslash is a legitimate character of a file name in Unix
+          # therefore simply permitting forward or back slash is not
+          # sufficient, we need to perform an OS check to know which
+          # path separator to use.
+          if path.length > suffix.length && (
+            path[path.length - suffix.length - 1] == "/" ||
+            suffix[0] == "/"
+          ) && path.end_with?(suffix)
+            inexact << iseq
+          end
+        end
+        inexact
+      end
+
+      # Stops tracking code that is being loaded.
+      #
+      # This method should ordinarily never be called - if a file is loaded
+      # when code tracking is not active, this file will not be instrumentable
+      # by line probes.
+      #
+      # This method is intended for test suite use only, where multiple
+      # code tracker instances are created, to fully clean up the old instances.
+      def stop
+        # Permit multiple stop calls.
+        @lock.synchronize do
+          @compiled_trace_point&.disable
+          # Clear the instance variable so that the trace point may be
+          # reinstated in the future.
+          @compiled_trace_point = nil
+        end
+        registry.clear
+      end
+
+      private
+
+      # Mapping from paths of loaded files to RubyVM::InstructionSequence
+      # objects representing compiled code of those files.
+      attr_reader :registry
+    end
+  end
+end
diff --git a/sig/datadog/di/code_tracker.rbs b/sig/datadog/di/code_tracker.rbs
new file mode 100644
index 00000000000..0dafdc605b1
--- /dev/null
+++ b/sig/datadog/di/code_tracker.rbs
@@ -0,0 +1,21 @@
+module Datadog
+  module DI
+    class CodeTracker
+      @registry: Hash[String,RubyVM::InstructionSequence]
+      # Mutex held around every read and write of @compiled_trace_point.
+      @lock: Thread::Mutex
+      # The :script_compiled trace point while tracking; nil when not started.
+      @compiled_trace_point: TracePoint?
+      # NOTE(review): assigns a Concurrent::Map to @registry, declared as Hash above.
+      def initialize: () -> void
+
+      def start: () -> void
+      def active?: () -> bool
+      def iseqs_for_path: (String suffix) -> (::Array[RubyVM::InstructionSequence])
+      def stop: () -> void
+
+      private
+      attr_reader registry: Hash[String,RubyVM::InstructionSequence]
+    end
+  end
+end
diff --git a/spec/datadog/di/code_tracker_spec.rb b/spec/datadog/di/code_tracker_spec.rb
new file mode 100644
index 00000000000..b14a20e3d78
--- /dev/null
+++ b/spec/datadog/di/code_tracker_spec.rb
@@ -0,0 +1,119 @@
+require "datadog/di/code_tracker"
+
+RSpec.describe Datadog::DI::CodeTracker do
+  let(:tracker) do
+    described_class.new
+  end
+
+  describe ".new" do
+    it "creates an instance" do
+      expect(tracker).to be_a(described_class)
+    end
+  end
+
+  describe "#start" do
+    after do
+      tracker.stop
+    end
+
+    it "tracks loaded files" do
+      # The expectations appear to be lazy-loaded, therefore
+      # we need to invoke the same expectation before starting
+      # code tracking as we'll be using later in the test.
+      expect(tracker.send(:registry)).to be_empty
+      tracker.start
+      # Should still be empty here.
+      expect(tracker.send(:registry)).to be_empty
+      load File.join(File.dirname(__FILE__), "code_tracker_test_class_1.rb")
+      expect(tracker.send(:registry).each.to_a.length).to eq(1)
+
+      path = tracker.send(:registry).each.to_a.first.first
+      # The path in the registry should be absolute.
+      expect(path[0]).to eq "/"
+      # The full path is dependent on the environment/system
+      # running the tests, but we can assert on the basename
+      # which will be the same.
+      expect(File.basename(path)).to eq("code_tracker_test_class_1.rb")
+      # And, we should in fact have a full path.
+      expect(path).to start_with("/")
+    end
+  end
+
+  describe "#active?" do
+    context "when started" do
+      before do
+        tracker.start
+      end
+
+      after do
+        tracker.stop
+      end
+
+      it "is true" do
+        expect(tracker.active?).to be true
+      end
+    end
+
+    context "when stopped" do
+      before do
+        tracker.start
+        tracker.stop
+      end
+
+      it "is false" do
+        expect(tracker.active?).to be false
+      end
+    end
+  end
+
+  describe "#iseqs_for_path" do
+    around do |example|
+      tracker.start
+
+      load File.join(File.dirname(__FILE__), "code_tracker_test_class_1.rb")
+      load File.join(File.dirname(__FILE__), "code_tracker_test_class_2.rb")
+      load File.join(File.dirname(__FILE__), "code_tracker_test_class_3.rb")
+      load File.join(File.dirname(__FILE__), "code_tracker_test_classes", "code_tracker_test_class_1.rb")
+      expect(tracker.send(:registry).each.to_a.length).to eq(4)
+
+      # To be able to assert on the registry, replace values (iseqs)
+      # with the keys.
+      (registry = tracker.send(:registry)).each do |k, v|
+        registry[k] = k
+      end
+
+      example.run
+    ensure
+      tracker.stop
+    end
+
+    context "exact match for full path" do
+      let(:path) do
+        File.join(File.dirname(__FILE__), "code_tracker_test_class_1.rb")
+      end
+
+      it "returns the exact match only" do
+        expect(tracker.iseqs_for_path(path)).to eq([path])
+      end
+    end
+
+    context "basename match" do
+      let(:expected) do
+        [
+          File.join(File.dirname(__FILE__), "code_tracker_test_class_1.rb"),
+          File.join(File.dirname(__FILE__), "code_tracker_test_classes", "code_tracker_test_class_1.rb"),
+        ]
+      end
+
+      it "returns all paths with the matching basename" do
+        expect(tracker.iseqs_for_path("code_tracker_test_class_1.rb")).to match_array(expected)
+      end
+    end
+
+    context "match not on path component boundary" do
+      it "returns no matches" do
+        expect(tracker.iseqs_for_path("1.rb")).to eq([])
+      end
+    end
+  end
+end
diff --git a/spec/datadog/di/code_tracker_test_class_1.rb b/spec/datadog/di/code_tracker_test_class_1.rb
new file mode 100644
index 00000000000..60f4b30f80c
--- /dev/null
+++ b/spec/datadog/di/code_tracker_test_class_1.rb
@@ -0,0 +1,2 @@
+class CodeTrackerTestClass1
+end
diff --git a/spec/datadog/di/code_tracker_test_class_2.rb b/spec/datadog/di/code_tracker_test_class_2.rb
new file mode 100644
index 00000000000..c99b1367fd5
--- /dev/null
+++ b/spec/datadog/di/code_tracker_test_class_2.rb
@@ -0,0 +1,2 @@
+class CodeTrackerTestClass2
+end
diff --git a/spec/datadog/di/code_tracker_test_class_3.rb b/spec/datadog/di/code_tracker_test_class_3.rb
new file mode 100644
index 00000000000..eaa42bbe766
--- /dev/null
+++ b/spec/datadog/di/code_tracker_test_class_3.rb
@@ -0,0 +1,2 @@
+class CodeTrackerTestClass3
+end
diff --git a/spec/datadog/di/code_tracker_test_classes/code_tracker_test_class_1.rb b/spec/datadog/di/code_tracker_test_classes/code_tracker_test_class_1.rb
new file mode 100644
index 00000000000..27075513397
--- /dev/null
+++ b/spec/datadog/di/code_tracker_test_classes/code_tracker_test_class_1.rb
@@ -0,0 +1,4 @@
+# Class name differs from CodeTrackerTestClass1 to avoid reopening that class.
+# The file basename must stay identical to it: the basename-match spec needs both.
+class SubdirCodeTrackerTestClass1
+end