Track un-active instrumentation for better reporting (#2016)

This PR changes when the instrumentation manager tracks an instrumentation. Once an instrumentation is initialized, we should start tracking it even if it fails to load. The reason is, that we need to call the reporter once the process exits (so it can report/clean it) - however, if we only track the PID once the instrumentation is loaded we won't be able to invoke the reporter properly once a failed-to-load instrumentation needs to be cleaned. Each PID we track is marked whether the instrumentation is active (loaded successfully) or not - by the inst field being nil or not. For un-active PIDs we won't apply config updates. In addition, in case we get an exec event on a non-active PID, we try to instrument it again. This is helpful in cases of chain-loading where the first executable is written in a language we can't instrument, while the second is valid for instrumentation.
odigos-io · Dec 17, 2024 · 716aac7 · 716aac7
1 parent 01cef4b
commit 716aac7
Showing 1 changed file with 30 additions and 16 deletions.
diff --git a/instrumentation/manager.go b/instrumentation/manager.go
@@ -24,6 +24,9 @@ var (
 type ConfigUpdate[configGroup ConfigGroup] map[configGroup]Config
 
 type instrumentationDetails[processGroup ProcessGroup, configGroup ConfigGroup] struct {
+	// we want to track the instrumentation even if it failed to load, to be able to report the error
+	// and clean up the reporter resources once the process exits.
+	// hence, this might be nil if the instrumentation failed to load.
 	inst Instrumentation
 	pg   processGroup
 	cg   configGroup
@@ -75,11 +78,11 @@ type manager[processGroup ProcessGroup, configGroup ConfigGroup] struct {
 	factories  map[OtelDistribution]Factory
 	logger     logr.Logger
 
-	// all the active instrumentations by pid,
+	// all the created instrumentations by pid,
 	// this map is not concurrent safe, so it should be accessed only from the main event loop
 	detailsByPid map[int]*instrumentationDetails[processGroup, configGroup]
 
-	// active instrumentations by workload, and aggregated by pid
+	// instrumentations by workload, and aggregated by pid
 	// this map is not concurrent safe, so it should be accessed only from the main event loop
 	detailsByWorkload map[configGroup]map[int]*instrumentationDetails[processGroup, configGroup]
 
@@ -142,6 +145,9 @@ func (m *manager[ProcessGroup, ConfigGroup]) runEventLoop(ctx context.Context) {
 		case <-ctx.Done():
 			m.logger.Info("stopping eBPF instrumentation manager")
 			for pid, details := range m.detailsByPid {
+				if details.inst == nil {
+					continue
+				}
 				err := details.inst.Close(ctx)
 				if err != nil {
 					m.logger.Error(err, "failed to close instrumentation", "pid", pid)
@@ -201,12 +207,14 @@ func (m *manager[ProcessGroup, ConfigGroup]) cleanInstrumentation(ctx context.Co
 
 	m.logger.Info("cleaning instrumentation resources", "pid", pid, "process group details", details.pg)
 
-	err := details.inst.Close(ctx)
-	if err != nil {
-		m.logger.Error(err, "failed to close instrumentation")
+	if details.inst != nil {
+		err := details.inst.Close(ctx)
+		if err != nil {
+			m.logger.Error(err, "failed to close instrumentation")
+		}
 	}
 
-	err = m.handler.Reporter.OnExit(ctx, pid, details.pg)
+	err := m.handler.Reporter.OnExit(ctx, pid, details.pg)
 	if err != nil {
 		m.logger.Error(err, "failed to report instrumentation exit")
 	}
@@ -215,7 +223,7 @@ func (m *manager[ProcessGroup, ConfigGroup]) cleanInstrumentation(ctx context.Co
 }
 
 func (m *manager[ProcessGroup, ConfigGroup]) handleProcessExecEvent(ctx context.Context, e detector.ProcessEvent) error {
-	if _, found := m.detailsByPid[e.PID]; found {
+	if details, found := m.detailsByPid[e.PID]; found && details.inst != nil {
 		// this can happen if we have multiple exec events for the same pid (chain loading)
 		// TODO: better handle this?
 		// this can be done by first closing the existing instrumentation,
@@ -266,21 +274,24 @@ func (m *manager[ProcessGroup, ConfigGroup]) handleProcessExecEvent(ctx context.
 		return err
 	}
 
-	err = inst.Load(ctx)
-	// call the reporter regardless of the load result - as we want to report the load status
-	reporterErr := m.handler.Reporter.OnLoad(ctx, e.PID, err, pg)
-	if err != nil {
+	loadErr := inst.Load(ctx)
+
+	reporterErr := m.handler.Reporter.OnLoad(ctx, e.PID, loadErr, pg)
+	if reporterErr != nil {
+		m.logger.Error(reporterErr, "failed to report instrumentation load", "loaded", loadErr == nil, "pid", e.PID, "process group details", pg)
+	}
+	if loadErr != nil {
+		// we need to track the instrumentation even if the load failed.
+		// consider a reporter which writes a persistent record for a failed/successful load
+		// we need to notify the reporter once that PID exits to clean up the resources - hence we track it.
+		// saving the inst as nil marking the instrumentation failed to load, and is not valid to run/configure/close.
+		m.startTrackInstrumentation(e.PID, nil, pg, configGroup)
 		m.logger.Error(err, "failed to load instrumentation", "language", otelDisto.Language, "sdk", otelDisto.OtelSdk)
 		// TODO: should we return here the load error? or the instance write error? or both?
 		return err
 	}
 
-	if reporterErr != nil {
-		m.logger.Error(reporterErr, "failed to report instrumentation load")
-	}
-
 	m.startTrackInstrumentation(e.PID, inst, pg, configGroup)
-
 	m.logger.Info("instrumentation loaded", "pid", e.PID, "process group details", pg)
 
 	go func() {
@@ -337,6 +348,9 @@ func (m *manager[ProcessGroup, ConfigGroup]) applyInstrumentationConfigurationFo
 	}
 
 	for _, instDetails := range configGroupInstrumentations {
+		if instDetails.inst == nil {
+			continue
+		}
 		m.logger.Info("applying configuration to instrumentation", "process group details", instDetails.pg, "configGroup", configGroup)
 		applyErr := instDetails.inst.ApplyConfig(ctx, config)
 		err = errors.Join(err, applyErr)