Skip to content

Commit

Permalink
Track un-active instrumentation for better reporting (#2016)
Browse files Browse the repository at this point in the history
This PR changes when the instrumentation manager tracks an
instrumentation.
Once an instrumentation is initialized, we should start tracking it even
if it fails to load.
The reason is, that we need to call the reporter once the process exits
(so it can report/clean it) - however, if we only track the PID once the
instrumentation is loaded we won't be able to invoke the reporter
properly once a failed-to-load instrumentation needs to be cleaned.

Each PID we track is marked whether the instrumentation is active
(loaded successfully) or not - by the inst field being nil or not.
For un-active PIDs we won't apply config updates.

In addition, in case we get an exec event on a non-active PID, we try to
instrument it again. This is helpful in cases of chain-loading where the
first executable is written in a language we can't instrument, while the
second is valid for instrumentation.
  • Loading branch information
RonFed authored Dec 17, 2024
1 parent 01cef4b commit 716aac7
Showing 1 changed file with 30 additions and 16 deletions.
46 changes: 30 additions & 16 deletions instrumentation/manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,9 @@ var (
type ConfigUpdate[configGroup ConfigGroup] map[configGroup]Config

type instrumentationDetails[processGroup ProcessGroup, configGroup ConfigGroup] struct {
// we want to track the instrumentation even if it failed to load, to be able to report the error
// and clean up the reporter resources once the process exits.
// hence, this might be nil if the instrumentation failed to load.
inst Instrumentation
pg processGroup
cg configGroup
Expand Down Expand Up @@ -75,11 +78,11 @@ type manager[processGroup ProcessGroup, configGroup ConfigGroup] struct {
factories map[OtelDistribution]Factory
logger logr.Logger

// all the active instrumentations by pid,
// all the created instrumentations by pid,
// this map is not concurrent safe, so it should be accessed only from the main event loop
detailsByPid map[int]*instrumentationDetails[processGroup, configGroup]

// active instrumentations by workload, and aggregated by pid
// instrumentations by workload, and aggregated by pid
// this map is not concurrent safe, so it should be accessed only from the main event loop
detailsByWorkload map[configGroup]map[int]*instrumentationDetails[processGroup, configGroup]

Expand Down Expand Up @@ -142,6 +145,9 @@ func (m *manager[ProcessGroup, ConfigGroup]) runEventLoop(ctx context.Context) {
case <-ctx.Done():
m.logger.Info("stopping eBPF instrumentation manager")
for pid, details := range m.detailsByPid {
if details.inst == nil {
continue
}
err := details.inst.Close(ctx)
if err != nil {
m.logger.Error(err, "failed to close instrumentation", "pid", pid)
Expand Down Expand Up @@ -201,12 +207,14 @@ func (m *manager[ProcessGroup, ConfigGroup]) cleanInstrumentation(ctx context.Co

m.logger.Info("cleaning instrumentation resources", "pid", pid, "process group details", details.pg)

err := details.inst.Close(ctx)
if err != nil {
m.logger.Error(err, "failed to close instrumentation")
if details.inst != nil {
err := details.inst.Close(ctx)
if err != nil {
m.logger.Error(err, "failed to close instrumentation")
}
}

err = m.handler.Reporter.OnExit(ctx, pid, details.pg)
err := m.handler.Reporter.OnExit(ctx, pid, details.pg)
if err != nil {
m.logger.Error(err, "failed to report instrumentation exit")
}
Expand All @@ -215,7 +223,7 @@ func (m *manager[ProcessGroup, ConfigGroup]) cleanInstrumentation(ctx context.Co
}

func (m *manager[ProcessGroup, ConfigGroup]) handleProcessExecEvent(ctx context.Context, e detector.ProcessEvent) error {
if _, found := m.detailsByPid[e.PID]; found {
if details, found := m.detailsByPid[e.PID]; found && details.inst != nil {
// this can happen if we have multiple exec events for the same pid (chain loading)
// TODO: better handle this?
// this can be done by first closing the existing instrumentation,
Expand Down Expand Up @@ -266,21 +274,24 @@ func (m *manager[ProcessGroup, ConfigGroup]) handleProcessExecEvent(ctx context.
return err
}

err = inst.Load(ctx)
// call the reporter regardless of the load result - as we want to report the load status
reporterErr := m.handler.Reporter.OnLoad(ctx, e.PID, err, pg)
if err != nil {
loadErr := inst.Load(ctx)

reporterErr := m.handler.Reporter.OnLoad(ctx, e.PID, loadErr, pg)
if reporterErr != nil {
m.logger.Error(reporterErr, "failed to report instrumentation load", "loaded", loadErr == nil, "pid", e.PID, "process group details", pg)
}
if loadErr != nil {
// we need to track the instrumentation even if the load failed.
// consider a reporter which writes a persistent record for a failed/successful load
// we need to notify the reporter once that PID exits to clean up the resources - hence we track it.
// saving the inst as nil marking the instrumentation failed to load, and is not valid to run/configure/close.
m.startTrackInstrumentation(e.PID, nil, pg, configGroup)
m.logger.Error(err, "failed to load instrumentation", "language", otelDisto.Language, "sdk", otelDisto.OtelSdk)
// TODO: should we return here the load error? or the instance write error? or both?
return err
}

if reporterErr != nil {
m.logger.Error(reporterErr, "failed to report instrumentation load")
}

m.startTrackInstrumentation(e.PID, inst, pg, configGroup)

m.logger.Info("instrumentation loaded", "pid", e.PID, "process group details", pg)

go func() {
Expand Down Expand Up @@ -337,6 +348,9 @@ func (m *manager[ProcessGroup, ConfigGroup]) applyInstrumentationConfigurationFo
}

for _, instDetails := range configGroupInstrumentations {
if instDetails.inst == nil {
continue
}
m.logger.Info("applying configuration to instrumentation", "process group details", instDetails.pg, "configGroup", configGroup)
applyErr := instDetails.inst.ApplyConfig(ctx, config)
err = errors.Join(err, applyErr)
Expand Down

0 comments on commit 716aac7

Please sign in to comment.