Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Track un-active instrumentation for better reporting #2016

Merged
merged 4 commits into from
Dec 17, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 30 additions & 16 deletions instrumentation/manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,9 @@ var (
type ConfigUpdate[configGroup ConfigGroup] map[configGroup]Config

type instrumentationDetails[processGroup ProcessGroup, configGroup ConfigGroup] struct {
// we want to track the instrumentation even if it failed to load, to be able to report the error
// and clean up the reporter resources once the process exits.
// hence, this might be nil if the instrumentation failed to load.
inst Instrumentation
pg processGroup
cg configGroup
Expand Down Expand Up @@ -75,11 +78,11 @@ type manager[processGroup ProcessGroup, configGroup ConfigGroup] struct {
factories map[OtelDistribution]Factory
logger logr.Logger

// all the active instrumentations by pid,
// all the created instrumentations by pid,
// this map is not concurrent safe, so it should be accessed only from the main event loop
detailsByPid map[int]*instrumentationDetails[processGroup, configGroup]

// active instrumentations by workload, and aggregated by pid
// instrumentations by workload, and aggregated by pid
// this map is not concurrent safe, so it should be accessed only from the main event loop
detailsByWorkload map[configGroup]map[int]*instrumentationDetails[processGroup, configGroup]

Expand Down Expand Up @@ -142,6 +145,9 @@ func (m *manager[ProcessGroup, ConfigGroup]) runEventLoop(ctx context.Context) {
case <-ctx.Done():
m.logger.Info("stopping eBPF instrumentation manager")
for pid, details := range m.detailsByPid {
if details.inst == nil {
continue
}
err := details.inst.Close(ctx)
if err != nil {
m.logger.Error(err, "failed to close instrumentation", "pid", pid)
Expand Down Expand Up @@ -201,12 +207,14 @@ func (m *manager[ProcessGroup, ConfigGroup]) cleanInstrumentation(ctx context.Co

m.logger.Info("cleaning instrumentation resources", "pid", pid, "process group details", details.pg)

err := details.inst.Close(ctx)
if err != nil {
m.logger.Error(err, "failed to close instrumentation")
if details.inst != nil {
err := details.inst.Close(ctx)
if err != nil {
m.logger.Error(err, "failed to close instrumentation")
}
}

err = m.handler.Reporter.OnExit(ctx, pid, details.pg)
err := m.handler.Reporter.OnExit(ctx, pid, details.pg)
if err != nil {
m.logger.Error(err, "failed to report instrumentation exit")
}
Expand All @@ -215,7 +223,7 @@ func (m *manager[ProcessGroup, ConfigGroup]) cleanInstrumentation(ctx context.Co
}

func (m *manager[ProcessGroup, ConfigGroup]) handleProcessExecEvent(ctx context.Context, e detector.ProcessEvent) error {
if _, found := m.detailsByPid[e.PID]; found {
if details, found := m.detailsByPid[e.PID]; found && details.inst != nil {
// this can happen if we have multiple exec events for the same pid (chain loading)
// TODO: better handle this?
// this can be done by first closing the existing instrumentation,
Expand Down Expand Up @@ -266,21 +274,24 @@ func (m *manager[ProcessGroup, ConfigGroup]) handleProcessExecEvent(ctx context.
return err
}

err = inst.Load(ctx)
// call the reporter regardless of the load result - as we want to report the load status
reporterErr := m.handler.Reporter.OnLoad(ctx, e.PID, err, pg)
if err != nil {
loadErr := inst.Load(ctx)

reporterErr := m.handler.Reporter.OnLoad(ctx, e.PID, loadErr, pg)
if reporterErr != nil {
m.logger.Error(reporterErr, "failed to report instrumentation load", "loaded", loadErr == nil, "pid", e.PID, "process group details", pg)
}
if loadErr != nil {
// we need to track the instrumentation even if the load failed.
// consider a reporter which writes a persistent record for a failed/successful load
// we need to notify the reporter once that PID exits to clean up the resources - hence we track it.
// saving the inst as nil marking the instrumentation failed to load, and is not valid to run/configure/close.
m.startTrackInstrumentation(e.PID, nil, pg, configGroup)
m.logger.Error(err, "failed to load instrumentation", "language", otelDisto.Language, "sdk", otelDisto.OtelSdk)
// TODO: should we return here the load error? or the instance write error? or both?
return err
}

if reporterErr != nil {
m.logger.Error(reporterErr, "failed to report instrumentation load")
}

m.startTrackInstrumentation(e.PID, inst, pg, configGroup)

m.logger.Info("instrumentation loaded", "pid", e.PID, "process group details", pg)

go func() {
Expand Down Expand Up @@ -337,6 +348,9 @@ func (m *manager[ProcessGroup, ConfigGroup]) applyInstrumentationConfigurationFo
}

for _, instDetails := range configGroupInstrumentations {
if instDetails.inst == nil {
continue
}
m.logger.Info("applying configuration to instrumentation", "process group details", instDetails.pg, "configGroup", configGroup)
applyErr := instDetails.inst.ApplyConfig(ctx, config)
err = errors.Join(err, applyErr)
Expand Down
Loading