From 819e02e6ff013ac46d8a0a902e8b7a31a193feb2 Mon Sep 17 00:00:00 2001 From: Sven Rebhan Date: Mon, 29 Apr 2024 22:07:19 +0200 Subject: [PATCH] fix(inputs.procstat): Do not report dead processes as running for orphan PID files --- plugins/inputs/procstat/filter.go | 32 +++++++++++++--------- plugins/inputs/procstat/procstat.go | 21 +++++++++----- plugins/inputs/procstat/service_finders.go | 22 ++++++++++++--- 3 files changed, 51 insertions(+), 24 deletions(-) diff --git a/plugins/inputs/procstat/filter.go b/plugins/inputs/procstat/filter.go index 13a09d004d996..f7b89c12a811b 100644 --- a/plugins/inputs/procstat/filter.go +++ b/plugins/inputs/procstat/filter.go @@ -7,28 +7,32 @@ import ( "strconv" "strings" - "github.com/influxdata/telegraf/filter" "github.com/shirou/gopsutil/v3/process" + + "github.com/influxdata/telegraf" + "github.com/influxdata/telegraf/filter" ) type Filter struct { - Name string `toml:"name"` - PidFiles []string `toml:"pid_files"` - SystemdUnits []string `toml:"systemd_units"` - SupervisorUnits []string `toml:"supervisor_units"` - WinService []string `toml:"win_services"` - CGroups []string `toml:"cgroups"` - Patterns []string `toml:"patterns"` - Users []string `toml:"users"` - Executables []string `toml:"executables"` - ProcessNames []string `toml:"process_names"` - RecursionDepth int `toml:"recursion_depth"` + Name string `toml:"name"` + PidFiles []string `toml:"pid_files"` + SystemdUnits []string `toml:"systemd_units"` + SupervisorUnits []string `toml:"supervisor_units"` + WinService []string `toml:"win_services"` + CGroups []string `toml:"cgroups"` + Patterns []string `toml:"patterns"` + Users []string `toml:"users"` + Executables []string `toml:"executables"` + ProcessNames []string `toml:"process_names"` + RecursionDepth int `toml:"recursion_depth"` + Log telegraf.Logger `toml:"-"` filterSupervisorUnit string filterCmds []*regexp.Regexp filterUser filter.Filter filterExecutable filter.Filter filterProcessName filter.Filter + finder *processFinder } func (f *Filter) Init() error { @@ -80,6 +84,8 @@ func (f *Filter) Init() error { return fmt.Errorf("compiling process-names filter for %q failed: %w", f.Name, err) } + // Setup the process finder + f.finder = newProcessFinder(f.Log) return nil } @@ -89,7 +95,7 @@ func (f *Filter) ApplyFilter() ([]processGroup, error) { var groups []processGroup switch { case len(f.PidFiles) > 0: - g, err := findByPidFiles(f.PidFiles) + g, err := f.finder.findByPidFiles(f.PidFiles) if err != nil { return nil, err } diff --git a/plugins/inputs/procstat/procstat.go b/plugins/inputs/procstat/procstat.go index fdbed339f4281..ab17aea37a37a 100644 --- a/plugins/inputs/procstat/procstat.go +++ b/plugins/inputs/procstat/procstat.go @@ -149,6 +149,7 @@ func (p *Procstat) Init() error { // New-style operations for i := range p.Filter { + p.Filter[i].Log = p.Log if err := p.Filter[i].Init(); err != nil { return fmt.Errorf("initializing filter %d failed: %w", i, err) } @@ -200,17 +201,23 @@ func (p *Procstat) gatherOld(acc telegraf.Accumulator) error { } count += len(r.PIDs) for _, pid := range r.PIDs { + // Check if the process is still running + proc, err := p.createProcess(pid) + if err != nil { + // No problem; process may have ended after we found it or it + // might be delivered from a non-checking source like a PID file + // of a dead process. + continue + } + // Use the cached processes as we need the existing instances // to compute delta-metrics (e.g. cpu-usage). - proc, found := p.processes[pid] - if !found { + if cached, found := p.processes[pid]; found { + proc = cached + } else { // We've found a process that was not recorded before so add it // to the list of processes - proc, err = p.createProcess(pid) - if err != nil { - // No problem; process may have ended after we found it - continue - } + // Assumption: if a process has no name, it probably does not exist if name, _ := proc.Name(); name == "" { continue diff --git a/plugins/inputs/procstat/service_finders.go b/plugins/inputs/procstat/service_finders.go index 8daf07a7e37fc..50a0dc8bdfe11 100644 --- a/plugins/inputs/procstat/service_finders.go +++ b/plugins/inputs/procstat/service_finders.go @@ -8,10 +8,23 @@ import ( "strconv" "strings" + "github.com/influxdata/telegraf" "github.com/shirou/gopsutil/v3/process" ) -func findByPidFiles(paths []string) ([]processGroup, error) { +type processFinder struct { + errPidFiles map[string]bool + log telegraf.Logger +} + +func newProcessFinder(log telegraf.Logger) *processFinder { + return &processFinder{ + errPidFiles: make(map[string]bool), + log: log, + } +} + +func (f *processFinder) findByPidFiles(paths []string) ([]processGroup, error) { groups := make([]processGroup, 0, len(paths)) for _, path := range paths { buf, err := os.ReadFile(path) @@ -24,8 +37,9 @@ func findByPidFiles(paths []string) ([]processGroup, error) { } p, err := process.NewProcess(int32(pid)) - if err != nil { - return nil, fmt.Errorf("failed to find process for PID %d of file %q: %w", pid, path, err) + if err != nil && !f.errPidFiles[path] { + f.log.Errorf("failed to find process for PID %d of file %q: %v", pid, path, err) + f.errPidFiles[path] = true } groups = append(groups, processGroup{ processes: []*process.Process{p}, @@ -46,7 +60,7 @@ func findByCgroups(cgroups []string) ([]processGroup, error) { files, err := filepath.Glob(path) if err != nil { - return nil, fmt.Errorf("failed to determin files for cgroup %q: %w", cgroup, err) + return nil, fmt.Errorf("failed to determine files for cgroup %q: %w", cgroup, err) } for _, fpath := range files {