diff --git a/CHANGELOG.next.asciidoc b/CHANGELOG.next.asciidoc
index 6e585dcceec..c4280950a80 100644
--- a/CHANGELOG.next.asciidoc
+++ b/CHANGELOG.next.asciidoc
@@ -55,6 +55,7 @@ https://github.com/elastic/beats/compare/v8.8.1\...main[Check the HEAD diff]
 - Remove fallback to the node limit for the `kubernetes.pod.cpu.usage.limit.pct` and `kubernetes.pod.memory.usage.limit.pct` metrics calculation
 - Add support for Kibana status metricset in v8 format {pull}40275[40275]
 - Update metrics for the vSphere Host metricset. {pull}40429[40429]
+- Mark system process metricsets as running if metrics are partially available {pull}40565[40565]
 - Added back `elasticsearch.node.stats.jvm.mem.pools.*` to the `node_stats` metricset {pull}40571[40571]
 
 *Osquerybeat*
diff --git a/metricbeat/mb/event.go b/metricbeat/mb/event.go
index 98430732ef4..fb6907b6396 100644
--- a/metricbeat/mb/event.go
+++ b/metricbeat/mb/event.go
@@ -214,3 +214,20 @@ func tryToMapStr(v interface{}) (mapstr.M, bool) {
 		return nil, false
 	}
 }
+
+// PartialMetricsError indicates that metrics are only partially filled.
+// This will be removed once we fix the underlying problem.
+// See https://github.com/elastic/beats/issues/40542 for more details.
+type PartialMetricsError struct {
+	Err error
+}
+
+// Error returns the message of the wrapped error.
+func (p PartialMetricsError) Error() string {
+	return p.Err.Error()
+}
+
+// Unwrap returns the wrapped error so that errors.Is and errors.As can inspect it.
+func (p PartialMetricsError) Unwrap() error {
+	return p.Err
+}
diff --git a/metricbeat/mb/module/wrapper.go b/metricbeat/mb/module/wrapper.go
index 5243d956365..95185817f5f 100644
--- a/metricbeat/mb/module/wrapper.go
+++ b/metricbeat/mb/module/wrapper.go
@@ -19,6 +19,7 @@ package module
 
 import (
 	"context"
+	"errors"
 	"fmt"
 	"math/rand"
 	"sync"
@@ -255,7 +256,13 @@ func (msw *metricSetWrapper) fetch(ctx context.Context, reporter reporter) {
 		err := fetcher.Fetch(reporter.V2())
 		if err != nil {
 			reporter.V2().Error(err)
-			msw.module.UpdateStatus(status.Degraded, fmt.Sprintf("Error fetching data for metricset %s.%s: %v", msw.module.Name(), msw.MetricSet.Name(), err))
+			if errors.As(err, &mb.PartialMetricsError{}) {
+				// mark module as running if metrics are partially available and display the error message
+				msw.module.UpdateStatus(status.Running, fmt.Sprintf("Error fetching data for metricset %s.%s: %v", msw.module.Name(), msw.MetricSet.Name(), err))
+			} else {
+				// mark it as degraded for any other issue encountered
+				msw.module.UpdateStatus(status.Degraded, fmt.Sprintf("Error fetching data for metricset %s.%s: %v", msw.module.Name(), msw.MetricSet.Name(), err))
+			}
 			logp.Err("Error fetching data for metricset %s.%s: %s", msw.module.Name(), msw.Name(), err)
 		} else {
 			msw.module.UpdateStatus(status.Running, "")
@@ -265,7 +272,13 @@ func (msw *metricSetWrapper) fetch(ctx context.Context, reporter reporter) {
 		err := fetcher.Fetch(ctx, reporter.V2())
 		if err != nil {
 			reporter.V2().Error(err)
-			msw.module.UpdateStatus(status.Degraded, fmt.Sprintf("Error fetching data for metricset %s.%s: %v", msw.module.Name(), msw.MetricSet.Name(), err))
+			if errors.As(err, &mb.PartialMetricsError{}) {
+				// mark module as running if metrics are partially available and display the error message
+				msw.module.UpdateStatus(status.Running, fmt.Sprintf("Error fetching data for metricset %s.%s: %v", msw.module.Name(), msw.MetricSet.Name(), err))
+			} else {
+				// mark it as degraded for any other issue encountered
+				msw.module.UpdateStatus(status.Degraded, fmt.Sprintf("Error fetching data for metricset %s.%s: %v", msw.module.Name(), msw.MetricSet.Name(), err))
+			}
 			logp.Err("Error fetching data for metricset %s.%s: %s", msw.module.Name(), msw.Name(), err)
 		} else {
 			msw.module.UpdateStatus(status.Running, "")
diff --git a/metricbeat/module/system/process/process.go b/metricbeat/module/system/process/process.go
index 684c87059c9..01c8480656d 100644
--- a/metricbeat/module/system/process/process.go
+++ b/metricbeat/module/system/process/process.go
@@ -115,6 +115,8 @@ func (m *MetricSet) Fetch(r mb.ReporterV2) error {
 	if err != nil && !errors.Is(err, process.NonFatalErr{}) {
 		// return only if the error is fatal in nature
 		return fmt.Errorf("process stats: %w", err)
+	} else if err != nil {
+		err = mb.PartialMetricsError{Err: err}
 	}
 
 	for evtI := range procs {
diff --git a/metricbeat/module/system/process_summary/process_summary.go b/metricbeat/module/system/process_summary/process_summary.go
index cbf1c63a2fe..cc8d5e38552 100644
--- a/metricbeat/module/system/process_summary/process_summary.go
+++ b/metricbeat/module/system/process_summary/process_summary.go
@@ -73,6 +73,8 @@ func (m *MetricSet) Fetch(r mb.ReporterV2) error {
 	if degradeErr != nil && !errors.Is(degradeErr, process.NonFatalErr{}) {
 		// return only if the error is fatal in nature
 		return fmt.Errorf("error fetching process list: %w", degradeErr)
+	} else if degradeErr != nil {
+		degradeErr = mb.PartialMetricsError{Err: degradeErr}
 	}
 
 	procStates := map[string]int{}