From 1ea385c682eca4cb26bff919e8664ae55a651464 Mon Sep 17 00:00:00 2001 From: Vihas Makwana Date: Tue, 20 Aug 2024 18:17:50 +0530 Subject: [PATCH 1/6] chore: mark module as healthy if metrics are partially filled --- metricbeat/mb/event.go | 15 +++++++++++++++ metricbeat/mb/module/wrapper.go | 8 +++++++- metricbeat/module/system/process/process.go | 2 +- .../system/process_summary/process_summary.go | 2 +- 4 files changed, 24 insertions(+), 3 deletions(-) diff --git a/metricbeat/mb/event.go b/metricbeat/mb/event.go index 98430732ef4..a7e1b2afb30 100644 --- a/metricbeat/mb/event.go +++ b/metricbeat/mb/event.go @@ -214,3 +214,18 @@ func tryToMapStr(v interface{}) (mapstr.M, bool) { return nil, false } } + +// Following error indicates that the metrics are partially filled. +// This will be removed once we fix the underlying problem. +// See https://github.com/elastic/beats/issues/40542 for more details. +type PartialMetricsError struct { + Err error +} + +func (p PartialMetricsError) Error() string { + return p.Err.Error() +} + +func (p PartialMetricsError) Unwrap() error { + return p.Err +} diff --git a/metricbeat/mb/module/wrapper.go b/metricbeat/mb/module/wrapper.go index 5243d956365..4fbae112e56 100644 --- a/metricbeat/mb/module/wrapper.go +++ b/metricbeat/mb/module/wrapper.go @@ -265,7 +265,13 @@ func (msw *metricSetWrapper) fetch(ctx context.Context, reporter reporter) { err := fetcher.Fetch(ctx, reporter.V2()) if err != nil { reporter.V2().Error(err) - msw.module.UpdateStatus(status.Degraded, fmt.Sprintf("Error fetching data for metricset %s.%s: %v", msw.module.Name(), msw.MetricSet.Name(), err)) + if _, ok := err.(mb.PartialMetricsError); ok { + // mark module as running if metrics are partially available and display the error message + msw.module.UpdateStatus(status.Running, fmt.Sprintf("Error fetching data for metricset %s.%s: %v", msw.module.Name(), msw.MetricSet.Name(), err)) + } else { + // mark it as degraded for any other issue encountered + msw.module.UpdateStatus(status.Degraded, fmt.Sprintf("Error fetching data for metricset %s.%s: %v", msw.module.Name(), msw.MetricSet.Name(), err)) + } logp.Err("Error fetching data for metricset %s.%s: %s", msw.module.Name(), msw.Name(), err) } else { msw.module.UpdateStatus(status.Running, "") diff --git a/metricbeat/module/system/process/process.go b/metricbeat/module/system/process/process.go index 684c87059c9..ddd1a81eaa0 100644 --- a/metricbeat/module/system/process/process.go +++ b/metricbeat/module/system/process/process.go @@ -126,7 +126,7 @@ func (m *MetricSet) Fetch(r mb.ReporterV2) error { return err } } - return err + return mb.PartialMetricsError{Err: err} } else { proc, root, err := m.stats.GetOneRootEvent(m.setpid) if err != nil { diff --git a/metricbeat/module/system/process_summary/process_summary.go b/metricbeat/module/system/process_summary/process_summary.go index cbf1c63a2fe..645496c5f6e 100644 --- a/metricbeat/module/system/process_summary/process_summary.go +++ b/metricbeat/module/system/process_summary/process_summary.go @@ -103,7 +103,7 @@ func (m *MetricSet) Fetch(r mb.ReporterV2) error { MetricSetFields: outMap, }) - return degradeErr + return mb.PartialMetricsError{Err: degradeErr} } // threadStats returns a map of state counts for running threads on a system From c852306c072a6128ed19894fd986d6d7adf42878 Mon Sep 17 00:00:00 2001 From: Vihas Makwana Date: Wed, 21 Aug 2024 14:48:50 +0530 Subject: [PATCH 2/6] chore: mark module as healthy if metrics are partially filled --- metricbeat/mb/module/wrapper.go | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/metricbeat/mb/module/wrapper.go b/metricbeat/mb/module/wrapper.go index 4fbae112e56..09190ef5ff7 100644 --- a/metricbeat/mb/module/wrapper.go +++ b/metricbeat/mb/module/wrapper.go @@ -255,7 +255,13 @@ func (msw *metricSetWrapper) fetch(ctx context.Context, reporter reporter) { err := fetcher.Fetch(reporter.V2()) if err != nil { reporter.V2().Error(err) - msw.module.UpdateStatus(status.Degraded, fmt.Sprintf("Error fetching data for metricset %s.%s: %v", msw.module.Name(), msw.MetricSet.Name(), err)) + if _, ok := err.(mb.PartialMetricsError); ok { + // mark module as running if metrics are partially available and display the error message + msw.module.UpdateStatus(status.Running, fmt.Sprintf("Error fetching data for metricset %s.%s: %v", msw.module.Name(), msw.MetricSet.Name(), err)) + } else { + // mark it as degraded for any other issue encountered + msw.module.UpdateStatus(status.Degraded, fmt.Sprintf("Error fetching data for metricset %s.%s: %v", msw.module.Name(), msw.MetricSet.Name(), err)) + } logp.Err("Error fetching data for metricset %s.%s: %s", msw.module.Name(), msw.Name(), err) } else { msw.module.UpdateStatus(status.Running, "") From 3411bab962c2e88a02760793d8f2ce0adc17525d Mon Sep 17 00:00:00 2001 From: Vihas Makwana Date: Wed, 21 Aug 2024 21:13:37 +0530 Subject: [PATCH 3/6] fix: use errors.As --- metricbeat/mb/module/wrapper.go | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/metricbeat/mb/module/wrapper.go b/metricbeat/mb/module/wrapper.go index 09190ef5ff7..95185817f5f 100644 --- a/metricbeat/mb/module/wrapper.go +++ b/metricbeat/mb/module/wrapper.go @@ -19,6 +19,7 @@ package module import ( "context" + "errors" "fmt" "math/rand" "sync" @@ -255,7 +256,7 @@ func (msw *metricSetWrapper) fetch(ctx context.Context, reporter reporter) { err := fetcher.Fetch(reporter.V2()) if err != nil { reporter.V2().Error(err) - if _, ok := err.(mb.PartialMetricsError); ok { + if errors.As(err, &mb.PartialMetricsError{}) { // mark module as running if metrics are partially available and display the error message msw.module.UpdateStatus(status.Running, fmt.Sprintf("Error fetching data for metricset %s.%s: %v", msw.module.Name(), msw.MetricSet.Name(), err)) } else { @@ -271,7 +272,7 @@ func (msw *metricSetWrapper) fetch(ctx context.Context, reporter reporter) { err := fetcher.Fetch(ctx, reporter.V2()) if err != nil { reporter.V2().Error(err) - if _, ok := err.(mb.PartialMetricsError); ok { + if errors.As(err, &mb.PartialMetricsError{}) { // mark module as running if metrics are partially available and display the error message msw.module.UpdateStatus(status.Running, fmt.Sprintf("Error fetching data for metricset %s.%s: %v", msw.module.Name(), msw.MetricSet.Name(), err)) } else { From 087b64a7c9adac94921fca469cfd9b49d244c3ef Mon Sep 17 00:00:00 2001 From: Vihas Makwana Date: Wed, 21 Aug 2024 22:02:38 +0530 Subject: [PATCH 4/6] fix: fix lint --- metricbeat/module/system/process/process.go | 4 +++- metricbeat/module/system/process_summary/process_summary.go | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/metricbeat/module/system/process/process.go b/metricbeat/module/system/process/process.go index ddd1a81eaa0..01c8480656d 100644 --- a/metricbeat/module/system/process/process.go +++ b/metricbeat/module/system/process/process.go @@ -115,6 +115,8 @@ func (m *MetricSet) Fetch(r mb.ReporterV2) error { if err != nil && !errors.Is(err, process.NonFatalErr{}) { // return only if the error is fatal in nature return fmt.Errorf("process stats: %w", err) + } else if (err != nil && errors.Is(err, process.NonFatalErr{})) { + err = mb.PartialMetricsError{Err: err} } for evtI := range procs { @@ -126,7 +128,7 @@ func (m *MetricSet) Fetch(r mb.ReporterV2) error { return err } } - return mb.PartialMetricsError{Err: err} + return err } else { proc, root, err := m.stats.GetOneRootEvent(m.setpid) if err != nil { diff --git a/metricbeat/module/system/process_summary/process_summary.go b/metricbeat/module/system/process_summary/process_summary.go index 645496c5f6e..cc8d5e38552 100644 --- a/metricbeat/module/system/process_summary/process_summary.go +++ b/metricbeat/module/system/process_summary/process_summary.go @@ -73,6 +73,8 @@ func (m *MetricSet) Fetch(r mb.ReporterV2) error { if degradeErr != nil && !errors.Is(degradeErr, process.NonFatalErr{}) { // return only if the error is fatal in nature return fmt.Errorf("error fetching process list: %w", degradeErr) + } else if (degradeErr != nil && errors.Is(degradeErr, process.NonFatalErr{})) { + degradeErr = mb.PartialMetricsError{Err: degradeErr} } procStates := map[string]int{} @@ -103,7 +105,7 @@ func (m *MetricSet) Fetch(r mb.ReporterV2) error { MetricSetFields: outMap, }) - return mb.PartialMetricsError{Err: degradeErr} + return degradeErr } // threadStats returns a map of state counts for running threads on a system From 2a06a204199475d0531fbe833309461577782156 Mon Sep 17 00:00:00 2001 From: VihasMakwana <121151420+VihasMakwana@users.noreply.github.com> Date: Fri, 23 Aug 2024 12:49:30 +0530 Subject: [PATCH 5/6] Update metricbeat/mb/event.go Co-authored-by: Mauri de Souza Meneguzzo --- metricbeat/mb/event.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/metricbeat/mb/event.go b/metricbeat/mb/event.go index a7e1b2afb30..fb6907b6396 100644 --- a/metricbeat/mb/event.go +++ b/metricbeat/mb/event.go @@ -215,7 +215,7 @@ func tryToMapStr(v interface{}) (mapstr.M, bool) { } } -// Following error indicates that the metrics are partially filled. +// PartialMetricsError indicates that metrics are only partially filled. // This will be removed once we fix the underlying problem. // See https://github.com/elastic/beats/issues/40542 for more details. type PartialMetricsError struct { From df2b78cf874747285beddf81da4f52885d596f9e Mon Sep 17 00:00:00 2001 From: Vihas Makwana Date: Fri, 23 Aug 2024 12:53:08 +0530 Subject: [PATCH 6/6] fix: changelog --- CHANGELOG.next.asciidoc | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.next.asciidoc b/CHANGELOG.next.asciidoc index 503667dc384..f07a835c1ad 100644 --- a/CHANGELOG.next.asciidoc +++ b/CHANGELOG.next.asciidoc @@ -48,6 +48,7 @@ https://github.com/elastic/beats/compare/v8.8.1\...main[Check the HEAD diff] - Allow metricsets to report their status via control v2 protocol. {pull}40025[40025] - Remove fallback to the node limit for the `kubernetes.pod.cpu.usage.limit.pct` and `kubernetes.pod.memory.usage.limit.pct` metrics calculation - Add support for Kibana status metricset in v8 format {pull}40275[40275] +- Mark system process metricsets as running if metrics are partially available {pull}40565[40565] *Osquerybeat*