From 177fb317d2a623c2f0d74597634f8a9b4ed67228 Mon Sep 17 00:00:00 2001 From: Suraj Deshmukh Date: Wed, 11 Mar 2020 22:55:02 +0530 Subject: [PATCH 1/5] metallb: Add alerts for metallb This commit adds a prometheus operator CR PrometheusRule which includes alertmanager rules for metallb. * This sends an alert when it sees that a session is not established for more than 2mins. * This sends an alert when any of the pods have stale config. Signed-off-by: Suraj Deshmukh --- pkg/components/metallb/component.go | 1 + pkg/components/metallb/manifests.go | 27 +++++++++++++++++++++++++++ 2 files changed, 28 insertions(+) diff --git a/pkg/components/metallb/component.go b/pkg/components/metallb/component.go index c2406660f..cde3e7c7a 100644 --- a/pkg/components/metallb/component.go +++ b/pkg/components/metallb/component.go @@ -121,6 +121,7 @@ func (c *component) RenderManifests() (map[string]string, error) { rendered["service.yaml"] = service rendered["service-monitor.yaml"] = serviceMonitor rendered["grafana-dashboard.yaml"] = grafanaDashboard + rendered["grafana-alertmanager-rule.yaml"] = metallbPrometheusRule } return rendered, nil diff --git a/pkg/components/metallb/manifests.go b/pkg/components/metallb/manifests.go index 96032cea8..edf6ab6b6 100644 --- a/pkg/components/metallb/manifests.go +++ b/pkg/components/metallb/manifests.go @@ -817,3 +817,30 @@ data: "version": 1 } ` + +const metallbPrometheusRule = ` +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + name: alertmanager-rules + namespace: metallb-system + labels: + release: prometheus-operator + app: prometheus-operator +spec: + groups: + - name: metallb-rules + rules: + - alert: MetalLBNoBGPSession + expr: metallb_bgp_session_up != 1 + for: 2m + annotations: + description: '{{ $labels.instance }}: MetalLB has not established a BGP session for more than 2 minutes.' + summary: '{{ $labels.instance }}: MetalLB has not established BGP session.' 
+ - alert: MetalLBConfigStale + expr: metallb_k8s_client_config_stale_bool != 0 + for: 2m + annotations: + description: '{{ $labels.instance }}: MetalLB instance has stale configuration.' + summary: '{{ $labels.instance }}: MetalLB stale configuration.' +` From 29c306a5c890229d5c29da17f2455820d5f409c6 Mon Sep 17 00:00:00 2001 From: Suraj Deshmukh Date: Thu, 19 Mar 2020 15:11:39 +0530 Subject: [PATCH 2/5] metallb: Add alert for controller deployment When metallb controller deployment pod is not available for more than a minute then an alert will be triggered. Signed-off-by: Suraj Deshmukh --- pkg/components/metallb/manifests.go | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pkg/components/metallb/manifests.go b/pkg/components/metallb/manifests.go index edf6ab6b6..2873d2f48 100644 --- a/pkg/components/metallb/manifests.go +++ b/pkg/components/metallb/manifests.go @@ -843,4 +843,10 @@ spec: annotations: description: '{{ $labels.instance }}: MetalLB instance has stale configuration.' summary: '{{ $labels.instance }}: MetalLB stale configuration.' + - alert: MetalLBControllerPodsAvailability + expr: kube_deployment_status_replicas_unavailable{deployment="controller",namespace="metallb-system"} != 0 + for: 1m + annotations: + description: '{{ $labels.instance }}: MetalLB Controller pod was not available in the last minute.' + summary: '{{ $labels.instance }}: MetalLB Controller deployment pods.' ` From 0ad7db16f3f279b88267e01ff519e7d8aa27d339 Mon Sep 17 00:00:00 2001 From: Suraj Deshmukh Date: Thu, 19 Mar 2020 15:15:23 +0530 Subject: [PATCH 3/5] metallb: Add alert for speaker daemonset When metallb speaker daemonset pods are not available for more than a minute then an alert will be triggered. 
Signed-off-by: Suraj Deshmukh --- pkg/components/metallb/manifests.go | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pkg/components/metallb/manifests.go b/pkg/components/metallb/manifests.go index 2873d2f48..34ddaf3c4 100644 --- a/pkg/components/metallb/manifests.go +++ b/pkg/components/metallb/manifests.go @@ -849,4 +849,10 @@ spec: annotations: description: '{{ $labels.instance }}: MetalLB Controller pod was not available in the last minute.' summary: '{{ $labels.instance }}: MetalLB Controller deployment pods.' + - alert: MetalLBSpeakerPodsAvailability + expr: kube_daemonset_status_number_unavailable{daemonset="speaker",namespace="metallb-system"} != 0 + for: 1m + annotations: + description: '{{ $labels.instance }}: MetalLB Speaker pod(s) were not available in the last minute.' + summary: '{{ $labels.instance }}: MetalLB Speaker daemonset pods.' ` From a4438540bfc68a0c6860d129f3f57d2e4d529fb2 Mon Sep 17 00:00:00 2001 From: Suraj Deshmukh Date: Wed, 18 Mar 2020 14:30:30 +0530 Subject: [PATCH 4/5] e2e prometheus: Add metallb test to check alerts Signed-off-by: Suraj Deshmukh --- test/monitoring/components_alerts_test.go | 121 ++++++++++++++++++++++ 1 file changed, 121 insertions(+) create mode 100644 test/monitoring/components_alerts_test.go diff --git a/test/monitoring/components_alerts_test.go b/test/monitoring/components_alerts_test.go new file mode 100644 index 000000000..d5b25ff93 --- /dev/null +++ b/test/monitoring/components_alerts_test.go @@ -0,0 +1,121 @@ +// Copyright 2020 The Lokomotive Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build aws packet
+// +build poste2e
+
+package monitoring
+
+import (
+	"context"
+	"fmt"
+	"reflect"
+	"testing"
+	"time"
+
+	v1 "github.com/prometheus/client_golang/api/prometheus/v1"
+	"k8s.io/apimachinery/pkg/util/wait"
+
+	testutil "github.com/kinvolk/lokomotive/test/components/util"
+)
+
+// Polling parameters used while waiting for Prometheus to load component alert rules.
+const (
+	retryInterval  = time.Second * 5
+	timeout        = time.Minute * 9
+	contextTimeout = 10 // In seconds; scaled by time.Second where the context is created.
+)
+
+// alertTestCase names the alerts expected in one component's Prometheus rule group.
+type alertTestCase struct {
+	ComponentName string
+	RuleGroup     string
+	platforms     []testutil.Platform
+	Alerts        []string
+}
+
+// testComponentAlerts polls Prometheus until each component's rule group holds the expected alerts.
+//nolint:funlen
+func testComponentAlerts(t *testing.T, v1api v1.API) {
+	alertTestCases := []alertTestCase{
+		{
+			ComponentName: "metallb",
+			RuleGroup:     "metallb-rules",
+			platforms:     []testutil.Platform{testutil.PlatformPacket},
+			Alerts: []string{
+				"MetalLBNoBGPSession", "MetalLBConfigStale", "MetalLBControllerPodsAvailability",
+				"MetalLBSpeakerPodsAvailability",
+			},
+		},
+	}
+
+	for _, tc := range alertTestCases {
+		tc := tc
+		t.Run(tc.ComponentName, func(t *testing.T) {
+			t.Parallel()
+
+			if !testutil.IsPlatformSupported(t, tc.platforms) {
+				t.Skip()
+			}
+
+			retryFunc := getComponentAlertRetryFunc(t, v1api, tc)
+			if err := wait.PollImmediate(retryInterval, timeout, retryFunc); err != nil {
+				t.Fatalf("%v", err)
+			}
+		})
+	}
+}
+
+// getComponentAlertRetryFunc returns a poll condition: not done while tc.RuleGroup is missing, done
+// when its alerting rules equal tc.Alerts in order, and an error on a listing failure or mismatch.
+func getComponentAlertRetryFunc(t *testing.T, v1api v1.API, tc alertTestCase) func() (done bool, err error) {
+	return func() (done bool, err error) {
+		ctx, cancel := context.WithTimeout(context.Background(), contextTimeout*time.Second)
+		defer cancel()
+
+		result, err := v1api.Rules(ctx)
+		if err != nil {
+			return false, fmt.Errorf("error listing rules: %v", err)
+		}
+
+		// Index the alert names by rule group so the test case can be looked up directly.
+		ruleGroups := make(map[string][]string, len(result.Groups))
+
+		for _, ruleGroup := range result.Groups {
+			rules := make([]string, 0, len(ruleGroup.Rules))
+
+			for _, rule := range ruleGroup.Rules {
+				if alert, ok := rule.(v1.AlertingRule); ok {
+					rules = append(rules, alert.Name)
+				}
+			}
+
+			ruleGroups[ruleGroup.Name] = rules
+		}
+
+		rules, ok := ruleGroups[tc.RuleGroup]
+		if !ok {
+			// Not an error: Prometheus may not have reconciled the rule yet, so report
+			// "not done" and let the poller try again.
+			t.Logf("error: RuleGroup %q not found. Retrying...", tc.RuleGroup)
+			return false, nil
+		}
+
+		if !reflect.DeepEqual(rules, tc.Alerts) {
+			return false, fmt.Errorf("rules don't match: expected %#v, got %#v", tc.Alerts, rules)
+		}
+
+		return true, nil
+	}
+}
+
From 922554bf7c519023f668c5da45cf130d32e1a60d Mon Sep 17 00:00:00 2001
From: Suraj Deshmukh
Date: Fri, 20 Mar 2020 18:41:26 +0530
Subject: [PATCH 5/5] prometheus e2e test: Register Component Alerts

Signed-off-by: Suraj Deshmukh
---
 test/monitoring/monitoring_test.go | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/test/monitoring/monitoring_test.go b/test/monitoring/monitoring_test.go
index 682e9fb1a..657f24cff 100644
--- a/test/monitoring/monitoring_test.go
+++ b/test/monitoring/monitoring_test.go
@@ -58,6 +58,10 @@ func TestPrometheus(t *testing.T) {
 			Name: "ComponentMetrics",
 			Func: testComponentsPrometheusMetrics,
 		},
+		{
+			Name: "ComponentAlerts",
+			Func: testComponentAlerts,
+		},
 	}
 
 	// Invoke the test functions passing them the test object and the prometheus client.