diff --git a/pkg/components/metallb/component.go b/pkg/components/metallb/component.go
index c2406660f..cde3e7c7a 100644
--- a/pkg/components/metallb/component.go
+++ b/pkg/components/metallb/component.go
@@ -121,6 +121,7 @@ func (c *component) RenderManifests() (map[string]string, error) {
 		rendered["service.yaml"] = service
 		rendered["service-monitor.yaml"] = serviceMonitor
 		rendered["grafana-dashboard.yaml"] = grafanaDashboard
+		rendered["grafana-alertmanager-rule.yaml"] = metallbPrometheusRule
 	}
 
 	return rendered, nil
diff --git a/pkg/components/metallb/manifests.go b/pkg/components/metallb/manifests.go
index 96032cea8..34ddaf3c4 100644
--- a/pkg/components/metallb/manifests.go
+++ b/pkg/components/metallb/manifests.go
@@ -817,3 +817,47 @@ data:
       "version": 1
     }
 `
+
+// metallbPrometheusRule is the PrometheusRule manifest holding the MetalLB alerts.
+// It defines a single rule group, "metallb-rules", with four alerts: no BGP session,
+// stale configuration, and unavailable controller or speaker pods.
+// NOTE(review): the release/app labels are presumably what prometheus-operator uses to
+// select this rule; verify against the Prometheus ruleSelector.
+const metallbPrometheusRule = `
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  name: alertmanager-rules
+  namespace: metallb-system
+  labels:
+    release: prometheus-operator
+    app: prometheus-operator
+spec:
+  groups:
+  - name: metallb-rules
+    rules:
+    - alert: MetalLBNoBGPSession
+      expr: metallb_bgp_session_up != 1
+      for: 2m
+      annotations:
+        description: '{{ $labels.instance }}: MetalLB has not established a BGP session for more than 2 minutes.'
+        summary: '{{ $labels.instance }}: MetalLB has not established BGP session.'
+    - alert: MetalLBConfigStale
+      expr: metallb_k8s_client_config_stale_bool != 0
+      for: 2m
+      annotations:
+        description: '{{ $labels.instance }}: MetalLB instance has stale configuration.'
+        summary: '{{ $labels.instance }}: MetalLB stale configuration.'
+    - alert: MetalLBControllerPodsAvailability
+      expr: kube_deployment_status_replicas_unavailable{deployment="controller",namespace="metallb-system"} != 0
+      for: 1m
+      annotations:
+        description: '{{ $labels.instance }}: MetalLB Controller pod was not available in the last minute.'
+        summary: '{{ $labels.instance }}: MetalLB Controller deployment pods.'
+    - alert: MetalLBSpeakerPodsAvailability
+      expr: kube_daemonset_status_number_unavailable{daemonset="speaker",namespace="metallb-system"} != 0
+      for: 1m
+      annotations:
+        description: '{{ $labels.instance }}: MetalLB Speaker pod(s) were not available in the last minute.'
+        summary: '{{ $labels.instance }}: MetalLB Speaker daemonset pods.'
+`
diff --git a/test/monitoring/components_alerts_test.go b/test/monitoring/components_alerts_test.go
new file mode 100644
index 000000000..d5b25ff93
--- /dev/null
+++ b/test/monitoring/components_alerts_test.go
@@ -0,0 +1,121 @@
+// Copyright 2020 The Lokomotive Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// +build aws packet
+// +build poste2e
+
+package monitoring
+
+import (
+	"context"
+	"fmt"
+	"reflect"
+	"testing"
+	"time"
+
+	v1 "github.com/prometheus/client_golang/api/prometheus/v1"
+	"k8s.io/apimachinery/pkg/util/wait"
+
+	testutil "github.com/kinvolk/lokomotive/test/components/util"
+)
+
+const (
+	retryInterval  = time.Second * 5 // Pause between two polls of the rules API.
+	timeout        = time.Minute * 9 // Upper bound for the whole polling loop.
+	contextTimeout = 10              // Per-request timeout, in seconds.
+)
+
+// alertTestCase lists the alerts one component is expected to register in a rule group.
+type alertTestCase struct {
+	ComponentName string
+	RuleGroup     string
+	platforms     []testutil.Platform
+	Alerts        []string
+}
+
+// testComponentAlerts checks, per supported component, that Prometheus serves its alerts.
+//nolint:funlen
+func testComponentAlerts(t *testing.T, v1api v1.API) {
+	alertTestCases := []alertTestCase{
+		{
+			ComponentName: "metallb",
+			RuleGroup:     "metallb-rules",
+			platforms:     []testutil.Platform{testutil.PlatformPacket},
+			Alerts: []string{
+				"MetalLBNoBGPSession", "MetalLBConfigStale", "MetalLBControllerPodsAvailability",
+				"MetalLBSpeakerPodsAvailability",
+			},
+		},
+	}
+
+	for _, tc := range alertTestCases {
+		tc := tc
+		t.Run(tc.ComponentName, func(t *testing.T) {
+			t.Parallel()
+
+			if !testutil.IsPlatformSupported(t, tc.platforms) {
+				t.Skip()
+			}
+
+			if err := wait.PollImmediate(
+				retryInterval, timeout, getComponentAlertRetryFunc(t, v1api, tc),
+			); err != nil {
+				t.Fatalf("checking alerts of rule group %q: %v", tc.RuleGroup, err)
+			}
+		})
+	}
+}
+
+// getComponentAlertRetryFunc returns a poll condition which succeeds once the rule group of tc
+// holds exactly the expected alerts, retries while the group is absent and fails on a mismatch.
+func getComponentAlertRetryFunc(t *testing.T, v1api v1.API, tc alertTestCase) func() (done bool, err error) {
+	return func() (done bool, err error) {
+		ctx, cancel := context.WithTimeout(context.Background(), contextTimeout*time.Second)
+		defer cancel()
+
+		result, err := v1api.Rules(ctx)
+		if err != nil {
+			return false, fmt.Errorf("listing rules: %w", err)
+		}
+
+		// Index the alert names found in the cluster by rule group to ease the lookup below.
+		ruleGroups := make(map[string][]string, len(result.Groups))
+
+		for _, ruleGroup := range result.Groups {
+			rules := make([]string, 0, len(ruleGroup.Rules))
+
+			for _, rule := range ruleGroup.Rules {
+				if alert, isAlert := rule.(v1.AlertingRule); isAlert {
+					rules = append(rules, alert.Name)
+				}
+			}
+
+			ruleGroups[ruleGroup.Name] = rules
+		}
+
+		rules, ok := ruleGroups[tc.RuleGroup]
+		if !ok {
+			// Not an error: Prometheus may not have reconciled the rule yet, so only log
+			// and report "not done" to get polled again.
+			t.Logf("error: RuleGroup %q not found. Retrying...", tc.RuleGroup)
+			return false, nil
+		}
+
+		if !reflect.DeepEqual(rules, tc.Alerts) {
+			return false, fmt.Errorf("rules don't match, expected: %#v, got: %#v", tc.Alerts, rules)
+		}
+
+		return true, nil
+	}
+}
diff --git a/test/monitoring/monitoring_test.go b/test/monitoring/monitoring_test.go
index 682e9fb1a..657f24cff 100644
--- a/test/monitoring/monitoring_test.go
+++ b/test/monitoring/monitoring_test.go
@@ -58,6 +58,10 @@ func TestPrometheus(t *testing.T) {
 			Name: "ComponentMetrics",
 			Func: testComponentsPrometheusMetrics,
 		},
+		{
+			Name: "ComponentAlerts",
+			Func: testComponentAlerts,
+		},
 	}
 
 	// Invoke the test functions passing them the test object and the prometheus client.