From e07d075be00e6eca7f34915c2f5851dd0adf898e Mon Sep 17 00:00:00 2001
From: gabemontero
Date: Wed, 20 Sep 2017 13:02:28 -0400
Subject: [PATCH] prometheus alerts for openshift build subsystem

---
 examples/prometheus/prometheus.yaml           |  23 ++-
 pkg/oc/bootstrap/bindata.go                   |  23 ++-
 test/extended/prometheus/prometheus_builds.go | 179 ++++++++++--------
 test/extended/testdata/bindata.go             |  23 ++-
 4 files changed, 163 insertions(+), 85 deletions(-)

diff --git a/examples/prometheus/prometheus.yaml b/examples/prometheus/prometheus.yaml
index cb453189c85a..ca8a64ca3daa 100644
--- a/examples/prometheus/prometheus.yaml
+++ b/examples/prometheus/prometheus.yaml
@@ -30,6 +30,15 @@ parameters:
   name: SESSION_SECRET
   generate: expression
   from: "[a-zA-Z0-9]{43}"
+- description: The threshold for the active build alert
+  name: PROMETHEUS_ACTIVE_BUILD_ALERT_THRESHOLD
+  value: "100"
+- description: The allowable active build duration for the active build alert
+  name: PROMETHEUS_ACTIVE_BUILD_ALERT_DURATION
+  value: "360"
+- description: The allowable ratio of failed to complete builds for the failed build alert
+  name: PROMETHEUS_FAILED_BUILD_RATIO
+  value: "0.5"
 objects:
 # Authorize the prometheus service account to read data about the cluster
 - apiVersion: v1
@@ -243,12 +252,22 @@ objects:
       - name: example-rules
         interval: 30s # defaults to global interval
         rules:
-        - alert: Node Down
+        - alert: NodeDown
          expr: up{job="kubernetes-nodes"} == 0
          annotations:
            miqTarget: "ContainerNode"
            severity: "HIGH"
-           message: "{{$labels.instance}} is down"
+           message: "{{$labels.instance}} is down"
+        - alert: HangingBuild
+          expr: count(openshift_build_running_phase_start_time_seconds{} < time() - ${PROMETHEUS_ACTIVE_BUILD_ALERT_DURATION}) > ${PROMETHEUS_ACTIVE_BUILD_ALERT_THRESHOLD}
+          annotations:
+            severity: "MEDIUM"
+            message: "{{$labels.instance}} indicates at least ${PROMETHEUS_ACTIVE_BUILD_ALERT_THRESHOLD} OpenShift builds are taking more than ${PROMETHEUS_ACTIVE_BUILD_ALERT_DURATION} seconds to complete"
+        - alert: BuildFailureRate
+          expr: (count(openshift_build_terminal_phase_total{phase="failed"}) / count(openshift_build_terminal_phase_total{phase="complete"})) > ${PROMETHEUS_FAILED_BUILD_RATIO}
+          annotations:
+            severity: "MEDIUM"
+            message: "{{$labels.instance}} shows that the ratio of failed to complete OpenShift builds exceeds ${PROMETHEUS_FAILED_BUILD_RATIO}"
     prometheus.yml: |
       rule_files:
         - 'prometheus.rules'
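Note on the HangingBuild expression above: it counts the series of openshift_build_running_phase_start_time_seconds whose running-phase start timestamp is more than PROMETHEUS_ACTIVE_BUILD_ALERT_DURATION seconds in the past, and fires once that count exceeds PROMETHEUS_ACTIVE_BUILD_ALERT_THRESHOLD. A minimal Go sketch of the same bookkeeping follows; the build names, start times, and helper name are hypothetical and are not part of this patch.

package main

import (
	"fmt"
	"time"
)

// hangingBuildCount mirrors the HangingBuild expression: it counts builds whose
// running phase started more than maxDuration ago. The map is a stand-in for the
// per-build start times exposed by openshift_build_running_phase_start_time_seconds.
func hangingBuildCount(runningPhaseStart map[string]time.Time, maxDuration time.Duration, now time.Time) int {
	count := 0
	for _, started := range runningPhaseStart {
		if now.Sub(started) > maxDuration {
			count++
		}
	}
	return count
}

func main() {
	now := time.Now()
	running := map[string]time.Time{
		"frontend-1": now.Add(-30 * time.Second),  // well inside the allowed window
		"frontend-2": now.Add(-600 * time.Second), // running longer than the 360s default
	}
	threshold := 100 // PROMETHEUS_ACTIVE_BUILD_ALERT_THRESHOLD default
	slow := hangingBuildCount(running, 360*time.Second, now)
	fmt.Printf("builds running longer than 360s: %d, alert would fire: %v\n", slow, slow > threshold)
}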
diff --git a/pkg/oc/bootstrap/bindata.go b/pkg/oc/bootstrap/bindata.go
index 350572508374..cbfdfc95d653 100644
--- a/pkg/oc/bootstrap/bindata.go
+++ b/pkg/oc/bootstrap/bindata.go
@@ -13137,6 +13137,15 @@ parameters:
   name: SESSION_SECRET
   generate: expression
   from: "[a-zA-Z0-9]{43}"
+- description: The threshold for the active build alert
+  name: PROMETHEUS_ACTIVE_BUILD_ALERT_THRESHOLD
+  value: "100"
+- description: The allowable active build duration for the active build alert
+  name: PROMETHEUS_ACTIVE_BUILD_ALERT_DURATION
+  value: "360"
+- description: The allowable ratio of failed to complete builds for the failed build alert
+  name: PROMETHEUS_FAILED_BUILD_RATIO
+  value: "0.5"
 objects:
 # Authorize the prometheus service account to read data about the cluster
 - apiVersion: v1
@@ -13350,12 +13359,22 @@ objects:
       - name: example-rules
         interval: 30s # defaults to global interval
         rules:
-        - alert: Node Down
+        - alert: NodeDown
          expr: up{job="kubernetes-nodes"} == 0
          annotations:
            miqTarget: "ContainerNode"
            severity: "HIGH"
-           message: "{{$labels.instance}} is down"
+           message: "{{$labels.instance}} is down"
+        - alert: HangingBuild
+          expr: count(openshift_build_running_phase_start_time_seconds{} < time() - ${PROMETHEUS_ACTIVE_BUILD_ALERT_DURATION}) > ${PROMETHEUS_ACTIVE_BUILD_ALERT_THRESHOLD}
+          annotations:
+            severity: "MEDIUM"
+            message: "{{$labels.instance}} indicates at least ${PROMETHEUS_ACTIVE_BUILD_ALERT_THRESHOLD} OpenShift builds are taking more than ${PROMETHEUS_ACTIVE_BUILD_ALERT_DURATION} seconds to complete"
+        - alert: BuildFailureRate
+          expr: (count(openshift_build_terminal_phase_total{phase="failed"}) / count(openshift_build_terminal_phase_total{phase="complete"})) > ${PROMETHEUS_FAILED_BUILD_RATIO}
+          annotations:
+            severity: "MEDIUM"
+            message: "{{$labels.instance}} shows that the ratio of failed to complete OpenShift builds exceeds ${PROMETHEUS_FAILED_BUILD_RATIO}"
     prometheus.yml: |
       rule_files:
         - 'prometheus.rules'
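The extended test change that follows first confirms the two new rule names show up on the Prometheus /alerts page before it exercises the metrics queries. The in-tree test does this through an exec pod and the getBearerTokenURLViaPod helper; purely for illustration, a standalone sketch of the same substring check with plain net/http is below. The URL, token, and TLS handling are placeholders and assumptions, not the test's actual plumbing.

package main

import (
	"crypto/tls"
	"fmt"
	"io/ioutil"
	"net/http"
	"strings"
)

// checkAlertsPage fetches the Prometheus /alerts page with a bearer token and
// reports which of the expected alert names appear in the returned HTML.
func checkAlertsPage(alertsURL, bearerToken string, alertNames []string) (map[string]bool, error) {
	client := &http.Client{Transport: &http.Transport{
		// self-signed certs are common in test clusters; skipping verification here
		// is only for this illustrative sketch
		TLSClientConfig: &tls.Config{InsecureSkipVerify: true},
	}}
	req, err := http.NewRequest("GET", alertsURL, nil)
	if err != nil {
		return nil, err
	}
	req.Header.Set("Authorization", "Bearer "+bearerToken)
	resp, err := client.Do(req)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()
	body, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		return nil, err
	}
	found := map[string]bool{}
	for _, name := range alertNames {
		found[name] = strings.Contains(string(body), name)
	}
	return found, nil
}

func main() {
	// placeholder host and token
	found, err := checkAlertsPage("https://prometheus.example.com/alerts", "REPLACE_WITH_TOKEN", []string{"HangingBuild", "BuildFailureRate"})
	if err != nil {
		fmt.Println("request failed:", err)
		return
	}
	fmt.Println(found)
}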
diff --git a/test/extended/prometheus/prometheus_builds.go b/test/extended/prometheus/prometheus_builds.go
index 74caae452347..a1e2498ec0c6 100644
--- a/test/extended/prometheus/prometheus_builds.go
+++ b/test/extended/prometheus/prometheus_builds.go
@@ -17,13 +17,15 @@ import (
 	exutil "github.com/openshift/origin/test/extended/util"
 )
 
+var (
+	execPodName, ns, host, bearerToken string
+	statsPort                          int
+)
+
 var _ = g.Describe("[Feature:Prometheus][builds] Prometheus", func() {
 	defer g.GinkgoRecover()
 	var (
 		oc = exutil.NewCLI("prometheus", exutil.KubeConfigPath())
-
-		execPodName, ns, host, bearerToken string
-		statsPort                          int
 	)
 
 	g.BeforeEach(func() {
@@ -52,20 +54,35 @@ var _ = g.Describe("[Feature:Prometheus][builds] Prometheus", func() {
 			err = expectBearerTokenURLStatusCodeExec(ns, execPodName, fmt.Sprintf("https://%s:%d/graph", host, statsPort), bearerToken, 200)
 			o.Expect(err).NotTo(o.HaveOccurred())
 
-			executeOpenShiftBuild(oc, appTemplate)
+			g.By("verifying the build alerts defined in the template are present")
+			alertDefs, err := getBearerTokenURLViaPod(ns, execPodName, fmt.Sprintf("https://%s:%d/alerts", host, statsPort), bearerToken)
+			o.Expect(alertDefs).To(o.ContainSubstring("HangingBuild"))
+			o.Expect(alertDefs).To(o.ContainSubstring("BuildFailureRate"))
 
-			g.By("verifying a service account token is able to query build metrics from the Prometheus API")
-			metricTests := map[string][]metricTest{
-				// NOTE - activeBuildCountQuery is dependent on prometheus querying while the build is running;
-				// so far the prometheus query interval and the length of the frontend build have
-				// been sufficient for reliable success here, but bear in mind the timing windows
-				// if this particular metricTest starts flaking
+			br := startOpenShiftBuild(oc, appTemplate)
+
+			g.By("verifying a service account token is able to query active build metrics from the Prometheus API")
+			// NOTE - activeBuildCountQuery is dependent on prometheus querying while the build is running;
+			// timing has been a bit tricky when attempting to query after the build is complete based on the
+			// default prometheus scraping window, so we do the active query while the build is running
+			activeTests := map[string][]metricTest{
 				activeBuildCountQuery: {
 					metricTest{
 						labels:      map[string]string{"phase": "running"},
 						greaterThan: "0",
 					},
 				},
+			}
+
+			runQueries(activeTests)
+
+			g.By("verifying build completed successfully")
+			err = exutil.WaitForBuildResult(oc.Client().Builds(oc.Namespace()), br)
+			o.Expect(err).NotTo(o.HaveOccurred())
+			br.AssertSuccess()
+
+			g.By("verifying a service account token is able to query terminal build metrics from the Prometheus API")
+			terminalTests := map[string][]metricTest{
 				terminalBuildCountQuery: {
 					metricTest{
 						labels: map[string]string{"phase": "complete"},
@@ -85,71 +102,7 @@ var _ = g.Describe("[Feature:Prometheus][builds] Prometheus", func() {
 					},
 				},
 			}
-			// expect all correct metrics within 60 seconds
-			lastErrsMap := map[string]error{}
-			for i := 0; i < 60; i++ {
-				for query, tcs := range metricTests {
-					g.By("perform prometheus metric query " + query)
-					contents, err := getBearerTokenURLViaPod(ns, execPodName, fmt.Sprintf("https://%s:%d/api/v1/query?query=%s", host, statsPort, query), bearerToken)
-					o.Expect(err).NotTo(o.HaveOccurred())
-
-					correctMetrics := map[int]bool{}
-					for i, tc := range tcs {
-						result := prometheusResponse{}
-						json.Unmarshal([]byte(contents), &result)
-						metrics := result.Data.Result
-
-						for _, sample := range metrics {
-							// first see if a metric has all the label names and label values we are looking for
-							foundCorrectLabels := true
-							for labelName, labelValue := range tc.labels {
-								if v, ok := sample.Metric[model.LabelName(labelName)]; ok {
-									if string(v) != labelValue {
-										foundCorrectLabels = false
-										break
-									}
-								} else {
-									foundCorrectLabels = false
-									break
-								}
-							}
-
-							// if found metric with correct set of labels, now see if the metric value is what we are expecting
-							if foundCorrectLabels {
-								switch {
-								case len(tc.equals) > 0:
-									if x, err := strconv.ParseFloat(tc.equals, 64); err == nil && float64(sample.Value) == x {
-										correctMetrics[i] = true
-										break
-									}
-								case len(tc.greaterThan) > 0:
-									if x, err := strconv.ParseFloat(tc.greaterThan, 64); err == nil && float64(sample.Value) > x {
-										correctMetrics[i] = true
-										break
-									}
-								}
-							}
-
-						}
-					}
-
-					if len(correctMetrics) == len(tcs) {
-						delete(metricTests, query) // delete in case there are retries on remaining tests
-						delete(lastErrsMap, query)
-					} else {
-						// maintain separate map of errors for diagnostics
-						lastErrsMap[query] = fmt.Errorf("query %s with results %s only had correct metrics %v", query, contents, correctMetrics)
-					}
-				}
-
-				if len(metricTests) == 0 {
-					break
-				}
-
-				time.Sleep(time.Second)
-			}
-
-			o.Expect(lastErrsMap).To(o.BeEmpty())
+			runQueries(terminalTests)
 		})
 	})
 })
@@ -170,7 +123,75 @@ type metricTest struct {
 	greaterThan string
 }
 
-func executeOpenShiftBuild(oc *exutil.CLI, appTemplate string) {
+func runQueries(metricTests map[string][]metricTest) {
+	// expect all correct metrics within 60 seconds
+	lastErrsMap := map[string]error{}
+	for i := 0; i < 60; i++ {
+		for query, tcs := range metricTests {
+			g.By("perform prometheus metric query " + query)
+			contents, err := getBearerTokenURLViaPod(ns, execPodName, fmt.Sprintf("https://%s:%d/api/v1/query?query=%s", host, statsPort, query), bearerToken)
+			o.Expect(err).NotTo(o.HaveOccurred())
+
+			correctMetrics := map[int]bool{}
+			for i, tc := range tcs {
+				result := prometheusResponse{}
+				json.Unmarshal([]byte(contents), &result)
+				metrics := result.Data.Result
+
+				for _, sample := range metrics {
+					// first see if a metric has all the label names and label values we are looking for
+					foundCorrectLabels := true
+					for labelName, labelValue := range tc.labels {
+						if v, ok := sample.Metric[model.LabelName(labelName)]; ok {
+							if string(v) != labelValue {
+								foundCorrectLabels = false
+								break
+							}
+						} else {
+							foundCorrectLabels = false
+							break
+						}
+					}
+
+					// if found metric with correct set of labels, now see if the metric value is what we are expecting
+					if foundCorrectLabels {
+						switch {
+						case len(tc.equals) > 0:
+							if x, err := strconv.ParseFloat(tc.equals, 64); err == nil && float64(sample.Value) == x {
+								correctMetrics[i] = true
+								break
+							}
+						case len(tc.greaterThan) > 0:
+							if x, err := strconv.ParseFloat(tc.greaterThan, 64); err == nil && float64(sample.Value) > x {
+								correctMetrics[i] = true
+								break
+							}
+						}
+					}
+
+				}
+			}
+
+			if len(correctMetrics) == len(tcs) {
+				delete(metricTests, query) // delete in case there are retries on remaining tests
+				delete(lastErrsMap, query)
+			} else {
+				// maintain separate map of errors for diagnostics
+				lastErrsMap[query] = fmt.Errorf("query %s with results %s only had correct metrics %v", query, contents, correctMetrics)
+			}
+		}
+
+		if len(metricTests) == 0 {
+			break
+		}
+
+		time.Sleep(time.Second)
+	}
+
+	o.Expect(lastErrsMap).To(o.BeEmpty())
+}
+
+func startOpenShiftBuild(oc *exutil.CLI, appTemplate string) *exutil.BuildResult {
 	g.By("waiting for builder service account")
 	err := exutil.WaitForBuilderAccount(oc.KubeClient().Core().ServiceAccounts(oc.Namespace()))
 	o.Expect(err).NotTo(o.HaveOccurred())
@@ -185,8 +206,8 @@ func executeOpenShiftBuild(oc *exutil.CLI, appTemplate string) {
 	err = oc.AsAdmin().Run("tag").Args("openshift/nodejs:latest", oc.Namespace()+"/nodejs-010-centos7:latest").Execute()
 	o.Expect(err).NotTo(o.HaveOccurred())
 
-	g.By("start build, wait for completion")
-	br, err := exutil.StartBuildAndWait(oc, "frontend")
+	g.By("start build")
+	br, err := exutil.StartBuildResult(oc, "frontend")
 	o.Expect(err).NotTo(o.HaveOccurred())
-	br.AssertSuccess()
+	return br
 }
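The runQueries helper above polls /api/v1/query and matches label sets and values through the prometheus common model types. For reference, the instant-query payload it parses looks roughly like the canned example in the sketch below, which decodes the same shape with hand-rolled types instead of the model package. The metric names and values in the payload are illustrative only, not output captured from a real cluster.

package main

import (
	"encoding/json"
	"fmt"
	"strconv"
)

// Simplified stand-ins for the prometheusResponse types used by the test.
type queryResponse struct {
	Status string    `json:"status"`
	Data   queryData `json:"data"`
}

type queryData struct {
	ResultType string   `json:"resultType"`
	Result     []sample `json:"result"`
}

type sample struct {
	Metric map[string]string `json:"metric"`
	// Value arrives as a two element array: [ <unix timestamp>, "<float encoded as a string>" ]
	Value []interface{} `json:"value"`
}

// sampleValue extracts the float from the second element of the value pair.
func sampleValue(s sample) (float64, error) {
	if len(s.Value) != 2 {
		return 0, fmt.Errorf("unexpected value shape: %v", s.Value)
	}
	str, ok := s.Value[1].(string)
	if !ok {
		return 0, fmt.Errorf("unexpected value type: %T", s.Value[1])
	}
	return strconv.ParseFloat(str, 64)
}

func main() {
	payload := `{"status":"success","data":{"resultType":"vector","result":[
		{"metric":{"__name__":"openshift_build_terminal_phase_total","phase":"complete"},"value":[1505919200.123,"2"]},
		{"metric":{"__name__":"openshift_build_terminal_phase_total","phase":"failed"},"value":[1505919200.123,"0"]}]}}`

	var resp queryResponse
	if err := json.Unmarshal([]byte(payload), &resp); err != nil {
		fmt.Println("decode error:", err)
		return
	}

	// mirror the greaterThan check from runQueries for the phase="complete" label set
	want := map[string]string{"phase": "complete"}
	for _, s := range resp.Data.Result {
		matches := true
		for k, v := range want {
			if s.Metric[k] != v {
				matches = false
				break
			}
		}
		if !matches {
			continue
		}
		val, err := sampleValue(s)
		if err != nil {
			fmt.Println("value error:", err)
			continue
		}
		fmt.Printf("complete builds: %v, greaterThan 0 satisfied: %v\n", val, val > 0)
	}
}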
diff --git a/test/extended/testdata/bindata.go b/test/extended/testdata/bindata.go
index d3edaceb0af0..e524855e8a71 100644
--- a/test/extended/testdata/bindata.go
+++ b/test/extended/testdata/bindata.go
@@ -24483,6 +24483,15 @@ parameters:
   name: SESSION_SECRET
   generate: expression
   from: "[a-zA-Z0-9]{43}"
+- description: The threshold for the active build alert
+  name: PROMETHEUS_ACTIVE_BUILD_ALERT_THRESHOLD
+  value: "100"
+- description: The allowable active build duration for the active build alert
+  name: PROMETHEUS_ACTIVE_BUILD_ALERT_DURATION
+  value: "360"
+- description: The allowable ratio of failed to complete builds for the failed build alert
+  name: PROMETHEUS_FAILED_BUILD_RATIO
+  value: "0.5"
 objects:
 # Authorize the prometheus service account to read data about the cluster
 - apiVersion: v1
@@ -24696,12 +24705,22 @@ objects:
       - name: example-rules
         interval: 30s # defaults to global interval
         rules:
-        - alert: Node Down
+        - alert: NodeDown
          expr: up{job="kubernetes-nodes"} == 0
          annotations:
            miqTarget: "ContainerNode"
            severity: "HIGH"
-           message: "{{$labels.instance}} is down"
+           message: "{{$labels.instance}} is down"
+        - alert: HangingBuild
+          expr: count(openshift_build_running_phase_start_time_seconds{} < time() - ${PROMETHEUS_ACTIVE_BUILD_ALERT_DURATION}) > ${PROMETHEUS_ACTIVE_BUILD_ALERT_THRESHOLD}
+          annotations:
+            severity: "MEDIUM"
+            message: "{{$labels.instance}} indicates at least ${PROMETHEUS_ACTIVE_BUILD_ALERT_THRESHOLD} OpenShift builds are taking more than ${PROMETHEUS_ACTIVE_BUILD_ALERT_DURATION} seconds to complete"
+        - alert: BuildFailureRate
+          expr: (count(openshift_build_terminal_phase_total{phase="failed"}) / count(openshift_build_terminal_phase_total{phase="complete"})) > ${PROMETHEUS_FAILED_BUILD_RATIO}
+          annotations:
+            severity: "MEDIUM"
+            message: "{{$labels.instance}} shows that the ratio of failed to complete OpenShift builds exceeds ${PROMETHEUS_FAILED_BUILD_RATIO}"
     prometheus.yml: |
       rule_files:
         - 'prometheus.rules'
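A closing note on the BuildFailureRate expression: it divides the count of builds that ended in the failed phase by the count that completed, so while there are no complete builds at all the aggregation produces no samples and, as far as PromQL aggregation semantics go, the alert simply stays silent rather than dividing by zero. A minimal Go sketch of the same ratio check with an explicit zero-denominator guard is below; the counts are hypothetical inputs, not values read from the metrics.

package main

import "fmt"

// buildFailureRatioExceeded mirrors the BuildFailureRate expression: failed/complete
// compared against the PROMETHEUS_FAILED_BUILD_RATIO parameter. The guard makes the
// zero-denominator case explicit, which the PromQL form avoids implicitly because
// count() over no series returns no samples rather than 0.
func buildFailureRatioExceeded(failed, complete int, maxRatio float64) bool {
	if complete == 0 {
		// nothing has completed yet, so there is no meaningful ratio to alert on
		return false
	}
	return float64(failed)/float64(complete) > maxRatio
}

func main() {
	fmt.Println(buildFailureRatioExceeded(3, 4, 0.5)) // true: 0.75 > 0.5
	fmt.Println(buildFailureRatioExceeded(1, 4, 0.5)) // false: 0.25 <= 0.5
	fmt.Println(buildFailureRatioExceeded(5, 0, 0.5)) // false: no completed builds yet
}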