From e07d075be00e6eca7f34915c2f5851dd0adf898e Mon Sep 17 00:00:00 2001
From: gabemontero
Date: Wed, 20 Sep 2017 13:02:28 -0400
Subject: [PATCH] prometheus alerts for openshift build subsystem

---
 examples/prometheus/prometheus.yaml           |  23 ++-
 pkg/oc/bootstrap/bindata.go                   |  23 ++-
 test/extended/prometheus/prometheus_builds.go | 179 ++++++++++--------
 test/extended/testdata/bindata.go             |  23 ++-
 4 files changed, 163 insertions(+), 85 deletions(-)

diff --git a/examples/prometheus/prometheus.yaml b/examples/prometheus/prometheus.yaml
index cb453189c85a..ca8a64ca3daa 100644
--- a/examples/prometheus/prometheus.yaml
+++ b/examples/prometheus/prometheus.yaml
@@ -30,6 +30,15 @@ parameters:
   name: SESSION_SECRET
   generate: expression
   from: "[a-zA-Z0-9]{43}"
+- description: The threshold for the active build alert
+  name: PROMETHEUS_ACTIVE_BUILD_ALERT_THRESHOLD
+  value: "100"
+- description: The allowable active build duration for the active build alert
+  name: PROMETHEUS_ACTIVE_BUILD_ALERT_DURATION
+  value: "360"
+- description: The allowable ratio of failed to complete builds for the failed build alert
+  name: PROMETHEUS_FAILED_BUILD_RATIO
+  value: "0.5"
 objects:
 # Authorize the prometheus service account to read data about the cluster
 - apiVersion: v1
@@ -243,12 +252,22 @@ objects:
       - name: example-rules
         interval: 30s # defaults to global interval
         rules:
-        - alert: Node Down
+        - alert: NodeDown
          expr: up{job="kubernetes-nodes"} == 0
          annotations:
            miqTarget: "ContainerNode"
            severity: "HIGH"
-           message: "{{$labels.instance}} is down"
+           message: "{{$labels.instance}} is down"
+        - alert: HangingBuild
+          expr: count(openshift_build_running_phase_start_time_seconds{} < time() - ${PROMETHEUS_ACTIVE_BUILD_ALERT_DURATION}) > ${PROMETHEUS_ACTIVE_BUILD_ALERT_THRESHOLD}
+          annotations:
+            severity: "MEDIUM"
+            message: "{{$labels.instance}} indicates at least ${PROMETHEUS_ACTIVE_BUILD_ALERT_THRESHOLD} OpenShift builds are taking more than ${PROMETHEUS_ACTIVE_BUILD_ALERT_DURATION} seconds to complete"
+        - alert: BuildFailureRate
+          expr: (count(openshift_build_terminal_phase_total{phase="failed"}) / count(openshift_build_terminal_phase_total{phase="complete"})) > ${PROMETHEUS_FAILED_BUILD_RATIO}
+          annotations:
+            severity: "MEDIUM"
+            message: "{{$labels.instance}} shows that the ratio of failed to complete OpenShift builds exceeds ${PROMETHEUS_FAILED_BUILD_RATIO}"
     prometheus.yml: |
       rule_files:
         - 'prometheus.rules'
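Note on the HangingBuild expression above: it counts the series of openshift_build_running_phase_start_time_seconds whose running-phase start timestamp is more than PROMETHEUS_ACTIVE_BUILD_ALERT_DURATION seconds in the past, and fires once that count exceeds PROMETHEUS_ACTIVE_BUILD_ALERT_THRESHOLD. A minimal Go sketch of the same bookkeeping follows; the build names, start times, and helper name are hypothetical and are not part of this patch.

package main

import (
	"fmt"
	"time"
)

// hangingBuildCount mirrors the HangingBuild expression: it counts builds whose
// running phase started more than maxDuration ago. The map is a stand-in for the
// per-build start times exposed by openshift_build_running_phase_start_time_seconds.
func hangingBuildCount(runningPhaseStart map[string]time.Time, maxDuration time.Duration, now time.Time) int {
	count := 0
	for _, started := range runningPhaseStart {
		if now.Sub(started) > maxDuration {
			count++
		}
	}
	return count
}

func main() {
	now := time.Now()
	running := map[string]time.Time{
		"frontend-1": now.Add(-30 * time.Second),  // well inside the allowed window
		"frontend-2": now.Add(-600 * time.Second), // running longer than the 360s default
	}
	threshold := 100 // PROMETHEUS_ACTIVE_BUILD_ALERT_THRESHOLD default
	slow := hangingBuildCount(running, 360*time.Second, now)
	fmt.Printf("builds running longer than 360s: %d, alert would fire: %v\n", slow, slow > threshold)
}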
diff --git a/pkg/oc/bootstrap/bindata.go b/pkg/oc/bootstrap/bindata.go
index 350572508374..cbfdfc95d653 100644
--- a/pkg/oc/bootstrap/bindata.go
+++ b/pkg/oc/bootstrap/bindata.go
@@ -13137,6 +13137,15 @@ parameters:
   name: SESSION_SECRET
   generate: expression
   from: "[a-zA-Z0-9]{43}"
+- description: The threshold for the active build alert
+  name: PROMETHEUS_ACTIVE_BUILD_ALERT_THRESHOLD
+  value: "100"
+- description: The allowable active build duration for the active build alert
+  name: PROMETHEUS_ACTIVE_BUILD_ALERT_DURATION
+  value: "360"
+- description: The allowable ratio of failed to complete builds for the failed build alert
+  name: PROMETHEUS_FAILED_BUILD_RATIO
+  value: "0.5"
 objects:
 # Authorize the prometheus service account to read data about the cluster
 - apiVersion: v1
@@ -13350,12 +13359,22 @@ objects:
       - name: example-rules
         interval: 30s # defaults to global interval
         rules:
-        - alert: Node Down
+        - alert: NodeDown
          expr: up{job="kubernetes-nodes"} == 0
          annotations:
            miqTarget: "ContainerNode"
            severity: "HIGH"
-           message: "{{$labels.instance}} is down"
+           message: "{{$labels.instance}} is down"
+        - alert: HangingBuild
+          expr: count(openshift_build_running_phase_start_time_seconds{} < time() - ${PROMETHEUS_ACTIVE_BUILD_ALERT_DURATION}) > ${PROMETHEUS_ACTIVE_BUILD_ALERT_THRESHOLD}
+          annotations:
+            severity: "MEDIUM"
+            message: "{{$labels.instance}} indicates at least ${PROMETHEUS_ACTIVE_BUILD_ALERT_THRESHOLD} OpenShift builds are taking more than ${PROMETHEUS_ACTIVE_BUILD_ALERT_DURATION} seconds to complete"
+        - alert: BuildFailureRate
+          expr: (count(openshift_build_terminal_phase_total{phase="failed"}) / count(openshift_build_terminal_phase_total{phase="complete"})) > ${PROMETHEUS_FAILED_BUILD_RATIO}
+          annotations:
+            severity: "MEDIUM"
+            message: "{{$labels.instance}} shows that the ratio of failed to complete OpenShift builds exceeds ${PROMETHEUS_FAILED_BUILD_RATIO}"
     prometheus.yml: |
       rule_files:
         - 'prometheus.rules'
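The extended test change that follows first confirms the two new rule names show up on the Prometheus /alerts page before it exercises the metrics queries. The in-tree test does this through an exec pod and the getBearerTokenURLViaPod helper; purely for illustration, a standalone sketch of the same substring check with plain net/http is below. The URL, token, and TLS handling are placeholders and assumptions, not the test's actual plumbing.

package main

import (
	"crypto/tls"
	"fmt"
	"io/ioutil"
	"net/http"
	"strings"
)

// checkAlertsPage fetches the Prometheus /alerts page with a bearer token and
// reports which of the expected alert names appear in the returned HTML.
func checkAlertsPage(alertsURL, bearerToken string, alertNames []string) (map[string]bool, error) {
	client := &http.Client{Transport: &http.Transport{
		// self-signed certs are common in test clusters; skipping verification here
		// is only for this illustrative sketch
		TLSClientConfig: &tls.Config{InsecureSkipVerify: true},
	}}
	req, err := http.NewRequest("GET", alertsURL, nil)
	if err != nil {
		return nil, err
	}
	req.Header.Set("Authorization", "Bearer "+bearerToken)
	resp, err := client.Do(req)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()
	body, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		return nil, err
	}
	found := map[string]bool{}
	for _, name := range alertNames {
		found[name] = strings.Contains(string(body), name)
	}
	return found, nil
}

func main() {
	// placeholder host and token
	found, err := checkAlertsPage("https://prometheus.example.com/alerts", "REPLACE_WITH_TOKEN", []string{"HangingBuild", "BuildFailureRate"})
	if err != nil {
		fmt.Println("request failed:", err)
		return
	}
	fmt.Println(found)
}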
diff --git a/test/extended/prometheus/prometheus_builds.go b/test/extended/prometheus/prometheus_builds.go
index 74caae452347..a1e2498ec0c6 100644
--- a/test/extended/prometheus/prometheus_builds.go
+++ b/test/extended/prometheus/prometheus_builds.go
@@ -17,13 +17,15 @@ import (
 	exutil "github.com/openshift/origin/test/extended/util"
 )
 
+var (
+	execPodName, ns, host, bearerToken string
+	statsPort                          int
+)
+
 var _ = g.Describe("[Feature:Prometheus][builds] Prometheus", func() {
 	defer g.GinkgoRecover()
 	var (
 		oc = exutil.NewCLI("prometheus", exutil.KubeConfigPath())
-
-		execPodName, ns, host, bearerToken string
-		statsPort                          int
 	)
 
 	g.BeforeEach(func() {
@@ -52,20 +54,35 @@ var _ = g.Describe("[Feature:Prometheus][builds] Prometheus", func() {
 			err = expectBearerTokenURLStatusCodeExec(ns, execPodName, fmt.Sprintf("https://%s:%d/graph", host, statsPort), bearerToken, 200)
 			o.Expect(err).NotTo(o.HaveOccurred())
 
-			executeOpenShiftBuild(oc, appTemplate)
+			g.By("verifying the build alerts defined in the template are present")
+			alertDefs, err := getBearerTokenURLViaPod(ns, execPodName, fmt.Sprintf("https://%s:%d/alerts", host, statsPort), bearerToken)
+			o.Expect(alertDefs).To(o.ContainSubstring("HangingBuild"))
+			o.Expect(alertDefs).To(o.ContainSubstring("BuildFailureRate"))
 
-			g.By("verifying a service account token is able to query build metrics from the Prometheus API")
-			metricTests := map[string][]metricTest{
-				// NOTE - activeBuildCountQuery is dependent on prometheus querying while the build is running;
-				// so far the prometheus query interval and the length of the frontend build have
-				// been sufficient for reliable success here, but bear in mind the timing windows
-				// if this particular metricTest starts flaking
+			br := startOpenShiftBuild(oc, appTemplate)
+
+			g.By("verifying a service account token is able to query active build metrics from the Prometheus API")
+			// NOTE - activeBuildCountQuery is dependent on prometheus querying while the build is running;
+			// timing has been a bit tricky when attempting to query after the build is complete based on the
+			// default prometheus scraping window, so we do the active query while the build is running
+			activeTests := map[string][]metricTest{
 				activeBuildCountQuery: {
 					metricTest{
 						labels:      map[string]string{"phase": "running"},
 						greaterThan: "0",
 					},
 				},
+			}
+
+			runQueries(activeTests)
+
+			g.By("verifying build completed successfully")
+			err = exutil.WaitForBuildResult(oc.Client().Builds(oc.Namespace()), br)
+			o.Expect(err).NotTo(o.HaveOccurred())
+			br.AssertSuccess()
+
+			g.By("verifying a service account token is able to query terminal build metrics from the Prometheus API")
+			terminalTests := map[string][]metricTest{
 				terminalBuildCountQuery: {
 					metricTest{
 						labels: map[string]string{"phase": "complete"},
@@ -85,71 +102,7 @@ var _ = g.Describe("[Feature:Prometheus][builds] Prometheus", func() {
 					},
 				},
 			}
-			// expect all correct metrics within 60 seconds
-			lastErrsMap := map[string]error{}
-			for i := 0; i < 60; i++ {
-				for query, tcs := range metricTests {
-					g.By("perform prometheus metric query " + query)
-					contents, err := getBearerTokenURLViaPod(ns, execPodName, fmt.Sprintf("https://%s:%d/api/v1/query?query=%s", host, statsPort, query), bearerToken)
-					o.Expect(err).NotTo(o.HaveOccurred())
-
-					correctMetrics := map[int]bool{}
-					for i, tc := range tcs {
-						result := prometheusResponse{}
-						json.Unmarshal([]byte(contents), &result)
-						metrics := result.Data.Result
-
-						for _, sample := range metrics {
-							// first see if a metric has all the label names and label values we are looking for
-							foundCorrectLabels := true
-							for labelName, labelValue := range tc.labels {
-								if v, ok := sample.Metric[model.LabelName(labelName)]; ok {
-									if string(v) != labelValue {
-										foundCorrectLabels = false
-										break
-									}
-								} else {
-									foundCorrectLabels = false
-									break
-								}
-							}
-
-							// if found metric with correct set of labels, now see if the metric value is what we are expecting
-							if foundCorrectLabels {
-								switch {
-								case len(tc.equals) > 0:
-									if x, err := strconv.ParseFloat(tc.equals, 64); err == nil && float64(sample.Value) == x {
-										correctMetrics[i] = true
-										break
-									}
-								case len(tc.greaterThan) > 0:
-									if x, err := strconv.ParseFloat(tc.greaterThan, 64); err == nil && float64(sample.Value) > x {
-										correctMetrics[i] = true
-										break
-									}
-								}
-							}
-
-						}
-					}
-
-					if len(correctMetrics) == len(tcs) {
-						delete(metricTests, query) // delete in case there are retries on remaining tests
-						delete(lastErrsMap, query)
-					} else {
-						// maintain separate map of errors for diagnostics
-						lastErrsMap[query] = fmt.Errorf("query %s with results %s only had correct metrics %v", query, contents, correctMetrics)
-					}
-				}
-
-				if len(metricTests) == 0 {
-					break
-				}
-
-				time.Sleep(time.Second)
-			}
-
-			o.Expect(lastErrsMap).To(o.BeEmpty())
+			runQueries(terminalTests)
 		})
 	})
 })
@@ -170,7 +123,75 @@ type metricTest struct {
 	greaterThan string
 }
 
-func executeOpenShiftBuild(oc *exutil.CLI, appTemplate string) {
+func runQueries(metricTests map[string][]metricTest) {
+	// expect all correct metrics within 60 seconds
+	lastErrsMap := map[string]error{}
+	for i := 0; i < 60; i++ {
+		for query, tcs := range metricTests {
+			g.By("perform prometheus metric query " + query)
+			contents, err := getBearerTokenURLViaPod(ns, execPodName, fmt.Sprintf("https://%s:%d/api/v1/query?query=%s", host, statsPort, query), bearerToken)
+			o.Expect(err).NotTo(o.HaveOccurred())
+
+			correctMetrics := map[int]bool{}
+			for i, tc := range tcs {
+				result := prometheusResponse{}
+				json.Unmarshal([]byte(contents), &result)
+				metrics := result.Data.Result
+
+				for _, sample := range metrics {
+					// first see if a metric has all the label names and label values we are looking for
+					foundCorrectLabels := true
+					for labelName, labelValue := range tc.labels {
+						if v, ok := sample.Metric[model.LabelName(labelName)]; ok {
+							if string(v) != labelValue {
+								foundCorrectLabels = false
+								break
+							}
+						} else {
+							foundCorrectLabels = false
+							break
+						}
+					}
+
+					// if found metric with correct set of labels, now see if the metric value is what we are expecting
+					if foundCorrectLabels {
+						switch {
+						case len(tc.equals) > 0:
+							if x, err := strconv.ParseFloat(tc.equals, 64); err == nil && float64(sample.Value) == x {
+								correctMetrics[i] = true
+								break
+							}
+						case len(tc.greaterThan) > 0:
+							if x, err := strconv.ParseFloat(tc.greaterThan, 64); err == nil && float64(sample.Value) > x {
+								correctMetrics[i] = true
+								break
+							}
+						}
+					}
+
+				}
+			}
+
+			if len(correctMetrics) == len(tcs) {
+				delete(metricTests, query) // delete in case there are retries on remaining tests
+				delete(lastErrsMap, query)
+			} else {
+				// maintain separate map of errors for diagnostics
+				lastErrsMap[query] = fmt.Errorf("query %s with results %s only had correct metrics %v", query, contents, correctMetrics)
+			}
+		}
+
+		if len(metricTests) == 0 {
+			break
+		}
+
+		time.Sleep(time.Second)
+	}
+
+	o.Expect(lastErrsMap).To(o.BeEmpty())
+}
+
+func startOpenShiftBuild(oc *exutil.CLI, appTemplate string) *exutil.BuildResult {
 	g.By("waiting for builder service account")
 	err := exutil.WaitForBuilderAccount(oc.KubeClient().Core().ServiceAccounts(oc.Namespace()))
 	o.Expect(err).NotTo(o.HaveOccurred())
@@ -185,8 +206,8 @@ func executeOpenShiftBuild(oc *exutil.CLI, appTemplate string) {
 	err = oc.AsAdmin().Run("tag").Args("openshift/nodejs:latest", oc.Namespace()+"/nodejs-010-centos7:latest").Execute()
 	o.Expect(err).NotTo(o.HaveOccurred())
 
-	g.By("start build, wait for completion")
-	br, err := exutil.StartBuildAndWait(oc, "frontend")
+	g.By("start build")
+	br, err := exutil.StartBuildResult(oc, "frontend")
 	o.Expect(err).NotTo(o.HaveOccurred())
-	br.AssertSuccess()
+	return br
 }
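The runQueries helper above polls /api/v1/query and matches label sets and values through the prometheus common model types. For reference, the instant-query payload it parses looks roughly like the canned example in the sketch below, which decodes the same shape with hand-rolled types instead of the model package. The metric names and values in the payload are illustrative only, not output captured from a real cluster.

package main

import (
	"encoding/json"
	"fmt"
	"strconv"
)

// Simplified stand-ins for the prometheusResponse types used by the test.
type queryResponse struct {
	Status string    `json:"status"`
	Data   queryData `json:"data"`
}

type queryData struct {
	ResultType string   `json:"resultType"`
	Result     []sample `json:"result"`
}

type sample struct {
	Metric map[string]string `json:"metric"`
	// Value arrives as a two element array: [ <unix timestamp>, "<float encoded as a string>" ]
	Value []interface{} `json:"value"`
}

// sampleValue extracts the float from the second element of the value pair.
func sampleValue(s sample) (float64, error) {
	if len(s.Value) != 2 {
		return 0, fmt.Errorf("unexpected value shape: %v", s.Value)
	}
	str, ok := s.Value[1].(string)
	if !ok {
		return 0, fmt.Errorf("unexpected value type: %T", s.Value[1])
	}
	return strconv.ParseFloat(str, 64)
}

func main() {
	payload := `{"status":"success","data":{"resultType":"vector","result":[
		{"metric":{"__name__":"openshift_build_terminal_phase_total","phase":"complete"},"value":[1505919200.123,"2"]},
		{"metric":{"__name__":"openshift_build_terminal_phase_total","phase":"failed"},"value":[1505919200.123,"0"]}]}}`

	var resp queryResponse
	if err := json.Unmarshal([]byte(payload), &resp); err != nil {
		fmt.Println("decode error:", err)
		return
	}

	// mirror the greaterThan check from runQueries for the phase="complete" label set
	want := map[string]string{"phase": "complete"}
	for _, s := range resp.Data.Result {
		matches := true
		for k, v := range want {
			if s.Metric[k] != v {
				matches = false
				break
			}
		}
		if !matches {
			continue
		}
		val, err := sampleValue(s)
		if err != nil {
			fmt.Println("value error:", err)
			continue
		}
		fmt.Printf("complete builds: %v, greaterThan 0 satisfied: %v\n", val, val > 0)
	}
}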
diff --git a/test/extended/testdata/bindata.go b/test/extended/testdata/bindata.go
index d3edaceb0af0..e524855e8a71 100644
--- a/test/extended/testdata/bindata.go
+++ b/test/extended/testdata/bindata.go
@@ -24483,6 +24483,15 @@ parameters:
   name: SESSION_SECRET
   generate: expression
   from: "[a-zA-Z0-9]{43}"
+- description: The threshold for the active build alert
+  name: PROMETHEUS_ACTIVE_BUILD_ALERT_THRESHOLD
+  value: "100"
+- description: The allowable active build duration for the active build alert
+  name: PROMETHEUS_ACTIVE_BUILD_ALERT_DURATION
+  value: "360"
+- description: The allowable ratio of failed to complete builds for the failed build alert
+  name: PROMETHEUS_FAILED_BUILD_RATIO
+  value: "0.5"
 objects:
 # Authorize the prometheus service account to read data about the cluster
 - apiVersion: v1
@@ -24696,12 +24705,22 @@ objects:
       - name: example-rules
         interval: 30s # defaults to global interval
         rules:
-        - alert: Node Down
+        - alert: NodeDown
          expr: up{job="kubernetes-nodes"} == 0
          annotations:
            miqTarget: "ContainerNode"
            severity: "HIGH"
-           message: "{{$labels.instance}} is down"
+           message: "{{$labels.instance}} is down"
+        - alert: HangingBuild
+          expr: count(openshift_build_running_phase_start_time_seconds{} < time() - ${PROMETHEUS_ACTIVE_BUILD_ALERT_DURATION}) > ${PROMETHEUS_ACTIVE_BUILD_ALERT_THRESHOLD}
+          annotations:
+            severity: "MEDIUM"
+            message: "{{$labels.instance}} indicates at least ${PROMETHEUS_ACTIVE_BUILD_ALERT_THRESHOLD} OpenShift builds are taking more than ${PROMETHEUS_ACTIVE_BUILD_ALERT_DURATION} seconds to complete"
+        - alert: BuildFailureRate
+          expr: (count(openshift_build_terminal_phase_total{phase="failed"}) / count(openshift_build_terminal_phase_total{phase="complete"})) > ${PROMETHEUS_FAILED_BUILD_RATIO}
+          annotations:
+            severity: "MEDIUM"
+            message: "{{$labels.instance}} shows that the ratio of failed to complete OpenShift builds exceeds ${PROMETHEUS_FAILED_BUILD_RATIO}"
     prometheus.yml: |
       rule_files:
         - 'prometheus.rules'
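A closing note on the BuildFailureRate expression: it divides the count of builds that ended in the failed phase by the count that completed, so while there are no complete builds at all the aggregation produces no samples and, as far as PromQL aggregation semantics go, the alert simply stays silent rather than dividing by zero. A minimal Go sketch of the same ratio check with an explicit zero-denominator guard is below; the counts are hypothetical inputs, not values read from the metrics.

package main

import "fmt"

// buildFailureRatioExceeded mirrors the BuildFailureRate expression: failed/complete
// compared against the PROMETHEUS_FAILED_BUILD_RATIO parameter. The guard makes the
// zero-denominator case explicit, which the PromQL form avoids implicitly because
// count() over no series returns no samples rather than 0.
func buildFailureRatioExceeded(failed, complete int, maxRatio float64) bool {
	if complete == 0 {
		// nothing has completed yet, so there is no meaningful ratio to alert on
		return false
	}
	return float64(failed)/float64(complete) > maxRatio
}

func main() {
	fmt.Println(buildFailureRatioExceeded(3, 4, 0.5)) // true: 0.75 > 0.5
	fmt.Println(buildFailureRatioExceeded(1, 4, 0.5)) // false: 0.25 <= 0.5
	fmt.Println(buildFailureRatioExceeded(5, 0, 0.5)) // false: no completed builds yet
}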