Merge pull request #16495 from gabemontero/build-alert
Automatic merge from submit-queue.

prometheus alerts for openshift build subsystem

https://trello.com/c/RskNHpfh/1334-5-prometheus-alerts-for-build-metrics

A WIP initial pass at alerts for the openshift build subsystem

@openshift/devex @smarterclayton @zgalor @moolitayer @mfojtik ptal, defer if bandwidth dictates, and/or pull in others as you each deem fit

Disclaimers:
1) I'm still debating the pros/cons of these alerts with https://docs.google.com/document/d/199PqyG3UsyXlwieHaqbGiWVa8eMWi8zzAn0YfcApr8Q/edit#heading=h.2efurbugauf in mind

2) still debating the template parameters / defaults for the various thresholds ... I still have a to-do to revisit potential default values with ops contacts, based on their existing Zabbix monitoring

3) still debating the severity as well

4) based on the activity in #16026 I did not include the `miqTarget` annotation

I also removed the space in the existing alert name based on how I interpreted various naming conventions.

Other than the query against the alerts URI, the extended test changes stemmed from flakiness encountered during testing that was unrelated to adding the alerts.

thanks
openshift-merge-robot committed Sep 28, 2017
2 parents dbf1fc3 + cf023ef commit f86e504
Showing 3 changed files with 211 additions and 129 deletions.
26 changes: 26 additions & 0 deletions examples/prometheus/README.md
@@ -116,3 +116,29 @@ Returns a running count (not a rate) of docker operations that have failed since the kubelet started.
> kubelet_pleg_relist_latency_microseconds
Returns PLEG (pod lifecycle event generator) latency metrics. This represents the latency experienced by calls from the kubelet to the container runtime (i.e. docker or CRI-O). High PLEG latency is often related to disk I/O performance on the docker storage partition.

### OpenShift build related queries

> count(openshift_build_running_phase_start_time_seconds{} < time() - 600)
Returns the number of builds that have been running for more than 10 minutes (600 seconds).

> count(openshift_build_new_pending_phase_creation_time_seconds{} < time() - 600)
Returns the number of builds that have been waiting at least 10 minutes (600 seconds) to start.

> sum(openshift_build_failed_phase_total{})
Returns the number of failed builds, regardless of the failure reason.

> sum(openshift_build_terminal_phase_total{phase="complete"})
Returns the number of successfully completed builds.

> openshift_build_failed_phase_total{}
Returns the latest totals, per failure reason, for any failed builds.

> openshift_build_failed_phase_total{} offset 5m
Returns the failed builds totals, per failure reason, from 5 minutes ago.
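
The same queries can also be issued programmatically against the Prometheus API. A minimal sketch, assuming a reachable Prometheus endpoint (the address below is a placeholder) and a recent release of the prometheus/client_golang query API, whose Query call returns warnings alongside the result (older releases return only the value and an error):

```go
package main

import (
	"context"
	"fmt"
	"log"
	"time"

	"github.com/prometheus/client_golang/api"
	promv1 "github.com/prometheus/client_golang/api/prometheus/v1"
)

func main() {
	// Placeholder address; point this at the cluster's Prometheus route.
	client, err := api.NewClient(api.Config{Address: "https://prometheus.example.com"})
	if err != nil {
		log.Fatal(err)
	}
	promAPI := promv1.NewAPI(client)

	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
	defer cancel()

	// Builds that have been running for more than 10 minutes (600 seconds).
	query := `count(openshift_build_running_phase_start_time_seconds{} < time() - 600)`
	result, warnings, err := promAPI.Query(ctx, query, time.Now())
	if err != nil {
		log.Fatal(err)
	}
	for _, w := range warnings {
		fmt.Println("warning:", w)
	}
	fmt.Println(result)
}
```

For an authenticated cluster endpoint, api.Config also accepts a custom RoundTripper, which is where a bearer token would be injected.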
106 changes: 69 additions & 37 deletions pkg/build/metrics/prometheus/metrics.go
@@ -1,43 +1,59 @@
 package prometheus
 
 import (
-	kselector "k8s.io/apimachinery/pkg/labels"
 	"strings"
 
 	"github.com/golang/glog"
 
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	kselector "k8s.io/apimachinery/pkg/labels"
 
 	buildapi "github.com/openshift/origin/pkg/build/apis/build"
 	internalversion "github.com/openshift/origin/pkg/build/generated/listers/build/internalversion"
 	"github.com/prometheus/client_golang/prometheus"
 )
 
 const (
-	separator               = "_"
-	buildSubsystem          = "openshift_build"
-	terminalBuildCount      = "terminal_phase_total"
-	terminalBuildCountQuery = buildSubsystem + separator + terminalBuildCount
-	activeBuildCount        = "running_phase_start_time_seconds"
-	activeBuildCountQuery   = buildSubsystem + separator + activeBuildCount
+	separator                 = "_"
+	buildSubsystem            = "openshift_build"
+	terminalBuildCount        = "terminal_phase_total"
+	terminalBuildCountQuery   = buildSubsystem + separator + terminalBuildCount
+	failedBuildCount          = "failed_phase_total"
+	failedBuildCountQuery     = buildSubsystem + separator + failedBuildCount
+	activeBuildCount          = "running_phase_start_time_seconds"
+	activeBuildCountQuery     = buildSubsystem + separator + activeBuildCount
+	newPendingBuildCount      = "new_pending_phase_creation_time_seconds"
+	newPendingBuildCountQuery = buildSubsystem + separator + newPendingBuildCount
+	errorBuildReason          = "BuildPodError"
 )
 
 var (
+	// decided not to have a separate counter for failed builds, which have reasons,
+	// vs. the other "finished" builds phases, where the reason is not set
 	terminalBuildCountDesc = prometheus.NewDesc(
-		buildSubsystem+separator+terminalBuildCount,
-		"Counts total terminal builds by phase",
+		terminalBuildCountQuery,
+		"Counts total successful/aborted builds by phase",
 		[]string{"phase"},
 		nil,
 	)
+	failedBuildCountDesc = prometheus.NewDesc(
+		failedBuildCountQuery,
+		"Counts total failed builds by reason",
+		[]string{"reason"},
+		nil,
+	)
 	activeBuildCountDesc = prometheus.NewDesc(
-		buildSubsystem+separator+activeBuildCount,
-		"Show the start time in unix epoch form of running builds by namespace, name, and phase",
+		activeBuildCountQuery,
+		"Show the start time in unix epoch form of running builds by namespace and name",
 		[]string{"namespace", "name"},
 		nil,
 	)
+	newPendingBuildCountDesc = prometheus.NewDesc(
+		newPendingBuildCountQuery,
+		"Show the creation time in unix epoch form of new or pending builds by namespace, name, and phase",
+		[]string{"namespace", "name", "phase"},
+		nil,
+	)
 	bc             = buildCollector{}
 	registered     = false
 	failedPhase    = strings.ToLower(string(buildapi.BuildPhaseFailed))
 	errorPhase     = strings.ToLower(string(buildapi.BuildPhaseError))
 	cancelledPhase = strings.ToLower(string(buildapi.BuildPhaseCancelled))
 	completePhase  = strings.ToLower(string(buildapi.BuildPhaseComplete))
 )
@@ -77,46 +93,62 @@ func (bc *buildCollector) Collect(ch chan<- prometheus.Metric) {
 
 	// since we do not collect terminal build metrics on a per build basis, collectBuild will return counts
 	// to be added to the total amount posted to prometheus
-	var failed, error, cancelled, complete int
+	var cancelledCount, completeCount int
+	reasons := map[string]int{}
 	for _, b := range result {
-		f, e, cc, cp := bc.collectBuild(ch, b)
-		failed = failed + f
-		error = error + e
-		cancelled = cancelled + cc
-		complete = complete + cp
+		cc, cp, r := bc.collectBuild(ch, b)
+		for key, value := range r {
+			reasons[key] = reasons[key] + value
+		}
+		cancelledCount = cancelledCount + cc
+		completeCount = completeCount + cp
 	}
+	// explicitly note there are no failed builds
+	if len(reasons) == 0 {
+		addCountGauge(ch, failedBuildCountDesc, "", float64(0))
+	}
+	for reason, count := range reasons {
+		addCountGauge(ch, failedBuildCountDesc, reason, float64(count))
+	}
-	addCountGauge(ch, terminalBuildCountDesc, failedPhase, float64(failed))
-	addCountGauge(ch, terminalBuildCountDesc, errorPhase, float64(error))
-	addCountGauge(ch, terminalBuildCountDesc, cancelledPhase, float64(cancelled))
-	addCountGauge(ch, terminalBuildCountDesc, completePhase, float64(complete))
+	addCountGauge(ch, terminalBuildCountDesc, cancelledPhase, float64(cancelledCount))
+	addCountGauge(ch, terminalBuildCountDesc, completePhase, float64(completeCount))
 }
 
-func addCountGauge(ch chan<- prometheus.Metric, desc *prometheus.Desc, phase string, v float64) {
-	lv := []string{phase}
+func addCountGauge(ch chan<- prometheus.Metric, desc *prometheus.Desc, label string, v float64) {
+	lv := []string{label}
 	ch <- prometheus.MustNewConstMetric(desc, prometheus.GaugeValue, v, lv...)
 }
 
-func addTimeGauge(ch chan<- prometheus.Metric, b *buildapi.Build, desc *prometheus.Desc) {
-	if b.Status.StartTimestamp != nil {
-		lv := []string{b.ObjectMeta.Namespace, b.ObjectMeta.Name, strings.ToLower(string(b.Status.Phase))}
-		ch <- prometheus.MustNewConstMetric(desc, prometheus.GaugeValue, float64(b.Status.StartTimestamp.Unix()), lv...)
+func addTimeGauge(ch chan<- prometheus.Metric, b *buildapi.Build, time *metav1.Time, desc *prometheus.Desc, phase string) {
+	if time != nil {
+		lv := []string{b.ObjectMeta.Namespace, b.ObjectMeta.Name}
+		if len(phase) > 0 {
+			lv = append(lv, strings.ToLower(phase))
+		}
+		ch <- prometheus.MustNewConstMetric(desc, prometheus.GaugeValue, float64(time.Unix()), lv...)
 	}
 }
 
-func (bc *buildCollector) collectBuild(ch chan<- prometheus.Metric, b *buildapi.Build) (failed, error, cancelled, complete int) {
+func (bc *buildCollector) collectBuild(ch chan<- prometheus.Metric, b *buildapi.Build) (cancelledCount, completeCount int, reasonsCount map[string]int) {
 
+	reasonsCount = map[string]int{}
 	switch b.Status.Phase {
 	// remember, new and pending builds don't have a start time
+	case buildapi.BuildPhaseNew:
+	case buildapi.BuildPhasePending:
+		addTimeGauge(ch, b, &b.CreationTimestamp, newPendingBuildCountDesc, string(b.Status.Phase))
 	case buildapi.BuildPhaseRunning:
-		addTimeGauge(ch, b, activeBuildCountDesc)
+		addTimeGauge(ch, b, b.Status.StartTimestamp, activeBuildCountDesc, "")
 	case buildapi.BuildPhaseFailed:
-		failed++
+		// currently only failed builds have reasons
+		reasonsCount[string(b.Status.Reason)] = 1
 	case buildapi.BuildPhaseError:
-		error++
+		// it was decided to couple this one under failed, using the custom 'BuildPodError'
+		reasonsCount[errorBuildReason] = 1
 	case buildapi.BuildPhaseCancelled:
-		cancelled++
+		cancelledCount++
 	case buildapi.BuildPhaseComplete:
-		complete++
+		completeCount++
 	}
 	return
 }
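
For context, the collector in this file follows the standard custom-collector pattern from client_golang: Describe advertises the Desc objects, and Collect emits fresh const gauges on every scrape via MustNewConstMetric. A stripped-down, self-contained sketch of that pattern is below; the metric name and the hard-coded reason counts are purely illustrative, whereas the real collector pulls builds from a lister as shown in the diff above:

```go
package main

import (
	"log"
	"net/http"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promhttp"
)

// Descriptor created once, reused on every scrape.
var demoFailedDesc = prometheus.NewDesc(
	"openshift_build_failed_phase_total",
	"Counts total failed builds by reason",
	[]string{"reason"},
	nil,
)

type demoCollector struct{}

// Describe advertises every descriptor this collector can emit.
func (c demoCollector) Describe(ch chan<- *prometheus.Desc) {
	ch <- demoFailedDesc
}

// Collect emits one const gauge per failure reason on each scrape.
func (c demoCollector) Collect(ch chan<- prometheus.Metric) {
	// Hard-coded counts stand in for the per-reason totals the real
	// collector accumulates from the build lister.
	reasons := map[string]int{"BuildPodError": 2, "FetchSourceFailed": 1}
	for reason, count := range reasons {
		ch <- prometheus.MustNewConstMetric(demoFailedDesc, prometheus.GaugeValue, float64(count), reason)
	}
}

func main() {
	prometheus.MustRegister(demoCollector{})
	http.Handle("/metrics", promhttp.Handler())
	log.Fatal(http.ListenAndServe(":8080", nil))
}
```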