Merge pull request #16495 from gabemontero/build-alert
Automatic merge from submit-queue.

prometheus alerts for openshift build subsystem

https://trello.com/c/RskNHpfh/1334-5-prometheus-alerts-for-build-metrics

A WIP initial pass at alerts for the openshift build subsystem

@openshift/devex @smarterclayton @zgalor @moolitayer @mfojtik ptal, defer if bandwidth dictates, and/or pull in others as you each deem fit

Disclaimers:
1) I'm still debating the pros/cons of these alerts with https://docs.google.com/document/d/199PqyG3UsyXlwieHaqbGiWVa8eMWi8zzAn0YfcApr8Q/edit#heading=h.2efurbugauf in mind

2) still debating the template parameters / defaults for the various thresholds ... I still have a to-do to revisit potential default values with ops contacts, based on their existing Zabbix monitoring

3) still debating the severity as well

4) based on the activity in #16026 I did not include the `miqTarget` annotation

I also removed the space in the existing alert name based on how I interpreted various naming conventions.

Other than the query against the alerts URI, the extended test changes stemmed from flakiness encountered during testing that was unrelated to adding the alerts.

thanks
openshift-merge-robot committed Sep 28, 2017
2 parents dbf1fc3 + cf023ef commit f86e504
Showing 3 changed files with 211 additions and 129 deletions.
26 changes: 26 additions & 0 deletions examples/prometheus/README.md
@@ -116,3 +116,29 @@ Returns a running count (not a rate) of docker operations that have failed since the kubelet started.
> kubelet_pleg_relist_latency_microseconds
Returns PLEG (pod lifecycle event generator) latency metrics. This represents the latency experienced by calls from the kubelet to the container runtime (i.e. docker or CRI-O). High PLEG latency is often related to disk I/O performance on the docker storage partition.

### OpenShift build related queries

> count(openshift_build_running_phase_start_time_seconds{} < time() - 600)
Returns the number of builds that have been running for more than 10 minutes (600 seconds).

> count(openshift_build_new_pending_phase_creation_time_seconds{} < time() - 600)
Returns the number of builds that have been waiting at least 10 minutes (600 seconds) to start.

> sum(openshift_build_failed_phase_total{})
Returns the number of failed builds, regardless of the failure reason.

> sum(openshift_build_terminal_phase_total{phase="complete"})
Returns the number of successfully completed builds.

> openshift_build_failed_phase_total{}
Returns the latest totals, per failure reason, for any failed builds.

> openshift_build_failed_phase_total{} offset 5m
Returns the failed builds totals, per failure reason, from 5 minutes ago.
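
The same queries can also be issued programmatically against the Prometheus API. A minimal sketch, assuming a reachable Prometheus endpoint (the address below is a placeholder) and a recent release of the prometheus/client_golang query API, whose Query call returns warnings alongside the result (older releases return only the value and an error):

```go
package main

import (
	"context"
	"fmt"
	"log"
	"time"

	"github.com/prometheus/client_golang/api"
	promv1 "github.com/prometheus/client_golang/api/prometheus/v1"
)

func main() {
	// Placeholder address; point this at the cluster's Prometheus route.
	client, err := api.NewClient(api.Config{Address: "https://prometheus.example.com"})
	if err != nil {
		log.Fatal(err)
	}
	promAPI := promv1.NewAPI(client)

	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
	defer cancel()

	// Builds that have been running for more than 10 minutes (600 seconds).
	query := `count(openshift_build_running_phase_start_time_seconds{} < time() - 600)`
	result, warnings, err := promAPI.Query(ctx, query, time.Now())
	if err != nil {
		log.Fatal(err)
	}
	for _, w := range warnings {
		fmt.Println("warning:", w)
	}
	fmt.Println(result)
}
```

For an authenticated cluster endpoint, api.Config also accepts a custom RoundTripper, which is where a bearer token would be injected.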
106 changes: 69 additions & 37 deletions pkg/build/metrics/prometheus/metrics.go
@@ -1,43 +1,59 @@
 package prometheus
 
 import (
-	kselector "k8s.io/apimachinery/pkg/labels"
 	"strings"
 
 	"github.com/golang/glog"
 
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	kselector "k8s.io/apimachinery/pkg/labels"
 
 	buildapi "github.com/openshift/origin/pkg/build/apis/build"
 	internalversion "github.com/openshift/origin/pkg/build/generated/listers/build/internalversion"
 	"github.com/prometheus/client_golang/prometheus"
 )
 
 const (
-	separator               = "_"
-	buildSubsystem          = "openshift_build"
-	terminalBuildCount      = "terminal_phase_total"
-	terminalBuildCountQuery = buildSubsystem + separator + terminalBuildCount
-	activeBuildCount        = "running_phase_start_time_seconds"
-	activeBuildCountQuery   = buildSubsystem + separator + activeBuildCount
+	separator                 = "_"
+	buildSubsystem            = "openshift_build"
+	terminalBuildCount        = "terminal_phase_total"
+	terminalBuildCountQuery   = buildSubsystem + separator + terminalBuildCount
+	failedBuildCount          = "failed_phase_total"
+	failedBuildCountQuery     = buildSubsystem + separator + failedBuildCount
+	activeBuildCount          = "running_phase_start_time_seconds"
+	activeBuildCountQuery     = buildSubsystem + separator + activeBuildCount
+	newPendingBuildCount      = "new_pending_phase_creation_time_seconds"
+	newPendingBuildCountQuery = buildSubsystem + separator + newPendingBuildCount
+	errorBuildReason          = "BuildPodError"
 )
 
 var (
+	// decided not to have a separate counter for failed builds, which have reasons,
+	// vs. the other "finished" builds phases, where the reason is not set
 	terminalBuildCountDesc = prometheus.NewDesc(
-		buildSubsystem+separator+terminalBuildCount,
-		"Counts total terminal builds by phase",
+		terminalBuildCountQuery,
+		"Counts total successful/aborted builds by phase",
 		[]string{"phase"},
 		nil,
 	)
+	failedBuildCountDesc = prometheus.NewDesc(
+		failedBuildCountQuery,
+		"Counts total failed builds by reason",
+		[]string{"reason"},
+		nil,
+	)
 	activeBuildCountDesc = prometheus.NewDesc(
-		buildSubsystem+separator+activeBuildCount,
-		"Show the start time in unix epoch form of running builds by namespace, name, and phase",
+		activeBuildCountQuery,
+		"Show the start time in unix epoch form of running builds by namespace and name",
 		[]string{"namespace", "name"},
 		nil,
 	)
+	newPendingBuildCountDesc = prometheus.NewDesc(
+		newPendingBuildCountQuery,
+		"Show the creation time in unix epoch form of new or pending builds by namespace, name, and phase",
+		[]string{"namespace", "name", "phase"},
+		nil,
+	)
 	bc             = buildCollector{}
 	registered     = false
 	failedPhase    = strings.ToLower(string(buildapi.BuildPhaseFailed))
 	errorPhase     = strings.ToLower(string(buildapi.BuildPhaseError))
 	cancelledPhase = strings.ToLower(string(buildapi.BuildPhaseCancelled))
 	completePhase  = strings.ToLower(string(buildapi.BuildPhaseComplete))
 )
@@ -77,46 +93,62 @@ func (bc *buildCollector) Collect(ch chan<- prometheus.Metric) {
 
 	// since we do not collect terminal build metrics on a per build basis, collectBuild will return counts
 	// to be added to the total amount posted to prometheus
-	var failed, error, cancelled, complete int
+	var cancelledCount, completeCount int
+	reasons := map[string]int{}
 	for _, b := range result {
-		f, e, cc, cp := bc.collectBuild(ch, b)
-		failed = failed + f
-		error = error + e
-		cancelled = cancelled + cc
-		complete = complete + cp
+		cc, cp, r := bc.collectBuild(ch, b)
+		for key, value := range r {
+			reasons[key] = reasons[key] + value
+		}
+		cancelledCount = cancelledCount + cc
+		completeCount = completeCount + cp
 	}
+	// explicitly note there are no failed builds
+	if len(reasons) == 0 {
+		addCountGauge(ch, failedBuildCountDesc, "", float64(0))
+	}
+	for reason, count := range reasons {
+		addCountGauge(ch, failedBuildCountDesc, reason, float64(count))
+	}
-	addCountGauge(ch, terminalBuildCountDesc, failedPhase, float64(failed))
-	addCountGauge(ch, terminalBuildCountDesc, errorPhase, float64(error))
-	addCountGauge(ch, terminalBuildCountDesc, cancelledPhase, float64(cancelled))
-	addCountGauge(ch, terminalBuildCountDesc, completePhase, float64(complete))
+	addCountGauge(ch, terminalBuildCountDesc, cancelledPhase, float64(cancelledCount))
+	addCountGauge(ch, terminalBuildCountDesc, completePhase, float64(completeCount))
 }
 
-func addCountGauge(ch chan<- prometheus.Metric, desc *prometheus.Desc, phase string, v float64) {
-	lv := []string{phase}
+func addCountGauge(ch chan<- prometheus.Metric, desc *prometheus.Desc, label string, v float64) {
+	lv := []string{label}
 	ch <- prometheus.MustNewConstMetric(desc, prometheus.GaugeValue, v, lv...)
 }
 
-func addTimeGauge(ch chan<- prometheus.Metric, b *buildapi.Build, desc *prometheus.Desc) {
-	if b.Status.StartTimestamp != nil {
-		lv := []string{b.ObjectMeta.Namespace, b.ObjectMeta.Name, strings.ToLower(string(b.Status.Phase))}
-		ch <- prometheus.MustNewConstMetric(desc, prometheus.GaugeValue, float64(b.Status.StartTimestamp.Unix()), lv...)
+func addTimeGauge(ch chan<- prometheus.Metric, b *buildapi.Build, time *metav1.Time, desc *prometheus.Desc, phase string) {
+	if time != nil {
+		lv := []string{b.ObjectMeta.Namespace, b.ObjectMeta.Name}
+		if len(phase) > 0 {
+			lv = append(lv, strings.ToLower(phase))
+		}
+		ch <- prometheus.MustNewConstMetric(desc, prometheus.GaugeValue, float64(time.Unix()), lv...)
 	}
 }
 
-func (bc *buildCollector) collectBuild(ch chan<- prometheus.Metric, b *buildapi.Build) (failed, error, cancelled, complete int) {
+func (bc *buildCollector) collectBuild(ch chan<- prometheus.Metric, b *buildapi.Build) (cancelledCount, completeCount int, reasonsCount map[string]int) {
 
+	reasonsCount = map[string]int{}
 	switch b.Status.Phase {
 	// remember, new and pending builds don't have a start time
+	case buildapi.BuildPhaseNew:
+	case buildapi.BuildPhasePending:
+		addTimeGauge(ch, b, &b.CreationTimestamp, newPendingBuildCountDesc, string(b.Status.Phase))
 	case buildapi.BuildPhaseRunning:
-		addTimeGauge(ch, b, activeBuildCountDesc)
+		addTimeGauge(ch, b, b.Status.StartTimestamp, activeBuildCountDesc, "")
 	case buildapi.BuildPhaseFailed:
-		failed++
+		// currently only failed builds have reasons
+		reasonsCount[string(b.Status.Reason)] = 1
 	case buildapi.BuildPhaseError:
-		error++
+		// it was decided to couple this one under failed, using the custom 'BuildPodError'
+		reasonsCount[errorBuildReason] = 1
 	case buildapi.BuildPhaseCancelled:
-		cancelled++
+		cancelledCount++
 	case buildapi.BuildPhaseComplete:
-		complete++
+		completeCount++
 	}
 	return
 }
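
For context, the collector in this file follows the standard custom-collector pattern from client_golang: Describe advertises the Desc objects, and Collect emits fresh const gauges on every scrape via MustNewConstMetric. A stripped-down, self-contained sketch of that pattern is below; the metric name and the hard-coded reason counts are purely illustrative, whereas the real collector pulls builds from a lister as shown in the diff above:

```go
package main

import (
	"log"
	"net/http"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promhttp"
)

// Descriptor created once, reused on every scrape.
var demoFailedDesc = prometheus.NewDesc(
	"openshift_build_failed_phase_total",
	"Counts total failed builds by reason",
	[]string{"reason"},
	nil,
)

type demoCollector struct{}

// Describe advertises every descriptor this collector can emit.
func (c demoCollector) Describe(ch chan<- *prometheus.Desc) {
	ch <- demoFailedDesc
}

// Collect emits one const gauge per failure reason on each scrape.
func (c demoCollector) Collect(ch chan<- prometheus.Metric) {
	// Hard-coded counts stand in for the per-reason totals the real
	// collector accumulates from the build lister.
	reasons := map[string]int{"BuildPodError": 2, "FetchSourceFailed": 1}
	for reason, count := range reasons {
		ch <- prometheus.MustNewConstMetric(demoFailedDesc, prometheus.GaugeValue, float64(count), reason)
	}
}

func main() {
	prometheus.MustRegister(demoCollector{})
	http.Handle("/metrics", promhttp.Handler())
	log.Fatal(http.ListenAndServe(":8080", nil))
}
```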