Skip to content

Commit

Permalink
metrics, readme changes to prep for prometheus alerts for openshift b…
Browse files Browse the repository at this point in the history
…uild subsystem
  • Loading branch information
gabemontero committed Sep 26, 2017
1 parent e647638 commit 2126108
Show file tree
Hide file tree
Showing 3 changed files with 204 additions and 114 deletions.
26 changes: 26 additions & 0 deletions examples/prometheus/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -116,3 +116,29 @@ Returns a running count (not a rate) of docker operations that have failed since
> kubelet_pleg_relist_latency_microseconds
Returns PLEG (pod lifecycle event generator) latency metrics. This represents the latency experienced by calls from the kubelet to the container runtime (i.e. docker or CRI-O). High PLEG latency is often related to disk I/O performance on the docker storage partition.

### OpenShift build related queries

> count(openshift_build_running_phase_start_time_seconds{} < time() - 600)
Returns the number of builds that have been running for more than 10 minutes (600 seconds).

> count(openshift_build_new_pending_phase_creation_time_seconds{} < time() - 600)
Returns the number of builds that have been waiting at least 10 minutes (600 seconds) to start.

> sum(openshift_build_failed_phase_total{})
Returns the total number of failed builds, summed across all failure reasons.

> sum(openshift_build_terminal_phase_total{phase="complete"})
Returns the number of successfully completed builds.

> openshift_build_failed_phase_total{}
Returns the latest totals for failed builds.

> openshift_build_failed_phase_total{} offset 5m
Returns the failed builds totals from 5 minutes ago.
87 changes: 62 additions & 25 deletions pkg/build/metrics/prometheus/metrics.go
Original file line number Diff line number Diff line change
@@ -1,45 +1,64 @@
package prometheus

import (
kselector "k8s.io/apimachinery/pkg/labels"
"strings"

"github.com/golang/glog"

metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
kselector "k8s.io/apimachinery/pkg/labels"

buildapi "github.com/openshift/origin/pkg/build/apis/build"
internalversion "github.com/openshift/origin/pkg/build/generated/listers/build/internalversion"
"github.com/prometheus/client_golang/prometheus"
)

// Names of the build metrics. Each *Query constant is the full metric name,
// formed by joining buildSubsystem, separator, and the metric's short name.
// NOTE(review): this is a rendered diff without +/- markers; the first six
// entries appear to be the pre-change lines, and they are repeated right after,
// together with the added failedBuildCount and newPendingBuildCount entries.
const (
separator = "_"
buildSubsystem = "openshift_build"
terminalBuildCount = "terminal_phase_total"
terminalBuildCountQuery = buildSubsystem + separator + terminalBuildCount
activeBuildCount = "running_phase_start_time_seconds"
activeBuildCountQuery = buildSubsystem + separator + activeBuildCount
separator = "_"
buildSubsystem = "openshift_build"
terminalBuildCount = "terminal_phase_total"
terminalBuildCountQuery = buildSubsystem + separator + terminalBuildCount
failedBuildCount = "failed_phase_total"
failedBuildCountQuery = buildSubsystem + separator + failedBuildCount
activeBuildCount = "running_phase_start_time_seconds"
activeBuildCountQuery = buildSubsystem + separator + activeBuildCount
newPendingBuildCount = "new_pending_phase_creation_time_seconds"
newPendingBuildCountQuery = buildSubsystem + separator + newPendingBuildCount
)

// Package-level metric descriptors, the collector instance, and the lowercased
// phase/reason strings.
// NOTE(review): this is a rendered diff without +/- markers; for
// terminalBuildCountDesc and activeBuildCountDesc two name arguments and two
// help strings appear back to back, which look like the before/after versions
// of the same two lines.
var (
// decided not to have a separate counter for failed builds, which have reasons,
// vs. the other "finished" builds phases, where the reason is not set
terminalBuildCountDesc = prometheus.NewDesc(
buildSubsystem+separator+terminalBuildCount,
"Counts total teriminal builds by phase",
terminalBuildCountQuery,
"Counts total successful/aborted builds by phase",
[]string{"phase"},
nil,
)
// failed builds are described separately, labelled by "reason" instead of "phase"
failedBuildCountDesc = prometheus.NewDesc(
failedBuildCountQuery,
"Counts total failed builds by reason",
[]string{"reason"},
nil,
)
activeBuildCountDesc = prometheus.NewDesc(
buildSubsystem+separator+activeBuildCount,
"Show the start time in unix epoch form of running builds by namespace, name, and phase",
activeBuildCountQuery,
"Show the start time in unix epoch form of running builds by namespace and name",
[]string{"namespace", "name"},
nil,
)
// unlike activeBuildCountDesc, this descriptor declares a third label, "phase"
newPendingBuildCountDesc = prometheus.NewDesc(
newPendingBuildCountQuery,
"Show the creation time in unix epoch form of new or pending builds by namespace, name, and phase",
[]string{"namespace", "name", "phase"},
nil,
)
bc = buildCollector{}
registered = false
// lowercased forms of the buildapi phase and reason constants
failedPhase = strings.ToLower(string(buildapi.BuildPhaseFailed))
errorPhase = strings.ToLower(string(buildapi.BuildPhaseError))
cancelledPhase = strings.ToLower(string(buildapi.BuildPhaseCancelled))
completePhase = strings.ToLower(string(buildapi.BuildPhaseComplete))
errorReason = strings.ToLower(string(buildapi.StatusReasonError))
)

type buildCollector struct {
Expand Down Expand Up @@ -77,40 +96,58 @@ func (bc *buildCollector) Collect(ch chan<- prometheus.Metric) {

// since we do not collect terminal build metrics on a per build basis, collectBuild will return counts
// to be added to the total amount posted to prometheus
var failed, error, cancelled, complete int
var error, cancelled, complete int
reasons := map[string]int{}
for _, b := range result {
f, e, cc, cp := bc.collectBuild(ch, b)
failed = failed + f
e, cc, cp, r := bc.collectBuild(ch, b)
for key, value := range r {
count := reasons[key]
count = count + value
reasons[key] = count
}
error = error + e
cancelled = cancelled + cc
complete = complete + cp
}
addCountGauge(ch, terminalBuildCountDesc, failedPhase, float64(failed))
// explicitly note there are no failed builds
if len(reasons) == 0 {
addCountGauge(ch, failedBuildCountDesc, "", float64(0))
}
for reason, count := range reasons {
addCountGauge(ch, failedBuildCountDesc, reason, float64(count))
}
addCountGauge(ch, terminalBuildCountDesc, errorPhase, float64(error))
addCountGauge(ch, terminalBuildCountDesc, cancelledPhase, float64(cancelled))
addCountGauge(ch, terminalBuildCountDesc, completePhase, float64(complete))
}

// addCountGauge sends one constant gauge metric for desc on ch, with v as its
// value and a single label value (the third parameter).
// NOTE(review): this is a rendered diff without +/- markers; the two signature
// lines and the two lv assignments look like the before/after versions of the
// same lines, differing only in the parameter name (phase vs. label).
func addCountGauge(ch chan<- prometheus.Metric, desc *prometheus.Desc, phase string, v float64) {
lv := []string{phase}
func addCountGauge(ch chan<- prometheus.Metric, desc *prometheus.Desc, label string, v float64) {
lv := []string{label}
ch <- prometheus.MustNewConstMetric(desc, prometheus.GaugeValue, v, lv...)
}

// addTimeGauge sends one constant gauge metric for desc on ch whose value is a
// timestamp in unix seconds, labelled with the build's namespace and name.
// Nothing is sent when the timestamp is nil.
// NOTE(review): this is a rendered diff without +/- markers; two versions are
// interleaved below. The first signature reads b.Status.StartTimestamp and
// always adds the lowercased build phase as a third label value. The second
// takes the timestamp as the time parameter and appends the lowercased phase
// only when phase is non-empty.
// NOTE(review): presumably the number of label values must match the labels
// declared on desc — confirm against the client_golang documentation.
func addTimeGauge(ch chan<- prometheus.Metric, b *buildapi.Build, desc *prometheus.Desc) {
if b.Status.StartTimestamp != nil {
lv := []string{b.ObjectMeta.Namespace, b.ObjectMeta.Name, strings.ToLower(string(b.Status.Phase))}
ch <- prometheus.MustNewConstMetric(desc, prometheus.GaugeValue, float64(b.Status.StartTimestamp.Unix()), lv...)
func addTimeGauge(ch chan<- prometheus.Metric, b *buildapi.Build, time *metav1.Time, desc *prometheus.Desc, phase string) {
if time != nil {
lv := []string{b.ObjectMeta.Namespace, b.ObjectMeta.Name}
// an empty phase means the descriptor has no phase label, so none is added
if len(phase) > 0 {
lv = append(lv, strings.ToLower(phase))
}
ch <- prometheus.MustNewConstMetric(desc, prometheus.GaugeValue, float64(time.Unix()), lv...)
}
}

func (bc *buildCollector) collectBuild(ch chan<- prometheus.Metric, b *buildapi.Build) (failed, error, cancelled, complete int) {
func (bc *buildCollector) collectBuild(ch chan<- prometheus.Metric, b *buildapi.Build) (error, cancelled, complete int, reasons map[string]int) {

switch b.Status.Phase {
// remember, new and pending builds don't have a start time
case buildapi.BuildPhaseNew:
case buildapi.BuildPhasePending:
addTimeGauge(ch, b, &b.CreationTimestamp, newPendingBuildCountDesc, string(b.Status.Phase))
case buildapi.BuildPhaseRunning:
addTimeGauge(ch, b, activeBuildCountDesc)
addTimeGauge(ch, b, b.Status.StartTimestamp, activeBuildCountDesc, "")
case buildapi.BuildPhaseFailed:
failed++
// currently only failed builds have reasons
reasons[string(b.Status.Reason)] = 1
case buildapi.BuildPhaseError:
error++
case buildapi.BuildPhaseCancelled:
Expand Down
Loading

0 comments on commit 2126108

Please sign in to comment.