metrics, readme changes to prep for prometheus alerts for openshift build subsystem
gabemontero committed Sep 27, 2017
1 parent 6d590d6 commit 456dce5
Showing 3 changed files with 210 additions and 129 deletions.
26 changes: 26 additions & 0 deletions examples/prometheus/README.md
@@ -116,3 +116,29 @@ Returns a running count (not a rate) of docker operations that have failed since
> kubelet_pleg_relist_latency_microseconds

Returns PLEG (pod lifecycle event generator) latency metrics. This represents the latency experienced by calls from the kubelet to the container runtime (i.e. docker or CRI-O). High PLEG latency is often related to disk I/O performance on the docker storage partition.

### OpenShift build related queries

> count(openshift_build_running_phase_start_time_seconds{} < time() - 600)

Returns the number of builds that have been running for more than 10 minutes (600 seconds).

> count(openshift_build_new_pending_phase_creation_time_seconds{} < time() - 600)

Returns the number of builds that have been waiting at least 10 minutes (600 seconds) to start.

> sum(openshift_build_failed_phase_total{})

Returns the number of failed builds, regardless of the failure reason.

> sum(openshift_build_terminal_phase_total{phase="complete"})

Returns the number of successfully completed builds.

> openshift_build_failed_phase_total{}

Returns the latest totals, per failure reason, for any failed builds.

> openshift_build_failed_phase_total{} offset 5m

Returns the failed build totals, per failure reason, from 5 minutes ago.
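
These queries can also be exercised programmatically when prototyping alert thresholds. Below is a minimal sketch, not from this repository, that evaluates the failed-build query through the Prometheus HTTP API using prometheus/client_golang; the Prometheus address is a placeholder, and the three-value return from Query assumes a reasonably current client_golang release.

```go
package main

import (
	"context"
	"fmt"
	"time"

	"github.com/prometheus/client_golang/api"
	promv1 "github.com/prometheus/client_golang/api/prometheus/v1"
)

func main() {
	// Placeholder address; point this at the Prometheus instance scraping the OpenShift master.
	client, err := api.NewClient(api.Config{Address: "http://prometheus.example.com:9090"})
	if err != nil {
		panic(err)
	}
	promAPI := promv1.NewAPI(client)

	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
	defer cancel()

	// Total failed builds, regardless of reason, as described above.
	result, warnings, err := promAPI.Query(ctx, `sum(openshift_build_failed_phase_total{})`, time.Now())
	if err != nil {
		panic(err)
	}
	if len(warnings) > 0 {
		fmt.Println("warnings:", warnings)
	}
	fmt.Println("failed builds:", result)
}
```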
105 changes: 68 additions & 37 deletions pkg/build/metrics/prometheus/metrics.go
@@ -1,43 +1,59 @@
 package prometheus
 
 import (
-	kselector "k8s.io/apimachinery/pkg/labels"
+	"strings"
 
 	"github.com/golang/glog"
 
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	kselector "k8s.io/apimachinery/pkg/labels"
+
 	buildapi "github.com/openshift/origin/pkg/build/apis/build"
 	internalversion "github.com/openshift/origin/pkg/build/generated/listers/build/internalversion"
 	"github.com/prometheus/client_golang/prometheus"
 )
 
 const (
-	separator               = "_"
-	buildSubsystem          = "openshift_build"
-	terminalBuildCount      = "terminal_phase_total"
-	terminalBuildCountQuery = buildSubsystem + separator + terminalBuildCount
-	activeBuildCount        = "running_phase_start_time_seconds"
-	activeBuildCountQuery   = buildSubsystem + separator + activeBuildCount
+	separator                 = "_"
+	buildSubsystem            = "openshift_build"
+	terminalBuildCount        = "terminal_phase_total"
+	terminalBuildCountQuery   = buildSubsystem + separator + terminalBuildCount
+	failedBuildCount          = "failed_phase_total"
+	failedBuildCountQuery     = buildSubsystem + separator + failedBuildCount
+	activeBuildCount          = "running_phase_start_time_seconds"
+	activeBuildCountQuery     = buildSubsystem + separator + activeBuildCount
+	newPendingBuildCount      = "new_pending_phase_creation_time_seconds"
+	newPendingBuildCountQuery = buildSubsystem + separator + newPendingBuildCount
+	errorBuildReason          = "BuildPodError"
 )
 
 var (
-	// decided not to have a separate counter for failed builds, which have reasons,
-	// vs. the other "finished" builds phases, where the reason is not set
 	terminalBuildCountDesc = prometheus.NewDesc(
-		buildSubsystem+separator+terminalBuildCount,
-		"Counts total terminal builds by phase",
+		terminalBuildCountQuery,
+		"Counts total successful/aborted builds by phase",
 		[]string{"phase"},
 		nil,
 	)
+	failedBuildCountDesc = prometheus.NewDesc(
+		failedBuildCountQuery,
+		"Counts total failed builds by reason",
+		[]string{"reason"},
+		nil,
+	)
 	activeBuildCountDesc = prometheus.NewDesc(
-		buildSubsystem+separator+activeBuildCount,
-		"Show the start time in unix epoch form of running builds by namespace, name, and phase",
+		activeBuildCountQuery,
+		"Show the start time in unix epoch form of running builds by namespace and name",
 		[]string{"namespace", "name"},
 		nil,
 	)
+	newPendingBuildCountDesc = prometheus.NewDesc(
+		newPendingBuildCountQuery,
+		"Show the creation time in unix epoch form of new or pending builds by namespace, name, and phase",
+		[]string{"namespace", "name", "phase"},
+		nil,
+	)
 	bc             = buildCollector{}
 	registered     = false
 	failedPhase    = strings.ToLower(string(buildapi.BuildPhaseFailed))
 	errorPhase     = strings.ToLower(string(buildapi.BuildPhaseError))
 	cancelledPhase = strings.ToLower(string(buildapi.BuildPhaseCancelled))
 	completePhase  = strings.ToLower(string(buildapi.BuildPhaseComplete))
 )
@@ -77,46 +93,61 @@ func (bc *buildCollector) Collect(ch chan<- prometheus.Metric) {

 	// since we do not collect terminal build metrics on a per build basis, collectBuild will return counts
 	// to be added to the total amount posted to prometheus
-	var failed, error, cancelled, complete int
+	var cancelledCount, completeCount int
+	reasons := map[string]int{}
 	for _, b := range result {
-		f, e, cc, cp := bc.collectBuild(ch, b)
-		failed = failed + f
-		error = error + e
-		cancelled = cancelled + cc
-		complete = complete + cp
+		cc, cp, r := bc.collectBuild(ch, b)
+		for key, value := range r {
+			reasons[key] = reasons[key] + value
+		}
+		cancelledCount = cancelledCount + cc
+		completeCount = completeCount + cp
 	}
+	// explicitly note there are no failed builds
+	if len(reasons) == 0 {
+		addCountGauge(ch, failedBuildCountDesc, "", float64(0))
+	}
+	for reason, count := range reasons {
+		addCountGauge(ch, failedBuildCountDesc, reason, float64(count))
+	}
-	addCountGauge(ch, terminalBuildCountDesc, failedPhase, float64(failed))
-	addCountGauge(ch, terminalBuildCountDesc, errorPhase, float64(error))
-	addCountGauge(ch, terminalBuildCountDesc, cancelledPhase, float64(cancelled))
-	addCountGauge(ch, terminalBuildCountDesc, completePhase, float64(complete))
+	addCountGauge(ch, terminalBuildCountDesc, cancelledPhase, float64(cancelledCount))
+	addCountGauge(ch, terminalBuildCountDesc, completePhase, float64(completeCount))
 }
 
-func addCountGauge(ch chan<- prometheus.Metric, desc *prometheus.Desc, phase string, v float64) {
-	lv := []string{phase}
+func addCountGauge(ch chan<- prometheus.Metric, desc *prometheus.Desc, label string, v float64) {
+	lv := []string{label}
 	ch <- prometheus.MustNewConstMetric(desc, prometheus.GaugeValue, v, lv...)
 }
 
-func addTimeGauge(ch chan<- prometheus.Metric, b *buildapi.Build, desc *prometheus.Desc) {
-	if b.Status.StartTimestamp != nil {
-		lv := []string{b.ObjectMeta.Namespace, b.ObjectMeta.Name, strings.ToLower(string(b.Status.Phase))}
-		ch <- prometheus.MustNewConstMetric(desc, prometheus.GaugeValue, float64(b.Status.StartTimestamp.Unix()), lv...)
+func addTimeGauge(ch chan<- prometheus.Metric, b *buildapi.Build, time *metav1.Time, desc *prometheus.Desc, phase string) {
+	if time != nil {
+		lv := []string{b.ObjectMeta.Namespace, b.ObjectMeta.Name}
+		if len(phase) > 0 {
+			lv = append(lv, strings.ToLower(phase))
+		}
+		ch <- prometheus.MustNewConstMetric(desc, prometheus.GaugeValue, float64(time.Unix()), lv...)
 	}
 }
 
-func (bc *buildCollector) collectBuild(ch chan<- prometheus.Metric, b *buildapi.Build) (failed, error, cancelled, complete int) {
+func (bc *buildCollector) collectBuild(ch chan<- prometheus.Metric, b *buildapi.Build) (cancelledCount, completeCount int, reasonsCount map[string]int) {
+	reasonsCount = map[string]int{}
 	switch b.Status.Phase {
 	// remember, new and pending builds don't have a start time
+	case buildapi.BuildPhaseNew:
+	case buildapi.BuildPhasePending:
+		addTimeGauge(ch, b, &b.CreationTimestamp, newPendingBuildCountDesc, string(b.Status.Phase))
 	case buildapi.BuildPhaseRunning:
-		addTimeGauge(ch, b, activeBuildCountDesc)
+		addTimeGauge(ch, b, b.Status.StartTimestamp, activeBuildCountDesc, "")
 	case buildapi.BuildPhaseFailed:
-		failed++
+		// currently only failed builds have reasons
+		reasonsCount[string(b.Status.Reason)] = 1
 	case buildapi.BuildPhaseError:
-		error++
+		// it was decided to couple this one under failed, using the custom 'BuildPodError'
+		reasonsCount[errorBuildReason] = 1
 	case buildapi.BuildPhaseCancelled:
-		cancelled++
+		cancelledCount++
 	case buildapi.BuildPhaseComplete:
-		complete++
+		completeCount++
 	}
 	return
 }
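
For context, the collector above follows the standard prometheus.Collector pattern: Describe advertises each *prometheus.Desc, Collect emits const gauges on every scrape, and the collector is registered once and served over HTTP. The sketch below shows that wiring with an illustrative stand-in collector and hard-coded failure counts; the real buildCollector is instead fed by a build lister, and the reason strings here are examples only.

```go
package main

import (
	"log"
	"net/http"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promhttp"
)

// exampleDesc mirrors the shape of openshift_build_failed_phase_total: one gauge per reason label.
var exampleDesc = prometheus.NewDesc(
	"openshift_build_failed_phase_total",
	"Counts total failed builds by reason",
	[]string{"reason"},
	nil,
)

// exampleCollector is a stand-in for buildCollector; a real implementation would
// list builds from the cluster instead of reading this fixed map.
type exampleCollector struct {
	failuresByReason map[string]int
}

func (c *exampleCollector) Describe(ch chan<- *prometheus.Desc) {
	ch <- exampleDesc
}

func (c *exampleCollector) Collect(ch chan<- prometheus.Metric) {
	for reason, count := range c.failuresByReason {
		ch <- prometheus.MustNewConstMetric(exampleDesc, prometheus.GaugeValue, float64(count), reason)
	}
}

func main() {
	prometheus.MustRegister(&exampleCollector{
		failuresByReason: map[string]int{"GenericBuildFailed": 2, "BuildPodError": 1},
	})
	http.Handle("/metrics", promhttp.Handler())
	log.Fatal(http.ListenAndServe(":8080", nil))
}
```

Because the gauges are rebuilt from scratch on each Collect call, the exported values always reflect whatever the collector observes at scrape time rather than an accumulated counter.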