Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

prometheus alerts for openshift build subsystem #16495

Merged
merged 1 commit into from
Sep 28, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 26 additions & 0 deletions examples/prometheus/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -116,3 +116,29 @@ Returns a running count (not a rate) of docker operations that have failed since
> kubelet_pleg_relist_latency_microseconds

Returns PLEG (pod lifecycle event generator) latency metrics. This represents the latency experienced by calls from the kubelet to the container runtime (i.e. docker or CRI-O). High PLEG latency is often related to disk I/O performance on the docker storage partition.

### OpenShift build related queries

> count(openshift_build_running_phase_start_time_seconds{} < time() - 600)

Returns the number of builds that have been running for more than 10 minutes (600 seconds).

> count(openshift_build_new_pending_phase_creation_time_seconds{} < time() - 600)

Returns the number of builds that have been waiting at least 10 minutes (600 seconds) to start.

> sum(openshift_build_failed_phase_total{})

Returns the number of failed builds, regardless of the failure reason.

> sum(openshift_build_terminal_phase_total{phase="complete"})

Returns the number of successfully completed builds.

> openshift_build_failed_phase_total{}

Returns the latest totals, per failure reason, for any failed builds.

> openshift_build_failed_phase_total{} offset 5m

Returns the failed build totals, per failure reason, as they were 5 minutes ago.
106 changes: 69 additions & 37 deletions pkg/build/metrics/prometheus/metrics.go
Original file line number Diff line number Diff line change
@@ -1,43 +1,59 @@
package prometheus

import (
kselector "k8s.io/apimachinery/pkg/labels"
"strings"

"github.com/golang/glog"

metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
kselector "k8s.io/apimachinery/pkg/labels"

buildapi "github.com/openshift/origin/pkg/build/apis/build"
internalversion "github.com/openshift/origin/pkg/build/generated/listers/build/internalversion"
"github.com/prometheus/client_golang/prometheus"
)

// Metric name building blocks for the build subsystem.
// Each *Count constant is a metric name suffix; the matching *Query constant is
// the full metric name, formed as buildSubsystem + separator + suffix.
// NOTE(review): the first six entries are repeated below with different
// alignment; this looks like the before/after lines of a diff shown together.
const (
separator = "_"
buildSubsystem = "openshift_build"
terminalBuildCount = "terminal_phase_total"
terminalBuildCountQuery = buildSubsystem + separator + terminalBuildCount
activeBuildCount = "running_phase_start_time_seconds"
activeBuildCountQuery = buildSubsystem + separator + activeBuildCount
separator = "_"
buildSubsystem = "openshift_build"
terminalBuildCount = "terminal_phase_total"
terminalBuildCountQuery = buildSubsystem + separator + terminalBuildCount
failedBuildCount = "failed_phase_total"
failedBuildCountQuery = buildSubsystem + separator + failedBuildCount
activeBuildCount = "running_phase_start_time_seconds"
activeBuildCountQuery = buildSubsystem + separator + activeBuildCount
newPendingBuildCount = "new_pending_phase_creation_time_seconds"
newPendingBuildCountQuery = buildSubsystem + separator + newPendingBuildCount
// errorBuildReason is the "reason" value recorded for builds in the Error phase.
errorBuildReason = "BuildPodError"
)

// Metric descriptors and package-level collector state.
var (
// decided not to have a separate counter for failed builds, which have reasons,
// vs. the other "finished" builds phases, where the reason is not set
// terminalBuildCountDesc describes a metric carrying a single "phase" label.
terminalBuildCountDesc = prometheus.NewDesc(
buildSubsystem+separator+terminalBuildCount,
"Counts total terminal builds by phase",
terminalBuildCountQuery,
"Counts total successful/aborted builds by phase",
[]string{"phase"},
nil,
)
// failedBuildCountDesc describes a metric carrying a single "reason" label.
failedBuildCountDesc = prometheus.NewDesc(
failedBuildCountQuery,
"Counts total failed builds by reason",
[]string{"reason"},
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can this dimension just roll into the above gauge?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could, but didn't for specific reasons - see above comment.

nil,
)
// activeBuildCountDesc describes a per-build metric labelled by namespace and name.
activeBuildCountDesc = prometheus.NewDesc(
buildSubsystem+separator+activeBuildCount,
"Show the start time in unix epoch form of running builds by namespace, name, and phase",
activeBuildCountQuery,
"Show the start time in unix epoch form of running builds by namespace and name",
[]string{"namespace", "name"},
nil,
)
// newPendingBuildCountDesc describes a per-build metric labelled by namespace, name, and phase.
newPendingBuildCountDesc = prometheus.NewDesc(
newPendingBuildCountQuery,
"Show the creation time in unix epoch form of new or pending builds by namespace, name, and phase",
[]string{"namespace", "name", "phase"},
nil,
)
bc = buildCollector{}
registered = false
// Lower-cased build phase names, used as values for the "phase" label.
failedPhase = strings.ToLower(string(buildapi.BuildPhaseFailed))
errorPhase = strings.ToLower(string(buildapi.BuildPhaseError))
cancelledPhase = strings.ToLower(string(buildapi.BuildPhaseCancelled))
completePhase = strings.ToLower(string(buildapi.BuildPhaseComplete))
)
Expand Down Expand Up @@ -77,46 +93,62 @@ func (bc *buildCollector) Collect(ch chan<- prometheus.Metric) {

// since we do not collect terminal build metrics on a per build basis, collectBuild will return counts
// to be added to the total amount posted to prometheus
var failed, error, cancelled, complete int
var cancelledCount, completeCount int
reasons := map[string]int{}
for _, b := range result {
f, e, cc, cp := bc.collectBuild(ch, b)
failed = failed + f
error = error + e
cancelled = cancelled + cc
complete = complete + cp
cc, cp, r := bc.collectBuild(ch, b)
for key, value := range r {
reasons[key] = reasons[key] + value
}
cancelledCount = cancelledCount + cc
completeCount = completeCount + cp
}
// explicitly note there are no failed builds
if len(reasons) == 0 {
addCountGauge(ch, failedBuildCountDesc, "", float64(0))
}
for reason, count := range reasons {
addCountGauge(ch, failedBuildCountDesc, reason, float64(count))
}
addCountGauge(ch, terminalBuildCountDesc, failedPhase, float64(failed))
addCountGauge(ch, terminalBuildCountDesc, errorPhase, float64(error))
addCountGauge(ch, terminalBuildCountDesc, cancelledPhase, float64(cancelled))
addCountGauge(ch, terminalBuildCountDesc, completePhase, float64(complete))
addCountGauge(ch, terminalBuildCountDesc, cancelledPhase, float64(cancelledCount))
addCountGauge(ch, terminalBuildCountDesc, completePhase, float64(completeCount))
}

// addCountGauge sends one constant gauge sample for desc on ch, using v as the
// sample value and a single label value (named phase in one signature line and
// label in the other).
// NOTE(review): the descriptor is expected to declare exactly one variable
// label; MustNewConstMetric presumably panics otherwise - confirm against the
// client_golang documentation.
func addCountGauge(ch chan<- prometheus.Metric, desc *prometheus.Desc, phase string, v float64) {
lv := []string{phase}
func addCountGauge(ch chan<- prometheus.Metric, desc *prometheus.Desc, label string, v float64) {
lv := []string{label}
ch <- prometheus.MustNewConstMetric(desc, prometheus.GaugeValue, v, lv...)
}

// addTimeGauge sends one constant gauge sample for build b on ch, whose value
// is a timestamp expressed as Unix seconds. Nothing is sent when the timestamp
// is nil.
// Label values are the build's namespace and name; a lower-cased phase is added
// as a third label value (always in the first variant shown, only when phase is
// non-empty in the second), so the descriptor passed in must declare a matching
// number of labels.
func addTimeGauge(ch chan<- prometheus.Metric, b *buildapi.Build, desc *prometheus.Desc) {
if b.Status.StartTimestamp != nil {
lv := []string{b.ObjectMeta.Namespace, b.ObjectMeta.Name, strings.ToLower(string(b.Status.Phase))}
ch <- prometheus.MustNewConstMetric(desc, prometheus.GaugeValue, float64(b.Status.StartTimestamp.Unix()), lv...)
func addTimeGauge(ch chan<- prometheus.Metric, b *buildapi.Build, time *metav1.Time, desc *prometheus.Desc, phase string) {
if time != nil {
lv := []string{b.ObjectMeta.Namespace, b.ObjectMeta.Name}
// the phase label is optional: an empty phase leaves only namespace and name
if len(phase) > 0 {
lv = append(lv, strings.ToLower(phase))
}
ch <- prometheus.MustNewConstMetric(desc, prometheus.GaugeValue, float64(time.Unix()), lv...)
}
}

func (bc *buildCollector) collectBuild(ch chan<- prometheus.Metric, b *buildapi.Build) (failed, error, cancelled, complete int) {
func (bc *buildCollector) collectBuild(ch chan<- prometheus.Metric, b *buildapi.Build) (cancelledCount, completeCount int, reasonsCount map[string]int) {

reasonsCount = map[string]int{}
switch b.Status.Phase {
// remember, new and pending builds don't have a start time
case buildapi.BuildPhaseNew:
case buildapi.BuildPhasePending:
addTimeGauge(ch, b, &b.CreationTimestamp, newPendingBuildCountDesc, string(b.Status.Phase))
case buildapi.BuildPhaseRunning:
addTimeGauge(ch, b, activeBuildCountDesc)
addTimeGauge(ch, b, b.Status.StartTimestamp, activeBuildCountDesc, "")
case buildapi.BuildPhaseFailed:
failed++
// currently only failed builds have reasons
reasonsCount[string(b.Status.Reason)] = 1
case buildapi.BuildPhaseError:
error++
// it was decided to couple this one under failed, using the custom 'BuildPodError'
reasonsCount[errorBuildReason] = 1
case buildapi.BuildPhaseCancelled:
cancelled++
cancelledCount++
case buildapi.BuildPhaseComplete:
complete++
completeCount++
}
return
}
Loading