prometheus alerts for openshift build subsystem
gabemontero committed Sep 21, 2017
1 parent 97f4072 commit e07d075
Showing 4 changed files with 163 additions and 85 deletions.
23 changes: 21 additions & 2 deletions examples/prometheus/prometheus.yaml
@@ -30,6 +30,15 @@ parameters:
name: SESSION_SECRET
generate: expression
from: "[a-zA-Z0-9]{43}"
- description: The threshold for the active build alert
name: PROMETHEUS_ACTIVE_BUILD_ALERT_THRESHOLD
value: "100"
- description: The allowable active build duration for the active build alert
name: PROMETHEUS_ACTIVE_BUILD_ALERT_DURATION
value: "360"
- description: The allowable ratio of failed to complete builds for the failed build alert
name: PROMETHEUS_FAILED_BUILD_RATIO
value: "0.5"
objects:
# Authorize the prometheus service account to read data about the cluster
- apiVersion: v1
@@ -243,12 +252,22 @@ objects:
- name: example-rules
interval: 30s # defaults to global interval
rules:
- alert: Node Down
- alert: NodeDown
expr: up{job="kubernetes-nodes"} == 0
annotations:
miqTarget: "ContainerNode"
severity: "HIGH"
message: "{{$labels.instance}} is down"
message: "{{$labels.instance}} is down"
- alert: HangingBuild
expr: count(openshift_build_running_phase_start_time_seconds{} < time() - ${PROMETHEUS_ACTIVE_BUILD_ALERT_DURATION}) > ${PROMETHEUS_ACTIVE_BUILD_ALERT_THRESHOLD}
annotations:
severity: "MEDIUM"
message: "{{$labels.instance}} indicates at least ${PROMETHEUS_ACTIVE_BUILD_ALERT_THRESHOLD} OpenShift builds are taking more than ${PROMETHEUS_ACTIVE_BUILD_ALERT_DURATION} seconds to complete"
- alert: BuildFailureRate
expr: (count(openshift_build_terminal_phase_total{phase="failed"}) / count(openshift_build_terminal_phase_total{phase="complete"})) > ${PROMETHEUS_FAILED_BUILD_RATIO}
annotations:
severity: "MEDIUM"
message: "{{$labels.instance}} shows that the ratio of failed to complete OpenShift builds exceeds ${PROMETHEUS_FAILED_BUILD_RATIO}"
prometheus.yml: |
rule_files:
- 'prometheus.rules'
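The BuildFailureRate expression above divides the count of failed terminal-phase builds by the count of complete ones. For illustration only (not part of this commit), here is a minimal standalone sketch that evaluates the same expression through the Prometheus HTTP query API; the endpoint URL and token are placeholder assumptions, and TLS verification is skipped to keep the sketch short:

package main

import (
	"crypto/tls"
	"encoding/json"
	"fmt"
	"net/http"
	"net/url"
)

// queryResponse captures just enough of the /api/v1/query response shape
// to read instant-vector sample values.
type queryResponse struct {
	Status string `json:"status"`
	Data   struct {
		Result []struct {
			Value [2]interface{} `json:"value"` // [unix timestamp, value as string]
		} `json:"result"`
	} `json:"data"`
}

func main() {
	const prometheusURL = "https://prometheus.example.com" // placeholder
	const token = "REPLACE_WITH_SA_TOKEN"                  // placeholder

	// Same PromQL the BuildFailureRate alert evaluates; the threshold
	// comparison is left to the caller instead of the template parameter.
	expr := `count(openshift_build_terminal_phase_total{phase="failed"}) / count(openshift_build_terminal_phase_total{phase="complete"})`

	req, err := http.NewRequest("GET", prometheusURL+"/api/v1/query?query="+url.QueryEscape(expr), nil)
	if err != nil {
		panic(err)
	}
	req.Header.Set("Authorization", "Bearer "+token)

	// InsecureSkipVerify for illustration only; use proper CA config in practice.
	client := &http.Client{Transport: &http.Transport{TLSClientConfig: &tls.Config{InsecureSkipVerify: true}}}
	resp, err := client.Do(req)
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	var qr queryResponse
	if err := json.NewDecoder(resp.Body).Decode(&qr); err != nil {
		panic(err)
	}
	for _, sample := range qr.Data.Result {
		fmt.Printf("failed:complete build ratio = %v\n", sample.Value[1])
	}
}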
23 changes: 21 additions & 2 deletions pkg/oc/bootstrap/bindata.go

Some generated files are not rendered by default.

179 changes: 100 additions & 79 deletions test/extended/prometheus/prometheus_builds.go
@@ -17,13 +17,15 @@ import (
exutil "github.com/openshift/origin/test/extended/util"
)

var (
execPodName, ns, host, bearerToken string
statsPort int
)

var _ = g.Describe("[Feature:Prometheus][builds] Prometheus", func() {
defer g.GinkgoRecover()
var (
oc = exutil.NewCLI("prometheus", exutil.KubeConfigPath())

execPodName, ns, host, bearerToken string
statsPort int
)

g.BeforeEach(func() {
@@ -52,20 +54,35 @@ var _ = g.Describe("[Feature:Prometheus][builds] Prometheus", func() {
err = expectBearerTokenURLStatusCodeExec(ns, execPodName, fmt.Sprintf("https://%s:%d/graph", host, statsPort), bearerToken, 200)
o.Expect(err).NotTo(o.HaveOccurred())

executeOpenShiftBuild(oc, appTemplate)
g.By("verifying the build alerts defined in the template are present")
alertDefs, err := getBearerTokenURLViaPod(ns, execPodName, fmt.Sprintf("https://%s:%d/alerts", host, statsPort), bearerToken)
o.Expect(err).NotTo(o.HaveOccurred())
o.Expect(alertDefs).To(o.ContainSubstring("HangingBuild"))
o.Expect(alertDefs).To(o.ContainSubstring("BuildFailureRate"))

g.By("verifying a service account token is able to query build metrics from the Prometheus API")
metricTests := map[string][]metricTest{
// NOTE - activeBuildCountQuery is dependent on prometheus querying while the build is running;
// so far the prometheus query interval and the length of the frontend build have
// been sufficient for reliable success here, but bear in mind the timing windows
// if this particular metricTest starts flaking
br := startOpenShiftBuild(oc, appTemplate)

g.By("verifying a service account token is able to query active build metrics from the Prometheus API")
// NOTE - activeBuildCountQuery is dependent on prometheus querying while the build is running;
// timing has been a bit tricky when attempting to query after the build is complete based on the
// default prometheus scraping window, so we do the active query while the build is running
activeTests := map[string][]metricTest{
activeBuildCountQuery: {
metricTest{
labels: map[string]string{"phase": "running"},
greaterThan: "0",
},
},
}

runQueries(activeTests)

g.By("verifying build completed successfully")
err = exutil.WaitForBuildResult(oc.Client().Builds(oc.Namespace()), br)
o.Expect(err).NotTo(o.HaveOccurred())
br.AssertSuccess()

g.By("verifying a service account token is able to query terminal build metrics from the Prometheus API")
terminalTests := map[string][]metricTest{
terminalBuildCountQuery: {
metricTest{
labels: map[string]string{"phase": "complete"},
@@ -85,71 +102,7 @@ var _ = g.Describe("[Feature:Prometheus][builds] Prometheus", func() {
},
},
}
// expect all correct metrics within 60 seconds
lastErrsMap := map[string]error{}
for i := 0; i < 60; i++ {
for query, tcs := range metricTests {
g.By("perform prometheus metric query " + query)
contents, err := getBearerTokenURLViaPod(ns, execPodName, fmt.Sprintf("https://%s:%d/api/v1/query?query=%s", host, statsPort, query), bearerToken)
o.Expect(err).NotTo(o.HaveOccurred())

correctMetrics := map[int]bool{}
for i, tc := range tcs {
result := prometheusResponse{}
json.Unmarshal([]byte(contents), &result)
metrics := result.Data.Result

for _, sample := range metrics {
// first see if a metric has all the label names and label values we are looking for
foundCorrectLabels := true
for labelName, labelValue := range tc.labels {
if v, ok := sample.Metric[model.LabelName(labelName)]; ok {
if string(v) != labelValue {
foundCorrectLabels = false
break
}
} else {
foundCorrectLabels = false
break
}
}

// if we found a metric with the correct set of labels, now see if the metric value is what we are expecting
if foundCorrectLabels {
switch {
case len(tc.equals) > 0:
if x, err := strconv.ParseFloat(tc.equals, 64); err == nil && float64(sample.Value) == x {
correctMetrics[i] = true
break
}
case len(tc.greaterThan) > 0:
if x, err := strconv.ParseFloat(tc.greaterThan, 64); err == nil && float64(sample.Value) > x {
correctMetrics[i] = true
break
}
}
}

}
}

if len(correctMetrics) == len(tcs) {
delete(metricTests, query) // delete in case there are retries on remaining tests
delete(lastErrsMap, query)
} else {
// maintain separate map of errors for diagnostics
lastErrsMap[query] = fmt.Errorf("query %s with results %s only had correct metrics %v", query, contents, correctMetrics)
}
}

if len(metricTests) == 0 {
break
}

time.Sleep(time.Second)
}

o.Expect(lastErrsMap).To(o.BeEmpty())
runQueries(terminalTests)
})
})
})
@@ -170,7 +123,75 @@ type metricTest struct {
greaterThan string
}
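
The prometheusResponse type that these tests decode query results into is defined in an unchanged portion of the file, so it does not appear in this diff. A minimal sketch of a compatible shape, reusing the github.com/prometheus/common/model package already imported for model.LabelName (the exact field layout in the source may differ):

// Sketch of the elided response types; model.Vector gives each sample the
// Metric and Value fields that runQueries reads below.
type prometheusResponse struct {
	Status string                 `json:"status"`
	Data   prometheusResponseData `json:"data"`
}

type prometheusResponseData struct {
	ResultType string       `json:"resultType"`
	Result     model.Vector `json:"result"`
}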

func executeOpenShiftBuild(oc *exutil.CLI, appTemplate string) {
func runQueries(metricTests map[string][]metricTest) {
// expect all correct metrics within 60 seconds
lastErrsMap := map[string]error{}
for i := 0; i < 60; i++ {
for query, tcs := range metricTests {
g.By("perform prometheus metric query " + query)
contents, err := getBearerTokenURLViaPod(ns, execPodName, fmt.Sprintf("https://%s:%d/api/v1/query?query=%s", host, statsPort, query), bearerToken)
o.Expect(err).NotTo(o.HaveOccurred())

correctMetrics := map[int]bool{}
for i, tc := range tcs {
result := prometheusResponse{}
json.Unmarshal([]byte(contents), &result)
metrics := result.Data.Result

for _, sample := range metrics {
// first see if a metric has all the label names and label values we are looking for
foundCorrectLabels := true
for labelName, labelValue := range tc.labels {
if v, ok := sample.Metric[model.LabelName(labelName)]; ok {
if string(v) != labelValue {
foundCorrectLabels = false
break
}
} else {
foundCorrectLabels = false
break
}
}

// if we found a metric with the correct set of labels, now see if the metric value is what we are expecting
if foundCorrectLabels {
switch {
case len(tc.equals) > 0:
if x, err := strconv.ParseFloat(tc.equals, 64); err == nil && float64(sample.Value) == x {
correctMetrics[i] = true
break
}
case len(tc.greaterThan) > 0:
if x, err := strconv.ParseFloat(tc.greaterThan, 64); err == nil && float64(sample.Value) > x {
correctMetrics[i] = true
break
}
}
}

}
}

if len(correctMetrics) == len(tcs) {
delete(metricTests, query) // delete in case there are retries on remaining tests
delete(lastErrsMap, query)
} else {
// maintain separate map of errors for diagnostics
lastErrsMap[query] = fmt.Errorf("query %s with results %s only had correct metrics %v", query, contents, correctMetrics)
}
}

if len(metricTests) == 0 {
break
}

time.Sleep(time.Second)
}

o.Expect(lastErrsMap).To(o.BeEmpty())
}

func startOpenShiftBuild(oc *exutil.CLI, appTemplate string) *exutil.BuildResult {
g.By("waiting for builder service account")
err := exutil.WaitForBuilderAccount(oc.KubeClient().Core().ServiceAccounts(oc.Namespace()))
o.Expect(err).NotTo(o.HaveOccurred())
@@ -185,8 +206,8 @@ func executeOpenShiftBuild(oc *exutil.CLI, appTemplate string) {
err = oc.AsAdmin().Run("tag").Args("openshift/nodejs:latest", oc.Namespace()+"/nodejs-010-centos7:latest").Execute()
o.Expect(err).NotTo(o.HaveOccurred())

g.By("start build, wait for completion")
br, err := exutil.StartBuildAndWait(oc, "frontend")
g.By("start build")
br, err := exutil.StartBuildResult(oc, "frontend")
o.Expect(err).NotTo(o.HaveOccurred())
br.AssertSuccess()
return br
}
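
For context, getBearerTokenURLViaPod and expectBearerTokenURLStatusCodeExec are defined in an unchanged portion of this file and therefore do not appear in the diff. A rough sketch of what the former plausibly does, with the body below being an assumption rather than the committed source (e2e here refers to the k8s.io/kubernetes/test/e2e/framework package used throughout origin's extended tests):

// Hypothetical reconstruction -- the committed helper lives in an unchanged
// hunk of this file. It curls the URL from inside the exec pod so the request
// originates on the cluster network, passing the service account bearer token.
func getBearerTokenURLViaPod(ns, execPodName, url, bearer string) (string, error) {
	cmd := fmt.Sprintf("curl -s -k -H 'Authorization: Bearer %s' %q", bearer, url)
	output, err := e2e.RunHostCmd(ns, execPodName, cmd)
	if err != nil {
		return "", fmt.Errorf("exec of %q failed: %v; output: %s", cmd, err, output)
	}
	return output, nil
}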
23 changes: 21 additions & 2 deletions test/extended/testdata/bindata.go

Some generated files are not rendered by default.
