Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add prometheus metrics to internal controller #132

Merged
merged 11 commits into from
Nov 2, 2018
16 changes: 12 additions & 4 deletions Gopkg.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 5 additions & 0 deletions Gopkg.toml
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ required = ["sigs.k8s.io/testing_frameworks/integration",
"github.com/go-openapi/spec",
"k8s.io/kube-openapi/pkg/common",
"k8s.io/apiextensions-apiserver",
"github.com/prometheus/client_golang/prometheus",
]

[[constraint]]
Expand Down Expand Up @@ -54,6 +55,10 @@ required = ["sigs.k8s.io/testing_frameworks/integration",
name = "go.uber.org/zap"
version = "1.8.0"

[[constraint]]
name = "github.com/prometheus/client_golang"
version = "0.9.0"

# these are not listed explicitly until we get version tags,
# since dep doesn't like bare revision dependencies

Expand Down
7 changes: 7 additions & 0 deletions pkg/builder/builder_suite_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ import (
. "github.com/onsi/gomega"
"k8s.io/client-go/rest"
"sigs.k8s.io/controller-runtime/pkg/envtest"
"sigs.k8s.io/controller-runtime/pkg/metrics"
logf "sigs.k8s.io/controller-runtime/pkg/runtime/log"
)

Expand All @@ -43,9 +44,15 @@ var _ = BeforeSuite(func(done Done) {
cfg, err = testenv.Start()
Expect(err).NotTo(HaveOccurred())

// Prevent the metrics listener being created
metrics.DefaultBindAddress = "0"

close(done)
}, 60)

var _ = AfterSuite(func() {
testenv.Stop()

// Put the DefaultBindAddress back
metrics.DefaultBindAddress = ":8080"
})
7 changes: 7 additions & 0 deletions pkg/controller/controller_suite_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ import (
"k8s.io/client-go/kubernetes"
"k8s.io/client-go/rest"
"sigs.k8s.io/controller-runtime/pkg/envtest"
"sigs.k8s.io/controller-runtime/pkg/metrics"
logf "sigs.k8s.io/controller-runtime/pkg/runtime/log"
)

Expand All @@ -48,9 +49,15 @@ var _ = BeforeSuite(func(done Done) {
clientset, err = kubernetes.NewForConfig(cfg)
Expect(err).NotTo(HaveOccurred())

// Prevent the metrics listener being created
metrics.DefaultBindAddress = "0"

close(done)
}, 60)

var _ = AfterSuite(func() {
testenv.Stop()

// Put the DefaultBindAddress back
metrics.DefaultBindAddress = ":8080"
})
15 changes: 15 additions & 0 deletions pkg/controller/controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,21 @@ var _ = Describe("controller.Controller", func() {

close(done)
})

It("should not return an error if two controllers are registered with different names", func(done Done) {
m, err := manager.New(cfg, manager.Options{})
Expect(err).NotTo(HaveOccurred())

c1, err := controller.New("c1", m, controller.Options{Reconciler: rec})
Expect(err).NotTo(HaveOccurred())
Expect(c1).ToNot(BeNil())

c2, err := controller.New("c2", m, controller.Options{Reconciler: rec})
Expect(err).NotTo(HaveOccurred())
Expect(c2).ToNot(BeNil())

close(done)
})
})
})

Expand Down
12 changes: 12 additions & 0 deletions pkg/internal/controller/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ import (
"sigs.k8s.io/controller-runtime/pkg/cache"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/handler"
ctrlmetrics "sigs.k8s.io/controller-runtime/pkg/internal/controller/metrics"
"sigs.k8s.io/controller-runtime/pkg/predicate"
"sigs.k8s.io/controller-runtime/pkg/reconcile"
"sigs.k8s.io/controller-runtime/pkg/runtime/inject"
Expand Down Expand Up @@ -171,6 +172,10 @@ func (c *Controller) Start(stop <-chan struct{}) error {
func (c *Controller) processNextWorkItem() bool {
// This code copy-pasted from the sample-Controller.

// Update metrics after processing each item
reconcileStartTS := time.Now()
defer c.updateMetrics(time.Now().Sub(reconcileStartTS))

obj, shutdown := c.Queue.Get()
if obj == nil {
// Sometimes the Queue gives us nil items when it starts up
Expand Down Expand Up @@ -207,6 +212,7 @@ func (c *Controller) processNextWorkItem() bool {
if result, err := c.Do.Reconcile(req); err != nil {
c.Queue.AddRateLimited(req)
log.Error(err, "Reconciler error", "Controller", c.Name, "Request", req)
ctrlmetrics.ReconcileErrors.WithLabelValues(c.Name).Inc()

return false
} else if result.RequeueAfter > 0 {
Expand All @@ -233,3 +239,9 @@ func (c *Controller) InjectFunc(f inject.Func) error {
c.SetFields = f
return nil
}

// updateMetrics updates prometheus metrics within the controller
func (c *Controller) updateMetrics(reconcileTime time.Duration) {
ctrlmetrics.QueueLength.WithLabelValues(c.Name).Set(float64(c.Queue.Len()))
ctrlmetrics.ReconcileTime.WithLabelValues(c.Name).Observe(reconcileTime.Seconds())
}
87 changes: 87 additions & 0 deletions pkg/internal/controller/controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@ import (

. "github.com/onsi/ginkgo"
. "github.com/onsi/gomega"
"github.com/prometheus/client_golang/prometheus"
dto "github.com/prometheus/client_model/go"
"k8s.io/api/apps/v1"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/types"
Expand All @@ -31,6 +33,7 @@ import (
"sigs.k8s.io/controller-runtime/pkg/cache/informertest"
"sigs.k8s.io/controller-runtime/pkg/controller/controllertest"
"sigs.k8s.io/controller-runtime/pkg/handler"
ctrlmetrics "sigs.k8s.io/controller-runtime/pkg/internal/controller/metrics"
"sigs.k8s.io/controller-runtime/pkg/predicate"
"sigs.k8s.io/controller-runtime/pkg/reconcile"
"sigs.k8s.io/controller-runtime/pkg/reconcile/reconciletest"
Expand Down Expand Up @@ -404,6 +407,90 @@ var _ = Describe("controller", func() {
It("should create a new go routine for MaxConcurrentReconciles", func() {
// TODO(community): write this test
})

Context("should update prometheus metrics", func() {
It("should requeue a Request if there is an error and continue processing items", func(done Done) {
ctrlmetrics.QueueLength = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Name: "controller_runtime_reconcile_queue_length",
Help: "Length of reconcile queue per controller",
}, []string{"controller"})
ctrlmetrics.ReconcileErrors = prometheus.NewCounterVec(prometheus.CounterOpts{
Name: "controller_runtime_reconcile_errors_total",
Help: "Total number of reconcile errors per controller",
}, []string{"controller"})

fakeReconcile.Err = fmt.Errorf("expected error: reconcile")
go func() {
defer GinkgoRecover()
Expect(ctrl.Start(stop)).NotTo(HaveOccurred())
}()
ctrl.Queue.Add(request)

// Reduce the jitterperiod so we don't have to wait a second before the reconcile function is rerun.
ctrl.JitterPeriod = time.Millisecond

By("Invoking Reconciler which will give an error")
Expect(<-reconciled).To(Equal(request))
var queueLength, reconcileErrs dto.Metric
Eventually(func() error {
ctrlmetrics.QueueLength.WithLabelValues(ctrl.Name).Write(&queueLength)
if queueLength.GetGauge().GetValue() != 1.0 {
return fmt.Errorf("metrics not updated")
}
return nil
}, 2.0).Should(Succeed())
Eventually(func() error {
ctrlmetrics.ReconcileErrors.WithLabelValues(ctrl.Name).Write(&reconcileErrs)
if reconcileErrs.GetCounter().GetValue() != 1.0 {
return fmt.Errorf("metrics not updated")
}
return nil
}, 2.0).Should(Succeed())

By("Invoking Reconciler a second time without error")
fakeReconcile.Err = nil
Expect(<-reconciled).To(Equal(request))

By("Removing the item from the queue")
Eventually(ctrl.Queue.Len).Should(Equal(0))
Eventually(func() int { return ctrl.Queue.NumRequeues(request) }).Should(Equal(0))

close(done)
}, 2.0)

It("should add a reconcile time to the reconcile time histogram", func(done Done) {
ctrlmetrics.ReconcileTime = prometheus.NewHistogramVec(prometheus.HistogramOpts{
Name: "controller_runtime_reconcile_time_second",
Help: "Length of time per reconcile per controller",
}, []string{"controller"})

go func() {
defer GinkgoRecover()
Expect(ctrl.Start(stop)).NotTo(HaveOccurred())
}()
ctrl.Queue.Add(request)

By("Invoking Reconciler")
Expect(<-reconciled).To(Equal(request))

By("Removing the item from the queue")
Eventually(ctrl.Queue.Len).Should(Equal(0))
Eventually(func() int { return ctrl.Queue.NumRequeues(request) }).Should(Equal(0))

var reconcileTime dto.Metric
Eventually(func() error {
histObserver := ctrlmetrics.ReconcileTime.WithLabelValues(ctrl.Name)
hist := histObserver.(prometheus.Histogram)
hist.Write(&reconcileTime)
if reconcileTime.GetHistogram().GetSampleCount() != uint64(1) {
return fmt.Errorf("metrics not updated")
}
return nil
}, 2.0).Should(Succeed())

close(done)
}, 4.0)
})
})
})

Expand Down
51 changes: 51 additions & 0 deletions pkg/internal/controller/metrics/metrics.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
/*
Copyright 2018 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package metrics

import (
"github.com/prometheus/client_golang/prometheus"
"sigs.k8s.io/controller-runtime/pkg/metrics"
)

var (
// QueueLength is a prometheus metric which counts the current reconcile
// queue length per controller
QueueLength = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Name: "controller_runtime_reconcile_queue_length",
Help: "Length of reconcile queue per controller",
}, []string{"controller"})

// ReconcileErrors is a prometheus counter metrics which holds the total
// number of errors from the Reconciler
ReconcileErrors = prometheus.NewCounterVec(prometheus.CounterOpts{
Copy link

@lilic lilic Sep 28, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Besides the ReconcileErrors I would suggest adding also ReconcileTotal . That way we can see if in the past 5mins the rate of errors was too high.

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

small typo in the suggested new metric name, should probably be ReconcileTotal

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@droot @DirectXMan12 Do you think this would be a worthwhile metric to integrate into this PR?

Name: "controller_runtime_reconcile_errors_total",
Help: "Total number of reconcile errors per controller",
}, []string{"controller"})

// ReconcileTime is a prometheus metric which keeps track of the duration
// of reconciles
ReconcileTime = prometheus.NewHistogramVec(prometheus.HistogramOpts{
Name: "controller_runtime_reconcile_time_seconds",
Help: "Length of time per reconcile per controller",
}, []string{"controller"})
)

func init() {
metrics.Registry.MustRegister(QueueLength)
metrics.Registry.MustRegister(ReconcileErrors)
metrics.Registry.MustRegister(ReconcileTime)
}
Loading