Skip to content

Commit

Permalink
reconciler/managed: add crossplane_resource_drift_seconds metric
Browse files Browse the repository at this point in the history
  • Loading branch information
sttts committed Jul 27, 2023
1 parent 0d8cbce commit 29e3853
Show file tree
Hide file tree
Showing 2 changed files with 112 additions and 0 deletions.
97 changes: 97 additions & 0 deletions pkg/reconciler/managed/metrics.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
/*
Copyright 2023 The Crossplane Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package managed

import (
"context"
"sync"
"time"

"github.com/prometheus/client_golang/prometheus"
"k8s.io/apimachinery/pkg/runtime/schema"
"k8s.io/client-go/tools/cache"
"sigs.k8s.io/controller-runtime/pkg/cluster"
"sigs.k8s.io/controller-runtime/pkg/manager"
"sigs.k8s.io/controller-runtime/pkg/metrics"

"github.com/crossplane/crossplane-runtime/pkg/resource"
)

// init registers the drift histogram with controller-runtime's metrics
// registry. MustRegister panics if the collector cannot be registered.
func init() {
	metrics.Registry.MustRegister(drift)
}

// subSystem is the Prometheus subsystem used as a prefix for the metrics
// defined in this file. It must only contain characters that are legal in a
// Prometheus metric name, i.e. no hyphens, because the subsystem is joined
// with the metric name using an underscore to form the fully-qualified name.
var subSystem = "managed_reconciler"

var (
	// drift is a histogram of the seconds elapsed between the last reconcile
	// that found a resource up to date and the reconcile that had to update
	// it. The label names declared here must match, in number and order, the
	// values passed to WithLabelValues when observing: group and kind.
	//
	// The buckets start at 10e-9 (1e-8) seconds and grow by a factor of 10
	// for 10 buckets, so the largest finite bucket is 10 seconds.
	// NOTE(review): drift is bounded below by the poll interval, so most
	// observations may land in the +Inf bucket - confirm the bucket layout.
	drift = prometheus.NewHistogramVec(prometheus.HistogramOpts{
		Subsystem: subSystem,
		Name:      "crossplane_resource_drift_seconds",
		Help:      "How long since the previous reconcile when a resource was found to be out of sync; excludes restart of the provider",
		Buckets:   prometheus.ExponentialBuckets(10e-9, 10, 10),
	}, []string{"group", "kind"})
)

// driftRecorder records the time since the last observation of a resource
// and records the time since on update as a metric. This represents an upper
// bound for the duration the drift existed.
type driftRecorder struct {
lastObservation sync.Map
gvk schema.GroupVersionKind

cluster cluster.Cluster
}

var _ manager.Runnable = &driftRecorder{}

// Start registers a delete handler on the informer for the recorder's kind
// so that the stored observation time of a resource is dropped when the
// resource is deleted. It blocks until ctx is done, then removes the handler
// and returns nil. An error is returned if the informer cannot be obtained or
// the handler cannot be registered.
func (r *driftRecorder) Start(ctx context.Context) error {
	inf, err := r.cluster.GetCache().GetInformerForKind(ctx, r.gvk)
	if err != nil {
		return err
	}

	registered, err := inf.AddEventHandler(cache.ResourceEventHandlerFuncs{
		DeleteFunc: func(obj interface{}) {
			// A missed delete event is delivered as a tombstone that wraps
			// the last known state of the object.
			if final, ok := obj.(cache.DeletedFinalStateUnknown); ok {
				obj = final.Obj
			}
			// Ignore anything that is not a managed resource instead of
			// panicking inside the informer's event handler.
			managed, ok := obj.(resource.Managed)
			if !ok {
				return
			}
			r.lastObservation.Delete(managed.GetName())
		},
	})
	if err != nil {
		return err
	}
	defer inf.RemoveEventHandler(registered)

	<-ctx.Done()

	return nil
}

// recordUnchanged stores the current time as the latest moment at which the
// resource with the given name was recorded as unchanged, replacing any
// previously stored time for that name.
func (r *driftRecorder) recordUnchanged(name string) {
	now := time.Now()
	r.lastObservation.Store(name, now)
}

// recordUpdate observes, in the drift histogram, the seconds elapsed since
// the last recorded observation of the resource with the given name. It does
// nothing if no observation is stored for that name.
func (r *driftRecorder) recordUpdate(name string) {
	last, ok := r.lastObservation.Load(name)
	if !ok {
		return
	}

	// Only time.Time values are stored by this type; guard the assertion
	// anyway so an unexpected value cannot panic the reconciler.
	lastTime, ok := last.(time.Time)
	if !ok {
		return
	}

	drift.WithLabelValues(r.gvk.Group, r.gvk.Kind).Observe(time.Since(lastTime).Seconds())

	// The update itself brings the resource back in sync, so measure any
	// subsequent drift from now rather than from the older observation.
	r.lastObservation.Store(name, time.Now())
}
15 changes: 15 additions & 0 deletions pkg/reconciler/managed/reconciler.go
Original file line number Diff line number Diff line change
Expand Up @@ -482,6 +482,8 @@ type Reconciler struct {

features feature.Flags

driftRecorder driftRecorder

// The below structs embed the set of interfaces used to implement the
// managed resource reconciler. We do this primarily for readability, so
// that the reconciler logic reads r.external.Connect(),
Expand Down Expand Up @@ -671,12 +673,15 @@ func NewReconciler(m manager.Manager, of resource.ManagedKind, o ...ReconcilerOp
creationGracePeriod: defaultGracePeriod,
timeout: reconcileTimeout,
managed: defaultMRManaged(m),
driftRecorder: driftRecorder{cluster: m},
external: defaultMRExternal(),
supportedManagementPolicies: defaultSupportedManagementPolicies(),
log: logging.NewNopLogger(),
record: event.NewNopRecorder(),
}

m.Add(&r.driftRecorder) // nolint:errcheck

for _, ro := range o {
ro(r)
}
Expand Down Expand Up @@ -1079,6 +1084,13 @@ func (r *Reconciler) Reconcile(ctx context.Context, req reconcile.Request) (reco
// https://github.com/crossplane/crossplane/issues/289
log.Debug("External resource is up to date", "requeue-after", time.Now().Add(r.pollInterval))
managed.SetConditions(xpv1.ReconcileSuccess())

// Record that we intentionally did not update the managed resource
// because no drift was detected. We call this so late in the reconcile
// because any of the cases above could (for different reasons) be why
// the external object was not updated.
r.driftRecorder.recordUnchanged(managed.GetName())

return reconcile.Result{RequeueAfter: r.pollInterval}, errors.Wrap(r.client.Status().Update(ctx, managed), errUpdateManagedStatus)
}

Expand Down Expand Up @@ -1106,6 +1118,9 @@ func (r *Reconciler) Reconcile(ctx context.Context, req reconcile.Request) (reco
return reconcile.Result{Requeue: true}, errors.Wrap(r.client.Status().Update(ctx, managed), errUpdateManagedStatus)
}

// record the drift after the successful update.
r.driftRecorder.recordUpdate(managed.GetName())

if _, err := r.managed.PublishConnection(ctx, managed, update.ConnectionDetails); err != nil {
// If this is the first time we encounter this issue we'll be requeued
// implicitly when we update our status with the new error condition. If
Expand Down

0 comments on commit 29e3853

Please sign in to comment.