Skip to content

Commit

Permalink
Add autoupdate controller metrics
Browse files Browse the repository at this point in the history
  • Loading branch information
hugoShaka committed Jan 14, 2025
1 parent 3b1518c commit e7159a6
Show file tree
Hide file tree
Showing 6 changed files with 720 additions and 53 deletions.
30 changes: 24 additions & 6 deletions lib/autoupdate/rollout/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ import (

"github.com/gravitational/trace"
"github.com/jonboulle/clockwork"
"github.com/prometheus/client_golang/prometheus"

"github.com/gravitational/teleport"
"github.com/gravitational/teleport/api/utils/retryutils"
Expand All @@ -45,13 +46,14 @@ type Controller struct {
clock clockwork.Clock
log *slog.Logger
period time.Duration
metrics *metrics
}

// NewController creates a new Controller for the autoupdate_agent_rollout kind.
// The period can be specified to control the sync frequency. This is mainly
// used to speed up tests or for demo purposes. When the period is zero or
// negative, the controller falls back to a sane default value.
func NewController(client Client, log *slog.Logger, clock clockwork.Clock, period time.Duration) (*Controller, error) {
func NewController(client Client, log *slog.Logger, clock clockwork.Clock, period time.Duration, reg prometheus.Registerer) (*Controller, error) {
if client == nil {
return nil, trace.BadParameter("missing client")
}
Expand All @@ -61,6 +63,9 @@ func NewController(client Client, log *slog.Logger, clock clockwork.Clock, perio
if clock == nil {
return nil, trace.BadParameter("missing clock")
}
if reg == nil {
return nil, trace.BadParameter("missing prometheus.Registerer")
}

if period <= 0 {
period = defaultReconcilerPeriod
Expand All @@ -77,13 +82,17 @@ func NewController(client Client, log *slog.Logger, clock clockwork.Clock, perio
return nil, trace.Wrap(err, "failed to initialize time-based strategy")
}

m := newMetrics(reg)

return &Controller{
clock: clock,
log: log,
metrics: m,
clock: clock,
log: log,
reconciler: reconciler{
clt: client,
log: log,
clock: clock,
clt: client,
log: log,
clock: clock,
metrics: m,
rolloutStrategies: []rolloutStrategy{
timeBased,
haltOnError,
Expand Down Expand Up @@ -122,13 +131,22 @@ func (c *Controller) Run(ctx context.Context) error {
// tryAndCatch runs a single reconciliation pass and shields the caller from
// panics: a panic raised while reconciling is logged, converted into an error
// and aggregated into the returned error. This ensures that a critical bug in
// the reconciler cannot bring down the whole Teleport cluster.
//
// Every pass is reported to the controller metrics with its duration and one
// of three outcomes: success, failure, or panic.
func (c *Controller) tryAndCatch(ctx context.Context) (err error) {
	begin := c.clock.Now()

	// err is a named result so the deferred handler can amend it after a panic.
	defer func() {
		recovered := recover()
		if recovered == nil {
			return
		}
		c.log.ErrorContext(ctx, "Recovered from panic in the autoupdate_agent_rollout controller", "panic", recovered)
		panicErr := trace.Errorf("Panic recovered during reconciliation: %v", recovered)
		err = trace.NewAggregate(err, panicErr)
		elapsed := c.clock.Now().Sub(begin)
		c.metrics.observeReconciliation(metricsReconciliationResultLabelValuePanic, elapsed)
	}()

	err = trace.Wrap(c.reconciler.reconcile(ctx))
	elapsed := c.clock.Now().Sub(begin)

	if err == nil {
		c.metrics.observeReconciliation(metricsReconciliationResultLabelValueSuccess, elapsed)
		return nil
	}
	c.metrics.observeReconciliation(metricsReconciliationResultLabelValueFail, elapsed)
	return err
}
Loading

0 comments on commit e7159a6

Please sign in to comment.