Skip to content

Commit

Permalink
feat(*): add exponential backoff logic
Browse files Browse the repository at this point in the history
  • Loading branch information
Thibaut-Padok committed Apr 25, 2023
1 parent 90201b3 commit 1c2fa79
Show file tree
Hide file tree
Showing 5 changed files with 49 additions and 37 deletions.
2 changes: 2 additions & 0 deletions cmd/controllers/start.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,12 +26,14 @@ func buildControllersStartCmd(app *burrito.App) *cobra.Command {
defaultDriftDetectionTimer, _ := time.ParseDuration("20m")
defaultOnErrorTimer, _ := time.ParseDuration("1m")
defaultWaitActionTimer, _ := time.ParseDuration("1m")
defaultFailureGracePeriod, _ := time.ParseDuration("15s")

cmd.Flags().StringSliceVar(&app.Config.Controller.Types, "types", []string{"layer", "repository"}, "list of controllers to start")

cmd.Flags().DurationVar(&app.Config.Controller.Timers.DriftDetection, "drift-detection-period", defaultDriftDetectionTimer, "period between two plans. Must end with s, m or h.")
cmd.Flags().DurationVar(&app.Config.Controller.Timers.OnError, "on-error-period", defaultOnErrorTimer, "period between two runners launch when an error occurred. Must end with s, m or h.")
cmd.Flags().DurationVar(&app.Config.Controller.Timers.WaitAction, "wait-action-period", defaultWaitActionTimer, "period between two runners when a layer is locked. Must end with s, m or h.")
cmd.Flags().DurationVar(&app.Config.Controller.Timers.FailureGracePeriod, "failure-grace-period", defaultFailureGracePeriod, "initial time before retry, goes exponential function of number failure. Must end with s, m or h.")
cmd.Flags().BoolVar(&app.Config.Controller.LeaderElection.Enabled, "leader-election", true, "whether leader election is enabled or not, default to true")
cmd.Flags().StringVar(&app.Config.Controller.LeaderElection.ID, "leader-election-id", "6d185457.terraform.padok.cloud", "lease id used for leader election")
cmd.Flags().StringVar(&app.Config.Controller.HealthProbeBindAddress, "health-probe-bind-address", ":8081", "address to bind the metrics server embedded in the controllers")
Expand Down
23 changes: 12 additions & 11 deletions docs/contents/usage/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -217,17 +217,18 @@ You can configure `burrito` with environment variables.

#### Controllers' configuration

| Environment variable | Description | Default |
| :-----------------------------------------: | :--------------------------------------------------------------------: | :------------------------------: |
| `BURRITO_CONTROLLER_TYPES` | list of controllers to start | `layer,repository` |
| `BURRITO_CONTROLLER_TIMERS_DRIFTDETECTION` | period between two plans for drift detection | `20m` |
| `BURRITO_CONTROLLER_TIMERS_ONERROR` | period between two runners launch when an error occurred | `1m` |
| `BURRITO_CONTROLLER_TIMERS_WAITACTION` | period between two runners launch when a layer is locked | `1m` |
| `BURRITO_CONTROLLER_LEADERELECTION_ENABLED` | whether leader election is enabled or not | `true` |
| `BURRITO_CONTROLLER_LEADERELECTION_ID` | lease id used for leader election | `6d185457.terraform.padok.cloud` |
| `BURRITO_CONTROLLER_HEALTHPROBEBINDADDRESS` | address to bind the health probe server embedded in the controllers | `:8081` |
| `BURRITO_CONTROLLER_METRICSBINDADDRESS` | address to bind the metrics server embedded in the controllers | `:8080` |
| `BURRITO_CONTROLLER_KUBERNETESWEBHOOKPORT` | port used by the validating webhook server embedded in the controllers | `9443` |
| Environment variable | Description | Default |
| :--------------------------------------------------: | :------------------------------------------------------------------------: | :------------------------------: |
| `BURRITO_CONTROLLER_TYPES` | list of controllers to start | `layer,repository` |
| `BURRITO_CONTROLLER_TIMERS_DRIFTDETECTION` | period between two plans for drift detection | `20m` |
| `BURRITO_CONTROLLER_TIMERS_ONERROR` | period between two runners launch when an error occurred | `1m` |
| `BURRITO_CONTROLLER_TIMERS_WAITACTION` | period between two runners launch when a layer is locked | `1m` |
| `BURRITO_CONTROLLER_TIMERS_FAILUREGRACEPERIOD` | initial delay before a retry; grows exponentially with the number of failures | `15s` |
| `BURRITO_CONTROLLER_LEADERELECTION_ENABLED` | whether leader election is enabled or not | `true` |
| `BURRITO_CONTROLLER_LEADERELECTION_ID` | lease id used for leader election | `6d185457.terraform.padok.cloud` |
| `BURRITO_CONTROLLER_HEALTHPROBEBINDADDRESS` | address to bind the health probe server embedded in the controllers | `:8081` |
| `BURRITO_CONTROLLER_METRICSBINDADDRESS` | address to bind the metrics server embedded in the controllers | `:8080` |
| `BURRITO_CONTROLLER_KUBERNETESWEBHOOKPORT` | port used by the validating webhook server embedded in the controllers | `9443` |

#### Server's configuration

Expand Down
7 changes: 4 additions & 3 deletions internal/burrito/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,9 +48,10 @@ type LeaderElectionConfig struct {
}

// ControllerTimers groups the durations that pace the controllers.
//
// Each yaml key matches its field name; OnError and WaitAction previously
// carried each other's key.
type ControllerTimers struct {
	// DriftDetection is the period between two plans.
	DriftDetection time.Duration `yaml:"driftDetection"`
	// OnError is the period between two runner launches when an error occurred.
	OnError time.Duration `yaml:"onError"`
	// WaitAction is the period between two runners when a layer is locked.
	WaitAction time.Duration `yaml:"waitAction"`
	// FailureGracePeriod is the initial delay before a retry; it is the base
	// value fed to the exponential back-off computation.
	FailureGracePeriod time.Duration `yaml:"failureGracePeriod"`
}

type RepositoryConfig struct {
Expand Down
38 changes: 17 additions & 21 deletions internal/controllers/terraformlayer/conditions.go
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,6 @@ func (r *Reconciler) IsApplyUpToDate(t *configv1alpha1.TerraformLayer) (metav1.C
}

func (r *Reconciler) IsInFailureGracePeriod(t *configv1alpha1.TerraformLayer) (metav1.Condition, bool) {
// TODO(Thibault)
condition := metav1.Condition{
Type: "IsInFailureGracePeriod",
ObservedGeneration: t.GetObjectMeta().GetGeneration(),
Expand All @@ -138,21 +137,15 @@ func (r *Reconciler) IsInFailureGracePeriod(t *configv1alpha1.TerraformLayer) (m
condition.Status = metav1.ConditionFalse
return condition, false
}

if !ok {
condition.Reason = "NoFailureYet"
condition.Message = "No failure has been detected yet"
condition.Status = metav1.ConditionFalse
return condition, false
}
lastFailure, err := time.Parse(time.UnixDate, lastFailureDate)
lastFailureDate, err := GetLastActionTime(t)
if err != nil {
condition.Reason = "ParseError"
condition.Message = "Could not parse time from annotation"
condition.Reason = "CantDetectFailure"
condition.Message = fmt.Sprintf("Error while parsing annotation %v", err)
condition.Status = metav1.ConditionFalse
return condition, false
}
nextFailure := lastFailure.Add(r.Config.Controller.Timers.FailureGracePeriod)

nextFailure := lastFailureDate.Add(GetLayerExponentialBackOffTime(r.Config.Controller.Timers.FailureGracePeriod, t))
now := time.Now()
if nextFailure.After(now) {
condition.Reason = "InFailureGracePeriod"
Expand All @@ -161,7 +154,7 @@ func (r *Reconciler) IsInFailureGracePeriod(t *configv1alpha1.TerraformLayer) (m
return condition, true
}
condition.Reason = "FailureGracePeriodOver"
condition.Message = fmt.Sprintf("The failure grace period is over (since %s).", nextFailure)
condition.Message = fmt.Sprintf("The failure grace period is over (since %s).", now.Sub(nextFailure))
condition.Status = metav1.ConditionFalse
return condition, false
}
Expand Down Expand Up @@ -192,25 +185,28 @@ func (r *Reconciler) HasFailed(t *configv1alpha1.TerraformLayer) (metav1.Conditi
return condition, true
}

func getLastActionTime(layer configv1alpha1.TerraformLayer) (time.Time, error) {
func GetLastActionTime(layer *configv1alpha1.TerraformLayer) (time.Time, error) {
var lastActionTime time.Time
lastPlanTime, ok := layer.Annotations[annotations.LastPlanDate]
lastPlanTimeAnnotation, ok := layer.Annotations[annotations.LastPlanDate]
if !ok {
return time.Now(), errors.New("never ran a plan on this layer")
}
lastActionTime, err := time.Parse(time.UnixDate, lastPlanTime)
lastActionTime, err := time.Parse(time.UnixDate, lastPlanTimeAnnotation)
if err != nil {
return time.Now(), err
}
lastApplyTime, ok := layer.Annotations[annotations.LastApplyDate]

lastApplyTimeAnnotation, ok := layer.Annotations[annotations.LastApplyDate]
if !ok {
return lastActionTime, nil
}
lastApplyTime, err := time.Parse(time.UnixDate, lastApplyTimeAnnotation)
if err != nil {
return time.Now(), err
}

if lastApplyTime.After(lastActionTime) {
lastActionTime, err = time.Parse(time.UnixDate, lastApplyTime)
if err != nil {
return time.Now(), err
}
lastActionTime = lastApplyTime
}
return lastActionTime, nil
}
16 changes: 14 additions & 2 deletions internal/controllers/terraformlayer/states.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import (
"context"
"fmt"
"strings"
"time"

configv1alpha1 "github.com/padok-team/burrito/api/v1alpha1"
"github.com/padok-team/burrito/internal/lock"
Expand Down Expand Up @@ -48,8 +49,19 @@ type FailureGracePeriod struct{}

func (s *FailureGracePeriod) getHandler() Handler {
return func(ctx context.Context, r *Reconciler, layer *configv1alpha1.TerraformLayer, repository *configv1alpha1.TerraformRepository) ctrl.Result {
// TODO(Thibault)
return ctrl.Result{RequeueAfter: }
lastActionTime, ok := GetLastActionTime(layer)
if ok != nil {
log.Errorf("could not get lastActionTime on layer %s, due to %s", layer.Name, ok)
return ctrl.Result{RequeueAfter: r.Config.Controller.Timers.OnError}
}
expTime := GetLayerExponentialBackOffTime(r.Config.Controller.Timers.FailureGracePeriod, layer)
endIdleTime := lastActionTime.Add(expTime)
now := time.Now()
if endIdleTime.After(now) {
log.Infof("the grace period is over for layer %v, new retry", layer.Name)
return ctrl.Result{RequeueAfter: r.Config.Controller.Timers.WaitAction}
}
return ctrl.Result{RequeueAfter: now.Sub(endIdleTime)}
}
}

Expand Down

0 comments on commit 1c2fa79

Please sign in to comment.