diff --git a/cmd/controllers/start.go b/cmd/controllers/start.go index 846eb56c2..807bda02a 100644 --- a/cmd/controllers/start.go +++ b/cmd/controllers/start.go @@ -26,12 +26,14 @@ func buildControllersStartCmd(app *burrito.App) *cobra.Command { defaultDriftDetectionTimer, _ := time.ParseDuration("20m") defaultOnErrorTimer, _ := time.ParseDuration("1m") defaultWaitActionTimer, _ := time.ParseDuration("1m") + defaultFailureGracePeriod, _ := time.ParseDuration("15s") cmd.Flags().StringSliceVar(&app.Config.Controller.Types, "types", []string{"layer", "repository"}, "list of controllers to start") cmd.Flags().DurationVar(&app.Config.Controller.Timers.DriftDetection, "drift-detection-period", defaultDriftDetectionTimer, "period between two plans. Must end with s, m or h.") cmd.Flags().DurationVar(&app.Config.Controller.Timers.OnError, "on-error-period", defaultOnErrorTimer, "period between two runners launch when an error occurred. Must end with s, m or h.") cmd.Flags().DurationVar(&app.Config.Controller.Timers.WaitAction, "wait-action-period", defaultWaitActionTimer, "period between two runners when a layer is locked. Must end with s, m or h.") + cmd.Flags().DurationVar(&app.Config.Controller.Timers.FailureGracePeriod, "failure-grace-period", defaultFailureGracePeriod, "initial time before retry, goes exponential function of number failure. 
Must end with s, m or h.") cmd.Flags().BoolVar(&app.Config.Controller.LeaderElection.Enabled, "leader-election", true, "whether leader election is enabled or not, default to true") cmd.Flags().StringVar(&app.Config.Controller.LeaderElection.ID, "leader-election-id", "6d185457.terraform.padok.cloud", "lease id used for leader election") cmd.Flags().StringVar(&app.Config.Controller.HealthProbeBindAddress, "health-probe-bind-address", ":8081", "address to bind the metrics server embedded in the controllers") diff --git a/docs/contents/usage/README.md b/docs/contents/usage/README.md index 20916da90..edc37f9f2 100644 --- a/docs/contents/usage/README.md +++ b/docs/contents/usage/README.md @@ -217,17 +217,18 @@ You can configure `burrito` with environment variables. #### Controllers' configuration -| Environment variable | Description | Default | -| :-----------------------------------------: | :--------------------------------------------------------------------: | :------------------------------: | -| `BURRITO_CONTROLLER_TYPES` | list of controllers to start | `layer,repository` | -| `BURRITO_CONTROLLER_TIMERS_DRIFTDETECTION` | period between two plans for drift detection | `20m` | -| `BURRITO_CONTROLLER_TIMERS_ONERROR` | period between two runners launch when an error occurred | `1m` | -| `BURRITO_CONTROLLER_TIMERS_WAITACTION` | period between two runners launch when a layer is locked | `1m` | -| `BURRITO_CONTROLLER_LEADERELECTION_ENABLED` | whether leader election is enabled or not | `true` | -| `BURRITO_CONTROLLER_LEADERELECTION_ID` | lease id used for leader election | `6d185457.terraform.padok.cloud` | -| `BURRITO_CONTROLLER_HEALTHPROBEBINDADDRESS` | address to bind the health probe server embedded in the controllers | `:8081` | -| `BURRITO_CONTROLLER_METRICSBINDADDRESS` | address to bind the metrics server embedded in the controllers | `:8080` | -| `BURRITO_CONTROLLER_KUBERNETESWEBHOOKPORT` | port used by the validating webhook server embedded in the controllers | 
`9443` | +| Environment variable | Description | Default | +| :--------------------------------------------------: | :------------------------------------------------------------------------: | :------------------------------: | +| `BURRITO_CONTROLLER_TYPES` | list of controllers to start | `layer,repository` | +| `BURRITO_CONTROLLER_TIMERS_DRIFTDETECTION` | period between two plans for drift detection | `20m` | +| `BURRITO_CONTROLLER_TIMERS_ONERROR` | period between two runners launch when an error occurred | `1m` | +| `BURRITO_CONTROLLER_TIMERS_WAITACTION` | period between two runners launch when a layer is locked | `1m` | +| `BURRITO_CONTROLLER_TIMERS_FAILUREGRACEPERIOD` | initial delay before retrying after a failure; grows exponentially with the number of failures | `15s` | +| `BURRITO_CONTROLLER_LEADERELECTION_ENABLED` | whether leader election is enabled or not | `true` | +| `BURRITO_CONTROLLER_LEADERELECTION_ID` | lease id used for leader election | `6d185457.terraform.padok.cloud` | +| `BURRITO_CONTROLLER_HEALTHPROBEBINDADDRESS` | address to bind the health probe server embedded in the controllers | `:8081` | +| `BURRITO_CONTROLLER_METRICSBINDADDRESS` | address to bind the metrics server embedded in the controllers | `:8080` | +| `BURRITO_CONTROLLER_KUBERNETESWEBHOOKPORT` | port used by the validating webhook server embedded in the controllers | `9443` | #### Server's configuration diff --git a/internal/burrito/config/config.go b/internal/burrito/config/config.go index d715b1bca..9975f4383 100644 --- a/internal/burrito/config/config.go +++ b/internal/burrito/config/config.go @@ -48,9 +48,10 @@ type LeaderElectionConfig struct { } type ControllerTimers struct { - DriftDetection time.Duration `yaml:"driftDetection"` - OnError time.Duration `yaml:"waitAction"` - WaitAction time.Duration `yaml:"onError"` + DriftDetection time.Duration `yaml:"driftDetection"` + OnError time.Duration `yaml:"waitAction"` + WaitAction time.Duration `yaml:"onError"` + FailureGracePeriod time.Duration 
`yaml:"failureGracePeriod"` } type RepositoryConfig struct { diff --git a/internal/controllers/terraformlayer/conditions.go b/internal/controllers/terraformlayer/conditions.go index 9e9157b6b..c00ec4320 100644 --- a/internal/controllers/terraformlayer/conditions.go +++ b/internal/controllers/terraformlayer/conditions.go @@ -125,7 +125,6 @@ func (r *Reconciler) IsApplyUpToDate(t *configv1alpha1.TerraformLayer) (metav1.C } func (r *Reconciler) IsInFailureGracePeriod(t *configv1alpha1.TerraformLayer) (metav1.Condition, bool) { - // TODO(Thibault) condition := metav1.Condition{ Type: "IsInFailureGracePeriod", ObservedGeneration: t.GetObjectMeta().GetGeneration(), @@ -138,21 +137,15 @@ func (r *Reconciler) IsInFailureGracePeriod(t *configv1alpha1.TerraformLayer) (m condition.Status = metav1.ConditionFalse return condition, false } - - if !ok { - condition.Reason = "NoFailureYet" - condition.Message = "No failure has been detected yet" - condition.Status = metav1.ConditionFalse - return condition, false - } - lastFailure, err := time.Parse(time.UnixDate, lastFailureDate) + lastFailureDate, err := GetLastActionTime(t) if err != nil { - condition.Reason = "ParseError" - condition.Message = "Could not parse time from annotation" + condition.Reason = "CantDetectFailure" + condition.Message = fmt.Sprintf("Error while parsing annotation %v", err) condition.Status = metav1.ConditionFalse return condition, false } - nextFailure := lastFailure.Add(r.Config.Controller.Timers.FailureGracePeriod) + + nextFailure := lastFailureDate.Add(GetLayerExponentialBackOffTime(r.Config.Controller.Timers.FailureGracePeriod, t)) now := time.Now() if nextFailure.After(now) { condition.Reason = "InFailureGracePeriod" @@ -161,7 +154,7 @@ func (r *Reconciler) IsInFailureGracePeriod(t *configv1alpha1.TerraformLayer) (m return condition, true } condition.Reason = "FailureGracePeriodOver" - condition.Message = fmt.Sprintf("The failure grace period is over (since %s).", nextFailure) + condition.Message = 
fmt.Sprintf("The failure grace period is over (since %s).", now.Sub(nextFailure)) condition.Status = metav1.ConditionFalse return condition, false } @@ -192,25 +185,28 @@ func (r *Reconciler) HasFailed(t *configv1alpha1.TerraformLayer) (metav1.Conditi return condition, true } -func getLastActionTime(layer configv1alpha1.TerraformLayer) (time.Time, error) { +func GetLastActionTime(layer *configv1alpha1.TerraformLayer) (time.Time, error) { var lastActionTime time.Time - lastPlanTime, ok := layer.Annotations[annotations.LastPlanDate] + lastPlanTimeAnnotation, ok := layer.Annotations[annotations.LastPlanDate] if !ok { return time.Now(), errors.New("never ran a plan on this layer") } - lastActionTime, err := time.Parse(time.UnixDate, lastPlanTime) + lastActionTime, err := time.Parse(time.UnixDate, lastPlanTimeAnnotation) if err != nil { return time.Now(), err } - lastApplyTime, ok := layer.Annotations[annotations.LastApplyDate] + + lastApplyTimeAnnotation, ok := layer.Annotations[annotations.LastApplyDate] if !ok { return lastActionTime, nil } + lastApplyTime, err := time.Parse(time.UnixDate, lastApplyTimeAnnotation) + if err != nil { + return time.Now(), err + } + if lastApplyTime.After(lastActionTime) { - lastActionTime, err = time.Parse(time.UnixDate, lastApplyTime) - if err != nil { - return time.Now(), err - } + lastActionTime = lastApplyTime } return lastActionTime, nil } diff --git a/internal/controllers/terraformlayer/states.go b/internal/controllers/terraformlayer/states.go index 5f032b0e7..c7bc4e7f7 100644 --- a/internal/controllers/terraformlayer/states.go +++ b/internal/controllers/terraformlayer/states.go @@ -4,6 +4,7 @@ import ( "context" "fmt" "strings" + "time" configv1alpha1 "github.com/padok-team/burrito/api/v1alpha1" "github.com/padok-team/burrito/internal/lock" @@ -48,8 +49,19 @@ type FailureGracePeriod struct{} func (s *FailureGracePeriod) getHandler() Handler { return func(ctx context.Context, r *Reconciler, layer *configv1alpha1.TerraformLayer, 
repository *configv1alpha1.TerraformRepository) ctrl.Result { - // TODO(Thibault) - return ctrl.Result{RequeueAfter: } + lastActionTime, ok := GetLastActionTime(layer) + if ok != nil { + log.Errorf("could not get lastActionTime on layer %s, due to %s", layer.Name, ok) + return ctrl.Result{RequeueAfter: r.Config.Controller.Timers.OnError} + } + expTime := GetLayerExponentialBackOffTime(r.Config.Controller.Timers.FailureGracePeriod, layer) + endIdleTime := lastActionTime.Add(expTime) + now := time.Now() + if endIdleTime.After(now) { + log.Infof("the grace period is over for layer %v, new retry", layer.Name) + return ctrl.Result{RequeueAfter: r.Config.Controller.Timers.WaitAction} + } + return ctrl.Result{RequeueAfter: now.Sub(endIdleTime)} } }