Skip to content

Commit

Permalink
feat(terraformlayer): exponential time before retry (#102)
Browse files Browse the repository at this point in the history
* feat(terraformlayer): exponential time before retry

* test: add integration tests for failure grace period

* chore: return error last

---------

Co-authored-by: spoukke <sacha.bernheim@hey.com>
  • Loading branch information
Thibaut-Padok and spoukke authored May 9, 2023
1 parent 840046e commit 6f7d0bf
Show file tree
Hide file tree
Showing 9 changed files with 460 additions and 105 deletions.
4 changes: 3 additions & 1 deletion cmd/controllers/start.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,12 +26,14 @@ func buildControllersStartCmd(app *burrito.App) *cobra.Command {
defaultDriftDetectionTimer, _ := time.ParseDuration("20m")
defaultOnErrorTimer, _ := time.ParseDuration("1m")
defaultWaitActionTimer, _ := time.ParseDuration("1m")
defaultFailureGracePeriod, _ := time.ParseDuration("15s")

cmd.Flags().StringSliceVar(&app.Config.Controller.Types, "types", []string{"layer", "repository", "pullrequest"}, "list of controllers to start")

cmd.Flags().DurationVar(&app.Config.Controller.Timers.DriftDetection, "drift-detection-period", defaultDriftDetectionTimer, "period between two plans. Must end with s, m or h.")
cmd.Flags().DurationVar(&app.Config.Controller.Timers.OnError, "on-error-period", defaultOnErrorTimer, "period between two runners launch when an error occurred. Must end with s, m or h.")
cmd.Flags().DurationVar(&app.Config.Controller.Timers.OnError, "on-error-period", defaultOnErrorTimer, "period between two runners launch when an error occurred in the controllers. Must end with s, m or h.")
cmd.Flags().DurationVar(&app.Config.Controller.Timers.WaitAction, "wait-action-period", defaultWaitActionTimer, "period between two runners when a layer is locked. Must end with s, m or h.")
cmd.Flags().DurationVar(&app.Config.Controller.Timers.FailureGracePeriod, "failure-grace-period", defaultFailureGracePeriod, "initial time before retry, goes exponential function of number failure. Must end with s, m or h.")
cmd.Flags().BoolVar(&app.Config.Controller.LeaderElection.Enabled, "leader-election", true, "whether leader election is enabled or not, default to true")
cmd.Flags().StringVar(&app.Config.Controller.LeaderElection.ID, "leader-election-id", "6d185457.terraform.padok.cloud", "lease id used for leader election")
cmd.Flags().StringVar(&app.Config.Controller.HealthProbeBindAddress, "health-probe-bind-address", ":8081", "address to bind the metrics server embedded in the controllers")
Expand Down
23 changes: 12 additions & 11 deletions docs/contents/usage/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -217,17 +217,18 @@ You can configure `burrito` with environment variables.

#### Controllers' configuration

| Environment variable | Description | Default |
| :-----------------------------------------: | :--------------------------------------------------------------------: | :------------------------------: |
| `BURRITO_CONTROLLER_TYPES` | list of controllers to start | `layer,repository` |
| `BURRITO_CONTROLLER_TIMERS_DRIFTDETECTION` | period between two plans for drift detection | `20m` |
| `BURRITO_CONTROLLER_TIMERS_ONERROR` | period between two runners launch when an error occurred | `1m` |
| `BURRITO_CONTROLLER_TIMERS_WAITACTION` | period between two runners launch when a layer is locked | `1m` |
| `BURRITO_CONTROLLER_LEADERELECTION_ENABLED` | whether leader election is enabled or not | `true` |
| `BURRITO_CONTROLLER_LEADERELECTION_ID` | lease id used for leader election | `6d185457.terraform.padok.cloud` |
| `BURRITO_CONTROLLER_HEALTHPROBEBINDADDRESS` | address to bind the health probe server embedded in the controllers | `:8081` |
| `BURRITO_CONTROLLER_METRICSBINDADDRESS` | address to bind the metrics server embedded in the controllers | `:8080` |
| `BURRITO_CONTROLLER_KUBERNETESWEBHOOKPORT` | port used by the validating webhook server embedded in the controllers | `9443` |
| Environment variable | Description | Default |
| :--------------------------------------------------: | :------------------------------------------------------------------------------------------: | :------------------------------: |
| `BURRITO_CONTROLLER_TYPES` | list of controllers to start | `layer,repository,pullrequest` |
| `BURRITO_CONTROLLER_TIMERS_DRIFTDETECTION` | period between two plans for drift detection | `20m` |
| `BURRITO_CONTROLLER_TIMERS_ONERROR` | period between two runners launch when an error occurred in the controllers | `1m` |
| `BURRITO_CONTROLLER_TIMERS_WAITACTION` | period between two runners launch when a layer is locked | `1m` |
| `BURRITO_CONTROLLER_TIMERS_FAILUREGRACEPERIOD` | initial delay before retrying after a failure; grows exponentially with the number of failures | `15s` |
| `BURRITO_CONTROLLER_LEADERELECTION_ENABLED` | whether leader election is enabled or not | `true` |
| `BURRITO_CONTROLLER_LEADERELECTION_ID` | lease id used for leader election | `6d185457.terraform.padok.cloud` |
| `BURRITO_CONTROLLER_HEALTHPROBEBINDADDRESS` | address to bind the health probe server embedded in the controllers | `:8081` |
| `BURRITO_CONTROLLER_METRICSBINDADDRESS` | address to bind the metrics server embedded in the controllers | `:8080` |
| `BURRITO_CONTROLLER_KUBERNETESWEBHOOKPORT` | port used by the validating webhook server embedded in the controllers | `9443` |

#### Server's configuration

Expand Down
12 changes: 7 additions & 5 deletions internal/burrito/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -61,9 +61,10 @@ type LeaderElectionConfig struct {
}

// ControllerTimers groups the durations that drive the controllers'
// requeue behaviour. Each yaml key mirrors its field name so that a
// configuration file maps every timer onto the field of the same name.
type ControllerTimers struct {
	// DriftDetection is the period between two plans for drift detection.
	DriftDetection time.Duration `yaml:"driftDetection"`
	// OnError is the period before retrying when an error occurred in the controllers.
	OnError time.Duration `yaml:"onError"`
	// WaitAction is the period before retrying when a layer is locked.
	WaitAction time.Duration `yaml:"waitAction"`
	// FailureGracePeriod is the initial delay before retrying a failed run;
	// it is scaled exponentially with the number of failures.
	FailureGracePeriod time.Duration `yaml:"failureGracePeriod"`
}

type RepositoryConfig struct {
Expand Down Expand Up @@ -170,8 +171,9 @@ func TestConfig() *Config {
},
Controller: ControllerConfig{
Timers: ControllerTimers{
DriftDetection: 20 * time.Minute,
WaitAction: 5 * time.Minute,
DriftDetection: 20 * time.Minute,
WaitAction: 5 * time.Minute,
FailureGracePeriod: 15 * time.Second,
},
},
Runner: RunnerConfig{
Expand Down
94 changes: 81 additions & 13 deletions internal/controllers/terraformlayer/conditions.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package terraformlayer

import (
"errors"
"fmt"
"time"

Expand Down Expand Up @@ -105,6 +106,12 @@ func (r *Reconciler) IsApplyUpToDate(t *configv1alpha1.TerraformLayer) (metav1.C
condition.Status = metav1.ConditionFalse
return condition, false
}
if applyHash == "" {
condition.Reason = "LastApplyFailed"
condition.Message = "Last apply run has failed."
condition.Status = metav1.ConditionFalse
return condition, false
}
if applyHash != planHash {
condition.Reason = "NewPlanAvailable"
condition.Message = "Apply will run."
Expand All @@ -117,28 +124,89 @@ func (r *Reconciler) IsApplyUpToDate(t *configv1alpha1.TerraformLayer) (metav1.C
return condition, true
}

func (r *Reconciler) HasFailed(t *configv1alpha1.TerraformLayer) (metav1.Condition, bool) {
// IsInFailureGracePeriod reports whether the layer is still inside the
// back-off window that follows a failed run. It returns the condition
// describing the decision together with the boolean result: true only while
// the last action time plus the layer's exponential back-off is still in the
// future according to the reconciler's clock.
func (r *Reconciler) IsInFailureGracePeriod(t *configv1alpha1.TerraformLayer) (metav1.Condition, bool) {
condition := metav1.Condition{
Type: "HasFailed",
Type: "IsInFailureGracePeriod",
ObservedGeneration: t.GetObjectMeta().GetGeneration(),
Status: metav1.ConditionUnknown,
LastTransitionTime: metav1.NewTime(time.Now()),
}
result, ok := t.Annotations[annotations.Failure]
if !ok {
condition.Reason = "NoRunYet"
condition.Message = "Terraform has not ran yet"
// A missing failure annotation, or a failure count of "0", means there is
// nothing to back off from.
if failure, ok := t.Annotations[annotations.Failure]; !ok || failure == "0" {
condition.Reason = "NoFailureYet"
condition.Message = "No failure has been detected yet"
condition.Status = metav1.ConditionFalse
return condition, false
}
if string(result) == "0" {
condition.Reason = "RunExitedGracefully"
condition.Message = "Last run exited gracefully"
// The most recent plan/apply timestamp stands in for the time of the last
// failure; if it cannot be read the layer is treated as not being in the
// grace period.
lastFailureDate, err := GetLastActionTime(r, t)
if err != nil {
condition.Reason = "CouldNotGetLastActionTime"
condition.Message = "Could not get last action time from layer annotations"
condition.Status = metav1.ConditionFalse
return condition, false
}
condition.Status = metav1.ConditionTrue
condition.Reason = "TerraformRunFailure"
condition.Message = "Terraform has failed, look at the runner logs"
return condition, true

// The window ends once the configured failure grace period, scaled by the
// layer's failure count, has elapsed since the last action.
nextFailure := lastFailureDate.Add(GetLayerExponentialBackOffTime(r.Config.Controller.Timers.FailureGracePeriod, t))
now := r.Clock.Now()
if nextFailure.After(now) {
condition.Reason = "InFailureGracePeriod"
condition.Message = fmt.Sprintf("The failure grace period is still active (until %s).", nextFailure)
condition.Status = metav1.ConditionTrue
return condition, true
}
condition.Reason = "FailureGracePeriodOver"
// now.Sub(nextFailure) is the duration elapsed since the window closed.
condition.Message = fmt.Sprintf("The failure grace period is over (since %s).", now.Sub(nextFailure))
condition.Status = metav1.ConditionFalse
return condition, false
}

// func (r *Reconciler) HasFailed(t *configv1alpha1.TerraformLayer) (metav1.Condition, bool) {
// condition := metav1.Condition{
// Type: "HasFailed",
// ObservedGeneration: t.GetObjectMeta().GetGeneration(),
// Status: metav1.ConditionUnknown,
// LastTransitionTime: metav1.NewTime(time.Now()),
// }
// result, ok := t.Annotations[annotations.Failure]
// if !ok {
// condition.Reason = "NoRunYet"
// condition.Message = "Terraform has not ran yet"
// condition.Status = metav1.ConditionFalse
// return condition, false
// }
// if string(result) == "0" {
// condition.Reason = "RunExitedGracefully"
// condition.Message = "Last run exited gracefully"
// condition.Status = metav1.ConditionFalse
// return condition, false
// }
// condition.Status = metav1.ConditionTrue
// condition.Reason = "TerraformRunFailure"
// condition.Message = "Terraform has failed, look at the runner logs"
// return condition, true
// }

// GetLastActionTime returns the most recent of the layer's last plan and last
// apply timestamps, both read from its annotations in time.UnixDate format.
//
// An error is returned when the plan annotation is missing or when either
// annotation cannot be parsed; in those cases the returned time is the
// reconciler clock's current time. A missing apply annotation is not an
// error: the plan time is returned on its own.
func GetLastActionTime(r *Reconciler, layer *configv1alpha1.TerraformLayer) (time.Time, error) {
	planStamp, hasPlan := layer.Annotations[annotations.LastPlanDate]
	if !hasPlan {
		return r.Clock.Now(), errors.New("never ran a plan on this layer")
	}
	planTime, err := time.Parse(time.UnixDate, planStamp)
	if err != nil {
		return r.Clock.Now(), err
	}

	applyStamp, hasApply := layer.Annotations[annotations.LastApplyDate]
	if !hasApply {
		return planTime, nil
	}
	applyTime, err := time.Parse(time.UnixDate, applyStamp)
	if err != nil {
		return r.Clock.Now(), err
	}

	// Whichever action happened last wins.
	if applyTime.After(planTime) {
		return applyTime, nil
	}
	return planTime, nil
}
25 changes: 25 additions & 0 deletions internal/controllers/terraformlayer/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,11 @@ package terraformlayer

import (
"context"
"math"
"strconv"
"time"

"github.com/padok-team/burrito/internal/annotations"
"github.com/padok-team/burrito/internal/burrito/config"
"github.com/padok-team/burrito/internal/lock"
"github.com/padok-team/burrito/internal/storage"
Expand Down Expand Up @@ -128,6 +131,28 @@ func (r *Reconciler) SetupWithManager(mgr ctrl.Manager) error {
Complete(r)
}

// GetLayerExponentialBackOffTime returns how long to wait before retrying the
// given layer. The number of failures is read from the layer's failure
// annotation; with fewer than one failure the base DefaultRequeueAfter is
// returned unchanged, otherwise it is scaled by GetExponentialBackOffTime.
// An annotation that cannot be parsed is logged and the value strconv.Atoi
// returned alongside the error is used as the failure count.
func GetLayerExponentialBackOffTime(DefaultRequeueAfter time.Duration, layer *configv1alpha1.TerraformLayer) time.Duration {
	failures := 0
	if raw, found := layer.Annotations[annotations.Failure]; found {
		parsed, err := strconv.Atoi(raw)
		if err != nil {
			log.Errorf("failed to convert failure annotations : %v to int. Error : %v", raw, err)
		}
		failures = parsed
	}
	if failures >= 1 {
		return GetExponentialBackOffTime(DefaultRequeueAfter, failures)
	}
	return DefaultRequeueAfter
}

// GetExponentialBackOffTime scales DefaultRequeueAfter by floor(e^attempts)
// and returns the resulting delay.
//
// The result saturates instead of wrapping around: when the factor or the
// product would not fit in a time.Duration, the largest representable
// duration is returned (or the smallest one for a negative base). Without
// this, a high failure count would produce a negative or arbitrary delay and
// silently cancel the back-off. A factor that truncates to zero (negative
// attempts) or a zero base yields 0.
func GetExponentialBackOffTime(DefaultRequeueAfter time.Duration, attempts int) time.Duration {
	saturated := time.Duration(math.MaxInt64)
	if DefaultRequeueAfter < 0 {
		saturated = time.Duration(math.MinInt64)
	}

	// Truncate towards zero like the integer conversion callers rely on
	// (e^1 -> 2, e^2 -> 7, e^3 -> 20, ...).
	factor := math.Floor(math.Exp(float64(attempts)))
	if factor == 0 || DefaultRequeueAfter == 0 {
		return 0
	}
	// Converting a float that is out of range for int64 is
	// implementation-defined, so bail out before converting.
	if factor >= float64(math.MaxInt64) {
		return saturated
	}

	multiplier := time.Duration(factor)
	backOff := multiplier * DefaultRequeueAfter
	// multiplier is strictly positive here, so dividing back detects any
	// int64 overflow in the multiplication.
	if backOff/multiplier != DefaultRequeueAfter {
		return saturated
	}
	return backOff
}

func ignorePredicate() predicate.Predicate {
return predicate.Funcs{
UpdateFunc: func(e event.UpdateEvent) bool {
Expand Down
Loading

0 comments on commit 6f7d0bf

Please sign in to comment.