Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(terraformlayer): exponential time before retry #102

Merged
merged 3 commits into from
May 9, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion cmd/controllers/start.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,12 +26,14 @@ func buildControllersStartCmd(app *burrito.App) *cobra.Command {
defaultDriftDetectionTimer, _ := time.ParseDuration("20m")
defaultOnErrorTimer, _ := time.ParseDuration("1m")
defaultWaitActionTimer, _ := time.ParseDuration("1m")
defaultFailureGracePeriod, _ := time.ParseDuration("15s")

cmd.Flags().StringSliceVar(&app.Config.Controller.Types, "types", []string{"layer", "repository", "pullrequest"}, "list of controllers to start")

cmd.Flags().DurationVar(&app.Config.Controller.Timers.DriftDetection, "drift-detection-period", defaultDriftDetectionTimer, "period between two plans. Must end with s, m or h.")
cmd.Flags().DurationVar(&app.Config.Controller.Timers.OnError, "on-error-period", defaultOnErrorTimer, "period between two runners launch when an error occurred. Must end with s, m or h.")
cmd.Flags().DurationVar(&app.Config.Controller.Timers.OnError, "on-error-period", defaultOnErrorTimer, "period between two runners launch when an error occurred in the controllers. Must end with s, m or h.")
cmd.Flags().DurationVar(&app.Config.Controller.Timers.WaitAction, "wait-action-period", defaultWaitActionTimer, "period between two runners when a layer is locked. Must end with s, m or h.")
cmd.Flags().DurationVar(&app.Config.Controller.Timers.FailureGracePeriod, "failure-grace-period", defaultFailureGracePeriod, "initial time before retry, goes exponential function of number failure. Must end with s, m or h.")
cmd.Flags().BoolVar(&app.Config.Controller.LeaderElection.Enabled, "leader-election", true, "whether leader election is enabled or not, default to true")
cmd.Flags().StringVar(&app.Config.Controller.LeaderElection.ID, "leader-election-id", "6d185457.terraform.padok.cloud", "lease id used for leader election")
cmd.Flags().StringVar(&app.Config.Controller.HealthProbeBindAddress, "health-probe-bind-address", ":8081", "address to bind the metrics server embedded in the controllers")
Expand Down
23 changes: 12 additions & 11 deletions docs/contents/usage/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -217,17 +217,18 @@ You can configure `burrito` with environment variables.

#### Controllers' configuration

| Environment variable | Description | Default |
| :-----------------------------------------: | :--------------------------------------------------------------------: | :------------------------------: |
| `BURRITO_CONTROLLER_TYPES` | list of controllers to start | `layer,repository` |
| `BURRITO_CONTROLLER_TIMERS_DRIFTDETECTION` | period between two plans for drift detection | `20m` |
| `BURRITO_CONTROLLER_TIMERS_ONERROR` | period between two runners launch when an error occurred | `1m` |
| `BURRITO_CONTROLLER_TIMERS_WAITACTION` | period between two runners launch when a layer is locked | `1m` |
| `BURRITO_CONTROLLER_LEADERELECTION_ENABLED` | whether leader election is enabled or not | `true` |
| `BURRITO_CONTROLLER_LEADERELECTION_ID` | lease id used for leader election | `6d185457.terraform.padok.cloud` |
| `BURRITO_CONTROLLER_HEALTHPROBEBINDADDRESS` | address to bind the health probe server embedded in the controllers | `:8081` |
| `BURRITO_CONTROLLER_METRICSBINDADDRESS` | address to bind the metrics server embedded in the controllers | `:8080` |
| `BURRITO_CONTROLLER_KUBERNETESWEBHOOKPORT` | port used by the validating webhook server embedded in the controllers | `9443` |
| Environment variable | Description | Default |
| :--------------------------------------------------: | :------------------------------------------------------------------------------------------: | :------------------------------: |
| `BURRITO_CONTROLLER_TYPES` | list of controllers to start | `layer,repository,pullrequest` |
| `BURRITO_CONTROLLER_TIMERS_DRIFTDETECTION` | period between two plans for drift detection | `20m` |
| `BURRITO_CONTROLLER_TIMERS_ONERROR` | period between two runners launch when an error occurred in the controllers | `1m` |
| `BURRITO_CONTROLLER_TIMERS_WAITACTION` | period between two runners launch when a layer is locked | `1m` |
| `BURRITO_CONTROLLER_TIMERS_FAILUREGRACEPERIOD` | initial delay before retrying after a failure; grows exponentially with the number of failures | `15s` |
| `BURRITO_CONTROLLER_LEADERELECTION_ENABLED` | whether leader election is enabled or not | `true` |
| `BURRITO_CONTROLLER_LEADERELECTION_ID` | lease id used for leader election | `6d185457.terraform.padok.cloud` |
| `BURRITO_CONTROLLER_HEALTHPROBEBINDADDRESS` | address to bind the health probe server embedded in the controllers | `:8081` |
| `BURRITO_CONTROLLER_METRICSBINDADDRESS` | address to bind the metrics server embedded in the controllers | `:8080` |
| `BURRITO_CONTROLLER_KUBERNETESWEBHOOKPORT` | port used by the validating webhook server embedded in the controllers | `9443` |

#### Server's configuration

Expand Down
12 changes: 7 additions & 5 deletions internal/burrito/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -61,9 +61,10 @@ type LeaderElectionConfig struct {
}

// ControllerTimers holds the durations that pace the controllers.
//
// Every yaml key mirrors its field name in lowerCamelCase; keep them aligned,
// otherwise a value written under one key silently lands in another timer.
type ControllerTimers struct {
	// DriftDetection is the period between two plans.
	DriftDetection time.Duration `yaml:"driftDetection"`
	// OnError is the period between two runner launches when an error
	// occurred in the controllers.
	OnError time.Duration `yaml:"onError"`
	// WaitAction is the period between two runner launches when a layer is locked.
	WaitAction time.Duration `yaml:"waitAction"`
	// FailureGracePeriod is the initial delay before a retry; it is scaled
	// exponentially with the number of failures.
	FailureGracePeriod time.Duration `yaml:"failureGracePeriod"`
}

type RepositoryConfig struct {
Expand Down Expand Up @@ -170,8 +171,9 @@ func TestConfig() *Config {
},
Controller: ControllerConfig{
Timers: ControllerTimers{
DriftDetection: 20 * time.Minute,
WaitAction: 5 * time.Minute,
DriftDetection: 20 * time.Minute,
WaitAction: 5 * time.Minute,
FailureGracePeriod: 15 * time.Second,
},
},
Runner: RunnerConfig{
Expand Down
94 changes: 81 additions & 13 deletions internal/controllers/terraformlayer/conditions.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package terraformlayer

import (
"errors"
"fmt"
"time"

Expand Down Expand Up @@ -105,6 +106,12 @@ func (r *Reconciler) IsApplyUpToDate(t *configv1alpha1.TerraformLayer) (metav1.C
condition.Status = metav1.ConditionFalse
return condition, false
}
if applyHash == "" {
condition.Reason = "LastApplyFailed"
condition.Message = "Last apply run has failed."
condition.Status = metav1.ConditionFalse
return condition, false
}
if applyHash != planHash {
condition.Reason = "NewPlanAvailable"
condition.Message = "Apply will run."
Expand All @@ -117,28 +124,89 @@ func (r *Reconciler) IsApplyUpToDate(t *configv1alpha1.TerraformLayer) (metav1.C
return condition, true
}

func (r *Reconciler) HasFailed(t *configv1alpha1.TerraformLayer) (metav1.Condition, bool) {
func (r *Reconciler) IsInFailureGracePeriod(t *configv1alpha1.TerraformLayer) (metav1.Condition, bool) {
condition := metav1.Condition{
Type: "HasFailed",
Type: "IsInFailureGracePeriod",
ObservedGeneration: t.GetObjectMeta().GetGeneration(),
Status: metav1.ConditionUnknown,
LastTransitionTime: metav1.NewTime(time.Now()),
}
result, ok := t.Annotations[annotations.Failure]
if !ok {
condition.Reason = "NoRunYet"
condition.Message = "Terraform has not ran yet"
if failure, ok := t.Annotations[annotations.Failure]; !ok || failure == "0" {
condition.Reason = "NoFailureYet"
condition.Message = "No failure has been detected yet"
condition.Status = metav1.ConditionFalse
return condition, false
}
if string(result) == "0" {
condition.Reason = "RunExitedGracefully"
condition.Message = "Last run exited gracefully"
lastFailureDate, err := GetLastActionTime(r, t)
if err != nil {
condition.Reason = "CouldNotGetLastActionTime"
condition.Message = "Could not get last action time from layer annotations"
condition.Status = metav1.ConditionFalse
return condition, false
}
condition.Status = metav1.ConditionTrue
condition.Reason = "TerraformRunFailure"
condition.Message = "Terraform has failed, look at the runner logs"
return condition, true

nextFailure := lastFailureDate.Add(GetLayerExponentialBackOffTime(r.Config.Controller.Timers.FailureGracePeriod, t))
now := r.Clock.Now()
if nextFailure.After(now) {
condition.Reason = "InFailureGracePeriod"
condition.Message = fmt.Sprintf("The failure grace period is still active (until %s).", nextFailure)
condition.Status = metav1.ConditionTrue
return condition, true
}
condition.Reason = "FailureGracePeriodOver"
condition.Message = fmt.Sprintf("The failure grace period is over (since %s).", now.Sub(nextFailure))
condition.Status = metav1.ConditionFalse
return condition, false
}

// func (r *Reconciler) HasFailed(t *configv1alpha1.TerraformLayer) (metav1.Condition, bool) {
// condition := metav1.Condition{
// Type: "HasFailed",
// ObservedGeneration: t.GetObjectMeta().GetGeneration(),
// Status: metav1.ConditionUnknown,
// LastTransitionTime: metav1.NewTime(time.Now()),
// }
// result, ok := t.Annotations[annotations.Failure]
// if !ok {
// condition.Reason = "NoRunYet"
// condition.Message = "Terraform has not ran yet"
// condition.Status = metav1.ConditionFalse
// return condition, false
// }
// if string(result) == "0" {
// condition.Reason = "RunExitedGracefully"
// condition.Message = "Last run exited gracefully"
// condition.Status = metav1.ConditionFalse
// return condition, false
// }
// condition.Status = metav1.ConditionTrue
// condition.Reason = "TerraformRunFailure"
// condition.Message = "Terraform has failed, look at the runner logs"
// return condition, true
// }

// GetLastActionTime returns the most recent of the layer's last plan and last
// apply timestamps, both read from annotations in time.UnixDate format.
//
// A layer without a last-plan annotation, or with an annotation that cannot
// be parsed, yields an error together with the reconciler clock's current
// time. A missing last-apply annotation is not an error: the plan time is
// returned on its own.
func GetLastActionTime(r *Reconciler, layer *configv1alpha1.TerraformLayer) (time.Time, error) {
	planStamp, found := layer.Annotations[annotations.LastPlanDate]
	if !found {
		return r.Clock.Now(), errors.New("never ran a plan on this layer")
	}
	planTime, err := time.Parse(time.UnixDate, planStamp)
	if err != nil {
		return r.Clock.Now(), err
	}

	applyStamp, found := layer.Annotations[annotations.LastApplyDate]
	if !found {
		return planTime, nil
	}
	applyTime, err := time.Parse(time.UnixDate, applyStamp)
	if err != nil {
		return r.Clock.Now(), err
	}

	// Whichever action happened last wins.
	if applyTime.After(planTime) {
		return applyTime, nil
	}
	return planTime, nil
}
25 changes: 25 additions & 0 deletions internal/controllers/terraformlayer/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,11 @@ package terraformlayer

import (
"context"
"math"
"strconv"
"time"

"github.com/padok-team/burrito/internal/annotations"
"github.com/padok-team/burrito/internal/burrito/config"
"github.com/padok-team/burrito/internal/lock"
"github.com/padok-team/burrito/internal/storage"
Expand Down Expand Up @@ -128,6 +131,28 @@ func (r *Reconciler) SetupWithManager(mgr ctrl.Manager) error {
Complete(r)
}

// GetLayerExponentialBackOffTime computes the back-off delay for a layer from
// its failure annotation.
//
// The annotation is parsed as an integer number of attempts. A parse error is
// logged and the value strconv.Atoi returned is still used. With fewer than
// one attempt (including a missing annotation) DefaultRequeueAfter is
// returned unchanged; otherwise the delay is delegated to
// GetExponentialBackOffTime.
func GetLayerExponentialBackOffTime(DefaultRequeueAfter time.Duration, layer *configv1alpha1.TerraformLayer) time.Duration {
	failures := 0
	if raw, present := layer.Annotations[annotations.Failure]; present {
		parsed, err := strconv.Atoi(raw)
		if err != nil {
			log.Errorf("failed to convert failure annotations : %v to int. Error : %v", raw, err)
		}
		failures = parsed
	}

	if failures >= 1 {
		return GetExponentialBackOffTime(DefaultRequeueAfter, failures)
	}
	return DefaultRequeueAfter
}

// GetExponentialBackOffTime scales DefaultRequeueAfter by floor(e^attempts).
//
// The result saturates at the largest representable duration instead of
// overflowing. The previous int32 conversion was out of range from 22
// attempts onward, and the multiplication itself could overflow int64 earlier
// (for example 21 attempts with a 15s base). Either case produced a zero or
// negative delay, which cancels the back-off exactly when it matters most.
//
// For inputs that never overflowed the result is unchanged: attempts below
// zero give a multiplier of 0 and therefore a zero duration.
func GetExponentialBackOffTime(DefaultRequeueAfter time.Duration, attempts int) time.Duration {
	const maxBackOff = time.Duration(math.MaxInt64)

	// math.Exp returns +Inf for very large inputs; Floor keeps it +Inf so the
	// saturation check below still triggers.
	multiplier := math.Floor(math.Exp(float64(attempts)))
	if multiplier < 1 || DefaultRequeueAfter == 0 {
		return 0
	}

	// limit is the largest multiplier whose product still fits in a Duration.
	limit, saturated := maxBackOff/DefaultRequeueAfter, maxBackOff
	if DefaultRequeueAfter < 0 {
		limit, saturated = -limit, -maxBackOff
	}
	if multiplier >= float64(limit) {
		return saturated
	}
	return time.Duration(int64(multiplier)) * DefaultRequeueAfter
}

func ignorePredicate() predicate.Predicate {
return predicate.Funcs{
UpdateFunc: func(e event.UpdateEvent) bool {
Expand Down
Loading