Skip to content

Commit

Permalink
feat(terraformlayer): exponential time before retry
Browse files Browse the repository at this point in the history
  • Loading branch information
Thibaut-Padok authored and spoukke committed May 9, 2023
1 parent 840046e commit 2c861db
Show file tree
Hide file tree
Showing 8 changed files with 292 additions and 23 deletions.
4 changes: 3 additions & 1 deletion cmd/controllers/start.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,12 +26,14 @@ func buildControllersStartCmd(app *burrito.App) *cobra.Command {
defaultDriftDetectionTimer, _ := time.ParseDuration("20m")
defaultOnErrorTimer, _ := time.ParseDuration("1m")
defaultWaitActionTimer, _ := time.ParseDuration("1m")
defaultFailureGracePeriod, _ := time.ParseDuration("15s")

cmd.Flags().StringSliceVar(&app.Config.Controller.Types, "types", []string{"layer", "repository", "pullrequest"}, "list of controllers to start")

cmd.Flags().DurationVar(&app.Config.Controller.Timers.DriftDetection, "drift-detection-period", defaultDriftDetectionTimer, "period between two plans. Must end with s, m or h.")
cmd.Flags().DurationVar(&app.Config.Controller.Timers.OnError, "on-error-period", defaultOnErrorTimer, "period between two runners launch when an error occurred. Must end with s, m or h.")
cmd.Flags().DurationVar(&app.Config.Controller.Timers.OnError, "on-error-period", defaultOnErrorTimer, "period between two runners launch when an error occurred in the controllers. Must end with s, m or h.")
cmd.Flags().DurationVar(&app.Config.Controller.Timers.WaitAction, "wait-action-period", defaultWaitActionTimer, "period between two runners when a layer is locked. Must end with s, m or h.")
cmd.Flags().DurationVar(&app.Config.Controller.Timers.FailureGracePeriod, "failure-grace-period", defaultFailureGracePeriod, "initial time before retry, goes exponential function of number failure. Must end with s, m or h.")
cmd.Flags().BoolVar(&app.Config.Controller.LeaderElection.Enabled, "leader-election", true, "whether leader election is enabled or not, default to true")
cmd.Flags().StringVar(&app.Config.Controller.LeaderElection.ID, "leader-election-id", "6d185457.terraform.padok.cloud", "lease id used for leader election")
cmd.Flags().StringVar(&app.Config.Controller.HealthProbeBindAddress, "health-probe-bind-address", ":8081", "address to bind the metrics server embedded in the controllers")
Expand Down
23 changes: 12 additions & 11 deletions docs/contents/usage/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -217,17 +217,18 @@ You can configure `burrito` with environment variables.

#### Controllers' configuration

| Environment variable | Description | Default |
| :-----------------------------------------: | :--------------------------------------------------------------------: | :------------------------------: |
| `BURRITO_CONTROLLER_TYPES` | list of controllers to start | `layer,repository` |
| `BURRITO_CONTROLLER_TIMERS_DRIFTDETECTION` | period between two plans for drift detection | `20m` |
| `BURRITO_CONTROLLER_TIMERS_ONERROR` | period between two runners launch when an error occurred | `1m` |
| `BURRITO_CONTROLLER_TIMERS_WAITACTION` | period between two runners launch when a layer is locked | `1m` |
| `BURRITO_CONTROLLER_LEADERELECTION_ENABLED` | whether leader election is enabled or not | `true` |
| `BURRITO_CONTROLLER_LEADERELECTION_ID` | lease id used for leader election | `6d185457.terraform.padok.cloud` |
| `BURRITO_CONTROLLER_HEALTHPROBEBINDADDRESS` | address to bind the health probe server embedded in the controllers | `:8081` |
| `BURRITO_CONTROLLER_METRICSBINDADDRESS` | address to bind the metrics server embedded in the controllers | `:8080` |
| `BURRITO_CONTROLLER_KUBERNETESWEBHOOKPORT` | port used by the validating webhook server embedded in the controllers | `9443` |
| Environment variable | Description | Default |
| :--------------------------------------------------: | :------------------------------------------------------------------------------------------: | :------------------------------: |
| `BURRITO_CONTROLLER_TYPES` | list of controllers to start | `layer,repository,pullrequest` |
| `BURRITO_CONTROLLER_TIMERS_DRIFTDETECTION` | period between two plans for drift detection | `20m` |
| `BURRITO_CONTROLLER_TIMERS_ONERROR` | period between two runners launch when an error occurred in the controllers | `1m` |
| `BURRITO_CONTROLLER_TIMERS_WAITACTION` | period between two runners launch when a layer is locked | `1m` |
| `BURRITO_CONTROLLER_TIMERS_FAILUREGRACEPERIOD` | initial delay before a retry; grows exponentially with the number of failures | `15s` |
| `BURRITO_CONTROLLER_LEADERELECTION_ENABLED` | whether leader election is enabled or not | `true` |
| `BURRITO_CONTROLLER_LEADERELECTION_ID` | lease id used for leader election | `6d185457.terraform.padok.cloud` |
| `BURRITO_CONTROLLER_HEALTHPROBEBINDADDRESS` | address to bind the health probe server embedded in the controllers | `:8081` |
| `BURRITO_CONTROLLER_METRICSBINDADDRESS` | address to bind the metrics server embedded in the controllers | `:8080` |
| `BURRITO_CONTROLLER_KUBERNETESWEBHOOKPORT` | port used by the validating webhook server embedded in the controllers | `9443` |

#### Server's configuration

Expand Down
7 changes: 4 additions & 3 deletions internal/burrito/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -61,9 +61,10 @@ type LeaderElectionConfig struct {
}

// ControllerTimers groups the durations that pace the controllers.
// Every YAML key is the lowerCamelCase form of its field name, so that a
// value written under a given key always lands in the matching timer.
type ControllerTimers struct {
	// DriftDetection is the period between two plans.
	DriftDetection time.Duration `yaml:"driftDetection"`
	// OnError is the period between two runner launches when an error
	// occurred in the controllers.
	OnError time.Duration `yaml:"onError"`
	// WaitAction is the period between two runner launches when a layer is
	// locked.
	WaitAction time.Duration `yaml:"waitAction"`
	// FailureGracePeriod is the initial delay before a retry; it is scaled
	// up exponentially with the number of failures.
	FailureGracePeriod time.Duration `yaml:"failureGracePeriod"`
}

type RepositoryConfig struct {
Expand Down
79 changes: 78 additions & 1 deletion internal/controllers/terraformlayer/conditions.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package terraformlayer

import (
"errors"
"fmt"
"time"

Expand All @@ -16,7 +17,16 @@ func (r *Reconciler) IsPlanArtifactUpToDate(t *configv1alpha1.TerraformLayer) (m
Status: metav1.ConditionUnknown,
LastTransitionTime: metav1.NewTime(time.Now()),
}
value, ok := t.Annotations[annotations.LastPlanDate]
value, ok := t.Annotations[annotations.LastPlanSum]
// If the annotation exists and its value is "", an issue occurred during the last plan.
// If the annotation does not exist, no plan has ever run. We can fall back to other conditions.
if value == "" && ok {
condition.Reason = "LastPlanFailed"
condition.Message = "Last plan run has failed"
condition.Status = metav1.ConditionFalse
return condition, false
}
value, ok = t.Annotations[annotations.LastPlanDate]
if !ok {
condition.Reason = "NoPlanHasRunYet"
condition.Message = "No plan has run on this layer yet"
Expand Down Expand Up @@ -105,6 +115,12 @@ func (r *Reconciler) IsApplyUpToDate(t *configv1alpha1.TerraformLayer) (metav1.C
condition.Status = metav1.ConditionFalse
return condition, false
}
if applyHash == "" {
condition.Reason = "LastApplyFailed"
condition.Message = "Last apply run has failed."
condition.Status = metav1.ConditionFalse
return condition, false
}
if applyHash != planHash {
condition.Reason = "NewPlanAvailable"
condition.Message = "Apply will run."
Expand All @@ -117,6 +133,41 @@ func (r *Reconciler) IsApplyUpToDate(t *configv1alpha1.TerraformLayer) (metav1.C
return condition, true
}

// IsInFailureGracePeriod reports whether the layer is still inside the
// back-off window that follows a failed run.
//
// The window starts at the layer's last action time (GetLastActionTime) and
// lasts for the duration GetLayerExponentialBackOffTime derives from the
// configured FailureGracePeriod. The boolean is true only while that window
// is still open. A layer without a failure annotation (or with the value
// "0"), or whose last action time cannot be read, is reported as outside the
// grace period.
func (r *Reconciler) IsInFailureGracePeriod(t *configv1alpha1.TerraformLayer) (metav1.Condition, bool) {
	condition := metav1.Condition{
		Type:               "IsInFailureGracePeriod",
		ObservedGeneration: t.GetObjectMeta().GetGeneration(),
		Status:             metav1.ConditionUnknown,
		LastTransitionTime: metav1.NewTime(time.Now()),
	}
	// conclude records the outcome on the condition; the boolean result is
	// simply whether the final status is ConditionTrue.
	conclude := func(status metav1.ConditionStatus, reason, message string) (metav1.Condition, bool) {
		condition.Reason = reason
		condition.Message = message
		condition.Status = status
		return condition, status == metav1.ConditionTrue
	}

	failures, annotated := t.Annotations[annotations.Failure]
	if !annotated || failures == "0" {
		return conclude(metav1.ConditionFalse, "NoFailureYet", "No failure has been detected yet")
	}

	lastAction, err := GetLastActionTime(t)
	if err != nil {
		return conclude(metav1.ConditionFalse, "CouldNotGetLastActionTime", "Could not get last action time from layer annotations")
	}

	gracePeriodEnd := lastAction.Add(GetLayerExponentialBackOffTime(r.Config.Controller.Timers.FailureGracePeriod, t))
	now := time.Now()
	if !gracePeriodEnd.After(now) {
		return conclude(
			metav1.ConditionFalse,
			"FailureGracePeriodOver",
			fmt.Sprintf("The failure grace period is over (since %s).", now.Sub(gracePeriodEnd)),
		)
	}
	return conclude(
		metav1.ConditionTrue,
		"InFailureGracePeriod",
		fmt.Sprintf("The failure grace period is still active (until %s).", gracePeriodEnd),
	)
}

func (r *Reconciler) HasFailed(t *configv1alpha1.TerraformLayer) (metav1.Condition, bool) {
condition := metav1.Condition{
Type: "HasFailed",
Expand All @@ -142,3 +193,29 @@ func (r *Reconciler) HasFailed(t *configv1alpha1.TerraformLayer) (metav1.Conditi
condition.Message = "Terraform has failed, look at the runner logs"
return condition, true
}

// GetLastActionTime returns the most recent of the layer's last plan date and
// last apply date, both read from its annotations in time.UnixDate format.
//
// A missing last-plan annotation or an unparsable date yields an error
// (returned alongside the current time). A missing last-apply annotation is
// not an error: the plan date is returned as is.
func GetLastActionTime(layer *configv1alpha1.TerraformLayer) (time.Time, error) {
	rawPlanDate, hasPlan := layer.Annotations[annotations.LastPlanDate]
	if !hasPlan {
		return time.Now(), errors.New("never ran a plan on this layer")
	}
	planDate, err := time.Parse(time.UnixDate, rawPlanDate)
	if err != nil {
		return time.Now(), err
	}

	rawApplyDate, hasApply := layer.Annotations[annotations.LastApplyDate]
	if !hasApply {
		return planDate, nil
	}
	applyDate, err := time.Parse(time.UnixDate, rawApplyDate)
	if err != nil {
		return time.Now(), err
	}

	// Keep whichever of the two actions happened last.
	if applyDate.After(planDate) {
		return applyDate, nil
	}
	return planDate, nil
}
25 changes: 25 additions & 0 deletions internal/controllers/terraformlayer/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,11 @@ package terraformlayer

import (
"context"
"math"
"strconv"
"time"

"github.com/padok-team/burrito/internal/annotations"
"github.com/padok-team/burrito/internal/burrito/config"
"github.com/padok-team/burrito/internal/lock"
"github.com/padok-team/burrito/internal/storage"
Expand Down Expand Up @@ -128,6 +131,28 @@ func (r *Reconciler) SetupWithManager(mgr ctrl.Manager) error {
Complete(r)
}

// GetLayerExponentialBackOffTime returns how long to wait before retrying the
// given layer, based on the failure count stored in its failure annotation.
//
// DefaultRequeueAfter is both the base delay and the value returned when the
// layer has no failure annotation, when the annotation is not a valid
// integer, or when the recorded count is lower than 1. Otherwise the delay is
// computed by GetExponentialBackOffTime.
func GetLayerExponentialBackOffTime(DefaultRequeueAfter time.Duration, layer *configv1alpha1.TerraformLayer) time.Duration {
	raw, ok := layer.Annotations[annotations.Failure]
	if !ok {
		return DefaultRequeueAfter
	}

	attempts, err := strconv.Atoi(raw)
	if err != nil {
		log.Errorf("failed to convert failure annotations : %v to int. Error : %v", raw, err)
		// Do not trust the value returned with the error: on a range error
		// strconv.Atoi yields the largest representable int, which would
		// otherwise be used as the attempt count.
		return DefaultRequeueAfter
	}
	if attempts < 1 {
		return DefaultRequeueAfter
	}
	return GetExponentialBackOffTime(DefaultRequeueAfter, attempts)
}

// GetExponentialBackOffTime scales DefaultRequeueAfter by floor(e^attempts).
//
// With a one minute base, 1 attempt yields 2m, 2 yield 7m, 3 yield 20m, and so
// on. When the product no longer fits in a time.Duration the result saturates
// at the largest representable duration (or the most negative one for a
// negative base) instead of wrapping around, so a high failure count can never
// produce a bogus short or negative delay. A zero base, or a negative number
// of attempts (factor 0), yields 0.
func GetExponentialBackOffTime(DefaultRequeueAfter time.Duration, attempts int) time.Duration {
	factor := math.Floor(math.Exp(float64(attempts)))
	if factor < 1 || DefaultRequeueAfter == 0 {
		return 0
	}

	// bound is the saturation value; it carries the same sign as the base so
	// that bound / DefaultRequeueAfter is the largest safe multiplier.
	bound := time.Duration(math.MaxInt64)
	if DefaultRequeueAfter < 0 {
		bound = -bound
	}
	maxFactor := int64(bound / DefaultRequeueAfter)

	// The first test keeps the float -> int64 conversion in range (it also
	// catches +Inf); the second one guards the multiplication itself.
	if factor >= float64(math.MaxInt64) || int64(factor) > maxFactor {
		return bound
	}
	return time.Duration(int64(factor)) * DefaultRequeueAfter
}

func ignorePredicate() predicate.Predicate {
return predicate.Funcs{
UpdateFunc: func(e event.UpdateEvent) bool {
Expand Down
138 changes: 138 additions & 0 deletions internal/controllers/terraformlayer/controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ import (
utils "github.com/padok-team/burrito/internal/controllers/testing"
storage "github.com/padok-team/burrito/internal/storage/mock"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/labels"
"k8s.io/apimachinery/pkg/selection"
"k8s.io/apimachinery/pkg/types"
Expand Down Expand Up @@ -372,3 +373,140 @@ var _ = AfterSuite(func() {
err := testEnv.Stop()
Expect(err).NotTo(HaveOccurred())
})

// TestGetLayerExponentialBackOffTime checks the delay computed from a layer's
// failure annotation: the base delay when there is no failure recorded, and
// floor(e^n) times the base delay after n failures.
func TestGetLayerExponentialBackOffTime(t *testing.T) {
	// newLayer builds a minimal layer. An empty failures string leaves the
	// layer without any annotation; otherwise the string is stored as the
	// value of the failure annotation.
	newLayer := func(failures string) *configv1alpha1.TerraformLayer {
		layer := &configv1alpha1.TerraformLayer{
			Spec: configv1alpha1.TerraformLayerSpec{
				TerraformConfig: configv1alpha1.TerraformConfig{
					Version: "1.0.1",
				},
			},
		}
		if failures != "" {
			layer.ObjectMeta = metav1.ObjectMeta{
				Annotations: map[string]string{"runner.terraform.padok.cloud/failure": failures},
			}
		}
		return layer
	}

	tt := []struct {
		name         string
		defaultTime  time.Duration
		layer        *configv1alpha1.TerraformLayer
		expectedTime time.Duration
	}{
		{
			name:         "Exponential backoff : No retry",
			defaultTime:  time.Minute,
			layer:        newLayer(""),
			expectedTime: time.Minute,
		},
		{
			name:         "Exponential backoff : Success",
			defaultTime:  time.Minute,
			layer:        newLayer("0"),
			expectedTime: time.Minute,
		},
		{
			name:         "Exponential backoff : 1 retry",
			defaultTime:  time.Minute,
			layer:        newLayer("1"),
			expectedTime: 2 * time.Minute,
		},
		{
			name:         "Exponential backoff : 2 retry",
			defaultTime:  time.Minute,
			layer:        newLayer("2"),
			expectedTime: 7 * time.Minute,
		},
		{
			name:         "Exponential backoff : 3 retry",
			defaultTime:  time.Minute,
			layer:        newLayer("3"),
			expectedTime: 20 * time.Minute,
		},
		{
			name:         "Exponential backoff : 5 retry",
			defaultTime:  time.Minute,
			layer:        newLayer("5"),
			expectedTime: 148 * time.Minute,
		},
		{
			name:         "Exponential backoff : 10 retry",
			defaultTime:  time.Minute,
			layer:        newLayer("10"),
			expectedTime: 22026 * time.Minute,
		},
		{
			name:         "Exponential backoff : 17 retry",
			defaultTime:  time.Minute,
			layer:        newLayer("17"),
			expectedTime: 24154952 * time.Minute,
		},
	}

	for _, tc := range tt {
		t.Run(tc.name, func(t *testing.T) {
			result := controller.GetLayerExponentialBackOffTime(tc.defaultTime, tc.layer)
			if tc.expectedTime != result {
				t.Errorf("unexpected backoff duration: expected %s, got %s", tc.expectedTime, result)
			}
		})
	}
}
Loading

0 comments on commit 2c861db

Please sign in to comment.