diff --git a/cmd/controllers/start.go b/cmd/controllers/start.go index 2505073b..f8410196 100644 --- a/cmd/controllers/start.go +++ b/cmd/controllers/start.go @@ -26,12 +26,14 @@ func buildControllersStartCmd(app *burrito.App) *cobra.Command { defaultDriftDetectionTimer, _ := time.ParseDuration("20m") defaultOnErrorTimer, _ := time.ParseDuration("1m") defaultWaitActionTimer, _ := time.ParseDuration("1m") + defaultFailureGracePeriod, _ := time.ParseDuration("15s") cmd.Flags().StringSliceVar(&app.Config.Controller.Types, "types", []string{"layer", "repository", "pullrequest"}, "list of controllers to start") cmd.Flags().DurationVar(&app.Config.Controller.Timers.DriftDetection, "drift-detection-period", defaultDriftDetectionTimer, "period between two plans. Must end with s, m or h.") - cmd.Flags().DurationVar(&app.Config.Controller.Timers.OnError, "on-error-period", defaultOnErrorTimer, "period between two runners launch when an error occurred. Must end with s, m or h.") + cmd.Flags().DurationVar(&app.Config.Controller.Timers.OnError, "on-error-period", defaultOnErrorTimer, "period between two runners launch when an error occurred in the controllers. Must end with s, m or h.") cmd.Flags().DurationVar(&app.Config.Controller.Timers.WaitAction, "wait-action-period", defaultWaitActionTimer, "period between two runners when a layer is locked. Must end with s, m or h.") + cmd.Flags().DurationVar(&app.Config.Controller.Timers.FailureGracePeriod, "failure-grace-period", defaultFailureGracePeriod, "initial time before retry, goes exponential function of number failure. 
Must end with s, m or h.") cmd.Flags().BoolVar(&app.Config.Controller.LeaderElection.Enabled, "leader-election", true, "whether leader election is enabled or not, default to true") cmd.Flags().StringVar(&app.Config.Controller.LeaderElection.ID, "leader-election-id", "6d185457.terraform.padok.cloud", "lease id used for leader election") cmd.Flags().StringVar(&app.Config.Controller.HealthProbeBindAddress, "health-probe-bind-address", ":8081", "address to bind the metrics server embedded in the controllers") diff --git a/docs/contents/usage/README.md b/docs/contents/usage/README.md index 20916da9..d8e98641 100644 --- a/docs/contents/usage/README.md +++ b/docs/contents/usage/README.md @@ -217,17 +217,18 @@ You can configure `burrito` with environment variables. #### Controllers' configuration -| Environment variable | Description | Default | -| :-----------------------------------------: | :--------------------------------------------------------------------: | :------------------------------: | -| `BURRITO_CONTROLLER_TYPES` | list of controllers to start | `layer,repository` | -| `BURRITO_CONTROLLER_TIMERS_DRIFTDETECTION` | period between two plans for drift detection | `20m` | -| `BURRITO_CONTROLLER_TIMERS_ONERROR` | period between two runners launch when an error occurred | `1m` | -| `BURRITO_CONTROLLER_TIMERS_WAITACTION` | period between two runners launch when a layer is locked | `1m` | -| `BURRITO_CONTROLLER_LEADERELECTION_ENABLED` | whether leader election is enabled or not | `true` | -| `BURRITO_CONTROLLER_LEADERELECTION_ID` | lease id used for leader election | `6d185457.terraform.padok.cloud` | -| `BURRITO_CONTROLLER_HEALTHPROBEBINDADDRESS` | address to bind the health probe server embedded in the controllers | `:8081` | -| `BURRITO_CONTROLLER_METRICSBINDADDRESS` | address to bind the metrics server embedded in the controllers | `:8080` | -| `BURRITO_CONTROLLER_KUBERNETESWEBHOOKPORT` | port used by the validating webhook server embedded in the controllers | 
`9443` |
+| Environment variable | Description | Default |
+| :--------------------------------------------------: | :------------------------------------------------------------------------------------------: | :------------------------------: |
+| `BURRITO_CONTROLLER_TYPES` | list of controllers to start | `layer,repository` |
+| `BURRITO_CONTROLLER_TIMERS_DRIFTDETECTION` | period between two plans for drift detection | `20m` |
+| `BURRITO_CONTROLLER_TIMERS_ONERROR` | period between two runners launch when an error occurred in the controllers | `1m` |
+| `BURRITO_CONTROLLER_TIMERS_WAITACTION` | period between two runners launch when a layer is locked | `1m` |
+| `BURRITO_CONTROLLER_TIMERS_FAILUREGRACEPERIOD` | initial time before retry, goes exponential function of number failure | `15s` |
+| `BURRITO_CONTROLLER_LEADERELECTION_ENABLED` | whether leader election is enabled or not | `true` |
+| `BURRITO_CONTROLLER_LEADERELECTION_ID` | lease id used for leader election | `6d185457.terraform.padok.cloud` |
+| `BURRITO_CONTROLLER_HEALTHPROBEBINDADDRESS` | address to bind the health probe server embedded in the controllers | `:8081` |
+| `BURRITO_CONTROLLER_METRICSBINDADDRESS` | address to bind the metrics server embedded in the controllers | `:8080` |
+| `BURRITO_CONTROLLER_KUBERNETESWEBHOOKPORT` | port used by the validating webhook server embedded in the controllers | `9443` |
 
 #### Server's configuration
 
diff --git a/internal/burrito/config/config.go b/internal/burrito/config/config.go
index e7b5b526..a87753fd 100644
--- a/internal/burrito/config/config.go
+++ b/internal/burrito/config/config.go
@@ -61,9 +61,17 @@ type LeaderElectionConfig struct {
 }
 
+// ControllerTimers groups the durations that drive how often the controllers
+// requeue a resource.
 type ControllerTimers struct {
-	DriftDetection time.Duration `yaml:"driftDetection"`
-	OnError        time.Duration `yaml:"onError"`
-	WaitAction     time.Duration `yaml:"waitAction"`
+	// DriftDetection is the period between two plans.
+	DriftDetection time.Duration `yaml:"driftDetection"`
+	// OnError is the period between two runner launches after a controller error.
+	OnError time.Duration `yaml:"onError"`
+	// WaitAction is the period between two runner launches while a layer is locked.
+	WaitAction time.Duration `yaml:"waitAction"`
+	// FailureGracePeriod is the initial delay before retrying a failed run; it
+	// grows exponentially with the number of failures.
+	FailureGracePeriod time.Duration `yaml:"failureGracePeriod"`
 }
 
 type RepositoryConfig struct {
@@ -170,8 +178,9 @@ func TestConfig() *Config {
 		},
 		Controller: ControllerConfig{
 			Timers: ControllerTimers{
-				DriftDetection: 20 * time.Minute,
-				WaitAction:     5 * time.Minute,
+				DriftDetection:     20 * time.Minute,
+				WaitAction:         5 * time.Minute,
+				FailureGracePeriod: 15 * time.Second,
 			},
 		},
 		Runner: RunnerConfig{
diff --git a/internal/controllers/terraformlayer/conditions.go b/internal/controllers/terraformlayer/conditions.go
index ab56b23d..0d3c184a 100644
--- a/internal/controllers/terraformlayer/conditions.go
+++ b/internal/controllers/terraformlayer/conditions.go
@@ -1,6 +1,7 @@
 package terraformlayer
 
 import (
+	"errors"
 	"fmt"
 	"time"
 
@@ -105,6 +106,12 @@ func (r *Reconciler) IsApplyUpToDate(t *configv1alpha1.TerraformLayer) (metav1.C
 		condition.Status = metav1.ConditionFalse
 		return condition, false
 	}
+	if applyHash == "" {
+		condition.Reason = "LastApplyFailed"
+		condition.Message = "Last apply run has failed."
+		condition.Status = metav1.ConditionFalse
+		return condition, false
+	}
 	if applyHash != planHash {
 		condition.Reason = "NewPlanAvailable"
 		condition.Message = "Apply will run."
@@ -117,28 +124,70 @@ func (r *Reconciler) IsApplyUpToDate(t *configv1alpha1.TerraformLayer) (metav1.C
 	return condition, true
 }
 
-func (r *Reconciler) HasFailed(t *configv1alpha1.TerraformLayer) (metav1.Condition, bool) {
+// IsInFailureGracePeriod reports whether the layer's last run failed and its
+// exponential back-off delay has not elapsed yet. The returned condition
+// carries the reason, and the boolean is true only while the delay is active.
+func (r *Reconciler) IsInFailureGracePeriod(t *configv1alpha1.TerraformLayer) (metav1.Condition, bool) {
 	condition := metav1.Condition{
-		Type:               "HasFailed",
+		Type:               "IsInFailureGracePeriod",
 		ObservedGeneration: t.GetObjectMeta().GetGeneration(),
 		Status:             metav1.ConditionUnknown,
 		LastTransitionTime: metav1.NewTime(time.Now()),
 	}
-	result, ok := t.Annotations[annotations.Failure]
-	if !ok {
-		condition.Reason = "NoRunYet"
-		condition.Message = "Terraform has not ran yet"
+	if failure, ok := t.Annotations[annotations.Failure]; !ok || failure == "0" {
+		condition.Reason = "NoFailureYet"
+		condition.Message = "No failure has been detected yet"
 		condition.Status = metav1.ConditionFalse
 		return condition, false
 	}
-	if string(result) == "0" {
-		condition.Reason = "RunExitedGracefully"
-		condition.Message = "Last run exited gracefully"
+	lastFailureDate, err := GetLastActionTime(r, t)
+	if err != nil {
+		condition.Reason = "CouldNotGetLastActionTime"
+		condition.Message = "Could not get last action time from layer annotations"
 		condition.Status = metav1.ConditionFalse
 		return condition, false
 	}
-	condition.Status = metav1.ConditionTrue
-	condition.Reason = "TerraformRunFailure"
-	condition.Message = "Terraform has failed, look at the runner logs"
-	return condition, true
+
+	// The grace period ends at the last action time plus the back-off delay.
+	nextFailure := lastFailureDate.Add(GetLayerExponentialBackOffTime(r.Config.Controller.Timers.FailureGracePeriod, t))
+	now := r.Clock.Now()
+	if nextFailure.After(now) {
+		condition.Reason = "InFailureGracePeriod"
+		condition.Message = fmt.Sprintf("The failure grace period is still active (until %s).", nextFailure)
+		condition.Status = metav1.ConditionTrue
+		return condition, true
+	}
+	condition.Reason = "FailureGracePeriodOver"
+	condition.Message = fmt.Sprintf("The failure grace period is over (since %s).", now.Sub(nextFailure))
+	condition.Status = metav1.ConditionFalse
+	return condition, false
+}
+
+// GetLastActionTime returns the most recent of the layer's last plan and last
+// apply dates, read from its annotations in time.UnixDate format. It returns an
+// error when the layer has no plan date or when a date cannot be parsed; a
+// missing apply date is not an error.
+func GetLastActionTime(r *Reconciler, layer *configv1alpha1.TerraformLayer) (time.Time, error) {
+	lastPlanTimeAnnotation, ok := layer.Annotations[annotations.LastPlanDate]
+	if !ok {
+		return time.Time{}, errors.New("never ran a plan on this layer")
+	}
+	lastActionTime, err := time.Parse(time.UnixDate, lastPlanTimeAnnotation)
+	if err != nil {
+		return time.Time{}, err
+	}
+
+	lastApplyTimeAnnotation, ok := layer.Annotations[annotations.LastApplyDate]
+	if !ok {
+		return lastActionTime, nil
+	}
+	lastApplyTime, err := time.Parse(time.UnixDate, lastApplyTimeAnnotation)
+	if err != nil {
+		return time.Time{}, err
+	}
+
+	if lastApplyTime.After(lastActionTime) {
+		lastActionTime = lastApplyTime
+	}
+	return lastActionTime, nil
 }
diff --git a/internal/controllers/terraformlayer/controller.go b/internal/controllers/terraformlayer/controller.go
index 5cc7d07d..55949270 100644
--- 
a/internal/controllers/terraformlayer/controller.go
+++ b/internal/controllers/terraformlayer/controller.go
@@ -18,8 +18,11 @@ package terraformlayer
 
 import (
 	"context"
+	"math"
+	"strconv"
 	"time"
 
+	"github.com/padok-team/burrito/internal/annotations"
 	"github.com/padok-team/burrito/internal/burrito/config"
 	"github.com/padok-team/burrito/internal/lock"
 	"github.com/padok-team/burrito/internal/storage"
@@ -128,6 +131,48 @@ func (r *Reconciler) SetupWithManager(mgr ctrl.Manager) error {
 		Complete(r)
 }
 
+// GetLayerExponentialBackOffTime returns how long to wait before retrying a
+// layer, based on the failure count stored in its annotations.Failure
+// annotation. It returns DefaultRequeueAfter unchanged when the annotation is
+// missing, cannot be parsed as an integer, or is lower than 1.
+func GetLayerExponentialBackOffTime(DefaultRequeueAfter time.Duration, layer *configv1alpha1.TerraformLayer) time.Duration {
+	n, ok := layer.Annotations[annotations.Failure]
+	if !ok {
+		return DefaultRequeueAfter
+	}
+	attempts, err := strconv.Atoi(n)
+	if err != nil {
+		// Best effort: an unreadable counter is treated as "no failure".
+		log.Errorf("failed to convert failure annotations : %v to int. Error : %v", n, err)
+		return DefaultRequeueAfter
+	}
+	if attempts < 1 {
+		return DefaultRequeueAfter
+	}
+	return GetExponentialBackOffTime(DefaultRequeueAfter, attempts)
+}
+
+// GetExponentialBackOffTime returns DefaultRequeueAfter multiplied by e^attempts,
+// with the multiplier truncated to an integer. The result is clamped to the
+// largest representable time.Duration so that a high failure count cannot
+// overflow into a negative or wrapped-around delay.
+func GetExponentialBackOffTime(DefaultRequeueAfter time.Duration, attempts int) time.Duration {
+	const maxBackOff = time.Duration(math.MaxInt64)
+	if DefaultRequeueAfter <= 0 {
+		// Scaling a non-positive period is meaningless; hand it back untouched.
+		return DefaultRequeueAfter
+	}
+	factor := math.Exp(float64(attempts))
+	if factor >= float64(math.MaxInt64) {
+		return maxBackOff
+	}
+	multiplier := time.Duration(factor) // truncates toward zero
+	if multiplier > maxBackOff/DefaultRequeueAfter {
+		return maxBackOff
+	}
+	return multiplier * DefaultRequeueAfter
+}
+
 func ignorePredicate() predicate.Predicate {
 	return predicate.Funcs{
 		UpdateFunc: func(e event.UpdateEvent) bool {
diff --git a/internal/controllers/terraformlayer/controller_test.go b/internal/controllers/terraformlayer/controller_test.go
index 8f293316..cbfa7004 100644
--- a/internal/controllers/terraformlayer/controller_test.go
+++ b/internal/controllers/terraformlayer/controller_test.go
@@ -16,6 +16,7 @@ import (
 	utils "github.com/padok-team/burrito/internal/controllers/testing"
 	storage "github.com/padok-team/burrito/internal/storage/mock"
 	corev1 "k8s.io/api/core/v1"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/labels"
 	"k8s.io/apimachinery/pkg/selection"
 	"k8s.io/apimachinery/pkg/types"
@@ -80,13 +81,13 @@ var _ = BeforeSuite(func() {
 
 })
 
-func getResult(name 
types.NamespacedName) (reconcile.Result, error, error, *configv1alpha1.TerraformLayer) { +func getResult(name types.NamespacedName) (reconcile.Result, *configv1alpha1.TerraformLayer, error, error) { result, reconcileError := reconciler.Reconcile(context.TODO(), reconcile.Request{ NamespacedName: name, }) layer := &configv1alpha1.TerraformLayer{} err := k8sClient.Get(context.TODO(), name, layer) - return result, reconcileError, err, layer + return result, layer, reconcileError, err } func getLinkedPods(cl client.Client, layer *configv1alpha1.TerraformLayer, action controller.Action, namespace string) (*corev1.PodList, error) { @@ -121,7 +122,7 @@ var _ = Describe("Layer", func() { Name: "nominal-case-1", Namespace: "default", } - result, reconcileError, err, layer = getResult(name) + result, layer, reconcileError, err = getResult(name) }) It("should still exists", func() { Expect(err).NotTo(HaveOccurred()) @@ -150,7 +151,7 @@ var _ = Describe("Layer", func() { Name: "nominal-case-2", Namespace: "default", } - result, reconcileError, err, layer = getResult(name) + result, layer, reconcileError, err = getResult(name) }) It("should still exists", func() { Expect(err).NotTo(HaveOccurred()) @@ -179,7 +180,7 @@ var _ = Describe("Layer", func() { Name: "nominal-case-3", Namespace: "default", } - result, reconcileError, err, layer = getResult(name) + result, layer, reconcileError, err = getResult(name) }) It("should still exists", func() { Expect(err).NotTo(HaveOccurred()) @@ -208,7 +209,7 @@ var _ = Describe("Layer", func() { Name: "nominal-case-4", Namespace: "default", } - result, reconcileError, err, layer = getResult(name) + result, layer, reconcileError, err = getResult(name) }) It("should still exists", func() { Expect(err).NotTo(HaveOccurred()) @@ -242,7 +243,7 @@ var _ = Describe("Layer", func() { Name: "nominal-case-5", Namespace: "default", } - result, reconcileError, err, layer = getResult(name) + result, layer, reconcileError, err = getResult(name) }) 
It("should still exists", func() { Expect(err).NotTo(HaveOccurred()) @@ -276,7 +277,7 @@ var _ = Describe("Layer", func() { Name: "nominal-case-6", Namespace: "default", } - result, reconcileError, err, layer = getResult(name) + result, layer, reconcileError, err = getResult(name) }) It("should still exists", func() { Expect(err).NotTo(HaveOccurred()) @@ -305,66 +306,108 @@ var _ = Describe("Layer", func() { }) }) }) - // Describe("Error Case", func() { - // Describe("When a TerraformLayer has errored once on plan", Ordered, func() { - // BeforeAll(func() { - // name = types.NamespacedName{ - // Name: "error-case-1", - // Namespace: "default", - // } - // result, reconcileError, err, layer = getResult(name) - // }) - // It("should still exists", func() { - // Expect(err).NotTo(HaveOccurred()) - // }) - // It("should not return an error", func() { - // Expect(reconcileError).NotTo(HaveOccurred()) - // }) - // It("should end in PlanNeeded state", func() { - // Expect(layer.Status.State).To(Equal("PlanNeeded")) - // }) - // It("should be locked", func() { - // Expect(lock.IsLocked(context.TODO(), k8sClient, layer)).To(BeTrue()) - // }) - // It("should set RequeueAfter to WaitAction", func() { - // Expect(result.RequeueAfter).To(Equal(reconciler.Config.Controller.Timers.WaitAction)) - // }) - // It("should have created a plan pod", func() { - // pods, err := getLinkedPods(k8sClient, layer, controller.PlanAction, name.Namespace) - // Expect(err).NotTo(HaveOccurred()) - // Expect(len(pods.Items)).To(Equal(1)) - // }) - // }) - // Describe("When a TerraformLayer has errored once on apply", Ordered, func() { - // BeforeAll(func() { - // name = types.NamespacedName{ - // Name: "error-case-2", - // Namespace: "default", - // } - // result, reconcileError, err, layer = getResult(name) - // }) - // It("should still exists", func() { - // Expect(err).NotTo(HaveOccurred()) - // }) - // It("should not return an error", func() { - // Expect(reconcileError).NotTo(HaveOccurred()) - 
// }) - // It("should end in ApplyNeeded state", func() { - // Expect(layer.Status.State).To(Equal("ApplyNeeded")) - // }) - // It("should be locked", func() { - // Expect(lock.IsLocked(context.TODO(), k8sClient, layer)).To(BeTrue()) - // }) - // It("should set RequeueAfter to WaitAction", func() { - // Expect(result.RequeueAfter).To(Equal(reconciler.Config.Controller.Timers.WaitAction)) - // }) - // It("should have created a plan pod", func() { - // pods, err := getLinkedPods(k8sClient, layer, controller.ApplyAction, name.Namespace) - // Expect(err).NotTo(HaveOccurred()) - // Expect(len(pods.Items)).To(Equal(1)) - // }) - // }) - // }) + Describe("Error Case", func() { + Describe("When a TerraformLayer has errored once on plan and still in grace period", Ordered, func() { + BeforeAll(func() { + name = types.NamespacedName{ + Name: "error-case-1", + Namespace: "default", + } + result, layer, reconcileError, err = getResult(name) + }) + It("should still exists", func() { + Expect(err).NotTo(HaveOccurred()) + }) + It("should not return an error", func() { + Expect(reconcileError).NotTo(HaveOccurred()) + }) + It("should end in FailureGracePeriod state", func() { + Expect(layer.Status.State).To(Equal("FailureGracePeriod")) + }) + It("should set RequeueAfter to WaitAction", func() { + Expect(result.RequeueAfter).To(Equal(reconciler.Config.Controller.Timers.WaitAction)) + }) + }) + Describe("When a TerraformLayer has errored once on apply and still in grace period", Ordered, func() { + BeforeAll(func() { + name = types.NamespacedName{ + Name: "error-case-2", + Namespace: "default", + } + result, layer, reconcileError, err = getResult(name) + }) + It("should still exists", func() { + Expect(err).NotTo(HaveOccurred()) + }) + It("should not return an error", func() { + Expect(reconcileError).NotTo(HaveOccurred()) + }) + It("should end in FailureGracePeriod state", func() { + Expect(layer.Status.State).To(Equal("FailureGracePeriod")) + }) + It("should set RequeueAfter to 
WaitAction", func() { + Expect(result.RequeueAfter).To(Equal(reconciler.Config.Controller.Timers.WaitAction)) + }) + }) + }) + Describe("When a TerraformLayer has errored once on plan and not in grace period anymore", Ordered, func() { + BeforeAll(func() { + name = types.NamespacedName{ + Name: "error-case-3", + Namespace: "default", + } + result, layer, reconcileError, err = getResult(name) + }) + It("should still exists", func() { + Expect(err).NotTo(HaveOccurred()) + }) + It("should not return an error", func() { + Expect(reconcileError).NotTo(HaveOccurred()) + }) + It("should end in PlanNeeded state", func() { + Expect(layer.Status.State).To(Equal("PlanNeeded")) + }) + It("should set RequeueAfter to WaitAction", func() { + Expect(result.RequeueAfter).To(Equal(reconciler.Config.Controller.Timers.WaitAction)) + }) + It("should be locked", func() { + Expect(lock.IsLocked(context.TODO(), k8sClient, layer)).To(BeTrue()) + }) + It("should have created a plan pod", func() { + pods, err := getLinkedPods(k8sClient, layer, controller.PlanAction, name.Namespace) + Expect(err).NotTo(HaveOccurred()) + Expect(len(pods.Items)).To(Equal(1)) + }) + }) + Describe("When a TerraformLayer has errored once on apply and not in grace period anymore", Ordered, func() { + BeforeAll(func() { + name = types.NamespacedName{ + Name: "error-case-4", + Namespace: "default", + } + result, layer, reconcileError, err = getResult(name) + }) + It("should still exists", func() { + Expect(err).NotTo(HaveOccurred()) + }) + It("should not return an error", func() { + Expect(reconcileError).NotTo(HaveOccurred()) + }) + It("should end in ApplyNeeded state", func() { + Expect(layer.Status.State).To(Equal("ApplyNeeded")) + }) + It("should set RequeueAfter to WaitAction", func() { + Expect(result.RequeueAfter).To(Equal(reconciler.Config.Controller.Timers.WaitAction)) + }) + It("should be locked", func() { + Expect(lock.IsLocked(context.TODO(), k8sClient, layer)).To(BeTrue()) + }) + It("should have created 
an apply pod", func() {
+			pods, err := getLinkedPods(k8sClient, layer, controller.ApplyAction, name.Namespace)
+			Expect(err).NotTo(HaveOccurred())
+			Expect(len(pods.Items)).To(Equal(1))
+		})
+	})
 })
 
 var _ = AfterSuite(func() {
@@ -372,3 +415,51 @@ var _ = AfterSuite(func() {
 	err := testEnv.Stop()
 	Expect(err).NotTo(HaveOccurred())
 })
+
+// layerWithFailureAnnotation builds a minimal TerraformLayer whose failure
+// annotation is set to failures; an empty string leaves the layer without
+// annotations.
+func layerWithFailureAnnotation(failures string) *configv1alpha1.TerraformLayer {
+	layer := &configv1alpha1.TerraformLayer{
+		Spec: configv1alpha1.TerraformLayerSpec{
+			TerraformConfig: configv1alpha1.TerraformConfig{
+				Version: "1.0.1",
+			},
+		},
+	}
+	if failures != "" {
+		layer.ObjectMeta = metav1.ObjectMeta{
+			Annotations: map[string]string{"runner.terraform.padok.cloud/failure": failures},
+		}
+	}
+	return layer
+}
+
+// TestGetLayerExponentialBackOffTime checks the back-off computed from the
+// failure annotation against a one-minute base period.
+func TestGetLayerExponentialBackOffTime(t *testing.T) {
+	tt := []struct {
+		name         string
+		defaultTime  time.Duration
+		layer        *configv1alpha1.TerraformLayer
+		expectedTime time.Duration
+	}{
+		{name: "Exponential backoff : No retry", defaultTime: time.Minute, layer: layerWithFailureAnnotation(""), expectedTime: time.Minute},
+		{name: "Exponential backoff : Success", defaultTime: time.Minute, layer: layerWithFailureAnnotation("0"), expectedTime: time.Minute},
+		{name: "Exponential backoff : 1 retry", defaultTime: time.Minute, layer: layerWithFailureAnnotation("1"), expectedTime: 2 * time.Minute},
+		{name: "Exponential backoff : 2 retry", defaultTime: time.Minute, layer: layerWithFailureAnnotation("2"), expectedTime: 7 * time.Minute},
+		{name: "Exponential backoff : 3 retry", defaultTime: time.Minute, layer: layerWithFailureAnnotation("3"), expectedTime: 20 * time.Minute},
+		{name: "Exponential backoff : 5 retry", defaultTime: time.Minute, layer: layerWithFailureAnnotation("5"), expectedTime: 148 * time.Minute},
+		{name: "Exponential backoff : 10 retry", defaultTime: time.Minute, layer: layerWithFailureAnnotation("10"), expectedTime: 22026 * time.Minute},
+		{name: "Exponential backoff : 17 retry", defaultTime: time.Minute, layer: layerWithFailureAnnotation("17"), expectedTime: 24154952 * time.Minute},
+	}
+
+	for _, tc := range tt {
+		t.Run(tc.name, func(t *testing.T) {
+			result := controller.GetLayerExponentialBackOffTime(tc.defaultTime, tc.layer)
+			if tc.expectedTime != result {
+				t.Errorf("unexpected backoff time: expected %s, got %s", tc.expectedTime, result)
+			}
+		})
+	}
+}
diff --git a/internal/controllers/terraformlayer/states.go b/internal/controllers/terraformlayer/states.go
index e94df8c6..6a15b085 100644
--- a/internal/controllers/terraformlayer/states.go
+++ b/internal/controllers/terraformlayer/states.go
@@ -23,9 +23,12 @@ func (r *Reconciler) GetState(ctx context.Context, layer 
*configv1alpha1.Terrafo
 	c1, isPlanArtifactUpToDate := r.IsPlanArtifactUpToDate(layer)
 	c2, isApplyUpToDate := r.IsApplyUpToDate(layer)
 	c3, isLastRelevantCommitPlanned := r.IsLastRelevantCommitPlanned(layer)
-	// c3, hasFailed := HasFailed(r)
-	conditions := []metav1.Condition{c1, c2, c3}
+	c4, isInFailureGracePeriod := r.IsInFailureGracePeriod(layer)
+	conditions := []metav1.Condition{c1, c2, c3, c4}
 	switch {
+	case isInFailureGracePeriod:
+		log.Infof("layer %s is in failure grace period", layer.Name)
+		return &FailureGracePeriod{}, conditions
 	case isPlanArtifactUpToDate && isApplyUpToDate && isLastRelevantCommitPlanned:
 		log.Infof("layer %s is up to date, waiting for a new drift detection cycle", layer.Name)
 		return &Idle{}, conditions
@@ -41,6 +44,33 @@ func (r *Reconciler) GetState(ctx context.Context, layer *configv1alpha1.Terrafo
 	}
 }
 
+// FailureGracePeriod is the state GetState selects while a layer is inside its
+// failure grace period.
+type FailureGracePeriod struct{}
+
+// getHandler returns the handler for this state. It launches no runner: it only
+// logs where the layer stands and schedules the next reconciliation.
+func (s *FailureGracePeriod) getHandler() Handler {
+	return func(ctx context.Context, r *Reconciler, layer *configv1alpha1.TerraformLayer, repository *configv1alpha1.TerraformRepository) ctrl.Result {
+		lastActionTime, err := GetLastActionTime(r, layer)
+		if err != nil {
+			log.Errorf("could not get lastActionTime on layer %s,: %s", layer.Name, err)
+			return ctrl.Result{RequeueAfter: r.Config.Controller.Timers.OnError}
+		}
+		expTime := GetLayerExponentialBackOffTime(r.Config.Controller.Timers.FailureGracePeriod, layer)
+		endIdleTime := lastActionTime.Add(expTime)
+		if endIdleTime.After(r.Clock.Now()) {
+			log.Infof("layer %v is still in its failure grace period (until %s)", layer.Name, endIdleTime)
+		} else {
+			log.Infof("the grace period is over for layer %v, new retry", layer.Name)
+		}
+		// Requeue after WaitAction in both cases. The time elapsed since expiry
+		// is not a meaningful delay: it is zero right at expiry, which would
+		// schedule no requeue at all.
+		return ctrl.Result{RequeueAfter: r.Config.Controller.Timers.WaitAction}
+	}
+}
+
 type Idle struct{}
 
 func (s *Idle) getHandler() Handler {
diff --git a/internal/controllers/terraformlayer/testdata/error-case.yaml b/internal/controllers/terraformlayer/testdata/error-case.yaml
index 856dbce3..03d95b95 100644
--- 
a/internal/controllers/terraformlayer/testdata/error-case.yaml +++ b/internal/controllers/terraformlayer/testdata/error-case.yaml @@ -51,3 +51,56 @@ spec: enabled: true version: 0.45.4 version: 1.3.1 +--- +apiVersion: config.terraform.padok.cloud/v1alpha1 +kind: TerraformLayer +metadata: + labels: + app.kubernetes.io/instance: in-cluster-burrito + name: error-case-3 + namespace: default + annotations: + runner.terraform.padok.cloud/failure: "1" + runner.terraform.padok.cloud/plan-commit: ca9b6c80ac8fb5cd837ae9b374b79ff33f472558 + runner.terraform.padok.cloud/plan-date: Sun May 7 11:21:53 UTC 2023 + runner.terraform.padok.cloud/plan-sum: "" +spec: + branch: main + path: error-case-three/ + remediationStrategy: autoApply + repository: + name: burrito + namespace: default + terraform: + terragrunt: + enabled: true + version: 0.45.4 + version: 1.3.1 +--- +apiVersion: config.terraform.padok.cloud/v1alpha1 +kind: TerraformLayer +metadata: + labels: + app.kubernetes.io/instance: in-cluster-burrito + name: error-case-4 + namespace: default + annotations: + runner.terraform.padok.cloud/failure: "1" + runner.terraform.padok.cloud/plan-commit: ca9b6c80ac8fb5cd837ae9b374b79ff33f472558 + runner.terraform.padok.cloud/plan-date: Sun May 8 11:15:53 UTC 2023 + runner.terraform.padok.cloud/plan-sum: AuP6pMNxWsbSZKnxZvxD842wy0qaF9JCX8HW1nFeL1I= + runner.terraform.padok.cloud/apply-commit: 840046e9db8c1348445d0018c86347967b066df0 + runner.terraform.padok.cloud/apply-date: Sun May 8 11:20:53 UTC 2023 + runner.terraform.padok.cloud/apply-sum: "" +spec: + branch: main + path: error-case-four/ + remediationStrategy: autoApply + repository: + name: burrito + namespace: default + terraform: + terragrunt: + enabled: true + version: 0.45.4 + version: 1.3.1 \ No newline at end of file diff --git a/internal/runner/runner.go b/internal/runner/runner.go index 066b4921..0b5b9a48 100644 --- a/internal/runner/runner.go +++ b/internal/runner/runner.go @@ -82,18 +82,17 @@ func (r *Runner) Exec() { 
switch r.config.Runner.Action { case "plan": sum, err = r.plan() + ann[annotations.LastPlanDate] = time.Now().Format(time.UnixDate) if err == nil { - ann[annotations.LastPlanDate] = time.Now().Format(time.UnixDate) ann[annotations.LastPlanCommit] = commit } - if sum != "" { - ann[annotations.LastPlanSum] = sum - } + ann[annotations.LastPlanSum] = sum case "apply": sum, err = r.apply() + ann[annotations.LastApplyDate] = time.Now().Format(time.UnixDate) + ann[annotations.LastApplySum] = sum if err == nil { ann[annotations.LastApplyCommit] = commit - ann[annotations.LastApplySum] = sum } default: err = errors.New("unrecognized runner action, If this is happening there might be a version mismatch between the controller and runner") @@ -107,6 +106,8 @@ func (r *Runner) Exec() { } number++ ann[annotations.Failure] = strconv.Itoa(number) + } else { + ann[annotations.Failure] = "0" } err = annotations.Add(context.TODO(), r.client, r.layer, ann) if err != nil {