From 16a61c64bf84548b8b4548d66af062b36cb45291 Mon Sep 17 00:00:00 2001
From: JuHyung Son
Date: Thu, 18 Jan 2024 18:02:59 +0900
Subject: [PATCH] handling backend ai call failures (#314)

* add analysis failure count

Signed-off-by: JuHyung-Son

* add backoff field in crd

Signed-off-by: JuHyung-Son

* add circuit breaker

Signed-off-by: JuHyung-Son

* add a circuit breaker variable, argument, remove modifying given spec

Signed-off-by: JuHyung-Son

---------

Signed-off-by: JuHyung-Son
---
 api/v1alpha1/k8sgpt_types.go                 | 12 ++++++--
 api/v1alpha1/k8sgpt_types_test.go            |  7 ++++-
 api/v1alpha1/zz_generated.deepcopy.go        | 20 ++++++++++++++
 chart/operator/templates/k8sgpt-crd.yaml     | 12 ++++++++
 config/crd/bases/core.k8sgpt.ai_k8sgpts.yaml | 12 ++++++++
 controllers/k8sgpt_controller.go             | 29 +++++++++++++++++++-
 pkg/client/analysis.go                       |  4 +--
 7 files changed, 90 insertions(+), 6 deletions(-)

diff --git a/api/v1alpha1/k8sgpt_types.go b/api/v1alpha1/k8sgpt_types.go
index 93352180..55e31ff3 100644
--- a/api/v1alpha1/k8sgpt_types.go
+++ b/api/v1alpha1/k8sgpt_types.go
@@ -73,11 +73,19 @@ type WebhookRef struct {
 	Secret *SecretRef `json:"secret,omitempty"`
 }
 
+type BackOff struct {
+	// +kubebuilder:default:=true
+	Enabled bool `json:"enabled"`
+	// +kubebuilder:default:=5
+	MaxRetries int `json:"maxRetries"`
+}
+
 type AISpec struct {
 	// +kubebuilder:default:=openai
 	// +kubebuilder:validation:Enum=openai;localai;azureopenai;amazonbedrock;cohere;amazonsagemaker
-	Backend string `json:"backend"`
-	BaseUrl string `json:"baseUrl,omitempty"`
+	Backend string   `json:"backend"`
+	BackOff *BackOff `json:"backOff,omitempty"`
+	BaseUrl string   `json:"baseUrl,omitempty"`
 	// +kubebuilder:default:=gpt-3.5-turbo
 	Model   string `json:"model,omitempty"`
 	Engine  string `json:"engine,omitempty"`
diff --git a/api/v1alpha1/k8sgpt_types_test.go b/api/v1alpha1/k8sgpt_types_test.go
index ab2c0684..5e11292d 100644
--- a/api/v1alpha1/k8sgpt_types_test.go
+++ b/api/v1alpha1/k8sgpt_types_test.go
@@ -33,7 +33,10 @@ var _ = Describe("The test cases for the K8sGPT CRDs", func() {
 			Name: "k8s-gpt-secret",
 			Key:  "k8s-gpt",
 		}
-
+		backOff = BackOff{
+			Enabled:    true,
+			MaxRetries: 5,
+		}
 		kind    = "K8sGPT"
 		baseUrl = "https://api.k8s-gpt.localhost"
 		model   = "345M"
@@ -54,6 +57,7 @@ var _ = Describe("The test cases for the K8sGPT CRDs", func() {
 			Spec: K8sGPTSpec{
 				AI: &AISpec{
 					Backend: OpenAI,
+					BackOff: &backOff,
 					BaseUrl: baseUrl,
 					Model:   model,
 					Enabled: true,
@@ -81,6 +85,7 @@ var _ = Describe("The test cases for the K8sGPT CRDs", func() {
 			Spec: K8sGPTSpec{
 				AI: &AISpec{
 					Backend: OpenAI,
+					BackOff: &backOff,
 					BaseUrl: baseUrl,
 					Model:   model,
 					Secret:  &secretRef,
diff --git a/api/v1alpha1/zz_generated.deepcopy.go b/api/v1alpha1/zz_generated.deepcopy.go
index 1ad08c5f..1cbff8c1 100644
--- a/api/v1alpha1/zz_generated.deepcopy.go
+++ b/api/v1alpha1/zz_generated.deepcopy.go
@@ -28,6 +28,11 @@ import (
 // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
 func (in *AISpec) DeepCopyInto(out *AISpec) {
 	*out = *in
+	if in.BackOff != nil {
+		in, out := &in.BackOff, &out.BackOff
+		*out = new(BackOff)
+		**out = **in
+	}
 	if in.Secret != nil {
 		in, out := &in.Secret, &out.Secret
 		*out = new(SecretRef)
@@ -60,6 +65,21 @@ func (in *AzureBackend) DeepCopy() *AzureBackend {
 	return out
 }
 
+// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
+func (in *BackOff) DeepCopyInto(out *BackOff) {
+	*out = *in
+}
+
+// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new BackOff.
+func (in *BackOff) DeepCopy() *BackOff {
+	if in == nil {
+		return nil
+	}
+	out := new(BackOff)
+	in.DeepCopyInto(out)
+	return out
+}
+
 // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
 func (in *Backstage) DeepCopyInto(out *Backstage) {
 	*out = *in
diff --git a/chart/operator/templates/k8sgpt-crd.yaml b/chart/operator/templates/k8sgpt-crd.yaml
index 35d8d47a..85afa3d2 100644
--- a/chart/operator/templates/k8sgpt-crd.yaml
+++ b/chart/operator/templates/k8sgpt-crd.yaml
@@ -40,6 +40,18 @@ spec:
               anonymized:
                 default: true
                 type: boolean
+              backOff:
+                properties:
+                  enabled:
+                    default: true
+                    type: boolean
+                  maxRetries:
+                    default: 5
+                    type: integer
+                required:
+                - enabled
+                - maxRetries
+                type: object
               backend:
                 default: openai
                 enum:
diff --git a/config/crd/bases/core.k8sgpt.ai_k8sgpts.yaml b/config/crd/bases/core.k8sgpt.ai_k8sgpts.yaml
index 16407d4c..5793b0cb 100644
--- a/config/crd/bases/core.k8sgpt.ai_k8sgpts.yaml
+++ b/config/crd/bases/core.k8sgpt.ai_k8sgpts.yaml
@@ -40,6 +40,18 @@ spec:
               anonymized:
                 default: true
                 type: boolean
+              backOff:
+                properties:
+                  enabled:
+                    default: true
+                    type: boolean
+                  maxRetries:
+                    default: 5
+                    type: integer
+                required:
+                - enabled
+                - maxRetries
+                type: object
               backend:
                 default: openai
                 enum:
diff --git a/controllers/k8sgpt_controller.go b/controllers/k8sgpt_controller.go
index 83b2dafe..66e2e41c 100644
--- a/controllers/k8sgpt_controller.go
+++ b/controllers/k8sgpt_controller.go
@@ -72,6 +72,10 @@ var (
 		Name: "k8sgpt_number_of_failed_backend_ai_calls",
 		Help: "The total number of failed backend AI calls",
 	}, []string{"backend", "deployment", "namespace"})
+	// analysisRetryCount tracks the number of failed analysis attempts
+	analysisRetryCount int
+	// allowBackendAIRequest is a circuit breaker that switches backend AI calls on or off
+	allowBackendAIRequest = true
 )
 
 // K8sGPTReconciler reconciles a K8sGPT object
@@ -133,6 +137,17 @@ func (r *K8sGPTReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctr
 		return r.finishReconcile(nil, false)
 	}
 
+	if k8sgptConfig.Spec.AI.BackOff == nil {
+		k8sgptConfig.Spec.AI.BackOff = &corev1alpha1.BackOff{
+			Enabled:    true,
+			MaxRetries: 5,
+		}
+		if err := r.Update(ctx, k8sgptConfig); err != nil {
+			k8sgptReconcileErrorCount.Inc()
+			return r.finishReconcile(err, false)
+		}
+	}
+
 	// Check and see if the instance is new or has a K8sGPT deployment in flight
 	deployment := v1.Deployment{}
 	err = r.Get(ctx, client.ObjectKey{Namespace: k8sgptConfig.Namespace,
@@ -203,17 +218,29 @@ func (r *K8sGPTReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctr
 		}
 	}
 
-	response, err := k8sgptClient.ProcessAnalysis(deployment, k8sgptConfig)
+	response, err := k8sgptClient.ProcessAnalysis(deployment, k8sgptConfig, allowBackendAIRequest)
 	if err != nil {
 		if k8sgptConfig.Spec.AI.Enabled {
 			k8sgptNumberOfFailedBackendAICalls.With(prometheus.Labels{
 				"backend":    k8sgptConfig.Spec.AI.Backend,
 				"deployment": deployment.Name,
 				"namespace":  deployment.Namespace}).Inc()
+
+			if k8sgptConfig.Spec.AI.BackOff.Enabled {
+				if analysisRetryCount > k8sgptConfig.Spec.AI.BackOff.MaxRetries {
+					allowBackendAIRequest = false
+					fmt.Printf("Disabled AI backend %s due to failures exceeding max retries\n", k8sgptConfig.Spec.AI.Backend)
+					analysisRetryCount = 0
+				}
+				analysisRetryCount++
+			}
 		}
 		k8sgptReconcileErrorCount.Inc()
 		return r.finishReconcile(err, false)
 	}
+	// Reset analysisRetryCount
+	analysisRetryCount = 0
+
 	// Update metrics count
 	if k8sgptConfig.Spec.AI.Enabled && len(response.Results) > 0 {
 		k8sgptNumberOfBackendAICalls.With(prometheus.Labels{
diff --git a/pkg/client/analysis.go b/pkg/client/analysis.go
index 1fbafae6..4a98410b 100644
--- a/pkg/client/analysis.go
+++ b/pkg/client/analysis.go
@@ -12,11 +12,11 @@ import (
 	v1 "k8s.io/api/apps/v1"
 )
 
-func (c *Client) ProcessAnalysis(deployment v1.Deployment, config *v1alpha1.K8sGPT) (*common.K8sGPTReponse, error) {
+func (c *Client) ProcessAnalysis(deployment v1.Deployment, config *v1alpha1.K8sGPT, allowAIRequest bool) (*common.K8sGPTReponse, error) {
 	client := rpc.NewServerServiceClient(c.conn)
 
 	req := &schemav1.AnalyzeRequest{
-		Explain: config.Spec.AI.Enabled,
+		Explain: config.Spec.AI.Enabled && allowAIRequest,
 		Nocache: config.Spec.NoCache,
 		Backend: config.Spec.AI.Backend,
 		Filters: config.Spec.Filters,
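
Example (illustrative only, not part of the patch): a minimal K8sGPT custom resource sketch showing how the backOff field added above could be configured. It assumes the AI settings are exposed under spec.ai of core.k8sgpt.ai/v1alpha1, which is consistent with the AISpec type and CRD files touched here but not shown verbatim in this diff; the metadata values are placeholders, and other AISpec fields (enabled, baseUrl, secret) are omitted.

apiVersion: core.k8sgpt.ai/v1alpha1
kind: K8sGPT
metadata:
  name: k8sgpt-sample        # placeholder name
  namespace: k8sgpt          # placeholder namespace
spec:
  ai:
    backend: openai          # must be one of the values in the CRD enum
    model: gpt-3.5-turbo     # CRD default, spelled out for clarity
    backOff:
      enabled: true          # CRD default; turns the circuit breaker on
      maxRetries: 5          # failed analyses tolerated before backend AI calls are switched off

With backOff.enabled set to true, the reconciler counts failed ProcessAnalysis calls and, once the count exceeds maxRetries, sets allowBackendAIRequest to false so later AnalyzeRequests are sent with Explain disabled; a successful analysis resets the counter.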