Skip to content

Commit

Permalink
handling backend ai call failures (#314)
Browse files Browse the repository at this point in the history
* add analysis failure count

Signed-off-by: JuHyung-Son <sonju0427@gmail.com>

* add backoff field in crd

Signed-off-by: JuHyung-Son <sonju0427@gmail.com>

* add circuit breaker

Signed-off-by: JuHyung-Son <sonju0427@gmail.com>

* add a circuit breaker variable, argument, remove modifying given spec

Signed-off-by: JuHyung-Son <sonju0427@gmail.com>

---------

Signed-off-by: JuHyung-Son <sonju0427@gmail.com>
  • Loading branch information
JuHyung-Son committed Jan 18, 2024
1 parent efae4a9 commit 16a61c6
Show file tree
Hide file tree
Showing 7 changed files with 90 additions and 6 deletions.
12 changes: 10 additions & 2 deletions api/v1alpha1/k8sgpt_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -73,11 +73,19 @@ type WebhookRef struct {
Secret *SecretRef `json:"secret,omitempty"`
}

// BackOff configures how the operator reacts to repeated failures of
// backend AI calls: the controller counts consecutive analysis failures
// and, once they exceed MaxRetries, stops issuing further AI requests
// (a circuit breaker) — see the handling in the K8sGPT reconciler.
type BackOff struct {
	// Enabled toggles the back-off / circuit-breaker behaviour.
	// +kubebuilder:default:=true
	Enabled bool `json:"enabled"`
	// MaxRetries is the number of consecutive analysis failures tolerated
	// before backend AI requests are disabled.
	// +kubebuilder:default:=5
	MaxRetries int `json:"maxRetries"`
}

type AISpec struct {
// +kubebuilder:default:=openai
// +kubebuilder:validation:Enum=openai;localai;azureopenai;amazonbedrock;cohere;amazonsagemaker
Backend string `json:"backend"`
BaseUrl string `json:"baseUrl,omitempty"`
Backend string `json:"backend"`
BackOff *BackOff `json:"backOff,omitempty"`
BaseUrl string `json:"baseUrl,omitempty"`
// +kubebuilder:default:=gpt-3.5-turbo
Model string `json:"model,omitempty"`
Engine string `json:"engine,omitempty"`
Expand Down
7 changes: 6 additions & 1 deletion api/v1alpha1/k8sgpt_types_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,10 @@ var _ = Describe("The test cases for the K8sGPT CRDs", func() {
Name: "k8s-gpt-secret",
Key: "k8s-gpt",
}

backOff = BackOff{
Enabled: true,
MaxRetries: 5,
}
kind = "K8sGPT"
baseUrl = "https://api.k8s-gpt.localhost"
model = "345M"
Expand All @@ -54,6 +57,7 @@ var _ = Describe("The test cases for the K8sGPT CRDs", func() {
Spec: K8sGPTSpec{
AI: &AISpec{
Backend: OpenAI,
BackOff: &backOff,
BaseUrl: baseUrl,
Model: model,
Enabled: true,
Expand Down Expand Up @@ -81,6 +85,7 @@ var _ = Describe("The test cases for the K8sGPT CRDs", func() {
Spec: K8sGPTSpec{
AI: &AISpec{
Backend: OpenAI,
BackOff: &backOff,
BaseUrl: baseUrl,
Model: model,
Secret: &secretRef,
Expand Down
20 changes: 20 additions & 0 deletions api/v1alpha1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

12 changes: 12 additions & 0 deletions chart/operator/templates/k8sgpt-crd.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,18 @@ spec:
anonymized:
default: true
type: boolean
backOff:
properties:
enabled:
default: true
type: boolean
maxRetries:
default: 5
type: integer
required:
- enabled
- maxRetries
type: object
backend:
default: openai
enum:
Expand Down
12 changes: 12 additions & 0 deletions config/crd/bases/core.k8sgpt.ai_k8sgpts.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,18 @@ spec:
anonymized:
default: true
type: boolean
backOff:
properties:
enabled:
default: true
type: boolean
maxRetries:
default: 5
type: integer
required:
- enabled
- maxRetries
type: object
backend:
default: openai
enum:
Expand Down
29 changes: 28 additions & 1 deletion controllers/k8sgpt_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,10 @@ var (
Name: "k8sgpt_number_of_failed_backend_ai_calls",
Help: "The total number of failed backend AI calls",
}, []string{"backend", "deployment", "namespace"})
// analysisRetryCount counts consecutive analysis (backend AI call) failures; it is reset on success
analysisRetryCount int
// allowBackendAIRequest is a circuit breaker that switches backend AI calls on or off
allowBackendAIRequest = true
)

// K8sGPTReconciler reconciles a K8sGPT object
Expand Down Expand Up @@ -133,6 +137,17 @@ func (r *K8sGPTReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctr
return r.finishReconcile(nil, false)
}

if k8sgptConfig.Spec.AI.BackOff == nil {
k8sgptConfig.Spec.AI.BackOff = &corev1alpha1.BackOff{
Enabled: true,
MaxRetries: 5,
}
if err := r.Update(ctx, k8sgptConfig); err != nil {
k8sgptReconcileErrorCount.Inc()
return r.finishReconcile(err, false)
}
}

// Check and see if the instance is new or has a K8sGPT deployment in flight
deployment := v1.Deployment{}
err = r.Get(ctx, client.ObjectKey{Namespace: k8sgptConfig.Namespace,
Expand Down Expand Up @@ -203,17 +218,29 @@ func (r *K8sGPTReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctr
}
}

response, err := k8sgptClient.ProcessAnalysis(deployment, k8sgptConfig)
response, err := k8sgptClient.ProcessAnalysis(deployment, k8sgptConfig, allowBackendAIRequest)
if err != nil {
if k8sgptConfig.Spec.AI.Enabled {
k8sgptNumberOfFailedBackendAICalls.With(prometheus.Labels{
"backend": k8sgptConfig.Spec.AI.Backend,
"deployment": deployment.Name,
"namespace": deployment.Namespace}).Inc()

if k8sgptConfig.Spec.AI.BackOff.Enabled {
if analysisRetryCount > k8sgptConfig.Spec.AI.BackOff.MaxRetries {
allowBackendAIRequest = false
fmt.Printf("Disabled AI backend %s due to failures exceeding max retries\n", k8sgptConfig.Spec.AI.Backend)
analysisRetryCount = 0
}
analysisRetryCount++
}
}
k8sgptReconcileErrorCount.Inc()
return r.finishReconcile(err, false)
}
// Reset analysisRetryCount
analysisRetryCount = 0

// Update metrics count
if k8sgptConfig.Spec.AI.Enabled && len(response.Results) > 0 {
k8sgptNumberOfBackendAICalls.With(prometheus.Labels{
Expand Down
4 changes: 2 additions & 2 deletions pkg/client/analysis.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,11 @@ import (
v1 "k8s.io/api/apps/v1"
)

func (c *Client) ProcessAnalysis(deployment v1.Deployment, config *v1alpha1.K8sGPT) (*common.K8sGPTReponse, error) {
func (c *Client) ProcessAnalysis(deployment v1.Deployment, config *v1alpha1.K8sGPT, allowAIRequest bool) (*common.K8sGPTReponse, error) {

client := rpc.NewServerServiceClient(c.conn)
req := &schemav1.AnalyzeRequest{
Explain: config.Spec.AI.Enabled,
Explain: config.Spec.AI.Enabled && allowAIRequest,
Nocache: config.Spec.NoCache,
Backend: config.Spec.AI.Backend,
Filters: config.Spec.Filters,
Expand Down

0 comments on commit 16a61c6

Please sign in to comment.