From 16a61c64bf84548b8b4548d66af062b36cb45291 Mon Sep 17 00:00:00 2001
From: JuHyung Son
Date: Thu, 18 Jan 2024 18:02:59 +0900
Subject: [PATCH] handling backend ai call failures (#314)

* add analysis failure count

Signed-off-by: JuHyung-Son

* add backoff field in crd

Signed-off-by: JuHyung-Son

* add circuit breaker

Signed-off-by: JuHyung-Son

* add a circuit breaker variable, argument, remove modifying given spec

Signed-off-by: JuHyung-Son

---------

Signed-off-by: JuHyung-Son
---
 api/v1alpha1/k8sgpt_types.go                 | 12 ++++++--
 api/v1alpha1/k8sgpt_types_test.go            |  7 ++++-
 api/v1alpha1/zz_generated.deepcopy.go        | 20 ++++++++++++++
 chart/operator/templates/k8sgpt-crd.yaml     | 12 ++++++++
 config/crd/bases/core.k8sgpt.ai_k8sgpts.yaml | 12 ++++++++
 controllers/k8sgpt_controller.go             | 29 +++++++++++++++++++-
 pkg/client/analysis.go                       |  4 +--
 7 files changed, 90 insertions(+), 6 deletions(-)

diff --git a/api/v1alpha1/k8sgpt_types.go b/api/v1alpha1/k8sgpt_types.go
index 93352180..55e31ff3 100644
--- a/api/v1alpha1/k8sgpt_types.go
+++ b/api/v1alpha1/k8sgpt_types.go
@@ -73,11 +73,19 @@ type WebhookRef struct {
 	Secret *SecretRef `json:"secret,omitempty"`
 }
 
+type BackOff struct {
+	// +kubebuilder:default:=true
+	Enabled bool `json:"enabled"`
+	// +kubebuilder:default:=5
+	MaxRetries int `json:"maxRetries"`
+}
+
 type AISpec struct {
 	// +kubebuilder:default:=openai
 	// +kubebuilder:validation:Enum=openai;localai;azureopenai;amazonbedrock;cohere;amazonsagemaker
-	Backend string `json:"backend"`
-	BaseUrl string `json:"baseUrl,omitempty"`
+	Backend string   `json:"backend"`
+	BackOff *BackOff `json:"backOff,omitempty"`
+	BaseUrl string   `json:"baseUrl,omitempty"`
 	// +kubebuilder:default:=gpt-3.5-turbo
 	Model   string `json:"model,omitempty"`
 	Engine  string `json:"engine,omitempty"`
diff --git a/api/v1alpha1/k8sgpt_types_test.go b/api/v1alpha1/k8sgpt_types_test.go
index ab2c0684..5e11292d 100644
--- a/api/v1alpha1/k8sgpt_types_test.go
+++ b/api/v1alpha1/k8sgpt_types_test.go
@@ -33,7 +33,10 @@ var _ = Describe("The test cases for the K8sGPT CRDs", func() {
 			Name: "k8s-gpt-secret",
 			Key:  "k8s-gpt",
 		}
-
+		backOff = BackOff{
+			Enabled:    true,
+			MaxRetries: 5,
+		}
 		kind    = "K8sGPT"
 		baseUrl = "https://api.k8s-gpt.localhost"
 		model   = "345M"
@@ -54,6 +57,7 @@ var _ = Describe("The test cases for the K8sGPT CRDs", func() {
 			Spec: K8sGPTSpec{
 				AI: &AISpec{
 					Backend: OpenAI,
+					BackOff: &backOff,
 					BaseUrl: baseUrl,
 					Model:   model,
 					Enabled: true,
@@ -81,6 +85,7 @@ var _ = Describe("The test cases for the K8sGPT CRDs", func() {
 			Spec: K8sGPTSpec{
 				AI: &AISpec{
 					Backend: OpenAI,
+					BackOff: &backOff,
 					BaseUrl: baseUrl,
 					Model:   model,
 					Secret:  &secretRef,
diff --git a/api/v1alpha1/zz_generated.deepcopy.go b/api/v1alpha1/zz_generated.deepcopy.go
index 1ad08c5f..1cbff8c1 100644
--- a/api/v1alpha1/zz_generated.deepcopy.go
+++ b/api/v1alpha1/zz_generated.deepcopy.go
@@ -28,6 +28,11 @@ import (
 // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
 func (in *AISpec) DeepCopyInto(out *AISpec) {
 	*out = *in
+	if in.BackOff != nil {
+		in, out := &in.BackOff, &out.BackOff
+		*out = new(BackOff)
+		**out = **in
+	}
 	if in.Secret != nil {
 		in, out := &in.Secret, &out.Secret
 		*out = new(SecretRef)
@@ -60,6 +65,21 @@ func (in *AzureBackend) DeepCopy() *AzureBackend {
 	return out
 }
 
+// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
+func (in *BackOff) DeepCopyInto(out *BackOff) {
+	*out = *in
+}
+
+// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new BackOff.
+func (in *BackOff) DeepCopy() *BackOff {
+	if in == nil {
+		return nil
+	}
+	out := new(BackOff)
+	in.DeepCopyInto(out)
+	return out
+}
+
 // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
 func (in *Backstage) DeepCopyInto(out *Backstage) {
 	*out = *in
diff --git a/chart/operator/templates/k8sgpt-crd.yaml b/chart/operator/templates/k8sgpt-crd.yaml
index 35d8d47a..85afa3d2 100644
--- a/chart/operator/templates/k8sgpt-crd.yaml
+++ b/chart/operator/templates/k8sgpt-crd.yaml
@@ -40,6 +40,18 @@ spec:
               anonymized:
                 default: true
                 type: boolean
+              backOff:
+                properties:
+                  enabled:
+                    default: true
+                    type: boolean
+                  maxRetries:
+                    default: 5
+                    type: integer
+                required:
+                - enabled
+                - maxRetries
+                type: object
               backend:
                 default: openai
                 enum:
diff --git a/config/crd/bases/core.k8sgpt.ai_k8sgpts.yaml b/config/crd/bases/core.k8sgpt.ai_k8sgpts.yaml
index 16407d4c..5793b0cb 100644
--- a/config/crd/bases/core.k8sgpt.ai_k8sgpts.yaml
+++ b/config/crd/bases/core.k8sgpt.ai_k8sgpts.yaml
@@ -40,6 +40,18 @@ spec:
               anonymized:
                 default: true
                 type: boolean
+              backOff:
+                properties:
+                  enabled:
+                    default: true
+                    type: boolean
+                  maxRetries:
+                    default: 5
+                    type: integer
+                required:
+                - enabled
+                - maxRetries
+                type: object
               backend:
                 default: openai
                 enum:
diff --git a/controllers/k8sgpt_controller.go b/controllers/k8sgpt_controller.go
index 83b2dafe..66e2e41c 100644
--- a/controllers/k8sgpt_controller.go
+++ b/controllers/k8sgpt_controller.go
@@ -72,6 +72,10 @@ var (
 		Name: "k8sgpt_number_of_failed_backend_ai_calls",
 		Help: "The total number of failed backend AI calls",
 	}, []string{"backend", "deployment", "namespace"})
+	// analysisRetryCount tracks the number of failed analysis attempts
+	analysisRetryCount int
+	// allowBackendAIRequest is a circuit breaker that switches backend AI calls on or off
+	allowBackendAIRequest = true
 )
 
 // K8sGPTReconciler reconciles a K8sGPT object
@@ -133,6 +137,17 @@ func (r *K8sGPTReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctr
 		return r.finishReconcile(nil, false)
 	}
 
+	if k8sgptConfig.Spec.AI.BackOff == nil {
+		k8sgptConfig.Spec.AI.BackOff = &corev1alpha1.BackOff{
+			Enabled:    true,
+			MaxRetries: 5,
+		}
+		if err := r.Update(ctx, k8sgptConfig); err != nil {
+			k8sgptReconcileErrorCount.Inc()
+			return r.finishReconcile(err, false)
+		}
+	}
+
 	// Check and see if the instance is new or has a K8sGPT deployment in flight
 	deployment := v1.Deployment{}
 	err = r.Get(ctx, client.ObjectKey{Namespace: k8sgptConfig.Namespace,
@@ -203,17 +218,29 @@ func (r *K8sGPTReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctr
 		}
 	}
 
-	response, err := k8sgptClient.ProcessAnalysis(deployment, k8sgptConfig)
+	response, err := k8sgptClient.ProcessAnalysis(deployment, k8sgptConfig, allowBackendAIRequest)
 	if err != nil {
 		if k8sgptConfig.Spec.AI.Enabled {
 			k8sgptNumberOfFailedBackendAICalls.With(prometheus.Labels{
 				"backend":    k8sgptConfig.Spec.AI.Backend,
 				"deployment": deployment.Name,
 				"namespace":  deployment.Namespace}).Inc()
+
+			if k8sgptConfig.Spec.AI.BackOff.Enabled {
+				if analysisRetryCount > k8sgptConfig.Spec.AI.BackOff.MaxRetries {
+					allowBackendAIRequest = false
+					fmt.Printf("Disabled AI backend %s due to failures exceeding max retries\n", k8sgptConfig.Spec.AI.Backend)
+					analysisRetryCount = 0
+				}
+				analysisRetryCount++
+			}
 		}
 		k8sgptReconcileErrorCount.Inc()
 		return r.finishReconcile(err, false)
 	}
+	// Reset analysisRetryCount
+	analysisRetryCount = 0
+
 	// Update metrics count
 	if k8sgptConfig.Spec.AI.Enabled && len(response.Results) > 0 {
 		k8sgptNumberOfBackendAICalls.With(prometheus.Labels{
diff --git a/pkg/client/analysis.go b/pkg/client/analysis.go
index 1fbafae6..4a98410b 100644
--- a/pkg/client/analysis.go
+++ b/pkg/client/analysis.go
@@ -12,11 +12,11 @@ import (
 	v1 "k8s.io/api/apps/v1"
 )
 
-func (c *Client) ProcessAnalysis(deployment v1.Deployment, config *v1alpha1.K8sGPT) (*common.K8sGPTReponse, error) {
+func (c *Client) ProcessAnalysis(deployment v1.Deployment, config *v1alpha1.K8sGPT, allowAIRequest bool) (*common.K8sGPTReponse, error) {
 	client := rpc.NewServerServiceClient(c.conn)
 
 	req := &schemav1.AnalyzeRequest{
-		Explain: config.Spec.AI.Enabled,
+		Explain: config.Spec.AI.Enabled && allowAIRequest,
 		Nocache: config.Spec.NoCache,
 		Backend: config.Spec.AI.Backend,
 		Filters: config.Spec.Filters,
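
Example (illustrative only, not part of the patch): a minimal K8sGPT custom resource sketch showing how the backOff field added above could be configured. It assumes the AI settings are exposed under spec.ai of core.k8sgpt.ai/v1alpha1, which is consistent with the AISpec type and CRD files touched here but not shown verbatim in this diff; the metadata values are placeholders, and other AISpec fields (enabled, baseUrl, secret) are omitted.

apiVersion: core.k8sgpt.ai/v1alpha1
kind: K8sGPT
metadata:
  name: k8sgpt-sample        # placeholder name
  namespace: k8sgpt          # placeholder namespace
spec:
  ai:
    backend: openai          # must be one of the values in the CRD enum
    model: gpt-3.5-turbo     # CRD default, spelled out for clarity
    backOff:
      enabled: true          # CRD default; turns the circuit breaker on
      maxRetries: 5          # failed analyses tolerated before backend AI calls are switched off

With backOff.enabled set to true, the reconciler counts failed ProcessAnalysis calls and, once the count exceeds maxRetries, sets allowBackendAIRequest to false so later AnalyzeRequests are sent with Explain disabled; a successful analysis resets the counter.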