Skip to content

Commit

Permalink
handling backend ai call failures (#314)
Browse files Browse the repository at this point in the history
* add analysis failure count

Signed-off-by: JuHyung-Son <sonju0427@gmail.com>

* add backoff field in crd

Signed-off-by: JuHyung-Son <sonju0427@gmail.com>

* add circuit breaker

Signed-off-by: JuHyung-Son <sonju0427@gmail.com>

* add a circuit breaker variable, argument, remove modifying given spec

Signed-off-by: JuHyung-Son <sonju0427@gmail.com>

---------

Signed-off-by: JuHyung-Son <sonju0427@gmail.com>
  • Loading branch information
JuHyung-Son committed Jan 18, 2024
1 parent efae4a9 commit 16a61c6
Show file tree
Hide file tree
Showing 7 changed files with 90 additions and 6 deletions.
12 changes: 10 additions & 2 deletions api/v1alpha1/k8sgpt_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -73,11 +73,19 @@ type WebhookRef struct {
Secret *SecretRef `json:"secret,omitempty"`
}

// BackOff configures how the operator reacts to repeated failures of
// backend AI calls: the controller counts consecutive analysis failures
// and, once they exceed MaxRetries, stops issuing further AI requests
// (a circuit breaker) — see the handling in the K8sGPT reconciler.
type BackOff struct {
	// Enabled toggles the back-off / circuit-breaker behaviour.
	// +kubebuilder:default:=true
	Enabled bool `json:"enabled"`
	// MaxRetries is the number of consecutive analysis failures tolerated
	// before backend AI requests are disabled.
	// +kubebuilder:default:=5
	MaxRetries int `json:"maxRetries"`
}

type AISpec struct {
// +kubebuilder:default:=openai
// +kubebuilder:validation:Enum=openai;localai;azureopenai;amazonbedrock;cohere;amazonsagemaker
Backend string `json:"backend"`
BaseUrl string `json:"baseUrl,omitempty"`
Backend string `json:"backend"`
BackOff *BackOff `json:"backOff,omitempty"`
BaseUrl string `json:"baseUrl,omitempty"`
// +kubebuilder:default:=gpt-3.5-turbo
Model string `json:"model,omitempty"`
Engine string `json:"engine,omitempty"`
Expand Down
7 changes: 6 additions & 1 deletion api/v1alpha1/k8sgpt_types_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,10 @@ var _ = Describe("The test cases for the K8sGPT CRDs", func() {
Name: "k8s-gpt-secret",
Key: "k8s-gpt",
}

backOff = BackOff{
Enabled: true,
MaxRetries: 5,
}
kind = "K8sGPT"
baseUrl = "https://api.k8s-gpt.localhost"
model = "345M"
Expand All @@ -54,6 +57,7 @@ var _ = Describe("The test cases for the K8sGPT CRDs", func() {
Spec: K8sGPTSpec{
AI: &AISpec{
Backend: OpenAI,
BackOff: &backOff,
BaseUrl: baseUrl,
Model: model,
Enabled: true,
Expand Down Expand Up @@ -81,6 +85,7 @@ var _ = Describe("The test cases for the K8sGPT CRDs", func() {
Spec: K8sGPTSpec{
AI: &AISpec{
Backend: OpenAI,
BackOff: &backOff,
BaseUrl: baseUrl,
Model: model,
Secret: &secretRef,
Expand Down
20 changes: 20 additions & 0 deletions api/v1alpha1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

12 changes: 12 additions & 0 deletions chart/operator/templates/k8sgpt-crd.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,18 @@ spec:
anonymized:
default: true
type: boolean
backOff:
properties:
enabled:
default: true
type: boolean
maxRetries:
default: 5
type: integer
required:
- enabled
- maxRetries
type: object
backend:
default: openai
enum:
Expand Down
12 changes: 12 additions & 0 deletions config/crd/bases/core.k8sgpt.ai_k8sgpts.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,18 @@ spec:
anonymized:
default: true
type: boolean
backOff:
properties:
enabled:
default: true
type: boolean
maxRetries:
default: 5
type: integer
required:
- enabled
- maxRetries
type: object
backend:
default: openai
enum:
Expand Down
29 changes: 28 additions & 1 deletion controllers/k8sgpt_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,10 @@ var (
Name: "k8sgpt_number_of_failed_backend_ai_calls",
Help: "The total number of failed backend AI calls",
}, []string{"backend", "deployment", "namespace"})
// analysisRetryCount counts consecutive analysis (backend AI call) failures; it is reset on success
analysisRetryCount int
// allowBackendAIRequest is a circuit breaker that switches backend AI calls on or off
allowBackendAIRequest = true
)

// K8sGPTReconciler reconciles a K8sGPT object
Expand Down Expand Up @@ -133,6 +137,17 @@ func (r *K8sGPTReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctr
return r.finishReconcile(nil, false)
}

if k8sgptConfig.Spec.AI.BackOff == nil {
k8sgptConfig.Spec.AI.BackOff = &corev1alpha1.BackOff{
Enabled: true,
MaxRetries: 5,
}
if err := r.Update(ctx, k8sgptConfig); err != nil {
k8sgptReconcileErrorCount.Inc()
return r.finishReconcile(err, false)
}
}

// Check and see if the instance is new or has a K8sGPT deployment in flight
deployment := v1.Deployment{}
err = r.Get(ctx, client.ObjectKey{Namespace: k8sgptConfig.Namespace,
Expand Down Expand Up @@ -203,17 +218,29 @@ func (r *K8sGPTReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctr
}
}

response, err := k8sgptClient.ProcessAnalysis(deployment, k8sgptConfig)
response, err := k8sgptClient.ProcessAnalysis(deployment, k8sgptConfig, allowBackendAIRequest)
if err != nil {
if k8sgptConfig.Spec.AI.Enabled {
k8sgptNumberOfFailedBackendAICalls.With(prometheus.Labels{
"backend": k8sgptConfig.Spec.AI.Backend,
"deployment": deployment.Name,
"namespace": deployment.Namespace}).Inc()

if k8sgptConfig.Spec.AI.BackOff.Enabled {
if analysisRetryCount > k8sgptConfig.Spec.AI.BackOff.MaxRetries {
allowBackendAIRequest = false
fmt.Printf("Disabled AI backend %s due to failures exceeding max retries\n", k8sgptConfig.Spec.AI.Backend)
analysisRetryCount = 0
}
analysisRetryCount++
}
}
k8sgptReconcileErrorCount.Inc()
return r.finishReconcile(err, false)
}
// Reset analysisRetryCount
analysisRetryCount = 0

// Update metrics count
if k8sgptConfig.Spec.AI.Enabled && len(response.Results) > 0 {
k8sgptNumberOfBackendAICalls.With(prometheus.Labels{
Expand Down
4 changes: 2 additions & 2 deletions pkg/client/analysis.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,11 @@ import (
v1 "k8s.io/api/apps/v1"
)

func (c *Client) ProcessAnalysis(deployment v1.Deployment, config *v1alpha1.K8sGPT) (*common.K8sGPTReponse, error) {
func (c *Client) ProcessAnalysis(deployment v1.Deployment, config *v1alpha1.K8sGPT, allowAIRequest bool) (*common.K8sGPTReponse, error) {

client := rpc.NewServerServiceClient(c.conn)
req := &schemav1.AnalyzeRequest{
Explain: config.Spec.AI.Enabled,
Explain: config.Spec.AI.Enabled && allowAIRequest,
Nocache: config.Spec.NoCache,
Backend: config.Spec.AI.Backend,
Filters: config.Spec.Filters,
Expand Down

0 comments on commit 16a61c6

Please sign in to comment.