From 1d39271155f5ba8c6e2b66abd2ac92a63d4fa842 Mon Sep 17 00:00:00 2001
From: bjee19 <139261241+bjee19@users.noreply.github.com>
Date: Wed, 20 Mar 2024 10:38:45 -0700
Subject: [PATCH] Do not send telemetry data if failure in collection (#1731)

Fix error handling in telemetry and usage reporting.

Problem: If any part of data collection fails, we still send empty
telemetry data. The same happens for NIM usage reporting.

Solution: Do not send telemetry data when collection fails. Likewise, do
not send NIM usage reporting data if any of its collection steps fail.

Also refactored some tests to satisfy code coverage requirements.
---
 internal/framework/events/events_test.go      |   2 +-
 internal/mode/static/telemetry/job_worker.go  |   1 +
 .../mode/static/telemetry/job_worker_test.go  |  22 ++-
 internal/mode/static/usage/job_worker.go      |   3 +
 internal/mode/static/usage/job_worker_test.go | 179 +++++++++++++-----
 5 files changed, 157 insertions(+), 50 deletions(-)

diff --git a/internal/framework/events/events_test.go b/internal/framework/events/events_test.go
index 7d2415c2d..e36b53945 100644
--- a/internal/framework/events/events_test.go
+++ b/internal/framework/events/events_test.go
@@ -31,5 +31,5 @@ func TestEventLoop_SwapBatches(t *testing.T) {
 	g.Expect(eventLoop.currentBatch).To(HaveLen(len(nextBatch)))
 	g.Expect(eventLoop.currentBatch).To(Equal(nextBatch))
 	g.Expect(eventLoop.nextBatch).To(BeEmpty())
-	g.Expect(cap(eventLoop.nextBatch)).To(Equal(3))
+	g.Expect(eventLoop.nextBatch).To(HaveCap(3))
 }
diff --git a/internal/mode/static/telemetry/job_worker.go b/internal/mode/static/telemetry/job_worker.go
index d77189761..4f1b11d14 100644
--- a/internal/mode/static/telemetry/job_worker.go
+++ b/internal/mode/static/telemetry/job_worker.go
@@ -27,6 +27,7 @@ func CreateTelemetryJobWorker(
 		data, err := dataCollector.Collect(ctx)
 		if err != nil {
 			logger.Error(err, "Failed to collect telemetry data")
+			return
 		}
 
 		// Export telemetry
diff --git a/internal/mode/static/telemetry/job_worker_test.go b/internal/mode/static/telemetry/job_worker_test.go
index 64a1c289c..88cc7fbc4 100644
--- a/internal/mode/static/telemetry/job_worker_test.go
+++ b/internal/mode/static/telemetry/job_worker_test.go
@@ -2,6 +2,7 @@ package telemetry_test
 
 import (
 	"context"
+	"errors"
 	"testing"
 	"time"
 
@@ -13,7 +14,7 @@ import (
 	"github.com/nginxinc/nginx-gateway-fabric/internal/mode/static/telemetry/telemetryfakes"
 )
 
-func TestCreateTelemetryJobWorker(t *testing.T) {
+func TestCreateTelemetryJobWorker_Succeeds(t *testing.T) {
 	g := NewWithT(t)
 
 	exporter := &telemetryfakes.FakeExporter{}
@@ -36,3 +37,22 @@ func TestCreateTelemetryJobWorker(t *testing.T) {
 	_, data := exporter.ExportArgsForCall(0)
 	g.Expect(data).To(Equal(&expData))
 }
+
+func TestCreateTelemetryJobWorker_CollectFails(t *testing.T) {
+	g := NewWithT(t)
+
+	exporter := &telemetryfakes.FakeExporter{}
+	dataCollector := &telemetryfakes.FakeDataCollector{}
+
+	worker := telemetry.CreateTelemetryJobWorker(zap.New(), exporter, dataCollector)
+
+	expData := telemetry.Data{}
+	dataCollector.CollectReturns(expData, errors.New("failed to collect cluster information"))
+
+	timeout := 10 * time.Second
+	ctx, cancel := context.WithTimeout(context.Background(), timeout)
+	defer cancel()
+
+	worker(ctx)
+	g.Expect(exporter.ExportCallCount()).To(Equal(0))
+}
diff --git a/internal/mode/static/usage/job_worker.go b/internal/mode/static/usage/job_worker.go
index f71fdf64f..43b7b9115 100644
--- a/internal/mode/static/usage/job_worker.go
+++ b/internal/mode/static/usage/job_worker.go
@@ -23,16 +23,19 @@ func 
CreateUsageJobWorker( nodeCount, err := CollectNodeCount(ctx, k8sClient) if err != nil { logger.Error(err, "Failed to collect node count") + return } podCount, err := GetTotalNGFPodCount(ctx, k8sClient) if err != nil { logger.Error(err, "Failed to collect replica count") + return } clusterUID, err := telemetry.CollectClusterID(ctx, k8sClient) if err != nil { logger.Error(err, "Failed to collect cluster UID") + return } clusterDetails := ClusterDetails{ diff --git a/internal/mode/static/usage/job_worker_test.go b/internal/mode/static/usage/job_worker_test.go index d0c9270a6..c94bf1569 100644 --- a/internal/mode/static/usage/job_worker_test.go +++ b/internal/mode/static/usage/job_worker_test.go @@ -2,6 +2,7 @@ package usage_test import ( "context" + "errors" "testing" "time" @@ -9,17 +10,18 @@ import ( appsv1 "k8s.io/api/apps/v1" v1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/client/fake" "sigs.k8s.io/controller-runtime/pkg/log/zap" + "github.com/nginxinc/nginx-gateway-fabric/internal/framework/events/eventsfakes" "github.com/nginxinc/nginx-gateway-fabric/internal/mode/static/config" "github.com/nginxinc/nginx-gateway-fabric/internal/mode/static/usage" "github.com/nginxinc/nginx-gateway-fabric/internal/mode/static/usage/usagefakes" ) func TestCreateUsageJobWorker(t *testing.T) { - g := NewWithT(t) - replicas := int32(1) ngfReplicaSet := &appsv1.ReplicaSet{ ObjectMeta: metav1.ObjectMeta{ @@ -34,64 +36,145 @@ func TestCreateUsageJobWorker(t *testing.T) { }, } - ngfPod := &v1.Pod{ - ObjectMeta: metav1.ObjectMeta{ - Namespace: "nginx-gateway", - Name: "ngf-pod", - OwnerReferences: []metav1.OwnerReference{ - { - Kind: "ReplicaSet", - Name: "ngf-replicaset", + tests := []struct { + name string + listCalls func(_ context.Context, object client.ObjectList, _ ...client.ListOption) error + getCalls func(_ context.Context, _ types.NamespacedName, object client.Object, _ ...client.GetOption) error + expData usage.ClusterDetails + expErr bool + }{ + { + name: "succeeds", + listCalls: func(_ context.Context, object client.ObjectList, _ ...client.ListOption) error { + switch typedList := object.(type) { + case *v1.NodeList: + typedList.Items = append(typedList.Items, v1.Node{}) + return nil + case *appsv1.ReplicaSetList: + typedList.Items = append(typedList.Items, *ngfReplicaSet) + return nil + } + return nil + }, + getCalls: func(_ context.Context, _ types.NamespacedName, object client.Object, _ ...client.GetOption) error { + switch typedObject := object.(type) { + case *v1.Namespace: + typedObject.Name = metav1.NamespaceSystem + typedObject.UID = "1234abcd" + return nil + } + return nil + }, + expData: usage.ClusterDetails{ + Metadata: usage.Metadata{ + UID: "1234abcd", + DisplayName: "my-cluster", + }, + NodeCount: 1, + PodDetails: usage.PodDetails{ + CurrentPodCounts: usage.CurrentPodsCount{ + PodCount: 1, + }, }, }, + expErr: false, }, - } - - kubeSystem := &v1.Namespace{ - ObjectMeta: metav1.ObjectMeta{ - Name: metav1.NamespaceSystem, - UID: "1234abcd", - }, - } - - k8sClient := fake.NewFakeClient(&v1.Node{}, ngfReplicaSet, ngfPod, kubeSystem) - reporter := &usagefakes.FakeReporter{} - - worker := usage.CreateUsageJobWorker( - zap.New(), - k8sClient, - reporter, - config.Config{ - GatewayPodConfig: config.GatewayPodConfig{ - Namespace: "nginx-gateway", - Name: "ngf-pod", + { + name: "collect node count fails", + listCalls: func(_ context.Context, object 
client.ObjectList, _ ...client.ListOption) error { + switch object.(type) { + case *v1.NodeList: + return errors.New("failed to collect node list") + } + return nil }, - UsageReportConfig: &config.UsageReportConfig{ - ClusterDisplayName: "my-cluster", + getCalls: func(_ context.Context, _ types.NamespacedName, _ client.Object, _ ...client.GetOption) error { + return nil }, + expData: usage.ClusterDetails{}, + expErr: true, }, - ) - - expData := usage.ClusterDetails{ - Metadata: usage.Metadata{ - UID: "1234abcd", - DisplayName: "my-cluster", + { + name: "collect replica count fails", + listCalls: func(_ context.Context, object client.ObjectList, _ ...client.ListOption) error { + switch typedList := object.(type) { + case *v1.NodeList: + typedList.Items = append(typedList.Items, v1.Node{}) + return nil + case *appsv1.ReplicaSetList: + return errors.New("failed to collect replica set list") + } + return nil + }, + getCalls: func(_ context.Context, _ types.NamespacedName, _ client.Object, _ ...client.GetOption) error { + return nil + }, + expData: usage.ClusterDetails{}, + expErr: true, }, - NodeCount: 1, - PodDetails: usage.PodDetails{ - CurrentPodCounts: usage.CurrentPodsCount{ - PodCount: 1, + { + name: "collect cluster UID fails", + listCalls: func(_ context.Context, object client.ObjectList, _ ...client.ListOption) error { + switch typedList := object.(type) { + case *v1.NodeList: + typedList.Items = append(typedList.Items, v1.Node{}) + return nil + case *appsv1.ReplicaSetList: + typedList.Items = append(typedList.Items, *ngfReplicaSet) + return nil + } + return nil + }, + getCalls: func(_ context.Context, _ types.NamespacedName, object client.Object, _ ...client.GetOption) error { + switch object.(type) { + case *v1.Namespace: + return errors.New("failed to collect namespace") + } + return nil }, + expData: usage.ClusterDetails{}, + expErr: true, }, } - timeout := 10 * time.Second - ctx, cancel := context.WithTimeout(context.Background(), timeout) - defer cancel() + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + g := NewWithT(t) + + k8sClientReader := &eventsfakes.FakeReader{} + k8sClientReader.ListCalls(test.listCalls) + k8sClientReader.GetCalls(test.getCalls) - worker(ctx) - _, data := reporter.ReportArgsForCall(0) - g.Expect(data).To(Equal(expData)) + reporter := &usagefakes.FakeReporter{} + + worker := usage.CreateUsageJobWorker( + zap.New(), + k8sClientReader, + reporter, + config.Config{ + GatewayPodConfig: config.GatewayPodConfig{ + Namespace: "nginx-gateway", + Name: "ngf-pod", + }, + UsageReportConfig: &config.UsageReportConfig{ + ClusterDisplayName: "my-cluster", + }, + }, + ) + + timeout := 10 * time.Second + ctx, cancel := context.WithTimeout(context.Background(), timeout) + defer cancel() + + worker(ctx) + if test.expErr { + g.Expect(reporter.ReportCallCount()).To(Equal(0)) + } else { + _, data := reporter.ReportArgsForCall(0) + g.Expect(data).To(Equal(test.expData)) + } + }) + } } func TestGetTotalNGFPodCount(t *testing.T) {
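A minimal, self-contained sketch of the fail-fast pattern the patch above applies to both job workers: if collection returns an error, the worker logs it and returns without exporting, so empty data is never sent. The Data, Collector, Exporter, and CreateJobWorker names below are illustrative stand-ins, not the repository's actual types.

package main

import (
	"context"
	"errors"
	"log"
)

// Data is a stand-in for the collected payload (hypothetical type).
type Data struct {
	NodeCount int
}

// Collector gathers the data to report; Exporter sends it (both hypothetical).
type Collector interface {
	Collect(ctx context.Context) (Data, error)
}

type Exporter interface {
	Export(ctx context.Context, data Data) error
}

// CreateJobWorker returns a worker that exports only successfully collected data.
func CreateJobWorker(c Collector, e Exporter) func(ctx context.Context) {
	return func(ctx context.Context) {
		data, err := c.Collect(ctx)
		if err != nil {
			log.Printf("failed to collect data: %v", err)
			return // fail fast: never export empty or partial data
		}
		if err := e.Export(ctx, data); err != nil {
			log.Printf("failed to export data: %v", err)
		}
	}
}

// failingCollector always fails, mimicking a collection error.
type failingCollector struct{}

func (failingCollector) Collect(context.Context) (Data, error) {
	return Data{}, errors.New("collection failed")
}

// countingExporter records how many times Export was called.
type countingExporter struct{ calls int }

func (e *countingExporter) Export(context.Context, Data) error {
	e.calls++
	return nil
}

func main() {
	exporter := &countingExporter{}
	worker := CreateJobWorker(failingCollector{}, exporter)
	worker(context.Background())
	log.Printf("export calls after failed collection: %d", exporter.calls) // prints 0
}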