Skip to content

Commit

Permalink
Detect panics in e2e tests
Browse files Browse the repository at this point in the history
Signed-off-by: Stefan Büringer <buringerst@vmware.com>
  • Loading branch information
sbueringer committed Aug 19, 2024
1 parent defa62d commit 8097eee
Show file tree
Hide file tree
Showing 2 changed files with 154 additions and 1 deletion.
51 changes: 50 additions & 1 deletion test/framework/deployment_helpers.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ package framework

import (
"bufio"
"bytes"
"context"
"encoding/json"
"fmt"
Expand All @@ -31,12 +32,15 @@ import (

. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
"github.com/pkg/errors"
"github.com/prometheus/common/expfmt"
appsv1 "k8s.io/api/apps/v1"
corev1 "k8s.io/api/core/v1"
policyv1 "k8s.io/api/policy/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/labels"
kerrors "k8s.io/apimachinery/pkg/util/errors"
"k8s.io/apimachinery/pkg/util/intstr"
utilversion "k8s.io/apimachinery/pkg/util/version"
"k8s.io/apimachinery/pkg/util/wait"
Expand Down Expand Up @@ -349,7 +353,7 @@ type WatchPodMetricsInput struct {
// WatchPodMetrics captures metrics from all pods every 5s. It expects to find port 8080 open on the controller.
func WatchPodMetrics(ctx context.Context, input WatchPodMetricsInput) {
// Dump machine metrics every 5 seconds
ticker := time.NewTicker(time.Second * 5)
ticker := time.NewTicker(time.Second * 10)
Expect(ctx).NotTo(BeNil(), "ctx is required for dumpContainerMetrics")
Expect(input.ClientSet).NotTo(BeNil(), "input.ClientSet is required for dumpContainerMetrics")
Expect(input.Deployment).NotTo(BeNil(), "input.Deployment is required for dumpContainerMetrics")
Expand Down Expand Up @@ -397,8 +401,10 @@ func dumpPodMetrics(ctx context.Context, client *kubernetes.Clientset, metricsPa
Do(ctx)
data, err := res.Raw()

var errorRetrievingMetrics bool
if err != nil {
// Failing to dump metrics should not cause the test to fail
errorRetrievingMetrics = true
data = []byte(fmt.Sprintf("Error retrieving metrics for pod %s: %v\n%s", klog.KRef(pod.Namespace, pod.Name), err, string(data)))
metricsFile = path.Join(metricsDir, "metrics-error.txt")
}
Expand All @@ -407,7 +413,50 @@ func dumpPodMetrics(ctx context.Context, client *kubernetes.Clientset, metricsPa
// Failing to dump metrics should not cause the test to fail
log.Logf("Error writing metrics for pod %s: %v", klog.KRef(pod.Namespace, pod.Name), err)
}

if !errorRetrievingMetrics {
Expect(verifyMetrics(data)).To(Succeed())
}
}
}

// verifyMetrics parses data as Prometheus text-format metrics and returns an
// error if any of the controller-runtime panic counters is greater than zero.
//
// Two metric families are inspected:
//   - controller_runtime_reconcile_panics_total: one error per metric with a
//     positive counter, naming the value of its "controller" label ("unknown"
//     if the metric carries no such label).
//   - controller_runtime_webhook_panics_total: one error per metric with a
//     positive counter.
//
// Reconcile errors are always reported before webhook errors, so the
// aggregated error message is stable across calls. It returns nil when no
// panic was recorded and a wrapped parse error when data cannot be parsed.
func verifyMetrics(data []byte) error {
	var parser expfmt.TextParser
	mf, err := parser.TextToMetricFamilies(bytes.NewReader(data))
	if err != nil {
		return errors.Wrapf(err, "failed to parse data to metrics families")
	}

	var errs []error

	// Look the families up by name in a fixed order instead of ranging over
	// the map: map iteration order is randomized, which would make the order
	// of the aggregated errors non-deterministic.
	//
	// The generated getters are nil-safe, so a missing family, a metric that
	// is not a counter, or a label with unset fields simply yields zero values
	// instead of a nil pointer dereference.
	for _, controllerPanicMetric := range mf["controller_runtime_reconcile_panics_total"].GetMetric() {
		if controllerPanicMetric.GetCounter().GetValue() > 0 {
			controllerName := "unknown"
			for _, label := range controllerPanicMetric.GetLabel() {
				if label.GetName() == "controller" {
					controllerName = label.GetValue()
				}
			}
			errs = append(errs, fmt.Errorf("panic occurred in %q controller", controllerName))
		}
	}

	for _, webhookPanicMetric := range mf["controller_runtime_webhook_panics_total"].GetMetric() {
		if webhookPanicMetric.GetCounter().GetValue() > 0 {
			errs = append(errs, fmt.Errorf("panic occurred in webhook"))
		}
	}

	if len(errs) > 0 {
		return kerrors.NewAggregate(errs)
	}

	return nil
}

// WaitForDNSUpgradeInput is the input for WaitForDNSUpgrade.
Expand Down
104 changes: 104 additions & 0 deletions test/framework/deployment_helpers_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
/*
Copyright 2024 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package framework

import (
"testing"

. "github.com/onsi/gomega"
)

// Test_verifyMetrics feeds verifyMetrics Prometheus text-format payloads and
// checks the resulting error: an empty wantErr means no error is expected,
// otherwise the error message must match wantErr exactly.
func Test_verifyMetrics(t *testing.T) {
	testCases := []struct {
		name    string
		data    []byte
		wantErr string
	}{
		{
			// Payload contains neither of the panic counters.
			name: "no panic metric exists",
			data: []byte(`
# HELP controller_runtime_max_concurrent_reconciles Maximum number of concurrent reconciles per controller
# TYPE controller_runtime_max_concurrent_reconciles gauge
controller_runtime_max_concurrent_reconciles{controller="cluster"} 10
controller_runtime_max_concurrent_reconciles{controller="clusterclass"} 10
`),
		},
		{
			// Both panic counters are present and all are zero.
			name: "no panic occurred",
			data: []byte(`
# HELP controller_runtime_max_concurrent_reconciles Maximum number of concurrent reconciles per controller
# TYPE controller_runtime_max_concurrent_reconciles gauge
controller_runtime_max_concurrent_reconciles{controller="cluster"} 10
controller_runtime_max_concurrent_reconciles{controller="clusterclass"} 10
# HELP controller_runtime_reconcile_panics_total Total number of reconciliation panics per controller
# TYPE controller_runtime_reconcile_panics_total counter
controller_runtime_reconcile_panics_total{controller="cluster"} 0
controller_runtime_reconcile_panics_total{controller="clusterclass"} 0
# HELP controller_runtime_webhook_panics_total Total number of webhook panics
# TYPE controller_runtime_webhook_panics_total counter
controller_runtime_webhook_panics_total 0
`),
		},
		{
			// The reconcile panic counter of the "cluster" controller is 1.
			name: "panic occurred in controller",
			data: []byte(`
# HELP controller_runtime_max_concurrent_reconciles Maximum number of concurrent reconciles per controller
# TYPE controller_runtime_max_concurrent_reconciles gauge
controller_runtime_max_concurrent_reconciles{controller="cluster"} 10
controller_runtime_max_concurrent_reconciles{controller="clusterclass"} 10
# HELP controller_runtime_reconcile_panics_total Total number of reconciliation panics per controller
# TYPE controller_runtime_reconcile_panics_total counter
controller_runtime_reconcile_panics_total{controller="cluster"} 1
controller_runtime_reconcile_panics_total{controller="clusterclass"} 0
# HELP controller_runtime_webhook_panics_total Total number of webhook panics
# TYPE controller_runtime_webhook_panics_total counter
controller_runtime_webhook_panics_total 0
`),
			wantErr: "panic occurred in \"cluster\" controller",
		},
		{
			// The webhook panic counter is 1.
			name: "panic occurred in webhook",
			data: []byte(`
# HELP controller_runtime_max_concurrent_reconciles Maximum number of concurrent reconciles per controller
# TYPE controller_runtime_max_concurrent_reconciles gauge
controller_runtime_max_concurrent_reconciles{controller="cluster"} 10
controller_runtime_max_concurrent_reconciles{controller="clusterclass"} 10
# HELP controller_runtime_reconcile_panics_total Total number of reconciliation panics per controller
# TYPE controller_runtime_reconcile_panics_total counter
controller_runtime_reconcile_panics_total{controller="cluster"} 0
controller_runtime_reconcile_panics_total{controller="clusterclass"} 0
# HELP controller_runtime_webhook_panics_total Total number of webhook panics
# TYPE controller_runtime_webhook_panics_total counter
controller_runtime_webhook_panics_total 1
`),
			wantErr: "panic occurred in webhook",
		},
	}

	for _, tc := range testCases {
		t.Run(tc.name, func(t *testing.T) {
			g := NewWithT(t)

			gotErr := verifyMetrics(tc.data)

			// Failure cases: an error must be returned with the exact message.
			if tc.wantErr != "" {
				g.Expect(gotErr).To(HaveOccurred())
				g.Expect(gotErr.Error()).To(Equal(tc.wantErr))
				return
			}

			// Success cases: no error at all.
			g.Expect(gotErr).ToNot(HaveOccurred())
		})
	}
}

0 comments on commit 8097eee

Please sign in to comment.