Skip to content

Commit

Permalink
Kubernetes tests: Make waiting for pod robust to invalid response types.
Browse files Browse the repository at this point in the history
If an invalid type is encountered over the watch channel, this logs a
warning about it and falls back to the poll-based method, which is also
made faster in such a case.

Also add per-minute rate-limited logging in case a pod is still running
after 1 minute of waiting for it.

PiperOrigin-RevId: 694634839
  • Loading branch information
EtiennePerot authored and gvisor-bot committed Nov 9, 2024
1 parent 7c2bcdd commit 6cd5bbc
Show file tree
Hide file tree
Showing 6 changed files with 106 additions and 9 deletions.
48 changes: 47 additions & 1 deletion test/kubernetes/benchmarks/BUILD
Original file line number Diff line number Diff line change
@@ -1,13 +1,46 @@
load("//tools:defs.bzl", "go_test")
load("//tools:defs.bzl", "go_test", "pkg_tar")

package(
default_applicable_licenses = ["//:license"],
default_visibility = ["//:sandbox"],
licenses = ["notice"],
)

_PACKAGE = "//test/kubernetes/benchmarks"

_ALL_BENCHMARK_TARGETS = [
"%s:%s" %
(
_PACKAGE,
f.replace(".go", ""),
)
for f in glob(["**/*_test.go"])
]

genquery(
name = "all_benchmark_targets",
testonly = True,
expression = " union ".join(_ALL_BENCHMARK_TARGETS),
scope = _ALL_BENCHMARK_TARGETS,
)

[pkg_tar(
name = "%s_tar" % (src[src.index(":") + 1:],),
testonly = True,
srcs = [src],
extension = "tar.bz2",
) for src in _ALL_BENCHMARK_TARGETS]

filegroup(
name = "all_benchmark_test_binaries_tar",
testonly = True,
srcs = ["%s_tar" % (src[src.index(":") + 1:],) for src in _ALL_BENCHMARK_TARGETS],
)

go_test(
name = "abslbuild_test",
srcs = ["abslbuild_test.go"],
nogo = False,
tags = [
"local",
"noguitar",
Expand All @@ -26,6 +59,7 @@ go_test(
go_test(
name = "startup_test",
srcs = ["startup_test.go"],
nogo = False,
tags = [
"local",
"noguitar",
Expand All @@ -43,6 +77,7 @@ go_test(
go_test(
name = "redis_test",
srcs = ["redis_test.go"],
nogo = False,
tags = [
"local",
"noguitar",
Expand All @@ -61,6 +96,7 @@ go_test(
go_test(
name = "ruby_dev_test",
srcs = ["ruby_dev_test.go"],
nogo = False,
tags = [
"local",
"noguitar",
Expand All @@ -80,6 +116,7 @@ go_test(
go_test(
name = "ffmpeg_test",
srcs = ["ffmpeg_test.go"],
nogo = False,
tags = [
"local",
"noguitar",
Expand All @@ -98,6 +135,7 @@ go_test(
go_test(
name = "grpc_test",
srcs = ["grpc_test.go"],
nogo = False,
tags = [
"local",
"noguitar",
Expand All @@ -116,6 +154,7 @@ go_test(
go_test(
name = "nginx_test",
srcs = ["nginx_test.go"],
nogo = False,
tags = [
"local",
"noguitar",
Expand All @@ -135,6 +174,7 @@ go_test(
go_test(
name = "postgresql_test",
srcs = ["postgresql_test.go"],
nogo = False,
tags = [
"local",
"noguitar",
Expand All @@ -153,6 +193,7 @@ go_test(
go_test(
name = "tensorflow_test",
srcs = ["tensorflow_test.go"],
nogo = False,
tags = [
"local",
"noguitar",
Expand All @@ -171,6 +212,7 @@ go_test(
go_test(
name = "wordpress_test",
srcs = ["wordpress_test.go"],
nogo = False,
tags = [
"local",
"noguitar",
Expand All @@ -190,6 +232,7 @@ go_test(
go_test(
name = "pytorch_test",
srcs = ["pytorch_test.go"],
nogo = False,
tags = [
"local",
"noguitar",
Expand All @@ -210,6 +253,7 @@ go_test(
embedsrcs = [
"//test/kubernetes/benchmarks/resources:files", # keep
],
nogo = False,
tags = [
"local",
"noguitar",
Expand All @@ -230,6 +274,7 @@ go_test(
go_test(
name = "stablediffusion_test",
srcs = ["stablediffusion_test.go"],
nogo = False,
tags = [
"local",
"noguitar",
Expand All @@ -248,6 +293,7 @@ go_test(
go_test(
name = "gsutil_test",
srcs = ["gsutil_test.go"],
nogo = False,
tags = [
"local",
"noguitar",
Expand Down
5 changes: 4 additions & 1 deletion test/kubernetes/benchmarks/abslbuild_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import (
"path"
"strings"
"testing"
"time"

"gvisor.dev/gvisor/test/kubernetes/benchmarks/profiling"
"gvisor.dev/gvisor/test/kubernetes/benchmetric"
Expand Down Expand Up @@ -126,7 +127,9 @@ func doABSLBuild(ctx context.Context, t *testing.T, k8sCtx k8sctx.KubernetesCont
}
defer cluster.DeletePod(ctx, pod)

containerDuration, err := benchmetric.GetTimedContainerDuration(ctx, cluster, pod, name)
waitDeadlineCtx, cancel := context.WithTimeout(ctx, 30*time.Minute)
containerDuration, err := benchmetric.GetTimedContainerDuration(waitDeadlineCtx, cluster, pod, name)
cancel()
if err != nil {
t.Fatalf("Failed to get container duration: %v", err)
}
Expand Down
2 changes: 2 additions & 0 deletions test/kubernetes/k8sctx/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,12 @@ package(

go_library(
name = "k8sctx",
testonly = True,
srcs = [
"k8sctx.go",
"k8sctx_impl.go",
],
nogo = False,
visibility = [
"//visibility:public",
],
Expand Down
1 change: 1 addition & 0 deletions test/kubernetes/testcluster/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ go_library(
"//visibility:public",
],
deps = [
"//pkg/log",
"//test/kubernetes:test_range_config_go_proto",
"@io_k8s_api//apps/v1:go_default_library",
"@io_k8s_api//core/v1:go_default_library",
Expand Down
58 changes: 51 additions & 7 deletions test/kubernetes/testcluster/testcluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,12 +20,14 @@ import (
"context"
"fmt"
"io"
"reflect"
"strconv"
"strings"
"time"

"golang.org/x/sync/errgroup"
cspb "google.golang.org/genproto/googleapis/container/v1"
"gvisor.dev/gvisor/pkg/log"
testpb "gvisor.dev/gvisor/test/kubernetes/test_range_config_go_proto"
appsv1 "k8s.io/api/apps/v1"
v13 "k8s.io/api/core/v1"
Expand Down Expand Up @@ -132,8 +134,31 @@ type TestCluster struct {
testNodepoolRuntimeOverride RuntimeType
}

type testClusterConstructorKey int

const (
// testClusterConstructor is the key for the context value that holds the
// constructor function for TestCluster.
// Defaults to newTestCluster.
testClusterConstructor testClusterConstructorKey = iota
)

// WithTestClusterConstructor returns a context that contains a custom
// constructor for TestCluster.
func WithTestClusterConstructor(ctx context.Context, constructor func(context.Context, *testpb.Cluster) (*TestCluster, error)) context.Context {
return context.WithValue(ctx, testClusterConstructor, constructor)
}

// NewTestCluster returns a new TestCluster client.
func NewTestCluster(cluster *testpb.Cluster) (*TestCluster, error) {
func NewTestCluster(ctx context.Context, cluster *testpb.Cluster) (*TestCluster, error) {
constructor, ok := ctx.Value(testClusterConstructor).(func(context.Context, *testpb.Cluster) (*TestCluster, error))
if !ok || constructor == nil || reflect.ValueOf(constructor).IsNil() {
constructor = newTestCluster
}
return constructor(ctx, cluster)
}

func newTestCluster(_ context.Context, cluster *testpb.Cluster) (*TestCluster, error) {
config, err := clientcmd.BuildConfigFromFlags("" /*masterURL*/, cluster.GetCredentialFile())
if err != nil {
return nil, fmt.Errorf("BuildConfigFromFlags: %w", err)
Expand Down Expand Up @@ -335,22 +360,38 @@ func (t *TestCluster) doWaitForPod(ctx context.Context, pod *v13.Pod, phase v13.
if err != nil {
return fmt.Errorf("watch: %w", err)
}
podLogger := log.BasicRateLimitedLogger(5 * time.Minute)
incompatibleTypeLogger := log.BasicRateLimitedLogger(5 * time.Minute)
startLogTime := time.Now().Add(3 * time.Minute)

var p *v13.Pod
incompatibleType := false
pollCh := time.NewTicker(10 * time.Second)
defer pollCh.Stop()
pollLoop:
for {
select {
case <-ctx.Done():
return ctx.Err()
case <-pollCh.C:
p, err = t.GetPod(ctx, pod)
if err != nil {
return fmt.Errorf("failed to poll pod: %w", err)
}
case e := <-w.ResultChan():
var ok bool
p, ok = e.Object.(*v13.Pod)
if !ok {
return fmt.Errorf("invalid object watched: %T", p)
}
case <-time.After(10 * time.Second):
p, err = t.GetPod(ctx, pod)
if err != nil {
return fmt.Errorf("failed to poll pod: %w", err)
if !incompatibleType {
log.Warningf("Received unexpected type of watched pod: got %T (%v), expected %T; falling back to polling-based wait.", e.Object, e.Object, p)
incompatibleType = true
pollCh.Stop()
pollCh = time.NewTicker(250 * time.Millisecond)
} else {
incompatibleTypeLogger.Infof("Received another unexpected type of watched pod: got %T (%v), expected %T.", e.Object, e.Object, p)
}
time.Sleep(10 * time.Millisecond)
continue pollLoop
}
}
if ctx.Err() != nil {
Expand All @@ -372,6 +413,9 @@ func (t *TestCluster) doWaitForPod(ctx context.Context, pod *v13.Pod, phase v13.
case phase:
return nil
}
if time.Now().After(startLogTime) {
podLogger.Infof("Still waiting for pod %q after %v; pod status: %v", pod.GetName(), time.Since(startLogTime), p.Status)
}
}
}

Expand Down
1 change: 1 addition & 0 deletions test/kubernetes/tests/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ package(
go_test(
name = "hello_test",
srcs = ["hello_test.go"],
nogo = False,
tags = [
"local",
"noguitar",
Expand Down

0 comments on commit 6cd5bbc

Please sign in to comment.