From 730843aeb95e1291179135c30b7535f87cb7cfa5 Mon Sep 17 00:00:00 2001
From: Steve Kuznetsov <skuznets@redhat.com>
Date: Thu, 13 Jul 2023 10:03:10 -0600
Subject: [PATCH] pkg/{cache,client}: add options for cache miss policy

This commit allows users to opt out of the "start informers in the
background" behavior that the current cache implementation uses.
Additionally, when opting out of this behavior, the client can be
configured to do a live lookup on a cache miss. The default behaviors
are:

  pkg/cache: backfill data on a miss (today's default, unchanged)
  pkg/client: live lookup when cache is configured to miss

Signed-off-by: Steve Kuznetsov <skuznets@redhat.com>
---
 pkg/cache/cache.go              | 29 +++++++++++++
 pkg/cache/cache_test.go         | 57 +++++++++++++++++++++++++
 pkg/cache/errors/errors.go      | 20 +++++++++
 pkg/cache/informer_cache.go     | 30 ++++++++++---
 pkg/cache/internal/informers.go |  5 ++-
 pkg/client/client.go            | 44 ++++++++++++++++++-
 pkg/client/client_test.go       | 75 +++++++++++++++++++++++++++++++++
 7 files changed, 251 insertions(+), 9 deletions(-)
 create mode 100644 pkg/cache/errors/errors.go

diff --git a/pkg/cache/cache.go b/pkg/cache/cache.go
index f01de43810..07ab96b165 100644
--- a/pkg/cache/cache.go
+++ b/pkg/cache/cache.go
@@ -166,8 +166,31 @@ type Options struct {
 	//
 	// This is a global setting for all objects, and can be overridden by the ByObject setting.
 	UnsafeDisableDeepCopy *bool
+
+	// MissPolicy determines how the cache should behave when no informers are started for the
+	// resource type a client is asking for in a Get or a List. See the CacheMissPolicies for
+	// documentation for each policy.
+	MissPolicy MissPolicy
 }
 
+// MissPolicy determines how the cache should behave when no informers are started for the
+// resource type a client is asking for in a Get or a List.
+type MissPolicy string
+
+const (
+	// MissPolicyBackfill configures the cache to spin up an informer for a resource when
+	// the first request for that resource comes in. This means that a Get or a List may take
+	// longer than normal to succeed on the first invocation as the cache waits until it's fully
+	// back-filled.
+	// This is the default policy.
+	MissPolicyBackfill MissPolicy = "backfill"
+
+	// MissPolicyFail configures the cache to return a ErrResourceNotCached error when a user
+	// requests a resource the cache is not configured to hold. This error is distinct from an
+	// errors.NotFound.
+	MissPolicyFail MissPolicy = "fail"
+)
+
 // ByObject offers more fine-grained control over the cache's ListWatch by object.
 type ByObject struct {
 	// Label represents a label selector for the object.
@@ -232,6 +255,7 @@ func New(config *rest.Config, opts Options) (Cache, error) {
 			Namespace:    opts.Namespaces[0],
 			ByGVK:        byGVK,
 		}),
+		missPolicy: opts.MissPolicy,
 	}, nil
 }
 
@@ -267,6 +291,11 @@ func defaultOpts(config *rest.Config, opts Options) (Options, error) {
 	if opts.SyncPeriod == nil {
 		opts.SyncPeriod = &defaultSyncPeriod
 	}
+
+	// Backfill by default
+	if opts.MissPolicy == "" {
+		opts.MissPolicy = MissPolicyBackfill
+	}
 	return opts, nil
 }
 
diff --git a/pkg/cache/cache_test.go b/pkg/cache/cache_test.go
index 100825854a..716740f772 100644
--- a/pkg/cache/cache_test.go
+++ b/pkg/cache/cache_test.go
@@ -18,6 +18,7 @@ package cache_test
 
 import (
 	"context"
+	"errors"
 	"fmt"
 	"reflect"
 	"sort"
@@ -39,6 +40,7 @@ import (
 	"k8s.io/client-go/rest"
 	kcache "k8s.io/client-go/tools/cache"
 	"k8s.io/utils/pointer"
+	cacheerrors "sigs.k8s.io/controller-runtime/pkg/cache/errors"
 
 	"sigs.k8s.io/controller-runtime/pkg/cache"
 	"sigs.k8s.io/controller-runtime/pkg/client"
@@ -117,6 +119,11 @@ func deletePod(pod client.Object) {
 var _ = Describe("Informer Cache", func() {
 	CacheTest(cache.New, cache.Options{})
 })
+
+var _ = Describe("Informer Cache with miss policy = fail", func() {
+	CacheTestMissPolicyFail(cache.New, cache.Options{MissPolicy: cache.MissPolicyFail})
+})
+
 var _ = Describe("Multi-Namespace Informer Cache", func() {
 	CacheTest(cache.MultiNamespacedCacheBuilder([]string{testNamespaceOne, testNamespaceTwo, "default"}), cache.Options{})
 })
@@ -414,6 +421,56 @@ var _ = Describe("Cache with selectors", func() {
 	})
 })
 
+func CacheTestMissPolicyFail(createCacheFunc func(config *rest.Config, opts cache.Options) (cache.Cache, error), opts cache.Options) {
+	Describe("Cache test with MissPolicy = fail", func() {
+		var (
+			informerCache       cache.Cache
+			informerCacheCtx    context.Context
+			informerCacheCancel context.CancelFunc
+			errNotCached        *cacheerrors.ErrResourceNotCached
+		)
+
+		BeforeEach(func() {
+			informerCacheCtx, informerCacheCancel = context.WithCancel(context.Background())
+			Expect(cfg).NotTo(BeNil())
+
+			By("creating the informer cache")
+			var err error
+			informerCache, err = createCacheFunc(cfg, opts)
+			Expect(err).NotTo(HaveOccurred())
+			By("running the cache and waiting for it to sync")
+			// pass as an arg so that we don't race between close and re-assign
+			go func(ctx context.Context) {
+				defer GinkgoRecover()
+				Expect(informerCache.Start(ctx)).To(Succeed())
+			}(informerCacheCtx)
+			Expect(informerCache.WaitForCacheSync(informerCacheCtx)).To(BeTrue())
+		})
+
+		AfterEach(func() {
+			informerCacheCancel()
+		})
+
+		Describe("as a Reader", func() {
+			Context("with structured objects", func() {
+				It("should not be able to list objects that haven't been watched previously", func() {
+					By("listing all services in the cluster")
+					listObj := &corev1.ServiceList{}
+					fmt.Printf("%#v\n", informerCache.List(context.Background(), listObj))
+					Expect(errors.As(informerCache.List(context.Background(), listObj), &errNotCached)).To(BeTrue())
+				})
+
+				It("should not be able to get objects that haven't been watched previously", func() {
+					By("getting the Kubernetes service")
+					svc := &corev1.Service{}
+					svcKey := client.ObjectKey{Namespace: "default", Name: "kubernetes"}
+					Expect(errors.As(informerCache.Get(context.Background(), svcKey, svc), &errNotCached)).To(BeTrue())
+				})
+			})
+		})
+	})
+}
+
 func CacheTest(createCacheFunc func(config *rest.Config, opts cache.Options) (cache.Cache, error), opts cache.Options) {
 	Describe("Cache test", func() {
 		var (
diff --git a/pkg/cache/errors/errors.go b/pkg/cache/errors/errors.go
new file mode 100644
index 0000000000..2519d014a2
--- /dev/null
+++ b/pkg/cache/errors/errors.go
@@ -0,0 +1,20 @@
+package errors
+
+import (
+	"fmt"
+
+	"k8s.io/apimachinery/pkg/runtime/schema"
+)
+
+// ErrResourceNotCached indicates that the resource type the client asked the cache for is not cached, and
+// the cache is configured to fail on a cache miss.
+type ErrResourceNotCached struct {
+	GVK schema.GroupVersionKind
+}
+
+// Error returns the error
+func (r ErrResourceNotCached) Error() string {
+	return fmt.Sprintf("%s is not cached and the cache is configured to fail on miss", r.GVK.String())
+}
+
+var _ error = (*ErrResourceNotCached)(nil)
diff --git a/pkg/cache/informer_cache.go b/pkg/cache/informer_cache.go
index 771244d52a..3ff4a0a6c0 100644
--- a/pkg/cache/informer_cache.go
+++ b/pkg/cache/informer_cache.go
@@ -27,6 +27,7 @@ import (
 	"k8s.io/apimachinery/pkg/runtime"
 	"k8s.io/apimachinery/pkg/runtime/schema"
 	"k8s.io/client-go/tools/cache"
+	"sigs.k8s.io/controller-runtime/pkg/cache/errors"
 	"sigs.k8s.io/controller-runtime/pkg/cache/internal"
 	"sigs.k8s.io/controller-runtime/pkg/client"
 	"sigs.k8s.io/controller-runtime/pkg/client/apiutil"
@@ -50,6 +51,7 @@ func (*ErrCacheNotStarted) Error() string {
 type informerCache struct {
 	scheme *runtime.Scheme
 	*internal.Informers
+	missPolicy MissPolicy
 }
 
 // Get implements Reader.
@@ -59,7 +61,7 @@ func (ic *informerCache) Get(ctx context.Context, key client.ObjectKey, out clie
 		return err
 	}
 
-	started, cache, err := ic.Informers.Get(ctx, gvk, out)
+	started, cache, err := ic.getInformerForKind(ctx, gvk, out)
 	if err != nil {
 		return err
 	}
@@ -77,7 +79,7 @@ func (ic *informerCache) List(ctx context.Context, out client.ObjectList, opts .
 		return err
 	}
 
-	started, cache, err := ic.Informers.Get(ctx, *gvk, cacheTypeObj)
+	started, cache, err := ic.getInformerForKind(ctx, *gvk, cacheTypeObj)
 	if err != nil {
 		return err
 	}
@@ -130,8 +132,7 @@ func (ic *informerCache) GetInformerForKind(ctx context.Context, gvk schema.Grou
 	if err != nil {
 		return nil, err
 	}
-
-	_, i, err := ic.Informers.Get(ctx, gvk, obj)
+	_, i, err := ic.getInformerForKind(ctx, gvk, obj)
 	if err != nil {
 		return nil, err
 	}
@@ -145,13 +146,32 @@ func (ic *informerCache) GetInformer(ctx context.Context, obj client.Object) (In
 		return nil, err
 	}
 
-	_, i, err := ic.Informers.Get(ctx, gvk, obj)
+	_, i, err := ic.getInformerForKind(ctx, gvk, obj)
 	if err != nil {
 		return nil, err
 	}
 	return i.Informer, err
 }
 
+func (ic *informerCache) getInformerForKind(ctx context.Context, gvk schema.GroupVersionKind, obj runtime.Object) (bool, *internal.Cache, error) {
+	var cache *internal.Cache
+	var started bool
+	var err error
+	switch ic.missPolicy {
+	case MissPolicyFail:
+		var ok bool
+		cache, started, ok = ic.Informers.Peek(gvk, obj)
+		if !ok {
+			err = &errors.ErrResourceNotCached{GVK: gvk}
+		}
+	case MissPolicyBackfill:
+		fallthrough
+	default:
+		started, cache, err = ic.Informers.Get(ctx, gvk, obj)
+	}
+	return started, cache, err
+}
+
 // NeedLeaderElection implements the LeaderElectionRunnable interface
 // to indicate that this can be started without requiring the leader lock.
 func (ic *informerCache) NeedLeaderElection() bool {
diff --git a/pkg/cache/internal/informers.go b/pkg/cache/internal/informers.go
index 09e0111114..63e4ed94e2 100644
--- a/pkg/cache/internal/informers.go
+++ b/pkg/cache/internal/informers.go
@@ -271,7 +271,8 @@ func (ip *Informers) WaitForCacheSync(ctx context.Context) bool {
 	return cache.WaitForCacheSync(ctx.Done(), ip.getHasSyncedFuncs()...)
 }
 
-func (ip *Informers) get(gvk schema.GroupVersionKind, obj runtime.Object) (res *Cache, started bool, ok bool) {
+// Peek attempts to get the informer for the GVK, but does not start one if one does not exist.
+func (ip *Informers) Peek(gvk schema.GroupVersionKind, obj runtime.Object) (res *Cache, started bool, ok bool) {
 	ip.mu.RLock()
 	defer ip.mu.RUnlock()
 	i, ok := ip.informersByType(obj)[gvk]
@@ -282,7 +283,7 @@ func (ip *Informers) get(gvk schema.GroupVersionKind, obj runtime.Object) (res *
 // the Informer from the map.
 func (ip *Informers) Get(ctx context.Context, gvk schema.GroupVersionKind, obj runtime.Object) (bool, *Cache, error) {
 	// Return the informer if it is found
-	i, started, ok := ip.get(gvk, obj)
+	i, started, ok := ip.Peek(gvk, obj)
 	if !ok {
 		var err error
 		if i, started, err = ip.addInformerToMap(gvk, obj); err != nil {
diff --git a/pkg/client/client.go b/pkg/client/client.go
index 21067b6f8f..45412b1169 100644
--- a/pkg/client/client.go
+++ b/pkg/client/client.go
@@ -31,6 +31,7 @@ import (
 	"k8s.io/client-go/kubernetes/scheme"
 	"k8s.io/client-go/metadata"
 	"k8s.io/client-go/rest"
+	cacheerrors "sigs.k8s.io/controller-runtime/pkg/cache/errors"
 
 	"sigs.k8s.io/controller-runtime/pkg/client/apiutil"
 	"sigs.k8s.io/controller-runtime/pkg/log"
@@ -82,8 +83,27 @@ type CacheOptions struct {
 	// Unstructured is a flag that indicates whether the cache-backed client should
 	// read unstructured objects or lists from the cache.
 	Unstructured bool
+	// MissPolicy determines how the client should behave when the underlying cache is
+	// configured to fail on a miss. See the CacheMissPolicies for documentation for each
+	// policy.
+	MissPolicy CacheMissPolicy
 }
 
+// CacheMissPolicy determines how the client should behave when the underlying cache is
+// configured to fail on a miss.
+type CacheMissPolicy string
+
+const (
+	// CacheMissPolicyFail configures the client to return an errors.NotFound error when a user
+	// requests a resource the cache is not configured to hold.
+	CacheMissPolicyFail CacheMissPolicy = "fail"
+
+	// CacheMissPolicyLiveLookup configures the client to issue a live client lookup to get data
+	// for a resource that is not cached.
+	// This is the default policy.
+	CacheMissPolicyLiveLookup CacheMissPolicy = "live-lookup"
+)
+
 // NewClientFunc allows a user to define how to create a client.
 type NewClientFunc func(config *rest.Config, options Options) (Client, error)
 
@@ -185,9 +205,15 @@ func newClient(config *rest.Config, options Options) (*client, error) {
 		return c, nil
 	}
 
+	// Default to looking up items when missing in the cache
+	if options.Cache.MissPolicy == "" {
+		options.Cache.MissPolicy = CacheMissPolicyLiveLookup
+	}
+
 	// We want a cache if we're here.
 	// Set the cache.
 	c.cache = options.Cache.Reader
+	c.cacheMissPolicy = options.Cache.MissPolicy
 
 	// Load uncached GVKs.
 	c.cacheUnstructured = options.Cache.Unstructured
@@ -214,6 +240,7 @@ type client struct {
 	mapper             meta.RESTMapper
 
 	cache             Reader
+	cacheMissPolicy   CacheMissPolicy
 	uncachedGVKs      map[schema.GroupVersionKind]struct{}
 	cacheUnstructured bool
 }
@@ -338,7 +365,10 @@ func (c *client) Get(ctx context.Context, key ObjectKey, obj Object, opts ...Get
 	if isUncached, err := c.shouldBypassCache(obj); err != nil {
 		return err
 	} else if !isUncached {
-		return c.cache.Get(ctx, key, obj, opts...)
+		err := c.cache.Get(ctx, key, obj, opts...)
+		if c.shouldReturnCacheErr(err) {
+			return err
+		}
 	}
 
 	switch obj.(type) {
@@ -353,12 +383,22 @@ func (c *client) Get(ctx context.Context, key ObjectKey, obj Object, opts ...Get
 	}
 }
 
+// shouldReturnCacheErr determines if we should return the error we got from the cache, which we do in every case
+// except for when we're configured to do live lookups and the cache returns cache.ErrResourceNotCached
+func (c *client) shouldReturnCacheErr(err error) bool {
+	var errNotCached *cacheerrors.ErrResourceNotCached
+	return c.cacheMissPolicy != CacheMissPolicyLiveLookup || !errors.As(err, &errNotCached)
+}
+
 // List implements client.Client.
 func (c *client) List(ctx context.Context, obj ObjectList, opts ...ListOption) error {
 	if isUncached, err := c.shouldBypassCache(obj); err != nil {
 		return err
 	} else if !isUncached {
-		return c.cache.List(ctx, obj, opts...)
+		err := c.cache.List(ctx, obj, opts...)
+		if c.shouldReturnCacheErr(err) {
+			return err
+		}
 	}
 
 	switch x := obj.(type) {
diff --git a/pkg/client/client_test.go b/pkg/client/client_test.go
index d8b3995050..08f36df4e8 100644
--- a/pkg/client/client_test.go
+++ b/pkg/client/client_test.go
@@ -19,6 +19,7 @@ package client_test
 import (
 	"context"
 	"encoding/json"
+	"errors"
 	"fmt"
 	"reflect"
 	"sync/atomic"
@@ -41,6 +42,7 @@ import (
 	"k8s.io/apimachinery/pkg/types"
 	kscheme "k8s.io/client-go/kubernetes/scheme"
 	"k8s.io/utils/pointer"
+	cacheerrors "sigs.k8s.io/controller-runtime/pkg/cache/errors"
 
 	"sigs.k8s.io/controller-runtime/examples/crd/pkg"
 	"sigs.k8s.io/controller-runtime/pkg/client"
@@ -143,6 +145,7 @@ var _ = Describe("Client", func() {
 	var count uint64 = 0
 	var replicaCount int32 = 2
 	var ns = "default"
+	var errNotCached *cacheerrors.ErrResourceNotCached
 	ctx := context.TODO()
 
 	BeforeEach(func() {
@@ -278,6 +281,50 @@ U5wwSivyi7vmegHKmblOzNVKA5qPO8zWzqBC
 			Expect(cache.Called).To(Equal(2))
 		})
 
+		It("should use the provided reader cache if provided, but fail when configured to on cache misses for get and list", func() {
+			c := &fakeUncachedReader{}
+			cl, err := client.New(cfg, client.Options{Cache: &client.CacheOptions{Reader: c, MissPolicy: client.CacheMissPolicyFail}})
+			Expect(err).NotTo(HaveOccurred())
+			Expect(cl).NotTo(BeNil())
+			Expect(errors.As(cl.Get(ctx, client.ObjectKey{Name: "test"}, &appsv1.Deployment{}), &errNotCached)).To(BeTrue())
+			Expect(errors.As(cl.List(ctx, &appsv1.DeploymentList{}), &errNotCached)).To(BeTrue())
+			Expect(c.Called).To(Equal(2))
+		})
+
+		It("should use the provided reader cache if provided, but live lookup when configured to on cache misses for get and list", func() {
+			cache := &fakeUncachedReader{}
+			cl, err := client.New(cfg, client.Options{Cache: &client.CacheOptions{Reader: cache, MissPolicy: client.CacheMissPolicyLiveLookup}})
+			Expect(err).NotTo(HaveOccurred())
+			Expect(cl).NotTo(BeNil())
+
+			By("creating the object")
+			err = cl.Create(context.TODO(), dep)
+			Expect(err).NotTo(HaveOccurred())
+
+			By("getting the object with a get")
+			var actual appsv1.Deployment
+			Expect(cl.Get(ctx, client.ObjectKey{Name: dep.Name, Namespace: dep.Namespace}, &actual)).To(Succeed())
+			Expect(dep).To(Equal(&actual))
+
+			By("getting the object with a list")
+			var list appsv1.DeploymentList
+			Expect(cl.List(ctx, &list)).To(Succeed())
+			Expect(list.Items).To(HaveLen(1))
+			Expect(dep).To(Equal(&list.Items[0]))
+
+			Expect(cache.Called).To(Equal(2))
+		})
+
+		It("should use the provided reader cache if provided, but forward cache errors even if configured to live lookup on cache misses for get and list", func() {
+			c := &fakeErrorReader{}
+			cl, err := client.New(cfg, client.Options{Cache: &client.CacheOptions{Reader: c, MissPolicy: client.CacheMissPolicyLiveLookup}})
+			Expect(err).NotTo(HaveOccurred())
+			Expect(cl).NotTo(BeNil())
+			Expect(cl.Get(ctx, client.ObjectKey{Name: "test"}, &appsv1.Deployment{})).To(MatchError(ContainSubstring("sentinel")))
+			Expect(cl.List(ctx, &appsv1.DeploymentList{})).To(MatchError(ContainSubstring("sentinel")))
+			Expect(c.Called).To(Equal(2))
+		})
+
 		It("should not use the provided reader cache if provided, on get and list for uncached GVKs", func() {
 			cache := &fakeReader{}
 			cl, err := client.New(cfg, client.Options{Cache: &client.CacheOptions{Reader: cache, DisableFor: []client.Object{&corev1.Namespace{}}}})
@@ -3938,6 +3985,34 @@ func (f *fakeReader) List(ctx context.Context, list client.ObjectList, opts ...c
 	return nil
 }
 
+type fakeUncachedReader struct {
+	Called int
+}
+
+func (f *fakeUncachedReader) Get(ctx context.Context, key client.ObjectKey, obj client.Object, opts ...client.GetOption) error {
+	f.Called++
+	return &cacheerrors.ErrResourceNotCached{}
+}
+
+func (f *fakeUncachedReader) List(ctx context.Context, list client.ObjectList, opts ...client.ListOption) error {
+	f.Called++
+	return &cacheerrors.ErrResourceNotCached{}
+}
+
+type fakeErrorReader struct {
+	Called int
+}
+
+func (f *fakeErrorReader) Get(ctx context.Context, key client.ObjectKey, obj client.Object, opts ...client.GetOption) error {
+	f.Called++
+	return errors.New("sentinel")
+}
+
+func (f *fakeErrorReader) List(ctx context.Context, list client.ObjectList, opts ...client.ListOption) error {
+	f.Called++
+	return errors.New("sentinel")
+}
+
 func ptr[T any](to T) *T {
 	return &to
 }