Use paginated lists in sharder resyncs (#39)
* Drop unused utilities

* Improve evaluation docs

* Log when shard gets unavailable

* Add pager implementation for controller-runtime

* Use paginated lists in sharder resyncs

* Align query step with scrape interval

cAdvisor is only scraped every 30s, hence the query step should also be 30s.

* Update evaluation results
timebertt committed Dec 12, 2023
1 parent 418e85d commit 1c5b7c1
Showing 11 changed files with 236 additions and 51 deletions.
4 changes: 2 additions & 2 deletions docs/assets/comparison-cpu.jpg
4 changes: 2 additions & 2 deletions docs/assets/comparison-memory.jpg
7 changes: 5 additions & 2 deletions docs/evaluation.md
@@ -117,6 +117,9 @@ kubectl apply -f hack/config/shoot.yaml
# gardenctl target --shoot ...
kubectl apply --server-side -k hack/config/external-dns
kubectl -n external-dns create secret generic google-clouddns-timebertt-dev --from-literal project=$PROJECT_NAME --from-file service-account.json=$SERVICE_ACCOUNT_FILE
+
+# gardenctl target --control-plane
+kubectl apply --server-side -k hack/config/policy/controlplane
```

In addition to the described components, [kyverno](https://github.com/kyverno/kyverno) is deployed to the cluster itself (shoot cluster) and to the control plane (seed cluster).
@@ -128,7 +131,7 @@ This is done to make load test experiments more stable and their results more
## Key Results

> [!NOTE]
> These are preliminary results from a first set of test runs.
> TODO: update these once the full evaluation is completed.

The following graphs compare CPU and memory usage of the components in three different setups when running the `basic` experiment scenario (~8,000 websites created over 10m):
@@ -143,5 +146,5 @@ The following graphs compare CPU and memory usage of the components in three dif

The new external sharding approach proves to scale best.
The individual shards consume about a third of the singleton controller's usage (close to optimum).
-Also, the sharder pods consume a low amount of resources which is almost static apart from periodic resyncs every 5 minutes.
+Also, the sharder pods consume a low static amount of resources.
Most importantly, the sharder's resource usage is independent of the number of sharded objects.
2 changes: 1 addition & 1 deletion hack/config/external-dns/patch-deployment.yaml
@@ -18,7 +18,7 @@ spec:
        - --google-zone-visibility=public
        - --policy=sync
        - --registry=txt
-       - --txt-owner-id=shoot--timebertt--sharding-d657103c-eb4f-4a02-9af3-8ee7dc8d6e12-ond-82dc04
+       - --txt-owner-id=shoot--timebertt--sharding-0ed260ae-7e79-45a3-8e6d-2f30c68a4e42-ond-1d3ea8
        - --interval=1m
        # ensure the records are not owned by short-lived acme solvers managed by cert-manager or website ingresses
        - --label-filter=acme.cert-manager.io/http01-solver!=true,app!=website
37 changes: 21 additions & 16 deletions pkg/controller/sharder/reconciler.go
@@ -42,6 +42,7 @@ import (
"github.com/timebertt/kubernetes-controller-sharding/pkg/sharding/ring"
utilclient "github.com/timebertt/kubernetes-controller-sharding/pkg/utils/client"
utilerrors "github.com/timebertt/kubernetes-controller-sharding/pkg/utils/errors"
"github.com/timebertt/kubernetes-controller-sharding/pkg/utils/pager"
)

//+kubebuilder:rbac:groups=sharding.timebertt.dev,resources=clusterrings,verbs=get;list;watch
@@ -161,26 +162,30 @@ func (r *Reconciler) resyncResource(
return fmt.Errorf("no kinds found for resource %q", gr.String())
}

var allErrs *multierror.Error

list := &metav1.PartialObjectMetadataList{}
list.SetGroupVersionKind(gvks[0])
// List a recent version from the API server's watch cache by setting resourceVersion=0. This reduces the load on etcd
// for ring resyncs. Listing from etcd with quorum read would be a scalability limitation/bottleneck.
// If we try to move or drain an object with an old resourceVersion (conflict error), we will retry with exponential
// backoff.
// This trades retries for smaller impact of periodic resyncs (that don't require any action).
if err := r.Reader.List(ctx, list, utilclient.ResourceVersion("0")); err != nil {
return fmt.Errorf("error listing %s: %w", gr.String(), err)
err = pager.New(r.Reader).EachListItem(ctx, list,
func(obj client.Object) error {
if !namespaces.Has(obj.GetNamespace()) {
return nil
}

allErrs = multierror.Append(allErrs, r.resyncObject(ctx, log, gr, obj.(*metav1.PartialObjectMetadata), ring, hashRing, shards, controlled))
return nil
},
// List a recent version from the API server's watch cache by setting resourceVersion=0. This reduces the load on etcd
// for ring resyncs. Listing from etcd with quorum read would be a scalability limitation/bottleneck.
// If we try to move or drain an object with an old resourceVersion (conflict error), we will retry with exponential
// backoff.
// This trades retries for smaller impact of periodic resyncs (that don't require any action).
utilclient.ResourceVersion("0"),
)
if err != nil {
allErrs = multierror.Append(allErrs, fmt.Errorf("error listing %s: %w", gr.String(), err))
}

var allErrs *multierror.Error
for _, o := range list.Items {
obj := o
if !namespaces.Has(obj.Namespace) {
continue
}

allErrs = multierror.Append(allErrs, r.resyncObject(ctx, log, gr, &obj, ring, hashRing, shards, controlled))
}
return allErrs.ErrorOrNil()
}
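To make the comment's trade-off concrete, the list options the pager effectively sends per page look roughly like this. This is a sketch assuming the default page size of 500; the function and variable names are illustrative, not part of this commit:

```go
package example

import (
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"sigs.k8s.io/controller-runtime/pkg/client"
)

// listOptionsPerPage sketches the options a paginated resync effectively
// sends for the first page and for every subsequent page.
func listOptionsPerPage(continueToken string) (first, next *client.ListOptions) {
	// first page: resourceVersion=0 is answered from the API server's watch
	// cache instead of a quorum read from etcd
	first = &client.ListOptions{
		Limit: 500,
		Raw:   &metav1.ListOptions{ResourceVersion: "0"},
	}

	// subsequent pages: the continue token already pins the list snapshot, so
	// ResourceVersion(Match) is cleared to avoid the API server's
	// `specifying resource version is not allowed when using continue` error
	next = &client.ListOptions{
		Limit:    500,
		Continue: continueToken,
		Raw:      &metav1.ListOptions{},
	}

	return first, next
}
```

The price for the cache read is that an object's resourceVersion may be stale: moving or draining such an object fails with a conflict and is retried with backoff, which is cheap compared to quorum-reading the whole ring on every resync.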

6 changes: 5 additions & 1 deletion pkg/controller/shardlease/reconciler.go
@@ -91,7 +91,7 @@ func (r *Reconciler) Reconcile(ctx context.Context, req reconcile.Request) (reco
	switch shard.State {
	case leases.Ready:
		if previousState != leases.Ready {
-			log.Info("Shard got ready")
+			log.Info("Shard got available")
		}
		requeueAfter = shard.Times.ToExpired
	case leases.Expired:
@@ -119,6 +119,10 @@ func (r *Reconciler) Reconcile(ctx context.Context, req reconcile.Request) (reco
		// requeue with leaseDuration just to be sure
		requeueAfter = shard.Times.LeaseDuration
	case leases.Dead:
+		if previousState != leases.Dead {
+			log.Info("Shard got unavailable")
+		}
+
		// garbage collect later
		requeueAfter = shard.Times.ToOrphaned
	case leases.Orphaned:
198 changes: 198 additions & 0 deletions pkg/utils/pager/pager.go
@@ -0,0 +1,198 @@
/*
Copyright 2023 Tim Ebert.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package pager

import (
"context"
"fmt"

"k8s.io/apimachinery/pkg/api/meta"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
"sigs.k8s.io/controller-runtime/pkg/client"
)

const (
defaultPageSize = 500
defaultPageBufferSize = 10
)

// New creates a new pager from the provided reader using the default options.
func New(reader client.Reader) *ListPager {
return &ListPager{
Reader: reader,
PageSize: defaultPageSize,
PageBufferSize: defaultPageBufferSize,
}
}

// ListPager assists client code in breaking large list queries into multiple smaller chunks of PageSize or smaller.
// The pager does not alter any fields on the initial options list except Continue.
// It is implemented like client-go's pager but uses a controller-runtime client,
// see https://github.com/kubernetes/client-go/blob/release-1.28/tools/pager/pager.go.
// Exception: this ListPager also fixes the `specifying resource version is not allowed when using continue` error
// in EachListItem and EachListItemWithAlloc.
type ListPager struct {
Reader client.Reader

// PageSize is the maximum number of objects to retrieve in individual list calls.
// If a client.Limit option is passed, the pager uses the option's value instead.
PageSize int64
// Number of pages to buffer in EachListItem and EachListItemWithAlloc.
PageBufferSize int32
}

// EachListItem fetches runtime.Object items using this ListPager and invokes fn on each item. If
// fn returns an error, processing stops and that error is returned. If fn does not return an error,
// any error encountered while retrieving the list from the server is returned. If the context
// cancels or times out, the context error is returned. Since the list is retrieved in paginated
// chunks, an "Expired" error (metav1.StatusReasonExpired) may be returned if the pagination list
// requests exceed the expiration limit of the apiserver being called.
//
// Items are retrieved in chunks from the server to reduce the impact on the server with up to
// ListPager.PageBufferSize chunks buffered concurrently in the background.
//
// If items passed to fn are retained for different durations, and you want to avoid
// retaining the whole slice returned by each list call as long as any item is referenced,
// use EachListItemWithAlloc instead.
func (p *ListPager) EachListItem(ctx context.Context, list client.ObjectList, fn func(obj client.Object) error, opts ...client.ListOption) error {
return p.eachListChunkBuffered(ctx, list, func(list client.ObjectList) error {
return meta.EachListItem(list, func(obj runtime.Object) error {
return fn(obj.(client.Object))
})
}, opts...)
}

// EachListItemWithAlloc works like EachListItem, but avoids retaining references to the items slice returned by each list call.
// It does this by making a shallow copy of non-pointer items before invoking fn.
//
// If the items passed to fn are not retained, or are retained for the same duration, use EachListItem instead for memory efficiency.
func (p *ListPager) EachListItemWithAlloc(ctx context.Context, list client.ObjectList, fn func(obj client.Object) error, opts ...client.ListOption) error {
	return p.eachListChunkBuffered(ctx, list, func(list client.ObjectList) error {
		return meta.EachListItemWithAlloc(list, func(obj runtime.Object) error {
			return fn(obj.(client.Object))
		})
	}, opts...)
}

// eachListChunkBuffered fetches runtimeObject list chunks using this ListPager and invokes fn on
// each list chunk. If fn returns an error, processing stops and that error is returned. If fn does
// not return an error, any error encountered while retrieving the list from the server is
// returned. If the context cancels or times out, the context error is returned. Since the list is
// retrieved in paginated chunks, an "Expired" error (metav1.StatusReasonExpired) may be returned if
// the pagination list requests exceed the expiration limit of the apiserver being called.
//
// Up to ListPager.PageBufferSize chunks are buffered concurrently in the background.
func (p *ListPager) eachListChunkBuffered(ctx context.Context, list client.ObjectList, fn func(list client.ObjectList) error, opts ...client.ListOption) error {
if p.PageBufferSize < 0 {
return fmt.Errorf("ListPager.PageBufferSize must be >= 0, got %d", p.PageBufferSize)
}

// Ensure background goroutine is stopped if this call exits before all list items are
// processed. Cancellation error from this deferred cancel call is never returned to caller;
// either the list result has already been sent to bgResultC or the fn error is returned and
// the cancellation error is discarded.
ctx, cancel := context.WithCancel(ctx)
defer cancel()

chunkC := make(chan client.ObjectList, p.PageBufferSize)
bgResultC := make(chan error, 1)
go func() {
defer utilruntime.HandleCrash()

var err error
defer func() {
close(chunkC)
bgResultC <- err
}()
err = p.eachListChunk(ctx, list, func(chunk client.ObjectList) error {
select {
case chunkC <- chunk: // buffer the chunk, this can block
case <-ctx.Done():
return ctx.Err()
}
return nil
}, opts...)
}()

for o := range chunkC {
select {
case <-ctx.Done():
return ctx.Err()
default:
}
err := fn(o)
if err != nil {
return err // any fn error should be returned immediately
}
}
// promote the results of our background goroutine to the foreground
return <-bgResultC
}

// eachListChunk fetches runtimeObject list chunks using this ListPager and invokes fn on each list
// chunk. If fn returns an error, processing stops and that error is returned. If fn does not return
// an error, any error encountered while retrieving the list from the server is returned. If the
// context cancels or times out, the context error is returned. Since the list is retrieved in
// paginated chunks, an "Expired" error (metav1.StatusReasonExpired) may be returned if the
// pagination list requests exceed the expiration limit of the apiserver being called.
func (p *ListPager) eachListChunk(ctx context.Context, list client.ObjectList, fn func(list client.ObjectList) error, opts ...client.ListOption) error {
options := &client.ListOptions{}
options.ApplyOptions(opts)

if options.Limit == 0 {
options.Limit = p.PageSize
}
if options.Limit < 0 {
return fmt.Errorf("ListPager.PageSize must be >= 0, got %d", options.Limit)
}

for {
select {
case <-ctx.Done():
return ctx.Err()
default:
}

// create a new list to not share memory between paginated lists
list = list.DeepCopyObject().(client.ObjectList)
if err := p.Reader.List(ctx, list, options); err != nil {
return err
}

if err := fn(list); err != nil {
return err
}

// if we have no more items, return.
if len(list.GetContinue()) == 0 {
return nil
}

// set the next loop up
options.Continue = list.GetContinue()
// Clear the ResourceVersion(Match) on the subsequent List calls to avoid the
// `specifying resource version is not allowed when using continue` error.
// See https://github.com/kubernetes/kubernetes/issues/85221#issuecomment-553748143.
if options.Raw == nil {
options.Raw = &metav1.ListOptions{}
}
options.Raw.ResourceVersion = ""
options.Raw.ResourceVersionMatch = ""
}
}
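For reference, here is a minimal usage sketch of this pager. It is a hypothetical standalone program; the uncached client and the ConfigMap GVK are arbitrary illustration choices, not part of this commit:

```go
package main

import (
	"context"
	"fmt"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/runtime/schema"
	"sigs.k8s.io/controller-runtime/pkg/client"
	"sigs.k8s.io/controller-runtime/pkg/client/config"

	"github.com/timebertt/kubernetes-controller-sharding/pkg/utils/pager"
)

func main() {
	ctx := context.Background()

	// any client.Reader works, e.g. a direct (uncached) client
	c, err := client.New(config.GetConfigOrDie(), client.Options{})
	if err != nil {
		panic(err)
	}

	// a metadata-only list keeps the per-object memory footprint small
	list := &metav1.PartialObjectMetadataList{}
	list.SetGroupVersionKind(schema.GroupVersionKind{Version: "v1", Kind: "ConfigMap"})

	// objects are streamed in chunks of ListPager.PageSize (500 by default)
	// instead of being returned in one potentially huge list response
	if err := pager.New(c).EachListItem(ctx, list, func(obj client.Object) error {
		fmt.Println(obj.GetNamespace() + "/" + obj.GetName())
		return nil
	}); err != nil {
		panic(err)
	}
}
```

Because the pager takes a controller-runtime client.Reader and client.ObjectList, the same code path works for typed, unstructured, and metadata-only lists, which is what the sharder's resync above relies on.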
2 changes: 1 addition & 1 deletion webhosting-operator/cmd/measure/main.go
@@ -51,7 +51,7 @@ var (
	queryRange = v1.Range{
		Start: now.Add(-15 * time.Minute),
		End:   now,
-		Step:  15 * time.Second,
+		Step:  30 * time.Second,
	}
rateInterval = 5 * time.Minute
)
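For context on the step change, here is a hedged sketch of the kind of range query the measure tool issues; the Prometheus address and the example query are assumptions, not taken from this repository. cAdvisor produces a new sample only every 30s, so a 15s step merely repeats each sample; aligning the step with the scrape interval yields exactly one point per scrape:

```go
package main

import (
	"context"
	"fmt"
	"time"

	"github.com/prometheus/client_golang/api"
	v1 "github.com/prometheus/client_golang/api/prometheus/v1"
)

func main() {
	// hypothetical Prometheus endpoint
	c, err := api.NewClient(api.Config{Address: "http://localhost:9090"})
	if err != nil {
		panic(err)
	}
	promAPI := v1.NewAPI(c)

	now := time.Now()
	r := v1.Range{
		Start: now.Add(-15 * time.Minute),
		End:   now,
		// one point per cAdvisor scrape; a smaller step would only duplicate samples
		Step: 30 * time.Second,
	}

	// example query; the 5m rate window corresponds to the rateInterval above
	result, warnings, err := promAPI.QueryRange(context.Background(),
		`sum(rate(container_cpu_usage_seconds_total[5m]))`, r)
	if err != nil {
		panic(err)
	}
	if len(warnings) > 0 {
		fmt.Println("warnings:", warnings)
	}
	fmt.Println(result)
}
```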
2 changes: 1 addition & 1 deletion webhosting-operator/cmd/webhosting-operator/main.go
@@ -48,7 +48,7 @@ import (
	configv1alpha1 "github.com/timebertt/kubernetes-controller-sharding/webhosting-operator/pkg/apis/config/v1alpha1"
	webhostingv1alpha1 "github.com/timebertt/kubernetes-controller-sharding/webhosting-operator/pkg/apis/webhosting/v1alpha1"
	"github.com/timebertt/kubernetes-controller-sharding/webhosting-operator/pkg/controllers/webhosting"
-	"github.com/timebertt/kubernetes-controller-sharding/webhosting-operator/pkg/routes"
+	"github.com/timebertt/kubernetes-controller-sharding/webhosting-operator/pkg/utils/routes"
	//+kubebuilder:scaffold:imports
)

File renamed without changes.
25 changes: 0 additions & 25 deletions webhosting-operator/pkg/utils/utils.go
@@ -20,36 +20,11 @@ import (
"math/rand"
)

// CopyMap returns a new map with the same contents as the given map.
func CopyMap[K comparable, V any](in map[K]V) map[K]V {
out := make(map[K]V, len(in))
for k, v := range in {
out[k] = v
}
return out
}

// PickRandom picks a random element from the given slice.
func PickRandom[T any](in []T) T {
return in[rand.Intn(len(in))]
}

// PickNRandom picks n random elements from the given slice.
func PickNRandom[T any](in []T, n int) []T {
if n <= 0 {
return nil
}
if n >= len(in) {
return in
}

rand.Shuffle(len(in), func(i, j int) {
in[i], in[j] = in[j], in[i]
})

return in[:n]
}

// RandomName generates a random string with n characters that can be used as part of API object names.
func RandomName(n int) string {
const charset = "abcdefghijklmnopqrstuvwxyz"
