Skip to content

Commit

Permalink
feat: VPA for reconciler
Browse files Browse the repository at this point in the history
- Enable VPA for reconciler using annotation on RootSync/RepoSync:
  configsync.gke.io/reconciler-autoscaling-strategy: Auto
  - Auto - evict and recreate pods to apply recommended resource
    values, as needed.
  - Recommend - monitor and record recommended resource values for
    each reconciler, but don't automatically apply them.
  - Disabled - Do not apply any VPA config, and delete it if it
    exists with the same name as the reconciler.
- VPA disabled by default (opt-in for preview and testing)
- When VPA is enabled, use smaller resource requests for smaller
  footprint on initial install.
- Move regular (non-VPA) defaults out of a ConfigMap and into the
  reconciler-manager code, next to the new VPA resource defaults.
  This should make them easier to keep in sync.
  • Loading branch information
karlkfi committed Jul 12, 2023
1 parent ded035b commit c0a3b5e
Show file tree
Hide file tree
Showing 26 changed files with 1,841 additions and 199 deletions.
65 changes: 11 additions & 54 deletions e2e/testcases/override_resource_limits_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,12 +15,10 @@
package e2e

import (
"io/ioutil"
"testing"
"time"

appsv1 "k8s.io/api/apps/v1"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
"k8s.io/apimachinery/pkg/types"
"kpt.dev/configsync/e2e/nomostest"
Expand All @@ -35,51 +33,10 @@ import (
"kpt.dev/configsync/pkg/core"
"kpt.dev/configsync/pkg/kinds"
"kpt.dev/configsync/pkg/reconcilermanager"
"kpt.dev/configsync/pkg/reconcilermanager/controllers"
"kpt.dev/configsync/pkg/testing/fake"
"sigs.k8s.io/yaml"
)

func defaultResourceRequestsLimits(nt *nomostest.NT) (reconcilerRequests, reconcilerLimits, gitSyncRequests, gitSyncLimits corev1.ResourceList) {
path := "../../manifests/templates/reconciler-manager-configmap.yaml"
contents, err := ioutil.ReadFile(path)
if err != nil {
nt.T.Fatalf("Failed to read file (%s): %v", path, err)
}

var cm corev1.ConfigMap
if err := yaml.Unmarshal(contents, &cm); err != nil {
nt.T.Fatalf("Failed to parse the ConfigMap object in %s: %v", path, err)
}

key := "deployment.yaml"
deployContents, ok := cm.Data[key]
if !ok {
nt.T.Fatalf("The `data` field of the ConfigMap object in %s does not include the %q key", path, key)
}

var deploy appsv1.Deployment
if err := yaml.Unmarshal([]byte(deployContents), &deploy); err != nil {
nt.T.Fatalf("Failed to parse the Deployment object in the `data` field of the ConfigMap object in %s: %v", path, err)
}

for _, container := range deploy.Spec.Template.Spec.Containers {
if container.Name == reconcilermanager.Reconciler || container.Name == reconcilermanager.GitSync {
if container.Resources.Requests.Cpu().IsZero() || container.Resources.Requests.Memory().IsZero() {
nt.T.Fatalf("The %s container in %s should define CPU/memory requests", container.Name, path)
}
}
if container.Name == reconcilermanager.Reconciler {
reconcilerRequests = container.Resources.Requests
reconcilerLimits = container.Resources.Limits
}
if container.Name == reconcilermanager.GitSync {
gitSyncRequests = container.Resources.Requests
gitSyncLimits = container.Resources.Limits
}
}
return
}

func TestOverrideReconcilerResourcesV1Alpha1(t *testing.T) {
nt := nomostest.New(t, nomostesting.OverrideAPI, ntopts.SkipAutopilotCluster,
ntopts.NamespaceRepo(backendNamespace, configsync.RepoSyncName),
Expand All @@ -102,11 +59,11 @@ func TestOverrideReconcilerResourcesV1Alpha1(t *testing.T) {
}

// Get the default CPU/memory requests and limits of the reconciler container and the git-sync container
reconcilerResourceRequests, reconcilerResourceLimits, gitSyncResourceRequests, gitSyncResourceLimits := defaultResourceRequestsLimits(nt)
defaultReconcilerCPURequest, defaultReconcilerMemRequest := reconcilerResourceRequests[corev1.ResourceCPU], reconcilerResourceRequests[corev1.ResourceMemory]
defaultReconcilerCPULimits, defaultReconcilerMemLimits := reconcilerResourceLimits[corev1.ResourceCPU], reconcilerResourceLimits[corev1.ResourceMemory]
defaultGitSyncCPURequest, defaultGitSyncMemRequest := gitSyncResourceRequests[corev1.ResourceCPU], gitSyncResourceRequests[corev1.ResourceMemory]
defaultGitSyncCPULimits, defaultGitSyncMemLimits := gitSyncResourceLimits[corev1.ResourceCPU], gitSyncResourceLimits[corev1.ResourceMemory]
defaultResources := controllers.ReconcilerContainerResourceDefaults()
defaultReconcilerCPURequest, defaultReconcilerMemRequest := defaultResources[reconcilermanager.Reconciler].CPURequest, defaultResources[reconcilermanager.Reconciler].MemoryRequest
defaultReconcilerCPULimits, defaultReconcilerMemLimits := defaultResources[reconcilermanager.Reconciler].CPULimit, defaultResources[reconcilermanager.Reconciler].MemoryLimit
defaultGitSyncCPURequest, defaultGitSyncMemRequest := defaultResources[reconcilermanager.GitSync].CPURequest, defaultResources[reconcilermanager.GitSync].MemoryRequest
defaultGitSyncCPULimits, defaultGitSyncMemLimits := defaultResources[reconcilermanager.GitSync].CPULimit, defaultResources[reconcilermanager.GitSync].MemoryLimit

// Verify root-reconciler uses the default resource requests and limits
rootReconcilerDeployment := &appsv1.Deployment{}
Expand Down Expand Up @@ -405,11 +362,11 @@ func TestOverrideReconcilerResourcesV1Beta1(t *testing.T) {
}

// Get the default CPU/memory requests and limits of the reconciler container and the git-sync container
reconcilerResourceRequests, reconcilerResourceLimits, gitSyncResourceRequests, gitSyncResourceLimits := defaultResourceRequestsLimits(nt)
defaultReconcilerCPURequest, defaultReconcilerMemRequest := reconcilerResourceRequests[corev1.ResourceCPU], reconcilerResourceRequests[corev1.ResourceMemory]
defaultReconcilerCPULimits, defaultReconcilerMemLimits := reconcilerResourceLimits[corev1.ResourceCPU], reconcilerResourceLimits[corev1.ResourceMemory]
defaultGitSyncCPURequest, defaultGitSyncMemRequest := gitSyncResourceRequests[corev1.ResourceCPU], gitSyncResourceRequests[corev1.ResourceMemory]
defaultGitSyncCPULimits, defaultGitSyncMemLimits := gitSyncResourceLimits[corev1.ResourceCPU], gitSyncResourceLimits[corev1.ResourceMemory]
defaultResources := controllers.ReconcilerContainerResourceDefaults()
defaultReconcilerCPURequest, defaultReconcilerMemRequest := defaultResources[reconcilermanager.Reconciler].CPURequest, defaultResources[reconcilermanager.Reconciler].MemoryRequest
defaultReconcilerCPULimits, defaultReconcilerMemLimits := defaultResources[reconcilermanager.Reconciler].CPULimit, defaultResources[reconcilermanager.Reconciler].MemoryLimit
defaultGitSyncCPURequest, defaultGitSyncMemRequest := defaultResources[reconcilermanager.GitSync].CPURequest, defaultResources[reconcilermanager.GitSync].MemoryRequest
defaultGitSyncCPULimits, defaultGitSyncMemLimits := defaultResources[reconcilermanager.GitSync].CPULimit, defaultResources[reconcilermanager.GitSync].MemoryLimit

// Verify root-reconciler uses the default resource requests and limits
rootReconcilerDeployment := &appsv1.Deployment{}
Expand Down
1 change: 1 addition & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ require (
k8s.io/api v0.25.11
k8s.io/apiextensions-apiserver v0.25.11
k8s.io/apimachinery v0.25.11
k8s.io/autoscaler/vertical-pod-autoscaler v0.13.0
k8s.io/cli-runtime v0.25.11
k8s.io/client-go v0.25.11
k8s.io/cluster-registry v0.0.6
Expand Down
2 changes: 2 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -676,6 +676,8 @@ k8s.io/apimachinery v0.25.11 h1:2EhfdrSAMvBxsswvOGCEymKk4ZnHZkranuZqdR0rsO4=
k8s.io/apimachinery v0.25.11/go.mod h1:IFwbcNi3gKkfDhuy0VYu3+BwbxbiIov3p6FR8ge1Epc=
k8s.io/apiserver v0.25.11 h1:LDJbRe4bbjCUTy6F3LHflcfEdaYDNQdMrVEca+/mZ6w=
k8s.io/apiserver v0.25.11/go.mod h1:E3YRqMn6Hr9RBkBevjf/mtd8az2rfawRedig8Rinei4=
k8s.io/autoscaler/vertical-pod-autoscaler v0.13.0 h1:pH6AsxeBZcyX6KBqcnl7SPIJqbN1d59RrEBuIE6Rq6c=
k8s.io/autoscaler/vertical-pod-autoscaler v0.13.0/go.mod h1:LraL5kR2xX7jb4VMCG6/tUH4I75uRHlnzC0VWQHcyWk=
k8s.io/cli-runtime v0.25.11 h1:GE2yNZm1tN+MJtw1SGMOLesLF7Kp7NVAVqRSTbXfu4o=
k8s.io/cli-runtime v0.25.11/go.mod h1:r/nEINuHVEpgGhcd2WamU7hD1t/lMnSz8XM44Autltc=
k8s.io/client-go v0.25.11 h1:DJQ141UsbNRI6wYSlcYLP5J5BW5Wq7Bgm42Ztq2SW70=
Expand Down
27 changes: 0 additions & 27 deletions manifests/templates/reconciler-manager-configmap.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -75,10 +75,6 @@ data:
drop:
- NET_RAW
runAsUser: 65533
resources:
requests:
cpu: "10m"
memory: "100Mi"
- name: reconciler
image: RECONCILER_IMAGE_NAME
command:
Expand All @@ -98,10 +94,6 @@ data:
readOnly: true
- name: kube
mountPath: /.kube
resources:
requests:
cpu: "50m"
memory: "200Mi"
securityContext:
allowPrivilegeEscalation: false
readOnlyRootFilesystem: true
Expand All @@ -126,10 +118,6 @@ data:
drop:
- NET_RAW
runAsUser: 65533
resources:
requests:
cpu: "10m"
memory: "200Mi"
- name: oci-sync
image: OCI_SYNC_IMAGE_NAME
args: ["--root=/repo/source", "--dest=rev", "--max-sync-failures=30", "--error-file=error.json"]
Expand All @@ -144,10 +132,6 @@ data:
drop:
- NET_RAW
runAsUser: 65533
resources:
requests:
cpu: "10m"
memory: "200Mi"
- name: helm-sync
image: HELM_SYNC_IMAGE_NAME
args: ["--root=/repo/source", "--dest=rev", "--max-sync-failures=30", "--error-file=error.json"]
Expand All @@ -165,23 +149,12 @@ data:
drop:
- NET_RAW
runAsUser: 65533
resources:
requests:
cpu: "50m"
memory: "200Mi"
- name: otel-agent
image: gcr.io/config-management-release/otelcontribcol:v0.54.0-gke.1
command:
- /otelcontribcol
args:
- "--config=/conf/otel-agent-config.yaml"
resources:
limits:
cpu: 1
memory: 1Gi
requests:
cpu: 10m
memory: 100Mi
securityContext:
allowPrivilegeEscalation: false
readOnlyRootFilesystem: true
Expand Down
8 changes: 8 additions & 0 deletions pkg/core/scheme.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@ import (
apiextensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1"
apiextensionsv1beta1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1beta1"
utilruntime "k8s.io/apimachinery/pkg/util/runtime"

// autoscalingv1 "k8s.io/autoscaler/vertical-pod-autoscaler/pkg/apis/autoscaling.k8s.io/v1"
"k8s.io/client-go/kubernetes/scheme"
clusterregistry "k8s.io/cluster-registry/pkg/apis/clusterregistry/v1alpha1"
k8sadmissionv1 "k8s.io/kubernetes/pkg/apis/admission/v1"
Expand Down Expand Up @@ -60,6 +62,7 @@ var Scheme = scheme.Scheme
func init() {
mustRegisterKubernetesResources()
mustRegisterAPIExtensionsResources()
// mustRegisterAutosclaingResources()

// Config Sync types
utilruntime.Must(clusterregistry.AddToScheme(scheme.Scheme))
Expand Down Expand Up @@ -122,3 +125,8 @@ func mustRegisterAPIExtensionsResources() {

utilruntime.Must(scheme.Scheme.SetVersionPriority(apiextensionsv1.SchemeGroupVersion, apiextensionsv1beta1.SchemeGroupVersion))
}

// func mustRegisterAutosclaingResources() {
// utilruntime.Must(autoscalingv1.AddToScheme(scheme.Scheme))
// // autoscaling API has no generated defaults or conversions
// }
6 changes: 6 additions & 0 deletions pkg/kinds/kinds.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ import (
rbacv1beta1 "k8s.io/api/rbac/v1beta1"
"k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1beta1"
"k8s.io/apimachinery/pkg/runtime/schema"
autoscalingv1 "k8s.io/autoscaler/vertical-pod-autoscaler/pkg/apis/autoscaling.k8s.io/v1"
"kpt.dev/configsync/pkg/api/configmanagement"
v1 "kpt.dev/configsync/pkg/api/configmanagement/v1"
"kpt.dev/configsync/pkg/api/configsync"
Expand Down Expand Up @@ -263,3 +264,8 @@ func APIService() schema.GroupVersionKind {
func ValidatingWebhookConfiguration() schema.GroupVersionKind {
return admissionv1.SchemeGroupVersion.WithKind("ValidatingWebhookConfiguration")
}

// VerticalPodAutoscaler returns the VerticalPodAutoscaler kind.
func VerticalPodAutoscaler() schema.GroupVersionKind {
return autoscalingv1.SchemeGroupVersion.WithKind("VerticalPodAutoscaler")
}
25 changes: 25 additions & 0 deletions pkg/metadata/annotations.go
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,11 @@ const (
// reconciler-manager will create the reconciler with the hydration-controller
// sidecar container.
RequiresRenderingAnnotationKey = configsync.ConfigSyncPrefix + "requires-rendering"

// ReconcilerAutoscalingStrategyAnnotationKey is the annotation key set on
// RootSync/RepoSync objects to indicate whether to autoscale the reconciler
// deployment and what strategy to use.
ReconcilerAutoscalingStrategyAnnotationKey = configsync.ConfigSyncPrefix + "reconciler-autoscaling-strategy"
)

// Lifecycle annotations
Expand Down Expand Up @@ -228,3 +233,23 @@ const (
// This is the default behavior if the annotation is not specified.
DeletionPropagationPolicyOrphan = DeletionPropagationPolicy("Orphan")
)

// ReconcilerAutoscalingStrategy is the type used to identify value enums to use
// with the reconciler-autoscaling-strategy annotation.
type ReconcilerAutoscalingStrategy string

const (
// ReconcilerAutoscalingStrategyAuto indicates that a VPA config should be
// applied for the reconciler Deployment, if the VPA CRD is installed.
ReconcilerAutoscalingStrategyAuto = ReconcilerAutoscalingStrategy("Auto")

// ReconcilerAutoscalingStrategyRecommend indicates that a VPA config should
// be applied for the reconciler Deployment, if the VPA CRD is installed,
// but it should be configured only to monitor and make resource
// recommendations, not to apply them.
ReconcilerAutoscalingStrategyRecommend = ReconcilerAutoscalingStrategy("Recommend")

// ReconcilerAutoscalingStrategyDisabled indicates that a VPA config should
// NOT be applied for the reconciler Deployment.
ReconcilerAutoscalingStrategyDisabled = ReconcilerAutoscalingStrategy("Disabled")
)
16 changes: 9 additions & 7 deletions pkg/metadata/metadata.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,12 +48,13 @@ func GetNomosAnnotationKeys() []string {
// in the source repository.
// These annotations are set by Config Sync users.
var sourceAnnotations = map[string]bool{
NamespaceSelectorAnnotationKey: true,
LegacyClusterSelectorAnnotationKey: true,
ClusterNameSelectorAnnotationKey: true,
ResourceManagementKey: true,
LifecycleMutationAnnotation: true,
DeletionPropagationPolicyAnnotationKey: true,
NamespaceSelectorAnnotationKey: true,
LegacyClusterSelectorAnnotationKey: true,
ClusterNameSelectorAnnotationKey: true,
ResourceManagementKey: true,
LifecycleMutationAnnotation: true,
DeletionPropagationPolicyAnnotationKey: true,
ReconcilerAutoscalingStrategyAnnotationKey: true,
}

// IsSourceAnnotation returns true if the annotation is a ConfigSync source
Expand All @@ -73,7 +74,8 @@ func IsConfigSyncAnnotationKey(k string) bool {
return HasConfigSyncPrefix(k) ||
strings.HasPrefix(k, LifecycleMutationAnnotation) ||
k == OwningInventoryKey ||
k == DeletionPropagationPolicyAnnotationKey
k == DeletionPropagationPolicyAnnotationKey ||
k == ReconcilerAutoscalingStrategyAnnotationKey
}

// isConfigSyncAnnotation returns whether an annotation is a Config Sync annotation.
Expand Down
12 changes: 12 additions & 0 deletions pkg/reconcilermanager/controllers/errors.go
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,12 @@ func (ooe *ObjectOperationError) Error() string {
ooe.ID.Kind, ooe.ID.ObjectKey, ooe.Operation, ooe.Cause)
}

// Unwrap returns the cause of the error, to allow type checking with errors.Is
// and errors.As.
func (ooe *ObjectOperationError) Unwrap() error {
return ooe.Cause
}

// NewObjectOperationError constructs a new ObjectOperationError
func NewObjectOperationError(err error, obj client.Object, op Operation) *ObjectOperationError {
id := core.IDOf(obj)
Expand Down Expand Up @@ -144,6 +150,12 @@ func (oripe *ObjectReconcileError) Error() string {
oripe.ID.Kind, oripe.ID.ObjectKey, oripe.Status, oripe.Cause)
}

// Unwrap returns the cause of the error, to allow type checking with errors.Is
// and errors.As.
func (oripe *ObjectReconcileError) Unwrap() error {
return oripe.Cause
}

// NewObjectReconcileError constructs a new ObjectReconcileError
func NewObjectReconcileError(err error, obj client.Object, status kstatus.Status) *ObjectReconcileError {
id := core.IDOf(obj)
Expand Down
19 changes: 19 additions & 0 deletions pkg/reconcilermanager/controllers/garbage_collector.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ import (
"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
"k8s.io/apimachinery/pkg/types"
"k8s.io/apimachinery/pkg/util/wait"
autoscalingv1 "k8s.io/autoscaler/vertical-pod-autoscaler/pkg/apis/autoscaling.k8s.io/v1"
"kpt.dev/configsync/pkg/core"
"kpt.dev/configsync/pkg/kinds"
"kpt.dev/configsync/pkg/reconcilermanager"
Expand Down Expand Up @@ -204,3 +205,21 @@ func (r *RootSyncReconciler) deleteClusterRoleBinding(ctx context.Context, recon
}
return nil
}

func (r *reconcilerBase) deleteVerticalPodAutoscaler(ctx context.Context, reconcilerRef types.NamespacedName) error {
vpaRef := reconcilerRef
vpaEnabled, err := r.isVPAEnabled()
if err != nil {
return err
}
if !vpaEnabled {
r.logger(ctx).Info("Managed object delete skipped - not enabled",
logFieldObjectRef, vpaRef.String(),
logFieldObjectKind, "VerticalPodAutoscaler")
return nil
}
obj := &autoscalingv1.VerticalPodAutoscaler{}
obj.Name = vpaRef.Name
obj.Namespace = vpaRef.Namespace
return r.cleanup(ctx, obj)
}
Loading

0 comments on commit c0a3b5e

Please sign in to comment.