Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

WIP: Add status conditions for nodeClass #6438

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 13 additions & 6 deletions pkg/apis/v1/ec2nodeclass_status.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,13 @@ import (
v1 "k8s.io/api/core/v1"
)

const (
ConditionTypeSubnetsReady = "SubnetsReady"
ConditionTypeSecurityGroupsReady = "SecurityGroupsReady"
ConditionTypeAMIsReady = "AMIsReady"
ConditionTypeInstanceProfileReady = "InstanceProfileReady"
)

// Subnet contains resolved Subnet selector values utilized for node launch
type Subnet struct {
// ID of the subnet
Expand Down Expand Up @@ -77,13 +84,13 @@ type EC2NodeClassStatus struct {
Conditions []op.Condition `json:"conditions,omitempty"`
}

const (
// ConditionTypeNodeClassReady = "Ready" condition indicates that subnets, security groups, AMIs and instance profile for nodeClass were resolved
ConditionTypeNodeClassReady = "Ready"
)

func (in *EC2NodeClass) StatusConditions() op.ConditionSet {
return op.NewReadyConditions(ConditionTypeNodeClassReady).For(in)
return op.NewReadyConditions(
ConditionTypeAMIsReady,
ConditionTypeSubnetsReady,
ConditionTypeSecurityGroupsReady,
ConditionTypeInstanceProfileReady,
).For(in)
}

func (in *EC2NodeClass) GetConditions() []op.Condition {
Expand Down
14 changes: 13 additions & 1 deletion pkg/apis/v1beta1/ec2nodeclass_status.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,13 @@ import (
v1 "k8s.io/api/core/v1"
)

const (
ConditionTypeSubnetsReady = "SubnetsReady"
ConditionTypeSecurityGroupsReady = "SecurityGroupsReady"
ConditionTypeAMIsReady = "AMIsReady"
ConditionTypeInstanceProfileReady = "InstanceProfileReady"
)

// Subnet contains resolved Subnet selector values utilized for node launch
type Subnet struct {
// ID of the subnet
Expand Down Expand Up @@ -78,7 +85,12 @@ type EC2NodeClassStatus struct {
}

func (in *EC2NodeClass) StatusConditions() status.ConditionSet {
return status.NewReadyConditions().For(in)
return status.NewReadyConditions(
ConditionTypeAMIsReady,
ConditionTypeSubnetsReady,
ConditionTypeSecurityGroupsReady,
ConditionTypeInstanceProfileReady,
).For(in)
}

func (in *EC2NodeClass) GetConditions() []status.Condition {
Expand Down
29 changes: 20 additions & 9 deletions pkg/cloudprovider/suite_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,10 @@ package cloudprovider_test
import (
"context"
"fmt"
"github.com/aws/karpenter-provider-aws/pkg/controllers/nodeclass/status/ami"
"github.com/aws/karpenter-provider-aws/pkg/controllers/nodeclass/status/instance_profile"
"github.com/aws/karpenter-provider-aws/pkg/controllers/nodeclass/status/security_group"
"github.com/aws/karpenter-provider-aws/pkg/controllers/nodeclass/status/subnet"
"net"
"strings"
"testing"
Expand All @@ -40,7 +44,6 @@ import (
"github.com/aws/karpenter-provider-aws/pkg/apis"
"github.com/aws/karpenter-provider-aws/pkg/apis/v1beta1"
"github.com/aws/karpenter-provider-aws/pkg/cloudprovider"
"github.com/aws/karpenter-provider-aws/pkg/controllers/nodeclass/status"
"github.com/aws/karpenter-provider-aws/pkg/fake"
"github.com/aws/karpenter-provider-aws/pkg/operator/options"
"github.com/aws/karpenter-provider-aws/pkg/test"
Expand Down Expand Up @@ -1109,9 +1112,11 @@ var _ = Describe("CloudProvider", func() {
{SubnetId: aws.String("test-subnet-2"), AvailabilityZone: aws.String("test-zone-1a"), AvailabilityZoneId: aws.String("tstz1-1a"), AvailableIpAddressCount: aws.Int64(100),
Tags: []*ec2.Tag{{Key: aws.String("Name"), Value: aws.String("test-subnet-2")}}},
}})
controller := status.NewController(env.Client, awsEnv.SubnetProvider, awsEnv.SecurityGroupProvider, awsEnv.AMIProvider, awsEnv.InstanceProfileProvider, awsEnv.LaunchTemplateProvider)
ExpectApplied(ctx, env.Client, nodePool, nodeClass)
ExpectObjectReconciled(ctx, env.Client, controller, nodeClass)
ExpectObjectReconciled(ctx, env.Client, subnet.NewController(env.Client, awsEnv.SubnetProvider), nodeClass)
ExpectObjectReconciled(ctx, env.Client, ami.NewController(env.Client, awsEnv.AMIProvider), nodeClass)
ExpectObjectReconciled(ctx, env.Client, instance_profile.NewController(env.Client, awsEnv.InstanceProfileProvider), nodeClass)
ExpectObjectReconciled(ctx, env.Client, security_group.NewController(env.Client, awsEnv.SecurityGroupProvider), nodeClass)
pod := coretest.UnschedulablePod(coretest.PodOptions{NodeSelector: map[string]string{v1.LabelTopologyZone: "test-zone-1a"}})
ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, prov, pod)
ExpectScheduled(ctx, env.Client, pod)
Expand All @@ -1126,10 +1131,12 @@ var _ = Describe("CloudProvider", func() {
{SubnetId: aws.String("test-subnet-2"), AvailabilityZone: aws.String("test-zone-1a"), AvailabilityZoneId: aws.String("tstz1-1a"), AvailableIpAddressCount: aws.Int64(11),
Tags: []*ec2.Tag{{Key: aws.String("Name"), Value: aws.String("test-subnet-2")}}},
}})
controller := status.NewController(env.Client, awsEnv.SubnetProvider, awsEnv.SecurityGroupProvider, awsEnv.AMIProvider, awsEnv.InstanceProfileProvider, awsEnv.LaunchTemplateProvider)
nodePool.Spec.Template.Spec.Kubelet = &corev1beta1.KubeletConfiguration{MaxPods: aws.Int32(1)}
ExpectApplied(ctx, env.Client, nodePool, nodeClass)
ExpectObjectReconciled(ctx, env.Client, controller, nodeClass)
ExpectObjectReconciled(ctx, env.Client, subnet.NewController(env.Client, awsEnv.SubnetProvider), nodeClass)
ExpectObjectReconciled(ctx, env.Client, ami.NewController(env.Client, awsEnv.AMIProvider), nodeClass)
ExpectObjectReconciled(ctx, env.Client, instance_profile.NewController(env.Client, awsEnv.InstanceProfileProvider), nodeClass)
ExpectObjectReconciled(ctx, env.Client, security_group.NewController(env.Client, awsEnv.SecurityGroupProvider), nodeClass)
pod1 := coretest.UnschedulablePod(coretest.PodOptions{NodeSelector: map[string]string{v1.LabelTopologyZone: "test-zone-1a"}})
pod2 := coretest.UnschedulablePod(coretest.PodOptions{NodeSelector: map[string]string{v1.LabelTopologyZone: "test-zone-1a"}})
ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, prov, pod1, pod2)
Expand Down Expand Up @@ -1164,14 +1171,15 @@ var _ = Describe("CloudProvider", func() {
}})
nodeClass.Spec.SubnetSelectorTerms = []v1beta1.SubnetSelectorTerm{{Tags: map[string]string{"Name": "test-subnet-1"}}}
ExpectApplied(ctx, env.Client, nodePool, nodeClass)
controller := status.NewController(env.Client, awsEnv.SubnetProvider, awsEnv.SecurityGroupProvider, awsEnv.AMIProvider, awsEnv.InstanceProfileProvider, awsEnv.LaunchTemplateProvider)
ExpectObjectReconciled(ctx, env.Client, controller, nodeClass)
ExpectObjectReconciled(ctx, env.Client, subnet.NewController(env.Client, awsEnv.SubnetProvider), nodeClass)
ExpectObjectReconciled(ctx, env.Client, ami.NewController(env.Client, awsEnv.AMIProvider), nodeClass)
ExpectObjectReconciled(ctx, env.Client, instance_profile.NewController(env.Client, awsEnv.InstanceProfileProvider), nodeClass)
ExpectObjectReconciled(ctx, env.Client, security_group.NewController(env.Client, awsEnv.SecurityGroupProvider), nodeClass)
podSubnet1 := coretest.UnschedulablePod()
ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, prov, podSubnet1)
ExpectScheduled(ctx, env.Client, podSubnet1)
createFleetInput := awsEnv.EC2API.CreateFleetBehavior.CalledWithInput.Pop()
Expect(fake.SubnetsFromFleetRequest(createFleetInput)).To(ConsistOf("test-subnet-1"))

nodeClass2 := test.EC2NodeClass(v1beta1.EC2NodeClass{
Spec: v1beta1.EC2NodeClassSpec{
SubnetSelectorTerms: []v1beta1.SubnetSelectorTerm{
Expand Down Expand Up @@ -1208,7 +1216,10 @@ var _ = Describe("CloudProvider", func() {
},
})
ExpectApplied(ctx, env.Client, nodePool2, nodeClass2)
ExpectObjectReconciled(ctx, env.Client, controller, nodeClass2)
ExpectObjectReconciled(ctx, env.Client, subnet.NewController(env.Client, awsEnv.SubnetProvider), nodeClass2)
ExpectObjectReconciled(ctx, env.Client, ami.NewController(env.Client, awsEnv.AMIProvider), nodeClass2)
ExpectObjectReconciled(ctx, env.Client, instance_profile.NewController(env.Client, awsEnv.InstanceProfileProvider), nodeClass2)
ExpectObjectReconciled(ctx, env.Client, security_group.NewController(env.Client, awsEnv.SecurityGroupProvider), nodeClass2)
podSubnet2 := coretest.UnschedulablePod(coretest.PodOptions{NodeSelector: map[string]string{corev1beta1.NodePoolLabelKey: nodePool2.Name}})
ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, prov, podSubnet2)
ExpectScheduled(ctx, env.Client, podSubnet2)
Expand Down
12 changes: 10 additions & 2 deletions pkg/controllers/controllers.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,11 @@ import (
"sigs.k8s.io/karpenter/pkg/cloudprovider"

nodeclasshash "github.com/aws/karpenter-provider-aws/pkg/controllers/nodeclass/hash"
nodeclassstatus "github.com/aws/karpenter-provider-aws/pkg/controllers/nodeclass/status"
nodeclassstatusami "github.com/aws/karpenter-provider-aws/pkg/controllers/nodeclass/status/ami"
nodeclassstatusip "github.com/aws/karpenter-provider-aws/pkg/controllers/nodeclass/status/instance_profile"
nodeclassstatuslaunchtemplate "github.com/aws/karpenter-provider-aws/pkg/controllers/nodeclass/status/launch_template"
nodeclassstatussg "github.com/aws/karpenter-provider-aws/pkg/controllers/nodeclass/status/security_group"
nodeclassstatussubnet "github.com/aws/karpenter-provider-aws/pkg/controllers/nodeclass/status/subnet"
nodeclasstermination "github.com/aws/karpenter-provider-aws/pkg/controllers/nodeclass/termination"
controllersinstancetype "github.com/aws/karpenter-provider-aws/pkg/controllers/providers/instancetype"
controllerspricing "github.com/aws/karpenter-provider-aws/pkg/controllers/providers/pricing"
Expand Down Expand Up @@ -57,7 +61,11 @@ func NewControllers(ctx context.Context, sess *session.Session, clk clock.Clock,

controllers := []controller.Controller{
nodeclasshash.NewController(kubeClient),
nodeclassstatus.NewController(kubeClient, subnetProvider, securityGroupProvider, amiProvider, instanceProfileProvider, launchTemplateProvider),
nodeclassstatusami.NewController(kubeClient, amiProvider),
nodeclassstatusip.NewController(kubeClient, instanceProfileProvider),
nodeclassstatussg.NewController(kubeClient, securityGroupProvider),
nodeclassstatussubnet.NewController(kubeClient, subnetProvider),
nodeclassstatuslaunchtemplate.NewController(kubeClient, launchTemplateProvider),
nodeclasstermination.NewController(kubeClient, recorder, instanceProfileProvider, launchTemplateProvider),
nodeclaimgarbagecollection.NewController(kubeClient, cloudProvider),
nodeclaimtagging.NewController(kubeClient, instanceProvider),
Expand Down
64 changes: 0 additions & 64 deletions pkg/controllers/nodeclass/status/ami.go

This file was deleted.

123 changes: 123 additions & 0 deletions pkg/controllers/nodeclass/status/ami/controller.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
/*
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why do we want to have separate controllers for each resource?

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package ami

import (
"context"
"github.com/aws/karpenter-provider-aws/pkg/apis/v1beta1"
"github.com/aws/karpenter-provider-aws/pkg/providers/amifamily"
"github.com/awslabs/operatorpkg/reasonable"
"github.com/awslabs/operatorpkg/singleton"
"github.com/samber/lo"
"go.uber.org/multierr"
"k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/equality"
"k8s.io/apimachinery/pkg/api/errors"
controllerruntime "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/controller"
"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
"sigs.k8s.io/controller-runtime/pkg/manager"
"sigs.k8s.io/controller-runtime/pkg/predicate"
"sigs.k8s.io/controller-runtime/pkg/reconcile"
corev1beta1 "sigs.k8s.io/karpenter/pkg/apis/v1beta1"
"sigs.k8s.io/karpenter/pkg/operator/injection"
"sort"
"time"
)

type Controller struct {
kubeClient client.Client
amiProvider amifamily.Provider
}

// NewController is a constructor
func NewController(kubeClient client.Client, amiProvider amifamily.Provider) *Controller {

return &Controller{
kubeClient: kubeClient,
amiProvider: amiProvider,
}
}

func (c *Controller) Reconcile(ctx context.Context, nodeClass *v1beta1.EC2NodeClass) (reconcile.Result, error) {
ctx = injection.WithControllerName(ctx, "nodeclass.ami")

if !controllerutil.ContainsFinalizer(nodeClass, v1beta1.TerminationFinalizer) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we run into a potential race condition here with multiple controllers adding this termination finalizer? I.e. if multiple controllers make the read before any make the write, and multiple patches get applied.

stored := nodeClass.DeepCopy()
controllerutil.AddFinalizer(nodeClass, v1beta1.TerminationFinalizer)
if err := c.kubeClient.Patch(ctx, nodeClass, client.MergeFrom(stored)); err != nil {
return reconcile.Result{}, err
}
}
stored := nodeClass.DeepCopy()
amis, err := c.amiProvider.List(ctx, nodeClass)
var errs error
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This multierr is only used to capture the error from amiProvider.List, can we just drop it?

if err != nil {
nodeClass.StatusConditions().SetFalse(v1beta1.ConditionTypeAMIsReady, "UnresolvedAMIs", "Unable to resolve AMI")
errs = multierr.Append(errs, err)
} else {
if len(amis) == 0 {
nodeClass.Status.AMIs = nil
nodeClass.StatusConditions().SetFalse(v1beta1.ConditionTypeAMIsReady, "UnresolvedAMIs", "Unable to resolve AMI")
} else {
nodeClass.Status.AMIs = lo.Map(amis, func(ami amifamily.AMI, _ int) v1beta1.AMI {
reqs := lo.Map(ami.Requirements.NodeSelectorRequirements(), func(item corev1beta1.NodeSelectorRequirementWithMinValues, _ int) v1.NodeSelectorRequirement {
return item.NodeSelectorRequirement
})

sort.Slice(reqs, func(i, j int) bool {
if len(reqs[i].Key) != len(reqs[j].Key) {
return len(reqs[i].Key) < len(reqs[j].Key)
}
return reqs[i].Key < reqs[j].Key
})
return v1beta1.AMI{
Name: ami.Name,
ID: ami.AmiID,
Requirements: reqs,
}
})
nodeClass.StatusConditions().SetTrue(v1beta1.ConditionTypeAMIsReady)
}
}
if !equality.Semantic.DeepEqual(stored, nodeClass) {
if err = c.kubeClient.Status().Update(ctx, nodeClass); err != nil {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why does this need to be an update over a patch? We're only updating the existing status condition right? Do we expect any other controllers to be updating this status condition which we could conflict with? Same comment for all other controllers.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I tried to follow what we have done with most of our controllers for status updates. Is there a benefit of using patch over update?

if errors.IsConflict(err) {
return reconcile.Result{Requeue: true}, nil
}
return reconcile.Result{}, client.IgnoreNotFound(err)
}
}
if errs != nil {
return reconcile.Result{}, errs
}
if !nodeClass.StatusConditions().IsTrue(v1beta1.ConditionTypeAMIsReady) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure we should be rereconciling immediately in this case. If this is caused by genuine user misconfiguration, we won't be able to resolve any additional AMIs unless the user updates their configuration (or does something like tag an existing AMI to match). If this is caused by an API outage, retrying immediately doesn't help us either. We're hitting internal caches so it shouldn't cause a retry storm, but there's still not a reason to retry immediately here IMO.

return reconcile.Result{RequeueAfter: singleton.RequeueImmediately}, nil
}
return reconcile.Result{RequeueAfter: 5 * time.Minute}, nil
}

func (c *Controller) Register(_ context.Context, m manager.Manager) error {
return controllerruntime.NewControllerManagedBy(m).
Named("nodeclass.ami").
For(&v1beta1.EC2NodeClass{}).
WithEventFilter(predicate.GenerationChangedPredicate{}).
WithOptions(controller.Options{
RateLimiter: reasonable.RateLimiter(),
MaxConcurrentReconciles: 10,
}).
Complete(reconcile.AsReconciler(m.GetClient(), c))
}
Loading
Loading