feat: add cloudprovider specific Eventual disruption methods #1588

Merged
merged 5 commits on Aug 28, 2024
3 changes: 3 additions & 0 deletions hack/kwok/requirements.sh
@@ -15,3 +15,6 @@ for Version in $(seq 0 1); do
yqVersion="$Version" yq eval '.spec.versions[env(yqVersion)].schema.openAPIV3Schema.properties.spec.properties.template.properties.spec.properties.requirements.items.properties.key.x-kubernetes-validations += [
{"message": "label domain \"karpenter.kwok.sh\" is restricted", "rule": "self in [\"karpenter.kwok.sh/instance-cpu\", \"karpenter.kwok.sh/instance-memory\", \"karpenter.kwok.sh/instance-family\", \"karpenter.kwok.sh/instance-size\"] || !self.find(\"^([^/]+)\").endsWith(\"karpenter.kwok.sh\")"}]' -i kwok/charts/crds/karpenter.sh_nodepools.yaml
done

# Add ExampleReason in KwoK CloudProvider
yq eval '.spec.versions[0].schema.openAPIV3Schema.properties.spec.properties.disruption.properties.budgets.items.properties.reasons.items.enum += [ "ExampleReason" ]' -i kwok/charts/crds/karpenter.sh_nodepools.yaml
4 changes: 4 additions & 0 deletions kwok/apis/v1alpha1/kwoknodeclass.go
@@ -18,6 +18,8 @@ package v1alpha1

import (
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

v1 "sigs.k8s.io/karpenter/pkg/apis/v1"
)

// KWOKNodeClass is the Schema for the KWOKNodeClass API
@@ -38,3 +40,5 @@ type KWOKNodeClassList struct {
metav1.ListMeta `json:"metadata,omitempty"`
Items []KWOKNodeClass `json:"items"`
}

const DisruptionReasonExampleReason v1.DisruptionReason = "ExampleReason"
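A minimal sketch of how a NodePool disruption budget could reference the new Kwok-specific reason once the CRD enum accepts it; only DisruptionReasonExampleReason comes from this change, the field values and helper name are illustrative assumptions:

package example

import (
	"time"

	"github.com/samber/lo"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"sigs.k8s.io/karpenter/kwok/apis/v1alpha1"
	v1 "sigs.k8s.io/karpenter/pkg/apis/v1"
)

// exampleBudget limits disruptions for Kwok's "ExampleReason" to 10% of nodes,
// for one hour each day. The numbers and schedule are illustrative only.
func exampleBudget() v1.Budget {
	return v1.Budget{
		Reasons:  []v1.DisruptionReason{v1alpha1.DisruptionReasonExampleReason},
		Nodes:    "10%",
		Schedule: lo.ToPtr("@daily"),
		Duration: &metav1.Duration{Duration: time.Hour},
	}
}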
7 changes: 5 additions & 2 deletions kwok/charts/crds/karpenter.sh_nodepools.yaml
@@ -112,13 +112,16 @@ spec:
description: |-
Reasons is a list of disruption methods that this budget applies to. If Reasons is not set, this budget applies to all methods.
Otherwise, this will apply to each reason defined.
allowed reasons are Underutilized, Empty, and Drifted.
allowed reasons are Underutilized, Empty, Drifted, and additional CloudProvider-specific reasons.
items:
description: DisruptionReason defines valid reasons for disruption budgets.
description: |-
DisruptionReason defines valid reasons for disruption budgets.
CloudProviders will need to append to the list of enums when implementing cloud provider disruption reasons
enum:
- Underutilized
- Empty
- Drifted
- ExampleReason
type: string
type: array
schedule:
2 changes: 1 addition & 1 deletion kwok/cloudprovider/cloudprovider.go
@@ -65,7 +65,7 @@ func (c CloudProvider) Create(ctx context.Context, nodeClaim *v1.NodeClaim) (*v1
}

func (c CloudProvider) DisruptionReasons() []v1.DisruptionReason {
return nil
return []v1.DisruptionReason{v1alpha1.DisruptionReasonExampleReason}
}

func (c CloudProvider) Delete(ctx context.Context, nodeClaim *v1.NodeClaim) error {
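The Kwok provider above shows the full pattern a cloud provider follows: declare a DisruptionReason constant, return it from DisruptionReasons(), and extend the NodePool CRD enum to accept the same string. A hedged sketch for a hypothetical provider; the reason name is made up for illustration and is not part of this change:

package example

import (
	v1 "sigs.k8s.io/karpenter/pkg/apis/v1"
)

// DisruptionReasonSpotInterruption is a hypothetical provider-specific reason,
// named here purely for illustration.
const DisruptionReasonSpotInterruption v1.DisruptionReason = "SpotInterruption"

type CloudProvider struct{}

// DisruptionReasons advertises the provider's additional reasons so that NodePool
// disruption budgets can reference them; the CRD enum must list the same string,
// mirroring the yq patch in hack/kwok/requirements.sh above.
func (c CloudProvider) DisruptionReasons() []v1.DisruptionReason {
	return []v1.DisruptionReason{DisruptionReasonSpotInterruption}
}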
6 changes: 4 additions & 2 deletions pkg/apis/crds/karpenter.sh_nodepools.yaml
@@ -112,9 +112,11 @@ spec:
description: |-
Reasons is a list of disruption methods that this budget applies to. If Reasons is not set, this budget applies to all methods.
Otherwise, this will apply to each reason defined.
allowed reasons are Underutilized, Empty, and Drifted.
allowed reasons are Underutilized, Empty, Drifted, and additional CloudProvider-specific reasons.
items:
description: DisruptionReason defines valid reasons for disruption budgets.
description: |-
DisruptionReason defines valid reasons for disruption budgets.
CloudProviders will need to append to the list of enums when implementing cloud provider disruption reasons
enum:
- Underutilized
- Empty
31 changes: 10 additions & 21 deletions pkg/apis/v1/nodepool.go
@@ -17,7 +17,6 @@ limitations under the License.
package v1

import (
"context"
"fmt"
"math"
"sort"
@@ -89,7 +88,7 @@ type Disruption struct {
type Budget struct {
// Reasons is a list of disruption methods that this budget applies to. If Reasons is not set, this budget applies to all methods.
// Otherwise, this will apply to each reason defined.
// allowed reasons are Underutilized, Empty, and Drifted.
// allowed reasons are Underutilized, Empty, Drifted, and additional CloudProvider-specific reasons.
// +optional
Reasons []DisruptionReason `json:"reasons,omitempty"`
// Nodes dictates the maximum number of NodeClaims owned by this NodePool
@@ -129,6 +128,7 @@ const (
)

// DisruptionReason defines valid reasons for disruption budgets.
// CloudProviders will need to append to the list of enums when implementing cloud provider disruption reasons
// +kubebuilder:validation:Enum={Underutilized,Empty,Drifted}
type DisruptionReason string

@@ -138,11 +138,6 @@ const (
DisruptionReasonDrifted DisruptionReason = "Drifted"
)

var (
// WellKnownDisruptionReasons is a list of all valid reasons for disruption budgets.
WellKnownDisruptionReasons = []DisruptionReason{DisruptionReasonUnderutilized, DisruptionReasonEmpty, DisruptionReasonDrifted}
)

type Limits v1.ResourceList

func (l Limits) ExceededBy(resources v1.ResourceList) error {
@@ -314,34 +309,28 @@ func (nl *NodePoolList) OrderByWeight() {
// MustGetAllowedDisruptions calls GetAllowedDisruptionsByReason if the error is not nil. This reduces the
// amount of state that the disruption controller must reconcile, while allowing the GetAllowedDisruptionsByReason()
// to bubble up any errors in validation.
func (in *NodePool) MustGetAllowedDisruptions(ctx context.Context, c clock.Clock, numNodes int) map[DisruptionReason]int {
allowedDisruptions, err := in.GetAllowedDisruptionsByReason(ctx, c, numNodes)
func (in *NodePool) MustGetAllowedDisruptions(c clock.Clock, numNodes int, reason DisruptionReason) int {
allowedDisruptions, err := in.GetAllowedDisruptionsByReason(c, numNodes, reason)
if err != nil {
return map[DisruptionReason]int{}
return 0
}
return allowedDisruptions
}

// GetAllowedDisruptionsByReason returns the minimum allowed disruptions across all disruption budgets, for all disruption methods for a given nodepool
func (in *NodePool) GetAllowedDisruptionsByReason(ctx context.Context, c clock.Clock, numNodes int) (map[DisruptionReason]int, error) {
func (in *NodePool) GetAllowedDisruptionsByReason(c clock.Clock, numNodes int, reason DisruptionReason) (int, error) {
allowedNodes := math.MaxInt32
var multiErr error
allowedDisruptions := map[DisruptionReason]int{}
for _, reason := range WellKnownDisruptionReasons {
allowedDisruptions[reason] = math.MaxInt32
}

for _, budget := range in.Spec.Disruption.Budgets {
val, err := budget.GetAllowedDisruptions(c, numNodes)
if err != nil {
multiErr = multierr.Append(multiErr, err)
}
// If reasons is nil, it applies to all well known disruption reasons
for _, reason := range lo.Ternary(budget.Reasons == nil, WellKnownDisruptionReasons, budget.Reasons) {
allowedDisruptions[reason] = lo.Min([]int{allowedDisruptions[reason], val})
if budget.Reasons == nil || lo.Contains(budget.Reasons, reason) {
allowedNodes = lo.Min([]int{allowedNodes, val})
}
}

return allowedDisruptions, multiErr
return allowedNodes, multiErr
}

// GetAllowedDisruptions returns an intstr.IntOrString that can be used a comparison
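After this refactor, callers ask for the allowed disruptions of a single reason instead of receiving a map over every well-known reason. A minimal sketch of the new call pattern, assuming the CloudProvider interface exposes DisruptionReasons() as the Kwok and fake providers in this diff do; the helper name and wiring are illustrative:

package example

import (
	"k8s.io/utils/clock"

	v1 "sigs.k8s.io/karpenter/pkg/apis/v1"
	"sigs.k8s.io/karpenter/pkg/cloudprovider"
)

// allowedByReason collects the per-reason disruption limits for a NodePool,
// covering the well-known reasons plus whatever the cloud provider reports.
func allowedByReason(np *v1.NodePool, cp cloudprovider.CloudProvider, clk clock.Clock, numNodes int) map[v1.DisruptionReason]int {
	reasons := append([]v1.DisruptionReason{
		v1.DisruptionReasonUnderutilized,
		v1.DisruptionReasonEmpty,
		v1.DisruptionReasonDrifted,
	}, cp.DisruptionReasons()...)

	allowed := map[v1.DisruptionReason]int{}
	for _, reason := range reasons {
		// MustGetAllowedDisruptions swallows budget validation errors and returns 0,
		// per the change above.
		allowed[reason] = np.MustGetAllowedDisruptions(clk, numNodes, reason)
	}
	return allowed
}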
60 changes: 36 additions & 24 deletions pkg/apis/v1/nodepool_budgets_test.go
@@ -17,7 +17,6 @@ limitations under the License.
package v1_test

import (
"context"
"math"
"strings"
"time"
@@ -36,7 +35,7 @@ var _ = Describe("Budgets", func() {
var nodePool *NodePool
var budgets []Budget
var fakeClock *clock.FakeClock
var ctx = context.Background()
var allKnownDisruptionReasons []DisruptionReason

BeforeEach(func() {
// Set the time to the middle of the year of 2000, the best year ever
@@ -79,6 +78,7 @@ var _ = Describe("Budgets", func() {
DisruptionReasonUnderutilized,
DisruptionReasonDrifted,
DisruptionReasonEmpty,
"CloudProviderDisruptionReason",
},
Nodes: "0",
Schedule: lo.ToPtr("@weekly"),
@@ -93,52 +93,60 @@
},
},
}
allKnownDisruptionReasons = append([]DisruptionReason{
DisruptionReasonEmpty,
DisruptionReasonUnderutilized,
DisruptionReasonDrifted},
DisruptionReason("CloudProviderDisruptionReason"),
)
})

Context("GetAllowedDisruptionsByReason", func() {
It("should return 0 for all reasons if a budget is active for all reasons", func() {
budgets[5].Schedule = lo.ToPtr("* * * * *")
budgets[5].Duration = lo.ToPtr(metav1.Duration{Duration: lo.Must(time.ParseDuration("1h"))})

disruptionsByReason, err := nodePool.GetAllowedDisruptionsByReason(ctx, fakeClock, 100)
Expect(err).To(BeNil())
Expect(disruptionsByReason[DisruptionReasonUnderutilized]).To(Equal(0))
Expect(disruptionsByReason[DisruptionReasonDrifted]).To(Equal(0))
Expect(disruptionsByReason[DisruptionReasonEmpty]).To(Equal(0))
for _, reason := range allKnownDisruptionReasons {
allowedDisruption, err := nodePool.GetAllowedDisruptionsByReason(fakeClock, 100, reason)
Expect(err).To(BeNil())
Expect(allowedDisruption).To(Equal(0))
}
})

It("should return MaxInt32 for all reasons when there are no active budgets", func() {
for i := range budgets {
budgets[i].Schedule = lo.ToPtr("@yearly")
budgets[i].Duration = lo.ToPtr(metav1.Duration{Duration: lo.Must(time.ParseDuration("1h"))})
}
disruptionsByReason, err := nodePool.GetAllowedDisruptionsByReason(ctx, fakeClock, 100)
Expect(err).To(BeNil())

// All budgets should have unbounded disruptions when inactive
for _, disruptions := range disruptionsByReason {
Expect(disruptions).To(Equal(math.MaxInt32))
for _, reason := range allKnownDisruptionReasons {
allowedDisruption, err := nodePool.GetAllowedDisruptionsByReason(fakeClock, 100, reason)
Expect(err).To(BeNil())
Expect(allowedDisruption).To(Equal(math.MaxInt32))
}
})

It("should ignore reason-defined budgets when inactive", func() {
budgets[3].Schedule = lo.ToPtr("@yearly")
budgets[4].Schedule = lo.ToPtr("@yearly")
disruptionsByReason, err := nodePool.GetAllowedDisruptionsByReason(ctx, fakeClock, 100)
Expect(err).To(BeNil())
for _, disruptions := range disruptionsByReason {
Expect(disruptions).To(Equal(10))

for _, reason := range allKnownDisruptionReasons {
allowedDisruption, err := nodePool.GetAllowedDisruptionsByReason(fakeClock, 100, reason)
Expect(err).To(BeNil())
Expect(allowedDisruption).To(Equal(10))
}
})

It("should return the budget for all disruption reasons when undefined", func() {
nodePool.Spec.Disruption.Budgets = budgets[:1]
Expect(len(nodePool.Spec.Disruption.Budgets)).To(Equal(1))
disruptionsByReason, err := nodePool.GetAllowedDisruptionsByReason(ctx, fakeClock, 100)
Expect(err).To(BeNil())
Expect(len(budgets[0].Reasons)).To(Equal(0))
for _, disruptions := range disruptionsByReason {
Expect(disruptions).To(Equal(10))

for _, reason := range allKnownDisruptionReasons {
allowedDisruption, err := nodePool.GetAllowedDisruptionsByReason(fakeClock, 100, reason)
Expect(err).To(BeNil())
Expect(allowedDisruption).To(Equal(10))
}
})

@@ -155,13 +163,17 @@
},
},
}...)
disruptionsByReason, err := nodePool.GetAllowedDisruptionsByReason(ctx, fakeClock, 100)
Expect(err).To(BeNil())

Expect(disruptionsByReason[DisruptionReasonEmpty]).To(Equal(4))
Expect(disruptionsByReason[DisruptionReasonDrifted]).To(Equal(5))
emptyAllowedDisruption, err := nodePool.GetAllowedDisruptionsByReason(fakeClock, 100, DisruptionReasonEmpty)
Expect(err).To(BeNil())
Expect(emptyAllowedDisruption).To(Equal(4))
driftedAllowedDisruption, err := nodePool.GetAllowedDisruptionsByReason(fakeClock, 100, DisruptionReasonDrifted)
Expect(err).To(BeNil())
Expect(driftedAllowedDisruption).To(Equal(5))
// The budget where reason == nil overrides the budget with a specified reason
Expect(disruptionsByReason[DisruptionReasonUnderutilized]).To(Equal(10))
underutilizedAllowedDisruption, err := nodePool.GetAllowedDisruptionsByReason(fakeClock, 100, DisruptionReasonUnderutilized)
Expect(err).To(BeNil())
Expect(underutilizedAllowedDisruption).To(Equal(10))
})

})
2 changes: 1 addition & 1 deletion pkg/cloudprovider/fake/cloudprovider.go
@@ -236,7 +236,7 @@ func (c *CloudProvider) GetInstanceTypes(_ context.Context, np *v1.NodePool) ([]
}

func (c *CloudProvider) DisruptionReasons() []v1.DisruptionReason {
return nil
return []v1.DisruptionReason{"CloudProviderDisruptionReason"}
}

func (c *CloudProvider) Delete(_ context.Context, nc *v1.NodeClaim) error {
12 changes: 6 additions & 6 deletions pkg/controllers/disruption/consolidation_test.go
@@ -578,7 +578,7 @@ var _ = Describe("Consolidation", func() {
ExpectMakeNodesAndNodeClaimsInitializedAndStateUpdated(ctx, env.Client, nodeStateController, nodeClaimStateController, nodes, nodeClaims)

emptyConsolidation := disruption.NewEmptiness(disruption.MakeConsolidation(fakeClock, cluster, env.Client, prov, cloudProvider, recorder, queue))
budgets, err := disruption.BuildDisruptionBudgets(ctx, cluster, fakeClock, env.Client, recorder)
budgets, err := disruption.BuildDisruptionBudgetMapping(ctx, cluster, fakeClock, env.Client, recorder, emptyConsolidation.Reason())
Expect(err).To(Succeed())

candidates, err := disruption.GetCandidates(ctx, cluster, env.Client, recorder, fakeClock, cloudProvider, emptyConsolidation.ShouldDisrupt, emptyConsolidation.Class(), queue)
@@ -641,7 +641,7 @@ var _ = Describe("Consolidation", func() {
ExpectMakeNodesAndNodeClaimsInitializedAndStateUpdated(ctx, env.Client, nodeStateController, nodeClaimStateController, nodes, nodeClaims)

emptyConsolidation := disruption.NewEmptiness(disruption.MakeConsolidation(fakeClock, cluster, env.Client, prov, cloudProvider, recorder, queue))
budgets, err := disruption.BuildDisruptionBudgets(ctx, cluster, fakeClock, env.Client, recorder)
budgets, err := disruption.BuildDisruptionBudgetMapping(ctx, cluster, fakeClock, env.Client, recorder, emptyConsolidation.Reason())
Expect(err).To(Succeed())

candidates, err := disruption.GetCandidates(ctx, cluster, env.Client, recorder, fakeClock, cloudProvider, emptyConsolidation.ShouldDisrupt, emptyConsolidation.Class(), queue)
@@ -665,7 +665,7 @@ var _ = Describe("Consolidation", func() {
ExpectMakeNodesAndNodeClaimsInitializedAndStateUpdated(ctx, env.Client, nodeStateController, nodeClaimStateController, nodes, nodeClaims)

multiConsolidation := disruption.NewMultiNodeConsolidation(disruption.MakeConsolidation(fakeClock, cluster, env.Client, prov, cloudProvider, recorder, queue))
budgets, err := disruption.BuildDisruptionBudgets(ctx, cluster, fakeClock, env.Client, recorder)
budgets, err := disruption.BuildDisruptionBudgetMapping(ctx, cluster, fakeClock, env.Client, recorder, multiConsolidation.Reason())
Expect(err).To(Succeed())

candidates, err := disruption.GetCandidates(ctx, cluster, env.Client, recorder, fakeClock, cloudProvider, multiConsolidation.ShouldDisrupt, multiConsolidation.Class(), queue)
@@ -728,7 +728,7 @@ var _ = Describe("Consolidation", func() {
ExpectMakeNodesAndNodeClaimsInitializedAndStateUpdated(ctx, env.Client, nodeStateController, nodeClaimStateController, nodes, nodeClaims)

multiConsolidation := disruption.NewMultiNodeConsolidation(disruption.MakeConsolidation(fakeClock, cluster, env.Client, prov, cloudProvider, recorder, queue))
budgets, err := disruption.BuildDisruptionBudgets(ctx, cluster, fakeClock, env.Client, recorder)
budgets, err := disruption.BuildDisruptionBudgetMapping(ctx, cluster, fakeClock, env.Client, recorder, multiConsolidation.Reason())
Expect(err).To(Succeed())

candidates, err := disruption.GetCandidates(ctx, cluster, env.Client, recorder, fakeClock, cloudProvider, multiConsolidation.ShouldDisrupt, multiConsolidation.Class(), queue)
@@ -752,7 +752,7 @@ var _ = Describe("Consolidation", func() {
ExpectMakeNodesAndNodeClaimsInitializedAndStateUpdated(ctx, env.Client, nodeStateController, nodeClaimStateController, nodes, nodeClaims)

singleConsolidation := disruption.NewSingleNodeConsolidation(disruption.MakeConsolidation(fakeClock, cluster, env.Client, prov, cloudProvider, recorder, queue))
budgets, err := disruption.BuildDisruptionBudgets(ctx, cluster, fakeClock, env.Client, recorder)
budgets, err := disruption.BuildDisruptionBudgetMapping(ctx, cluster, fakeClock, env.Client, recorder, singleConsolidation.Reason())
Expect(err).To(Succeed())

candidates, err := disruption.GetCandidates(ctx, cluster, env.Client, recorder, fakeClock, cloudProvider, singleConsolidation.ShouldDisrupt, singleConsolidation.Class(), queue)
@@ -815,7 +815,7 @@ var _ = Describe("Consolidation", func() {
ExpectMakeNodesAndNodeClaimsInitializedAndStateUpdated(ctx, env.Client, nodeStateController, nodeClaimStateController, nodes, nodeClaims)

singleConsolidation := disruption.NewSingleNodeConsolidation(disruption.MakeConsolidation(fakeClock, cluster, env.Client, prov, cloudProvider, recorder, queue))
budgets, err := disruption.BuildDisruptionBudgets(ctx, cluster, fakeClock, env.Client, recorder)
budgets, err := disruption.BuildDisruptionBudgetMapping(ctx, cluster, fakeClock, env.Client, recorder, singleConsolidation.Reason())
Expect(err).To(Succeed())

candidates, err := disruption.GetCandidates(ctx, cluster, env.Client, recorder, fakeClock, cloudProvider, singleConsolidation.ShouldDisrupt, singleConsolidation.Class(), queue)
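The test updates above all follow the same shape: BuildDisruptionBudgets becomes BuildDisruptionBudgetMapping and is scoped to the single reason returned by the disruption method under test. A sketch of that pattern, reusing the fixtures declared earlier in the test file (so not independently runnable); the return shape of the mapping is not shown in this diff:

// Each disruption method now builds a budget mapping for its own reason only.
emptiness := disruption.NewEmptiness(disruption.MakeConsolidation(fakeClock, cluster, env.Client, prov, cloudProvider, recorder, queue))
budgetMapping, err := disruption.BuildDisruptionBudgetMapping(ctx, cluster, fakeClock, env.Client, recorder, emptiness.Reason())
Expect(err).To(Succeed())
// budgetMapping reflects only budgets whose Reasons include emptiness.Reason(),
// plus budgets with no Reasons set, which apply to every reason.
_ = budgetMapping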