Skip to content

Commit

Permalink
Allow GPU instance types for Windows nodes (#7681)
Browse files Browse the repository at this point in the history
* allow GPU instance type for Windows nodes

* update unit test for case gpus:0
  • Loading branch information
TiberiuGC authored Mar 27, 2024
1 parent cfec0d9 commit cdbbd6d
Show file tree
Hide file tree
Showing 5 changed files with 95 additions and 15 deletions.
1 change: 1 addition & 0 deletions examples/28-instance-selector.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ nodeGroups:
instanceSelector:
vCPUs: 2
memory: "4" # 4 GiB, unit defaults to GiB
gpus: 0 # when set to 0, will only select non-GPU instance types

managedNodeGroups:
- name: mng
Expand Down
70 changes: 70 additions & 0 deletions pkg/apis/eksctl.io/v1alpha5/gpu_validation_test.go
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
package v1alpha5_test

import (
"bytes"
"fmt"

"github.com/kris-nova/logger"
. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"

Expand All @@ -15,8 +17,10 @@ var _ = Describe("GPU instance support", func() {
gpuInstanceType string
amiFamily string
instanceTypeName string
instanceSelector *api.InstanceSelector

expectUnsupportedErr bool
expectWarning bool
}

assertValidationError := func(e gpuInstanceEntry, err error) {
Expand Down Expand Up @@ -127,6 +131,72 @@ var _ = Describe("GPU instance support", func() {
}),
)

DescribeTable("GPU drivers", func(e gpuInstanceEntry) {
ng := api.NewNodeGroup()
ng.AMIFamily = e.amiFamily
ng.InstanceType = e.gpuInstanceType
ng.InstanceSelector = e.instanceSelector

mng := api.NewManagedNodeGroup()
mng.AMIFamily = e.amiFamily
mng.InstanceType = e.gpuInstanceType
mng.InstanceSelector = e.instanceSelector
if mng.InstanceSelector == nil {
mng.InstanceSelector = &api.InstanceSelector{}
}

output := &bytes.Buffer{}
logger.Writer = output
Expect(api.ValidateNodeGroup(0, ng, api.NewClusterConfig())).NotTo(HaveOccurred())
if e.expectWarning {
Expect(output.String()).To(ContainSubstring(api.GPUDriversWarning(mng.AMIFamily)))
} else {
Expect(output.String()).NotTo(ContainSubstring(api.GPUDriversWarning(mng.AMIFamily)))
}

output = &bytes.Buffer{}
logger.Writer = output
Expect(api.ValidateManagedNodeGroup(0, mng)).NotTo(HaveOccurred())
if e.expectWarning {
Expect(output.String()).To(ContainSubstring(api.GPUDriversWarning(mng.AMIFamily)))
} else {
Expect(output.String()).NotTo(ContainSubstring(api.GPUDriversWarning(mng.AMIFamily)))
}
},
Entry("Windows without GPU instances", gpuInstanceEntry{
amiFamily: api.NodeImageFamilyUbuntu2004,
instanceSelector: &api.InstanceSelector{
VCPUs: 4,
GPUs: newInt(0),
},
}),
Entry("Windows with explicit GPU instance", gpuInstanceEntry{
amiFamily: api.NodeImageFamilyWindowsServer2019FullContainer,
gpuInstanceType: "g4dn.xlarge",
expectWarning: true,
}),
Entry("Windows with implicit GPU instance", gpuInstanceEntry{
amiFamily: api.NodeImageFamilyWindowsServer2022CoreContainer,
instanceSelector: &api.InstanceSelector{
VCPUs: 4,
},
expectWarning: true,
}),
Entry("Ubuntu with explicit GPU instance", gpuInstanceEntry{
amiFamily: api.NodeImageFamilyUbuntu1804,
gpuInstanceType: "g4dn.xlarge",
expectWarning: true,
}),
Entry("Ubuntu with implicit GPU instance", gpuInstanceEntry{
amiFamily: api.NodeImageFamilyUbuntu2004,
instanceSelector: &api.InstanceSelector{
VCPUs: 4,
GPUs: newInt(2),
},
expectWarning: true,
}),
)

DescribeTable("ARM-based GPU instance type support", func(amiFamily string, expectErr bool) {
ng := api.NewNodeGroup()
ng.InstanceType = "g5g.medium"
Expand Down
15 changes: 12 additions & 3 deletions pkg/apis/eksctl.io/v1alpha5/validation.go
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,10 @@ var (
ErrPodIdentityAgentNotInstalled = func(suggestion string) error {
return fmt.Errorf("the %q addon must be installed to create pod identity associations; %s", PodIdentityAgentAddon, suggestion)
}

GPUDriversWarning = func(amiFamily string) string {
return fmt.Sprintf("%s does not ship with NVIDIA GPU drivers installed, hence won't support running GPU-accelerated workloads out of the box", amiFamily)
}
)

// NOTE: we don't use k8s.io/apimachinery/pkg/util/sets here to keep API package free of dependencies
Expand Down Expand Up @@ -637,9 +641,14 @@ func validateNodeGroupBase(np NodePool, path string, controlPlaneOnOutposts bool
}
}

if instanceutils.IsNvidiaInstanceType(SelectInstanceType(np)) &&
(ng.AMIFamily != NodeImageFamilyAmazonLinux2 && ng.AMIFamily != NodeImageFamilyBottlerocket && ng.AMIFamily != "") {
logger.Warning("%s does not ship with NVIDIA GPU drivers installed, hence won't support running GPU-accelerated workloads out of the box", ng.AMIFamily)
if ng.AMIFamily != NodeImageFamilyAmazonLinux2 && ng.AMIFamily != NodeImageFamilyBottlerocket && ng.AMIFamily != "" {
if instanceutils.IsNvidiaInstanceType(SelectInstanceType(np)) {
logger.Warning(GPUDriversWarning(ng.AMIFamily))
}
if ng.InstanceSelector != nil && !ng.InstanceSelector.IsZero() &&
(ng.InstanceSelector.GPUs == nil || *ng.InstanceSelector.GPUs != 0) {
logger.Warning("instance selector may/will select GPU instance types, " + GPUDriversWarning(ng.AMIFamily))
}
}

if ng.AMIFamily != NodeImageFamilyAmazonLinux2 && ng.AMIFamily != "" {
Expand Down
19 changes: 8 additions & 11 deletions pkg/cfn/builder/managed_nodegroup.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,12 +33,6 @@ type ManagedNodeGroupResourceSet struct {

const ManagedNodeGroupResourceName = "ManagedNodeGroup"

// Windows AMI types are not in sdk-v2 yet, so the constants here are temporary; will remove after sdk is updated
const AMITypesWindows2019FullX8664 ekstypes.AMITypes = "WINDOWS_FULL_2019_x86_64"
const AMITypesWindows2019CoreX8664 ekstypes.AMITypes = "WINDOWS_CORE_2019_x86_64"
const AMITypesWindows2022FullX8664 ekstypes.AMITypes = "WINDOWS_FULL_2022_x86_64"
const AMITypesWindows2022CoreX8664 ekstypes.AMITypes = "WINDOWS_CORE_2022_x86_64"

// NewManagedNodeGroup creates a new ManagedNodeGroupResourceSet
func NewManagedNodeGroup(ec2API awsapi.EC2, cluster *api.ClusterConfig, nodeGroup *api.ManagedNodeGroup, launchTemplateFetcher *LaunchTemplateFetcher, bootstrapper nodebootstrap.Bootstrapper, forceAddCNIPolicy bool, vpcImporter vpc.Importer) *ManagedNodeGroupResourceSet {
return &ManagedNodeGroupResourceSet{
Expand Down Expand Up @@ -285,18 +279,21 @@ func getAMIType(ng *api.ManagedNodeGroup, instanceType string) ekstypes.AMITypes
ARM: ekstypes.AMITypesBottlerocketArm64,
ARMGPU: ekstypes.AMITypesBottlerocketArm64Nvidia,
},
// Windows AMI Types are not in sdk-v2 yet, so the constant here is temporary; will remove after sdk is updated
api.NodeImageFamilyWindowsServer2019FullContainer: {
X86x64: AMITypesWindows2019FullX8664,
X86x64: ekstypes.AMITypesWindowsFull2019X8664,
X86GPU: ekstypes.AMITypesWindowsFull2019X8664,
},
api.NodeImageFamilyWindowsServer2019CoreContainer: {
X86x64: AMITypesWindows2019CoreX8664,
X86x64: ekstypes.AMITypesWindowsCore2019X8664,
X86GPU: ekstypes.AMITypesWindowsCore2019X8664,
},
api.NodeImageFamilyWindowsServer2022FullContainer: {
X86x64: AMITypesWindows2022FullX8664,
X86x64: ekstypes.AMITypesWindowsFull2022X8664,
X86GPU: ekstypes.AMITypesWindowsFull2022X8664,
},
api.NodeImageFamilyWindowsServer2022CoreContainer: {
X86x64: AMITypesWindows2022CoreX8664,
X86x64: ekstypes.AMITypesWindowsCore2022X8664,
X86GPU: ekstypes.AMITypesWindowsCore2022X8664,
},
}

Expand Down
5 changes: 4 additions & 1 deletion userdocs/src/usage/instance-selector.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ users have to spend time figuring out which instance types would be well suited
when using Spot instances because you need to choose a set of instances that works together well with the Cluster Autoscaler.

eksctl now integrates with the [EC2 instance selector](https://github.com/aws/amazon-ec2-instance-selector),
which addresses this problem by generating a list of instance types based on resource criteria such as vCPUs, memory, etc.
which addresses this problem by generating a list of instance types based on resource criteria: vCPUs, memory, # of GPUs and CPU architecture.
When the instance selector criteria is passed, eksctl creates a nodegroup with the instance types set to the instance types
matching the supplied criteria.

Expand Down Expand Up @@ -62,6 +62,9 @@ The following instance selector CLI options are supported by `eksctl create clus

`--instance-selector-vcpus`, `--instance-selector-memory`, `--instance-selector-gpus` and `instance-selector-cpu-architecture`

???+ note
By default, GPU instance types are not filtered out. If you wish to do so (e.g. for cost effectiveness, when your applications don't particularly benefit from GPU-accelerated workloads), please explicitly set `gpus: 0` (via config file) or `--instance-selector-gpus=0` (via CLI flag).

An example file can be found [here](https://github.com/eksctl-io/eksctl/blob/main/examples/28-instance-selector.yaml).

### Dry Run
Expand Down

0 comments on commit cdbbd6d

Please sign in to comment.