From fa1983ee195b0fd3a6e5a681169984baee25ad76 Mon Sep 17 00:00:00 2001 From: tiberiugc Date: Fri, 11 Oct 2024 19:39:36 +0300 Subject: [PATCH] add support for EKS accelerated AMIs based on AL2023 --- pkg/ami/api.go | 6 +- pkg/ami/auto_resolver.go | 23 +++++--- pkg/ami/ssm_resolver.go | 16 +++++- pkg/apis/eksctl.io/v1alpha5/defaults.go | 2 +- .../eksctl.io/v1alpha5/gpu_validation_test.go | 36 ++++-------- pkg/apis/eksctl.io/v1alpha5/validation.go | 18 +++--- pkg/cfn/builder/managed_nodegroup.go | 57 ++++++++++--------- .../managed_nodegroup_ami_type_test.go | 23 ++++++-- 8 files changed, 104 insertions(+), 77 deletions(-) diff --git a/pkg/ami/api.go b/pkg/ami/api.go index 53812150f2..a19312fa21 100644 --- a/pkg/ami/api.go +++ b/pkg/ami/api.go @@ -19,14 +19,16 @@ import ( // Variations of image classes const ( ImageClassGeneral = iota - ImageClassGPU + ImageClassNvidia + ImageClassNeuron ImageClassARM ) // ImageClasses is a list of image class names var ImageClasses = []string{ "ImageClassGeneral", - "ImageClassGPU", + "ImageClassNvidia", + "ImageClassNeuron", "ImageClassARM", } diff --git a/pkg/ami/auto_resolver.go b/pkg/ami/auto_resolver.go index b0a49f4597..883490723a 100644 --- a/pkg/ami/auto_resolver.go +++ b/pkg/ami/auto_resolver.go @@ -25,11 +25,14 @@ func MakeImageSearchPatterns(version string) map[string]map[int]string { return map[string]map[int]string{ api.NodeImageFamilyAmazonLinux2023: { ImageClassGeneral: fmt.Sprintf("amazon-eks-node-al2023-x86_64-standard-%s-v*", version), + ImageClassNvidia: fmt.Sprintf("amazon-eks-node-al2023-x86_64-nvidia-*-%s-v*", version), + ImageClassNeuron: fmt.Sprintf("amazon-eks-node-al2023-x86_64-neuron-%s-v*", version), ImageClassARM: fmt.Sprintf("amazon-eks-node-al2023-arm64-standard-%s-v*", version), }, api.NodeImageFamilyAmazonLinux2: { ImageClassGeneral: fmt.Sprintf("amazon-eks-node-%s-v*", version), - ImageClassGPU: fmt.Sprintf("amazon-eks-gpu-node-%s-*", version), + ImageClassNvidia: fmt.Sprintf("amazon-eks-gpu-node-%s-*", version), + ImageClassNeuron: fmt.Sprintf("amazon-eks-gpu-node-%s-*", version), ImageClassARM: fmt.Sprintf("amazon-eks-arm64-node-%s-*", version), }, api.NodeImageFamilyUbuntuPro2204: { @@ -90,16 +93,22 @@ func (r *AutoResolver) Resolve(ctx context.Context, region, version, instanceTyp imageClasses := MakeImageSearchPatterns(version)[imageFamily] namePattern := imageClasses[ImageClassGeneral] - if instanceutils.IsGPUInstanceType(instanceType) { + var ok bool + switch { + case instanceutils.IsNvidiaInstanceType(instanceType): + namePattern, ok = imageClasses[ImageClassNvidia] + if !ok { + logger.Critical("image family %s doesn't support Nvidia GPU image class", imageFamily) + return "", NewErrFailedResolution(region, version, instanceType, imageFamily) + } + case instanceutils.IsNeuronInstanceType(instanceType): var ok bool - namePattern, ok = imageClasses[ImageClassGPU] + namePattern, ok = imageClasses[ImageClassNeuron] if !ok { - logger.Critical("image family %s doesn't support GPU image class", imageFamily) + logger.Critical("image family %s doesn't support Neuron GPU image class", imageFamily) return "", NewErrFailedResolution(region, version, instanceType, imageFamily) } - } - - if instanceutils.IsARMInstanceType(instanceType) { + case instanceutils.IsARMInstanceType(instanceType): var ok bool namePattern, ok = imageClasses[ImageClassARM] if !ok { diff --git a/pkg/ami/ssm_resolver.go b/pkg/ami/ssm_resolver.go index de0edbbefc..2020e53716 100644 --- a/pkg/ami/ssm_resolver.go +++ b/pkg/ami/ssm_resolver.go @@ -55,8 +55,8 @@ func MakeSSMParameterName(version, instanceType, imageFamily string) (string, er switch imageFamily { case api.NodeImageFamilyAmazonLinux2023: - return fmt.Sprintf("/aws/service/eks/optimized-ami/%s/%s/%s/standard/recommended/%s", - version, utils.ToKebabCase(imageFamily), instanceEC2ArchName(instanceType), fieldName), nil + return fmt.Sprintf("/aws/service/eks/optimized-ami/%s/%s/%s/%s/recommended/%s", + version, utils.ToKebabCase(imageFamily), instanceEC2ArchName(instanceType), imageType(imageFamily, instanceType, version), fieldName), nil case api.NodeImageFamilyAmazonLinux2: return fmt.Sprintf("/aws/service/eks/optimized-ami/%s/%s/recommended/%s", version, imageType(imageFamily, instanceType, version), fieldName), nil case api.NodeImageFamilyWindowsServer2019CoreContainer, @@ -102,6 +102,10 @@ func MakeManagedSSMParameterName(version string, amiType ekstypes.AMITypes) stri switch amiType { case ekstypes.AMITypesAl2023X8664Standard: return fmt.Sprintf("/aws/service/eks/optimized-ami/%s/%s/x86_64/standard/recommended/release_version", version, utils.ToKebabCase(api.NodeImageFamilyAmazonLinux2023)) + case ekstypes.AMITypesAl2023X8664Nvidia: + return fmt.Sprintf("/aws/service/eks/optimized-ami/%s/%s/x86_64/nvidia/recommended/release_version", version, utils.ToKebabCase(api.NodeImageFamilyAmazonLinux2023)) + case ekstypes.AMITypesAl2023X8664Neuron: + return fmt.Sprintf("/aws/service/eks/optimized-ami/%s/%s/x86_64/neuron/recommended/release_version", version, utils.ToKebabCase(api.NodeImageFamilyAmazonLinux2023)) case ekstypes.AMITypesAl2023Arm64Standard: return fmt.Sprintf("/aws/service/eks/optimized-ami/%s/%s/arm64/standard/recommended/release_version", version, utils.ToKebabCase(api.NodeImageFamilyAmazonLinux2023)) case ekstypes.AMITypesAl2X8664: @@ -138,6 +142,14 @@ func ubuntuArchName(instanceType string) string { func imageType(imageFamily, instanceType, version string) string { family := utils.ToKebabCase(imageFamily) switch imageFamily { + case api.NodeImageFamilyAmazonLinux2023: + if instanceutils.IsNvidiaInstanceType(instanceType) { + return "nvidia" + } + if instanceutils.IsNeuronInstanceType(instanceType) { + return "neuron" + } + return "standard" case api.NodeImageFamilyBottlerocket: if instanceutils.IsNvidiaInstanceType(instanceType) { return fmt.Sprintf("%s-%s", version, "nvidia") diff --git a/pkg/apis/eksctl.io/v1alpha5/defaults.go b/pkg/apis/eksctl.io/v1alpha5/defaults.go index 7602109b8b..c9bfbbc6ee 100644 --- a/pkg/apis/eksctl.io/v1alpha5/defaults.go +++ b/pkg/apis/eksctl.io/v1alpha5/defaults.go @@ -135,7 +135,7 @@ func SetManagedNodeGroupDefaults(ng *ManagedNodeGroup, meta *ClusterMeta, contro // When using custom AMIs, we want the user to explicitly specify AMI family. // Thus, we only set up default AMI family when no custom AMI is being used. if ng.AMIFamily == "" && ng.AMI == "" { - if isMinVer, _ := utils.IsMinVersion(Version1_30, meta.Version); isMinVer && !instanceutils.IsGPUInstanceType(ng.InstanceType) && + if isMinVer, _ := utils.IsMinVersion(Version1_30, meta.Version); isMinVer && !instanceutils.IsARMGPUInstanceType(ng.InstanceType) { ng.AMIFamily = NodeImageFamilyAmazonLinux2023 } else { diff --git a/pkg/apis/eksctl.io/v1alpha5/gpu_validation_test.go b/pkg/apis/eksctl.io/v1alpha5/gpu_validation_test.go index 33796d387a..404105932b 100644 --- a/pkg/apis/eksctl.io/v1alpha5/gpu_validation_test.go +++ b/pkg/apis/eksctl.io/v1alpha5/gpu_validation_test.go @@ -40,22 +40,16 @@ var _ = Describe("GPU instance support", func() { assertValidationError(e, api.ValidateManagedNodeGroup(0, mng)) }, Entry("AL2023 INF", gpuInstanceEntry{ - amiFamily: api.NodeImageFamilyAmazonLinux2023, - gpuInstanceType: "inf1.xlarge", - expectUnsupportedErr: true, - instanceTypeName: "Inferentia", + amiFamily: api.NodeImageFamilyAmazonLinux2023, + gpuInstanceType: "inf1.xlarge", }), Entry("AL2023 TRN", gpuInstanceEntry{ - amiFamily: api.NodeImageFamilyAmazonLinux2023, - gpuInstanceType: "trn1.2xlarge", - expectUnsupportedErr: true, - instanceTypeName: "Trainium", + amiFamily: api.NodeImageFamilyAmazonLinux2023, + gpuInstanceType: "trn1.2xlarge", }), Entry("AL2023 NVIDIA", gpuInstanceEntry{ - amiFamily: api.NodeImageFamilyAmazonLinux2023, - gpuInstanceType: "g4dn.xlarge", - expectUnsupportedErr: true, - instanceTypeName: "GPU", + amiFamily: api.NodeImageFamilyAmazonLinux2023, + gpuInstanceType: "g4dn.xlarge", }), Entry("AL2", gpuInstanceEntry{ gpuInstanceType: "asdf", @@ -107,22 +101,16 @@ var _ = Describe("GPU instance support", func() { }, Entry("AL2023 INF", gpuInstanceEntry{ - amiFamily: api.NodeImageFamilyAmazonLinux2023, - gpuInstanceType: "inf1.xlarge", - expectUnsupportedErr: true, - instanceTypeName: "Inferentia", + amiFamily: api.NodeImageFamilyAmazonLinux2023, + gpuInstanceType: "inf1.xlarge", }), Entry("AL2023 TRN", gpuInstanceEntry{ - amiFamily: api.NodeImageFamilyAmazonLinux2023, - gpuInstanceType: "trn1.2xlarge", - expectUnsupportedErr: true, - instanceTypeName: "Trainium", + amiFamily: api.NodeImageFamilyAmazonLinux2023, + gpuInstanceType: "trn1.2xlarge", }), Entry("AL2023 NVIDIA", gpuInstanceEntry{ - amiFamily: api.NodeImageFamilyAmazonLinux2023, - gpuInstanceType: "g4dn.xlarge", - expectUnsupportedErr: true, - instanceTypeName: "GPU", + amiFamily: api.NodeImageFamilyAmazonLinux2023, + gpuInstanceType: "g4dn.xlarge", }), Entry("AL2", gpuInstanceEntry{ gpuInstanceType: "g4dn.xlarge", diff --git a/pkg/apis/eksctl.io/v1alpha5/validation.go b/pkg/apis/eksctl.io/v1alpha5/validation.go index cd8212afc8..58c484059a 100644 --- a/pkg/apis/eksctl.io/v1alpha5/validation.go +++ b/pkg/apis/eksctl.io/v1alpha5/validation.go @@ -661,12 +661,10 @@ func validateNodeGroupBase(np NodePool, path string, controlPlaneOnOutposts bool instanceType := SelectInstanceType(np) - if ng.AMIFamily == NodeImageFamilyAmazonLinux2023 && instanceutils.IsNvidiaInstanceType(instanceType) { - return ErrUnsupportedInstanceTypes("GPU", NodeImageFamilyAmazonLinux2023, - fmt.Sprintf("EKS accelerated AMIs based on %s will be available at a later date", NodeImageFamilyAmazonLinux2023)) - } - - if ng.AMIFamily != NodeImageFamilyAmazonLinux2 && ng.AMIFamily != NodeImageFamilyBottlerocket && ng.AMIFamily != "" { + if ng.AMIFamily != NodeImageFamilyAmazonLinux2023 && + ng.AMIFamily != NodeImageFamilyAmazonLinux2 && + ng.AMIFamily != NodeImageFamilyBottlerocket && + ng.AMIFamily != "" { if instanceutils.IsNvidiaInstanceType(instanceType) { logger.Warning(GPUDriversWarning(ng.AMIFamily)) } @@ -676,12 +674,14 @@ func validateNodeGroupBase(np NodePool, path string, controlPlaneOnOutposts bool } } - if ng.AMIFamily != NodeImageFamilyAmazonLinux2 && ng.AMIFamily != "" { - // Only AL2 supports Inferentia hosts. + if ng.AMIFamily != NodeImageFamilyAmazonLinux2 && + ng.AMIFamily != NodeImageFamilyAmazonLinux2023 && + ng.AMIFamily != "" { + // Only AL2 and AL2023 support Inferentia hosts. if instanceutils.IsInferentiaInstanceType(instanceType) { return ErrUnsupportedInstanceTypes("Inferentia", ng.AMIFamily, fmt.Sprintf("please use %s instead", NodeImageFamilyAmazonLinux2)) } - // Only AL2 supports Trainium hosts. + // Only AL2 and AL2023 support Trainium hosts. if instanceutils.IsTrainiumInstanceType(instanceType) { return ErrUnsupportedInstanceTypes("Trainium", ng.AMIFamily, fmt.Sprintf("please use %s instead", NodeImageFamilyAmazonLinux2)) } diff --git a/pkg/cfn/builder/managed_nodegroup.go b/pkg/cfn/builder/managed_nodegroup.go index b90d07eb44..18cf5c4193 100644 --- a/pkg/cfn/builder/managed_nodegroup.go +++ b/pkg/cfn/builder/managed_nodegroup.go @@ -263,41 +263,45 @@ func validateLaunchTemplate(launchTemplateData *ec2types.ResponseLaunchTemplateD func getAMIType(ng *api.ManagedNodeGroup, instanceType string) ekstypes.AMITypes { amiTypeMapping := map[string]struct { - X86x64 ekstypes.AMITypes - X86GPU ekstypes.AMITypes - ARM ekstypes.AMITypes - ARMGPU ekstypes.AMITypes + X86x64 ekstypes.AMITypes + X86Nvidia ekstypes.AMITypes + X86Neuron ekstypes.AMITypes + ARM ekstypes.AMITypes + ARMGPU ekstypes.AMITypes }{ api.NodeImageFamilyAmazonLinux2023: { - X86x64: ekstypes.AMITypesAl2023X8664Standard, - ARM: ekstypes.AMITypesAl2023Arm64Standard, + X86x64: ekstypes.AMITypesAl2023X8664Standard, + X86Nvidia: ekstypes.AMITypesAl2023X8664Nvidia, + X86Neuron: ekstypes.AMITypesAl2023X8664Neuron, + ARM: ekstypes.AMITypesAl2023Arm64Standard, }, api.NodeImageFamilyAmazonLinux2: { - X86x64: ekstypes.AMITypesAl2X8664, - X86GPU: ekstypes.AMITypesAl2X8664Gpu, - ARM: ekstypes.AMITypesAl2Arm64, + X86x64: ekstypes.AMITypesAl2X8664, + X86Nvidia: ekstypes.AMITypesAl2X8664Gpu, + X86Neuron: ekstypes.AMITypesAl2X8664Gpu, + ARM: ekstypes.AMITypesAl2Arm64, }, api.NodeImageFamilyBottlerocket: { - X86x64: ekstypes.AMITypesBottlerocketX8664, - X86GPU: ekstypes.AMITypesBottlerocketX8664Nvidia, - ARM: ekstypes.AMITypesBottlerocketArm64, - ARMGPU: ekstypes.AMITypesBottlerocketArm64Nvidia, + X86x64: ekstypes.AMITypesBottlerocketX8664, + X86Nvidia: ekstypes.AMITypesBottlerocketX8664Nvidia, + ARM: ekstypes.AMITypesBottlerocketArm64, + ARMGPU: ekstypes.AMITypesBottlerocketArm64Nvidia, }, api.NodeImageFamilyWindowsServer2019FullContainer: { - X86x64: ekstypes.AMITypesWindowsFull2019X8664, - X86GPU: ekstypes.AMITypesWindowsFull2019X8664, + X86x64: ekstypes.AMITypesWindowsFull2019X8664, + X86Nvidia: ekstypes.AMITypesWindowsFull2019X8664, }, api.NodeImageFamilyWindowsServer2019CoreContainer: { - X86x64: ekstypes.AMITypesWindowsCore2019X8664, - X86GPU: ekstypes.AMITypesWindowsCore2019X8664, + X86x64: ekstypes.AMITypesWindowsCore2019X8664, + X86Nvidia: ekstypes.AMITypesWindowsCore2019X8664, }, api.NodeImageFamilyWindowsServer2022FullContainer: { - X86x64: ekstypes.AMITypesWindowsFull2022X8664, - X86GPU: ekstypes.AMITypesWindowsFull2022X8664, + X86x64: ekstypes.AMITypesWindowsFull2022X8664, + X86Nvidia: ekstypes.AMITypesWindowsFull2022X8664, }, api.NodeImageFamilyWindowsServer2022CoreContainer: { - X86x64: ekstypes.AMITypesWindowsCore2022X8664, - X86GPU: ekstypes.AMITypesWindowsCore2022X8664, + X86x64: ekstypes.AMITypesWindowsCore2022X8664, + X86Nvidia: ekstypes.AMITypesWindowsCore2022X8664, }, } @@ -307,13 +311,14 @@ func getAMIType(ng *api.ManagedNodeGroup, instanceType string) ekstypes.AMITypes } switch { - case instanceutils.IsGPUInstanceType(instanceType): - if instanceutils.IsARMInstanceType(instanceType) { - return amiType.ARMGPU - } - return amiType.X86GPU + case instanceutils.IsARMGPUInstanceType(instanceType): + return amiType.ARMGPU case instanceutils.IsARMInstanceType(instanceType): return amiType.ARM + case instanceutils.IsNvidiaInstanceType(instanceType): + return amiType.X86Nvidia + case instanceutils.IsNeuronInstanceType(instanceType): + return amiType.X86Neuron default: return amiType.X86x64 } diff --git a/pkg/cfn/builder/managed_nodegroup_ami_type_test.go b/pkg/cfn/builder/managed_nodegroup_ami_type_test.go index 2f3772b1e5..3839b44939 100644 --- a/pkg/cfn/builder/managed_nodegroup_ami_type_test.go +++ b/pkg/cfn/builder/managed_nodegroup_ami_type_test.go @@ -77,23 +77,24 @@ var _ = DescribeTable("Managed Nodegroup AMI type", func(e amiTypeEntry) { expectedAMIType: "AL2_x86_64", }), - Entry("AMI type", amiTypeEntry{ + Entry("default Nvidia GPU instance type", amiTypeEntry{ nodeGroup: &api.ManagedNodeGroup{ NodeGroupBase: &api.NodeGroupBase{ - Name: "test", + Name: "test", + InstanceType: "p2.xlarge", }, }, - expectedAMIType: "AL2023_x86_64_STANDARD", + expectedAMIType: "AL2023_x86_64_NVIDIA", }), - Entry("default GPU instance type", amiTypeEntry{ + Entry("default Neuron GPU instance type", amiTypeEntry{ nodeGroup: &api.ManagedNodeGroup{ NodeGroupBase: &api.NodeGroupBase{ Name: "test", - InstanceType: "p2.xlarge", + InstanceType: "inf1.2xlarge", }, }, - expectedAMIType: "AL2_x86_64_GPU", + expectedAMIType: "AL2023_x86_64_NEURON", }), Entry("AL2 GPU instance type", amiTypeEntry{ @@ -107,6 +108,16 @@ var _ = DescribeTable("Managed Nodegroup AMI type", func(e amiTypeEntry) { expectedAMIType: "AL2_x86_64_GPU", }), + Entry("default ARM instance type", amiTypeEntry{ + nodeGroup: &api.ManagedNodeGroup{ + NodeGroupBase: &api.NodeGroupBase{ + Name: "test", + InstanceType: "a1.2xlarge", + }, + }, + expectedAMIType: "AL2023_ARM_64_STANDARD", + }), + Entry("AL2 ARM instance type", amiTypeEntry{ nodeGroup: &api.ManagedNodeGroup{ NodeGroupBase: &api.NodeGroupBase{