Skip to content

Commit

Permalink
Merge pull request #27 from NVIDIA/magzhang/0.7.0-updates
Browse files Browse the repository at this point in the history
0.7.0 - Updated to k8s 1.29 and GPU Operator v23.9.2
  • Loading branch information
MaggieXJZhang authored Apr 18, 2024
2 parents b388bcd + 59b7fcd commit 8977e19
Show file tree
Hide file tree
Showing 8 changed files with 33 additions and 30 deletions.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ Each CSP has its own end of life date for the versions of Kubernetes they suppor

| Version | Release Date | Kubernetes Versions | NVIDIA GPU Operator | NVIDIA Data Center Driver* | End of Life |
| :--- | :--- | :--- | :--- | :--- | :--- |
| 0.7.0 | April 2024 | EKS - 1.29 <br> GKE - 1.29 <br> AKS - 1.29 | 23.9.2 (Default); 23.9.2 (NV AI E) | 550.54.15 (EKS & GKE Default); 550.54.15 (NV AI E version for GKE & EKS) | EKS - Mar 2025 <br> GKE - Mar 2025 <br> AKS - Not Specified |
| 0.6.0 | January 2024 | EKS - 1.28 <br> GKE - 1.28 <br> AKS - 1.28 | 23.9.1 (Default); 23.9.0 (NV AI E) | 535.129.03 (EKS & GKE Default); 535.129.03 (NV AI E version for GKE & EKS) | EKS - Nov 2024 <br> GKE - Nov 2024 <br> AKS - Nov 2024 |
| 0.5.0 | November 2023 | EKS - 1.27 <br> GKE - 1.27 <br> AKS - 1.27 | 23.6.1 (Default); 23.3.2 (NV AI E) | 535.104.05 (EKS & GKE Default); 525.125.06 (NV AI E version for GKE & EKS) | EKS - July 2024 <br> GKE - August 2024 <br> AKS - July 2024 |
| 0.4.0 | October 2023 | EKS - 1.27 <br> GKE - 1.27 <br> AKS - 1.27 | 23.6.1 (Default); 23.3.2 (NV AI E) | 535.104.05 (EKS & GKE Default); 525.125.06 (NV AI E version for GKE & EKS) | EKS - July 2024 <br> GKE - August 2024 <br> AKS - July 2024 |
Expand Down
10 changes: 6 additions & 4 deletions aks/examples/cnpack/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,13 +24,15 @@
- Add `fluentbit_workspace_name`. This will create Azure Log Analytics Workspace with the specified name.
- Add `prometheus_name`. This will create Azure Monitor Workspace with the specified name.

2. Run `terraform plan -out tfplan` and validate that the output is correct
2. Run `terraform init`

3. Run `terraform apply tfplan`
3. Run `terraform plan -out tfplan` and validate that the output is correct

4. The `terraform output` of this module can be used immediately within the configuration file of CNPack
4. Run `terraform apply tfplan`

5. Run `terraform destroy` to delete all resources created by this module.
5. The `terraform output` of this module can be used immediately within the configuration file of CNPack

6. Once you're done, first remove the GPU Operator resources from the Terraform state by running `terraform state rm module.holoscan-ready-aks.helm_release.gpu-operator` and `terraform state rm module.holoscan-ready-aks.kubernetes_namespace_v1.gpu-operator`. Then run `terraform destroy` to delete all remaining resources created by this module.

**Note**
The `log_analytics_workspace_primary_shared_key` used for Fluentbit is a sensitive variable and should be protected like a password
Expand Down
6 changes: 3 additions & 3 deletions aks/terraform.tfvars
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,9 @@
# gpu_node_pool_max_count = 5
# gpu_node_pool_min_count = 2
# gpu_operator_namespace = "gpu-operator"
# gpu_operator_version = "v23.9.1"
# gpu_operator_version = "v23.9.2"
# gpu_os_sku = "Ubuntu"
# kubernetes_version = "1.28"
# kubernetes_version = "1.29"
# location = ""
# nvaie = false
# nvaie_gpu_operator_version = "v23.9.0"
# nvaie_gpu_operator_version = "v23.9.2"
6 changes: 3 additions & 3 deletions aks/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ variable "cluster_name" {
}

variable "kubernetes_version" {
default = "1.28"
default = "1.29"
description = "Version of Kubernetes to deploy. Run 'az aks get-versions --location <location> --output table' to view all available versions"
}

Expand Down Expand Up @@ -87,7 +87,7 @@ variable "gpu_os_sku" {
GPU Operator Variables
****************************/
variable "gpu_operator_version" {
default = "v23.9.1"
default = "v23.9.2"
description = "Version of the GPU operator to be installed"
}

Expand All @@ -105,7 +105,7 @@ variable "nvaie" {

variable "nvaie_gpu_operator_version" {
type = string
default = "v23.9.0"
default = "v23.9.2"
description = "The NVIDIA AI Enterprise version of the GPU Operator to deploy. Overrides `gpu_operator_version` when `nvaie` is set to `true`"
}

Expand Down
10 changes: 5 additions & 5 deletions eks/terraform.tfvars
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
# aws_profile = "development"
# cidr_block = "10.0.0.0/16"
# cluster_name = ""
# cluster_version = "1.28"
# cluster_version = "1.29"
# cpu_instance_type = "t2.xlarge"
# cpu_node_pool_additional_user_data = ""
# cpu_node_pool_delete_on_termination = true
Expand All @@ -28,16 +28,16 @@
# gpu_node_pool_delete_on_termination = true
# gpu_node_pool_root_disk_size_gb = 512
# gpu_node_pool_root_volume_type = "gp2"
# gpu_operator_driver_version = "535.129.03"
# gpu_operator_driver_version = "550.54.15"
# gpu_operator_namespace = "gpu-operator"
# gpu_operator_version = "v23.9.1"
# gpu_operator_version = "v23.9.2"
# max_cpu_nodes = "2"
# max_gpu_nodes = "5"
# min_cpu_nodes = "0"
# min_gpu_nodes = "2"
# nvaie = false
# nvaie_gpu_operator_driver_version = "535.129.03"
# nvaie_gpu_operator_version = "v23.9.0"
# nvaie_gpu_operator_driver_version = "550.54.15"
# nvaie_gpu_operator_version = "v23.9.2"
# private_subnets = [
# "10.0.0.0/19",
# "10.0.32.0/19",
Expand Down
10 changes: 5 additions & 5 deletions eks/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -28,20 +28,20 @@ variable "cluster_name" {

variable "cluster_version" {
type = string
default = "1.28"
default = "1.29"
description = "Version of EKS to install on the control plane (Major and Minor version only, do not include the patch)"
}
/************************
GPU Operator Variables
*************************/
variable "gpu_operator_version" {
default = "v23.9.1"
default = "v23.9.2"
description = "Version of the GPU Operator to deploy. Defaults to latest available. Not set when `nvaie` is set to `true`"
}

variable "gpu_operator_driver_version" {
type = string
default = "535.129.03"
default = "550.54.15"
description = "The NVIDIA Driver version deployed with GPU Operator. Defaults to latest available. Not set when `nvaie` is set to true"
}

Expand All @@ -59,13 +59,13 @@ variable "nvaie" {

variable "nvaie_gpu_operator_version" {
type = string
default = "v23.9.0"
default = "v23.9.2"
description = "The NVIDIA AI Enterprise version of the GPU Operator to deploy. Overrides `gpu_operator_version` when `nvaie` is set to `true`"
}

variable "nvaie_gpu_operator_driver_version" {
type = string
default = "535.129.03"
default = "550.54.15"
description = "The NVIDIA AI Enterprise version of the NVIDIA driver to be installed with the GPU operator. Overrides `gpu_operator_driver_version` when `nvaie` is set to `true`"
}
/*****************************
Expand Down
10 changes: 5 additions & 5 deletions gke/terraform.tfvars
Original file line number Diff line number Diff line change
Expand Up @@ -14,18 +14,18 @@
# gpu_instance_type = "n1-standard-4"
# gpu_max_node_count = "5"
# gpu_min_node_count = "2"
# gpu_operator_driver_version = "535.129.03"
# gpu_operator_driver_version = "550.54.15"
# gpu_operator_namespace = "gpu-operator"
# gpu_operator_version = "v23.9.1"
# gpu_operator_version = "v23.9.2"
# gpu_type = "nvidia-tesla-v100"
# min_master_version = "1.28"
# min_master_version = "1.29"
# network = ""
# node_zones = ""
# num_cpu_nodes = 1
# num_gpu_nodes = 2
# nvaie = false
# nvaie_gpu_operator_driver_version = "535.129.03"
# nvaie_gpu_operator_version = "v23.9.0"
# nvaie_gpu_operator_driver_version = "550.54.15"
# nvaie_gpu_operator_version = "v23.9.2"
# project_id = ""
# region = ""
# release_channel = "REGULAR"
Expand Down
10 changes: 5 additions & 5 deletions gke/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ variable "release_channel" {
}

variable "min_master_version" {
default = "1.28"
default = "1.29"
description = "The minimum cluster version of the master."
}

Expand Down Expand Up @@ -133,13 +133,13 @@ variable "disk_size_gb" {
GPU Operator Variables
***************************/
variable "gpu_operator_version" {
default = "v23.9.1"
default = "v23.9.2"
description = "Version of the GPU Operator to deploy. Defaults to latest available. Not set when `nvaie` is set to `true`"
}

variable "gpu_operator_driver_version" {
type = string
default = "535.129.03"
default = "550.54.15"
description = "The NVIDIA Driver version deployed with GPU Operator. Defaults to latest available. Not set when `nvaie` is set to true"
}

Expand All @@ -157,12 +157,12 @@ variable "nvaie" {

variable "nvaie_gpu_operator_version" {
type = string
default = "v23.9.0"
default = "v23.9.2"
description = "The NVIDIA AI Enterprise version of the GPU Operator to deploy. Overrides `gpu_operator_version` when `nvaie` is set to `true`"
}

variable "nvaie_gpu_operator_driver_version" {
type = string
default = "535.129.03"
default = "550.54.15"
description = "The NVIDIA AI Enterprise version of the NVIDIA driver to be installed with the GPU operator. Overrides `gpu_operator_driver_version` when `nvaie` is set to `true`"
}

0 comments on commit 8977e19

Please sign in to comment.