Skip to content

Commit

Permalink
feat: add support for gpu_sharing_config on nodepool (#1874)
Browse files Browse the repository at this point in the history
  • Loading branch information
jimgus committed May 24, 2024
1 parent c51c446 commit b57387c
Show file tree
Hide file tree
Showing 16 changed files with 317 additions and 149 deletions.
41 changes: 23 additions & 18 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -54,24 +54,29 @@ module "gke" {
node_pools = [
{
name = "default-node-pool"
machine_type = "e2-medium"
node_locations = "us-central1-b,us-central1-c"
min_count = 1
max_count = 100
local_ssd_count = 0
spot = false
disk_size_gb = 100
disk_type = "pd-standard"
image_type = "COS_CONTAINERD"
enable_gcfs = false
enable_gvnic = false
logging_variant = "DEFAULT"
auto_repair = true
auto_upgrade = true
service_account = "project-service-account@<PROJECT ID>.iam.gserviceaccount.com"
preemptible = false
initial_node_count = 80
name = "default-node-pool"
machine_type = "e2-medium"
node_locations = "us-central1-b,us-central1-c"
min_count = 1
max_count = 100
local_ssd_count = 0
spot = false
disk_size_gb = 100
disk_type = "pd-standard"
image_type = "COS_CONTAINERD"
enable_gcfs = false
enable_gvnic = false
logging_variant = "DEFAULT"
auto_repair = true
auto_upgrade = true
service_account = "project-service-account@<PROJECT ID>.iam.gserviceaccount.com"
preemptible = false
initial_node_count = 80
accelerator_count = 1
accelerator_type = "nvidia-l4"
gpu_driver_version = "LATEST"
gpu_sharing_strategy = "TIME_SHARING"
max_shared_clients_per_gpu = 2
},
]
Expand Down
43 changes: 24 additions & 19 deletions autogen/main/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -96,27 +96,32 @@ module "gke" {
{% if autopilot_cluster != true %}
node_pools = [
{
name = "default-node-pool"
machine_type = "e2-medium"
node_locations = "us-central1-b,us-central1-c"
min_count = 1
max_count = 100
local_ssd_count = 0
spot = false
name = "default-node-pool"
machine_type = "e2-medium"
node_locations = "us-central1-b,us-central1-c"
min_count = 1
max_count = 100
local_ssd_count = 0
spot = false
{% if beta_cluster %}
local_ssd_ephemeral_count = 0
local_ssd_ephemeral_count = 0
{% endif %}
disk_size_gb = 100
disk_type = "pd-standard"
image_type = "COS_CONTAINERD"
enable_gcfs = false
enable_gvnic = false
logging_variant = "DEFAULT"
auto_repair = true
auto_upgrade = true
service_account = "project-service-account@<PROJECT ID>.iam.gserviceaccount.com"
preemptible = false
initial_node_count = 80
disk_size_gb = 100
disk_type = "pd-standard"
image_type = "COS_CONTAINERD"
enable_gcfs = false
enable_gvnic = false
logging_variant = "DEFAULT"
auto_repair = true
auto_upgrade = true
service_account = "project-service-account@<PROJECT ID>.iam.gserviceaccount.com"
preemptible = false
initial_node_count = 80
accelerator_count = 1
accelerator_type = "nvidia-l4"
gpu_driver_version = "LATEST"
gpu_sharing_strategy = "TIME_SHARING"
max_shared_clients_per_gpu = 2
},
]
Expand Down
10 changes: 10 additions & 0 deletions autogen/main/cluster.tf.tmpl
Original file line number Diff line number Diff line change
Expand Up @@ -643,6 +643,8 @@ locals {
"accelerator_type",
"gpu_partition_size",
"gpu_driver_version",
"gpu_sharing_strategy",
"max_shared_clients_per_gpu",
"enable_secure_boot",
"enable_integrity_monitoring",
"local_ssd_count",
Expand Down Expand Up @@ -927,6 +929,14 @@ resource "google_container_node_pool" "windows_pools" {
gpu_driver_version = lookup(each.value, "gpu_driver_version", "")
}
}

# Emit a gpu_sharing_config block only when this node pool entry sets a
# non-empty "gpu_sharing_strategy"; [1] is a one-element placeholder so at
# most one block is generated, and [] omits the block entirely.
dynamic "gpu_sharing_config" {
for_each = lookup(each.value, "gpu_sharing_strategy", "") != "" ? [1] : []
content {
gpu_sharing_strategy = lookup(each.value, "gpu_sharing_strategy", "")
# Falls back to 2 when "max_shared_clients_per_gpu" is not set on the entry.
max_shared_clients_per_gpu = lookup(each.value, "max_shared_clients_per_gpu", 2)
}
}
}
}

Expand Down
16 changes: 16 additions & 0 deletions cluster.tf
Original file line number Diff line number Diff line change
Expand Up @@ -644,6 +644,14 @@ resource "google_container_node_pool" "pools" {
gpu_driver_version = lookup(each.value, "gpu_driver_version", "")
}
}

# Emit a gpu_sharing_config block only when this node pool entry sets a
# non-empty "gpu_sharing_strategy"; [1] is a one-element placeholder so at
# most one block is generated, and [] omits the block entirely.
dynamic "gpu_sharing_config" {
for_each = lookup(each.value, "gpu_sharing_strategy", "") != "" ? [1] : []
content {
gpu_sharing_strategy = lookup(each.value, "gpu_sharing_strategy", "")
# Falls back to 2 when "max_shared_clients_per_gpu" is not set on the entry.
max_shared_clients_per_gpu = lookup(each.value, "max_shared_clients_per_gpu", 2)
}
}
}
}

Expand Down Expand Up @@ -882,6 +890,14 @@ resource "google_container_node_pool" "windows_pools" {
gpu_driver_version = lookup(each.value, "gpu_driver_version", "")
}
}

# Emit a gpu_sharing_config block only when this node pool entry sets a
# non-empty "gpu_sharing_strategy"; [1] is a one-element placeholder so at
# most one block is generated, and [] omits the block entirely.
dynamic "gpu_sharing_config" {
for_each = lookup(each.value, "gpu_sharing_strategy", "") != "" ? [1] : []
content {
gpu_sharing_strategy = lookup(each.value, "gpu_sharing_strategy", "")
# Falls back to 2 when "max_shared_clients_per_gpu" is not set on the entry.
max_shared_clients_per_gpu = lookup(each.value, "max_shared_clients_per_gpu", 2)
}
}
}
}

Expand Down
43 changes: 24 additions & 19 deletions modules/beta-private-cluster-update-variant/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -84,25 +84,30 @@ module "gke" {
node_pools = [
{
name = "default-node-pool"
machine_type = "e2-medium"
node_locations = "us-central1-b,us-central1-c"
min_count = 1
max_count = 100
local_ssd_count = 0
spot = false
local_ssd_ephemeral_count = 0
disk_size_gb = 100
disk_type = "pd-standard"
image_type = "COS_CONTAINERD"
enable_gcfs = false
enable_gvnic = false
logging_variant = "DEFAULT"
auto_repair = true
auto_upgrade = true
service_account = "project-service-account@<PROJECT ID>.iam.gserviceaccount.com"
preemptible = false
initial_node_count = 80
name = "default-node-pool"
machine_type = "e2-medium"
node_locations = "us-central1-b,us-central1-c"
min_count = 1
max_count = 100
local_ssd_count = 0
spot = false
local_ssd_ephemeral_count = 0
disk_size_gb = 100
disk_type = "pd-standard"
image_type = "COS_CONTAINERD"
enable_gcfs = false
enable_gvnic = false
logging_variant = "DEFAULT"
auto_repair = true
auto_upgrade = true
service_account = "project-service-account@<PROJECT ID>.iam.gserviceaccount.com"
preemptible = false
initial_node_count = 80
accelerator_count = 1
accelerator_type = "nvidia-l4"
gpu_driver_version = "LATEST"
gpu_sharing_strategy = "TIME_SHARING"
max_shared_clients_per_gpu = 2
},
]
Expand Down
18 changes: 18 additions & 0 deletions modules/beta-private-cluster-update-variant/cluster.tf
Original file line number Diff line number Diff line change
Expand Up @@ -552,6 +552,8 @@ locals {
"accelerator_type",
"gpu_partition_size",
"gpu_driver_version",
"gpu_sharing_strategy",
"max_shared_clients_per_gpu",
"enable_secure_boot",
"enable_integrity_monitoring",
"local_ssd_count",
Expand Down Expand Up @@ -811,6 +813,14 @@ resource "google_container_node_pool" "pools" {
gpu_driver_version = lookup(each.value, "gpu_driver_version", "")
}
}

# Emit a gpu_sharing_config block only when this node pool entry sets a
# non-empty "gpu_sharing_strategy"; [1] is a one-element placeholder so at
# most one block is generated, and [] omits the block entirely.
dynamic "gpu_sharing_config" {
for_each = lookup(each.value, "gpu_sharing_strategy", "") != "" ? [1] : []
content {
gpu_sharing_strategy = lookup(each.value, "gpu_sharing_strategy", "")
# Falls back to 2 when "max_shared_clients_per_gpu" is not set on the entry.
max_shared_clients_per_gpu = lookup(each.value, "max_shared_clients_per_gpu", 2)
}
}
}
}

Expand Down Expand Up @@ -1075,6 +1085,14 @@ resource "google_container_node_pool" "windows_pools" {
gpu_driver_version = lookup(each.value, "gpu_driver_version", "")
}
}

# Emit a gpu_sharing_config block only when this node pool entry sets a
# non-empty "gpu_sharing_strategy"; [1] is a one-element placeholder so at
# most one block is generated, and [] omits the block entirely.
dynamic "gpu_sharing_config" {
for_each = lookup(each.value, "gpu_sharing_strategy", "") != "" ? [1] : []
content {
gpu_sharing_strategy = lookup(each.value, "gpu_sharing_strategy", "")
# Falls back to 2 when "max_shared_clients_per_gpu" is not set on the entry.
max_shared_clients_per_gpu = lookup(each.value, "max_shared_clients_per_gpu", 2)
}
}
}
}

Expand Down
43 changes: 24 additions & 19 deletions modules/beta-private-cluster/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -62,25 +62,30 @@ module "gke" {
node_pools = [
{
name = "default-node-pool"
machine_type = "e2-medium"
node_locations = "us-central1-b,us-central1-c"
min_count = 1
max_count = 100
local_ssd_count = 0
spot = false
local_ssd_ephemeral_count = 0
disk_size_gb = 100
disk_type = "pd-standard"
image_type = "COS_CONTAINERD"
enable_gcfs = false
enable_gvnic = false
logging_variant = "DEFAULT"
auto_repair = true
auto_upgrade = true
service_account = "project-service-account@<PROJECT ID>.iam.gserviceaccount.com"
preemptible = false
initial_node_count = 80
name = "default-node-pool"
machine_type = "e2-medium"
node_locations = "us-central1-b,us-central1-c"
min_count = 1
max_count = 100
local_ssd_count = 0
spot = false
local_ssd_ephemeral_count = 0
disk_size_gb = 100
disk_type = "pd-standard"
image_type = "COS_CONTAINERD"
enable_gcfs = false
enable_gvnic = false
logging_variant = "DEFAULT"
auto_repair = true
auto_upgrade = true
service_account = "project-service-account@<PROJECT ID>.iam.gserviceaccount.com"
preemptible = false
initial_node_count = 80
accelerator_count = 1
accelerator_type = "nvidia-l4"
gpu_driver_version = "LATEST"
gpu_sharing_strategy = "TIME_SHARING"
max_shared_clients_per_gpu = 2
},
]
Expand Down
16 changes: 16 additions & 0 deletions modules/beta-private-cluster/cluster.tf
Original file line number Diff line number Diff line change
Expand Up @@ -735,6 +735,14 @@ resource "google_container_node_pool" "pools" {
gpu_driver_version = lookup(each.value, "gpu_driver_version", "")
}
}

# Emit a gpu_sharing_config block only when this node pool entry sets a
# non-empty "gpu_sharing_strategy"; [1] is a one-element placeholder so at
# most one block is generated, and [] omits the block entirely.
dynamic "gpu_sharing_config" {
for_each = lookup(each.value, "gpu_sharing_strategy", "") != "" ? [1] : []
content {
gpu_sharing_strategy = lookup(each.value, "gpu_sharing_strategy", "")
# Falls back to 2 when "max_shared_clients_per_gpu" is not set on the entry.
max_shared_clients_per_gpu = lookup(each.value, "max_shared_clients_per_gpu", 2)
}
}
}
}

Expand Down Expand Up @@ -998,6 +1006,14 @@ resource "google_container_node_pool" "windows_pools" {
gpu_driver_version = lookup(each.value, "gpu_driver_version", "")
}
}

# Emit a gpu_sharing_config block only when this node pool entry sets a
# non-empty "gpu_sharing_strategy"; [1] is a one-element placeholder so at
# most one block is generated, and [] omits the block entirely.
dynamic "gpu_sharing_config" {
for_each = lookup(each.value, "gpu_sharing_strategy", "") != "" ? [1] : []
content {
gpu_sharing_strategy = lookup(each.value, "gpu_sharing_strategy", "")
# Falls back to 2 when "max_shared_clients_per_gpu" is not set on the entry.
max_shared_clients_per_gpu = lookup(each.value, "max_shared_clients_per_gpu", 2)
}
}
}
}

Expand Down
43 changes: 24 additions & 19 deletions modules/beta-public-cluster-update-variant/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -78,25 +78,30 @@ module "gke" {
node_pools = [
{
name = "default-node-pool"
machine_type = "e2-medium"
node_locations = "us-central1-b,us-central1-c"
min_count = 1
max_count = 100
local_ssd_count = 0
spot = false
local_ssd_ephemeral_count = 0
disk_size_gb = 100
disk_type = "pd-standard"
image_type = "COS_CONTAINERD"
enable_gcfs = false
enable_gvnic = false
logging_variant = "DEFAULT"
auto_repair = true
auto_upgrade = true
service_account = "project-service-account@<PROJECT ID>.iam.gserviceaccount.com"
preemptible = false
initial_node_count = 80
name = "default-node-pool"
machine_type = "e2-medium"
node_locations = "us-central1-b,us-central1-c"
min_count = 1
max_count = 100
local_ssd_count = 0
spot = false
local_ssd_ephemeral_count = 0
disk_size_gb = 100
disk_type = "pd-standard"
image_type = "COS_CONTAINERD"
enable_gcfs = false
enable_gvnic = false
logging_variant = "DEFAULT"
auto_repair = true
auto_upgrade = true
service_account = "project-service-account@<PROJECT ID>.iam.gserviceaccount.com"
preemptible = false
initial_node_count = 80
accelerator_count = 1
accelerator_type = "nvidia-l4"
gpu_driver_version = "LATEST"
gpu_sharing_strategy = "TIME_SHARING"
max_shared_clients_per_gpu = 2
},
]
Expand Down
Loading

0 comments on commit b57387c

Please sign in to comment.