From cff1c1be2658fc88ea793dca73a5156b849050c3 Mon Sep 17 00:00:00 2001
From: Stanley Yu <stanley98yu@gmail.com>
Date: Mon, 22 Nov 2021 20:49:58 -0500
Subject: [PATCH] feat: Add support for `gpu_partition_size` (#1072)

* Add support for gpu_partition_size

* Add test for gpu_partition_size

* Lint formatting fix
---
 README.md                                     |  1 +
 autogen/main/README.md                        |  1 +
 autogen/main/cluster.tf.tmpl                  |  2 ++
 cluster.tf                                    | 10 ++++---
 examples/node_pool/main.tf                    | 27 ++++++++++---------
 .../README.md                                 |  1 +
 .../cluster.tf                                | 10 ++++---
 modules/beta-private-cluster/README.md        |  1 +
 modules/beta-private-cluster/cluster.tf       | 10 ++++---
 .../README.md                                 |  1 +
 .../cluster.tf                                | 10 ++++---
 modules/beta-public-cluster/README.md         |  1 +
 modules/beta-public-cluster/cluster.tf        | 10 ++++---
 .../private-cluster-update-variant/README.md  |  1 +
 .../private-cluster-update-variant/cluster.tf | 10 ++++---
 modules/private-cluster/README.md             |  1 +
 modules/private-cluster/cluster.tf            | 10 ++++---
 test/integration/node_pool/controls/gcloud.rb |  7 ++---
 18 files changed, 70 insertions(+), 44 deletions(-)

diff --git a/README.md b/README.md
index f453556c7..c07f08644 100644
--- a/README.md
+++ b/README.md
@@ -236,6 +236,7 @@ The node_pools variable takes the following parameters:
 | effect | Effect for the taint | | Required |
 | enable_integrity_monitoring | Enables monitoring and attestation of the boot integrity of the instance. The attestation is performed against the integrity policy baseline. This baseline is initially derived from the implicitly trusted boot image when the instance is created. | true | Optional |
 | enable_secure_boot | Secure Boot helps ensure that the system only runs authentic software by verifying the digital signature of all boot components, and halting the boot process if signature verification fails. | false | Optional |
+| gpu_partition_size | Size of partitions to create on the GPU | null | Optional |
 | image_type | The image type to use for this node. Note that changing the image type will delete and recreate all nodes in the node pool | COS | Optional |
 | initial_node_count | The initial number of nodes for the pool. In regional or multi-zonal clusters, this is the number of nodes per zone. Changing this will force recreation of the resource. Defaults to the value of min_count | " " | Optional |
 | key | The key required for the taint | | Required |
diff --git a/autogen/main/README.md b/autogen/main/README.md
index 14acb6f3b..1457215e5 100644
--- a/autogen/main/README.md
+++ b/autogen/main/README.md
@@ -186,6 +186,7 @@ The node_pools variable takes the following parameters:
 | effect | Effect for the taint | | Required |
 | enable_integrity_monitoring | Enables monitoring and attestation of the boot integrity of the instance. The attestation is performed against the integrity policy baseline. This baseline is initially derived from the implicitly trusted boot image when the instance is created. | true | Optional |
 | enable_secure_boot | Secure Boot helps ensure that the system only runs authentic software by verifying the digital signature of all boot components, and halting the boot process if signature verification fails. | false | Optional |
+| gpu_partition_size | Size of partitions to create on the GPU | null | Optional |
 | image_type | The image type to use for this node. Note that changing the image type will delete and recreate all nodes in the node pool | COS | Optional |
 | initial_node_count | The initial number of nodes for the pool. In regional or multi-zonal clusters, this is the number of nodes per zone. Changing this will force recreation of the resource. Defaults to the value of min_count | " " | Optional |
 | key | The key required for the taint | | Required |
diff --git a/autogen/main/cluster.tf.tmpl b/autogen/main/cluster.tf.tmpl
index 36c154953..901d0bd36 100644
--- a/autogen/main/cluster.tf.tmpl
+++ b/autogen/main/cluster.tf.tmpl
@@ -598,9 +598,11 @@ resource "google_container_node_pool" "pools" {
       for guest_accelerator in lookup(each.value, "accelerator_count", 0) > 0 ? [{
         type  = lookup(each.value, "accelerator_type", "")
         count = lookup(each.value, "accelerator_count", 0)
+	gpu_partition_size = lookup(each.value, "gpu_partition_size", null)
         }] : [] : {
         type  = guest_accelerator["type"]
         count = guest_accelerator["count"]
+	gpu_partition_size = guest_accelerator["gpu_partition_size"]
       }
     ]
 
diff --git a/cluster.tf b/cluster.tf
index 57de71180..d06a02678 100644
--- a/cluster.tf
+++ b/cluster.tf
@@ -308,11 +308,13 @@ resource "google_container_node_pool" "pools" {
 
     guest_accelerator = [
       for guest_accelerator in lookup(each.value, "accelerator_count", 0) > 0 ? [{
-        type  = lookup(each.value, "accelerator_type", "")
-        count = lookup(each.value, "accelerator_count", 0)
+        type               = lookup(each.value, "accelerator_type", "")
+        count              = lookup(each.value, "accelerator_count", 0)
+        gpu_partition_size = lookup(each.value, "gpu_partition_size", null)
         }] : [] : {
-        type  = guest_accelerator["type"]
-        count = guest_accelerator["count"]
+        type               = guest_accelerator["type"]
+        count              = guest_accelerator["count"]
+        gpu_partition_size = guest_accelerator["gpu_partition_size"]
       }
     ]
 
diff --git a/examples/node_pool/main.tf b/examples/node_pool/main.tf
index f9bce4935..e2b98cf9e 100644
--- a/examples/node_pool/main.tf
+++ b/examples/node_pool/main.tf
@@ -19,7 +19,7 @@ locals {
 }
 
 provider "google-beta" {
-  version = "~> 3.79.0"
+  version = "~> 3.90.0"
   region  = var.region
 }
 
@@ -55,18 +55,19 @@ module "gke" {
       auto_upgrade    = true
     },
     {
-      name              = "pool-02"
-      machine_type      = "n1-standard-2"
-      min_count         = 1
-      max_count         = 2
-      local_ssd_count   = 0
-      disk_size_gb      = 30
-      disk_type         = "pd-standard"
-      accelerator_count = 1
-      accelerator_type  = "nvidia-tesla-p4"
-      image_type        = "COS"
-      auto_repair       = false
-      service_account   = var.compute_engine_service_account
+      name               = "pool-02"
+      machine_type       = "a2-highgpu-1g"
+      min_count          = 1
+      max_count          = 2
+      local_ssd_count    = 0
+      disk_size_gb       = 30
+      disk_type          = "pd-standard"
+      accelerator_count  = 1
+      accelerator_type   = "nvidia-tesla-a100"
+      gpu_partition_size = "1g.5gb"
+      image_type         = "COS"
+      auto_repair        = false
+      service_account    = var.compute_engine_service_account
     },
     {
       name               = "pool-03"
diff --git a/modules/beta-private-cluster-update-variant/README.md b/modules/beta-private-cluster-update-variant/README.md
index 6a25c0483..0916532bc 100644
--- a/modules/beta-private-cluster-update-variant/README.md
+++ b/modules/beta-private-cluster-update-variant/README.md
@@ -308,6 +308,7 @@ The node_pools variable takes the following parameters:
 | effect | Effect for the taint | | Required |
 | enable_integrity_monitoring | Enables monitoring and attestation of the boot integrity of the instance. The attestation is performed against the integrity policy baseline. This baseline is initially derived from the implicitly trusted boot image when the instance is created. | true | Optional |
 | enable_secure_boot | Secure Boot helps ensure that the system only runs authentic software by verifying the digital signature of all boot components, and halting the boot process if signature verification fails. | false | Optional |
+| gpu_partition_size | Size of partitions to create on the GPU | null | Optional |
 | image_type | The image type to use for this node. Note that changing the image type will delete and recreate all nodes in the node pool | COS | Optional |
 | initial_node_count | The initial number of nodes for the pool. In regional or multi-zonal clusters, this is the number of nodes per zone. Changing this will force recreation of the resource. Defaults to the value of min_count | " " | Optional |
 | key | The key required for the taint | | Required |
diff --git a/modules/beta-private-cluster-update-variant/cluster.tf b/modules/beta-private-cluster-update-variant/cluster.tf
index 1c08ad7a8..c97df30d2 100644
--- a/modules/beta-private-cluster-update-variant/cluster.tf
+++ b/modules/beta-private-cluster-update-variant/cluster.tf
@@ -535,11 +535,13 @@ resource "google_container_node_pool" "pools" {
 
     guest_accelerator = [
       for guest_accelerator in lookup(each.value, "accelerator_count", 0) > 0 ? [{
-        type  = lookup(each.value, "accelerator_type", "")
-        count = lookup(each.value, "accelerator_count", 0)
+        type               = lookup(each.value, "accelerator_type", "")
+        count              = lookup(each.value, "accelerator_count", 0)
+        gpu_partition_size = lookup(each.value, "gpu_partition_size", null)
         }] : [] : {
-        type  = guest_accelerator["type"]
-        count = guest_accelerator["count"]
+        type               = guest_accelerator["type"]
+        count              = guest_accelerator["count"]
+        gpu_partition_size = guest_accelerator["gpu_partition_size"]
       }
     ]
 
diff --git a/modules/beta-private-cluster/README.md b/modules/beta-private-cluster/README.md
index 531eb8a40..206fb379e 100644
--- a/modules/beta-private-cluster/README.md
+++ b/modules/beta-private-cluster/README.md
@@ -286,6 +286,7 @@ The node_pools variable takes the following parameters:
 | effect | Effect for the taint | | Required |
 | enable_integrity_monitoring | Enables monitoring and attestation of the boot integrity of the instance. The attestation is performed against the integrity policy baseline. This baseline is initially derived from the implicitly trusted boot image when the instance is created. | true | Optional |
 | enable_secure_boot | Secure Boot helps ensure that the system only runs authentic software by verifying the digital signature of all boot components, and halting the boot process if signature verification fails. | false | Optional |
+| gpu_partition_size | Size of partitions to create on the GPU | null | Optional |
 | image_type | The image type to use for this node. Note that changing the image type will delete and recreate all nodes in the node pool | COS | Optional |
 | initial_node_count | The initial number of nodes for the pool. In regional or multi-zonal clusters, this is the number of nodes per zone. Changing this will force recreation of the resource. Defaults to the value of min_count | " " | Optional |
 | key | The key required for the taint | | Required |
diff --git a/modules/beta-private-cluster/cluster.tf b/modules/beta-private-cluster/cluster.tf
index b36e78ac7..54843f821 100644
--- a/modules/beta-private-cluster/cluster.tf
+++ b/modules/beta-private-cluster/cluster.tf
@@ -450,11 +450,13 @@ resource "google_container_node_pool" "pools" {
 
     guest_accelerator = [
       for guest_accelerator in lookup(each.value, "accelerator_count", 0) > 0 ? [{
-        type  = lookup(each.value, "accelerator_type", "")
-        count = lookup(each.value, "accelerator_count", 0)
+        type               = lookup(each.value, "accelerator_type", "")
+        count              = lookup(each.value, "accelerator_count", 0)
+        gpu_partition_size = lookup(each.value, "gpu_partition_size", null)
         }] : [] : {
-        type  = guest_accelerator["type"]
-        count = guest_accelerator["count"]
+        type               = guest_accelerator["type"]
+        count              = guest_accelerator["count"]
+        gpu_partition_size = guest_accelerator["gpu_partition_size"]
       }
     ]
 
diff --git a/modules/beta-public-cluster-update-variant/README.md b/modules/beta-public-cluster-update-variant/README.md
index 9e70d20fe..367be68aa 100644
--- a/modules/beta-public-cluster-update-variant/README.md
+++ b/modules/beta-public-cluster-update-variant/README.md
@@ -295,6 +295,7 @@ The node_pools variable takes the following parameters:
 | effect | Effect for the taint | | Required |
 | enable_integrity_monitoring | Enables monitoring and attestation of the boot integrity of the instance. The attestation is performed against the integrity policy baseline. This baseline is initially derived from the implicitly trusted boot image when the instance is created. | true | Optional |
 | enable_secure_boot | Secure Boot helps ensure that the system only runs authentic software by verifying the digital signature of all boot components, and halting the boot process if signature verification fails. | false | Optional |
+| gpu_partition_size | Size of partitions to create on the GPU | null | Optional |
 | image_type | The image type to use for this node. Note that changing the image type will delete and recreate all nodes in the node pool | COS | Optional |
 | initial_node_count | The initial number of nodes for the pool. In regional or multi-zonal clusters, this is the number of nodes per zone. Changing this will force recreation of the resource. Defaults to the value of min_count | " " | Optional |
 | key | The key required for the taint | | Required |
diff --git a/modules/beta-public-cluster-update-variant/cluster.tf b/modules/beta-public-cluster-update-variant/cluster.tf
index 4a8d0eb78..5286332ba 100644
--- a/modules/beta-public-cluster-update-variant/cluster.tf
+++ b/modules/beta-public-cluster-update-variant/cluster.tf
@@ -516,11 +516,13 @@ resource "google_container_node_pool" "pools" {
 
     guest_accelerator = [
       for guest_accelerator in lookup(each.value, "accelerator_count", 0) > 0 ? [{
-        type  = lookup(each.value, "accelerator_type", "")
-        count = lookup(each.value, "accelerator_count", 0)
+        type               = lookup(each.value, "accelerator_type", "")
+        count              = lookup(each.value, "accelerator_count", 0)
+        gpu_partition_size = lookup(each.value, "gpu_partition_size", null)
         }] : [] : {
-        type  = guest_accelerator["type"]
-        count = guest_accelerator["count"]
+        type               = guest_accelerator["type"]
+        count              = guest_accelerator["count"]
+        gpu_partition_size = guest_accelerator["gpu_partition_size"]
       }
     ]
 
diff --git a/modules/beta-public-cluster/README.md b/modules/beta-public-cluster/README.md
index 0c1306a58..6421c6eb9 100644
--- a/modules/beta-public-cluster/README.md
+++ b/modules/beta-public-cluster/README.md
@@ -273,6 +273,7 @@ The node_pools variable takes the following parameters:
 | effect | Effect for the taint | | Required |
 | enable_integrity_monitoring | Enables monitoring and attestation of the boot integrity of the instance. The attestation is performed against the integrity policy baseline. This baseline is initially derived from the implicitly trusted boot image when the instance is created. | true | Optional |
 | enable_secure_boot | Secure Boot helps ensure that the system only runs authentic software by verifying the digital signature of all boot components, and halting the boot process if signature verification fails. | false | Optional |
+| gpu_partition_size | Size of partitions to create on the GPU | null | Optional |
 | image_type | The image type to use for this node. Note that changing the image type will delete and recreate all nodes in the node pool | COS | Optional |
 | initial_node_count | The initial number of nodes for the pool. In regional or multi-zonal clusters, this is the number of nodes per zone. Changing this will force recreation of the resource. Defaults to the value of min_count | " " | Optional |
 | key | The key required for the taint | | Required |
diff --git a/modules/beta-public-cluster/cluster.tf b/modules/beta-public-cluster/cluster.tf
index fb19ff7e0..0768bdbbf 100644
--- a/modules/beta-public-cluster/cluster.tf
+++ b/modules/beta-public-cluster/cluster.tf
@@ -431,11 +431,13 @@ resource "google_container_node_pool" "pools" {
 
     guest_accelerator = [
       for guest_accelerator in lookup(each.value, "accelerator_count", 0) > 0 ? [{
-        type  = lookup(each.value, "accelerator_type", "")
-        count = lookup(each.value, "accelerator_count", 0)
+        type               = lookup(each.value, "accelerator_type", "")
+        count              = lookup(each.value, "accelerator_count", 0)
+        gpu_partition_size = lookup(each.value, "gpu_partition_size", null)
         }] : [] : {
-        type  = guest_accelerator["type"]
-        count = guest_accelerator["count"]
+        type               = guest_accelerator["type"]
+        count              = guest_accelerator["count"]
+        gpu_partition_size = guest_accelerator["gpu_partition_size"]
       }
     ]
 
diff --git a/modules/private-cluster-update-variant/README.md b/modules/private-cluster-update-variant/README.md
index 80f4ad988..580e89fb9 100644
--- a/modules/private-cluster-update-variant/README.md
+++ b/modules/private-cluster-update-variant/README.md
@@ -270,6 +270,7 @@ The node_pools variable takes the following parameters:
 | effect | Effect for the taint | | Required |
 | enable_integrity_monitoring | Enables monitoring and attestation of the boot integrity of the instance. The attestation is performed against the integrity policy baseline. This baseline is initially derived from the implicitly trusted boot image when the instance is created. | true | Optional |
 | enable_secure_boot | Secure Boot helps ensure that the system only runs authentic software by verifying the digital signature of all boot components, and halting the boot process if signature verification fails. | false | Optional |
+| gpu_partition_size | Size of partitions to create on the GPU | null | Optional |
 | image_type | The image type to use for this node. Note that changing the image type will delete and recreate all nodes in the node pool | COS | Optional |
 | initial_node_count | The initial number of nodes for the pool. In regional or multi-zonal clusters, this is the number of nodes per zone. Changing this will force recreation of the resource. Defaults to the value of min_count | " " | Optional |
 | key | The key required for the taint | | Required |
diff --git a/modules/private-cluster-update-variant/cluster.tf b/modules/private-cluster-update-variant/cluster.tf
index 8852b0139..96d7736a8 100644
--- a/modules/private-cluster-update-variant/cluster.tf
+++ b/modules/private-cluster-update-variant/cluster.tf
@@ -406,11 +406,13 @@ resource "google_container_node_pool" "pools" {
 
     guest_accelerator = [
       for guest_accelerator in lookup(each.value, "accelerator_count", 0) > 0 ? [{
-        type  = lookup(each.value, "accelerator_type", "")
-        count = lookup(each.value, "accelerator_count", 0)
+        type               = lookup(each.value, "accelerator_type", "")
+        count              = lookup(each.value, "accelerator_count", 0)
+        gpu_partition_size = lookup(each.value, "gpu_partition_size", null)
         }] : [] : {
-        type  = guest_accelerator["type"]
-        count = guest_accelerator["count"]
+        type               = guest_accelerator["type"]
+        count              = guest_accelerator["count"]
+        gpu_partition_size = guest_accelerator["gpu_partition_size"]
       }
     ]
 
diff --git a/modules/private-cluster/README.md b/modules/private-cluster/README.md
index 4ac06b875..63ce736b4 100644
--- a/modules/private-cluster/README.md
+++ b/modules/private-cluster/README.md
@@ -248,6 +248,7 @@ The node_pools variable takes the following parameters:
 | effect | Effect for the taint | | Required |
 | enable_integrity_monitoring | Enables monitoring and attestation of the boot integrity of the instance. The attestation is performed against the integrity policy baseline. This baseline is initially derived from the implicitly trusted boot image when the instance is created. | true | Optional |
 | enable_secure_boot | Secure Boot helps ensure that the system only runs authentic software by verifying the digital signature of all boot components, and halting the boot process if signature verification fails. | false | Optional |
+| gpu_partition_size | Size of partitions to create on the GPU | null | Optional |
 | image_type | The image type to use for this node. Note that changing the image type will delete and recreate all nodes in the node pool | COS | Optional |
 | initial_node_count | The initial number of nodes for the pool. In regional or multi-zonal clusters, this is the number of nodes per zone. Changing this will force recreation of the resource. Defaults to the value of min_count | " " | Optional |
 | key | The key required for the taint | | Required |
diff --git a/modules/private-cluster/cluster.tf b/modules/private-cluster/cluster.tf
index 308bff0bc..5c17d614b 100644
--- a/modules/private-cluster/cluster.tf
+++ b/modules/private-cluster/cluster.tf
@@ -321,11 +321,13 @@ resource "google_container_node_pool" "pools" {
 
     guest_accelerator = [
       for guest_accelerator in lookup(each.value, "accelerator_count", 0) > 0 ? [{
-        type  = lookup(each.value, "accelerator_type", "")
-        count = lookup(each.value, "accelerator_count", 0)
+        type               = lookup(each.value, "accelerator_type", "")
+        count              = lookup(each.value, "accelerator_count", 0)
+        gpu_partition_size = lookup(each.value, "gpu_partition_size", null)
         }] : [] : {
-        type  = guest_accelerator["type"]
-        count = guest_accelerator["count"]
+        type               = guest_accelerator["type"]
+        count              = guest_accelerator["count"]
+        gpu_partition_size = guest_accelerator["gpu_partition_size"]
       }
     ]
 
diff --git a/test/integration/node_pool/controls/gcloud.rb b/test/integration/node_pool/controls/gcloud.rb
index fe3f88ce4..8858350a8 100644
--- a/test/integration/node_pool/controls/gcloud.rb
+++ b/test/integration/node_pool/controls/gcloud.rb
@@ -17,7 +17,7 @@
 cluster_name = attribute('cluster_name')
 
 expected_accelerators_count = "1"
-expected_accelerators_type = "nvidia-tesla-p4"
+expected_accelerators_type = "nvidia-tesla-a100"
 
 control "gcloud" do
   title "Google Compute Engine GKE configuration"
@@ -207,7 +207,7 @@
             including(
               "name" => "pool-02",
               "config" => including(
-                "machineType" => "n1-standard-2",
+                "machineType" => "a2-highgpu-1g",
               ),
             )
           )
@@ -252,7 +252,8 @@
               "name" => "pool-02",
               "config" => including(
                 "accelerators" => [{"acceleratorCount" => expected_accelerators_count,
-                                    "acceleratorType" => expected_accelerators_type}],
+                                    "acceleratorType" => expected_accelerators_type,
+                                    "gpuPartitionSize" => "1g.5gb"}],
               ),
             )
           )