Merge pull request #3476 from chengcongdu/develop
add GKE support for managed hyperdisk
chengcongdu authored Jan 3, 2025
2 parents 2d068ab + 7605aaa commit 37b4b96
Showing 14 changed files with 505 additions and 3 deletions.
24 changes: 24 additions & 0 deletions examples/README.md
@@ -1518,6 +1518,30 @@ cleaned up when the job is deleted.

[storage-gke.yaml]: ../examples/storage-gke.yaml

### [gke-managed-hyperdisk.yaml] ![core-badge] ![experimental-badge]

This blueprint shows how to use managed Hyperdisk storage options with GKE in the toolkit.

The blueprint contains the following:

* A K8s Job that uses a managed Hyperdisk volume for storage.
* A K8s Job that demonstrates an ML training workload reading from and writing to the managed Hyperdisk volume.

> **Warning**: In this example blueprint, when storage type `Hyperdisk-balanced`, `Hyperdisk-extreme`, or `Hyperdisk-throughput` is specified in the `gke-storage` module, the lifecycle of the Hyperdisk is managed by the blueprint:
> on a `gcluster destroy` operation, the Hyperdisk storage created by the blueprint is also destroyed.

> [!NOTE]
> The Kubernetes API server will only allow requests from authorized networks.
> The `gke-cluster` module needs access to the Kubernetes API server
> to create a Persistent Volume and a Persistent Volume Claim. **You must use
> the `authorized_cidr` variable to supply an authorized network which contains
> the IP address of the machine deploying the blueprint, for example
> `--vars authorized_cidr=<your-ip-address>/32`.** You can use a service like
> [whatismyip.com](https://whatismyip.com) to determine your IP address.
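
For reference, this is how the blueprint requests a Hyperdisk Balanced volume through the `gke-storage` module; the throughput and extreme variants differ only in `storage_type`, capacity, and node requirements (see [gke-managed-hyperdisk.yaml] for the full blueprint):

```yaml
  - id: hyperdisk-balanced-setup
    source: modules/file-system/gke-storage
    use: [gke_cluster]
    settings:
      storage_type: Hyperdisk-balanced
      access_mode: ReadWriteOnce
      sc_volume_binding_mode: Immediate
      sc_reclaim_policy: Delete        # the PV is removed along with the PVC; see the warning above
      sc_topology_zones: [$(vars.zone)]
      pvc_count: 1
      capacity_gb: 100
```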

[gke-managed-hyperdisk.yaml]: ../examples/gke-managed-hyperdisk.yaml

### [gke-managed-parallelstore.yaml] ![core-badge] ![experimental-badge]

This blueprint shows how to use managed Parallelstore storage options with GKE in the toolkit.
225 changes: 225 additions & 0 deletions examples/gke-managed-hyperdisk.yaml
@@ -0,0 +1,225 @@
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
---
blueprint_name: gke-managed-hyperdisk
vars:
project_id: ## Set GCP Project ID Here ##
deployment_name: gke-managed-hyperdisk
region: us-central1
zone: us-central1-c

  # CIDR block containing the IP of the machine calling terraform.
# The following line must be updated for this example to work.
authorized_cidr: <your-ip-address>/32

deployment_groups:
- group: primary
modules:
- id: network
source: modules/network/vpc
settings:
subnetwork_name: gke-subnet-hyperdisk
secondary_ranges:
gke-subnet-hyperdisk:
- range_name: pods
ip_cidr_range: 10.4.0.0/14
- range_name: services
ip_cidr_range: 10.0.32.0/20

- id: gke_cluster
source: modules/scheduler/gke-cluster
use: [network]
settings:
release_channel: RAPID
enable_persistent_disk_csi: true # enable Hyperdisk for the cluster
configure_workload_identity_sa: true
enable_private_endpoint: false # Allows for access from authorized public IPs
master_authorized_networks:
- display_name: deployment-machine
cidr_block: $(vars.authorized_cidr)
maintenance_exclusions:
- name: no-minor-or-node-upgrades-indefinite
start_time: "2024-12-01T00:00:00Z"
end_time: "2025-12-22T00:00:00Z"
exclusion_scope: NO_MINOR_OR_NODE_UPGRADES
outputs: [instructions]

### Set up storage class and persistent volume claim for Hyperdisk ###
- id: hyperdisk-balanced-setup
source: modules/file-system/gke-storage
use: [gke_cluster]
settings:
storage_type: Hyperdisk-balanced
access_mode: ReadWriteOnce
sc_volume_binding_mode: Immediate
sc_reclaim_policy: Delete
sc_topology_zones: [$(vars.zone)]
pvc_count: 1
capacity_gb: 100

- id: hyperdisk-throughput-setup
source: modules/file-system/gke-storage
use: [gke_cluster]
settings:
storage_type: Hyperdisk-throughput
access_mode: ReadWriteOnce
sc_volume_binding_mode: Immediate
sc_reclaim_policy: Delete
sc_topology_zones: [$(vars.zone)]
pvc_count: 1
capacity_gb: 5000

- id: hyperdisk-extreme-setup
source: modules/file-system/gke-storage
use: [gke_cluster]
settings:
storage_type: Hyperdisk-extreme
access_mode: ReadWriteOnce
sc_volume_binding_mode: Immediate
sc_reclaim_policy: Delete
sc_topology_zones: [$(vars.zone)]
pvc_count: 1
capacity_gb: 100

- id: sample-pool
source: modules/compute/gke-node-pool
use: [gke_cluster]
settings:
name: sample-pool
zones: [$(vars.zone)]
      machine_type: c3-standard-88 # Hyperdisk-extreme requires a C3 machine with 88 or more vCPUs
auto_upgrade: true

# Train a TensorFlow model with Keras and Hyperdisk Balanced on GKE
# Tutorial: https://cloud.google.com/parallelstore/docs/tensorflow-sample
- id: hyperdisk-balanced-job
source: modules/compute/gke-job-template
use:
- gke_cluster
- hyperdisk-balanced-setup
settings:
name: tensorflow
image: jupyter/tensorflow-notebook@sha256:173f124f638efe870bb2b535e01a76a80a95217e66ed00751058c51c09d6d85d
      security_context: # ensure the job has the permissions it needs to run and to read/write the Hyperdisk volume
- key: runAsUser
value: 1000
- key: runAsGroup
value: 100
- key: fsGroup
value: 100
command:
- bash
- -c
- |
pip install transformers datasets
python - <<EOF
from datasets import load_dataset
dataset = load_dataset("glue", "cola", cache_dir='/data/hyperdisk-balanced-pvc-0')
dataset = dataset["train"]
from transformers import AutoTokenizer
import numpy as np
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
tokenized_data = tokenizer(dataset["sentence"], return_tensors="np", padding=True)
tokenized_data = dict(tokenized_data)
labels = np.array(dataset["label"])
from transformers import TFAutoModelForSequenceClassification
from tensorflow.keras.optimizers import Adam
model = TFAutoModelForSequenceClassification.from_pretrained("bert-base-cased")
model.compile(optimizer=Adam(3e-5))
model.fit(tokenized_data, labels)
EOF
node_count: 1
outputs: [instructions]

# Train a TensorFlow model with Keras and Hyperdisk Extreme on GKE
# Tutorial: https://cloud.google.com/parallelstore/docs/tensorflow-sample
- id: hyperdisk-extreme-job
source: modules/compute/gke-job-template
use:
- gke_cluster
- hyperdisk-extreme-setup
settings:
name: tensorflow
image: jupyter/tensorflow-notebook@sha256:173f124f638efe870bb2b535e01a76a80a95217e66ed00751058c51c09d6d85d
      security_context: # ensure the job has the permissions it needs to run and to read/write the Hyperdisk volume
- key: runAsUser
value: 1000
- key: runAsGroup
value: 100
- key: fsGroup
value: 100
command:
- bash
- -c
- |
pip install transformers datasets
python - <<EOF
from datasets import load_dataset
dataset = load_dataset("glue", "cola", cache_dir='/data/hyperdisk-extreme-pvc-0')
dataset = dataset["train"]
from transformers import AutoTokenizer
import numpy as np
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
tokenized_data = tokenizer(dataset["sentence"], return_tensors="np", padding=True)
tokenized_data = dict(tokenized_data)
labels = np.array(dataset["label"])
from transformers import TFAutoModelForSequenceClassification
from tensorflow.keras.optimizers import Adam
model = TFAutoModelForSequenceClassification.from_pretrained("bert-base-cased")
model.compile(optimizer=Adam(3e-5))
model.fit(tokenized_data, labels)
EOF
node_count: 1
outputs: [instructions]

# Train a TensorFlow model with Keras and Hyperdisk Throughput on GKE
# Tutorial: https://cloud.google.com/parallelstore/docs/tensorflow-sample
- id: hyperdisk-throughput-job
source: modules/compute/gke-job-template
use:
- gke_cluster
- hyperdisk-throughput-setup
settings:
name: tensorflow
image: jupyter/tensorflow-notebook@sha256:173f124f638efe870bb2b535e01a76a80a95217e66ed00751058c51c09d6d85d
      security_context: # ensure the job has the permissions it needs to run and to read/write the Hyperdisk volume
- key: runAsUser
value: 1000
- key: runAsGroup
value: 100
- key: fsGroup
value: 100
command:
- bash
- -c
- |
pip install transformers datasets
python - <<EOF
from datasets import load_dataset
dataset = load_dataset("glue", "cola", cache_dir='/data/hyperdisk-throughput-pvc-0')
dataset = dataset["train"]
from transformers import AutoTokenizer
import numpy as np
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
tokenized_data = tokenizer(dataset["sentence"], return_tensors="np", padding=True)
tokenized_data = dict(tokenized_data)
labels = np.array(dataset["label"])
from transformers import TFAutoModelForSequenceClassification
from tensorflow.keras.optimizers import Adam
model = TFAutoModelForSequenceClassification.from_pretrained("bert-base-cased")
model.compile(optimizer=Adam(3e-5))
model.fit(tokenized_data, labels)
EOF
node_count: 1
outputs: [instructions]
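
The `cache_dir` paths in the training scripts (for example `/data/hyperdisk-balanced-pvc-0`) indicate that `gke-job-template` mounts each PVC created by `gke-storage` at `/data/<pvc-name>`. As a rough sketch (illustrative, not the module's exact generated manifest), the resulting Job wiring looks something like:

```yaml
# Illustrative sketch of the generated Job's volume wiring; not the module's exact output.
apiVersion: batch/v1
kind: Job
metadata:
  name: tensorflow
spec:
  template:
    spec:
      restartPolicy: Never
      securityContext:                 # from the blueprint's security_context settings
        runAsUser: 1000
        runAsGroup: 100
        fsGroup: 100
      containers:
        - name: tensorflow
          image: jupyter/tensorflow-notebook@sha256:173f124f638efe870bb2b535e01a76a80a95217e66ed00751058c51c09d6d85d
          volumeMounts:
            - name: hyperdisk-balanced-pvc-0
              mountPath: /data/hyperdisk-balanced-pvc-0   # matches cache_dir in the script
      volumes:
        - name: hyperdisk-balanced-pvc-0
          persistentVolumeClaim:
            claimName: hyperdisk-balanced-pvc-0
```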
7 changes: 7 additions & 0 deletions examples/gke-managed-parallelstore.yaml
@@ -63,13 +63,19 @@ deployment_groups:
source: modules/scheduler/gke-cluster
use: [network]
settings:
release_channel: RAPID
enable_parallelstore_csi: true # enable Parallelstore for the cluster
configure_workload_identity_sa: true
enable_private_endpoint: false # Allows for access from authorized public IPs
gcp_public_cidrs_access_enabled: $(vars.gcp_public_cidrs_access_enabled)
master_authorized_networks:
- display_name: deployment-machine
cidr_block: $(vars.authorized_cidr)
maintenance_exclusions:
- name: no-minor-or-node-upgrades-indefinite
start_time: "2024-12-01T00:00:00Z"
end_time: "2025-12-22T00:00:00Z"
exclusion_scope: NO_MINOR_OR_NODE_UPGRADES
outputs: [instructions]

### Set up storage class and persistent volume claim for Parallelstore ###
@@ -92,6 +98,7 @@ deployment_groups:
name: sample-pool
zones: [$(vars.zone)]
machine_type: n2-standard-16
auto_upgrade: true

# Train a TensorFlow model with Keras and Parallelstore on GKE
# Tutorial: https://cloud.google.com/parallelstore/docs/tensorflow-sample
2 changes: 1 addition & 1 deletion modules/file-system/gke-storage/README.md
@@ -118,7 +118,7 @@ No resources.
| <a name="input_sc_reclaim_policy"></a> [sc\_reclaim\_policy](#input\_sc\_reclaim\_policy) | Indicates whether to keep the dynamically provisioned PersistentVolumes of this storage class after the bound PersistentVolumeClaim is deleted.<br/>[More details about reclaiming](https://kubernetes.io/docs/concepts/storage/persistent-volumes/#reclaiming)<br/>Supported values:<br/>- Retain<br/>- Delete | `string` | n/a | yes |
| <a name="input_sc_topology_zones"></a> [sc\_topology\_zones](#input\_sc\_topology\_zones) | Zones that allow the volumes to be dynamically provisioned. | `list(string)` | `null` | no |
| <a name="input_sc_volume_binding_mode"></a> [sc\_volume\_binding\_mode](#input\_sc\_volume\_binding\_mode) | Indicates when volume binding and dynamic provisioning should occur and how PersistentVolumeClaims should be provisioned and bound.<br/>Supported values:<br/>- Immediate<br/>- WaitForFirstConsumer | `string` | `"WaitForFirstConsumer"` | no |
| <a name="input_storage_type"></a> [storage\_type](#input\_storage\_type) | The type of [GKE supported storage options](https://cloud.google.com/kubernetes-engine/docs/concepts/storage-overview)<br/>to use. This module currently supports dynamic provisioning for the following storage options:<br/>- Parallelstore | `string` | n/a | yes |
| <a name="input_storage_type"></a> [storage\_type](#input\_storage\_type) | The type of [GKE supported storage options](https://cloud.google.com/kubernetes-engine/docs/concepts/storage-overview)<br/>to use. This module currently supports dynamic provisioning for the following storage options:<br/>- Parallelstore<br/>- Hyperdisk-balanced<br/>- Hyperdisk-throughput<br/>- Hyperdisk-extreme | `string` | n/a | yes |
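
As a minimal sketch of how these inputs combine in a blueprint (values taken from the gke-managed-hyperdisk example in this change; ids are illustrative):

```yaml
  - id: hyperdisk-throughput-setup
    source: modules/file-system/gke-storage
    use: [gke_cluster]                     # a gke-cluster module defined elsewhere in the blueprint
    settings:
      storage_type: Hyperdisk-throughput   # one of the supported options listed above
      access_mode: ReadWriteOnce
      sc_volume_binding_mode: Immediate
      sc_reclaim_policy: Delete
      sc_topology_zones: [us-central1-c]
      pvc_count: 1
      capacity_gb: 5000
```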

## Outputs

@@ -0,0 +1,15 @@
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: ${pvc_name}
labels:
%{~ for key, val in labels ~}
${key}: ${val}
%{~ endfor ~}
spec:
accessModes:
- ${access_mode}
resources:
requests:
storage: ${capacity}
storageClassName: ${storage_class_name}
@@ -0,0 +1,15 @@
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: ${pvc_name}
labels:
%{~ for key, val in labels ~}
${key}: ${val}
%{~ endfor ~}
spec:
accessModes:
- ${access_mode}
resources:
requests:
storage: ${capacity}
storageClassName: ${storage_class_name}
@@ -0,0 +1,15 @@
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: ${pvc_name}
labels:
%{~ for key, val in labels ~}
${key}: ${val}
%{~ endfor ~}
spec:
accessModes:
- ${access_mode}
resources:
requests:
storage: ${capacity}
storageClassName: ${storage_class_name}
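
Rendered with the values used by the example blueprint (PVC name `hyperdisk-balanced-pvc-0`, `ReadWriteOnce` access, `capacity_gb: 100`) and an illustrative storage class name and label, this PVC template produces a claim along these lines:

```yaml
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: hyperdisk-balanced-pvc-0
  labels:
    ghpc_module: gke-storage              # illustrative label
spec:
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: 100Gi                      # from capacity_gb: 100 (assuming Gi units)
  storageClassName: hyperdisk-balanced    # illustrative name
```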
@@ -0,0 +1,25 @@
apiVersion: storage.k8s.io/v1
kind: StorageClass
metadata:
name: ${name}
labels:
%{~ for key, val in labels ~}
${key}: ${val}
%{~ endfor ~}
provisioner: pd.csi.storage.gke.io
allowVolumeExpansion: true
parameters:
type: hyperdisk-balanced
provisioned-throughput-on-create: "250Mi"
provisioned-iops-on-create: "7000"
volumeBindingMode: ${volume_binding_mode}
reclaimPolicy: ${reclaim_policy}
%{~ if topology_zones != null ~}
allowedTopologies:
- matchLabelExpressions:
- key: topology.gke.io/zone
values:
%{~ for z in topology_zones ~}
- ${z}
%{~ endfor ~}
%{~ endif ~}
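
With the settings from the `hyperdisk-balanced-setup` module above (`Immediate` binding, `Delete` reclaim policy, zone `us-central1-c`; the name and label are illustrative), this StorageClass template renders to roughly:

```yaml
apiVersion: storage.k8s.io/v1
kind: StorageClass
metadata:
  name: hyperdisk-balanced                # illustrative name
  labels:
    ghpc_module: gke-storage              # illustrative label
provisioner: pd.csi.storage.gke.io
allowVolumeExpansion: true
parameters:
  type: hyperdisk-balanced
  provisioned-throughput-on-create: "250Mi"
  provisioned-iops-on-create: "7000"
volumeBindingMode: Immediate
reclaimPolicy: Delete
allowedTopologies:
  - matchLabelExpressions:
      - key: topology.gke.io/zone
        values:
          - us-central1-c
```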
@@ -0,0 +1,24 @@
apiVersion: storage.k8s.io/v1
kind: StorageClass
metadata:
name: ${name}
labels:
%{~ for key, val in labels ~}
    ${key}: ${val}
  %{~ endfor ~}
provisioner: pd.csi.storage.gke.io
allowVolumeExpansion: true
parameters:
type: hyperdisk-extreme
provisioned-iops-on-create: "50000"
volumeBindingMode: ${volume_binding_mode}
reclaimPolicy: ${reclaim_policy}
%{~ if topology_zones != null ~}
allowedTopologies:
- matchLabelExpressions:
- key: topology.gke.io/zone
values:
%{~ for z in topology_zones ~}
- ${z}
%{~ endfor ~}
%{~ endif ~}