From 6f9535aa4c7694bab09ccb35ce3df8209167cba9 Mon Sep 17 00:00:00 2001 From: Kellen Swain Date: Mon, 29 Apr 2024 22:28:05 +0000 Subject: [PATCH 01/12] Creating User for models that support gRPC requests. (Currently bespoke for JetStream) --- .../locust-docker/Dockerfile | 6 +- .../locust-tasks/requirements.txt | 3 + .../locust-docker/locust-tasks/run.sh | 8 +- .../locust-docker/locust-tasks/tasks.py | 135 ++++++++++++++---- .../locust-master-controller.yaml.tpl | 4 +- .../locust-worker-controller.yaml.tpl | 10 +- .../tools/locust-load-inference/variables.tf | 11 ++ 7 files changed, 141 insertions(+), 36 deletions(-) diff --git a/benchmarks/benchmark/tools/locust-load-inference/locust-docker/Dockerfile b/benchmarks/benchmark/tools/locust-load-inference/locust-docker/Dockerfile index e74754a1f..00aabacda 100644 --- a/benchmarks/benchmark/tools/locust-load-inference/locust-docker/Dockerfile +++ b/benchmarks/benchmark/tools/locust-load-inference/locust-docker/Dockerfile @@ -13,12 +13,14 @@ # limitations under the License. -# Start with a base image Python 3.9.12 Debian 11 (bullseye) slim -FROM python:3.9.12-slim-bullseye +# Start with a base image Python 3.10.14 Debian 11 (bullseye) slim +FROM python:3.10.14-slim-bullseye # Add the external tasks directory into /tasks ADD locust-tasks /locust-tasks +RUN pip install --upgrade pip + # Install the required dependencies via pip RUN pip install -r /locust-tasks/requirements.txt diff --git a/benchmarks/benchmark/tools/locust-load-inference/locust-docker/locust-tasks/requirements.txt b/benchmarks/benchmark/tools/locust-load-inference/locust-docker/locust-tasks/requirements.txt index 7d1781dc6..50be80c2c 100644 --- a/benchmarks/benchmark/tools/locust-load-inference/locust-docker/locust-tasks/requirements.txt +++ b/benchmarks/benchmark/tools/locust-load-inference/locust-docker/locust-tasks/requirements.txt @@ -32,3 +32,6 @@ zipp==3.8.0 zope.event==4.5.0 zope.interface==5.4.0 TensorFlow >= 2.0 +google-jetstream==0.2.0 +grpcio==1.62.2 +grpc-interceptor==0.15.4 diff --git a/benchmarks/benchmark/tools/locust-load-inference/locust-docker/locust-tasks/run.sh b/benchmarks/benchmark/tools/locust-load-inference/locust-docker/locust-tasks/run.sh index b607e5bf6..23b21649c 100644 --- a/benchmarks/benchmark/tools/locust-load-inference/locust-docker/locust-tasks/run.sh +++ b/benchmarks/benchmark/tools/locust-load-inference/locust-docker/locust-tasks/run.sh @@ -15,9 +15,15 @@ # limitations under the License. LOCUST="/usr/local/bin/locust" -LOCUST_OPTS="-f /locust-tasks/tasks.py --host=$TARGET_HOST" +LOCUST_OPTS="-f /locust-tasks/tasks.py " LOCUST_MODE=${LOCUST_MODE:-standalone} +if [[ "$REQUEST_TYPE" = "grpc" ]]; then + LOCUST_OPTS="$LOCUST_OPTS JetStreamUser --host=$TARGET_HOST" +else + LOCUST_OPTS="$LOCUST_OPTS BenchmarkUser --host='http://$TARGET_HOST" +fi + if [[ "$LOCUST_MODE" = "master" ]]; then # Locust stop-timeout default is 0s. Only used in distributed mode. # Master will wait $stop-timout amount of time for the User to complete it's task. 
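For reference, a rough sketch of the invocations this REQUEST_TYPE switch is intended to produce (before the master/worker-specific flags are appended later in run.sh); `$TARGET_HOST` is supplied by the controller manifests:

```sh
# REQUEST_TYPE=grpc: drive the gRPC user class against the raw service address
/usr/local/bin/locust -f /locust-tasks/tasks.py JetStreamUser --host=$TARGET_HOST

# any other REQUEST_TYPE: fall back to the HTTP user class with an http:// prefix
/usr/local/bin/locust -f /locust-tasks/tasks.py BenchmarkUser --host=http://$TARGET_HOST
```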
diff --git a/benchmarks/benchmark/tools/locust-load-inference/locust-docker/locust-tasks/tasks.py b/benchmarks/benchmark/tools/locust-load-inference/locust-docker/locust-tasks/tasks.py index b1e5a86a6..3992fb093 100644 --- a/benchmarks/benchmark/tools/locust-load-inference/locust-docker/locust-tasks/tasks.py +++ b/benchmarks/benchmark/tools/locust-load-inference/locust-docker/locust-tasks/tasks.py @@ -22,16 +22,24 @@ import time from locust import web # Import the web module from Locust from typing import Callable, List -from locust import FastHttpUser, task, events +from locust import FastHttpUser, task, events, User from locust.runners import MasterRunner from transformers import AutoTokenizer, PreTrainedTokenizerBase +from locust.exception import LocustError +from jetstream.core.proto import jetstream_pb2 +from jetstream.core.proto import jetstream_pb2_grpc +from typing import Any, Callable +import grpc +import grpc.experimental.gevent as grpc_gevent +from grpc_interceptor import ClientInterceptor + from custom_metric_aggregator import TokenMetricCollector local_metric_collector = TokenMetricCollector() logging.basicConfig(level=logging.INFO) - +grpc_gevent.init_gevent() def load_test_prompts(): """Loads test prompts from a local file location.""" @@ -128,8 +136,6 @@ def get_token_count(prompt, resp): tokenizer.encode(resp_dict['text_output'])) elif backend == "sax": number_of_output_tokens = 0 # to be added - elif backend == "jetstream": - number_of_output_tokens = 0 else: raise ValueError(f"Unknown backend: {backend}") return number_of_input_tokens, number_of_output_tokens @@ -170,42 +176,42 @@ def lm_generate(self): f"Failed request with invalid response code: {resp.status_code}. Due to requests.RequestException thrown by Session, caused by connection errors, timeouts or similar. 
Try increasing connection_timeout") self.handle_failed_response(request, resp) - def handle_successful_response(self, prompt, reponse, start_time): - global model_params - test_time = time.time() - start_time - request_successful_bool = 1 - tokens_sent, tokens_received = get_token_count(prompt, reponse) +def handle_successful_response(prompt, reponse, start_time): + global model_params + test_time = time.time() - start_time + request_successful_bool = 1 + tokens_sent, tokens_received = get_token_count(prompt, reponse) - local_metric_collector.add_metric( - tokens_sent, tokens_received, test_time, request_successful_bool) - logging.info( - f'sending to master: metric_update: {[tokens_sent, tokens_received, test_time, request_successful_bool]}') + send_metrics(tokens_sent, tokens_received, test_time, request_successful_bool) - def handle_failed_response(self, request, response): - global model_params - response.failure("Got unexpected response") - logging.error(f"request {request} failed with: {response.status_code}") - tokens_sent = -1 - tokens_received = -1 - test_time = -1 - request_successful_bool = 0 - - local_metric_collector.add_metric( - tokens_sent, tokens_received, test_time, request_successful_bool) - logging.info( - f'sending to master: metric_update: {[tokens_sent, tokens_received, test_time, request_successful_bool]}') +def handle_failed_response(request, response): + global model_params + response.failure("Got unexpected response") + logging.error(f"request {request} failed with: {response.status_code}") + tokens_sent = -1 + tokens_received = -1 + test_time = -1 + request_successful_bool = 0 + send_metrics(tokens_sent, tokens_received, test_time, request_successful_bool) + +def send_metrics( tokens_sent, tokens_received, test_time, request_successful_bool): + local_metric_collector.add_metric( + tokens_sent, tokens_received, test_time, request_successful_bool) + logging.info( + f'sending to master: metric_update: {[tokens_sent, tokens_received, test_time, request_successful_bool]}') @events.test_stop.add_listener def on_test_stop(environment, **kwargs): """on test stop the locust master resets metric collector""" if isinstance(environment.runner, MasterRunner): + logging.info(f'dumping metrics before clear: {local_metric_collector.json_dump_report()}') logging.info(f'init metric_collector') local_metric_collector.__init__() """ -Methods for collecting custom metrics to share to master webui +Methods for collecting custom metrics to share to master web ui """ @events.report_to_master.add_listener @@ -300,3 +306,78 @@ def total_content_length(): Add a route to the Locust web app, where we can see the total content-length """ return local_metric_collector.json_dump_report() + +class GrpcUser(User): + abstract = True + stub_class = None + + def __init__(self, environment): + super().__init__(environment) + for attr_value, attr_name in ((self.host, "host"), (self.stub_class, "stub_class")): + if attr_value is None: + raise LocustError(f"You must specify the {attr_name}.") + + self._channel = grpc.insecure_channel(self.host) + interceptor = LocustInterceptor(environment=environment) + self._channel = grpc.intercept_channel(self._channel, interceptor) + + self.stub = self.stub_class(self._channel) + +class JetStreamUser(GrpcUser): + stub_class = jetstream_pb2_grpc.OrchestratorStub + + @task + def jetstream_grpc_infer(self): + prompt = test_data[random.randrange(0, len(test_data))] + request = jetstream_pb2.DecodeRequest( + additional_text=prompt, + priority=0, + 
max_tokens=model_params["max_output_len"], + ) + logging.info(f"Prompt: {prompt}") + #return values format is from the interceptor, which makes the actual call + output, ttft, response_time = self.stub.Decode(request) + logging.info(f"Response: {output}") + + number_of_input_tokens = len(tokenizer.encode(prompt)) + number_of_output_tokens = len(tokenizer.encode(output)) + send_metrics(number_of_input_tokens, number_of_output_tokens, response_time,1) + + +class LocustInterceptor(ClientInterceptor): + def __init__(self, environment, *args, **kwargs): + super().__init__(*args, **kwargs) + + self.env = environment + + def intercept( + self, + method: Callable, + request_or_iterator: Any, + call_details: grpc.ClientCallDetails, + ): + response = None + exception = None + start_perf_counter = time.perf_counter() + response_length = 0 + responses = method(request_or_iterator, call_details) + output = "" + response_length = 0 + ttft = 0 + for response in responses: + if ttft == 0: + ttft = time.perf_counter() - start_perf_counter + output += response.response[0] + response_length += response.ByteSize() + response_time_ms = (time.perf_counter() - start_perf_counter) * 1000 + logging.info(f"response_time {response_time_ms}; ttft:{ttft * 1000}") + self.env.events.request.fire( + request_type="grpc", + name=call_details.method, + response_time=response_time_ms, + response_length=response_length, + response=response, + context=None, + exception=exception, + ) + return output, ttft, response_time_ms diff --git a/benchmarks/benchmark/tools/locust-load-inference/manifest-templates/locust-master-controller.yaml.tpl b/benchmarks/benchmark/tools/locust-load-inference/manifest-templates/locust-master-controller.yaml.tpl index 93bfb2f83..ff212130e 100644 --- a/benchmarks/benchmark/tools/locust-load-inference/manifest-templates/locust-master-controller.yaml.tpl +++ b/benchmarks/benchmark/tools/locust-load-inference/manifest-templates/locust-master-controller.yaml.tpl @@ -24,11 +24,13 @@ spec: - name: LOCUST_MODE value: master - name: TARGET_HOST - value: http://${inference_server_service} + value: ${inference_server_service} - name: BACKEND value: ${inference_server_framework} - name: STOP_TIMEOUT value: ${stop_timeout} + - name: REQUEST_TYPE + value: grpc ports: - name: loc-master-web containerPort: 8089 diff --git a/benchmarks/benchmark/tools/locust-load-inference/manifest-templates/locust-worker-controller.yaml.tpl b/benchmarks/benchmark/tools/locust-load-inference/manifest-templates/locust-worker-controller.yaml.tpl index 821126fdf..26bd2d5cd 100644 --- a/benchmarks/benchmark/tools/locust-load-inference/manifest-templates/locust-worker-controller.yaml.tpl +++ b/benchmarks/benchmark/tools/locust-load-inference/manifest-templates/locust-worker-controller.yaml.tpl @@ -23,10 +23,12 @@ spec: env: - name: LOCUST_MODE value: worker + - name: REQUEST_TYPE + value: grpc - name: LOCUST_MASTER value: locust-master - name: TARGET_HOST - value: http://${inference_server_service} + value: ${inference_server_service} - name: BACKEND value: ${inference_server_framework} - name: BEST_OF @@ -45,10 +47,8 @@ spec: value: ${tokenizer} - name: USE_BEAM_SEARCH value: ${use_beam_search} -%{ for hugging_face_token_secret in hugging_face_token_secret_list ~} - name: HUGGINGFACE_TOKEN valueFrom: secretKeyRef: - name: hf-token - key: HF_TOKEN -%{ endfor ~} + name: huggingface-secret + key: token diff --git a/benchmarks/benchmark/tools/locust-load-inference/variables.tf b/benchmarks/benchmark/tools/locust-load-inference/variables.tf 
index 6f104efea..4b1155d1d 100644 --- a/benchmarks/benchmark/tools/locust-load-inference/variables.tf +++ b/benchmarks/benchmark/tools/locust-load-inference/variables.tf @@ -210,3 +210,14 @@ variable "hugging_face_secret_version" { nullable = true default = null } + +variable "request_type" { + description = "The method of request used when calling the model server (http or grpc)" + type = string + nullable = true + default = "http" + validation { + condition = var.request_type == "http" || var.request_type == "grpc" + error_message = "The request_type must be 'http' or 'grpc'." + } +} From 78ccb55258dba650e4adea744700466fcc31ec80 Mon Sep 17 00:00:00 2001 From: Kellen Swain Date: Wed, 1 May 2024 17:00:30 +0000 Subject: [PATCH 02/12] genericizing grpc classes and functions --- .../locust-load-inference/locust-docker/locust-tasks/run.sh | 2 +- .../locust-load-inference/locust-docker/locust-tasks/tasks.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/benchmarks/benchmark/tools/locust-load-inference/locust-docker/locust-tasks/run.sh b/benchmarks/benchmark/tools/locust-load-inference/locust-docker/locust-tasks/run.sh index 23b21649c..3e1084d0b 100644 --- a/benchmarks/benchmark/tools/locust-load-inference/locust-docker/locust-tasks/run.sh +++ b/benchmarks/benchmark/tools/locust-load-inference/locust-docker/locust-tasks/run.sh @@ -19,7 +19,7 @@ LOCUST_OPTS="-f /locust-tasks/tasks.py " LOCUST_MODE=${LOCUST_MODE:-standalone} if [[ "$REQUEST_TYPE" = "grpc" ]]; then - LOCUST_OPTS="$LOCUST_OPTS JetStreamUser --host=$TARGET_HOST" + LOCUST_OPTS="$LOCUST_OPTS GrpcBenchmarkUser --host=$TARGET_HOST" else LOCUST_OPTS="$LOCUST_OPTS BenchmarkUser --host='http://$TARGET_HOST" fi diff --git a/benchmarks/benchmark/tools/locust-load-inference/locust-docker/locust-tasks/tasks.py b/benchmarks/benchmark/tools/locust-load-inference/locust-docker/locust-tasks/tasks.py index 3992fb093..86cb00995 100644 --- a/benchmarks/benchmark/tools/locust-load-inference/locust-docker/locust-tasks/tasks.py +++ b/benchmarks/benchmark/tools/locust-load-inference/locust-docker/locust-tasks/tasks.py @@ -323,11 +323,11 @@ def __init__(self, environment): self.stub = self.stub_class(self._channel) -class JetStreamUser(GrpcUser): +class GrpcBenchmarkUser(GrpcUser): stub_class = jetstream_pb2_grpc.OrchestratorStub @task - def jetstream_grpc_infer(self): + def grpc_infer(self): prompt = test_data[random.randrange(0, len(test_data))] request = jetstream_pb2.DecodeRequest( additional_text=prompt, From 099b0face1a2793ee00b9b5ece6eb4aac703715c Mon Sep 17 00:00:00 2001 From: Kellen Swain Date: Wed, 1 May 2024 17:53:07 +0000 Subject: [PATCH 03/12] Updating readme with additions/details --- benchmarks/benchmark/tools/locust-load-inference/README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/benchmarks/benchmark/tools/locust-load-inference/README.md b/benchmarks/benchmark/tools/locust-load-inference/README.md index e832373ae..171d5d7a8 100644 --- a/benchmarks/benchmark/tools/locust-load-inference/README.md +++ b/benchmarks/benchmark/tools/locust-load-inference/README.md @@ -113,6 +113,7 @@ Fill out your `terraform.tfvars` with the desired model and server configuration - `inference_server_service` - an accessible service name for inference workload to be benchmarked - `tokenizer` - must match the model running on the inference workload to be benchmarked - `inference_server_framework` - the inference workload framework +- `request_type` - **required if using gRPC** - `gcs_path` - gcs bucket where 
prompts to use during benchmark are stored - `ksa` - access for the `gcs_path` gcs bucket where prompts are stored. - `output_bucket` - gcs bucket to write benchmarking metrics to. @@ -249,7 +250,8 @@ To change the benchmark configuration, you will have to rerun terraform destroy | [best\_of](#input\_best\_of) | Benchmark server configuration for best of. | `number` | `1` | no | | [credentials\_config](#input\_credentials\_config) | Configure how Terraform authenticates to the cluster. |
<pre>object({<br>  fleet_host = optional(string)<br>  kubeconfig = optional(object({<br>    context = optional(string)<br>    path    = optional(string, "~/.kube/config")<br>  }))<br>})</pre>
| n/a | yes | | [gcs\_path](#input\_gcs\_path) | Benchmark server configuration for gcs\_path for downloading prompts. | `string` | n/a | yes | -| [inference\_server\_framework](#input\_inference\_server\_framework) | Benchmark server configuration for inference server framework. Can be one of: vllm, tgi, tensorrt\_llm\_triton, sax | `string` | `"tgi"` | yes | +| [inference\_server\_framework](#input\_inference\_server\_framework) | Benchmark server configuration for inference server framework. Can be one of: vllm, tgi, tensorrt\_llm\_triton, sax, or jetstream | `string` | `"tgi"` | yes | +| [request\_type](#input\_request\_type) | Protocol to use when making requests to the model server. Can be `grpc` or `http` | `string` | `"http"` | no | | [inference\_server\_ip](#input\_inference\_server\_ip) | Inference server ip address | `string` | n/a | yes | | [ksa](#input\_ksa) | Kubernetes Service Account used for workload. | `string` | `"default"` | no | | [locust\_runner\_kubernetes\_service\_account](#locust\_runner\_kubernetes\_service\_account) | "Kubernetes Service Account to be used for Locust runner tool. Must have storage.admin access to output_bucket" | `string` | `"sample-runner-ksa"` | no | From 282ba270a6f7165629a696d1c182c438c347e896 Mon Sep 17 00:00:00 2001 From: Kellen Swain Date: Wed, 1 May 2024 18:01:13 +0000 Subject: [PATCH 04/12] Addressing comments, more ReadMe adjustments --- benchmarks/benchmark/tools/locust-load-inference/README.md | 2 +- .../locust-docker/locust-tasks/tasks.py | 2 ++ .../manifest-templates/locust-worker-controller.yaml.tpl | 6 ++++-- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/benchmarks/benchmark/tools/locust-load-inference/README.md b/benchmarks/benchmark/tools/locust-load-inference/README.md index 171d5d7a8..58b6ae904 100644 --- a/benchmarks/benchmark/tools/locust-load-inference/README.md +++ b/benchmarks/benchmark/tools/locust-load-inference/README.md @@ -110,7 +110,7 @@ Fill out your `terraform.tfvars` with the desired model and server configuration - `credentials_config` - credentials for cluster to deploy Locust benchmark tool on - `project_id` - project id for enabling dependent services for building locust artifacts - `artifact_registry` - artifact registry to upload locust artifacts to -- `inference_server_service` - an accessible service name for inference workload to be benchmarked +- `inference_server_service` - an accessible service name for inference workload to be benchmarked **(Note: a non-80 port should be specified here. Example: `my-service-name:9000`) - `tokenizer` - must match the model running on the inference workload to be benchmarked - `inference_server_framework` - the inference workload framework - `request_type` - **required if using gRPC** diff --git a/benchmarks/benchmark/tools/locust-load-inference/locust-docker/locust-tasks/tasks.py b/benchmarks/benchmark/tools/locust-load-inference/locust-docker/locust-tasks/tasks.py index 86cb00995..049a48612 100644 --- a/benchmarks/benchmark/tools/locust-load-inference/locust-docker/locust-tasks/tasks.py +++ b/benchmarks/benchmark/tools/locust-load-inference/locust-docker/locust-tasks/tasks.py @@ -364,6 +364,8 @@ def intercept( output = "" response_length = 0 ttft = 0 + # Response is streamed and iterated over as it is received. The first + # chunk sent back is used to calculate time to first token(TTFT). 
for response in responses: if ttft == 0: ttft = time.perf_counter() - start_perf_counter diff --git a/benchmarks/benchmark/tools/locust-load-inference/manifest-templates/locust-worker-controller.yaml.tpl b/benchmarks/benchmark/tools/locust-load-inference/manifest-templates/locust-worker-controller.yaml.tpl index 26bd2d5cd..047b6c2ed 100644 --- a/benchmarks/benchmark/tools/locust-load-inference/manifest-templates/locust-worker-controller.yaml.tpl +++ b/benchmarks/benchmark/tools/locust-load-inference/manifest-templates/locust-worker-controller.yaml.tpl @@ -47,8 +47,10 @@ spec: value: ${tokenizer} - name: USE_BEAM_SEARCH value: ${use_beam_search} +%{ for hugging_face_token_secret in hugging_face_token_secret_list ~} - name: HUGGINGFACE_TOKEN valueFrom: secretKeyRef: - name: huggingface-secret - key: token + name: hf-token + key: HF_TOKEN +%{ endfor ~} \ No newline at end of file From 5fc6656b9009929ebcd683b84f0ec7b1d71affd6 Mon Sep 17 00:00:00 2001 From: Kellen Swain Date: Wed, 1 May 2024 20:18:08 +0000 Subject: [PATCH 05/12] Readme clarification --- benchmarks/benchmark/tools/locust-load-inference/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/benchmark/tools/locust-load-inference/README.md b/benchmarks/benchmark/tools/locust-load-inference/README.md index 58b6ae904..cbc3f585c 100644 --- a/benchmarks/benchmark/tools/locust-load-inference/README.md +++ b/benchmarks/benchmark/tools/locust-load-inference/README.md @@ -110,7 +110,7 @@ Fill out your `terraform.tfvars` with the desired model and server configuration - `credentials_config` - credentials for cluster to deploy Locust benchmark tool on - `project_id` - project id for enabling dependent services for building locust artifacts - `artifact_registry` - artifact registry to upload locust artifacts to -- `inference_server_service` - an accessible service name for inference workload to be benchmarked **(Note: a non-80 port should be specified here. Example: `my-service-name:9000`) +- `inference_server_service` - an accessible service name for inference workload to be benchmarked **(Note: If you are using a non-80 port for your model server service, it should be specified here. 
Example: `my-service-name:9000`)** - `tokenizer` - must match the model running on the inference workload to be benchmarked - `inference_server_framework` - the inference workload framework - `request_type` - **required if using gRPC** From b58d35f7eb1f98d47967fdf281f4f1ab08726333 Mon Sep 17 00:00:00 2001 From: Kellen Swain Date: Mon, 6 May 2024 14:51:12 +0000 Subject: [PATCH 06/12] Emitting avg ttft as a metric --- .../locust-custom-exporter/main.go | 13 ++++++++++++- .../locust-tasks/custom_metric_aggregator.py | 11 ++++++++--- .../locust-docker/locust-tasks/tasks.py | 13 +++++++------ .../locust-runner/metrics.yaml | 5 +++++ 4 files changed, 32 insertions(+), 10 deletions(-) diff --git a/benchmarks/benchmark/tools/locust-load-inference/locust-custom-exporter/main.go b/benchmarks/benchmark/tools/locust-load-inference/locust-custom-exporter/main.go index 51e7802fe..9b735ac4e 100644 --- a/benchmarks/benchmark/tools/locust-load-inference/locust-custom-exporter/main.go +++ b/benchmarks/benchmark/tools/locust-load-inference/locust-custom-exporter/main.go @@ -33,7 +33,8 @@ type Exporter struct { locustAvgTokensSent, locustAvgTokensReceived, locustAvgTestTime, - locustAvgOutputTokenLatency prometheus.Gauge + locustAvgOutputTokenLatency, + locustTimeToFirstToken prometheus.Gauge } // NewExporter function @@ -82,6 +83,13 @@ func NewExporter(uri string, timeout time.Duration) (*Exporter, error) { Name: "avg_output_token_latency", }, ), + locustTimeToFirstToken: prometheus.NewGauge( + prometheus.GaugeOpts{ + Namespace: namespace, + Subsystem: "custom_metrics", + Name: "avg_time_to_first_token", + }, + ), }, nil } @@ -94,6 +102,7 @@ func (e *Exporter) Describe(ch chan<- *prometheus.Desc) { ch <- e.locustAvgTokensReceived.Desc() ch <- e.locustAvgTestTime.Desc() ch <- e.locustAvgOutputTokenLatency.Desc() + ch <- e.locustTimeToFirstToken.Desc() } // Collect function of Exporter @@ -121,6 +130,7 @@ func (e *Exporter) Collect(ch chan<- prometheus.Metric) { ch <- prometheus.MustNewConstMetric(e.locustAvgTokensReceived.Desc(), prometheus.GaugeValue, float64(locustStats.AvgTokensReceived)) ch <- prometheus.MustNewConstMetric(e.locustAvgTestTime.Desc(), prometheus.GaugeValue, float64(locustStats.AvgTestTime)) ch <- prometheus.MustNewConstMetric(e.locustAvgOutputTokenLatency.Desc(), prometheus.GaugeValue, float64(locustStats.AvgOutputTokenLatency)) + ch <- prometheus.MustNewConstMetric(e.locustTimeToFirstToken.Desc(), prometheus.GaugeValue, float64(locustStats.AvgTimeToFirstToken)) } type locustStats struct { @@ -128,6 +138,7 @@ type locustStats struct { AvgTokensReceived float64 `json:"average-tokens-received"` AvgTestTime float64 `json:"average-test-time"` AvgOutputTokenLatency float64 `json:"average-output-token-latency"` + AvgTimeToFirstToken float64 `json:"average-time-to-first-token"` } func fetchHTTP(uri string, timeout time.Duration) func(endpoint string) (io.ReadCloser, error) { diff --git a/benchmarks/benchmark/tools/locust-load-inference/locust-docker/locust-tasks/custom_metric_aggregator.py b/benchmarks/benchmark/tools/locust-load-inference/locust-docker/locust-tasks/custom_metric_aggregator.py index c7df837c4..7a159e340 100644 --- a/benchmarks/benchmark/tools/locust-load-inference/locust-docker/locust-tasks/custom_metric_aggregator.py +++ b/benchmarks/benchmark/tools/locust-load-inference/locust-docker/locust-tasks/custom_metric_aggregator.py @@ -9,25 +9,29 @@ def __init__(self): self.test_time = [] self.success_count = 0 self.failure_count = 0 + self.time_to_first_token_list = [] - def 
add_metric(self, sent, received, test_time, request_succesful_bool): + def add_metric(self, sent, received, test_time, request_succesful_bool, ttft): if request_succesful_bool == 1: self.tokens_sent.append(sent) self.tokens_received.append(received) self.test_time.append(test_time) self.success_count += 1 + if ttft != 0: + self.time_to_first_token_list.append(ttft) else: self.failure_count += 1 - def add_metrics(self, tokens_sent, tokens_received, test_time, success_count, failure_count): + def add_metrics(self, tokens_sent, tokens_received, test_time, success_count, failure_count, ttfts): self.tokens_sent = self.tokens_sent + tokens_sent self.tokens_received = self.tokens_received + tokens_received self.test_time = self.test_time + test_time self.success_count += success_count self.failure_count += failure_count + self.time_to_first_token_list = self.time_to_first_token_list + ttfts def share_stats(self): - return self.tokens_sent, self.tokens_received, self.test_time, self.success_count, self.failure_count + return self.tokens_sent, self.tokens_received, self.test_time, self.success_count, self.failure_count, self.time_to_first_token_list def calculate_average_tokens(self): if self.tokens_sent and len(self.tokens_sent) > 0: @@ -53,6 +57,7 @@ def json_dump_report(self): "average-tokens-received": avg_received, "average-output-token-latency": avg_output_token_latency, "average-test-time": avg_test_time, + "average-time-to-first-token": sum(self.time_to_first_token_list)/max(len(self.time_to_first_token_list),1) } return json.dumps(stats) diff --git a/benchmarks/benchmark/tools/locust-load-inference/locust-docker/locust-tasks/tasks.py b/benchmarks/benchmark/tools/locust-load-inference/locust-docker/locust-tasks/tasks.py index 049a48612..4df43e57e 100644 --- a/benchmarks/benchmark/tools/locust-load-inference/locust-docker/locust-tasks/tasks.py +++ b/benchmarks/benchmark/tools/locust-load-inference/locust-docker/locust-tasks/tasks.py @@ -195,11 +195,11 @@ def handle_failed_response(request, response): send_metrics(tokens_sent, tokens_received, test_time, request_successful_bool) -def send_metrics( tokens_sent, tokens_received, test_time, request_successful_bool): +def send_metrics( tokens_sent, tokens_received, test_time, request_successful_bool, ttft=0): local_metric_collector.add_metric( - tokens_sent, tokens_received, test_time, request_successful_bool) + tokens_sent, tokens_received, test_time, request_successful_bool, ttft) logging.info( - f'sending to master: metric_update: {[tokens_sent, tokens_received, test_time, request_successful_bool]}') + f'sending to master: metric_update: {[tokens_sent, tokens_received, test_time, request_successful_bool, ttft]}') @events.test_stop.add_listener def on_test_stop(environment, **kwargs): @@ -222,12 +222,13 @@ def on_report_to_master(client_id, data): to the dict that is being sent, and then we clear the local stats in the worker, so as to avoid sending duplicate data to the master on the next run. """ - tokens_sent, tokens_recieved, test_time, success_count, failure_count = local_metric_collector.share_stats() + tokens_sent, tokens_recieved, test_time, success_count, failure_count, ttft = local_metric_collector.share_stats() data["tokens-sent"] = tokens_sent data["tokens-received"] = tokens_recieved data["test-time"] = test_time data["success-count"] = success_count data["failure-count"] = failure_count + data["time_to_first_token"] = ttft local_metric_collector.__init__ @@ -239,7 +240,7 @@ def on_worker_report(client_id, data): stats dict. 
""" local_metric_collector.add_metrics( - data["tokens-sent"], data["tokens-received"], data["test-time"], data["success-count"], data["failure-count"]) + data["tokens-sent"], data["tokens-received"], data["test-time"], data["success-count"], data["failure-count"], data["time_to_first_token"]) @events.init_command_line_parser.add_listener @@ -341,7 +342,7 @@ def grpc_infer(self): number_of_input_tokens = len(tokenizer.encode(prompt)) number_of_output_tokens = len(tokenizer.encode(output)) - send_metrics(number_of_input_tokens, number_of_output_tokens, response_time,1) + send_metrics(number_of_input_tokens, number_of_output_tokens, response_time,1, ttft) class LocustInterceptor(ClientInterceptor): diff --git a/benchmarks/benchmark/tools/locust-load-inference/locust-runner/metrics.yaml b/benchmarks/benchmark/tools/locust-load-inference/locust-runner/metrics.yaml index 8b5dffa64..383f5b045 100644 --- a/benchmarks/benchmark/tools/locust-load-inference/locust-runner/metrics.yaml +++ b/benchmarks/benchmark/tools/locust-load-inference/locust-runner/metrics.yaml @@ -29,6 +29,11 @@ 'filter': 'metric.type = "prometheus/locust_custom_metrics_avg_tokens_sent/gauge"', 'type': 'GAUGE', 'aggregation': '' + }, + 'AVG_TIME_TO_FIRST_TOKEN': { + 'filter': 'metric.type = "prometheus/locust_custom_metrics_time_to_first_token/gauge"', + 'type': 'GAUGE', + 'aggregation': '' } } } \ No newline at end of file From 9b68598567fd16427e552e4d913af0d4de48d258 Mon Sep 17 00:00:00 2001 From: Kellen Swain Date: Mon, 20 May 2024 19:08:00 +0000 Subject: [PATCH 07/12] Adding support for TPU's and Jetstream deployment --- .../tools/locust-load-inference/README.md | 1 + .../tools/locust-load-inference/main.tf | 1 + .../locust-worker-controller.yaml.tpl | 7 + .../sample-tfvars/jetstream-sample.tfvars | 29 ++++ .../tools/locust-load-inference/variables.tf | 7 + .../inference-server/jetstream/README.md | 145 ++++++++++++++++++ .../inference-server/jetstream/jetstream.yaml | 63 ++++++++ .../model-conversion/kaggle_converter.yaml | 33 ++++ .../stage-1/modules/gke-infra/cluster.tf | 1 + .../stage-1/modules/gke-infra/variables.tf | 1 + .../sample-tfvars/jetstream-sample.tfvars | 27 ++++ benchmarks/infra/stage-2/main.tf | 2 + .../infra/stage-2/modules/gke-setup/main.tf | 2 +- .../gke-setup/modules/gcs-fuse/main.tf | 4 +- .../sample-tfvars/jetstream-sample.tfvars | 27 ++++ benchmarks/infra/stage-2/variables.tf | 12 ++ 16 files changed, 359 insertions(+), 3 deletions(-) create mode 100644 benchmarks/benchmark/tools/locust-load-inference/sample-tfvars/jetstream-sample.tfvars create mode 100644 benchmarks/inference-server/jetstream/README.md create mode 100644 benchmarks/inference-server/jetstream/jetstream.yaml create mode 100644 benchmarks/inference-server/jetstream/model-conversion/kaggle_converter.yaml create mode 100644 benchmarks/infra/stage-1/sample-tfvars/jetstream-sample.tfvars create mode 100644 benchmarks/infra/stage-2/sample-tfvars/jetstream-sample.tfvars diff --git a/benchmarks/benchmark/tools/locust-load-inference/README.md b/benchmarks/benchmark/tools/locust-load-inference/README.md index cbc3f585c..4ade6d2f9 100644 --- a/benchmarks/benchmark/tools/locust-load-inference/README.md +++ b/benchmarks/benchmark/tools/locust-load-inference/README.md @@ -37,6 +37,7 @@ The Locust benchmarking tool currently supports these frameworks: - tensorrt_llm_triton - text generation inference (tgi) - vllm +- jetstream ## Instructions diff --git a/benchmarks/benchmark/tools/locust-load-inference/main.tf 
b/benchmarks/benchmark/tools/locust-load-inference/main.tf index 12690d02e..64f41c65f 100644 --- a/benchmarks/benchmark/tools/locust-load-inference/main.tf +++ b/benchmarks/benchmark/tools/locust-load-inference/main.tf @@ -47,6 +47,7 @@ locals { tokenizer = var.tokenizer use_beam_search = var.use_beam_search hugging_face_token_secret_list = local.hugging_face_token_secret == null ? [] : [local.hugging_face_token_secret] + k8s_hf_secret_list = var.k8s_hf_secret == null ? [] : [var.k8s_hf_secret] stop_timeout = var.stop_timeout })) : data] ]) diff --git a/benchmarks/benchmark/tools/locust-load-inference/manifest-templates/locust-worker-controller.yaml.tpl b/benchmarks/benchmark/tools/locust-load-inference/manifest-templates/locust-worker-controller.yaml.tpl index 047b6c2ed..0abb42c40 100644 --- a/benchmarks/benchmark/tools/locust-load-inference/manifest-templates/locust-worker-controller.yaml.tpl +++ b/benchmarks/benchmark/tools/locust-load-inference/manifest-templates/locust-worker-controller.yaml.tpl @@ -48,6 +48,13 @@ spec: - name: USE_BEAM_SEARCH value: ${use_beam_search} %{ for hugging_face_token_secret in hugging_face_token_secret_list ~} + - name: HUGGINGFACE_TOKEN + valueFrom: + secretKeyRef: + name: hf-key + key: HF_TOKEN +%{ endfor ~} +%{ for hf_token in k8s_hf_secret_list ~} - name: HUGGINGFACE_TOKEN valueFrom: secretKeyRef: diff --git a/benchmarks/benchmark/tools/locust-load-inference/sample-tfvars/jetstream-sample.tfvars b/benchmarks/benchmark/tools/locust-load-inference/sample-tfvars/jetstream-sample.tfvars new file mode 100644 index 000000000..2e04b54ac --- /dev/null +++ b/benchmarks/benchmark/tools/locust-load-inference/sample-tfvars/jetstream-sample.tfvars @@ -0,0 +1,29 @@ +credentials_config = { + fleet_host = "https://connectgateway.googleapis.com/v1/projects/PROJECT_NUMBER/locations/global/gkeMemberships/ai-tpu-benchmark" +} + +project_id = "PROJECT_ID" + +namespace = "default" +ksa = "benchmark-sa" +request_type = "grpc" + +k8s_hf_secret = "hf-token" + + +# Locust service configuration +artifact_registry = "REGISTRY_LOCATION" +inference_server_service = "jetstream-http-svc:9000" # 104.196.118.117:9000 +locust_runner_kubernetes_service_account = "sample-runner-sa" +output_bucket = "${PROJECT_ID}-jetstream-benchmark-output-bucket-01" +gcs_path = "PATH_TO_PROMPT_BUCKET" + +# Benchmark configuration for Locust Docker accessing inference server +inference_server_framework = "jetstream" +tokenizer = "google/gemma-7b" + +# Benchmark configuration for triggering single test via Locust Runner +test_duration = 60 +# Increase test_users to allow more parallelism (especially when testing HPA) +test_users = 1 +test_rate = 5 diff --git a/benchmarks/benchmark/tools/locust-load-inference/variables.tf b/benchmarks/benchmark/tools/locust-load-inference/variables.tf index 4b1155d1d..184d6fbed 100644 --- a/benchmarks/benchmark/tools/locust-load-inference/variables.tf +++ b/benchmarks/benchmark/tools/locust-load-inference/variables.tf @@ -211,6 +211,13 @@ variable "hugging_face_secret_version" { default = null } +variable "k8s_hf_secret" { + description = "Name of secret in k8s for huggingface token" + type = string + nullable = true + default = null +} + variable "request_type" { description = "The method of request used when calling the model server (http or grpc)" type = string diff --git a/benchmarks/inference-server/jetstream/README.md b/benchmarks/inference-server/jetstream/README.md new file mode 100644 index 000000000..8d76ea470 --- /dev/null +++ 
b/benchmarks/inference-server/jetstream/README.md @@ -0,0 +1,145 @@ +# AI on GKE Benchmarking for JetStream + +Deploying and benchmarking JetStream on TPU has many similarities with the standard GPU path. But distinct enough differences to warrant a separate readme. If you are familiar with deploying on GPU, much of this should be familiar. For a more detailed understanding of each step. Refer to our primary benchmarking (README)[https://github.com/GoogleCloudPlatform/ai-on-gke/tree/main/benchmarks] + +## Pre-requisites +- kaggle user/token +- huggingface user/token +- gcs bucket with test-prompts + +### Creating K8s infra + +To create our TPU cluster, run: + +``` +# Stage 1 creates the cluster. +cs infra/stage-1 + +# Copy the sample variables and update the project ID, cluster name and other +parameters as needed in the `terraform.tfvars` file. +cp sample-tfvars/jetstream-sample.tfvars terraform.tfvars + +# Initialize the Terraform modules. +terraform init + +# Run plan to see the changes that will be made. +terraform plan + +# Run apply if the changes look good by confirming the prompt. +terraform apply +``` +To verify that the cluster has been set up correctly, run +``` +# Get credentials using fleet membership +gcloud container fleet memberships get-credentials + +# Run a kubectl command to verify +kubectl get nodes +``` + +## Configure the cluster + +To configure the cluster to run inference workloads we need to set up workload identity and GCS Fuse. +``` +# Stage 2 configures the cluster for running inference workloads. +cd infra/stage-2 + +# Copy the sample variables and update the project number and cluster name in +# the fleet_host variable "https://connectgateway.googleapis.com/v1/projects//locations/global/gkeMemberships/" +# and the project name and bucket name parameters as needed in the +# `terraform.tfvars` file. You can specify a new bucket name in which case it +# will be created. +cp sample-tfvars/jetstream-sample.tfvars terraform.tfvars + +# Initialize the Terraform modules. +terraform init + +# Run plan to see the changes that will be made. +terraform plan + +# Run apply if the changes look good by confirming the prompt. +terraform apply +``` + +### Convert Gemma model weights to maxtext weights + +Jetstream currently requires that models be converted to MaxText weights. This example will deploy a Gemma-7b model. Much of this information is similar to this guide (here)[https://cloud.google.com/kubernetes-engine/docs/tutorials/serve-gemma-tpu-jetstream#convert-checkpoints]. + +*SKIP IF ALREADY COMPLETED* + +Create kaggle secret +``` +kubectl create secret generic kaggle-secret \ + --from-file=kaggle.json +``` + +Replace `model-conversion/kaggle_converter.yaml: GEMMA_BUCKET_NAME` with the correct bucket name where you would like the model to be stored. +***NOTE: If you are using a different bucket that the ones you created give the service account Storage Admin permissions on that bucket. + +Run: +``` +kubectl apply -f model-conversion/kaggle_converter.yaml +``` + +This should take ~10 minutes to complete. + +### Deploy JetStream + +Replace the `jetstream.yaml:GEMMA_BUCKET_NAME` with the same bucket name as above. 
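One way to do the substitution (an illustrative example; `my-model-bucket` is a placeholder for your bucket name, and `-i` assumes GNU sed):

```sh
sed -i "s/GEMMA_BUCKET_NAME/my-model-bucket/g" jetstream.yaml
```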
+ +Run: +``` +kubectl apply -f jetstream.yaml +``` + +Verify the pod is running with +``` +kubectl get pods +``` + +Get the external IP with: + +``` +kubectl get services +``` + +And you can make a request prompt with: +``` +curl --request POST \ +--header "Content-type: application/json" \ +-s \ +JETSTREAM_EXTERNAL_IP:8000/generate \ +--data \ +'{ + "prompt": "What is a TPU?", + "max_tokens": 200 +}' +``` + +### Deploy the benchmark + +To prepare the dataset for the Locust inference benchmark, view the README.md file in: +``` +cd benchmark/dataset/ShareGPT_v3_unflitered_cleaned_split +``` + +To deploy the Locust inference benchmark with the above model, run +``` +cd benchmark/tools/locust-load-inference + +# Copy the sample variables and update the project number and cluster name in +# the fleet_host variable "https://connectgateway.googleapis.com/v1/projects//locations/global/gkeMemberships/" +# in the `terraform.tfvars` file. +cp sample-tfvars/jetstream-sample.tfvars terraform.tfvars + +# Initialize the Terraform modules. +terraform init + +# Run plan to see the changes that will be made. +terraform plan + +# Run apply if the changes look good by confirming the prompt. +terraform apply +``` + +To further interact with the Locust inference benchmark, view the README.md file in `benchmark/tools/locust-load-inference` diff --git a/benchmarks/inference-server/jetstream/jetstream.yaml b/benchmarks/inference-server/jetstream/jetstream.yaml new file mode 100644 index 000000000..7086d4329 --- /dev/null +++ b/benchmarks/inference-server/jetstream/jetstream.yaml @@ -0,0 +1,63 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: maxengine-server +spec: + replicas: 1 + selector: + matchLabels: + app: maxengine-server + template: + metadata: + labels: + app: maxengine-server + spec: + serviceAccountName: benchmark-sa + nodeSelector: + cloud.google.com/gke-tpu-topology: 2x2 + cloud.google.com/gke-tpu-accelerator: tpu-v5-lite-podslice + containers: + - name: maxengine-server + image: us-docker.pkg.dev/cloud-tpu-images/inference/maxengine-server:v0.2.0 + args: + - model_name=gemma-7b + - tokenizer_path=assets/tokenizer.gemma + - per_device_batch_size=4 + - max_prefill_predict_length=1024 + - max_target_length=2048 + - async_checkpointing=false + - ici_fsdp_parallelism=1 + - ici_autoregressive_parallelism=-1 + - ici_tensor_parallelism=1 + - scan_layers=false + - weight_dtype=bfloat16 + - load_parameters_path=gs://GEMMA_BUCKET_NAME/final/unscanned/gemma_7b-it/0/checkpoints/0/items + ports: + - containerPort: 9000 + resources: + requests: + google.com/tpu: 4 + limits: + google.com/tpu: 4 + - name: jetstream-http + image: us-docker.pkg.dev/cloud-tpu-images/inference/jetstream-http:v0.2.0 + ports: + - containerPort: 8000 +--- +apiVersion: v1 +kind: Service +metadata: + name: jetstream-http-svc +spec: + selector: + app: maxengine-server + ports: + - protocol: TCP + name: http + port: 8000 + targetPort: 8000 + - protocol: TCP + name: grpc + port: 9000 + targetPort: 9000 + type: LoadBalancer \ No newline at end of file diff --git a/benchmarks/inference-server/jetstream/model-conversion/kaggle_converter.yaml b/benchmarks/inference-server/jetstream/model-conversion/kaggle_converter.yaml new file mode 100644 index 000000000..2d0ec2d23 --- /dev/null +++ b/benchmarks/inference-server/jetstream/model-conversion/kaggle_converter.yaml @@ -0,0 +1,33 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: data-loader-7b +spec: + ttlSecondsAfterFinished: 30 + template: + spec: + serviceAccountName: benchmark-sa + 
restartPolicy: Never + containers: + - name: inference-checkpoint + image: us-docker.pkg.dev/cloud-tpu-images/inference/inference-checkpoint:v0.2.0 + args: + - -b=GEMMA_BUCKET_NAME + - -m=google/gemma/maxtext/7b-it/2 + volumeMounts: + - mountPath: "/kaggle/" + name: kaggle-credentials + readOnly: true + resources: + requests: + google.com/tpu: 4 + limits: + google.com/tpu: 4 + nodeSelector: + cloud.google.com/gke-tpu-topology: 2x2 + cloud.google.com/gke-tpu-accelerator: tpu-v5-lite-podslice + volumes: + - name: kaggle-credentials + secret: + defaultMode: 0400 + secretName: kaggle-secret \ No newline at end of file diff --git a/benchmarks/infra/stage-1/modules/gke-infra/cluster.tf b/benchmarks/infra/stage-1/modules/gke-infra/cluster.tf index dc7066193..40f89252c 100644 --- a/benchmarks/infra/stage-1/modules/gke-infra/cluster.tf +++ b/benchmarks/infra/stage-1/modules/gke-infra/cluster.tf @@ -189,6 +189,7 @@ module "cluster-nodepool" { node_config = { machine_type = each.value.machine_type + spot = each.value.spot shielded_instance_config = { enable_integrity_monitoring = true enable_secure_boot = true diff --git a/benchmarks/infra/stage-1/modules/gke-infra/variables.tf b/benchmarks/infra/stage-1/modules/gke-infra/variables.tf index 97169076d..bcc31640a 100644 --- a/benchmarks/infra/stage-1/modules/gke-infra/variables.tf +++ b/benchmarks/infra/stage-1/modules/gke-infra/variables.tf @@ -135,6 +135,7 @@ variable "nodepools" { gke_version = optional(string), max_node_count = optional(number, 10), min_node_count = optional(number, 1), + spot = optional(bool, false) guest_accelerator = optional(object({ type = optional(string), diff --git a/benchmarks/infra/stage-1/sample-tfvars/jetstream-sample.tfvars b/benchmarks/infra/stage-1/sample-tfvars/jetstream-sample.tfvars new file mode 100644 index 000000000..2118910f7 --- /dev/null +++ b/benchmarks/infra/stage-1/sample-tfvars/jetstream-sample.tfvars @@ -0,0 +1,27 @@ +project_id = "PROJECT_ID" +cluster_name = "ai-tpu-benchmark" +region = "us-east1" +gke_location = "us-east1-c" +prefix = "ai-tpu-benchmark" +spot_vms = true + +vpc_create = { + name = "ai-benchmark" + enable_cloud_nat = true +} + +cluster_options = { + enable_gcs_fuse_csi_driver = false + enable_gcp_filestore_csi_driver = false + enable_gce_persistent_disk_csi_driver = false +} + +nodepools = { + nodepool-tpu = { + machine_type = "ct5lp-hightpu-4t", + spot = true, + }, + nodepool-cpu = { + machine_type = "n2-standard-2", + }, +} diff --git a/benchmarks/infra/stage-2/main.tf b/benchmarks/infra/stage-2/main.tf index 6d53463f0..28c073fb1 100644 --- a/benchmarks/infra/stage-2/main.tf +++ b/benchmarks/infra/stage-2/main.tf @@ -32,4 +32,6 @@ module "gke-setup" { secret_create = var.secret_name == null ? 
false : true secret_name = var.secret_name secret_location = var.secret_location + nvidia_dcgm_create = var.nvidia_dcgm_create + gcs_fuse_create = var.gcs_fuse_create } diff --git a/benchmarks/infra/stage-2/modules/gke-setup/main.tf b/benchmarks/infra/stage-2/modules/gke-setup/main.tf index 28b1a0b8a..98fffb1df 100644 --- a/benchmarks/infra/stage-2/modules/gke-setup/main.tf +++ b/benchmarks/infra/stage-2/modules/gke-setup/main.tf @@ -32,7 +32,7 @@ module "gcs-fuse" { project_id = var.project_id bucket_name = var.bucket_name bucket_location = var.bucket_location - google_service_account = var.google_service_account + google_service_account = module.workload-identity.0.created_resources.gsa_email depends_on = [module.workload-identity] } diff --git a/benchmarks/infra/stage-2/modules/gke-setup/modules/gcs-fuse/main.tf b/benchmarks/infra/stage-2/modules/gke-setup/modules/gcs-fuse/main.tf index 37d8079df..405e7c3cf 100644 --- a/benchmarks/infra/stage-2/modules/gke-setup/modules/gcs-fuse/main.tf +++ b/benchmarks/infra/stage-2/modules/gke-setup/modules/gcs-fuse/main.tf @@ -42,6 +42,6 @@ module "gcs-fuse-bucket" { resource "google_storage_bucket_iam_member" "bucket-iam" { bucket = local.bucket_name - role = "roles/storage.objectAdmin" - member = data.google_service_account.gsa.member + role = "roles/storage.admin" + member = "serviceAccount:${var.google_service_account}" } diff --git a/benchmarks/infra/stage-2/sample-tfvars/jetstream-sample.tfvars b/benchmarks/infra/stage-2/sample-tfvars/jetstream-sample.tfvars new file mode 100644 index 000000000..7e3d0ac54 --- /dev/null +++ b/benchmarks/infra/stage-2/sample-tfvars/jetstream-sample.tfvars @@ -0,0 +1,27 @@ +# can be obtained from stage-1 by running: +# terraform output -json | jq '."fleet_host".value' +credentials_config = { + fleet_host = "https://connectgateway.googleapis.com/v1/projects/$PROJECT_NUMBER/locations/global/gkeMemberships/ai-tpu-benchmark" +} + +# can be obtained from stage-1 by running: +# terraform output -json | jq '."project_id".value' +project_id = "PROJECT_ID" + +bucket_name = "${PROJECT_ID}-tpu-model-repo-bucket-01" +bucket_location = "US" + +output_bucket_name = "${PROJECT_ID}-jetstream-benchmark-output-bucket-01" +output_bucket_location = "US" + +google_service_account = "benchmark-sa-01" +kubernetes_service_account = "benchmark-sa" + +benchmark_runner_google_service_account = "sample-runner-sa-01" +benchmark_runner_kubernetes_service_account = "sample-runner-sa" + +nvidia_dcgm_create = "false" +namespace = "default" +namespace_create = false +gcs_fuse_create = true + diff --git a/benchmarks/infra/stage-2/variables.tf b/benchmarks/infra/stage-2/variables.tf index f6085a639..9e388b3a5 100644 --- a/benchmarks/infra/stage-2/variables.tf +++ b/benchmarks/infra/stage-2/variables.tf @@ -132,3 +132,15 @@ variable "secret_location" { default = null nullable = true } + +variable "nvidia_dcgm_create" { + description = "Should create nvidia dcgm resources or not" + type = bool + default = true +} + +variable "gcs_fuse_create" { + description = "Give the SA object admin privledges" + type = bool + default = false +} \ No newline at end of file From ff4f4cbffca48492701af7d95a9ca2a0aa5ef747 Mon Sep 17 00:00:00 2001 From: Kellen Swain Date: Mon, 20 May 2024 22:35:12 +0000 Subject: [PATCH 08/12] PR feedback --- .../tools/locust-load-inference/README.md | 3 ++- .../sample-tfvars/jetstream-sample.tfvars | 4 ++-- .../tools/locust-load-inference/variables.tf | 13 +++++++------ benchmarks/inference-server/jetstream/README.md | 16 
++++++++++------ .../inference-server/jetstream/jetstream.yaml | 2 +- benchmarks/infra/stage-1/sample-terraform.tfvars | 1 + benchmarks/infra/stage-2/README.md | 2 ++ .../sample-tfvars/jetstream-sample.tfvars | 2 +- benchmarks/infra/stage-2/variables.tf | 4 ++-- 9 files changed, 28 insertions(+), 19 deletions(-) diff --git a/benchmarks/benchmark/tools/locust-load-inference/README.md b/benchmarks/benchmark/tools/locust-load-inference/README.md index 4ade6d2f9..5a57e60db 100644 --- a/benchmarks/benchmark/tools/locust-load-inference/README.md +++ b/benchmarks/benchmark/tools/locust-load-inference/README.md @@ -266,5 +266,6 @@ To change the benchmark configuration, you will have to rerun terraform destroy | [sax\_model](#input\_sax\_model) | Benchmark server configuration for sax model. Only required if framework is sax. | `string` | `""` | no | | [tokenizer](#input\_tokenizer) | Benchmark server configuration for tokenizer. | `string` | `"tiiuae/falcon-7b"` | yes | | [use\_beam\_search](#input\_use\_beam\_search) | Benchmark server configuration for use beam search. | `bool` | `false` | no | - [huggingface_secret](#input\_huggingface_secret) | Name of the kubectl huggingface secret token | `string` | `huggingface-secret` | no | + [huggingface_secret](#input\_huggingface_secret) | Name of the secret holding the huggingface token. Stored in GCP Secrets Manager. | `string` | `huggingface-secret` | no | + [k8s_hf_secret](#input\_huggingface_secret) | Name of the secret holding the huggingface token. Stored in K8s. Key is expected to be named: `HF_TOKEN`. See [here](https://kubernetes.io/docs/tasks/configmap-secret/managing-secret-using-kubectl/#use-raw-data) for more. | `string` | `huggingface-secret` | no | diff --git a/benchmarks/benchmark/tools/locust-load-inference/sample-tfvars/jetstream-sample.tfvars b/benchmarks/benchmark/tools/locust-load-inference/sample-tfvars/jetstream-sample.tfvars index 2e04b54ac..7033e11f5 100644 --- a/benchmarks/benchmark/tools/locust-load-inference/sample-tfvars/jetstream-sample.tfvars +++ b/benchmarks/benchmark/tools/locust-load-inference/sample-tfvars/jetstream-sample.tfvars @@ -13,9 +13,9 @@ k8s_hf_secret = "hf-token" # Locust service configuration artifact_registry = "REGISTRY_LOCATION" -inference_server_service = "jetstream-http-svc:9000" # 104.196.118.117:9000 +inference_server_service = "jetstream-svc:9000" locust_runner_kubernetes_service_account = "sample-runner-sa" -output_bucket = "${PROJECT_ID}-jetstream-benchmark-output-bucket-01" +output_bucket = "${PROJECT_ID}-tpu-benchmark-output-bucket-01" gcs_path = "PATH_TO_PROMPT_BUCKET" # Benchmark configuration for Locust Docker accessing inference server diff --git a/benchmarks/benchmark/tools/locust-load-inference/variables.tf b/benchmarks/benchmark/tools/locust-load-inference/variables.tf index 184d6fbed..4b3d3a030 100644 --- a/benchmarks/benchmark/tools/locust-load-inference/variables.tf +++ b/benchmarks/benchmark/tools/locust-load-inference/variables.tf @@ -197,22 +197,23 @@ variable "run_test_automatically" { default = false } -variable "hugging_face_secret" { - description = "name of the kubectl huggingface secret token" +// TODO: add validation to make k8s_hf_secret & hugging_face_secret mutually exclusive once terraform is updated with: https://discuss.hashicorp.com/t/experiment-feedback-input-variable-validation-can-cross-reference-other-objects/66644 +variable "k8s_hf_secret" { + description = "Name of secret for huggingface token; stored in k8s " type = string nullable = true default = null } 
-variable "hugging_face_secret_version" { - description = "Secret version in Secret Manager" +variable "hugging_face_secret" { + description = "name of the kubectl huggingface secret token; stored in Secret Manager. Security considerations: https://kubernetes.io/docs/concepts/security/secrets-good-practices/" type = string nullable = true default = null } -variable "k8s_hf_secret" { - description = "Name of secret in k8s for huggingface token" +variable "hugging_face_secret_version" { + description = "Secret version in Secret Manager" type = string nullable = true default = null diff --git a/benchmarks/inference-server/jetstream/README.md b/benchmarks/inference-server/jetstream/README.md index 8d76ea470..57b22ff29 100644 --- a/benchmarks/inference-server/jetstream/README.md +++ b/benchmarks/inference-server/jetstream/README.md @@ -1,11 +1,10 @@ # AI on GKE Benchmarking for JetStream -Deploying and benchmarking JetStream on TPU has many similarities with the standard GPU path. But distinct enough differences to warrant a separate readme. If you are familiar with deploying on GPU, much of this should be familiar. For a more detailed understanding of each step. Refer to our primary benchmarking (README)[https://github.com/GoogleCloudPlatform/ai-on-gke/tree/main/benchmarks] +Deploying and benchmarking JetStream on TPU has many similarities with the standard GPU path. But distinct enough differences to warrant a separate readme. If you are familiar with deploying on GPU, much of this should be familiar. For a more detailed understanding of each step. Refer to our primary benchmarking [README](https://github.com/GoogleCloudPlatform/ai-on-gke/tree/main/benchmarks) ## Pre-requisites -- kaggle user/token -- huggingface user/token -- gcs bucket with test-prompts +- [kaggle user/token](https://www.kaggle.com/docs/api) +- [huggingface user/token](https://huggingface.co/docs/hub/en/security-tokens) ### Creating K8s infra @@ -15,7 +14,7 @@ To create our TPU cluster, run: # Stage 1 creates the cluster. cs infra/stage-1 -# Copy the sample variables and update the project ID, cluster name and other +# Copy the sample variables and update the project ID, cluster name and other parameters as needed in the `terraform.tfvars` file. cp sample-tfvars/jetstream-sample.tfvars terraform.tfvars @@ -74,7 +73,12 @@ kubectl create secret generic kaggle-secret \ ``` Replace `model-conversion/kaggle_converter.yaml: GEMMA_BUCKET_NAME` with the correct bucket name where you would like the model to be stored. -***NOTE: If you are using a different bucket that the ones you created give the service account Storage Admin permissions on that bucket. +***NOTE: If you are using a different bucket that the ones you created give the service account Storage Admin permissions on that bucket. 
This can be done on the UI or by running: +``` +gcloud projects add-iam-policy-binding PROJECT_ID \ + --member "serviceAccount:SA_NAME@PROJECT_ID.iam.gserviceaccount.com" \ + --role roles/storage.admin +``` Run: ``` diff --git a/benchmarks/inference-server/jetstream/jetstream.yaml b/benchmarks/inference-server/jetstream/jetstream.yaml index 7086d4329..b8bb42f98 100644 --- a/benchmarks/inference-server/jetstream/jetstream.yaml +++ b/benchmarks/inference-server/jetstream/jetstream.yaml @@ -47,7 +47,7 @@ spec: apiVersion: v1 kind: Service metadata: - name: jetstream-http-svc + name: jetstream-svc spec: selector: app: maxengine-server diff --git a/benchmarks/infra/stage-1/sample-terraform.tfvars b/benchmarks/infra/stage-1/sample-terraform.tfvars index 1d5a4d045..9079c73f9 100644 --- a/benchmarks/infra/stage-1/sample-terraform.tfvars +++ b/benchmarks/infra/stage-1/sample-terraform.tfvars @@ -1,4 +1,5 @@ project_id = "change-me" +// TODO: change all instances of clusterName to be ai-gpu-benchmark. cluster_name = "ai-benchmark" region = "us-central1" gke_location = "us-central1-a" diff --git a/benchmarks/infra/stage-2/README.md b/benchmarks/infra/stage-2/README.md index edd3ec61c..b6e594d8f 100644 --- a/benchmarks/infra/stage-2/README.md +++ b/benchmarks/infra/stage-2/README.md @@ -104,6 +104,8 @@ kubectl get nodes | [secret_location](variables.tf#L105) | Location of secret | string | | null | | [secret_name](variables.tf#L98) | Secret name | string | | null | | [workload_identity_create](variables.tf#L54) | Setup Workload Identity configuration for newly created GKE cluster. Set to false to skip. | bool | | true | +| [nvidia_dcgm_create](variables.tf#L136) | Determines if DCGM resources should be added to the cluster. Used in capturing GPU metrics. | bool | | true | +| [gcs_fuse_create](variables.tf#L136) | Gives the model server service account Storage Admin access to the model store bucket | bool | | true | ## Outputs diff --git a/benchmarks/infra/stage-2/sample-tfvars/jetstream-sample.tfvars b/benchmarks/infra/stage-2/sample-tfvars/jetstream-sample.tfvars index 7e3d0ac54..c93bcfdfe 100644 --- a/benchmarks/infra/stage-2/sample-tfvars/jetstream-sample.tfvars +++ b/benchmarks/infra/stage-2/sample-tfvars/jetstream-sample.tfvars @@ -11,7 +11,7 @@ project_id = "PROJECT_ID" bucket_name = "${PROJECT_ID}-tpu-model-repo-bucket-01" bucket_location = "US" -output_bucket_name = "${PROJECT_ID}-jetstream-benchmark-output-bucket-01" +output_bucket_name = "${PROJECT_ID}-tpu-benchmark-output-bucket-01" output_bucket_location = "US" google_service_account = "benchmark-sa-01" diff --git a/benchmarks/infra/stage-2/variables.tf b/benchmarks/infra/stage-2/variables.tf index 9e388b3a5..162575f36 100644 --- a/benchmarks/infra/stage-2/variables.tf +++ b/benchmarks/infra/stage-2/variables.tf @@ -134,13 +134,13 @@ variable "secret_location" { } variable "nvidia_dcgm_create" { - description = "Should create nvidia dcgm resources or not" + description = "Should create nvidia dcgm resources or not; for use on GPU VMs" type = bool default = true } variable "gcs_fuse_create" { - description = "Give the SA object admin privledges" + description = "Give the SA object admin privileges" type = bool default = false } \ No newline at end of file From adc2fdeb948e44f79af8e008ce296c3c25a600b6 Mon Sep 17 00:00:00 2001 From: Kellen Swain Date: Mon, 20 May 2024 22:36:13 +0000 Subject: [PATCH 09/12] typo fixes --- benchmarks/inference-server/jetstream/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/benchmarks/inference-server/jetstream/README.md b/benchmarks/inference-server/jetstream/README.md index 57b22ff29..28b414289 100644 --- a/benchmarks/inference-server/jetstream/README.md +++ b/benchmarks/inference-server/jetstream/README.md @@ -12,7 +12,7 @@ To create our TPU cluster, run: ``` # Stage 1 creates the cluster. -cs infra/stage-1 +cd infra/stage-1 # Copy the sample variables and update the project ID, cluster name and other parameters as needed in the `terraform.tfvars` file. From b9b240fb7a5957e77b9a51986ccc85586b5a98f6 Mon Sep 17 00:00:00 2001 From: Kellen Swain Date: Tue, 21 May 2024 17:08:42 +0000 Subject: [PATCH 10/12] more feedback updates --- .../sample-tfvars/jetstream-sample.tfvars | 4 ++-- benchmarks/inference-server/jetstream/README.md | 4 +++- .../infra/stage-1/sample-tfvars/jetstream-sample.tfvars | 4 ++-- .../infra/stage-2/sample-tfvars/jetstream-sample.tfvars | 6 +++--- 4 files changed, 10 insertions(+), 8 deletions(-) diff --git a/benchmarks/benchmark/tools/locust-load-inference/sample-tfvars/jetstream-sample.tfvars b/benchmarks/benchmark/tools/locust-load-inference/sample-tfvars/jetstream-sample.tfvars index 7033e11f5..c210ad42d 100644 --- a/benchmarks/benchmark/tools/locust-load-inference/sample-tfvars/jetstream-sample.tfvars +++ b/benchmarks/benchmark/tools/locust-load-inference/sample-tfvars/jetstream-sample.tfvars @@ -1,5 +1,5 @@ credentials_config = { - fleet_host = "https://connectgateway.googleapis.com/v1/projects/PROJECT_NUMBER/locations/global/gkeMemberships/ai-tpu-benchmark" + fleet_host = "https://connectgateway.googleapis.com/v1/projects/PROJECT_NUMBER/locations/global/gkeMemberships/ai-benchmark" } project_id = "PROJECT_ID" @@ -15,7 +15,7 @@ k8s_hf_secret = "hf-token" artifact_registry = "REGISTRY_LOCATION" inference_server_service = "jetstream-svc:9000" locust_runner_kubernetes_service_account = "sample-runner-sa" -output_bucket = "${PROJECT_ID}-tpu-benchmark-output-bucket-01" +output_bucket = "${PROJECT_ID}-benchmark-output-bucket-01" gcs_path = "PATH_TO_PROMPT_BUCKET" # Benchmark configuration for Locust Docker accessing inference server diff --git a/benchmarks/inference-server/jetstream/README.md b/benchmarks/inference-server/jetstream/README.md index 28b414289..032febb62 100644 --- a/benchmarks/inference-server/jetstream/README.md +++ b/benchmarks/inference-server/jetstream/README.md @@ -62,7 +62,9 @@ terraform apply ### Convert Gemma model weights to maxtext weights -Jetstream currently requires that models be converted to MaxText weights. This example will deploy a Gemma-7b model. Much of this information is similar to this guide (here)[https://cloud.google.com/kubernetes-engine/docs/tutorials/serve-gemma-tpu-jetstream#convert-checkpoints]. +JetStream has [two engine implementations](https://github.com/google/JetStream?tab=readme-ov-file#jetstream-engine-implementation): a Jax variant (via MaxText) and a Pytorch variant. This guide uses the Jax backend. + +JetStream currently requires that models be converted to MaxText weights. This example will deploy a Gemma-7b model. Much of this information is similar to the guide [here](https://cloud.google.com/kubernetes-engine/docs/tutorials/serve-gemma-tpu-jetstream#convert-checkpoints). 
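For orientation, the checkpoint-conversion flow amounts to roughly the following (a minimal sketch, assuming the `model-conversion/kaggle_converter.yaml` Job manifest referenced earlier; the Job name shown is illustrative — take the real one from the manifest):

```bash
# Launch the Kaggle-to-MaxText checkpoint conversion Job (manifest path as referenced above)
kubectl apply -f model-conversion/kaggle_converter.yaml

# Watch the Job and follow its logs until the converted checkpoint is written to the GCS bucket
kubectl get jobs
kubectl logs -f job/CONVERTER_JOB_NAME   # illustrative name; use the Job name defined in the manifest
```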
*SKIP IF ALREADY COMPLETED* diff --git a/benchmarks/infra/stage-1/sample-tfvars/jetstream-sample.tfvars b/benchmarks/infra/stage-1/sample-tfvars/jetstream-sample.tfvars index 2118910f7..31ee47ffb 100644 --- a/benchmarks/infra/stage-1/sample-tfvars/jetstream-sample.tfvars +++ b/benchmarks/infra/stage-1/sample-tfvars/jetstream-sample.tfvars @@ -1,8 +1,8 @@ project_id = "PROJECT_ID" -cluster_name = "ai-tpu-benchmark" +cluster_name = "ai-benchmark" region = "us-east1" gke_location = "us-east1-c" -prefix = "ai-tpu-benchmark" +prefix = "ai-benchmark" spot_vms = true vpc_create = { diff --git a/benchmarks/infra/stage-2/sample-tfvars/jetstream-sample.tfvars b/benchmarks/infra/stage-2/sample-tfvars/jetstream-sample.tfvars index c93bcfdfe..ea4ad63fd 100644 --- a/benchmarks/infra/stage-2/sample-tfvars/jetstream-sample.tfvars +++ b/benchmarks/infra/stage-2/sample-tfvars/jetstream-sample.tfvars @@ -1,17 +1,17 @@ # can be obtained from stage-1 by running: # terraform output -json | jq '."fleet_host".value' credentials_config = { - fleet_host = "https://connectgateway.googleapis.com/v1/projects/$PROJECT_NUMBER/locations/global/gkeMemberships/ai-tpu-benchmark" + fleet_host = "https://connectgateway.googleapis.com/v1/projects/$PROJECT_NUMBER/locations/global/gkeMemberships/ai-benchmark" } # can be obtained from stage-1 by running: # terraform output -json | jq '."project_id".value' project_id = "PROJECT_ID" -bucket_name = "${PROJECT_ID}-tpu-model-repo-bucket-01" +bucket_name = "${PROJECT_ID}-model-repo-bucket-01" bucket_location = "US" -output_bucket_name = "${PROJECT_ID}-tpu-benchmark-output-bucket-01" +output_bucket_name = "${PROJECT_ID}-benchmark-output-bucket-01" output_bucket_location = "US" google_service_account = "benchmark-sa-01" From e2712faf1f85afb11d1f71ac654c454485eccef8 Mon Sep 17 00:00:00 2001 From: Kellen Swain Date: Tue, 21 May 2024 17:09:55 +0000 Subject: [PATCH 11/12] terraform formatting changes --- .../sample-tfvars/jetstream-sample.tfvars | 4 ++-- benchmarks/infra/stage-1/modules/gke-infra/cluster.tf | 2 +- benchmarks/infra/stage-1/modules/gke-infra/variables.tf | 2 +- benchmarks/infra/stage-1/sample-terraform.tfvars | 2 +- .../infra/stage-1/sample-tfvars/jetstream-sample.tfvars | 2 +- .../infra/stage-2/sample-tfvars/jetstream-sample.tfvars | 8 ++++---- benchmarks/infra/stage-2/variables.tf | 8 ++++---- 7 files changed, 14 insertions(+), 14 deletions(-) diff --git a/benchmarks/benchmark/tools/locust-load-inference/sample-tfvars/jetstream-sample.tfvars b/benchmarks/benchmark/tools/locust-load-inference/sample-tfvars/jetstream-sample.tfvars index c210ad42d..d5b3c0dce 100644 --- a/benchmarks/benchmark/tools/locust-load-inference/sample-tfvars/jetstream-sample.tfvars +++ b/benchmarks/benchmark/tools/locust-load-inference/sample-tfvars/jetstream-sample.tfvars @@ -4,8 +4,8 @@ credentials_config = { project_id = "PROJECT_ID" -namespace = "default" -ksa = "benchmark-sa" +namespace = "default" +ksa = "benchmark-sa" request_type = "grpc" k8s_hf_secret = "hf-token" diff --git a/benchmarks/infra/stage-1/modules/gke-infra/cluster.tf b/benchmarks/infra/stage-1/modules/gke-infra/cluster.tf index b891591ee..69bb6b33e 100644 --- a/benchmarks/infra/stage-1/modules/gke-infra/cluster.tf +++ b/benchmarks/infra/stage-1/modules/gke-infra/cluster.tf @@ -187,7 +187,7 @@ module "cluster-nodepool" { node_config = { machine_type = each.value.machine_type - spot = each.value.spot + spot = each.value.spot shielded_instance_config = { enable_integrity_monitoring = true enable_secure_boot = true diff --git 
a/benchmarks/infra/stage-1/modules/gke-infra/variables.tf b/benchmarks/infra/stage-1/modules/gke-infra/variables.tf index 756f03784..c45bff1fe 100644 --- a/benchmarks/infra/stage-1/modules/gke-infra/variables.tf +++ b/benchmarks/infra/stage-1/modules/gke-infra/variables.tf @@ -143,7 +143,7 @@ variable "nodepools" { gke_version = optional(string), max_node_count = optional(number, 10), min_node_count = optional(number, 1), - spot = optional(bool, false) + spot = optional(bool, false) guest_accelerator = optional(object({ type = optional(string), diff --git a/benchmarks/infra/stage-1/sample-terraform.tfvars b/benchmarks/infra/stage-1/sample-terraform.tfvars index 9079c73f9..2557f9c6f 100644 --- a/benchmarks/infra/stage-1/sample-terraform.tfvars +++ b/benchmarks/infra/stage-1/sample-terraform.tfvars @@ -1,4 +1,4 @@ -project_id = "change-me" +project_id = "change-me" // TODO: change all instances of clusterName to be ai-gpu-benchmark. cluster_name = "ai-benchmark" region = "us-central1" diff --git a/benchmarks/infra/stage-1/sample-tfvars/jetstream-sample.tfvars b/benchmarks/infra/stage-1/sample-tfvars/jetstream-sample.tfvars index 31ee47ffb..28dd61827 100644 --- a/benchmarks/infra/stage-1/sample-tfvars/jetstream-sample.tfvars +++ b/benchmarks/infra/stage-1/sample-tfvars/jetstream-sample.tfvars @@ -19,7 +19,7 @@ cluster_options = { nodepools = { nodepool-tpu = { machine_type = "ct5lp-hightpu-4t", - spot = true, + spot = true, }, nodepool-cpu = { machine_type = "n2-standard-2", diff --git a/benchmarks/infra/stage-2/sample-tfvars/jetstream-sample.tfvars b/benchmarks/infra/stage-2/sample-tfvars/jetstream-sample.tfvars index ea4ad63fd..c9c884f2c 100644 --- a/benchmarks/infra/stage-2/sample-tfvars/jetstream-sample.tfvars +++ b/benchmarks/infra/stage-2/sample-tfvars/jetstream-sample.tfvars @@ -8,7 +8,7 @@ credentials_config = { # terraform output -json | jq '."project_id".value' project_id = "PROJECT_ID" -bucket_name = "${PROJECT_ID}-model-repo-bucket-01" +bucket_name = "${PROJECT_ID}-model-repo-bucket-01" bucket_location = "US" output_bucket_name = "${PROJECT_ID}-benchmark-output-bucket-01" @@ -21,7 +21,7 @@ benchmark_runner_google_service_account = "sample-runner-sa-01" benchmark_runner_kubernetes_service_account = "sample-runner-sa" nvidia_dcgm_create = "false" -namespace = "default" -namespace_create = false -gcs_fuse_create = true +namespace = "default" +namespace_create = false +gcs_fuse_create = true diff --git a/benchmarks/infra/stage-2/variables.tf b/benchmarks/infra/stage-2/variables.tf index 162575f36..e68ace4b6 100644 --- a/benchmarks/infra/stage-2/variables.tf +++ b/benchmarks/infra/stage-2/variables.tf @@ -135,12 +135,12 @@ variable "secret_location" { variable "nvidia_dcgm_create" { description = "Should create nvidia dcgm resources or not; for use on GPU VMs" - type = bool - default = true + type = bool + default = true } variable "gcs_fuse_create" { description = "Give the SA object admin privileges" - type = bool - default = false + type = bool + default = false } \ No newline at end of file From 0ff5df4d759dc432d3fb6491db5b9beea91605e2 Mon Sep 17 00:00:00 2001 From: Kellen Swain Date: Fri, 24 May 2024 15:46:33 +0000 Subject: [PATCH 12/12] migrating tfvars to a shared folder --- benchmarks/README.md | 8 ++++---- .../benchmark/tools/locust-load-inference/README.md | 6 +++--- .../tgi-sample.tfvars} | 0 benchmarks/infra/README.md | 4 ++-- benchmarks/infra/stage-1/README.md | 4 ++-- .../gpu-sample.tfvars} | 0 benchmarks/infra/stage-2/README.md | 4 ++-- .../gpu-sample.tfvars} | 0 8 files 
changed, 13 insertions(+), 13 deletions(-) rename benchmarks/benchmark/tools/locust-load-inference/{sample-terraform.tfvars => sample-tfvars/tgi-sample.tfvars} (100%) rename benchmarks/infra/stage-1/{sample-terraform.tfvars => sample-tfvars/gpu-sample.tfvars} (100%) rename benchmarks/infra/stage-2/{sample-terraform.tfvars => sample-tfvars/gpu-sample.tfvars} (100%) diff --git a/benchmarks/README.md b/benchmarks/README.md index 77c713818..341393709 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -34,7 +34,7 @@ cd infra/stage-1 # Copy the sample variables and update the project ID, cluster name and other parameters as needed in the `terraform.tfvars` file. -cp sample-terraform.tfvars terraform.tfvars +cp ./sample-tfvars/gpu-sample.tfvars terraform.tfvars # Initialize the Terraform modules. terraform init @@ -67,7 +67,7 @@ cd infra/stage-2 # and the project name and bucket name parameters as needed in the # `terraform.tfvars` file. You can specify a new bucket name in which case it # will be created. -cp sample-terraform.tfvars terraform.tfvars +cp ./sample-tfvars/gpu-sample.tfvars terraform.tfvars # Initialize the Terraform modules. terraform init @@ -88,7 +88,7 @@ cd inference-server/text-generation-inference # Copy the sample variables and update the project number and cluster name in # the fleet_host variable "https://connectgateway.googleapis.com/v1/projects//locations/global/gkeMemberships/" # in the `terraform.tfvars` file. -cp sample-terraform.tfvars terraform.tfvars +cp ./sample-tfvars/gpu-sample.tfvars terraform.tfvars # Initialize the Terraform modules. terraform init @@ -120,7 +120,7 @@ cd benchmark/tools/locust-load-inference # Copy the sample variables and update the project number and cluster name in # the fleet_host variable "https://connectgateway.googleapis.com/v1/projects//locations/global/gkeMemberships/" # in the `terraform.tfvars` file. -cp sample-terraform.tfvars terraform.tfvars +cp ./sample-tfvars/tgi-sample.tfvars terraform.tfvars # Initialize the Terraform modules. terraform init diff --git a/benchmarks/benchmark/tools/locust-load-inference/README.md b/benchmarks/benchmark/tools/locust-load-inference/README.md index 5a57e60db..cc3727b9f 100644 --- a/benchmarks/benchmark/tools/locust-load-inference/README.md +++ b/benchmarks/benchmark/tools/locust-load-inference/README.md @@ -50,7 +50,7 @@ This is my first prompt.\n This is my second prompt.\n ``` -Example prompt datasets are available in the "../../dataset" folder with python scripts and instructions on how to make the dataset available for consumption by this benchmark. The dataset used in the `sample-terraform.tfvars` is the "ShareGPT_v3_unflitered_cleaned_split". +Example prompt datasets are available in the "../../dataset" folder with python scripts and instructions on how to make the dataset available for consumption by this benchmark. The dataset used in the `./sample-tfvars/tgi-sample.tfvars` is the "ShareGPT_v3_unflitered_cleaned_split". You will set the `gcs_path` in your `terraform.tfvars` to this gcs path containing your prompts. @@ -101,10 +101,10 @@ gcloud artifacts repositories create ai-benchmark --location=us-central1 --repos ### Step 6: create and configure terraform.tfvars -Create a `terraform.tfvars` file. `sample-terraform.tfvars` is provided as an example file. You can copy the file as a starting point. Note that at a minimum you will have to change the existing `credentials_config`, `project_id`, and `artifact_registry`. +Create a `terraform.tfvars` file. 
`./sample-tfvars/tgi-sample.tfvars` is provided as an example file. You can copy the file as a starting point. Note that at a minimum you will have to change the existing `credentials_config`, `project_id`, and `artifact_registry`. ```bash -cp sample-terraform.tfvars terraform.tfvars +cp ./sample-tfvars/tgi-sample.tfvars terraform.tfvars ``` Fill out your `terraform.tfvars` with the desired model and server configuration, referring to the list of required and optional variables [here](#variables). The following variables are required: diff --git a/benchmarks/benchmark/tools/locust-load-inference/sample-terraform.tfvars b/benchmarks/benchmark/tools/locust-load-inference/sample-tfvars/tgi-sample.tfvars similarity index 100% rename from benchmarks/benchmark/tools/locust-load-inference/sample-terraform.tfvars rename to benchmarks/benchmark/tools/locust-load-inference/sample-tfvars/tgi-sample.tfvars diff --git a/benchmarks/infra/README.md b/benchmarks/infra/README.md index 75aa91628..87ec02b18 100644 --- a/benchmarks/infra/README.md +++ b/benchmarks/infra/README.md @@ -14,7 +14,7 @@ At a high level you will run the following: ``` cd infra/stage-1 -cp sample-terraform.tfvars terraform.tfvars +cp ./sample-tfvars/gpu-sample.tfvars terraform.tfvars terraform init @@ -31,7 +31,7 @@ You can find more details in the stage-2/README.md. At a high level you will run ``` cd infra/stage-2 -cp sample-terraform.tfvars terraform.tfvars +cp ./sample-tfvars/gpu-sample.tfvars terraform.tfvars terraform init diff --git a/benchmarks/infra/stage-1/README.md b/benchmarks/infra/stage-1/README.md index 32260df5a..8f56588a8 100644 --- a/benchmarks/infra/stage-1/README.md +++ b/benchmarks/infra/stage-1/README.md @@ -27,10 +27,10 @@ In particular, stage-1 provisions: ### Step 1: create and configure terraform.tfvars -Create a `terraform.tfvars` file. `sample-terraform.tfvars` is provided as an example file. You can copy the file as a starting point. Note that you will have to change the existing `project_id`. +Create a `terraform.tfvars` file. `./sample-tfvars/gpu-sample.tfvars` is provided as an example file. You can copy the file as a starting point. Note that you will have to change the existing `project_id`. ```bash -cp sample-terraform.tfvars terraform.tfvars +cp ./sample-tfvars/gpu-sample.tfvars terraform.tfvars ``` Fill out your `terraform.tfvars` with the desired project and cluster configuration, referring to the list of required and optional variables [here](#variables). Variables `cluster_name` and `project_id` are required. diff --git a/benchmarks/infra/stage-1/sample-terraform.tfvars b/benchmarks/infra/stage-1/sample-tfvars/gpu-sample.tfvars similarity index 100% rename from benchmarks/infra/stage-1/sample-terraform.tfvars rename to benchmarks/infra/stage-1/sample-tfvars/gpu-sample.tfvars diff --git a/benchmarks/infra/stage-2/README.md b/benchmarks/infra/stage-2/README.md index b6e594d8f..7d02cf6c3 100644 --- a/benchmarks/infra/stage-2/README.md +++ b/benchmarks/infra/stage-2/README.md @@ -29,10 +29,10 @@ In particular, stage-2 provisions: ### Step 1: create and configure terraform.tfvars -Create a `terraform.tfvars` file. `sample-terraform.tfvars` is provided as an example file. You can copy the file as a starting point. Note that you will have to change the existing `project_id`. +Create a `terraform.tfvars` file. `./sample-tfvars/gpu-sample.tfvars` is provided as an example file. You can copy the file as a starting point. Note that you will have to change the existing `project_id`. 
```bash -cp sample-terraform.tfvars terraform.tfvars +cp ./sample-tfvars/gpu-sample.tfvars terraform.tfvars ``` Fill out your `terraform.tfvars` with the desired project and cluster configuration, referring to the list of required and optional variables [here](#variables). Variables `credentials_config` and `project_id` are required. diff --git a/benchmarks/infra/stage-2/sample-terraform.tfvars b/benchmarks/infra/stage-2/sample-tfvars/gpu-sample.tfvars similarity index 100% rename from benchmarks/infra/stage-2/sample-terraform.tfvars rename to benchmarks/infra/stage-2/sample-tfvars/gpu-sample.tfvars
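With the sample variables relocated as above, the stage-1 provisioning flow looks roughly like this (a minimal sketch using the renamed sample paths; substitute `./sample-tfvars/jetstream-sample.tfvars` for TPU runs):

```bash
# From the benchmarks directory: copy the relocated sample variables and provision stage-1
cd infra/stage-1
cp ./sample-tfvars/gpu-sample.tfvars terraform.tfvars

# Initialize and apply the Terraform modules
terraform init
terraform apply
```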