From c0782ef23eedd21fb67e8e50b0895bf5e737d4f8 Mon Sep 17 00:00:00 2001
From: Somasundaram
Date: Tue, 25 Jun 2024 15:33:42 -0700
Subject: [PATCH] [ci] add trtllm chat test

---
 tests/integration/llm/client.py  | 11 +++++++++++
 tests/integration/llm/prepare.py |  6 ++++++
 tests/integration/tests.py       |  6 ++++++
 3 files changed, 23 insertions(+)

diff --git a/tests/integration/llm/client.py b/tests/integration/llm/client.py
index 71df58eda..742c0a4b4 100644
--- a/tests/integration/llm/client.py
+++ b/tests/integration/llm/client.py
@@ -553,6 +553,15 @@ def get_model_name():
     }
 }
 
+trtllm_chat_model_spec = {
+    "llama2-7b-chat": {
+        "max_memory_per_gpu": [25.0],
+        "batch_size": [1, 4],
+        "seq_length": [256],
+        "tokenizer": "TheBloke/Llama-2-7B-Chat-fp16"
+    }
+}
+
 no_code_rolling_batch_spec = {
     "llama-7b": {
         "max_memory_per_gpu": [25.0],
@@ -1286,6 +1295,8 @@ def run(raw_args):
         test_handler_rolling_batch(args.model, lmi_dist_aiccl_model_spec)
     elif args.handler == "trtllm":
         test_handler_rolling_batch(args.model, trtllm_model_spec)
+    elif args.handler == "trtllm_chat":
+        test_handler_rolling_batch_chat(args.model, trtllm_chat_model_spec)
     elif args.handler == "no_code":
         test_handler_rolling_batch(args.model, no_code_rolling_batch_spec)
 
diff --git a/tests/integration/llm/prepare.py b/tests/integration/llm/prepare.py
index abe8de3ab..430566f08 100644
--- a/tests/integration/llm/prepare.py
+++ b/tests/integration/llm/prepare.py
@@ -862,6 +862,12 @@
         "option.use_custom_all_reduce": False,
         "option.max_rolling_batch_size": 32,
         "option.output_formatter": "jsonlines"
+    },
+    "llama2-7b-chat": {
+        "option.model_id": "s3://djl-llm/meta-llama-Llama-2-7b-chat-hf/",
+        "option.dtype": "fp16",
+        "option.tensor_parallel_degree": 4,
+        "option.max_rolling_batch_size": 4
     }
 }
 
diff --git a/tests/integration/tests.py b/tests/integration/tests.py
index c8cabba6c..94463eb93 100644
--- a/tests/integration/tests.py
+++ b/tests/integration/tests.py
@@ -173,6 +173,12 @@ def test_qwen_7b(self):
             r.launch("CUDA_VISIBLE_DEVICES=0,1,2,3")
             client.run("trtllm qwen-7b".split())
 
+    def test_llama2_7b_chat(self):
+        with Runner('tensorrt-llm', 'llama2-7b-chat') as r:
+            prepare.build_trtllm_handler_model("llama2-7b-chat")
+            r.launch("CUDA_VISIBLE_DEVICES=0,1,2,3")
+            client.run("trtllm_chat llama2-7b-chat".split())
+
 
 class TestSchedulerSingleGPU:
     # Runs on g5.12xl
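
Note for reviewers (not part of the patch itself): the new trtllm_chat branch dispatches to
test_handler_rolling_batch_chat, which already exists in client.py and is not shown in this
diff. The sketch below is a minimal illustration of the kind of request that chat check is
expected to issue against the launched container. It assumes the server listens on the
standard local invocations endpoint and accepts an OpenAI-style "messages" payload; the URL,
payload fields, and response shape are assumptions for illustration, not taken from this diff.

    # sketch_chat_request.py: illustrative only, not part of this patch.
    import requests

    # Hypothetical endpoint; the real test derives the address from the Runner setup.
    url = "http://127.0.0.1:8080/invocations"

    payload = {
        "messages": [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": "What is deep learning?"},
        ],
        # Mirrors seq_length in trtllm_chat_model_spec; field name is assumed.
        "max_tokens": 256,
    }

    res = requests.post(url, json=payload, timeout=120)
    res.raise_for_status()
    body = res.json()

    # A chat-completions style response shape is assumed here.
    assert body.get("choices"), body
    print(body["choices"][0]["message"]["content"])

The spec's batch_size values ([1, 4]) suggest the check runs both a single request and four
concurrent ones, following the same pattern as the existing non-chat rolling-batch tests.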