From c0782ef23eedd21fb67e8e50b0895bf5e737d4f8 Mon Sep 17 00:00:00 2001
From: Somasundaram
Date: Tue, 25 Jun 2024 15:33:42 -0700
Subject: [PATCH] [ci] add trtllm chat test

---
 tests/integration/llm/client.py  | 11 +++++++++++
 tests/integration/llm/prepare.py |  6 ++++++
 tests/integration/tests.py       |  6 ++++++
 3 files changed, 23 insertions(+)

diff --git a/tests/integration/llm/client.py b/tests/integration/llm/client.py
index 71df58eda..742c0a4b4 100644
--- a/tests/integration/llm/client.py
+++ b/tests/integration/llm/client.py
@@ -553,6 +553,15 @@ def get_model_name():
     }
 }
 
+trtllm_chat_model_spec = {
+    "llama2-7b-chat": {
+        "max_memory_per_gpu": [25.0],
+        "batch_size": [1, 4],
+        "seq_length": [256],
+        "tokenizer": "TheBloke/Llama-2-7B-Chat-fp16"
+    }
+}
+
 no_code_rolling_batch_spec = {
     "llama-7b": {
         "max_memory_per_gpu": [25.0],
@@ -1286,6 +1295,8 @@ def run(raw_args):
         test_handler_rolling_batch(args.model, lmi_dist_aiccl_model_spec)
     elif args.handler == "trtllm":
         test_handler_rolling_batch(args.model, trtllm_model_spec)
+    elif args.handler == "trtllm_chat":
+        test_handler_rolling_batch_chat(args.model, trtllm_chat_model_spec)
     elif args.handler == "no_code":
         test_handler_rolling_batch(args.model, no_code_rolling_batch_spec)
 
diff --git a/tests/integration/llm/prepare.py b/tests/integration/llm/prepare.py
index abe8de3ab..430566f08 100644
--- a/tests/integration/llm/prepare.py
+++ b/tests/integration/llm/prepare.py
@@ -862,6 +862,12 @@
         "option.use_custom_all_reduce": False,
         "option.max_rolling_batch_size": 32,
         "option.output_formatter": "jsonlines"
+    },
+    "llama2-7b-chat": {
+        "option.model_id": "s3://djl-llm/meta-llama-Llama-2-7b-chat-hf/",
+        "option.dtype": "fp16",
+        "option.tensor_parallel_degree": 4,
+        "option.max_rolling_batch_size": 4
     }
 }
 
diff --git a/tests/integration/tests.py b/tests/integration/tests.py
index c8cabba6c..94463eb93 100644
--- a/tests/integration/tests.py
+++ b/tests/integration/tests.py
@@ -173,6 +173,12 @@ def test_qwen_7b(self):
             r.launch("CUDA_VISIBLE_DEVICES=0,1,2,3")
             client.run("trtllm qwen-7b".split())
 
+    def test_llama2_7b_chat(self):
+        with Runner('tensorrt-llm', 'llama2-7b-chat') as r:
+            prepare.build_trtllm_handler_model("llama2-7b-chat")
+            r.launch("CUDA_VISIBLE_DEVICES=0,1,2,3")
+            client.run("trtllm_chat llama2-7b-chat".split())
+
 
 class TestSchedulerSingleGPU:
     # Runs on g5.12xl
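
Note for reviewers (not part of the patch itself): the new trtllm_chat branch dispatches to
test_handler_rolling_batch_chat, which already exists in client.py and is not shown in this
diff. The sketch below is a minimal illustration of the kind of request that chat check is
expected to issue against the launched container. It assumes the server listens on the
standard local invocations endpoint and accepts an OpenAI-style "messages" payload; the URL,
payload fields, and response shape are assumptions for illustration, not taken from this diff.

    # sketch_chat_request.py: illustrative only, not part of this patch.
    import requests

    # Hypothetical endpoint; the real test derives the address from the Runner setup.
    url = "http://127.0.0.1:8080/invocations"

    payload = {
        "messages": [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": "What is deep learning?"},
        ],
        # Mirrors seq_length in trtllm_chat_model_spec; field name is assumed.
        "max_tokens": 256,
    }

    res = requests.post(url, json=payload, timeout=120)
    res.raise_for_status()
    body = res.json()

    # A chat-completions style response shape is assumed here.
    assert body.get("choices"), body
    print(body["choices"][0]["message"]["content"])

The spec's batch_size values ([1, 4]) suggest the check runs both a single request and four
concurrent ones, following the same pattern as the existing non-chat rolling-batch tests.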