Commit
Use TensorRT-LLM native parameter names in nemo.export module (NVIDIA#9424)

* Use native TRT-LLM param names in export (partial)

Signed-off-by: Jan Lasek <janek.lasek@gmail.com>

* max_input_len & max_output_len rename cont'd

Signed-off-by: Jan Lasek <janek.lasek@gmail.com>

* Renames in infer_data_path.py

Signed-off-by: Jan Lasek <janek.lasek@gmail.com>

* Allow for max_output_token in TensorRTLLM forward with deprecation warning

Signed-off-by: Jan Lasek <janek.lasek@gmail.com>

* Apply isort and black reformatting

Signed-off-by: janekl <janekl@users.noreply.github.com>

---------

Signed-off-by: Jan Lasek <janek.lasek@gmail.com>
Signed-off-by: janekl <janekl@users.noreply.github.com>
Co-authored-by: janekl <janekl@users.noreply.github.com>
janekl authored Jun 10, 2024
1 parent f375d51 commit 69954ef
Showing 7 changed files with 108 additions and 78 deletions.
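
All of the files below apply the same deprecation-shim pattern: the TensorRT-LLM native name (max_input_len / max_output_len) becomes the real parameter, and the old name is kept as an Optional keyword that, when supplied, emits a DeprecationWarning and is copied onto the new one. A minimal, self-contained sketch of that pattern (the generate function here is a hypothetical stand-in, not part of the diff):

    import warnings
    from typing import Optional

    def generate(max_output_len: int = 64, max_output_token: Optional[int] = None) -> int:
        """Toy stand-in showing the deprecation shim used throughout this commit."""
        if max_output_token is not None:
            warnings.warn(
                "Parameter max_output_token is deprecated and will be removed. Please use max_output_len instead.",
                DeprecationWarning,
                stacklevel=2,
            )
            max_output_len = max_output_token
        return max_output_len
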
18 changes: 9 additions & 9 deletions nemo/deploy/nlp/query_llm.py
@@ -37,7 +37,7 @@ def query_llm(
stop_words_list=None,
bad_words_list=None,
no_repeat_ngram_size=None,
- max_output_token=512,
+ max_output_len=512,
top_k=1,
top_p=0.0,
temperature=1.0,
@@ -81,7 +81,7 @@ def query_llm(
stop_words_list=None,
bad_words_list=None,
no_repeat_ngram_size=None,
- max_output_token=512,
+ max_output_len=512,
top_k=1,
top_p=0.0,
temperature=1.0,
@@ -95,7 +95,7 @@
Args:
prompts (List(str)): list of sentences.
- max_output_token (int): max generated tokens.
+ max_output_len (int): max generated tokens.
top_k (int): limits us to a certain number (K) of the top tokens to consider.
top_p (float): limits us to the top tokens within a certain probability mass (p).
temperature (float): A parameter of the softmax function, which is the last layer in the network.
Expand All @@ -110,8 +110,8 @@ def query_llm(
prompts = str_list2numpy(prompts)
inputs = {"prompts": prompts}

- if max_output_token is not None:
- inputs["max_output_token"] = np.full(prompts.shape, max_output_token, dtype=np.int_)
+ if max_output_len is not None:
+ inputs["max_output_len"] = np.full(prompts.shape, max_output_len, dtype=np.int_)

if top_k is not None:
inputs["top_k"] = np.full(prompts.shape, top_k, dtype=np.int_)
@@ -157,7 +157,7 @@ def query_llm_streaming(
stop_words_list=None,
bad_words_list=None,
no_repeat_ngram_size=None,
- max_output_token=512,
+ max_output_len=512,
top_k=1,
top_p=0.0,
temperature=1.0,
@@ -171,7 +171,7 @@
Args:
prompts (List(str)): list of sentences.
- max_output_token (int): max generated tokens.
+ max_output_len (int): max generated tokens.
top_k (int): limits us to a certain number (K) of the top tokens to consider.
top_p (float): limits us to the top tokens within a certain probability mass (p).
temperature (float): A parameter of the softmax function, which is the last layer in the network.
@@ -186,8 +186,8 @@ def query_llm_streaming(
prompts = str_list2numpy(prompts)
inputs = {"prompts": prompts}

- if max_output_token is not None:
- inputs["max_output_token"] = np.full(prompts.shape, max_output_token, dtype=np.int_)
+ if max_output_len is not None:
+ inputs["max_output_len"] = np.full(prompts.shape, max_output_len, dtype=np.int_)

if top_k is not None:
inputs["top_k"] = np.full(prompts.shape, top_k, dtype=np.int_)
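On the client side, the Triton input tensor is now named max_output_len as well. A minimal query sketch, assuming NemoQueryLLM is importable from nemo.deploy.nlp and takes url and model_name constructor arguments (the endpoint and model name below are placeholders):

    from nemo.deploy.nlp import NemoQueryLLM

    # Placeholder Triton endpoint and model name; adjust to your deployment.
    nq = NemoQueryLLM(url="localhost:8000", model_name="GPT-2B")

    output = nq.query_llm(
        prompts=["What is the color of a banana?"],
        max_output_len=128,  # formerly max_output_token
        top_k=1,
        top_p=0.0,
        temperature=1.0,
    )
    print(output)
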
66 changes: 48 additions & 18 deletions nemo/export/tensorrt_llm.py
@@ -18,8 +18,9 @@
import pickle
import shutil
import tempfile
+ import warnings
from pathlib import Path
- from typing import List
+ from typing import List, Optional

import numpy as np
import tensorrt_llm
@@ -119,8 +120,10 @@ def export(
n_gpus: int = 1,
tensor_parallel_size: int = None,
pipeline_parallel_size: int = None,
- max_input_token: int = 256,
- max_output_token: int = 256,
+ max_input_len: int = 256,
+ max_output_len: int = 256,
+ max_input_token: Optional[int] = None,
+ max_output_token: Optional[int] = None,
max_batch_size: int = 8,
max_prompt_embedding_table_size=None,
use_parallel_embedding: bool = False,
@@ -146,8 +149,10 @@ def export(
n_gpus (int): number of GPUs to use for inference.
tensor_parallel_size (int): tensor parallelism.
pipeline_parallel_size (int): pipeline parallelism.
- max_input_token (int): max input length.
- max_output_token (int): max output length.
+ max_input_len (int): max input length.
+ max_output_len (int): max output length.
+ max_input_token (int): max input length. Deprecated, use max_input_len instead.
+ max_output_token (int): max output length. Deprecated, use max_output_len instead.
max_batch_size (int): max batch size.
max_prompt_embedding_table_size (int): max prompt embedding size.
use_parallel_embedding (bool): whether to use parallel embedding feature of TRT-LLM or not
@@ -204,6 +209,22 @@ def export(

self.model = None

+ if max_input_token is not None:
+ warnings.warn(
+ "Parameter max_input_token is deprecated and will be removed. Please use max_input_len instead.",
+ DeprecationWarning,
+ stacklevel=2,
+ )
+ max_input_len = max_input_token
+
+ if max_output_token is not None:
+ warnings.warn(
+ "Parameter max_output_token is deprecated and will be removed. Please use max_output_len instead.",
+ DeprecationWarning,
+ stacklevel=2,
+ )
+ max_output_len = max_output_token
+
if tensorrt_llm.mpi_rank() == 0:
tmp_dir = tempfile.TemporaryDirectory()
nemo_export_dir = Path(tmp_dir.name)
@@ -219,8 +240,8 @@ def export(
qnemo_to_tensorrt_llm(
nemo_checkpoint_path=nemo_checkpoint_path,
engine_dir=self.model_dir,
- max_input_len=max_input_token,
- max_output_len=max_output_token,
+ max_input_len=max_input_len,
+ max_output_len=max_output_len,
max_batch_size=max_batch_size,
max_prompt_embedding_table_size=max_prompt_embedding_table_size,
lora_target_modules=lora_target_modules,
@@ -240,8 +261,8 @@

for weight_dict, model_config in zip(weights_dicts, model_configs):
build_and_save_engine(
- max_input_len=max_input_token,
- max_output_len=max_output_token,
+ max_input_len=max_input_len,
+ max_output_len=max_output_len,
max_batch_size=max_batch_size,
model_config=model_config,
model_weights=weight_dict,
@@ -280,7 +301,8 @@ def export(
def forward(
self,
input_texts: List[str],
- max_output_token: int = 64,
+ max_output_len: int = 64,
+ max_output_token: Optional[int] = None,
top_k: int = 1,
top_p: float = 0.0,
temperature: float = 1.0,
@@ -300,7 +322,8 @@ def forward(
Args:
input_texts (List(str)): list of sentences.
- max_output_token (int): max generated tokens.
+ max_output_len (int): max generated tokens.
+ max_output_token (int): max generated tokens. Deprecated, use max_output_len instead.
top_k (int): limits us to a certain number (K) of the top tokens to consider.
top_p (float): limits us to the top tokens within a certain probability mass (p).
temperature (float): A parameter of the softmax function, which is the last layer in the network.
@@ -319,6 +342,13 @@ def forward(
"then it should be loaded first to run inference."
)
else:
+ if max_output_token is not None:
+ warnings.warn(
+ "Parameter max_output_token is deprecated and will be removed. Please use max_output_len instead.",
+ DeprecationWarning,
+ stacklevel=2,
+ )
+ max_output_len = max_output_token
if prompt_embeddings_table is not None or prompt_embeddings_checkpoint_path is not None:
prompt_table = self._get_prompt_embedding_table(
prompt_embeddings_table, prompt_embeddings_checkpoint_path
@@ -366,7 +396,7 @@ def forward(

return generate(
input_texts=input_texts,
- max_output_len=max_output_token,
+ max_output_len=max_output_len,
host_context=self.model,
top_k=top_k,
top_p=top_p,
@@ -386,7 +416,7 @@ def forward(
else:
return generate_streaming(
input_texts=input_texts,
- max_output_len=max_output_token,
+ max_output_len=max_output_len,
host_context=self.model,
top_k=top_k,
top_p=top_p,
@@ -449,7 +479,7 @@ def get_hidden_size(self):
def get_triton_input(self):
inputs = (
Tensor(name="prompts", shape=(-1,), dtype=bytes),
- Tensor(name="max_output_token", shape=(-1,), dtype=np.int_, optional=True),
+ Tensor(name="max_output_len", shape=(-1,), dtype=np.int_, optional=True),
Tensor(name="top_k", shape=(-1,), dtype=np.int_, optional=True),
Tensor(name="top_p", shape=(-1,), dtype=np.single, optional=True),
Tensor(name="temperature", shape=(-1,), dtype=np.single, optional=True),
@@ -471,8 +501,8 @@ def get_triton_output(self):
def triton_infer_fn(self, **inputs: np.ndarray):
try:
infer_input = {"input_texts": str_ndarray2list(inputs.pop("prompts"))}
- if "max_output_token" in inputs:
- infer_input["max_output_token"] = inputs.pop("max_output_token")[0][0]
+ if "max_output_len" in inputs:
+ infer_input["max_output_len"] = inputs.pop("max_output_len")[0][0]
if "top_k" in inputs:
infer_input["top_k"] = inputs.pop("top_k")[0][0]
if "top_p" in inputs:
@@ -508,8 +538,8 @@ def triton_infer_fn(self, **inputs: np.ndarray):
def triton_infer_fn_streaming(self, **inputs: np.ndarray):
try:
infer_input = {"input_texts": str_ndarray2list(inputs.pop("prompts"))}
- if "max_output_token" in inputs:
- infer_input["max_output_token"] = inputs.pop("max_output_token")[0][0]
+ if "max_output_len" in inputs:
+ infer_input["max_output_len"] = inputs.pop("max_output_len")[0][0]
if "top_k" in inputs:
infer_input["top_k"] = inputs.pop("top_k")[0][0]
if "top_p" in inputs:
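Putting the renamed export and inference arguments together, a rough usage sketch follows. It assumes TensorRTLLM is importable from nemo.export; the checkpoint path, engine directory, and model_type value are placeholders rather than values taken from this diff. The deprecated max_input_token / max_output_token keywords are still accepted, but they emit a DeprecationWarning and are mapped onto the new names.

    from nemo.export import TensorRTLLM

    # Placeholder engine directory; the built TRT-LLM engine is written here.
    trt_llm_exporter = TensorRTLLM(model_dir="/tmp/trt_llm_engine")

    trt_llm_exporter.export(
        nemo_checkpoint_path="/models/gpt.nemo",  # placeholder checkpoint path
        model_type="gptnext",                     # assumed model type; other options left at defaults
        max_input_len=256,    # was max_input_token
        max_output_len=256,   # was max_output_token
        max_batch_size=8,
    )

    output = trt_llm_exporter.forward(
        input_texts=["What is the fastest animal on land?"],
        max_output_len=64,    # was max_output_token
        top_k=1,
    )
    print(output)
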
4 changes: 2 additions & 2 deletions scripts/deploy/nlp/deploy_triton.py
@@ -229,8 +229,8 @@ def nemo_deploy(argv):
n_gpus=args.num_gpus,
tensor_parallel_size=args.num_gpus,
pipeline_parallel_size=1,
- max_input_token=args.max_input_len,
- max_output_token=args.max_output_len,
+ max_input_len=args.max_input_len,
+ max_output_len=args.max_output_len,
max_batch_size=args.max_batch_size,
max_num_tokens=args.max_num_tokens,
opt_num_tokens=args.opt_num_tokens,
18 changes: 9 additions & 9 deletions scripts/deploy/nlp/query.py
@@ -33,7 +33,7 @@ def get_args(argv):
parser.add_argument("-swl", "--stop_words_list", type=str, help="Stop words list")
parser.add_argument("-bwl", "--bad_words_list", type=str, help="Bad words list")
parser.add_argument("-nrns", "--no_repeat_ngram_size", type=int, help="No repeat ngram size")
- parser.add_argument("-mot", "--max_output_token", default=128, type=int, help="Max output token length")
+ parser.add_argument("-mol", "--max_output_len", default=128, type=int, help="Max output token length")
parser.add_argument("-tk", "--top_k", default=1, type=int, help="top_k")
parser.add_argument("-tpp", "--top_p", default=0.0, type=float, help="top_p")
parser.add_argument("-t", "--temperature", default=1.0, type=float, help="temperature")
@@ -67,7 +67,7 @@ def query_llm(
stop_words_list=None,
bad_words_list=None,
no_repeat_ngram_size=None,
- max_output_token=128,
+ max_output_len=128,
top_k=1,
top_p=0.0,
temperature=1.0,
@@ -79,8 +79,8 @@
prompts = str_list2numpy(prompts)
inputs = {"prompts": prompts}

- if max_output_token is not None:
- inputs["max_output_token"] = np.full(prompts.shape, max_output_token, dtype=np.int_)
+ if max_output_len is not None:
+ inputs["max_output_len"] = np.full(prompts.shape, max_output_len, dtype=np.int_)

if top_k is not None:
inputs["top_k"] = np.full(prompts.shape, top_k, dtype=np.int_)
@@ -131,7 +131,7 @@ def query_llm_streaming(
stop_words_list=None,
bad_words_list=None,
no_repeat_ngram_size=None,
- max_output_token=512,
+ max_output_len=512,
top_k=1,
top_p=0.0,
temperature=1.0,
@@ -143,8 +143,8 @@
prompts = str_list2numpy(prompts)
inputs = {"prompts": prompts}

- if max_output_token is not None:
- inputs["max_output_token"] = np.full(prompts.shape, max_output_token, dtype=np.int_)
+ if max_output_len is not None:
+ inputs["max_output_len"] = np.full(prompts.shape, max_output_len, dtype=np.int_)

if top_k is not None:
inputs["top_k"] = np.full(prompts.shape, top_k, dtype=np.int_)
@@ -202,7 +202,7 @@ def query(argv):
stop_words_list=None if args.stop_words_list is None else [args.stop_words_list],
bad_words_list=None if args.bad_words_list is None else [args.bad_words_list],
no_repeat_ngram_size=args.no_repeat_ngram_size,
- max_output_token=args.max_output_token,
+ max_output_len=args.max_output_len,
top_k=args.top_k,
top_p=args.top_p,
temperature=args.temperature,
@@ -232,7 +232,7 @@ def query(argv):
stop_words_list=None if args.stop_words_list is None else [args.stop_words_list],
bad_words_list=None if args.bad_words_list is None else [args.bad_words_list],
no_repeat_ngram_size=args.no_repeat_ngram_size,
- max_output_token=args.max_output_token,
+ max_output_len=args.max_output_len,
top_k=args.top_k,
top_p=args.top_p,
temperature=args.temperature,
4 changes: 2 additions & 2 deletions scripts/export/export_to_trt_llm.py
@@ -140,8 +140,8 @@ def nemo_export_trt_llm(argv):
n_gpus=args.num_gpus,
tensor_parallel_size=args.tensor_parallelism_size,
pipeline_parallel_size=args.pipeline_parallelism_size,
- max_input_token=args.max_input_len,
- max_output_token=args.max_output_len,
+ max_input_len=args.max_input_len,
+ max_output_len=args.max_output_len,
max_batch_size=args.max_batch_size,
max_num_tokens=args.max_num_tokens,
opt_num_tokens=args.opt_num_tokens,
