Support user-specified data type #2473

Merged · 9 commits · Sep 18, 2024 (changes from all commits)
13 changes: 13 additions & 0 deletions lmdeploy/cli/cli.py
@@ -66,6 +66,16 @@ def add_parser_convert():
default=None,
help='the name of the built-in chat template, which can be '
'overviewed by `lmdeploy list`')
parser.add_argument(
'--dtype',
type=str,
default='auto',
choices=['auto', 'float16', 'bfloat16'],
help='data type for model weights and activations. '
'The "auto" option will use FP16 precision '
'for FP32 and FP16 models, and BF16 precision '
'for BF16 models. This option will be ignored if '
'the model is a quantized model')
parser.set_defaults(run=CLI.convert)

@staticmethod
@@ -113,6 +123,7 @@ def add_parser_chat():
ArgumentHelper.adapters(pt_group)
ArgumentHelper.device(pt_group)
# common engine args
dtype_act = ArgumentHelper.dtype(pt_group)
tp_act = ArgumentHelper.tp(pt_group)
session_len_act = ArgumentHelper.session_len(pt_group)
cache_max_entry_act = ArgumentHelper.cache_max_entry_count(pt_group)
@@ -121,6 +132,7 @@ def add_parser_chat():
# turbomind args
tb_group = parser.add_argument_group('TurboMind engine arguments')
# common engine args
tb_group._group_actions.append(dtype_act)
tb_group._group_actions.append(tp_act)
tb_group._group_actions.append(session_len_act)
tb_group._group_actions.append(cache_max_entry_act)
@@ -245,6 +257,7 @@ def chat(args):

adapters = get_lora_adapters(args.adapters)
engine_config = PytorchEngineConfig(
dtype=args.dtype,
tp=args.tp,
session_len=args.session_len,
cache_max_entry_count=args.cache_max_entry_count,
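The chat parser above registers --dtype once (via ArgumentHelper.dtype in the PyTorch group) and then appends the returned action to the TurboMind group. A minimal standalone sketch of that argparse pattern, independent of lmdeploy (all names and values below are illustrative):

import argparse

parser = argparse.ArgumentParser('demo')
pt_group = parser.add_argument_group('PyTorch engine arguments')
tb_group = parser.add_argument_group('TurboMind engine arguments')

# Register --dtype once, under the PyTorch group, and keep the action object.
dtype_act = pt_group.add_argument(
    '--dtype',
    type=str,
    default='auto',
    choices=['auto', 'float16', 'bfloat16'],
    help='data type for model weights and activations')

# List the same action under the TurboMind group so it shows up in both help
# sections; calling add_argument('--dtype', ...) a second time would raise a
# "conflicting option string" error.
tb_group._group_actions.append(dtype_act)

args = parser.parse_args(['--dtype', 'bfloat16'])
print(args.dtype)  # -> bfloat16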
8 changes: 8 additions & 0 deletions lmdeploy/cli/serve.py
@@ -59,6 +59,7 @@ def add_parser_gradio():
pt_group = parser.add_argument_group('PyTorch engine arguments')

# common engine args
dtype_act = ArgumentHelper.dtype(pt_group)
tp_act = ArgumentHelper.tp(pt_group)
ArgumentHelper.device(pt_group)
session_len_act = ArgumentHelper.session_len(pt_group)
@@ -71,6 +72,7 @@ def add_parser_gradio():
# turbomind args
tb_group = parser.add_argument_group('TurboMind engine arguments')
# common engine args
tb_group._group_actions.append(dtype_act)
tb_group._group_actions.append(tp_act)
tb_group._group_actions.append(session_len_act)
tb_group._group_actions.append(max_batch_size_act)
@@ -150,6 +152,7 @@ def add_parser_api_server():
ArgumentHelper.adapters(pt_group)
ArgumentHelper.device(pt_group)
# common engine args
dtype_act = ArgumentHelper.dtype(pt_group)
tp_act = ArgumentHelper.tp(pt_group)
session_len_act = ArgumentHelper.session_len(pt_group)
max_batch_size_act = ArgumentHelper.max_batch_size(pt_group)
@@ -161,6 +164,7 @@ def add_parser_api_server():
# turbomind args
tb_group = parser.add_argument_group('TurboMind engine arguments')
# common engine args
tb_group._group_actions.append(dtype_act)
tb_group._group_actions.append(tp_act)
tb_group._group_actions.append(session_len_act)
tb_group._group_actions.append(max_batch_size_act)
@@ -213,6 +217,7 @@ def gradio(args):
backend = autoget_backend(args.model_path_or_server)
if backend == 'pytorch':
backend_config = PytorchEngineConfig(
dtype=args.dtype,
tp=args.tp,
max_batch_size=max_batch_size,
cache_max_entry_count=args.cache_max_entry_count,
@@ -223,6 +228,7 @@ def gradio(args):
max_prefill_token_num=args.max_prefill_token_num)
else:
backend_config = TurbomindEngineConfig(
dtype=args.dtype,
tp=args.tp,
max_batch_size=max_batch_size,
session_len=args.session_len,
@@ -258,6 +264,7 @@ def api_server(args):
from lmdeploy.messages import PytorchEngineConfig
adapters = get_lora_adapters(args.adapters)
backend_config = PytorchEngineConfig(
dtype=args.dtype,
tp=args.tp,
max_batch_size=max_batch_size,
cache_max_entry_count=args.cache_max_entry_count,
@@ -270,6 +277,7 @@ def api_server(args):
else:
from lmdeploy.messages import TurbomindEngineConfig
backend_config = TurbomindEngineConfig(
dtype=args.dtype,
tp=args.tp,
max_batch_size=max_batch_size,
session_len=args.session_len,
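In both gradio() and api_server(), the parsed --dtype value is forwarded unchanged into whichever engine config the resolved backend needs. A hedged sketch of that dispatch, using only the config classes from lmdeploy.messages; the helper function and its arguments are invented for illustration:

from lmdeploy.messages import PytorchEngineConfig, TurbomindEngineConfig

def build_backend_config(backend: str, dtype: str, tp: int):
    # 'backend' stands in for the autoget_backend() result used in serve.py.
    if backend == 'pytorch':
        return PytorchEngineConfig(dtype=dtype, tp=tp)
    return TurbomindEngineConfig(dtype=dtype, tp=tp)

cfg = build_backend_config('pytorch', dtype='bfloat16', tp=1)
print(type(cfg).__name__, cfg.dtype)  # -> PytorchEngineConfig bfloat16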
13 changes: 13 additions & 0 deletions lmdeploy/cli/utils.py
@@ -100,6 +100,19 @@ def model_name(parser):
'by the RESTful API `/v1/models`. If it is not specified, '
'`model_path` will be adopted')

@staticmethod
def dtype(parser, default: str = 'auto'):
return parser.add_argument(
'--dtype',
type=str,
default=default,
choices=['auto', 'float16', 'bfloat16'],
help='data type for model weights and activations. '
'The "auto" option will use FP16 precision '
'for FP32 and FP16 models, and BF16 precision '
'for BF16 models. This option will be ignored if '
'the model is a quantized model')

@staticmethod
def model_format(parser, default: str = None):
return parser.add_argument(
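The help text fixes what `auto` means: FP16 for FP32/FP16 checkpoints, BF16 for BF16 checkpoints, and no effect on quantized models. The sketch below is not the PR's implementation, only an illustration of that documented rule under the assumption that the checkpoint's torch dtype is already known:

import torch

def resolve_dtype(requested: str, model_torch_dtype: torch.dtype) -> torch.dtype:
    """Illustrative resolution of the documented --dtype semantics."""
    if requested == 'float16':
        return torch.float16
    if requested == 'bfloat16':
        return torch.bfloat16
    # 'auto': BF16 models stay BF16, FP32/FP16 models run in FP16.
    if model_torch_dtype == torch.bfloat16:
        return torch.bfloat16
    return torch.float16

print(resolve_dtype('auto', torch.float32))   # -> torch.float16
print(resolve_dtype('auto', torch.bfloat16))  # -> torch.bfloat16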
122 changes: 92 additions & 30 deletions lmdeploy/messages.py
@@ -54,10 +54,29 @@ class GenerationConfig:
in the decoding. Default to be True.
logprobs (int): Number of log probabilities to return per output token.
response_format (Dict): Only pytorch backend support formatting
response. Examples: `{"type": "json_schema", "json_schema": {"name":"test","schema": {"properties": {"name": {"type": "string"}}, "required": ["name"], "type": "object"}}}`
or `{"type": "regex_schema", "regex_schema": "call me [A-Za-z]{1,10}"}`
response. Examples:
{
"type": "json_schema",
"json_schema": {
"name": "test",
"schema": {
"properties": {
"name": {
"type": "string"
}
},
"required": ["name"],
"type": "object"
}
}
}
or,
{
"type": "regex_schema",
"regex_schema": "call me [A-Za-z]{1,10}"
}
logits_processors (List[Callable]): Custom logit processors.
""" # noqa
"""

n: int = 1
max_new_tokens: int = 512
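A hedged construction example for the response_format payloads documented above (field names are taken from the docstring; only the pytorch backend honours this setting):

from lmdeploy.messages import GenerationConfig

gen_cfg = GenerationConfig(
    max_new_tokens=128,
    response_format={
        'type': 'json_schema',
        'json_schema': {
            'name': 'test',
            'schema': {
                'properties': {'name': {'type': 'string'}},
                'required': ['name'],
                'type': 'object',
            },
        },
    },
)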
@@ -117,30 +136,58 @@ class TurbomindEngineConfig:
"""TurboMind Engine config.

Args:
model_format (str): the layout of the deployed model. It can be one of the following values [hf, meta_llama, awq, gptq],
`hf` meaning huggingface model(.bin, .safetensors), `meta_llama` being meta llama's format(.pth),
`awq` and `gptq` meaning the quantized model by AWQ and GPTQ, respectively.
If it is not specified, i.e. None, it will be extracted from the input model
tp (int): the number of GPU cards used in tensor parallelism, default to 1
session_len (int): the max session length of a sequence, default to None
max_batch_size (int): the max batch size during inference. If it is not specified,
the engine will automatically set it according to the device
cache_max_entry_count (float): the percentage of gpu memory occupied by the k/v cache.
For versions of lmdeploy between `v0.2.0` and `v0.2.1`, it defaults to 0.5, depicting the percentage of TOTAL GPU memory to be allocated to the k/v cache.
For lmdeploy versions greater than `v0.2.1`, it defaults to 0.8, signifying the percentage of FREE GPU memory to be reserved for the k/v cache
cache_chunk_size (int): The policy to apply for KV block from the block manager, default to -1.
cache_block_seq_len (int): the length of the token sequence in a k/v block, default to 64
enable_prefix_caching (bool): enable cache prompts for block reuse, default to False
quant_policy (int): default to 0. When k/v is quantized into 8 bit, set it to 4
rope_scaling_factor (float): scaling factor used for dynamic ntk, default to 0. TurboMind follows the implementation of transformer LlamaAttention
dtype (str): data type for model weights and activations. It can be
one of the following values, ['auto', 'float16', 'bfloat16']
The `auto` option will use FP16 precision for FP32 and FP16
models, and BF16 precision for BF16 models.
model_format (str): the layout of the deployed model. It can be one
of the following values [hf, meta_llama, awq, gptq],`hf` meaning
huggingface model(.bin, .safetensors), `meta_llama` being
meta llama's format(.pth), `awq` and `gptq` meaning the quantized
model by AWQ and GPTQ, respectively. If it is not specified,
i.e. None, it will be extracted from the input model
tp (int): the number of GPU cards used in tensor parallelism,
default to 1
session_len (int): the max session length of a sequence, default to
None
max_batch_size (int): the max batch size during inference. If it is
not specified, the engine will automatically set it according to
the device
cache_max_entry_count (float): the percentage of gpu memory occupied
by the k/v cache.
For versions of lmdeploy between `v0.2.0` and `v0.2.1`, it
defaults to 0.5, depicting the percentage of TOTAL GPU memory to
be allocated to the k/v cache.
For lmdeploy versions greater than `v0.2.1`, it defaults to 0.8,
signifying the percentage of FREE GPU memory to be reserved for
the k/v cache
cache_chunk_size (int): The policy to apply for KV block from
the block manager, default to -1.
cache_block_seq_len (int): the length of the token sequence in
a k/v block, default to 64
enable_prefix_caching (bool): enable cache prompts for block reuse,
default to False
quant_policy (int): default to 0. When k/v is quantized into 4 or 8
bit, set it to 4 or 8, respectively
rope_scaling_factor (float): scaling factor used for dynamic ntk,
default to 0. TurboMind follows the implementation of transformer
LlamaAttention
use_logn_attn (bool): whether or not to use log attn: default to False
download_dir (str): Directory to download and load the weights, default to the default cache directory of huggingface.
revision (str): The specific model version to use. It can be a branch name, a tag name, or a commit id. If unspecified, will use the default version.
max_prefill_token_num(int): the number of tokens each iteration during prefill, default to 8192
num_tokens_per_iter(int): the number of tokens processed in each forward pass. Working with `max_prefill_iters` enables "Dynamic SplitFuse"-like scheduling
max_prefill_iters(int): the max number of forward pass during prefill stage
""" # noqa: E501
download_dir (str): Directory to download and load the weights,
default to the default cache directory of huggingface.
revision (str): The specific model version to use. It can be a branch
name, a tag name, or a commit id. If unspecified, will use the
default version.
max_prefill_token_num(int): the number of tokens each iteration during
prefill, default to 8192
num_tokens_per_iter(int): the number of tokens processed in each
forward pass. Working with `max_prefill_iters` enables the
"Dynamic SplitFuse"-like scheduling
max_prefill_iters(int): the max number of forward pass during prefill
stage
"""

dtype: str = 'auto'
model_format: Optional[str] = None
tp: int = 1
session_len: Optional[int] = None
@@ -160,11 +207,14 @@ class TurbomindEngineConfig:

def __post_init__(self):
"""Check input validation."""
assert self.dtype in ['auto', 'float16', 'bfloat16']
assert self.tp >= 1, 'tp must be a positive integer'
assert self.cache_max_entry_count > 0 and self.cache_max_entry_count < 1, 'invalid cache_max_entry_count' # noqa
assert 0 < self.cache_max_entry_count < 1, \
'invalid cache_max_entry_count'
assert self.quant_policy in (0, 4, 8), 'invalid quant_policy'
assert self.rope_scaling_factor >= 0, 'invalid rope_scaling_factor'
assert self.max_prefill_token_num >= 0, 'invalid max_prefill_token_num'
assert self.max_prefill_token_num >= 0, \
'invalid max_prefill_token_num'
assert self.num_tokens_per_iter >= 0, 'invalid num_tokens_per_iter'


@@ -173,6 +223,10 @@ class PytorchEngineConfig:
"""PyTorch Engine Config.

Args:
dtype (str): data type for model weights and activations. It can be
one of the following values, ['auto', 'float16', 'bfloat16']
The `auto` option will use FP16 precision for FP32 and FP16
models, and BF16 precision for BF16 models.
tp (int): Tensor Parallelism. default 1.
session_len (int): Max session length. Default None.
max_batch_size (int): Max batch size. If it is not specified,
@@ -193,12 +247,17 @@ class PytorchEngineConfig:
thread_safe (bool): thread safe engine instance.
enable_prefix_caching (bool): Enable token match and sharing caches.
device_type (str): The inference device type, options ['cuda']
eager_mode (bool): Enable "eager" mode or not
custom_module_map (Dict): nn module map customized by users. Once
provided, the original nn modules of the model will be
substituted by the mapping ones
download_dir (str): Directory to download and load the weights,
default to the default cache directory of huggingface.
revision (str): The specific model version to use.
It can be a branch name, a tag name, or a commit id.
If unspecified, will use the default version.
"""
dtype: str = 'auto'
tp: int = 1
session_len: int = None
max_batch_size: int = None
@@ -213,16 +272,19 @@ class PytorchEngineConfig:
enable_prefix_caching: bool = False
device_type: str = 'cuda'
eager_mode: bool = False
custom_module_map: str = None
custom_module_map: Dict[str, str] = None
download_dir: str = None
revision: str = None

def __post_init__(self):
"""Check input validation."""
assert self.dtype in ['auto', 'float16', 'bfloat16']
assert self.tp >= 1, 'invalid tp'
assert self.cache_max_entry_count > 0 and self.cache_max_entry_count < 1, 'invalid cache_max_entry_count' # noqa
assert 0 < self.cache_max_entry_count < 1, \
'invalid cache_max_entry_count'
assert self.num_cpu_blocks >= 0, 'invalid num_cpu_blocks'
assert self.max_prefill_token_num >= 0, 'invalid max_prefill_token_num'
assert self.max_prefill_token_num >= 0, \
'invalid max_prefill_token_num'
assert self.num_gpu_blocks >= 0, 'invalid num_gpu_blocks'
assert self.device_type in [
'cuda', 'ascend'
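Both engine configs now expose a dtype field that __post_init__ validates. A short usage sketch based only on the fields visible in this diff:

from lmdeploy.messages import PytorchEngineConfig, TurbomindEngineConfig

tb_cfg = TurbomindEngineConfig(dtype='bfloat16', tp=2)
pt_cfg = PytorchEngineConfig(dtype='float16')
print(tb_cfg.dtype, pt_cfg.dtype)  # -> bfloat16 float16

# __post_init__ rejects anything outside ['auto', 'float16', 'bfloat16'],
# so a typo fails fast instead of propagating into the engine:
try:
    PytorchEngineConfig(dtype='fp16')
except AssertionError:
    print('invalid dtype rejected')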