Support user-specified data type #2473

Merged · 9 commits · Sep 18, 2024 (changes from all commits)
13 changes: 13 additions & 0 deletions lmdeploy/cli/cli.py
@@ -66,6 +66,16 @@ def add_parser_convert():
default=None,
help='the name of the built-in chat template, which can be '
'overviewed by `lmdeploy list`')
parser.add_argument(
'--dtype',
type=str,
default='auto',
choices=['auto', 'float16', 'bfloat16'],
help='data type for model weights and activations. '
'The "auto" option will use FP16 precision '
'for FP32 and FP16 models, and BF16 precision '
'for BF16 models. This option will be ignored if '
'the model is a quantized model')
parser.set_defaults(run=CLI.convert)

@staticmethod
@@ -113,6 +123,7 @@ def add_parser_chat():
ArgumentHelper.adapters(pt_group)
ArgumentHelper.device(pt_group)
# common engine args
dtype_act = ArgumentHelper.dtype(pt_group)
tp_act = ArgumentHelper.tp(pt_group)
session_len_act = ArgumentHelper.session_len(pt_group)
cache_max_entry_act = ArgumentHelper.cache_max_entry_count(pt_group)
@@ -121,6 +132,7 @@ def add_parser_chat():
# turbomind args
tb_group = parser.add_argument_group('TurboMind engine arguments')
# common engine args
tb_group._group_actions.append(dtype_act)
tb_group._group_actions.append(tp_act)
tb_group._group_actions.append(session_len_act)
tb_group._group_actions.append(cache_max_entry_act)
@@ -245,6 +257,7 @@ def chat(args):

adapters = get_lora_adapters(args.adapters)
engine_config = PytorchEngineConfig(
dtype=args.dtype,
tp=args.tp,
session_len=args.session_len,
cache_max_entry_count=args.cache_max_entry_count,
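The chat parser above registers --dtype once (via ArgumentHelper.dtype in the PyTorch group) and then appends the returned action to the TurboMind group. A minimal standalone sketch of that argparse pattern, independent of lmdeploy (all names and values below are illustrative):

import argparse

parser = argparse.ArgumentParser('demo')
pt_group = parser.add_argument_group('PyTorch engine arguments')
tb_group = parser.add_argument_group('TurboMind engine arguments')

# Register --dtype once, under the PyTorch group, and keep the action object.
dtype_act = pt_group.add_argument(
    '--dtype',
    type=str,
    default='auto',
    choices=['auto', 'float16', 'bfloat16'],
    help='data type for model weights and activations')

# List the same action under the TurboMind group so it shows up in both help
# sections; calling add_argument('--dtype', ...) a second time would raise a
# "conflicting option string" error.
tb_group._group_actions.append(dtype_act)

args = parser.parse_args(['--dtype', 'bfloat16'])
print(args.dtype)  # -> bfloat16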
8 changes: 8 additions & 0 deletions lmdeploy/cli/serve.py
@@ -59,6 +59,7 @@ def add_parser_gradio():
pt_group = parser.add_argument_group('PyTorch engine arguments')

# common engine args
dtype_act = ArgumentHelper.dtype(pt_group)
tp_act = ArgumentHelper.tp(pt_group)
ArgumentHelper.device(pt_group)
session_len_act = ArgumentHelper.session_len(pt_group)
@@ -71,6 +72,7 @@ def add_parser_gradio():
# turbomind args
tb_group = parser.add_argument_group('TurboMind engine arguments')
# common engine args
tb_group._group_actions.append(dtype_act)
tb_group._group_actions.append(tp_act)
tb_group._group_actions.append(session_len_act)
tb_group._group_actions.append(max_batch_size_act)
@@ -150,6 +152,7 @@ def add_parser_api_server():
ArgumentHelper.adapters(pt_group)
ArgumentHelper.device(pt_group)
# common engine args
dtype_act = ArgumentHelper.dtype(pt_group)
tp_act = ArgumentHelper.tp(pt_group)
session_len_act = ArgumentHelper.session_len(pt_group)
max_batch_size_act = ArgumentHelper.max_batch_size(pt_group)
@@ -161,6 +164,7 @@ def add_parser_api_server():
# turbomind args
tb_group = parser.add_argument_group('TurboMind engine arguments')
# common engine args
tb_group._group_actions.append(dtype_act)
tb_group._group_actions.append(tp_act)
tb_group._group_actions.append(session_len_act)
tb_group._group_actions.append(max_batch_size_act)
@@ -213,6 +217,7 @@ def gradio(args):
backend = autoget_backend(args.model_path_or_server)
if backend == 'pytorch':
backend_config = PytorchEngineConfig(
dtype=args.dtype,
tp=args.tp,
max_batch_size=max_batch_size,
cache_max_entry_count=args.cache_max_entry_count,
@@ -223,6 +228,7 @@ def gradio(args):
max_prefill_token_num=args.max_prefill_token_num)
else:
backend_config = TurbomindEngineConfig(
dtype=args.dtype,
tp=args.tp,
max_batch_size=max_batch_size,
session_len=args.session_len,
@@ -258,6 +264,7 @@ def api_server(args):
from lmdeploy.messages import PytorchEngineConfig
adapters = get_lora_adapters(args.adapters)
backend_config = PytorchEngineConfig(
dtype=args.dtype,
tp=args.tp,
max_batch_size=max_batch_size,
cache_max_entry_count=args.cache_max_entry_count,
@@ -270,6 +277,7 @@ def api_server(args):
else:
from lmdeploy.messages import TurbomindEngineConfig
backend_config = TurbomindEngineConfig(
dtype=args.dtype,
tp=args.tp,
max_batch_size=max_batch_size,
session_len=args.session_len,
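In both gradio() and api_server(), the parsed --dtype value is forwarded unchanged into whichever engine config the resolved backend needs. A hedged sketch of that dispatch, using only the config classes from lmdeploy.messages; the helper function and its arguments are invented for illustration:

from lmdeploy.messages import PytorchEngineConfig, TurbomindEngineConfig

def build_backend_config(backend: str, dtype: str, tp: int):
    # 'backend' stands in for the autoget_backend() result used in serve.py.
    if backend == 'pytorch':
        return PytorchEngineConfig(dtype=dtype, tp=tp)
    return TurbomindEngineConfig(dtype=dtype, tp=tp)

cfg = build_backend_config('pytorch', dtype='bfloat16', tp=1)
print(type(cfg).__name__, cfg.dtype)  # -> PytorchEngineConfig bfloat16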
13 changes: 13 additions & 0 deletions lmdeploy/cli/utils.py
@@ -100,6 +100,19 @@ def model_name(parser):
'by the RESTful API `/v1/models`. If it is not specified, '
'`model_path` will be adopted')

@staticmethod
def dtype(parser, default: str = 'auto'):
return parser.add_argument(
'--dtype',
type=str,
default=default,
choices=['auto', 'float16', 'bfloat16'],
help='data type for model weights and activations. '
'The "auto" option will use FP16 precision '
'for FP32 and FP16 models, and BF16 precision '
'for BF16 models. This option will be ignored if '
'the model is a quantized model')

@staticmethod
def model_format(parser, default: str = None):
return parser.add_argument(
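The help text fixes what `auto` means: FP16 for FP32/FP16 checkpoints, BF16 for BF16 checkpoints, and no effect on quantized models. The sketch below is not the PR's implementation, only an illustration of that documented rule under the assumption that the checkpoint's torch dtype is already known:

import torch

def resolve_dtype(requested: str, model_torch_dtype: torch.dtype) -> torch.dtype:
    """Illustrative resolution of the documented --dtype semantics."""
    if requested == 'float16':
        return torch.float16
    if requested == 'bfloat16':
        return torch.bfloat16
    # 'auto': BF16 models stay BF16, FP32/FP16 models run in FP16.
    if model_torch_dtype == torch.bfloat16:
        return torch.bfloat16
    return torch.float16

print(resolve_dtype('auto', torch.float32))   # -> torch.float16
print(resolve_dtype('auto', torch.bfloat16))  # -> torch.bfloat16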
122 changes: 92 additions & 30 deletions lmdeploy/messages.py
@@ -54,10 +54,29 @@ class GenerationConfig:
in the decoding. Default to be True.
logprobs (int): Number of log probabilities to return per output token.
response_format (Dict): Only pytorch backend support formatting
response. Examples: `{"type": "json_schema", "json_schema": {"name":"test","schema": {"properties": {"name": {"type": "string"}}, "required": ["name"], "type": "object"}}}`
or `{"type": "regex_schema", "regex_schema": "call me [A-Za-z]{1,10}"}`
response. Examples:
{
"type": "json_schema",
"json_schema": {
"name": "test",
"schema": {
"properties": {
"name": {
"type": "string"
}
},
"required": ["name"],
"type": "object"
}
}
}
or,
{
"type": "regex_schema",
"regex_schema": "call me [A-Za-z]{1,10}"
}
logits_processors (List[Callable]): Custom logit processors.
""" # noqa
"""

n: int = 1
max_new_tokens: int = 512
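A hedged construction example for the response_format payloads documented above (field names are taken from the docstring; only the pytorch backend honours this setting):

from lmdeploy.messages import GenerationConfig

gen_cfg = GenerationConfig(
    max_new_tokens=128,
    response_format={
        'type': 'json_schema',
        'json_schema': {
            'name': 'test',
            'schema': {
                'properties': {'name': {'type': 'string'}},
                'required': ['name'],
                'type': 'object',
            },
        },
    },
)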
@@ -117,30 +136,58 @@ class TurbomindEngineConfig:
"""TurboMind Engine config.

Args:
model_format (str): the layout of the deployed model. It can be one of the following values [hf, meta_llama, awq, gptq],
`hf` meaning huggingface model(.bin, .safetensors), `meta_llama` being meta llama's format(.pth),
`awq` and `gptq` meaning the quantized model by AWQ and GPTQ, respectively.
If it is not specified, i.e. None, it will be extracted from the input model
tp (int): the number of GPU cards used in tensor parallelism, default to 1
session_len (int): the max session length of a sequence, default to None
max_batch_size (int): the max batch size during inference. If it is not specified,
the engine will automatically set it according to the device
cache_max_entry_count (float): the percentage of gpu memory occupied by the k/v cache.
For versions of lmdeploy between `v0.2.0` and `v0.2.1`, it defaults to 0.5, depicting the percentage of TOTAL GPU memory to be allocated to the k/v cache.
For lmdeploy versions greater than `v0.2.1`, it defaults to 0.8, signifying the percentage of FREE GPU memory to be reserved for the k/v cache
cache_chunk_size (int): The policy to apply for KV block from the block manager, default to -1.
cache_block_seq_len (int): the length of the token sequence in a k/v block, default to 64
enable_prefix_caching (bool): enable cache prompts for block reuse, default to False
quant_policy (int): default to 0. When k/v is quantized into 8 bit, set it to 4
rope_scaling_factor (float): scaling factor used for dynamic ntk, default to 0. TurboMind follows the implementation of transformer LlamaAttention
dtype (str): data type for model weights and activations. It can be
one of the following values, ['auto', 'float16', 'bfloat16']
The `auto` option will use FP16 precision for FP32 and FP16
models, and BF16 precision for BF16 models.
model_format (str): the layout of the deployed model. It can be one
of the following values [hf, meta_llama, awq, gptq],`hf` meaning
huggingface model(.bin, .safetensors), `meta_llama` being
meta llama's format(.pth), `awq` and `gptq` meaning the quantized
model by AWQ and GPTQ, respectively. If it is not specified,
i.e. None, it will be extracted from the input model
tp (int): the number of GPU cards used in tensor parallelism,
default to 1
session_len (int): the max session length of a sequence, default to
None
max_batch_size (int): the max batch size during inference. If it is
not specified, the engine will automatically set it according to
the device
cache_max_entry_count (float): the percentage of gpu memory occupied
by the k/v cache.
For versions of lmdeploy between `v0.2.0` and `v0.2.1`, it
defaults to 0.5, depicting the percentage of TOTAL GPU memory to
be allocated to the k/v cache.
For lmdeploy versions greater than `v0.2.1`, it defaults to 0.8,
signifying the percentage of FREE GPU memory to be reserved for
the k/v cache
cache_chunk_size (int): The policy to apply for KV block from
the block manager, default to -1.
cache_block_seq_len (int): the length of the token sequence in
a k/v block, default to 64
enable_prefix_caching (bool): enable cache prompts for block reuse,
default to False
quant_policy (int): default to 0. When k/v is quantized into 4 or 8
bit, set it to 4 or 8, respectively
rope_scaling_factor (float): scaling factor used for dynamic ntk,
default to 0. TurboMind follows the implementation of transformer
LlamaAttention
use_logn_attn (bool): whether or not to use log attn: default to False
download_dir (str): Directory to download and load the weights, default to the default cache directory of huggingface.
revision (str): The specific model version to use. It can be a branch name, a tag name, or a commit id. If unspecified, will use the default version.
max_prefill_token_num(int): the number of tokens each iteration during prefill, default to 8192
num_tokens_per_iter(int): the number of tokens processed in each forward pass. Working with `max_prefill_iters` enables "Dynamic SplitFuse"-like scheduling
max_prefill_iters(int): the max number of forward pass during prefill stage
""" # noqa: E501
download_dir (str): Directory to download and load the weights,
default to the default cache directory of huggingface.
revision (str): The specific model version to use. It can be a branch
name, a tag name, or a commit id. If unspecified, will use the
default version.
max_prefill_token_num(int): the number of tokens each iteration during
prefill, default to 8192
num_tokens_per_iter(int): the number of tokens processed in each
forward pass. Working with `max_prefill_iters` enables the
"Dynamic SplitFuse"-like scheduling
max_prefill_iters(int): the max number of forward pass during prefill
stage
"""

dtype: str = 'auto'
model_format: Optional[str] = None
tp: int = 1
session_len: Optional[int] = None
@@ -160,11 +207,14 @@ class TurbomindEngineConfig:

def __post_init__(self):
"""Check input validation."""
assert self.dtype in ['auto', 'float16', 'bfloat16']
assert self.tp >= 1, 'tp must be a positive integer'
assert self.cache_max_entry_count > 0 and self.cache_max_entry_count < 1, 'invalid cache_max_entry_count' # noqa
assert 0 < self.cache_max_entry_count < 1, \
'invalid cache_max_entry_count'
assert self.quant_policy in (0, 4, 8), 'invalid quant_policy'
assert self.rope_scaling_factor >= 0, 'invalid rope_scaling_factor'
assert self.max_prefill_token_num >= 0, 'invalid max_prefill_token_num'
assert self.max_prefill_token_num >= 0, \
'invalid max_prefill_token_num'
assert self.num_tokens_per_iter >= 0, 'invalid num_tokens_per_iter'


@@ -173,6 +223,10 @@ class PytorchEngineConfig:
"""PyTorch Engine Config.

Args:
dtype (str): data type for model weights and activations. It can be
one of the following values, ['auto', 'float16', 'bfloat16']
The `auto` option will use FP16 precision for FP32 and FP16
models, and BF16 precision for BF16 models.
tp (int): Tensor Parallelism. default 1.
session_len (int): Max session length. Default None.
max_batch_size (int): Max batch size. If it is not specified,
@@ -193,12 +247,17 @@ class PytorchEngineConfig:
thread_safe (bool): thread safe engine instance.
enable_prefix_caching (bool): Enable token match and sharing caches.
device_type (str): The inference device type, options ['cuda']
eager_mode (bool): Enable "eager" mode or not
custom_module_map (Dict): nn module map customized by users. Once
provided, the original nn modules of the model will be
substituted by the mapping ones
download_dir (str): Directory to download and load the weights,
default to the default cache directory of huggingface.
revision (str): The specific model version to use.
It can be a branch name, a tag name, or a commit id.
If unspecified, will use the default version.
"""
dtype: str = 'auto'
tp: int = 1
session_len: int = None
max_batch_size: int = None
@@ -213,16 +272,19 @@ class PytorchEngineConfig:
enable_prefix_caching: bool = False
device_type: str = 'cuda'
eager_mode: bool = False
custom_module_map: str = None
custom_module_map: Dict[str, str] = None
download_dir: str = None
revision: str = None

def __post_init__(self):
"""Check input validation."""
assert self.dtype in ['auto', 'float16', 'bfloat16']
assert self.tp >= 1, 'invalid tp'
assert self.cache_max_entry_count > 0 and self.cache_max_entry_count < 1, 'invalid cache_max_entry_count' # noqa
assert 0 < self.cache_max_entry_count < 1, \
'invalid cache_max_entry_count'
assert self.num_cpu_blocks >= 0, 'invalid num_cpu_blocks'
assert self.max_prefill_token_num >= 0, 'invalid max_prefill_token_num'
assert self.max_prefill_token_num >= 0, \
'invalid max_prefill_token_num'
assert self.num_gpu_blocks >= 0, 'invalid num_gpu_blocks'
assert self.device_type in [
'cuda', 'ascend'
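Both engine configs now expose a dtype field that __post_init__ validates. A short usage sketch based only on the fields visible in this diff:

from lmdeploy.messages import PytorchEngineConfig, TurbomindEngineConfig

tb_cfg = TurbomindEngineConfig(dtype='bfloat16', tp=2)
pt_cfg = PytorchEngineConfig(dtype='float16')
print(tb_cfg.dtype, pt_cfg.dtype)  # -> bfloat16 float16

# __post_init__ rejects anything outside ['auto', 'float16', 'bfloat16'],
# so a typo fails fast instead of propagating into the engine:
try:
    PytorchEngineConfig(dtype='fp16')
except AssertionError:
    print('invalid dtype rejected')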