InternLM · lvhan028 · Oct 14, 2024 · Sep 10, 2024 · Sep 10, 2024 · Sep 11, 2024
diff --git a/benchmark/profile_throughput.py b/benchmark/profile_throughput.py
@@ -286,6 +286,7 @@ def parse_args():
     cache_count_act = ArgumentHelper.cache_max_entry_count(pt_group)
     cache_block_seq_len_act = ArgumentHelper.cache_block_seq_len(pt_group)
     prefix_caching_act = ArgumentHelper.enable_prefix_caching(pt_group)
+    quant_policy_act = ArgumentHelper.quant_policy(pt_group, default=0)
 
     # turbomind engine args
     tb_group = parser.add_argument_group('TurboMind engine argument')
@@ -294,8 +295,8 @@ def parse_args():
     tb_group._group_actions.append(cache_count_act)
     tb_group._group_actions.append(cache_block_seq_len_act)
     tb_group._group_actions.append(prefix_caching_act)
+    tb_group._group_actions.append(quant_policy_act)
     ArgumentHelper.model_format(tb_group, default='hf')
-    ArgumentHelper.quant_policy(tb_group, default=0)
     ArgumentHelper.num_tokens_per_iter(tb_group)
     ArgumentHelper.max_prefill_iters(tb_group)
 
@@ -328,6 +329,7 @@ def main():
             tp=args.tp,
             thread_safe=True,
             enable_prefix_caching=args.enable_prefix_caching,
+            quant_policy=args.quant_policy,
         )
 
     engine = Engine(args.model_path, engine_config, csv=args.csv)

diff --git a/docs/en/supported_models/supported_models.md b/docs/en/supported_models/supported_models.md
@@ -43,47 +43,47 @@ The TurboMind engine doesn't support window attention. Therefore, for models tha
 
 ## PyTorchEngine on CUDA Platform
 
-|     Model      |    Size     | Type | FP16/BF16 | KV INT8 | W8A8 | W4A16 |
-| :------------: | :---------: | :--: | :-------: | :-----: | :--: | :---: |
-|     Llama      |  7B - 65B   | LLM  |    Yes    |   No    | Yes  |  Yes  |
-|     Llama2     |  7B - 70B   | LLM  |    Yes    |   No    | Yes  |  Yes  |
-|     Llama3     |   8B, 70B   | LLM  |    Yes    |   No    | Yes  |  Yes  |
-|    Llama3.1    |   8B, 70B   | LLM  |    Yes    |   No    |  No  |   -   |
-|    InternLM    |  7B - 20B   | LLM  |    Yes    |   No    | Yes  |   -   |
-|   InternLM2    |  7B - 20B   | LLM  |    Yes    |   No    | Yes  |  Yes  |
-|  InternLM2.5   |     7B      | LLM  |    Yes    |   No    | Yes  |  Yes  |
-|   Baichuan2    |     7B      | LLM  |    Yes    |   No    | Yes  |  No   |
-|   Baichuan2    |     13B     | LLM  |    Yes    |   No    |  No  |  No   |
-|    ChatGLM2    |     6B      | LLM  |    Yes    |   No    |  No  |  No   |
-|     Falcon     |  7B - 180B  | LLM  |    Yes    |   No    |  No  |  No   |
-|       YI       |  6B - 34B   | LLM  |    Yes    |   No    |  No  |  Yes  |
-|    Mistral     |     7B      | LLM  |    Yes    |   No    |  No  |  No   |
-|    Mixtral     |    8x7B     | LLM  |    Yes    |   No    |  No  |  No   |
-|      QWen      | 1.8B - 72B  | LLM  |    Yes    |   No    |  No  |  Yes  |
-|    QWen1.5     | 0.5B - 110B | LLM  |    Yes    |   No    |  No  |  Yes  |
-|  QWen1.5-MoE   |    A2.7B    | LLM  |    Yes    |   No    |  No  |  No   |
-|     QWen2      | 0.5B - 72B  | LLM  |    Yes    |   No    |  No  |  Yes  |
-|    QWen2-VL    |   2B, 7B    | MLLM |    Yes    |   No    |  No  |  No   |
-|  DeepSeek-MoE  |     16B     | LLM  |    Yes    |   No    |  No  |  No   |
-|  DeepSeek-V2   |  16B, 236B  | LLM  |    Yes    |   No    |  No  |  No   |
-|    MiniCPM3    |     4B      | LLM  |    Yes    |   No    |  No  |  No   |
-|     Gemma      |    2B-7B    | LLM  |    Yes    |   No    |  No  |  No   |
-|      Dbrx      |    132B     | LLM  |    Yes    |   No    |  No  |  No   |
-|   StarCoder2   |   3B-15B    | LLM  |    Yes    |   No    |  No  |  No   |
-|   Phi-3-mini   |    3.8B     | LLM  |    Yes    |   No    |  No  |  Yes  |
-|  Phi-3-vision  |    4.2B     | MLLM |    Yes    |   No    |  No  |   -   |
-|  CogVLM-Chat   |     17B     | MLLM |    Yes    |   No    |  No  |   -   |
-|  CogVLM2-Chat  |     19B     | MLLM |    Yes    |   No    |  No  |   -   |
-| LLaVA(1.5,1.6) |   7B-34B    | MLLM |    Yes    |   No    |  No  |   -   |
-| InternVL(v1.5) |   2B-26B    | MLLM |    Yes    |   No    |  No  |  Yes  |
-|   InternVL2    |   1B-40B    | MLLM |    Yes    |   No    |  No  |   -   |
-|     Gemma2     |   9B-27B    | LLM  |    Yes    |   No    |  No  |   -   |
-|      GLM4      |     9B      | LLM  |    Yes    |   No    |  No  |  No   |
-|     GLM-4V     |     9B      | MLLM |    Yes    |   No    |  No  |  No   |
-|   CodeGeeX4    |     9B      | LLM  |    Yes    |   No    |  No  |   -   |
-|  Phi-3.5-mini  |    3.8B     | LLM  |    Yes    |   No    |  No  |   -   |
-|  Phi-3.5-MoE   |   16x3.8B   | LLM  |    Yes    |   No    |  No  |   -   |
-| Phi-3.5-vision |    4.2B     | MLLM |    Yes    |   No    |  No  |   -   |
+|     Model      |    Size     | Type | FP16/BF16 | KV INT8 | KV INT4 | W8A8 | W4A16 |
+| :------------: | :---------: | :--: | :-------: | :-----: | :-----: | :--: | :---: |
+|     Llama      |  7B - 65B   | LLM  |    Yes    |   Yes   |   Yes   | Yes  |  Yes  |
+|     Llama2     |  7B - 70B   | LLM  |    Yes    |   Yes   |   Yes   | Yes  |  Yes  |
+|     Llama3     |   8B, 70B   | LLM  |    Yes    |   Yes   |   Yes   | Yes  |  Yes  |
+|    Llama3.1    |   8B, 70B   | LLM  |    Yes    |   Yes   |   Yes   |  No  |   -   |
+|    InternLM    |  7B - 20B   | LLM  |    Yes    |   Yes   |   Yes   | Yes  |   -   |
+|   InternLM2    |  7B - 20B   | LLM  |    Yes    |   Yes   |   Yes   | Yes  |  Yes  |
+|  InternLM2.5   |     7B      | LLM  |    Yes    |   Yes   |   Yes   | Yes  |  Yes  |
+|   Baichuan2    |     7B      | LLM  |    Yes    |   Yes   |   Yes   | Yes  |  No   |
+|   Baichuan2    |     13B     | LLM  |    Yes    |   Yes   |   Yes   |  No  |  No   |
+|    ChatGLM2    |     6B      | LLM  |    Yes    |   Yes   |   Yes   |  No  |  No   |
+|     Falcon     |  7B - 180B  | LLM  |    Yes    |   Yes   |   Yes   |  No  |  No   |
+|       YI       |  6B - 34B   | LLM  |    Yes    |   Yes   |   Yes   |  No  |  Yes  |
+|    Mistral     |     7B      | LLM  |    Yes    |   Yes   |   Yes   |  No  |  No   |
+|    Mixtral     |    8x7B     | LLM  |    Yes    |   Yes   |   Yes   |  No  |  No   |
+|      QWen      | 1.8B - 72B  | LLM  |    Yes    |   Yes   |   Yes   |  No  |  Yes  |
+|    QWen1.5     | 0.5B - 110B | LLM  |    Yes    |   Yes   |   Yes   |  No  |  Yes  |
+|  QWen1.5-MoE   |    A2.7B    | LLM  |    Yes    |   Yes   |   Yes   |  No  |  No   |
+|     QWen2      | 0.5B - 72B  | LLM  |    Yes    |   Yes   |   No    |  No  |  Yes  |
+|    QWen2-VL    |   2B, 7B    | MLLM |    Yes    |   Yes   |   No    |  No  |  No   |
+|  DeepSeek-MoE  |     16B     | LLM  |    Yes    |   No    |   No    |  No  |  No   |
+|  DeepSeek-V2   |  16B, 236B  | LLM  |    Yes    |   No    |   No    |  No  |  No   |
+|    MiniCPM3    |     4B      | LLM  |    Yes    |   Yes   |   Yes   |  No  |  No   |
+|     Gemma      |    2B-7B    | LLM  |    Yes    |   Yes   |   Yes   |  No  |  No   |
+|      Dbrx      |    132B     | LLM  |    Yes    |   Yes   |   Yes   |  No  |  No   |
+|   StarCoder2   |   3B-15B    | LLM  |    Yes    |   Yes   |   Yes   |  No  |  No   |
+|   Phi-3-mini   |    3.8B     | LLM  |    Yes    |   Yes   |   Yes   |  No  |  Yes  |
+|  Phi-3-vision  |    4.2B     | MLLM |    Yes    |   Yes   |   Yes   |  No  |   -   |
+|  CogVLM-Chat   |     17B     | MLLM |    Yes    |   Yes   |   Yes   |  No  |   -   |
+|  CogVLM2-Chat  |     19B     | MLLM |    Yes    |   Yes   |   Yes   |  No  |   -   |
+| LLaVA(1.5,1.6) |   7B-34B    | MLLM |    Yes    |   Yes   |   Yes   |  No  |   -   |
+| InternVL(v1.5) |   2B-26B    | MLLM |    Yes    |   Yes   |   Yes   |  No  |  Yes  |
+|   InternVL2    |   1B-40B    | MLLM |    Yes    |   Yes   |   Yes   |  No  |   -   |
+|     Gemma2     |   9B-27B    | LLM  |    Yes    |   Yes   |   Yes   |  No  |   -   |
+|      GLM4      |     9B      | LLM  |    Yes    |   Yes   |   Yes   |  No  |  No   |
+|     GLM-4V     |     9B      | MLLM |    Yes    |   Yes   |   Yes   |  No  |  No   |
+|   CodeGeeX4    |     9B      | LLM  |    Yes    |   Yes   |   Yes   |  No  |   -   |
+|  Phi-3.5-mini  |    3.8B     | LLM  |    Yes    |   Yes   |   No    |  No  |   -   |
+|  Phi-3.5-MoE   |   16x3.8B   | LLM  |    Yes    |   Yes   |   No    |  No  |   -   |
+| Phi-3.5-vision |    4.2B     | MLLM |    Yes    |   Yes   |   No    |  No  |   -   |
 
 ## PyTorchEngine on Huawei Ascend Platform
 

diff --git a/docs/zh_cn/supported_models/supported_models.md b/docs/zh_cn/supported_models/supported_models.md
@@ -43,47 +43,47 @@ turbomind 引擎不支持 window attention。所以，对于应用了 window att
 
 ## PyTorchEngine CUDA 平台
 
-|     Model      |    Size     | Type | FP16/BF16 | KV INT8 | W8A8 | W4A16 |
-| :------------: | :---------: | :--: | :-------: | :-----: | :--: | :---: |
-|     Llama      |  7B - 65B   | LLM  |    Yes    |   No    | Yes  |  Yes  |
-|     Llama2     |  7B - 70B   | LLM  |    Yes    |   No    | Yes  |  Yes  |
-|     Llama3     |   8B, 70B   | LLM  |    Yes    |   No    | Yes  |  Yes  |
-|    Llama3.1    |   8B, 70B   | LLM  |    Yes    |   No    |  No  |   -   |
-|    InternLM    |  7B - 20B   | LLM  |    Yes    |   No    | Yes  |   -   |
-|   InternLM2    |  7B - 20B   | LLM  |    Yes    |   No    | Yes  |  Yes  |
-|  InternLM2.5   |     7B      | LLM  |    Yes    |   No    | Yes  |  Yes  |
-|   Baichuan2    |     7B      | LLM  |    Yes    |   No    | Yes  |  No   |
-|   Baichuan2    |     13B     | LLM  |    Yes    |   No    |  No  |  No   |
-|    ChatGLM2    |     6B      | LLM  |    Yes    |   No    |  No  |  No   |
-|     Falcon     |  7B - 180B  | LLM  |    Yes    |   No    |  No  |  No   |
-|       YI       |  6B - 34B   | LLM  |    Yes    |   No    |  No  |  Yes  |
-|    Mistral     |     7B      | LLM  |    Yes    |   No    |  No  |  No   |
-|    Mixtral     |    8x7B     | LLM  |    Yes    |   No    |  No  |  No   |
-|      QWen      | 1.8B - 72B  | LLM  |    Yes    |   No    |  No  |  Yes  |
-|    QWen1.5     | 0.5B - 110B | LLM  |    Yes    |   No    |  No  |  Yes  |
-|  QWen1.5-MoE   |    A2.7B    | LLM  |    Yes    |   No    |  No  |  No   |
-|     QWen2      | 0.5B - 72B  | LLM  |    Yes    |   No    |  No  |  Yes  |
-|    QWen2-VL    |   2B, 7B    | MLLM |    Yes    |   No    |  No  |  No   |
-|  DeepSeek-MoE  |     16B     | LLM  |    Yes    |   No    |  No  |  No   |
-|  DeepSeek-V2   |  16B, 236B  | LLM  |    Yes    |   No    |  No  |  No   |
-|    MiniCPM3    |     4B      | LLM  |    Yes    |   No    |  No  |  No   |
-|     Gemma      |    2B-7B    | LLM  |    Yes    |   No    |  No  |  No   |
-|      Dbrx      |    132B     | LLM  |    Yes    |   No    |  No  |  No   |
-|   StarCoder2   |   3B-15B    | LLM  |    Yes    |   No    |  No  |  No   |
-|   Phi-3-mini   |    3.8B     | LLM  |    Yes    |   No    |  No  |  Yes  |
-|  Phi-3-vision  |    4.2B     | MLLM |    Yes    |   No    |  No  |   -   |
-|  CogVLM-Chat   |     17B     | MLLM |    Yes    |   No    |  No  |   -   |
-|  CogVLM2-Chat  |     19B     | MLLM |    Yes    |   No    |  No  |   -   |
-| LLaVA(1.5,1.6) |   7B-34B    | MLLM |    Yes    |   No    |  No  |   -   |
-| InternVL(v1.5) |   2B-26B    | MLLM |    Yes    |   No    |  No  |  Yes  |
-|   InternVL2    |   1B-40B    | MLLM |    Yes    |   No    |  No  |   -   |
-|     Gemma2     |   9B-27B    | LLM  |    Yes    |   No    |  No  |   -   |
-|      GLM4      |     9B      | LLM  |    Yes    |   No    |  No  |  No   |
-|     GLM-4V     |     9B      | MLLM |    Yes    |   No    |  No  |  No   |
-|   CodeGeeX4    |     9B      | LLM  |    Yes    |   No    |  No  |   -   |
-|  Phi-3.5-mini  |    3.8B     | LLM  |    Yes    |   No    |  No  |   -   |
-|  Phi-3.5-MoE   |   16x3.8B   | LLM  |    Yes    |   No    |  No  |   -   |
-| Phi-3.5-vision |    4.2B     | MLLM |    Yes    |   No    |  No  |   -   |
+|     Model      |    Size     | Type | FP16/BF16 | KV INT8 | KV INT4 | W8A8 | W4A16 |
+| :------------: | :---------: | :--: | :-------: | :-----: | :-----: | :--: | :---: |
+|     Llama      |  7B - 65B   | LLM  |    Yes    |   Yes   |   Yes   | Yes  |  Yes  |
+|     Llama2     |  7B - 70B   | LLM  |    Yes    |   Yes   |   Yes   | Yes  |  Yes  |
+|     Llama3     |   8B, 70B   | LLM  |    Yes    |   Yes   |   Yes   | Yes  |  Yes  |
+|    Llama3.1    |   8B, 70B   | LLM  |    Yes    |   Yes   |   Yes   |  No  |   -   |
+|    InternLM    |  7B - 20B   | LLM  |    Yes    |   Yes   |   Yes   | Yes  |   -   |
+|   InternLM2    |  7B - 20B   | LLM  |    Yes    |   Yes   |   Yes   | Yes  |  Yes  |
+|  InternLM2.5   |     7B      | LLM  |    Yes    |   Yes   |   Yes   | Yes  |  Yes  |
+|   Baichuan2    |     7B      | LLM  |    Yes    |   Yes   |   Yes   | Yes  |  No   |
+|   Baichuan2    |     13B     | LLM  |    Yes    |   Yes   |   Yes   |  No  |  No   |
+|    ChatGLM2    |     6B      | LLM  |    Yes    |   Yes   |   Yes   |  No  |  No   |
+|     Falcon     |  7B - 180B  | LLM  |    Yes    |   Yes   |   Yes   |  No  |  No   |
+|       YI       |  6B - 34B   | LLM  |    Yes    |   Yes   |   Yes   |  No  |  Yes  |
+|    Mistral     |     7B      | LLM  |    Yes    |   Yes   |   Yes   |  No  |  No   |
+|    Mixtral     |    8x7B     | LLM  |    Yes    |   Yes   |   Yes   |  No  |  No   |
+|      QWen      | 1.8B - 72B  | LLM  |    Yes    |   Yes   |   Yes   |  No  |  Yes  |
+|    QWen1.5     | 0.5B - 110B | LLM  |    Yes    |   Yes   |   Yes   |  No  |  Yes  |
+|  QWen1.5-MoE   |    A2.7B    | LLM  |    Yes    |   Yes   |   Yes   |  No  |  No   |
+|     QWen2      | 0.5B - 72B  | LLM  |    Yes    |   Yes   |   No    |  No  |  Yes  |
+|    QWen2-VL    |   2B, 7B    | MLLM |    Yes    |   Yes   |   No    |  No  |  No   |
+|  DeepSeek-MoE  |     16B     | LLM  |    Yes    |   No    |   No    |  No  |  No   |
+|  DeepSeek-V2   |  16B, 236B  | LLM  |    Yes    |   No    |   No    |  No  |  No   |
+|    MiniCPM3    |     4B      | LLM  |    Yes    |   Yes   |   Yes   |  No  |  No   |
+|     Gemma      |    2B-7B    | LLM  |    Yes    |   Yes   |   Yes   |  No  |  No   |
+|      Dbrx      |    132B     | LLM  |    Yes    |   Yes   |   Yes   |  No  |  No   |
+|   StarCoder2   |   3B-15B    | LLM  |    Yes    |   Yes   |   Yes   |  No  |  No   |
+|   Phi-3-mini   |    3.8B     | LLM  |    Yes    |   Yes   |   Yes   |  No  |  Yes  |
+|  Phi-3-vision  |    4.2B     | MLLM |    Yes    |   Yes   |   Yes   |  No  |   -   |
+|  CogVLM-Chat   |     17B     | MLLM |    Yes    |   Yes   |   Yes   |  No  |   -   |
+|  CogVLM2-Chat  |     19B     | MLLM |    Yes    |   Yes   |   Yes   |  No  |   -   |
+| LLaVA(1.5,1.6) |   7B-34B    | MLLM |    Yes    |   Yes   |   Yes   |  No  |   -   |
+| InternVL(v1.5) |   2B-26B    | MLLM |    Yes    |   Yes   |   Yes   |  No  |  Yes  |
+|   InternVL2    |   1B-40B    | MLLM |    Yes    |   Yes   |   Yes   |  No  |   -   |
+|     Gemma2     |   9B-27B    | LLM  |    Yes    |   Yes   |   Yes   |  No  |   -   |
+|      GLM4      |     9B      | LLM  |    Yes    |   Yes   |   Yes   |  No  |  No   |
+|     GLM-4V     |     9B      | MLLM |    Yes    |   Yes   |   Yes   |  No  |  No   |
+|   CodeGeeX4    |     9B      | LLM  |    Yes    |   Yes   |   Yes   |  No  |   -   |
+|  Phi-3.5-mini  |    3.8B     | LLM  |    Yes    |   Yes   |   No    |  No  |   -   |
+|  Phi-3.5-MoE   |   16x3.8B   | LLM  |    Yes    |   Yes   |   No    |  No  |   -   |
+| Phi-3.5-vision |    4.2B     | MLLM |    Yes    |   Yes   |   No    |  No  |   -   |
 
 ## PyTorchEngine 华为昇腾平台
 

diff --git a/lmdeploy/cli/cli.py b/lmdeploy/cli/cli.py
@@ -128,6 +128,7 @@ def add_parser_chat():
         session_len_act = ArgumentHelper.session_len(pt_group)
         cache_max_entry_act = ArgumentHelper.cache_max_entry_count(pt_group)
         prefix_caching_act = ArgumentHelper.enable_prefix_caching(pt_group)
+        quant_policy = ArgumentHelper.quant_policy(pt_group)
 
         # turbomind args
         tb_group = parser.add_argument_group('TurboMind engine arguments')
@@ -137,8 +138,8 @@ def add_parser_chat():
         tb_group._group_actions.append(session_len_act)
         tb_group._group_actions.append(cache_max_entry_act)
         tb_group._group_actions.append(prefix_caching_act)
+        tb_group._group_actions.append(quant_policy)
         ArgumentHelper.model_format(tb_group)
-        ArgumentHelper.quant_policy(tb_group)
         ArgumentHelper.rope_scaling_factor(tb_group)
 
     @staticmethod
@@ -263,7 +264,8 @@ def chat(args):
                 cache_max_entry_count=args.cache_max_entry_count,
                 adapters=adapters,
                 enable_prefix_caching=args.enable_prefix_caching,
-                device_type=args.device)
+                device_type=args.device,
+                quant_policy=args.quant_policy)
             run_chat(args.model_path,
                      engine_config,
                      chat_template_config=chat_template_config)

diff --git a/lmdeploy/cli/serve.py b/lmdeploy/cli/serve.py
@@ -168,6 +168,7 @@ def add_parser_api_server():
         prefix_caching_act = ArgumentHelper.enable_prefix_caching(pt_group)
         max_prefill_token_num_act = ArgumentHelper.max_prefill_token_num(
             pt_group)
+        quant_policy = ArgumentHelper.quant_policy(pt_group)
         # turbomind args
         tb_group = parser.add_argument_group('TurboMind engine arguments')
         # common engine args
@@ -179,8 +180,8 @@ def add_parser_api_server():
         tb_group._group_actions.append(cache_block_seq_len_act)
         tb_group._group_actions.append(prefix_caching_act)
         tb_group._group_actions.append(max_prefill_token_num_act)
+        tb_group._group_actions.append(quant_policy)
         ArgumentHelper.model_format(tb_group)
-        ArgumentHelper.quant_policy(tb_group)
         ArgumentHelper.rope_scaling_factor(tb_group)
         ArgumentHelper.num_tokens_per_iter(tb_group)
         ArgumentHelper.max_prefill_iters(tb_group)
@@ -258,6 +259,7 @@ def gradio(args):
                 session_len=args.session_len,
                 enable_prefix_caching=args.enable_prefix_caching,
                 device_type=args.device,
+                quant_policy=args.quant_policy,
                 max_prefill_token_num=args.max_prefill_token_num)
         else:
             backend_config = TurbomindEngineConfig(
@@ -307,6 +309,7 @@ def api_server(args):
                 adapters=adapters,
                 enable_prefix_caching=args.enable_prefix_caching,
                 device_type=args.device,
+                quant_policy=args.quant_policy,
                 max_prefill_token_num=args.max_prefill_token_num)
         else:
             from lmdeploy.messages import TurbomindEngineConfig

diff --git a/lmdeploy/messages.py b/lmdeploy/messages.py
@@ -256,6 +256,8 @@ class PytorchEngineConfig:
         revision (str): The specific model version to use.
             It can be a branch name, a tag name, or a commit id.
             If unspecified, will use the default version.
+        quant_policy (int): defaults to 0. Set it to 4 or 8 when the k/v
+            cache is quantized to 4 or 8 bits, respectively.
     """
     dtype: str = 'auto'
     tp: int = 1
@@ -275,6 +277,7 @@ class PytorchEngineConfig:
     custom_module_map: Dict[str, str] = None
     download_dir: str = None
     revision: str = None
+    quant_policy: Literal[0, 4, 8] = 0
 
     def __post_init__(self):
         """Check input validation."""
@@ -286,9 +289,12 @@ def __post_init__(self):
         assert self.max_prefill_token_num >= 0, \
             'invalid max_prefill_token_num'
         assert self.num_gpu_blocks >= 0, 'invalid num_gpu_blocks'
+        assert self.quant_policy in (0, 4, 8), 'invalid quant_policy'
         assert self.device_type in [
             'cuda', 'ascend'
         ], (f'invalid device_type: {self.device_type}')
+        if self.quant_policy > 0 and self.device_type != 'cuda':
+            assert False, 'kv cache quantization only works for CUDA.'
 
 
 class ResponseType(enum.Enum):

diff --git a/lmdeploy/pytorch/backends/ascend/attention.py b/lmdeploy/pytorch/backends/ascend/attention.py
@@ -57,6 +57,8 @@ def forward(
         k_cache: Tensor,
         v_cache: Tensor,
         attn_metadata: AscendAttentionMetadata,
+        k_scales_zeros: Tensor = None,
+        v_scales_zeros: Tensor = None,
         inplace: bool = True,
     ) -> Tensor:
         """forward."""

diff --git a/lmdeploy/pytorch/backends/attention.py b/lmdeploy/pytorch/backends/attention.py
@@ -1,7 +1,7 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 from abc import ABC, abstractmethod
 from dataclasses import dataclass
-from typing import Generic, TypeVar
+from typing import Generic, Literal, TypeVar
 
 import torch
 
@@ -14,6 +14,7 @@ class AttentionMetadata:
     q_start_loc: torch.Tensor = None
     q_seqlens: torch.Tensor = None
     kv_seqlens: torch.Tensor = None
+    quant_policy: Literal[0, 4, 8] = 0
 
 
 T = TypeVar('T', bound=AttentionMetadata)