[vLLM] add prefix caching support (#2239)
Qing Lan authored Jul 30, 2024
1 parent 734065b commit 6387eb9
Showing 7 changed files with 10 additions and 1 deletion.
@@ -61,6 +61,7 @@ class LmiDistRbProperties(Properties):
max_logprobs: Optional[int] = 20
enable_chunked_prefill: Optional[bool] = False
cpu_offload_gb_per_gpu: Optional[int] = 0
+enable_prefix_caching: Optional[bool] = False

@model_validator(mode='after')
def validate_mpi(self):
@@ -56,6 +56,7 @@ class VllmRbProperties(Properties):
max_logprobs: Optional[int] = 20
enable_chunked_prefill: Optional[bool] = False
cpu_offload_gb_per_gpu: Optional[int] = 0
+enable_prefix_caching: Optional[bool] = False

@field_validator('engine')
def validate_engine(cls, engine):
@@ -83,6 +83,7 @@ def __init__(self, model_id_or_path: str, properties: dict, **kwargs):
revision=self.lmi_dist_config.revision,
enable_chunked_prefill=self.lmi_dist_config.enable_chunked_prefill,
cpu_offload_gb=self.lmi_dist_config.cpu_offload_gb_per_gpu,
+enable_prefix_caching=self.lmi_dist_config.enable_prefix_caching,
**engine_kwargs)

kwargs = {}
@@ -232,7 +232,8 @@ def get_engine_args_from_config(config: VllmRbProperties) -> EngineArgs:
revision=config.revision,
max_logprobs=config.max_logprobs,
enable_chunked_prefill=config.enable_chunked_prefill,
-cpu_offload_gb=config.cpu_offload_gb_per_gpu)
+cpu_offload_gb=config.cpu_offload_gb_per_gpu,
+enable_prefix_caching=config.enable_prefix_caching)


def get_multi_modal_data(request: Request) -> dict:
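For context, option.enable_prefix_caching is forwarded verbatim into vLLM's engine arguments, as shown in the diff above. A minimal stand-alone Python sketch of what the engine ultimately receives, assuming a recent vLLM release; the model id is a placeholder and not part of this commit:

from vllm import EngineArgs

# Hypothetical illustration: the LMI config option.enable_prefix_caching
# maps to this vLLM engine flag.
engine_args = EngineArgs(
    model="facebook/opt-125m",  # placeholder model id
    enable_prefix_caching=True,
)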
1 change: 1 addition & 0 deletions serving/docs/lmi/user_guides/lmi-dist_user_guide.md
@@ -137,3 +137,4 @@ Here are the advanced parameters that are available when using LMI-Dist.
| option.max_cpu_loras | \>= 0.27.0 | Pass Through | This config determines the maximum number of LoRA adapters to cache in memory. All others will be evicted to disk. | Default: `None` |
| option.enable_chunked_prefill | \>= 0.29.0 | Pass Through | This config enables chunked prefill support. With chunked prefill, longer prompts will be chunked and batched with decode requests to reduce inter-token latency. This option is EXPERIMENTAL and tested for llama and falcon models only. This does not work with LoRA and speculative decoding yet. | Default: `False` |
| option.cpu_offload_gb_per_gpu | \>= 0.29.0 | Pass Through | This config allows offloading model weights to CPU, enabling large models to run with limited GPU memory. | Default: `0` |
+| option.enable_prefix_caching | \>= 0.29.0 | Pass Through | This config allows the engine to cache the context (KV cache) of earlier prompts and reuse it to speed up inference for requests that share a common prefix. | Default: `False` |
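For reference, users typically opt in through serving.properties. A minimal sketch of such a configuration, assuming the usual pass-through of option.* keys; the model id is a placeholder and the surrounding settings are illustrative, not prescribed by this commit:

# placeholder model id; replace with your own
option.model_id=meta-llama/Meta-Llama-3-8B
option.rolling_batch=lmi-dist
option.enable_prefix_caching=true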
2 changes: 2 additions & 0 deletions serving/docs/lmi/user_guides/vllm_user_guide.md
@@ -126,3 +126,5 @@ In that situation, there is nothing LMI can do until the issue is fixed in the b
| option.lora_extra_vocab_size | \>= 0.27.0 | Pass Through | This config determines the maximum additional vocabulary that can be added through a LoRA adapter. | Default: `256` |
| option.max_cpu_loras | \>= 0.27.0 | Pass Through | This config determines the maximum number of LoRA adapters to cache in memory. All others will be evicted to disk. | Default: `None` |
| option.enable_chunked_prefill | \>= 0.29.0 | Pass Through | This config enables chunked prefill support. With chunked prefill, longer prompts will be chunked and batched with decode requests to reduce inter-token latency. This option is EXPERIMENTAL and tested for llama and falcon models only. This does not work with LoRA and speculative decoding yet. | Default: `False` |
+| option.cpu_offload_gb_per_gpu | \>= 0.29.0 | Pass Through | This config allows offloading model weights to CPU, enabling large models to run with limited GPU memory. | Default: `0` |
+| option.enable_prefix_caching | \>= 0.29.0 | Pass Through | This config allows the engine to cache the context (KV cache) of earlier prompts and reuse it to speed up inference for requests that share a common prefix. | Default: `False` |
@@ -71,6 +71,8 @@ public final class LmiConfigRecommender {
Map.entry("phi3_v", "lmi-dist"),
// vllm 0.5.3
Map.entry("chameleon", "lmi-dist"),
Map.entry("deepseek", "lmi-dist"),
Map.entry("deepseek_v2", "lmi-dist"),
Map.entry("fuyu", "lmi-dist"));

private static final Set<String> OPTIMIZED_TASK_ARCHITECTURES =
