From 521b0dc8a896087e04cbb3ec998d9e054321d2a7 Mon Sep 17 00:00:00 2001
From: Qing Lan
Date: Mon, 29 Jul 2024 14:16:16 -0700
Subject: [PATCH 1/2] add prefix caching support

---
 .../djl_python/properties_manager/lmi_dist_rb_properties.py    | 1 +
 .../setup/djl_python/properties_manager/vllm_rb_properties.py  | 1 +
 .../setup/djl_python/rolling_batch/lmi_dist_rolling_batch.py   | 1 +
 .../setup/djl_python/rolling_batch/rolling_batch_vllm_utils.py | 3 ++-
 serving/docs/lmi/user_guides/lmi-dist_user_guide.md            | 1 +
 serving/docs/lmi/user_guides/vllm_user_guide.md                | 2 ++
 6 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/engines/python/setup/djl_python/properties_manager/lmi_dist_rb_properties.py b/engines/python/setup/djl_python/properties_manager/lmi_dist_rb_properties.py
index 828a2d311..cf2b363c0 100644
--- a/engines/python/setup/djl_python/properties_manager/lmi_dist_rb_properties.py
+++ b/engines/python/setup/djl_python/properties_manager/lmi_dist_rb_properties.py
@@ -61,6 +61,7 @@ class LmiDistRbProperties(Properties):
     max_logprobs: Optional[int] = 20
     enable_chunked_prefill: Optional[bool] = False
     cpu_offload_gb_per_gpu: Optional[int] = 0
+    enable_prefix_caching: Optional[bool] = False
 
     @model_validator(mode='after')
     def validate_mpi(self):
diff --git a/engines/python/setup/djl_python/properties_manager/vllm_rb_properties.py b/engines/python/setup/djl_python/properties_manager/vllm_rb_properties.py
index 005f92cb2..699431fa7 100644
--- a/engines/python/setup/djl_python/properties_manager/vllm_rb_properties.py
+++ b/engines/python/setup/djl_python/properties_manager/vllm_rb_properties.py
@@ -56,6 +56,7 @@ class VllmRbProperties(Properties):
     max_logprobs: Optional[int] = 20
     enable_chunked_prefill: Optional[bool] = False
     cpu_offload_gb_per_gpu: Optional[int] = 0
+    enable_prefix_caching: Optional[bool] = False
 
     @field_validator('engine')
     def validate_engine(cls, engine):
diff --git a/engines/python/setup/djl_python/rolling_batch/lmi_dist_rolling_batch.py b/engines/python/setup/djl_python/rolling_batch/lmi_dist_rolling_batch.py
index 387f52d07..6f7f5c66c 100644
--- a/engines/python/setup/djl_python/rolling_batch/lmi_dist_rolling_batch.py
+++ b/engines/python/setup/djl_python/rolling_batch/lmi_dist_rolling_batch.py
@@ -83,6 +83,7 @@ def __init__(self, model_id_or_path: str, properties: dict, **kwargs):
             revision=self.lmi_dist_config.revision,
             enable_chunked_prefill=self.lmi_dist_config.enable_chunked_prefill,
             cpu_offload_gb=self.lmi_dist_config.cpu_offload_gb_per_gpu,
+            enable_prefix_caching=self.lmi_dist_config.enable_prefix_caching,
             **engine_kwargs)
 
         kwargs = {}
diff --git a/engines/python/setup/djl_python/rolling_batch/rolling_batch_vllm_utils.py b/engines/python/setup/djl_python/rolling_batch/rolling_batch_vllm_utils.py
index e04db20fd..3480ee46a 100644
--- a/engines/python/setup/djl_python/rolling_batch/rolling_batch_vllm_utils.py
+++ b/engines/python/setup/djl_python/rolling_batch/rolling_batch_vllm_utils.py
@@ -232,7 +232,8 @@ def get_engine_args_from_config(config: VllmRbProperties) -> EngineArgs:
         revision=config.revision,
         max_logprobs=config.max_logprobs,
         enable_chunked_prefill=config.enable_chunked_prefill,
-        cpu_offload_gb=config.cpu_offload_gb_per_gpu)
+        cpu_offload_gb=config.cpu_offload_gb_per_gpu,
+        enable_prefix_caching=config.enable_prefix_caching)
 
 
 def get_multi_modal_data(request: Request) -> dict:
diff --git a/serving/docs/lmi/user_guides/lmi-dist_user_guide.md b/serving/docs/lmi/user_guides/lmi-dist_user_guide.md
index c2ebe7936..2b1cac132 100644
--- a/serving/docs/lmi/user_guides/lmi-dist_user_guide.md
+++ b/serving/docs/lmi/user_guides/lmi-dist_user_guide.md
@@ -137,3 +137,4 @@ Here are the advanced parameters that are available when using LMI-Dist.
 | option.max_cpu_loras | \>= 0.27.0 | Pass Through | This config determines the maximum number of LoRA adapters to cache in memory. All others will be evicted to disk. | Default: `None` |
 | option.enable_chunked_prefill | \>= 0.29.0 | Pass Through | This config enables chunked prefill support. With chunked prefill, longer prompts will be chunked and batched with decode requests to reduce inter token latency. This option is EXPERIMENTAL and tested for llama and falcon models only. This does not work with LoRA and speculative decoding yet. | Default: `False` |
 | option.cpu_offload_gb_per_gpu | \>= 0.29.0 | Pass Through | This config allows offloading model weights into CPU to enable large model running with limited GPU memory. | Default: `0` |
+| option.enable_prefix_caching | \>= 0.29.0 | Pass Through | This config enables prefix caching, which allows the engine to cache and reuse the KV cache of common prompt prefixes across requests to speed up inference. | Default: `False` |
diff --git a/serving/docs/lmi/user_guides/vllm_user_guide.md b/serving/docs/lmi/user_guides/vllm_user_guide.md
index f17a185a0..2ebbfafb4 100644
--- a/serving/docs/lmi/user_guides/vllm_user_guide.md
+++ b/serving/docs/lmi/user_guides/vllm_user_guide.md
@@ -126,3 +126,5 @@ In that situation, there is nothing LMI can do until the issue is fixed in the backend.
 | option.lora_extra_vocab_size | \>= 0.27.0 | Pass Through | This config determines the maximum additional vocabulary that can be added through a LoRA adapter. | Default: `256` |
 | option.max_cpu_loras | \>= 0.27.0 | Pass Through | This config determines the maximum number of LoRA adapters to cache in memory. All others will be evicted to disk. | Default: `None` |
 | option.enable_chunked_prefill | \>= 0.29.0 | Pass Through | This config enables chunked prefill support. With chunked prefill, longer prompts will be chunked and batched with decode requests to reduce inter token latency. This option is EXPERIMENTAL and tested for llama and falcon models only. This does not work with LoRA and speculative decoding yet. | Default: `False` |
+| option.cpu_offload_gb_per_gpu | \>= 0.29.0 | Pass Through | This config allows offloading model weights into CPU to enable large model running with limited GPU memory. | Default: `0` |
+| option.enable_prefix_caching | \>= 0.29.0 | Pass Through | This config enables prefix caching, which allows the engine to cache and reuse the KV cache of common prompt prefixes across requests to speed up inference. | Default: `False` |

From 110ddf5faa36365d4f04ae2188468bcf6cb313b9 Mon Sep 17 00:00:00 2001
From: Qing Lan
Date: Tue, 30 Jul 2024 08:57:57 -0700
Subject: [PATCH 2/2] add deepseek

---
 wlm/src/main/java/ai/djl/serving/wlm/LmiConfigRecommender.java | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/wlm/src/main/java/ai/djl/serving/wlm/LmiConfigRecommender.java b/wlm/src/main/java/ai/djl/serving/wlm/LmiConfigRecommender.java
index ce096ff1b..303100a06 100644
--- a/wlm/src/main/java/ai/djl/serving/wlm/LmiConfigRecommender.java
+++ b/wlm/src/main/java/ai/djl/serving/wlm/LmiConfigRecommender.java
@@ -71,6 +71,8 @@ public final class LmiConfigRecommender {
                     Map.entry("phi3_v", "lmi-dist"),
                     // vllm 0.5.3
                     Map.entry("chameleon", "lmi-dist"),
+                    Map.entry("deepseek", "lmi-dist"),
+                    Map.entry("deepseek_v2", "lmi-dist"),
                     Map.entry("fuyu", "lmi-dist"));
 
     private static final Set<String> OPTIMIZED_TASK_ARCHITECTURES =
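Usage note: the new flag is documented as a pass-through option, so with an LMI container a user would typically set `option.enable_prefix_caching=true` in `serving.properties` (or the corresponding `OPTION_ENABLE_PREFIX_CACHING` environment variable), and the value is forwarded to the engine as in the patched `get_engine_args_from_config`. Below is a minimal sketch of that pass-through, assuming vLLM is installed; the helper name, the string-to-bool parsing, and the placeholder model id are illustrative assumptions, not the actual LMI code path.

```python
# Minimal sketch (not the actual LMI code path) of forwarding a pass-through
# "enable_prefix_caching" property to vLLM's EngineArgs.
from vllm import EngineArgs


def build_engine_args(properties: dict) -> EngineArgs:
    # LMI pass-through options arrive as strings, e.g. "true"/"false".
    enable_prefix_caching = str(
        properties.get("enable_prefix_caching", "false")).lower() == "true"
    return EngineArgs(
        model=properties["model_id"],
        enable_prefix_caching=enable_prefix_caching,
    )


# Hypothetical property map, as it might look after parsing a
# serving.properties file containing option.enable_prefix_caching=true.
args = build_engine_args({
    "model_id": "facebook/opt-125m",  # placeholder model id
    "enable_prefix_caching": "true",
})
print(args.enable_prefix_caching)  # True
```

With prefix caching enabled, requests that share a common prompt prefix (for example, a long system prompt) can reuse already-computed KV-cache blocks instead of recomputing them during prefill, which is where the speedup described in the doc tables comes from.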