AI-Hypercomputer · FanhaiLu1 · Jul 23, 2024 · Jul 23, 2024 · Jul 23, 2024 · Jul 23, 2024
@@ -784,6 +784,7 @@ def _detokenize_thread(self, idx: int):
               # Place the slot back on the free queue.
               my_live_requests[slot] = None
               my_slots.put(slot, block=False)  # This should always have space.
+              my_generate_engine.free_resource(slot)
         logging.info(
             "Detokenizing generate step %d took %.2fms",
             generate_timestep_added,

@@ -187,6 +187,18 @@ def insert(
     a [0, n) range of slots and converted internally.
     """
 
+  def free_resource(
+      self,
+      slot: int, # pylint: disable=unused-argument
+  ) -> Any:
+    """Free cache and other decode resources for the slot.
+
+    This function is needed for advanced attention kernels like PagedAttention.
+    After finishing a request, the engine needs to free all the page block
+    resources it used so that they can be reused for incoming requests.
+    """
+    return None
+
   @abc.abstractmethod
   def load_params(self, *args, **kwargs) -> Params:
     """Loads parameters.