Combine async postprocessor and multi-step - first WIP version #7743

Closed
4 changes: 2 additions & 2 deletions tests/core/test_chunked_prefill_scheduler.py
@@ -21,7 +21,7 @@ def append_new_token(seq_group, token_id: int):


def schedule_and_update_computed_tokens(scheduler):
-    metas, out = scheduler.schedule()
+    metas, out, _, _ = scheduler.schedule()
    for s, meta in zip(out.scheduled_seq_groups, metas):
        s.seq_group.update_num_computed_tokens(meta.token_chunk_size)
    return metas, out
@@ -180,7 +180,7 @@ def test_maximal_decoding():
    """Verify decoding requests are prioritized."""
    block_size = 4
    max_seqs = 2
-    max_model_len = 2
+    max_model_len = 8
    max_num_batched_tokens = 2
    scheduler_config = SchedulerConfig(max_num_batched_tokens,
                                       max_seqs,
2 changes: 1 addition & 1 deletion tests/core/utils.py
@@ -199,7 +199,7 @@ def append_new_token(out, token_id: int):


def schedule_and_update_computed_tokens(scheduler):
-    metas, out = scheduler.schedule()
+    metas, out, _, _ = scheduler.schedule()
    for s, meta in zip(out.scheduled_seq_groups, metas):
        s.seq_group.update_num_computed_tokens(meta.token_chunk_size)
    return metas, out
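Both helpers above adapt to the widened return signature of scheduler.schedule(), which now returns additional trailing values that these tests simply discard. A minimal sketch, not part of the PR (the names of the extra values are not visible in this diff), of a caller that tolerates either the old two-tuple or the new signature:

def schedule_compat(scheduler):
    # scheduler.schedule() used to return (metas, out); with this change it
    # returns extra trailing values, which the tests above ignore via `_`.
    result = scheduler.schedule()
    metas, out = result[0], result[1]
    return metas, out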
2 changes: 2 additions & 0 deletions tests/distributed/test_pipeline_parallel.py
@@ -49,6 +49,8 @@ def test_compare_tp(TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL, MODEL_NAME,
        str(TP_SIZE),
        "--distributed-executor-backend",
        DIST_BACKEND,
+        # disable output proc callback to test PP
+        "--disable-output-proc-callback",
    ]

    # compare without pipeline parallelism
2 changes: 2 additions & 0 deletions tests/distributed/test_pp_cudagraph.py
@@ -22,6 +22,8 @@ def test_pp_cudagraph(PP_SIZE, MODEL_NAME, ATTN_BACKEND):
        str(PP_SIZE),
        "--distributed-executor-backend",
        "mp",
+        # disable output proc callback to test PP
+        "--disable-output-proc-callback",
    ]
    os.environ["VLLM_ATTENTION_BACKEND"] = ATTN_BACKEND

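Both pipeline-parallel tests above opt out of the callback with the new --disable-output-proc-callback flag, because the callback is rejected when pipeline parallelism is enabled (see the NotImplementedError added to vllm/config.py below). A sketch of how such an argument list might be assembled, mirroring the tests; apart from --disable-output-proc-callback and --distributed-executor-backend, the flag names are taken from vLLM's existing CLI and are assumed unchanged here:

def pp_server_args(pp_size: int, backend: str) -> list:
    # Build server CLI args for a pipeline-parallel run, disabling the
    # output processor callback, which is not supported with PP in this PR.
    return [
        "--pipeline-parallel-size", str(pp_size),
        "--distributed-executor-backend", backend,
        # disable output proc callback to test PP
        "--disable-output-proc-callback",
    ]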
4 changes: 4 additions & 0 deletions tests/engine/test_stop_strings.py
@@ -98,6 +98,10 @@ def _test_stopping(llm_engine: LLMEngine,
    output: Optional[CompletionOutput] = None
    output_text = ""
    stop_reason = None
+
+    # Run first (because of async callback)
+    llm_engine.step()
+
    while llm_engine.has_unfinished_requests():
        (request_output, ) = llm_engine.step()
        (output, ) = request_output.outputs
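The test now calls llm_engine.step() once before its drain loop. With the output processor callback enabled, outputs appear to lag one step behind scheduling, so the first step() is used purely to prime the engine. A minimal sketch of the resulting drive pattern, assuming a request has already been added to llm_engine:

# Prime once: with the async callback, outputs for a scheduled step
# may only become available on the following step.
llm_engine.step()

while llm_engine.has_unfinished_requests():
    (request_output, ) = llm_engine.step()
    (output, ) = request_output.outputs
    # Inspect output.text and the stop reason here, as the test does.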
9 changes: 8 additions & 1 deletion tests/multi_step/test_correctness.py
@@ -60,7 +60,7 @@ async def test_multi_step(example_prompts, model: str, tp_size: int,

    server_args = DEFAULT_SERVER_ARGS + ["--enforce-eager"]
    ms_server_args = DEFAULT_SERVER_ARGS + \
-        ["--num-scheduler-steps", f"{num_scheduler_steps}"]
+        ["--num-scheduler-steps", f"{num_scheduler_steps}"]#, "--disable-output-proc-callback"]

    if eager_mode:
        ms_server_args.append("--enforce-eager")
@@ -82,4 +82,11 @@ def get_text_generations(completions):

    ref_generations = get_text_generations(ref_completions)
    test_generations = get_text_generations(test_completions)
+
+    print("ref_generations:")
+    for gen in ref_generations:
+        print("ref_gen: {}".format(gen))
+    print("test_generations:")
+    for gen in test_generations:
+        print("test_gen: {}".format(gen))
    assert ref_generations == test_generations
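The added prints dump every reference and multi-step generation when debugging a mismatch. A small alternative helper, not part of the PR, that reports only the first divergent prompt can be easier to scan for long prompt lists:

def first_divergence(ref_generations, test_generations):
    # Return (index, ref, test) for the first mismatch, or None if equal.
    for i, (ref, test) in enumerate(zip(ref_generations, test_generations)):
        if ref != test:
            return i, ref, test
    return None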
33 changes: 33 additions & 0 deletions vllm/config.py
@@ -137,6 +137,7 @@ def __init__(
        skip_tokenizer_init: bool = False,
        served_model_name: Optional[Union[str, List[str]]] = None,
        limit_mm_per_prompt: Optional[Mapping[str, int]] = None,
+        use_output_proc_callback: Optional[bool] = True,
    ) -> None:
        self.model = model
        self.tokenizer = tokenizer
@@ -167,6 +168,7 @@ def __init__(
                                    code_revision, rope_scaling, rope_theta)
        self.hf_text_config = get_hf_text_config(self.hf_config)
        self.dtype = _get_and_verify_dtype(self.hf_text_config, dtype)
+        self.use_output_proc_callback = use_output_proc_callback

        # Choose a default enforce_eager value if the user did not specify
        # a value (enforce_eager is None)
@@ -320,6 +322,30 @@ def _verify_cuda_graph(self) -> None:
        self.max_seq_len_to_capture = min(self.max_seq_len_to_capture,
                                          self.max_model_len)

+    def verify_output_proc_callback(self, speculative_config,
+                                    device_config) -> None:
+        if device_config.device_type != "cuda":
+            logger.warning(
+                "Output proc callback can only be enabled with CUDA")
+            self.use_output_proc_callback = False
+            return
+        if self.enforce_eager:
+            logger.warning(
+                "To see the benefits of the output processor callback, "
+                "enable CUDA graphs. Since enforce-eager is enabled, the "
+                "output processor callback cannot be used.")
+            self.use_output_proc_callback = not self.enforce_eager
+            return
+        # The async postprocessor is not necessary in embedding mode
+        # since there is no token generation.
+        if self.embedding_mode:
+            self.use_output_proc_callback = False
+
+        if speculative_config:
+            self.use_output_proc_callback = False
+
+        # TODO: assert mp backend
+
    def verify_with_parallel_config(
        self,
        parallel_config: "ParallelConfig",
@@ -352,6 +378,11 @@ def verify_with_parallel_config(
                           "fallback to the eager mode.")
            self.enforce_eager = True

+        if (pipeline_parallel_size > 1) and (self.use_output_proc_callback):
+            raise NotImplementedError(
+                "Output processor callback is not supported with "
+                "pipeline parallelism currently. Disable the callback.")
+
    def get_hf_config_sliding_window(self) -> Optional[int]:
        """Get the sliding window size, or None if disabled."""

@@ -1754,6 +1785,8 @@ class EngineConfig:
    def __post_init__(self):
        """Verify configs are valid & consistent with each other.
        """
+        self.model_config.verify_output_proc_callback(self.speculative_config,
+                                                      self.device_config)
        self.model_config.verify_with_parallel_config(self.parallel_config)
        self.cache_config.verify_with_parallel_config(self.parallel_config)

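Taken together, the checks added to ModelConfig keep the callback enabled only on CUDA, without enforce-eager (i.e. with CUDA graphs), outside embedding mode, without speculative decoding, and, via verify_with_parallel_config, without pipeline parallelism. A condensed, illustrative restatement of that decision (the PR implements it as the two methods above, not as a single function):

def output_proc_callback_allowed(device_type: str, enforce_eager: bool,
                                 embedding_mode: bool, has_spec_config: bool,
                                 pipeline_parallel_size: int) -> bool:
    if device_type != "cuda":
        return False  # callback is only enabled with CUDA
    if enforce_eager:
        return False  # only pays off when CUDA graphs are used
    if embedding_mode:
        return False  # no token generation to post-process
    if has_spec_config:
        return False  # speculative decoding is not supported
    if pipeline_parallel_size > 1:
        return False  # PP raises NotImplementedError instead
    return True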