Skip to content

Commit

Permalink
async_output_proc: Add virtual engine support
Browse files Browse the repository at this point in the history
  • Loading branch information
alexm-neuralmagic committed Aug 27, 2024
1 parent 6fc4e6e commit 9e8f61e
Show file tree
Hide file tree
Showing 4 changed files with 96 additions and 51 deletions.
115 changes: 75 additions & 40 deletions vllm/engine/llm_engine.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import time
from collections import deque
from contextlib import contextmanager
from dataclasses import dataclass
from dataclasses import dataclass, field
from typing import (TYPE_CHECKING, Any, ClassVar, Deque, Dict, Iterable, List,
Mapping, Optional)
from typing import Sequence as GenericSequence
Expand Down Expand Up @@ -40,7 +40,8 @@
from vllm.sampling_params import SamplingParams
from vllm.sequence import (EmbeddingSequenceGroupOutput, ExecuteModelRequest,
SamplerOutput, Sequence, SequenceGroup,
SequenceGroupMetadata, SequenceStatus)
SequenceGroupMetadata, SequenceStatus,
AsyncCallbackData)
from vllm.tracing import (SpanAttributes, SpanKind, extract_trace_context,
init_tracer)
from vllm.transformers_utils.config import try_get_generation_config
Expand Down Expand Up @@ -88,6 +89,19 @@ class SchedulerOutputState:
last_output: Optional[SamplerOutput] = None


@dataclass
class SchedulerContext:
    """Output post-processing state kept separately for each virtual engine.

    LLMEngine builds one instance per pipeline-parallel stage, so the
    pending-output queue and the accumulated request outputs of one
    virtual engine are never mixed with those of another.
    """
    # FIFO of results waiting for post-processing. step() appends one
    # (model output, sequence-group metadata, scheduler outputs) entry per
    # executed iteration and _process_model_outputs() pops from the left.
    # NOTE(review): step() enqueues `seq_group_metadata_list` as the second
    # element -- confirm it matches the element type annotated here.
    output_queue: Deque[Tuple[List[SamplerOutput],
                              List[Tuple[ScheduledSequenceGroup,
                                         SequenceGroupMetadata]],
                              SchedulerOutputs]] = field(
                                  default_factory=deque)

    # Request outputs produced by post-processing; step() returns this list
    # to the caller and clears it when a new scheduler iteration starts.
    request_outputs: List[Union[RequestOutput,
                                EmbeddingRequestOutput]] = field(
                                    default_factory=list)


class LLMEngine:
"""An LLM engine that receives requests and generates texts.
Expand Down Expand Up @@ -406,12 +420,17 @@ def get_tokenizer_for_seq(sequence: Sequence) -> AnyTokenizer:
for _ in range(self.parallel_config.pipeline_parallel_size)
]

# Async output processing pointers
self.output_queue: Deque[Tuple[List[SamplerOutput],
List[SequenceGroupMetadata],
SchedulerOutputs]] = deque()
self.request_outputs: List[Union[RequestOutput,
EmbeddingRequestOutput]] = []
self.scheduler_contexts = [
SchedulerContext()
for _ in range(self.parallel_config.pipeline_parallel_size)
]

self.async_callback_data = [
AsyncCallbackData(self._process_model_outputs, {
"virtual_engine": v_id,
"is_async": True,
}) for v_id in range(self.parallel_config.pipeline_parallel_size)
]

def _initialize_kv_caches(self) -> None:
"""Initialize the KV cache in the worker(s).
Expand Down Expand Up @@ -1214,32 +1233,28 @@ def _process_sequence_group_outputs(

return

def _process_model_outputs(self,
is_async: bool,
clear_outputs: bool = True) -> None:
def _process_model_outputs(self, virtual_engine: int,
is_async: bool) -> None:
"""Apply the model output to the sequences in the scheduled seq groups.
virtual_engine: The engine id to operate on
is_async: Indicates whether this postprocessor runs in
parallel with the GPU forward pass and is processing
tokens from the previous step. If this is true, then
no tokens need to be appended since it is already done
externally (before the next schedule() call)
clear_outputs: Sometimes existing outputs need to be combined
with outputs of this call. This happens for postprocessor
draining at the final stage (like when sequences are finished)
Returns RequestOutputs that can be returned to the client.
"""
now = time.time()

if clear_outputs:
self.request_outputs.clear()
ctx: SchedulerContext = self.scheduler_contexts[virtual_engine]

if len(self.output_queue) == 0:
if len(ctx.output_queue) == 0:
return None

(outputs, seq_group_metadata_list,
scheduler_outputs) = self.output_queue.popleft()
scheduler_outputs) = ctx.output_queue.popleft()

# Sanity check
assert len(seq_group_metadata_list) == len(
Expand Down Expand Up @@ -1314,11 +1329,11 @@ def _process_model_outputs(self,
if (seq_group.is_finished()
if self.step_return_finished_only else True):
request_output = RequestOutputFactory.create(seq_group)
self.request_outputs.append(request_output)
ctx.request_outputs.append(request_output)

for seq_group in scheduler_outputs.ignored_seq_groups:
request_output = RequestOutputFactory.create(seq_group)
self.request_outputs.append(request_output)
ctx.request_outputs.append(request_output)

if is_async:
# Log stats.
Expand Down Expand Up @@ -1414,29 +1429,41 @@ def step(self) -> List[Union[RequestOutput, EmbeddingRequestOutput]]:
"Pipeline parallelism is only supported through AsyncLLMEngine "
"as performance will be severely degraded otherwise.")

virtual_engine = 0

# These are cached outputs from previous iterations. None if on first
# iteration
cached_outputs = self.cached_scheduler_outputs[0]
cached_outputs = self.cached_scheduler_outputs[virtual_engine]
seq_group_metadata_list = cached_outputs.seq_group_metadata_list
scheduler_outputs = cached_outputs.scheduler_outputs
allow_async_output_proc = cached_outputs.allow_async_output_proc

ctx = self.scheduler_contexts[virtual_engine]

# Skip the scheduler if there are any remaining steps in the seq groups.
# This ensures that the scheduler is only called again when the current
# batch has completed.
if not self._has_remaining_steps(seq_group_metadata_list):

# Clear outputs on scheduler iteration start
ctx.request_outputs.clear()

# Schedule iteration
(seq_group_metadata_list, scheduler_outputs,
allow_async_output_proc) = self.scheduler[0].schedule()
allow_async_output_proc
) = self.scheduler[virtual_engine].schedule()

if not allow_async_output_proc and len(self.output_queue) > 0:
self._process_model_outputs(is_async=True)
# Maybe switch from async mode to sync mode
if not allow_async_output_proc and len(ctx.output_queue) > 0:
self._process_model_outputs(virtual_engine=virtual_engine,
is_async=True)

if (self.scheduler_config.is_multi_step
and scheduler_outputs.num_lookahead_slots > 0):
# cache the scheduler outputs for the next iteration if we have
# lookahead slots
self._cache_scheduler_outputs_for_multi_step(
0, seq_group_metadata_list, scheduler_outputs,
virtual_engine, seq_group_metadata_list, scheduler_outputs,
allow_async_output_proc)

assert seq_group_metadata_list is not None
Expand All @@ -1447,14 +1474,14 @@ def step(self) -> List[Union[RequestOutput, EmbeddingRequestOutput]]:

if not scheduler_outputs.is_empty():
finished_requests_ids = self.scheduler[
0].get_and_reset_finished_requests_ids()
virtual_engine].get_and_reset_finished_requests_ids()

# Check if we have a cached last_output from the previous iteration.
# For supporting PP this is probably the best way to pass the
# sampled_token_ids, as a separate broadcast over all the PP stages
# will cause one virtual engine's microbatch to block the pipeline.
last_sampled_token_ids = \
self._get_last_sampled_token_ids(0)
self._get_last_sampled_token_ids(virtual_engine)

execute_model_req = ExecuteModelRequest(
seq_group_metadata_list=seq_group_metadata_list,
Expand All @@ -1469,20 +1496,24 @@ def step(self) -> List[Union[RequestOutput, EmbeddingRequestOutput]]:
last_sampled_token_ids=last_sampled_token_ids)

if allow_async_output_proc:
execute_model_req.output_proc_callback_fn = \
self._process_model_outputs
execute_model_req.async_callback = self.async_callback_data[
virtual_engine]

output = self.model_executor.execute_model(
execute_model_req=execute_model_req)

# we need to do this here so that last step's sampled_token_ids can
# We need to do this here so that last step's sampled_token_ids can
# be passed to the next iteration for PP.
if self.scheduler_config.is_multi_step:
self._update_cached_scheduler_output(0, output)
self._update_cached_scheduler_output(virtual_engine, output)
else:
if len(self.output_queue) > 0:
# Nothing scheduled => if there is a pending async postprocessor,
# then finish it here.
if len(ctx.output_queue) > 0:
assert not self.scheduler_config.is_multi_step
self._process_model_outputs(is_async=True)
self._process_model_outputs(virtual_engine=virtual_engine,
is_async=True)
# No outputs in this case
output = []

# Finish the current step for all the sequence groups.
Expand All @@ -1497,7 +1528,7 @@ def step(self) -> List[Union[RequestOutput, EmbeddingRequestOutput]]:

# Add results to the output_queue
# (for async or non-async postprocessing)
self.output_queue.append(
ctx.output_queue.append(
(output, seq_group_metadata_list, scheduler_outputs))

if output and allow_async_output_proc:
Expand All @@ -1508,23 +1539,27 @@ def step(self) -> List[Union[RequestOutput, EmbeddingRequestOutput]]:
output[0], seq_group_metadata_list,
scheduler_outputs.scheduled_seq_groups)

# Check if we need to run the usual non-async path
if not allow_async_output_proc:
self._process_model_outputs(is_async=False)
self._process_model_outputs(virtual_engine=virtual_engine,
is_async=False)

# Log stats.
self.do_log_stats(scheduler_outputs, output)

# Tracing
self.do_tracing(scheduler_outputs)
else:
# Multi-step case
self.request_outputs = []

if not self.has_unfinished_requests():
# Drain async postprocessor
if len(self.output_queue) > 0:
# Drain async postprocessor (if exists)
if len(ctx.output_queue) > 0:
assert not self.scheduler_config.is_multi_step
self._process_model_outputs(is_async=True, clear_outputs=False)
assert len(self.output_queue) == 0
self._process_model_outputs(virtual_engine=virtual_engine,
is_async=True)
assert len(ctx.output_queue) == 0

# Stop the execute model loop in parallel workers until there are
# more requests to process. This avoids waiting indefinitely in
Expand All @@ -1533,7 +1568,7 @@ def step(self) -> List[Union[RequestOutput, EmbeddingRequestOutput]]:
# queued control plane messages, such as add/remove lora adapters.
self.model_executor.stop_remote_worker_execution_loop()

return self.request_outputs
return ctx.request_outputs

def _has_remaining_steps(
self, seq_group_metadata_list: Optional[List[SequenceGroupMetadata]]
Expand Down
15 changes: 12 additions & 3 deletions vllm/sequence.py
Original file line number Diff line number Diff line change
Expand Up @@ -811,6 +811,9 @@ def remove(self, seq_id: int) -> None:
self.is_single_seq = len(self.seqs) == 1

def is_finished(self) -> bool:
    """Return True when every sequence held in ``self.seqs`` is finished.

    When ``self.is_single_seq`` is set, only the first sequence is
    consulted, which avoids iterating for the common one-sequence case.
    """
    if not self.is_single_seq:
        # General case: stop at the first unfinished sequence.
        for candidate in self.seqs:
            if not candidate.is_finished():
                return False
        return True

    # Fast path: exactly one sequence, so ask it directly.
    only_seq = self.seqs[0]
    return only_seq.is_finished()

def is_prefill(self) -> bool:
Expand Down Expand Up @@ -1259,6 +1262,12 @@ def expand_with_bonus_tokens(
[self.hidden_states, self.second_last_token_hidden_states])[index]


@dataclass
class AsyncCallbackData:
    """A callable bundled with the keyword arguments to invoke it with.

    The holder of this object is expected to run the callback as
    ``func(**kw_args)``.
    """
    # The function to call.
    func: Callable
    # Keyword arguments forwarded to ``func`` when it is invoked.
    kw_args: Dict[str, Any]


class ExecuteModelRequest(
msgspec.Struct,
array_like=True, # type: ignore[call-arg]
Expand Down Expand Up @@ -1290,8 +1299,8 @@ class ExecuteModelRequest(
finished_requests_ids: List[str] = msgspec.field(default_factory=list)
# The last sampled token ids for multi step decoding.
last_sampled_token_ids: Optional[torch.Tensor] = None
# Async postprocessor
output_proc_callback_fn: Optional[Callable] = None
# Async callback
async_callback: Optional[AsyncCallbackData] = None

@property
def is_first_multi_step(self) -> bool:
Expand Down Expand Up @@ -1338,4 +1347,4 @@ def clone(
finished_requests_ids=self.finished_requests_ids,
last_sampled_token_ids=self.last_sampled_token_ids.clone()
if self.last_sampled_token_ids is not None else None,
output_proc_callback_fn=self.output_proc_callback_fn)
async_callback=self.async_callback)
12 changes: 7 additions & 5 deletions vllm/worker/model_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@
LRUCacheWorkerPromptAdapterManager)
from vllm.sampling_params import SamplingParams
from vllm.sequence import (IntermediateTensors, SamplerOutput,
SequenceGroupMetadata)
SequenceGroupMetadata, AsyncCallbackData)
from vllm.utils import (CudaMemoryProfiler, PyObjectCache, async_tensor_h2d,
flatten_2d_lists, is_hip, is_pin_memory_available)
from vllm.worker.model_runner_base import (
Expand Down Expand Up @@ -90,7 +90,7 @@ class ModelInputForGPU(ModelRunnerInputBase):
request_ids_to_seq_ids: Optional[Dict[str, List[int]]] = None
finished_requests_ids: Optional[List[str]] = None
virtual_engine: int = 0
output_proc_callback_fn: Optional[Callable] = None
async_callback: Optional[AsyncCallbackData] = None

def as_broadcastable_tensor_dict(self) -> Dict[str, Any]:
tensor_dict = {
Expand Down Expand Up @@ -1456,9 +1456,11 @@ def execute_model(
if not self.is_driver_worker:
return []

if model_input.output_proc_callback_fn is not None:
model_input.output_proc_callback_fn(is_async=True)

if model_input.async_callback is not None:
func = model_input.async_callback.func
kw_args = model_input.async_callback.kw_args
func(**kw_args)

# Sample the next token.
output: SamplerOutput = self.model.sample(
logits=logits,
Expand Down
5 changes: 2 additions & 3 deletions vllm/worker/worker_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -263,11 +263,10 @@ def _get_driver_input_and_broadcast(
broadcast_data.update(kwargs)
broadcast_tensor_dict(broadcast_data, src=0)

if execute_model_req.output_proc_callback_fn:
if execute_model_req.async_callback:
model_input = dataclasses.replace( # type: ignore
model_input,
output_proc_callback_fn=execute_model_req.
output_proc_callback_fn)
async_callback=execute_model_req.async_callback)

return model_input, worker_input, kwargs

Expand Down

0 comments on commit 9e8f61e

Please sign in to comment.