Skip to content

Commit

Permalink
[BugFix] Lazily initialize _copy_stream to avoid torch initializing a CUDA context on the wrong GPU device (
Browse files Browse the repository at this point in the history
  • Loading branch information
lnykww authored and MengqingCao committed Sep 30, 2024
1 parent f1bc362 commit f845a12
Showing 1 changed file with 5 additions and 2 deletions.
7 changes: 5 additions & 2 deletions vllm/worker/multi_step_model_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -230,12 +230,15 @@ def __init__(self, base_model_runner: GPUModelRunnerBase, *args, **kwargs):
self._base_model_runner: GPUModelRunnerBase = base_model_runner

self.is_multi_step = self.scheduler_config.is_multi_step
# used to copy tensors from GPU to CPU asynchronously
self._copy_stream = torch.cuda.Stream()
self.pinned_sampled_token_ids: Optional[torch.Tensor] = None

self.pythonization_cache = PythonizationCache()

@functools.cached_property
def _copy_stream(self):
    """CUDA stream used for asynchronous GPU-to-CPU tensor copies.

    Exposed as a cached property so the stream is created lazily on
    first access (and then memoized) rather than eagerly in __init__;
    per the commit message, eager creation could make torch initialize
    a CUDA context on the wrong GPU device.
    """
    copy_stream = torch.cuda.Stream()
    return copy_stream

def make_model_input_from_broadcasted_tensor_dict(
self, tensor_dict: Dict[str, Any]) -> StatefulModelInput:
model_input = (StatefulModelInput.from_broadcasted_tensor_dict(
Expand Down

0 comments on commit f845a12

Please sign in to comment.