[core][distributed] accelerate distributed weight loading #6127

Closed

youkaichao wants to merge 39 commits into main from defer_tensor

Commits (39), all authored by youkaichao:
79b5348  add deferred tensor (Jul 4, 2024)
e49d7a3  fix tuple (Jul 4, 2024)
00db011  update model (Jul 4, 2024)
e27e5a6  format (Jul 4, 2024)
b7d2888  del file (Jul 4, 2024)
413e2b1  use torch.Size (Jul 4, 2024)
6eb782c  add more convert (Jul 4, 2024)
c12cd1a  more fix (Jul 4, 2024)
2b8a496  rename (Jul 4, 2024)
cfc896a  ensure_tensor (Jul 4, 2024)
ce69701  use type hint (Jul 4, 2024)
633a697  use copy_ (Jul 4, 2024)
7852efb  add logging (Jul 4, 2024)
91695ce  update default loader (Jul 4, 2024)
da9d486  fix using loaded_weight in load_weights (Jul 4, 2024)
6db5418  add more duck typing support (Jul 5, 2024)
971bb0c  revoke moe change (Jul 5, 2024)
907285b  revoke linear change (Jul 5, 2024)
ea9196e  revoke embedding change (Jul 5, 2024)
343212b  revoke more changes (Jul 5, 2024)
f41b2b5  add code (Jul 5, 2024)
ca2a9f7  update (Jul 5, 2024)
cb41625  test view (Jul 5, 2024)
24b0b3b  add torch.reshape (Jul 5, 2024)
86bc4d0  add t() (Jul 5, 2024)
1e29aaa  revoke more (Jul 5, 2024)
0180f0f  revoke more (Jul 5, 2024)
6e4c5d5  finish revoke (Jul 5, 2024)
b688f00  finish and try (Jul 5, 2024)
3375589  bugfix (Jul 5, 2024)
12af9f3  avoid bug of safetensors (Jul 8, 2024)
7f9011d  remove unused code (Jul 8, 2024)
f336117  update (Jul 10, 2024)
cb90984  lazy open file (Jul 10, 2024)
02a65f3  add more comments (Jul 10, 2024)
b17f6aa  bugfix (Jul 10, 2024)
0fcef6f  add more comments (Jul 10, 2024)
70afe57  bugfix (Jul 10, 2024)
4e0773c  Merge branch 'main' into defer_tensor (Jul 10, 2024)
49 changes: 49 additions & 0 deletions tests/distributed/test_utils.py
@@ -1,6 +1,9 @@
import ray
import torch

import vllm.envs as envs
from vllm.model_executor.model_loader.weight_utils import (
safetensors_weights_iterator)
from vllm.utils import (cuda_device_count_stateless, is_hip,
update_environment_variables)

@@ -36,3 +39,49 @@ def test_cuda_device_count_stateless():
assert ray.get(actor.get_count.remote()) == 1
ray.get(actor.set_cuda_visible_devices.remote(""))
assert ray.get(actor.get_count.remote()) == 0


def test_deferred_tensor():
from safetensors import safe_open
from safetensors.torch import save_file
tensors = {
"scalar": torch.ones(tuple()),
"vector": torch.ones(2),
"matrix": torch.ones((2, 3)),
"tensor": torch.ones((2, 3, 4)),
}
save_file(tensors, "model.safetensors")

for name, dt in safetensors_weights_iterator(["model.safetensors"]):
with safe_open("model.safetensors",
framework="pt") as f: # type: ignore
real_tensor = f.get_tensor(name)
real_tensor.copy_(dt) # test we can use `copy_`
stacked = torch.stack([real_tensor, real_tensor])
stacked[0] = dt # test we can use `__setitem__` to assign
if name != "scalar":
real_tensor[1:] = dt[1:] # test we can assign slices
if name in ["matrix", "tensor"]:
real_norm = torch.nn.functional.normalize(real_tensor)
dt_norm = torch.nn.functional.normalize(dt)
assert torch.allclose(real_norm,
dt_norm) # test we can use `normalize`
assert torch.allclose(real_tensor.cpu(),
dt.cpu()) # test we can move to device
assert torch.allclose(
real_tensor.to(dtype=torch.float64),
dt.to(dtype=torch.float64)) # test we can change dtype

assert torch.allclose(real_tensor + 1, dt + 1)
assert torch.allclose(real_tensor.float(),
dt.float()) # test we can use `.float()`
assert torch.allclose(real_tensor.data,
dt.data) # test we can use `.data`
assert torch.allclose(real_tensor.view(-1),
dt.view(-1)) # test we can use `view`
assert torch.allclose(torch.reshape(real_tensor, (-1, )),
torch.reshape(
dt, (-1, ))) # test we can use `reshape`
if name != "tensor":
assert torch.allclose(real_tensor.t(),
dt.t()) # test we can use `t()`
65 changes: 44 additions & 21 deletions vllm/model_executor/model_loader/weight_utils.py
@@ -6,7 +6,7 @@
import os
import tempfile
from collections import defaultdict
from typing import Any, Generator, Iterable, List, Optional, Tuple
from typing import Any, Dict, Generator, Iterable, List, Optional, Tuple

import filelock
import huggingface_hub.constants
@@ -22,6 +22,7 @@
from vllm.model_executor.layers.quantization import (QuantizationConfig,
get_quantization_config)
from vllm.model_executor.layers.quantization.schema import QuantParamSchema
from vllm.utils import DeferredTensor

logger = init_logger(__name__)

@@ -350,15 +351,52 @@ def np_cache_weights_iterator(
yield name, torch.from_numpy(param)


def _parse_metadata_from_safetensors(
filepath: str) -> Dict[str, Dict[str, Any]]:
# format from https://huggingface.co/docs/safetensors/en/index#format
with open(filepath, "rb") as f:
size = int.from_bytes(f.read(8), "little")
data = json.loads(f.read(size).decode('utf-8'))
return data
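
# For reference, the parsed header maps each tensor name (plus an optional
# "__metadata__" entry) to its dtype, shape, and byte offsets, roughly:
#   {"model.embed_tokens.weight": {"dtype": "F16",
#                                  "shape": [32000, 4096],
#                                  "data_offsets": [0, 262144000]}}
# (the key and values above are illustrative, not from a real checkpoint)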


def safetensors_weights_iterator(
hf_weights_files: List[str]
) -> Generator[Tuple[str, torch.Tensor], None, None]:
"""Iterate over the weights in the model safetensor files."""
"""Iterate over the weights in the model safetensor files.
NOTE: we read the file as lazily as possible. If this process
does not need any weight inside a safetensor file (e.g. pipeline
parallel), that file is not opened by safetensors library at all.
"""
st_handles: Dict[str, Any] = {}

def layz_open_st(filename):
# lazily open safetensor files
if filename not in st_handles:
st_handles[filename] = safe_open(filename,
framework="pt").__enter__()
return st_handles[filename]

name_and_tensors = []
for st_file in hf_weights_files:
with safe_open(st_file, framework="pt") as f:
for name in f.keys(): # noqa: SIM118
param = f.get_tensor(name)
yield name, param
data = _parse_metadata_from_safetensors(st_file)
for k, v in data.items():
if k == "__metadata__":
continue
dtype = v["dtype"]
shape = v["shape"]
name_and_tensors.append(
[k, DeferredTensor(layz_open_st, st_file, k, dtype, shape)])
for name, v in name_and_tensors:
# We actually return a DeferredTensor here, but use `torch.Tensor`
# as the type hint to avoid changing too much user-side code.
# Users can treat this value just like a torch.Tensor, except that
# slicing and `narrow` are optimized for I/O.
yield name, v

for v in st_handles.values():
v.__exit__(None, None, None) # type: ignore


def pt_weights_iterator(
@@ -413,21 +451,6 @@ def kv_cache_scales_loader(
return []


def convert_pyslice_to_tensor(x: Any) -> torch.Tensor:
"""convert PySafeSlice object from safetensors to torch.Tensor

PySafeSlice object supports indexing, which is done before loading the
actual tensor and can reduce the amount of memory being read into the
memory. However, it does not support more advanced functionalities
like `.view()` or `.t()`. Therefore, if we need to modify the loaded
tensor with these more complicated operators, we need to convert to
tensor first.
"""
if not isinstance(x, torch.Tensor):
x = x[:]
return x


def default_weight_loader(param: torch.Tensor,
loaded_weight: torch.Tensor) -> None:
"""Default weight loader."""
206 changes: 206 additions & 0 deletions vllm/utils.py
@@ -943,3 +943,209 @@ def parse_args(self, args=None, namespace=None):
processed_args.append(arg)

return super().parse_args(processed_args, namespace)


class DeferredTensor:
Collaborator: Why isn't it possible to subclass/use a meta tensor, for example?

Member Author: A meta tensor is now used to respond to metadata-related queries.
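
For illustration, this is roughly how a meta tensor can answer metadata queries without reading any data (a sketch, not the exact code in this PR):

    import torch
    meta = torch.zeros((4096, 11008), dtype=torch.float16, device="meta")
    meta.shape, meta.dtype, meta.stride()  # served from metadata alone, no I/O
    # something like meta.tolist() would fail: a meta tensor has no storage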

"""This class is a placeholder for a tensor that is not materialized yet.
When we pass the object around, it will not materialize the tensor until
torch functions are called on it.
Notable exceptions are `shape`, `dtype`, `size`, `stride` which will be
returned directly without materializing the tensor.
Notable optimization is `narrow` method which will only materialize the
tensor slice that is narrowed, reducing the disk reads. Either `torch.narrow`
or `tensor.narrow` will materialize the tensor.

Basically, you can use instances of this class when you need values of the
tensor, but don't need in-place update of the tensor.
""" # noqa

def __init__(self, layz_open_st, st_file, name, dtype, shape):
self.layz_open_st = layz_open_st
self.st_file = st_file
self.name = name

# code from https://github.com/huggingface/safetensors/blob/079781fd0dc455ba0fe851e2b4507c33d0c0d407/bindings/python/src/lib.rs#L40 # noqa
type_mapping = {
"BOOL": torch.bool,
"I8": torch.int8,
"U8": torch.uint8,
"I16": torch.int16,
"U16": torch.uint16,
"I32": torch.int32,
"U32": torch.uint32,
"I64": torch.int64,
"U64": torch.uint64,
"F16": torch.float16,
"F32": torch.float32,
"F64": torch.float64,
"BF16": torch.bfloat16,
"F8_E4M3": torch.float8_e4m3fn,
"F8_E5M2": torch.float8_e5m2
}
dtype = type_mapping[dtype]
shape = tuple(shape)
if shape:
self._meta_tensor = torch.zeros(*shape, dtype=dtype, device="meta")
else:
self._meta_tensor = torch.zeros(tuple(),
dtype=dtype,
device="meta")

def __getattr__(self, name):
if name in ["shape", "dtype", "size", "stride"]:
# redirect metadata information queries to the meta tensor
return getattr(self._meta_tensor, name)
if hasattr(torch.Tensor, name):
# all other torch.Tensor attributes materialize the tensor and are
# then resolved on the materialized tensor
tensor = self.materialize()
return getattr(tensor, name)
raise AttributeError(f"Attribute {name} not found")

def __getitem__(self, key) -> torch.Tensor:
return self.layz_open_st(self.st_file).get_slice(self.name)[key]

def materialize(self) -> torch.Tensor:
return self.layz_open_st(self.st_file).get_tensor(self.name)

def narrow(input, dim, start, length) -> torch.Tensor:
# `input` is a `DeferredTensor` object
# it does not use `self`, but `input` instead
# to better match https://pytorch.org/docs/stable/generated/torch.narrow.html signature # noqa

# `DeferredTensor` will only respond to `narrow` method
# which reads the corresponding data from disk and returns
# a materialized tensor
slices = [slice(None, None, None) for x in input._meta_tensor.shape]
slices[dim] = slice(start, start + length)
return input[tuple(slices)]

@classmethod
def __torch_function__(cls, func, types, args=(), kwargs=None):
if kwargs is None:
kwargs = {}
if func == torch.narrow:
if len(args) >= 2:
kwargs["dim"] = args[1]
if len(args) >= 3:
kwargs["start"] = args[2]
if len(args) >= 4:
kwargs["length"] = args[3]
return args[0].narrow(**kwargs)
new_args = []
for arg in args:
if isinstance(arg, DeferredTensor):
new_args.append(arg.materialize())
else:
new_args.append(arg)
return func(*new_args, **kwargs)

# implement common tensor operations, except for in-place operations
Collaborator: Why do we need those in the first place?

Member Author: I wrote this code so that the deferred tensors behave just like normal tensors. For example, some users call torch.nn.functional.normalize on tensors loaded from disk:

    loaded_weight = torch.nn.functional.normalize(...)

When we have these functions, users' code is minimally affected.
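
As a rough illustration of the mechanism (a simplified sketch with a hypothetical `_Deferred` wrapper, not the exact class in this PR), `__torch_function__` can materialize deferred arguments before dispatching, so torch functions receive plain tensors and existing call sites keep working:

    import torch

    class _Deferred:
        def __init__(self, tensor):
            self._tensor = tensor  # stands in for data read lazily from disk

        def materialize(self) -> torch.Tensor:
            return self._tensor

        @classmethod
        def __torch_function__(cls, func, types, args=(), kwargs=None):
            kwargs = kwargs or {}
            # swap any deferred argument for its materialized tensor
            args = tuple(a.materialize() if isinstance(a, _Deferred) else a
                         for a in args)
            return func(*args, **kwargs)

    w = _Deferred(torch.randn(4, 8))
    out = torch.nn.functional.normalize(w)  # user code is unchanged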

Collaborator: I think this will lead to issues later down the line (we now have an ill-defined subset of supported operations). I feel like explicitly requiring materialization is a better way.

Member Author: You can refer to da9d486 for an example; that is what I did before. I found that code more intrusive, and all third-party code using vLLM might break.

Collaborator: I think this is too magical and being explicit would be better, but I am not going to block on this.

Commenter: @Yard1 would you consider elaborating? I am not sure I agree that explicitly presenting the tensor bytearray content, in a person's mental model, ordered with respect to consumption of said bytearray, is wise, especially for such a large read-only buffer. I am of the opinion that any large read-only memory region is a massive target for performance optimization.

I have been working on a very small PR designed to improve availability by eliminating the model-load startup delay (about 1 minute on my gear; I can shave it to about 3 seconds). This is pretty straightforward to deliver, presuming that the logit operation does not need to be calculated on startup.

The gist of my WIP PR is:

  • Remove the logits calculation, which takes 12 seconds on my quite beastly GPU server with two A40s running llama3-70b-w8a8.
  • Rely on safetensors exclusively.
  • Modify, slightly, the management of PyTorch storage so that if a storage buffer address is accessed via any Tensor.to('cuda'), there is some intelligence applied to the h2d memcpy.
  • Modify, slightly, to('cuda') calls to send Tensor objects to CUDA devices, but not to execute a CUDA host-to-device copy (as is the default in PyTorch).

I would be interested in hearing your opinions on the approach, and a review when ready. There are some constraints:

  • safetensors-first implementation
  • CUDA-first implementation
  • CUDA must have UVM loaded
  • dedicated address-space consumption of the model size
  • dedicated (initially) memory consumption of the model size (this is an obnoxious constraint, and is not permanent)
  • no run-time quantization (I prefer statically compressed models, although I understand the value of some types of dynamic decompression, so we can kick this around)

One problem I struggle with is not technical: making this an optional feature (via a runtime environment variable) versus adding it as a flag.

Thanks, writing down my ideas in this area was helpful; I will copy this into an issue tracker.
cc/ @sdake
cc/ @robertgshaw2-neuralmagic

def __add__(self, other):
return self.materialize() + other

def __radd__(self, other):
return other + self.materialize()

def __sub__(self, other):
return self.materialize() - other

def __rsub__(self, other):
return other - self.materialize()

def __mul__(self, other):
return self.materialize() * other

def __rmul__(self, other):
return other * self.materialize()

def __truediv__(self, other):
return self.materialize() / other

def __rtruediv__(self, other):
return other / self.materialize()

def __floordiv__(self, other):
return self.materialize() // other

def __rfloordiv__(self, other):
return other // self.materialize()

def __mod__(self, other):
return self.materialize() % other

def __rmod__(self, other):
return other % self.materialize()

def __pow__(self, other):
return self.materialize()**other

def __rpow__(self, other):
return other**self.materialize()

def __matmul__(self, other):
return self.materialize() @ other

def __rmatmul__(self, other):
return other @ self.materialize()

def __and__(self, other):
return self.materialize() & other

def __rand__(self, other):
return other & self.materialize()

def __or__(self, other):
return self.materialize() | other

def __ror__(self, other):
return other | self.materialize()

def __xor__(self, other):
return self.materialize() ^ other

def __rxor__(self, other):
return other ^ self.materialize()

def __lshift__(self, other):
return self.materialize() << other

def __rlshift__(self, other):
return other << self.materialize()

def __rshift__(self, other):
return self.materialize() >> other

def __rrshift__(self, other):
return other >> self.materialize()

def __eq__(self, other):
return self.materialize() == other

def __ne__(self, other):
return self.materialize() != other

def __lt__(self, other):
return self.materialize() < other

def __le__(self, other):
return self.materialize() <= other

def __gt__(self, other):
return self.materialize() > other

def __ge__(self, other):
return self.materialize() >= other

def __neg__(self):
return -self.materialize()

def __pos__(self):
return +self.materialize()

def __abs__(self):
return abs(self.materialize())

def __invert__(self):
return ~self.materialize()
8 changes: 6 additions & 2 deletions vllm/worker/model_runner.py
@@ -252,6 +252,8 @@ def __init__(
self.flashinfer_prefill_wrapper = None

def load_model(self) -> None:
logger.info("Start loading model")
start_time = time.perf_counter()
with CudaMemoryProfiler() as m:
self.model = get_model(model_config=self.model_config,
device_config=self.device_config,
@@ -261,10 +263,12 @@
parallel_config=self.parallel_config,
scheduler_config=self.scheduler_config,
cache_config=self.cache_config)
end_time = time.perf_counter()
elapsed_time = end_time - start_time

self.model_memory_usage = m.consumed_memory
logger.info("Loading model weights took %.4f GB",
self.model_memory_usage / float(2**30))
logger.info("Loading model weights took %.4f GB memory and %.4f sec",
self.model_memory_usage / float(2**30), elapsed_time)

if self.lora_config:
assert supports_lora(self.model), "Model does not support LoRA"