From d4b79cec41bc5756232e5c4894851151f6f49c37 Mon Sep 17 00:00:00 2001 From: Dipika Date: Fri, 23 Aug 2024 21:05:37 +0000 Subject: [PATCH 1/5] update compressed tensors lifecycle to remove prefix from create_weights --- vllm/model_executor/layers/linear.py | 9 ++---- .../compressed_tensors/compressed_tensors.py | 28 ++++++++----------- .../schemes/compressed_tensors_unquantized.py | 9 +++--- 3 files changed, 20 insertions(+), 26 deletions(-) diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index e5b40a64abc41..1e8b0514d6762 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -208,8 +208,7 @@ def __init__(self, self.input_size, self.output_size, self.params_dtype, - weight_loader=self.weight_loader, - prefix=prefix) + weight_loader=self.weight_loader) if bias: self.bias = Parameter( @@ -307,8 +306,7 @@ def __init__(self, params_dtype=self.params_dtype, weight_loader=( self.weight_loader_v2 if self.quant_method.__class__.__name__ - in WEIGHT_LOADER_V2_SUPPORTED else self.weight_loader), - prefix=prefix) + in WEIGHT_LOADER_V2_SUPPORTED else self.weight_loader)) if bias: self.bias = Parameter( torch.empty(self.output_size_per_partition, @@ -976,8 +974,7 @@ def __init__(self, params_dtype=self.params_dtype, weight_loader=( self.weight_loader_v2 if self.quant_method.__class__.__name__ - in WEIGHT_LOADER_V2_SUPPORTED else self.weight_loader), - prefix=prefix) + in WEIGHT_LOADER_V2_SUPPORTED else self.weight_loader)) if not reduce_results and (bias and not skip_bias_add): raise ValueError("When not reduce the results, adding bias to the " "results can lead to incorrect results") diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py index ae75781927381..3225d3c6d2942 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py @@ -1,4 +1,4 @@ -from typing import Any, Dict, List, Optional +from typing import Any, Dict, List, Optional, Union import torch from pydantic import BaseModel @@ -52,15 +52,20 @@ def get_min_capability(cls) -> int: def get_name(self) -> str: return "compressed_tensors" - # TODO (@robertgshaw2-neuralmagic): do layer skipping though here - # rather than though create_weights to match other methods - def get_quant_method( + def get_quant_method( #type: ignore self, layer: torch.nn.Module, prefix: str, - ) -> Optional["QuantizeMethodBase"]: + ) -> Optional[Union["CompressedTensorsUnquantized", "QuantizeMethodBase"]]: from vllm.attention.layer import Attention # Avoid circular import + + # Check if the layer is skipped for quantization. + # TODO (@robertgshaw2): support module names + if should_ignore_layer(prefix, ignore=self.ignore): + return CompressedTensorsUnquantized() if isinstance(layer, LinearBase): + scheme = self.get_scheme(layer=layer, layer_name=prefix) + layer.scheme = scheme return CompressedTensorsLinearMethod(self) if isinstance(layer, Attention): return CompressedTensorsKVCacheMethod(self) @@ -281,15 +286,11 @@ def get_scheme( to select the CompressedTensorsScheme used for infernece. """ - # Check if the layer is skipped for quantization. 
- # TODO (@robertgshaw2): support module names - if should_ignore_layer(layer_name, ignore=self.ignore): - return CompressedTensorsUnquantized() - # Find the "target" in the compressed-tensors config # that our layer conforms to. # TODO (@robertgshaw): add compressed-tensors as dep # so we do not have to re-write these functions + # need to make accelerate optional in ct to do this matched_target = find_matched_target( layer_name=layer_name, module=layer, @@ -327,10 +328,7 @@ def create_weights(self, layer: torch.nn.Module, details """ weight_loader = extra_weight_attrs.get("weight_loader") - layer_name = extra_weight_attrs.get("prefix") - - scheme = self.quantization_config.get_scheme(layer, layer_name) - scheme.create_weights( + layer.scheme.create_weights( layer=layer, input_size=input_size, input_size_per_partition=input_size_per_partition, @@ -339,8 +337,6 @@ def create_weights(self, layer: torch.nn.Module, params_dtype=params_dtype, weight_loader=weight_loader) - layer.scheme = scheme - def apply(self, layer: torch.nn.Module, x: torch.Tensor, diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_unquantized.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_unquantized.py index 2e8d520eacc81..a0edc43b4b8ea 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_unquantized.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_unquantized.py @@ -1,4 +1,4 @@ -from typing import Callable, List, Optional +from typing import List, Optional import torch import torch.nn.functional as F @@ -28,11 +28,12 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: requires_grad=False) def create_weights(self, layer: torch.nn.Module, - output_partition_sizes: List[int], input_size_per_partition: int, - params_dtype: torch.dtype, weight_loader: Callable, - **kwargs): + output_partition_sizes: List[int], input_size: int, + output_size: int, params_dtype: torch.dtype, + **extra_weight_attrs): + weight_loader = extra_weight_attrs.get("weight_loader") weight = ModelWeightParameter(data=torch.empty( sum(output_partition_sizes), input_size_per_partition, From bf7fc4d70f41d9076b52c6b6229d0ecaee2cce61 Mon Sep 17 00:00:00 2001 From: Dipika Date: Sun, 25 Aug 2024 23:49:44 +0000 Subject: [PATCH 2/5] use UnquantizedLinearMethod --- .../compressed_tensors/compressed_tensors.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py index 3225d3c6d2942..0a76c73679728 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py @@ -3,15 +3,15 @@ import torch from pydantic import BaseModel -from vllm.model_executor.layers.linear import LinearBase, LinearMethodBase +from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase, + UnquantizedLinearMethod) from vllm.model_executor.layers.quantization.base_config import ( # noqa: E501 QuantizationConfig, QuantizeMethodBase) from vllm.model_executor.layers.quantization.compressed_tensors.schemes import ( W4A16SPARSE24_SUPPORTED_BITS, WNA16_SUPPORTED_BITS, - CompressedTensorsScheme, CompressedTensorsUnquantized, - CompressedTensorsW4A16Sparse24, 
CompressedTensorsW8A8Fp8, - CompressedTensorsW8A8Int8, CompressedTensorsW8A16Fp8, - CompressedTensorsWNA16) + CompressedTensorsScheme, CompressedTensorsW4A16Sparse24, + CompressedTensorsW8A8Fp8, CompressedTensorsW8A8Int8, + CompressedTensorsW8A16Fp8, CompressedTensorsWNA16) from vllm.model_executor.layers.quantization.compressed_tensors.utils import ( CompressionFormat, QuantizationArgs, QuantizationStrategy, QuantizationType, find_matched_target, is_activation_quantization_format, @@ -52,17 +52,17 @@ def get_min_capability(cls) -> int: def get_name(self) -> str: return "compressed_tensors" - def get_quant_method( #type: ignore + def get_quant_method( self, layer: torch.nn.Module, prefix: str, - ) -> Optional[Union["CompressedTensorsUnquantized", "QuantizeMethodBase"]]: + ) -> Optional[Union["QuantizeMethodBase"]]: from vllm.attention.layer import Attention # Avoid circular import # Check if the layer is skipped for quantization. # TODO (@robertgshaw2): support module names if should_ignore_layer(prefix, ignore=self.ignore): - return CompressedTensorsUnquantized() + return UnquantizedLinearMethod() if isinstance(layer, LinearBase): scheme = self.get_scheme(layer=layer, layer_name=prefix) layer.scheme = scheme @@ -351,7 +351,7 @@ def apply(self, scheme = layer.scheme if scheme is None: raise ValueError("A scheme must be defined for each layer") - return scheme.apply_weights(layer, x, bias=bias) + return scheme.apply(layer, x, bias=bias) class CompressedTensorsKVCacheMethod(BaseKVCacheMethod): From f598955b8308d9774974dce1a00543edfac050c2 Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Sun, 25 Aug 2024 20:35:03 -0400 Subject: [PATCH 3/5] Update compressed_tensors.py --- .../quantization/compressed_tensors/compressed_tensors.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py index 0a76c73679728..5a3f5ac571f4e 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py @@ -351,7 +351,7 @@ def apply(self, scheme = layer.scheme if scheme is None: raise ValueError("A scheme must be defined for each layer") - return scheme.apply(layer, x, bias=bias) + return scheme.apply_weights(layer, x, bias=bias) class CompressedTensorsKVCacheMethod(BaseKVCacheMethod): From 946787149b47b6402f97822b3ebcdd6bd0f38911 Mon Sep 17 00:00:00 2001 From: Dipika Date: Mon, 26 Aug 2024 14:48:21 +0000 Subject: [PATCH 4/5] remove unquantized scheme --- .../compressed_tensors/schemes/__init__.py | 2 - .../schemes/compressed_tensors_unquantized.py | 50 ------------------- 2 files changed, 52 deletions(-) delete mode 100644 vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_unquantized.py diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py index ca9e286ce5b2d..5d259ec72051c 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py @@ -1,5 +1,4 @@ from .compressed_tensors_scheme import CompressedTensorsScheme -from .compressed_tensors_unquantized import CompressedTensorsUnquantized from .compressed_tensors_w4a16_24 import (W4A16SPARSE24_SUPPORTED_BITS, 
CompressedTensorsW4A16Sparse24) from .compressed_tensors_w8a8_fp8 import CompressedTensorsW8A8Fp8 @@ -10,7 +9,6 @@ __all__ = [ "CompressedTensorsScheme", - "CompressedTensorsUnquantized", "CompressedTensorsWNA16", "CompressedTensorsW8A16Fp8", "CompressedTensorsW4A16Sparse24", diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_unquantized.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_unquantized.py deleted file mode 100644 index a0edc43b4b8ea..0000000000000 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_unquantized.py +++ /dev/null @@ -1,50 +0,0 @@ -from typing import List, Optional - -import torch -import torch.nn.functional as F - -from vllm.model_executor.layers.quantization.compressed_tensors.schemes import ( - CompressedTensorsScheme) -from vllm.model_executor.parameter import ModelWeightParameter - -__all__ = ["CompressedTensorsUnquantized"] - - -class CompressedTensorsUnquantized(CompressedTensorsScheme): - """ - Implements the scheme for all layers which are ignored - in the CompressedTensors config. The input and loaded weight are used - in a linear transformation. - """ - - @classmethod - def get_min_capability(cls) -> int: - # volta and up - return 70 - - def process_weights_after_loading(self, layer: torch.nn.Module) -> None: - # required by torch.compile to be torch.nn.Parameter - layer.weight = torch.nn.Parameter(layer.weight.data, - requires_grad=False) - - def create_weights(self, layer: torch.nn.Module, - input_size_per_partition: int, - output_partition_sizes: List[int], input_size: int, - output_size: int, params_dtype: torch.dtype, - **extra_weight_attrs): - - weight_loader = extra_weight_attrs.get("weight_loader") - weight = ModelWeightParameter(data=torch.empty( - sum(output_partition_sizes), - input_size_per_partition, - dtype=params_dtype), - input_dim=1, - output_dim=0, - weight_loader=weight_loader) - - layer.register_parameter("weight", weight) - - def apply_weights(self, layer: torch.nn.Module, x: torch.Tensor, - bias: Optional[torch.Tensor]) -> torch.Tensor: - - return F.linear(x, layer.weight, bias) From 7675add4eb1716a42158d47099c14730dcdfcc9d Mon Sep 17 00:00:00 2001 From: Dipika Date: Mon, 26 Aug 2024 21:06:43 +0000 Subject: [PATCH 5/5] remove Union --- .../quantization/compressed_tensors/compressed_tensors.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py index 5a3f5ac571f4e..f0e0b9db80884 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py @@ -1,4 +1,4 @@ -from typing import Any, Dict, List, Optional, Union +from typing import Any, Dict, List, Optional import torch from pydantic import BaseModel @@ -56,7 +56,7 @@ def get_quant_method( self, layer: torch.nn.Module, prefix: str, - ) -> Optional[Union["QuantizeMethodBase"]]: + ) -> Optional["QuantizeMethodBase"]: from vllm.attention.layer import Attention # Avoid circular import # Check if the layer is skipped for quantization.
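
# --- Reviewer sketch (not part of the patch): end state of this series ---
# A simplified, abbreviated illustration only, not the verbatim vLLM code:
# class bodies and scheme-resolution logic are elided, type hints trimmed,
# and the helper import path for should_ignore_layer is assumed from the
# existing utils imports in compressed_tensors.py. The point is the new
# lifecycle: the scheme is resolved once in get_quant_method() (where the
# layer prefix is still available) and cached on the layer, so
# create_weights() no longer needs a `prefix` argument, and ignored layers
# fall back to the stock UnquantizedLinearMethod instead of the removed
# CompressedTensorsUnquantized scheme.

from typing import Optional

import torch

from vllm.model_executor.layers.linear import (LinearBase,
                                               UnquantizedLinearMethod)
from vllm.model_executor.layers.quantization.compressed_tensors.utils import (
    should_ignore_layer)  # path assumed; matches the file's other utils imports


class CompressedTensorsConfig:          # abbreviated stand-in

    def get_quant_method(self, layer: torch.nn.Module, prefix: str):
        # Skipped layers are handled here (rather than in create_weights),
        # using the generic unquantized linear method.
        if should_ignore_layer(prefix, ignore=self.ignore):
            return UnquantizedLinearMethod()
        if isinstance(layer, LinearBase):
            # Resolve the scheme while the prefix is still known and cache
            # it on the layer for create_weights()/apply() to use later.
            layer.scheme = self.get_scheme(layer=layer, layer_name=prefix)
            return CompressedTensorsLinearMethod(self)
        return None


class CompressedTensorsLinearMethod:    # abbreviated stand-in

    def __init__(self, quantization_config):
        self.quantization_config = quantization_config

    def create_weights(self, layer, input_size_per_partition,
                       output_partition_sizes, input_size, output_size,
                       params_dtype, **extra_weight_attrs):
        # No `prefix` kwarg any more: the scheme was already attached in
        # get_quant_method(), so this simply delegates to it.
        layer.scheme.create_weights(
            layer=layer,
            input_size=input_size,
            input_size_per_partition=input_size_per_partition,
            output_partition_sizes=output_partition_sizes,
            output_size=output_size,
            params_dtype=params_dtype,
            weight_loader=extra_weight_attrs.get("weight_loader"))

    def apply(self, layer, x, bias: Optional[torch.Tensor] = None):
        # Patch 3 keeps the scheme entry point named apply_weights().
        if layer.scheme is None:
            raise ValueError("A scheme must be defined for each layer")
        return layer.scheme.apply_weights(layer, x, bias=bias)

# Net effect: linear.py no longer threads `prefix` into create_weights(),
# and the per-layer scheme travels on the module itself.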