diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index e5b40a64abc41..1e8b0514d6762 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -208,8 +208,7 @@ def __init__(self, self.input_size, self.output_size, self.params_dtype, - weight_loader=self.weight_loader, - prefix=prefix) + weight_loader=self.weight_loader) if bias: self.bias = Parameter( @@ -307,8 +306,7 @@ def __init__(self, params_dtype=self.params_dtype, weight_loader=( self.weight_loader_v2 if self.quant_method.__class__.__name__ - in WEIGHT_LOADER_V2_SUPPORTED else self.weight_loader), - prefix=prefix) + in WEIGHT_LOADER_V2_SUPPORTED else self.weight_loader)) if bias: self.bias = Parameter( torch.empty(self.output_size_per_partition, @@ -976,8 +974,7 @@ def __init__(self, params_dtype=self.params_dtype, weight_loader=( self.weight_loader_v2 if self.quant_method.__class__.__name__ - in WEIGHT_LOADER_V2_SUPPORTED else self.weight_loader), - prefix=prefix) + in WEIGHT_LOADER_V2_SUPPORTED else self.weight_loader)) if not reduce_results and (bias and not skip_bias_add): raise ValueError("When not reduce the results, adding bias to the " "results can lead to incorrect results") diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py index ae75781927381..f0e0b9db80884 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py @@ -3,15 +3,15 @@ import torch from pydantic import BaseModel -from vllm.model_executor.layers.linear import LinearBase, LinearMethodBase +from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase, + UnquantizedLinearMethod) from vllm.model_executor.layers.quantization.base_config import ( # noqa: E501 QuantizationConfig, QuantizeMethodBase) from vllm.model_executor.layers.quantization.compressed_tensors.schemes import ( W4A16SPARSE24_SUPPORTED_BITS, WNA16_SUPPORTED_BITS, - CompressedTensorsScheme, CompressedTensorsUnquantized, - CompressedTensorsW4A16Sparse24, CompressedTensorsW8A8Fp8, - CompressedTensorsW8A8Int8, CompressedTensorsW8A16Fp8, - CompressedTensorsWNA16) + CompressedTensorsScheme, CompressedTensorsW4A16Sparse24, + CompressedTensorsW8A8Fp8, CompressedTensorsW8A8Int8, + CompressedTensorsW8A16Fp8, CompressedTensorsWNA16) from vllm.model_executor.layers.quantization.compressed_tensors.utils import ( CompressionFormat, QuantizationArgs, QuantizationStrategy, QuantizationType, find_matched_target, is_activation_quantization_format, @@ -52,15 +52,20 @@ def get_min_capability(cls) -> int: def get_name(self) -> str: return "compressed_tensors" - # TODO (@robertgshaw2-neuralmagic): do layer skipping though here - # rather than though create_weights to match other methods def get_quant_method( self, layer: torch.nn.Module, prefix: str, ) -> Optional["QuantizeMethodBase"]: from vllm.attention.layer import Attention # Avoid circular import + + # Check if the layer is skipped for quantization. + # TODO (@robertgshaw2): support module names + if should_ignore_layer(prefix, ignore=self.ignore): + return UnquantizedLinearMethod() if isinstance(layer, LinearBase): + scheme = self.get_scheme(layer=layer, layer_name=prefix) + layer.scheme = scheme return CompressedTensorsLinearMethod(self) if isinstance(layer, Attention): return CompressedTensorsKVCacheMethod(self) @@ -281,15 +286,11 @@ def get_scheme( to select the CompressedTensorsScheme used for infernece. """ - # Check if the layer is skipped for quantization. - # TODO (@robertgshaw2): support module names - if should_ignore_layer(layer_name, ignore=self.ignore): - return CompressedTensorsUnquantized() - # Find the "target" in the compressed-tensors config # that our layer conforms to. # TODO (@robertgshaw): add compressed-tensors as dep # so we do not have to re-write these functions + # need to make accelerate optional in ct to do this matched_target = find_matched_target( layer_name=layer_name, module=layer, @@ -327,10 +328,7 @@ def create_weights(self, layer: torch.nn.Module, details """ weight_loader = extra_weight_attrs.get("weight_loader") - layer_name = extra_weight_attrs.get("prefix") - - scheme = self.quantization_config.get_scheme(layer, layer_name) - scheme.create_weights( + layer.scheme.create_weights( layer=layer, input_size=input_size, input_size_per_partition=input_size_per_partition, @@ -339,8 +337,6 @@ def create_weights(self, layer: torch.nn.Module, params_dtype=params_dtype, weight_loader=weight_loader) - layer.scheme = scheme - def apply(self, layer: torch.nn.Module, x: torch.Tensor, diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py index ca9e286ce5b2d..5d259ec72051c 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py @@ -1,5 +1,4 @@ from .compressed_tensors_scheme import CompressedTensorsScheme -from .compressed_tensors_unquantized import CompressedTensorsUnquantized from .compressed_tensors_w4a16_24 import (W4A16SPARSE24_SUPPORTED_BITS, CompressedTensorsW4A16Sparse24) from .compressed_tensors_w8a8_fp8 import CompressedTensorsW8A8Fp8 @@ -10,7 +9,6 @@ __all__ = [ "CompressedTensorsScheme", - "CompressedTensorsUnquantized", "CompressedTensorsWNA16", "CompressedTensorsW8A16Fp8", "CompressedTensorsW4A16Sparse24", diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_unquantized.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_unquantized.py deleted file mode 100644 index 2e8d520eacc81..0000000000000 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_unquantized.py +++ /dev/null @@ -1,49 +0,0 @@ -from typing import Callable, List, Optional - -import torch -import torch.nn.functional as F - -from vllm.model_executor.layers.quantization.compressed_tensors.schemes import ( - CompressedTensorsScheme) -from vllm.model_executor.parameter import ModelWeightParameter - -__all__ = ["CompressedTensorsUnquantized"] - - -class CompressedTensorsUnquantized(CompressedTensorsScheme): - """ - Implements the scheme for all layers which are ignored - in the CompressedTensors config. The input and loaded weight are used - in a linear transformation. - """ - - @classmethod - def get_min_capability(cls) -> int: - # volta and up - return 70 - - def process_weights_after_loading(self, layer: torch.nn.Module) -> None: - # required by torch.compile to be torch.nn.Parameter - layer.weight = torch.nn.Parameter(layer.weight.data, - requires_grad=False) - - def create_weights(self, layer: torch.nn.Module, - output_partition_sizes: List[int], - input_size_per_partition: int, - params_dtype: torch.dtype, weight_loader: Callable, - **kwargs): - - weight = ModelWeightParameter(data=torch.empty( - sum(output_partition_sizes), - input_size_per_partition, - dtype=params_dtype), - input_dim=1, - output_dim=0, - weight_loader=weight_loader) - - layer.register_parameter("weight", weight) - - def apply_weights(self, layer: torch.nn.Module, x: torch.Tensor, - bias: Optional[torch.Tensor]) -> torch.Tensor: - - return F.linear(x, layer.weight, bias)