From d4b79cec41bc5756232e5c4894851151f6f49c37 Mon Sep 17 00:00:00 2001 From: Dipika Date: Fri, 23 Aug 2024 21:05:37 +0000 Subject: [PATCH 1/5] update compressed tensors lifecycle to remove prefix from create_weights --- vllm/model_executor/layers/linear.py | 9 ++---- .../compressed_tensors/compressed_tensors.py | 28 ++++++++----------- .../schemes/compressed_tensors_unquantized.py | 9 +++--- 3 files changed, 20 insertions(+), 26 deletions(-) diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index e5b40a64abc41..1e8b0514d6762 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -208,8 +208,7 @@ def __init__(self, self.input_size, self.output_size, self.params_dtype, - weight_loader=self.weight_loader, - prefix=prefix) + weight_loader=self.weight_loader) if bias: self.bias = Parameter( @@ -307,8 +306,7 @@ def __init__(self, params_dtype=self.params_dtype, weight_loader=( self.weight_loader_v2 if self.quant_method.__class__.__name__ - in WEIGHT_LOADER_V2_SUPPORTED else self.weight_loader), - prefix=prefix) + in WEIGHT_LOADER_V2_SUPPORTED else self.weight_loader)) if bias: self.bias = Parameter( torch.empty(self.output_size_per_partition, @@ -976,8 +974,7 @@ def __init__(self, params_dtype=self.params_dtype, weight_loader=( self.weight_loader_v2 if self.quant_method.__class__.__name__ - in WEIGHT_LOADER_V2_SUPPORTED else self.weight_loader), - prefix=prefix) + in WEIGHT_LOADER_V2_SUPPORTED else self.weight_loader)) if not reduce_results and (bias and not skip_bias_add): raise ValueError("When not reduce the results, adding bias to the " "results can lead to incorrect results") diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py index ae75781927381..3225d3c6d2942 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py @@ -1,4 +1,4 @@ -from typing import Any, Dict, List, Optional +from typing import Any, Dict, List, Optional, Union import torch from pydantic import BaseModel @@ -52,15 +52,20 @@ def get_min_capability(cls) -> int: def get_name(self) -> str: return "compressed_tensors" - # TODO (@robertgshaw2-neuralmagic): do layer skipping though here - # rather than though create_weights to match other methods - def get_quant_method( + def get_quant_method( #type: ignore self, layer: torch.nn.Module, prefix: str, - ) -> Optional["QuantizeMethodBase"]: + ) -> Optional[Union["CompressedTensorsUnquantized", "QuantizeMethodBase"]]: from vllm.attention.layer import Attention # Avoid circular import + + # Check if the layer is skipped for quantization. + # TODO (@robertgshaw2): support module names + if should_ignore_layer(prefix, ignore=self.ignore): + return CompressedTensorsUnquantized() if isinstance(layer, LinearBase): + scheme = self.get_scheme(layer=layer, layer_name=prefix) + layer.scheme = scheme return CompressedTensorsLinearMethod(self) if isinstance(layer, Attention): return CompressedTensorsKVCacheMethod(self) @@ -281,15 +286,11 @@ def get_scheme( to select the CompressedTensorsScheme used for infernece. """ - # Check if the layer is skipped for quantization. 
- # TODO (@robertgshaw2): support module names - if should_ignore_layer(layer_name, ignore=self.ignore): - return CompressedTensorsUnquantized() - # Find the "target" in the compressed-tensors config # that our layer conforms to. # TODO (@robertgshaw): add compressed-tensors as dep # so we do not have to re-write these functions + # need to make accelerate optional in ct to do this matched_target = find_matched_target( layer_name=layer_name, module=layer, @@ -327,10 +328,7 @@ def create_weights(self, layer: torch.nn.Module, details """ weight_loader = extra_weight_attrs.get("weight_loader") - layer_name = extra_weight_attrs.get("prefix") - - scheme = self.quantization_config.get_scheme(layer, layer_name) - scheme.create_weights( + layer.scheme.create_weights( layer=layer, input_size=input_size, input_size_per_partition=input_size_per_partition, @@ -339,8 +337,6 @@ def create_weights(self, layer: torch.nn.Module, params_dtype=params_dtype, weight_loader=weight_loader) - layer.scheme = scheme - def apply(self, layer: torch.nn.Module, x: torch.Tensor, diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_unquantized.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_unquantized.py index 2e8d520eacc81..a0edc43b4b8ea 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_unquantized.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_unquantized.py @@ -1,4 +1,4 @@ -from typing import Callable, List, Optional +from typing import List, Optional import torch import torch.nn.functional as F @@ -28,11 +28,12 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: requires_grad=False) def create_weights(self, layer: torch.nn.Module, - output_partition_sizes: List[int], input_size_per_partition: int, - params_dtype: torch.dtype, weight_loader: Callable, - **kwargs): + output_partition_sizes: List[int], input_size: int, + output_size: int, params_dtype: torch.dtype, + **extra_weight_attrs): + weight_loader = extra_weight_attrs.get("weight_loader") weight = ModelWeightParameter(data=torch.empty( sum(output_partition_sizes), input_size_per_partition, From bf7fc4d70f41d9076b52c6b6229d0ecaee2cce61 Mon Sep 17 00:00:00 2001 From: Dipika Date: Sun, 25 Aug 2024 23:49:44 +0000 Subject: [PATCH 2/5] use UnquantizedLinearMethod --- .../compressed_tensors/compressed_tensors.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py index 3225d3c6d2942..0a76c73679728 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py @@ -3,15 +3,15 @@ import torch from pydantic import BaseModel -from vllm.model_executor.layers.linear import LinearBase, LinearMethodBase +from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase, + UnquantizedLinearMethod) from vllm.model_executor.layers.quantization.base_config import ( # noqa: E501 QuantizationConfig, QuantizeMethodBase) from vllm.model_executor.layers.quantization.compressed_tensors.schemes import ( W4A16SPARSE24_SUPPORTED_BITS, WNA16_SUPPORTED_BITS, - CompressedTensorsScheme, CompressedTensorsUnquantized, - CompressedTensorsW4A16Sparse24, 
CompressedTensorsW8A8Fp8, - CompressedTensorsW8A8Int8, CompressedTensorsW8A16Fp8, - CompressedTensorsWNA16) + CompressedTensorsScheme, CompressedTensorsW4A16Sparse24, + CompressedTensorsW8A8Fp8, CompressedTensorsW8A8Int8, + CompressedTensorsW8A16Fp8, CompressedTensorsWNA16) from vllm.model_executor.layers.quantization.compressed_tensors.utils import ( CompressionFormat, QuantizationArgs, QuantizationStrategy, QuantizationType, find_matched_target, is_activation_quantization_format, @@ -52,17 +52,17 @@ def get_min_capability(cls) -> int: def get_name(self) -> str: return "compressed_tensors" - def get_quant_method( #type: ignore + def get_quant_method( self, layer: torch.nn.Module, prefix: str, - ) -> Optional[Union["CompressedTensorsUnquantized", "QuantizeMethodBase"]]: + ) -> Optional[Union["QuantizeMethodBase"]]: from vllm.attention.layer import Attention # Avoid circular import # Check if the layer is skipped for quantization. # TODO (@robertgshaw2): support module names if should_ignore_layer(prefix, ignore=self.ignore): - return CompressedTensorsUnquantized() + return UnquantizedLinearMethod() if isinstance(layer, LinearBase): scheme = self.get_scheme(layer=layer, layer_name=prefix) layer.scheme = scheme @@ -351,7 +351,7 @@ def apply(self, scheme = layer.scheme if scheme is None: raise ValueError("A scheme must be defined for each layer") - return scheme.apply_weights(layer, x, bias=bias) + return scheme.apply(layer, x, bias=bias) class CompressedTensorsKVCacheMethod(BaseKVCacheMethod): From f598955b8308d9774974dce1a00543edfac050c2 Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Sun, 25 Aug 2024 20:35:03 -0400 Subject: [PATCH 3/5] Update compressed_tensors.py --- .../quantization/compressed_tensors/compressed_tensors.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py index 0a76c73679728..5a3f5ac571f4e 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py @@ -351,7 +351,7 @@ def apply(self, scheme = layer.scheme if scheme is None: raise ValueError("A scheme must be defined for each layer") - return scheme.apply(layer, x, bias=bias) + return scheme.apply_weights(layer, x, bias=bias) class CompressedTensorsKVCacheMethod(BaseKVCacheMethod): From 946787149b47b6402f97822b3ebcdd6bd0f38911 Mon Sep 17 00:00:00 2001 From: Dipika Date: Mon, 26 Aug 2024 14:48:21 +0000 Subject: [PATCH 4/5] remove unquantized scheme --- .../compressed_tensors/schemes/__init__.py | 2 - .../schemes/compressed_tensors_unquantized.py | 50 ------------------- 2 files changed, 52 deletions(-) delete mode 100644 vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_unquantized.py diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py index ca9e286ce5b2d..5d259ec72051c 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py @@ -1,5 +1,4 @@ from .compressed_tensors_scheme import CompressedTensorsScheme -from .compressed_tensors_unquantized import CompressedTensorsUnquantized from .compressed_tensors_w4a16_24 import (W4A16SPARSE24_SUPPORTED_BITS, 
CompressedTensorsW4A16Sparse24) from .compressed_tensors_w8a8_fp8 import CompressedTensorsW8A8Fp8 @@ -10,7 +9,6 @@ __all__ = [ "CompressedTensorsScheme", - "CompressedTensorsUnquantized", "CompressedTensorsWNA16", "CompressedTensorsW8A16Fp8", "CompressedTensorsW4A16Sparse24", diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_unquantized.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_unquantized.py deleted file mode 100644 index a0edc43b4b8ea..0000000000000 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_unquantized.py +++ /dev/null @@ -1,50 +0,0 @@ -from typing import List, Optional - -import torch -import torch.nn.functional as F - -from vllm.model_executor.layers.quantization.compressed_tensors.schemes import ( - CompressedTensorsScheme) -from vllm.model_executor.parameter import ModelWeightParameter - -__all__ = ["CompressedTensorsUnquantized"] - - -class CompressedTensorsUnquantized(CompressedTensorsScheme): - """ - Implements the scheme for all layers which are ignored - in the CompressedTensors config. The input and loaded weight are used - in a linear transformation. - """ - - @classmethod - def get_min_capability(cls) -> int: - # volta and up - return 70 - - def process_weights_after_loading(self, layer: torch.nn.Module) -> None: - # required by torch.compile to be torch.nn.Parameter - layer.weight = torch.nn.Parameter(layer.weight.data, - requires_grad=False) - - def create_weights(self, layer: torch.nn.Module, - input_size_per_partition: int, - output_partition_sizes: List[int], input_size: int, - output_size: int, params_dtype: torch.dtype, - **extra_weight_attrs): - - weight_loader = extra_weight_attrs.get("weight_loader") - weight = ModelWeightParameter(data=torch.empty( - sum(output_partition_sizes), - input_size_per_partition, - dtype=params_dtype), - input_dim=1, - output_dim=0, - weight_loader=weight_loader) - - layer.register_parameter("weight", weight) - - def apply_weights(self, layer: torch.nn.Module, x: torch.Tensor, - bias: Optional[torch.Tensor]) -> torch.Tensor: - - return F.linear(x, layer.weight, bias) From 7675add4eb1716a42158d47099c14730dcdfcc9d Mon Sep 17 00:00:00 2001 From: Dipika Date: Mon, 26 Aug 2024 21:06:43 +0000 Subject: [PATCH 5/5] remove Union --- .../quantization/compressed_tensors/compressed_tensors.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py index 5a3f5ac571f4e..f0e0b9db80884 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py @@ -1,4 +1,4 @@ -from typing import Any, Dict, List, Optional, Union +from typing import Any, Dict, List, Optional import torch from pydantic import BaseModel @@ -56,7 +56,7 @@ def get_quant_method( self, layer: torch.nn.Module, prefix: str, - ) -> Optional[Union["QuantizeMethodBase"]]: + ) -> Optional["QuantizeMethodBase"]: from vllm.attention.layer import Attention # Avoid circular import # Check if the layer is skipped for quantization.
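
# --- Reviewer sketch (not part of the patch): end state of this series ---
# A simplified, abbreviated illustration only, not the verbatim vLLM code:
# class bodies and scheme-resolution logic are elided, type hints trimmed,
# and the helper import path for should_ignore_layer is assumed from the
# existing utils imports in compressed_tensors.py. The point is the new
# lifecycle: the scheme is resolved once in get_quant_method() (where the
# layer prefix is still available) and cached on the layer, so
# create_weights() no longer needs a `prefix` argument, and ignored layers
# fall back to the stock UnquantizedLinearMethod instead of the removed
# CompressedTensorsUnquantized scheme.

from typing import Optional

import torch

from vllm.model_executor.layers.linear import (LinearBase,
                                               UnquantizedLinearMethod)
from vllm.model_executor.layers.quantization.compressed_tensors.utils import (
    should_ignore_layer)  # path assumed; matches the file's other utils imports


class CompressedTensorsConfig:          # abbreviated stand-in

    def get_quant_method(self, layer: torch.nn.Module, prefix: str):
        # Skipped layers are handled here (rather than in create_weights),
        # using the generic unquantized linear method.
        if should_ignore_layer(prefix, ignore=self.ignore):
            return UnquantizedLinearMethod()
        if isinstance(layer, LinearBase):
            # Resolve the scheme while the prefix is still known and cache
            # it on the layer for create_weights()/apply() to use later.
            layer.scheme = self.get_scheme(layer=layer, layer_name=prefix)
            return CompressedTensorsLinearMethod(self)
        return None


class CompressedTensorsLinearMethod:    # abbreviated stand-in

    def __init__(self, quantization_config):
        self.quantization_config = quantization_config

    def create_weights(self, layer, input_size_per_partition,
                       output_partition_sizes, input_size, output_size,
                       params_dtype, **extra_weight_attrs):
        # No `prefix` kwarg any more: the scheme was already attached in
        # get_quant_method(), so this simply delegates to it.
        layer.scheme.create_weights(
            layer=layer,
            input_size=input_size,
            input_size_per_partition=input_size_per_partition,
            output_partition_sizes=output_partition_sizes,
            output_size=output_size,
            params_dtype=params_dtype,
            weight_loader=extra_weight_attrs.get("weight_loader"))

    def apply(self, layer, x, bias: Optional[torch.Tensor] = None):
        # Patch 3 keeps the scheme entry point named apply_weights().
        if layer.scheme is None:
            raise ValueError("A scheme must be defined for each layer")
        return layer.scheme.apply_weights(layer, x, bias=bias)

# Net effect: linear.py no longer threads `prefix` into create_weights(),
# and the per-layer scheme travels on the module itself.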