Observer Restructure: Remove Observers, calibration, and applying `frozen` steps from lifecycle (#189)

* temporary workaround

* separate out calibration from forward pass

* fix missing import

* fix tests

* update all other tests

* clean

* update

* clean-up

* fix test case

* remove calibration and init observer steps

* update

* update

* clean-up/fix

* cleanup

* cleanup

* remove cache

* clean-up

* remove frozen

* more clean-up

* remove observer, cache, and frozen state

* update more test cases

* fix bit_depth test

* fix more tests

* clean-up remaining tests

* clean-up

* dont skip

* more clean-up

* fix
dsikka authored Oct 31, 2024
1 parent 13b5c0b commit 2b79056
Showing 33 changed files with 420 additions and 1,763 deletions.
1 change: 0 additions & 1 deletion src/compressed_tensors/quantization/__init__.py
@@ -19,4 +19,3 @@
 from .quant_config import *
 from .quant_scheme import *
 from .lifecycle import *
-from .cache import QuantizedKVParameterCache
200 changes: 0 additions & 200 deletions src/compressed_tensors/quantization/cache.py

This file was deleted.

2 changes: 0 additions & 2 deletions src/compressed_tensors/quantization/lifecycle/__init__.py
@@ -15,9 +15,7 @@
 # flake8: noqa
 # isort: skip_file

-from .calibration import *
 from .forward import *
-from .frozen import *
 from .initialize import *
 from .compressed import *
 from .apply import *
17 changes: 1 addition & 16 deletions src/compressed_tensors/quantization/lifecycle/apply.py
@@ -22,13 +22,9 @@

 import torch
 from compressed_tensors.config import CompressionFormat
-from compressed_tensors.quantization.lifecycle.calibration import (
-    set_module_for_calibration,
-)
 from compressed_tensors.quantization.lifecycle.compressed import (
     compress_quantized_weights,
 )
-from compressed_tensors.quantization.lifecycle.frozen import freeze_module_quantization
 from compressed_tensors.quantization.lifecycle.initialize import (
     initialize_module_for_quantization,
 )
@@ -233,6 +229,7 @@ def apply_quantization_status(model: Module, status: QuantizationStatus):
     :param model: model to apply quantization to
     :param status: status to update the module to
     """
+
     current_status = infer_quantization_status(model)

     if status >= QuantizationStatus.INITIALIZED > current_status:
@@ -243,18 +240,6 @@ def apply_quantization_status(model: Module, status: QuantizationStatus):
             )
         )

-    if current_status < status >= QuantizationStatus.CALIBRATION > current_status:
-        # only quantize weights up front when our end goal state is calibration,
-        # weight quantization parameters are already loaded for frozen/compressed
-        quantize_weights_upfront = status == QuantizationStatus.CALIBRATION
-        model.apply(
-            lambda module: set_module_for_calibration(
-                module, quantize_weights_upfront=quantize_weights_upfront
-            )
-        )
-    if current_status < status >= QuantizationStatus.FROZEN > current_status:
-        model.apply(freeze_module_quantization)
-
     if current_status < status >= QuantizationStatus.COMPRESSED > current_status:
         model.apply(compress_quantized_weights)

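After this restructure, apply_quantization_status only drives the INITIALIZED and COMPRESSED transitions; calibration, observer management, and freezing now live outside compressed_tensors (e.g. in the calling compression framework). Below is a minimal sketch of the reduced flow, assuming the exported QuantizationConfig / QuantizationScheme / QuantizationArgs / apply_quantization_config APIs; the toy model and config values are illustrative only, not taken from this commit:

import torch

from compressed_tensors.quantization import (
    QuantizationArgs,
    QuantizationConfig,
    QuantizationScheme,
    QuantizationStatus,
    apply_quantization_config,
    apply_quantization_status,
)

# Toy model; any module whose class name matches a scheme target is quantized.
model = torch.nn.Sequential(torch.nn.Linear(64, 64))

# Hypothetical config: 8-bit symmetric weight quantization for Linear layers.
config = QuantizationConfig(
    config_groups={
        "group_0": QuantizationScheme(
            targets=["Linear"],
            weights=QuantizationArgs(num_bits=8, symmetric=True),
        )
    }
)

# Attaches schemes and initializes scale/zero-point parameters, leaving the
# matched modules in the INITIALIZED status.
apply_quantization_config(model, config)

# Calibration and freezing are no longer lifecycle steps here: scales and
# zero-points are expected to be filled in by an external observer flow before
# the final transition, which is now the only remaining status hook.
apply_quantization_status(model, QuantizationStatus.COMPRESSED)

With set_module_for_calibration and freeze_module_quantization removed, there is no CALIBRATION or FROZEN hook left in this loop; the library only initializes quantization parameters and compresses weights once the caller has produced calibrated values.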
80 changes: 0 additions & 80 deletions src/compressed_tensors/quantization/lifecycle/calibration.py

This file was deleted.

