This repository has been archived by the owner on Oct 11, 2024. It is now read-only.

lazy import CompressedTensorsW8A8StaticTensor #220

Merged
merged 3 commits on May 1, 2024
10 changes: 10 additions & 0 deletions csrc/quantization/compressed_tensors/int8_quant_kernels.cu
@@ -6,9 +6,19 @@

static inline __device__ int8_t float_to_int8_rn(float x)
{
#ifdef USE_ROCM
  float dst;
  // Round to nearest even
  asm volatile("v_rndne_f32 %0, %1;\n" : "=r"(dst) : "v"(x));
  // Saturate to the int8 range [-128, 127]
  dst = dst < -128.0f ? -128.0f : dst;
  dst = dst > 127.0f ? 127.0f : dst;
  return static_cast<int8_t>(dst);
#else
  // CUDA: round-to-nearest-even and saturate to s8 in a single PTX conversion
  uint32_t dst;
  asm volatile("cvt.rni.sat.s8.f32 %0, %1;" : "=r"(dst) : "f"(x));
  return reinterpret_cast<const int8_t&>(dst);
#endif
}

namespace vllm {
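For reference, the behaviour both branches implement is round-to-nearest-even followed by saturation to the int8 range. A host-side illustration of those semantics in plain Python (the function name `float_to_int8_rn_ref` is ours, not part of the kernel):

```python
def float_to_int8_rn_ref(x: float) -> int:
    """Reference semantics for float_to_int8_rn: round half to even,
    then saturate to [-128, 127], mirroring v_rndne_f32 / cvt.rni.sat.s8.f32."""
    rounded = round(x)  # Python's round() uses round-half-to-even
    return max(-128, min(127, rounded))

assert float_to_int8_rn_ref(2.5) == 2      # ties round to the even neighbour
assert float_to_int8_rn_ref(300.0) == 127  # saturates instead of wrapping
```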
@@ -6,8 +6,7 @@
from vllm.model_executor.layers.quantization.base_config import (  # noqa: E501
    QuantizationConfig)
from vllm.model_executor.layers.quantization.compressed_tensors.schemes import (
    CompressedTensorsScheme, CompressedTensorsUnquantized,
    CompressedTensorsW8A8StaticTensor)
    CompressedTensorsScheme, CompressedTensorsUnquantized)


class CompressedTensorsConfig(QuantizationConfig):
@@ -80,7 +79,11 @@ def _get_schema(self, weight_quant: Dict, input_quant: Dict):
        is_tensor = weight_strategy == input_strategy == "tensor"
        is_symmetric = weight_symmetric and input_symmetric

        if is_8_bits and is_tensor and is_symmetric:
        if is_8_bits and is_tensor and is_symmetric and \
                torch.cuda.is_available():
            # CompressedTensorsW8A8StaticTensor only supports CUDA path for now.
            from vllm.model_executor.layers.quantization.compressed_tensors.schemes import (  # noqa: E501
                CompressedTensorsW8A8StaticTensor)
            return CompressedTensorsW8A8StaticTensor(
                fake_quant=self.fake_quant)
        raise NotImplementedError(
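The change above moves the `CompressedTensorsW8A8StaticTensor` import from module scope into the branch that actually constructs it, so merely importing the config module no longer loads the CUDA-only scheme. A minimal sketch of the same deferred-import pattern (the `build_scheme` function and `use_cuda_scheme` flag are hypothetical, for illustration only):

```python
def build_scheme(use_cuda_scheme: bool):
    if use_cuda_scheme:
        # Import inside the branch: the CUDA-backed module is only loaded
        # when this scheme is actually selected at runtime.
        from vllm.model_executor.layers.quantization.compressed_tensors.schemes import (  # noqa: E501
            CompressedTensorsW8A8StaticTensor)
        return CompressedTensorsW8A8StaticTensor(fake_quant=True)
    raise NotImplementedError("No scheme available for this quantization config")
```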