Fixed cpu bindings

vllm-project · Sep 12, 2024 · a3b9f6a · a3b9f6a
1 parent 04a539e
commit a3b9f6a
Show file tree

Hide file tree

Showing 2 changed files with 11 additions and 7 deletions.
diff --git a/csrc/cpu/quant.cpp b/csrc/cpu/quant.cpp
@@ -257,11 +257,13 @@ void int8_scaled_mm(torch::Tensor& c,               // [M, OC], row-major
 // static-per-tensor quantization.
 void static_scaled_int8_quant(torch::Tensor& out,          // [..., hidden_size]
                               const torch::Tensor& input,  // [..., hidden_size]
-                              const torch::Tensor& scale) {
+                              const torch::Tensor& scale,
+                              c10::optional<torch::Tensor> const& azp) {
   CPU_KERNEL_GUARD_IN(static_scaled_int8_quant)
   TORCH_CHECK(input.is_contiguous());
   TORCH_CHECK(out.is_contiguous());
   TORCH_CHECK(scale.numel() == 1);
+  TORCH_CHECK(!azp.has_value(), "Zero point is not supported on CPU.");
 
   const int hidden_size = input.size(-1);
   const int num_tokens = input.numel() / hidden_size;
@@ -277,11 +279,12 @@ void static_scaled_int8_quant(torch::Tensor& out,          // [..., hidden_size]
 void dynamic_scaled_int8_quant(
     torch::Tensor& out,          // [..., hidden_size]
     const torch::Tensor& input,  // [..., hidden_size]
-    torch::Tensor& scale         // [..., 1]
-) {
+    torch::Tensor& scale,        // [..., 1]
+    c10::optional<torch::Tensor> const& azp) {
   CPU_KERNEL_GUARD_IN(dynamic_scaled_int8_quant)
   TORCH_CHECK(input.is_contiguous());
   TORCH_CHECK(out.is_contiguous());
+  TORCH_CHECK(!azp.has_value(), "Zero point is not supported on CPU.");
 
   int const hidden_size = input.size(-1);
   int const num_tokens = input.numel() / hidden_size;

diff --git a/csrc/cpu/torch_bindings.cpp b/csrc/cpu/torch_bindings.cpp
@@ -94,13 +94,14 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
 #ifdef __AVX512F__
   // Compute int8 quantized tensor for given scaling factor.
   ops.def(
-      "static_scaled_int8_quant(Tensor! out, Tensor input, Tensor scale) -> "
-      "()");
+      "static_scaled_int8_quant(Tensor! out, Tensor input, Tensor scale,"
+      "Tensor? azp) -> ()");
   ops.impl("static_scaled_int8_quant", torch::kCPU, &static_scaled_int8_quant);
+
   // Compute int8 quantized tensor and scaling factor
   ops.def(
-      "dynamic_scaled_int8_quant(Tensor! out, Tensor input, Tensor! scale) -> "
-      "()");
+      "dynamic_scaled_int8_quant(Tensor! out, Tensor input, Tensor! scale, "
+      "Tensor!? azp) -> ()");
   ops.impl("dynamic_scaled_int8_quant", torch::kCPU,
            &dynamic_scaled_int8_quant);
   // W8A8 GEMM, supporting symmetric per-tensor or per-row/column