Skip to content

Commit

Permalink
Fixed cpu bindings
Browse files Browse the repository at this point in the history
  • Loading branch information
ProExpertProg committed Sep 12, 2024
1 parent 04a539e commit a3b9f6a
Show file tree
Hide file tree
Showing 2 changed files with 11 additions and 7 deletions.
9 changes: 6 additions & 3 deletions csrc/cpu/quant.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -257,11 +257,13 @@ void int8_scaled_mm(torch::Tensor& c, // [M, OC], row-major
// static-per-tensor quantization.
void static_scaled_int8_quant(torch::Tensor& out, // [..., hidden_size]
const torch::Tensor& input, // [..., hidden_size]
const torch::Tensor& scale) {
const torch::Tensor& scale,
c10::optional<torch::Tensor> const& azp) {
CPU_KERNEL_GUARD_IN(static_scaled_int8_quant)
TORCH_CHECK(input.is_contiguous());
TORCH_CHECK(out.is_contiguous());
TORCH_CHECK(scale.numel() == 1);
TORCH_CHECK(!azp.has_value(), "Zero point is not supported on CPU.");

const int hidden_size = input.size(-1);
const int num_tokens = input.numel() / hidden_size;
Expand All @@ -277,11 +279,12 @@ void static_scaled_int8_quant(torch::Tensor& out, // [..., hidden_size]
void dynamic_scaled_int8_quant(
torch::Tensor& out, // [..., hidden_size]
const torch::Tensor& input, // [..., hidden_size]
torch::Tensor& scale // [..., 1]
) {
torch::Tensor& scale, // [..., 1]
c10::optional<torch::Tensor> const& azp) {
CPU_KERNEL_GUARD_IN(dynamic_scaled_int8_quant)
TORCH_CHECK(input.is_contiguous());
TORCH_CHECK(out.is_contiguous());
TORCH_CHECK(!azp.has_value(), "Zero point is not supported on CPU.");

int const hidden_size = input.size(-1);
int const num_tokens = input.numel() / hidden_size;
Expand Down
9 changes: 5 additions & 4 deletions csrc/cpu/torch_bindings.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -94,13 +94,14 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
#ifdef __AVX512F__
// Compute int8 quantized tensor for given scaling factor.
ops.def(
"static_scaled_int8_quant(Tensor! out, Tensor input, Tensor scale) -> "
"()");
"static_scaled_int8_quant(Tensor! out, Tensor input, Tensor scale,"
"Tensor? azp) -> ()");
ops.impl("static_scaled_int8_quant", torch::kCPU, &static_scaled_int8_quant);

// Compute int8 quantized tensor and scaling factor
ops.def(
"dynamic_scaled_int8_quant(Tensor! out, Tensor input, Tensor! scale) -> "
"()");
"dynamic_scaled_int8_quant(Tensor! out, Tensor input, Tensor! scale, "
"Tensor!? azp) -> ()");
ops.impl("dynamic_scaled_int8_quant", torch::kCPU,
&dynamic_scaled_int8_quant);
// W8A8 GEMM, supporting symmetric per-tensor or per-row/column
Expand Down

0 comments on commit a3b9f6a

Please sign in to comment.