From 6267a2bd93267396ca9209b5a56d7bf65c10baf3 Mon Sep 17 00:00:00 2001 From: newway <237745+newway@users.noreply.github.com> Date: Wed, 7 Aug 2024 10:28:57 +0800 Subject: [PATCH] [XPU] Support int31 weight dynamic quantization for fc and conv2d (#59981) (#67058) Co-authored-by: Travis-Lee --- .../framework/ir/xpu/conv2d_xpu_fuse_pass.cc | 13 +++++ .../framework/ir/xpu/fc_xpu_fuse_pass.cc | 13 +++++ paddle/fluid/framework/ir/xpu/pass_utils.cc | 12 +++++ paddle/fluid/framework/ir/xpu/quant_utils.cc | 53 ++++++++++++++++--- .../kernels/fusion/xpu/conv2d_xpu_kernel.cc | 2 + .../phi/kernels/fusion/xpu/fc_xpu_kernel.cc | 2 + 6 files changed, 89 insertions(+), 6 deletions(-) diff --git a/paddle/fluid/framework/ir/xpu/conv2d_xpu_fuse_pass.cc b/paddle/fluid/framework/ir/xpu/conv2d_xpu_fuse_pass.cc index ecf21ce0a4340..8965307e3fcb2 100644 --- a/paddle/fluid/framework/ir/xpu/conv2d_xpu_fuse_pass.cc +++ b/paddle/fluid/framework/ir/xpu/conv2d_xpu_fuse_pass.cc @@ -763,6 +763,19 @@ void Conv2dXPUFusePass::CreateFusionWeightsAndBias( false, weight_scale, true); + } else if (quant_post_type.find("conv2d") != quant_post_type.end() && + quant_post_type.find("conv2d")->second == 4) { + VLOG(5) << "Use int31 per-tensor weight"; + PrepareWeight(graph, + scope, + block, + conv_filter_replicated_node, + &filter_intx, + &filter_max, + &scale_max, + false, + weight_scale, + false); } else if (quant_post_type.find("conv2d") != quant_post_type.end() && quant_post_type.find("conv2d")->second == 0 || quant_post_type.find("conv2d") != quant_post_type.end() && diff --git a/paddle/fluid/framework/ir/xpu/fc_xpu_fuse_pass.cc b/paddle/fluid/framework/ir/xpu/fc_xpu_fuse_pass.cc index 04a2e9019f200..0ba4c53b9eda2 100644 --- a/paddle/fluid/framework/ir/xpu/fc_xpu_fuse_pass.cc +++ b/paddle/fluid/framework/ir/xpu/fc_xpu_fuse_pass.cc @@ -572,6 +572,19 @@ void FcXPUFusePass::CreateFusionWeightsAndBias( !transpose_w, weight_scale, true); + } else if (quant_post_type.find("fc") != quant_post_type.end() && + quant_post_type.find("fc")->second == 4) { + VLOG(5) << "Use int31 per-tensor weight"; + PrepareWeight(graph, + scope, + block, + mul_w_replicated_node, + &filter_intx, + &filter_max, + &scale_max, + !transpose_w, + weight_scale, + false); } else if (quant_post_type.find("fc") != quant_post_type.end() && quant_post_type.find("fc")->second == 0 || quant_post_type.find("fc") != quant_post_type.end() && diff --git a/paddle/fluid/framework/ir/xpu/pass_utils.cc b/paddle/fluid/framework/ir/xpu/pass_utils.cc index dbc899c93120a..293af7e523f74 100644 --- a/paddle/fluid/framework/ir/xpu/pass_utils.cc +++ b/paddle/fluid/framework/ir/xpu/pass_utils.cc @@ -256,6 +256,18 @@ void PrepareWeight(Graph* graph, } } +template void PrepareWeight( + Graph* graph, + Scope* scope, + BlockDesc* block, + Node* weight, + Node** dst_weight, + Node** dst_weight_max, + Node** dst_scale_max, + bool transpose, + const std::vector& weight_scales, + bool per_channel_quant = false); + template void PrepareWeight( Graph* graph, Scope* scope, diff --git a/paddle/fluid/framework/ir/xpu/quant_utils.cc b/paddle/fluid/framework/ir/xpu/quant_utils.cc index 691e243a13a61..113e2ec0fe080 100644 --- a/paddle/fluid/framework/ir/xpu/quant_utils.cc +++ b/paddle/fluid/framework/ir/xpu/quant_utils.cc @@ -245,6 +245,16 @@ static void QuantFP32ToIntX(const float* src_ptr, LOG(FATAL) << "Not support."; } +template <> +void QuantFP32ToIntX(const float* src_ptr, + float* dst_ptr, + float max_val, + int numel) { + for (int i = 0; i < numel; i++) { + dst_ptr[i] = static_cast(src_ptr[i]); + } +} + template <> void QuantFP32ToIntX(const float* src_ptr, int16_t* dst_ptr, @@ -364,16 +374,16 @@ void ConvertWithoutQuant(phi::DenseTensor* weight, phi::DenseTensor* scale_max, bool transpose, const std::vector& weight_scales) { - PADDLE_ENFORCE_EQ( - weight_scales.empty(), - false, - platform::errors::InvalidArgument( - "ConvertWithoutQuant is not allowed weight scales is empty!")); if (transpose) { Transpose2D(weight); } bool per_tensor_quant = weight_scales.size() == 1; if (std::is_same::value || std::is_same::value) { + PADDLE_ENFORCE_EQ( + weight_scales.empty(), + false, + platform::errors::InvalidArgument( + "ConvertWithoutQuant is not allowed weight scales is empty!")); auto* cpu_ctx = static_cast( platform::DeviceContextPool::Instance().Get(phi::CPUPlace())); if (per_tensor_quant) { @@ -400,8 +410,32 @@ void ConvertWithoutQuant(phi::DenseTensor* weight, weight_scales.data(), weight_scales.size() * sizeof(float)); } + } else if (std::is_same::value) { + // Convert fp16 to fp32 + phi::DenseTensor weight_fp32; + CastToFp32(weight, &weight_fp32); + // Find max + int max_ptr_size = phi::backends::xpu::get_xpu_max_ptr_size(-1); + int size = weight_fp32.numel(); + auto* weight_data = weight_fp32.data(); + float max_val = FindMaxAbs(weight_data, size); + std::vector max_vec(max_ptr_size, max_val); + weight_max->set_type(phi::DataType::FLOAT32); + weight_max->Resize({max_ptr_size}); + auto* cpu_ctx = static_cast( + platform::DeviceContextPool::Instance().Get(phi::CPUPlace())); + memcpy(cpu_ctx->Alloc(weight_max), + max_vec.data(), + max_ptr_size * sizeof(float)); + + // Quant + weight->set_type(phi::DataType::FLOAT32); + weight->Resize(weight_fp32.dims()); + QuantFP32ToIntX( + weight_data, cpu_ctx->Alloc(weight), max_val, size); } else { - LOG(FATAL) << "Only support int8<->int8 and int16<->int16 convert."; + LOG(FATAL) + << "Only support float<->int31, int8<->int8 and int16<->int16 convert."; } } @@ -424,6 +458,13 @@ template void ConvertWithoutQuant( bool transpose, const std::vector& weight_scales); +template void ConvertWithoutQuant( + phi::DenseTensor* weight, + phi::DenseTensor* weight_max, + phi::DenseTensor* scale_max, + bool transpose, + const std::vector& weight_scales); + bool IsPerTensorQuant(const std::vector& weight_max) { bool per_tensor = true; PADDLE_ENFORCE_GT( diff --git a/paddle/phi/kernels/fusion/xpu/conv2d_xpu_kernel.cc b/paddle/phi/kernels/fusion/xpu/conv2d_xpu_kernel.cc index aa5d4738aafa9..198e03bac0991 100644 --- a/paddle/phi/kernels/fusion/xpu/conv2d_xpu_kernel.cc +++ b/paddle/phi/kernels/fusion/xpu/conv2d_xpu_kernel.cc @@ -221,6 +221,8 @@ void Conv2dXPUKernel(const Context& ctx, DataTypeToString(filter.dtype()), DataTypeToString(out_dtype))); } + } else if (filter.dtype() == DataType::FLOAT32) { + CONV2D_XPU_KERNEL_IMPL(float, float, float, int32_t); } else { PADDLE_THROW(phi::errors::Unimplemented( "Not support x_dtype is %s, filter_dtype is %s and out_dtype is %s.", diff --git a/paddle/phi/kernels/fusion/xpu/fc_xpu_kernel.cc b/paddle/phi/kernels/fusion/xpu/fc_xpu_kernel.cc index d6153eff096cb..c4cb02ce21184 100644 --- a/paddle/phi/kernels/fusion/xpu/fc_xpu_kernel.cc +++ b/paddle/phi/kernels/fusion/xpu/fc_xpu_kernel.cc @@ -165,6 +165,8 @@ void FcXPUKernel(const Context& ctx, DataTypeToString(w.dtype()), DataTypeToString(out_dtype))); } + } else if (w.dtype() == DataType::FLOAT32) { + FC_XPU_KERNEL_IMPL(float, float, float, int32_t); } else { PADDLE_THROW(phi::errors::Unimplemented( "Not support x_dtype is %s, w_dtype is %s and out_dtype is %s.",