From 67f0e6e8e503187dfd9ae2328b56a19e01dce7b2 Mon Sep 17 00:00:00 2001 From: wangleilei001 Date: Mon, 10 Jan 2022 13:58:18 +0800 Subject: [PATCH] [Cherry-Pick][XPU] add conv3d, fix instance_norm, fix conv2d_transpose (#7642), test=develop --- lite/kernels/x86/instance_norm_compute.cc | 3 + lite/kernels/xpu/CMakeLists.txt | 1 + lite/kernels/xpu/conv2d_transpose_compute.cc | 117 ++++++++++++++---- lite/kernels/xpu/conv3d_compute.cc | 75 +++++++++++ lite/kernels/xpu/conv3d_compute.h | 37 ++++++ lite/kernels/xpu/instance_norm_compute.cc | 33 ++++- lite/operators/conv_op.cc | 3 +- lite/operators/conv_op.h | 6 +- .../kernels/conv_transpose_compute_test.cc | 9 ++ .../kernels/instance_norm_compute_test.cc | 2 + 10 files changed, 257 insertions(+), 29 deletions(-) create mode 100644 lite/kernels/xpu/conv3d_compute.cc create mode 100644 lite/kernels/xpu/conv3d_compute.h mode change 100755 => 100644 lite/operators/conv_op.h diff --git a/lite/kernels/x86/instance_norm_compute.cc b/lite/kernels/x86/instance_norm_compute.cc index 00b50df374b..10375688b96 100644 --- a/lite/kernels/x86/instance_norm_compute.cc +++ b/lite/kernels/x86/instance_norm_compute.cc @@ -42,6 +42,9 @@ void InstanceNormCompute::Run() { int c = param.x->dims()[1]; int height = param.x->dims()[2]; int width = param.x->dims()[3]; + if (param.x->dims().size() == 5) { + width = param.x->dims()[3] * param.x->dims()[4]; + } lite::x86::math::instance_norm(in, out, diff --git a/lite/kernels/xpu/CMakeLists.txt b/lite/kernels/xpu/CMakeLists.txt index 3d3720ba7f0..ceabed4fa7a 100644 --- a/lite/kernels/xpu/CMakeLists.txt +++ b/lite/kernels/xpu/CMakeLists.txt @@ -19,6 +19,7 @@ else() # basic add_kernel(rnn_compute_xpu XPU basic SRCS rnn_compute.cc) add_kernel(conv_compute_xpu XPU basic SRCS conv_compute.cc) + add_kernel(conv3d_compute_xpu XPU basic SRCS conv3d_compute.cc) add_kernel(conv2d_transpose_compute_xpu XPU basic SRCS conv2d_transpose_compute.cc) add_kernel(calib_compute_xpu XPU basic SRCS calib_compute.cc) add_kernel(io_copy_compute_xpu XPU basic SRCS io_copy_compute.cc) diff --git a/lite/kernels/xpu/conv2d_transpose_compute.cc b/lite/kernels/xpu/conv2d_transpose_compute.cc index f9f1a7c4847..7dc6abf2b8f 100644 --- a/lite/kernels/xpu/conv2d_transpose_compute.cc +++ b/lite/kernels/xpu/conv2d_transpose_compute.cc @@ -36,27 +36,102 @@ void Conv2dTransposeCompute::Run() { auto paddings = *param.paddings; auto dilations = *param.dilations; - int ret = xdnn::conv2d_transpose( - ctx.GetRawContext(), - param.x->data(), - param.filter->data(), - param.output->mutable_data(TARGET(kXPU)), - in_dims[0], - in_dims[1], - in_dims[2], - in_dims[3], - out_dims[1], - std::vector{static_cast(w_dims[2]), - static_cast(w_dims[3])}, - strides, - paddings, - dilations, - groups, - nullptr, - nullptr, - nullptr, - true); - CHECK_EQ(ret, 0); + if (param.output_padding.empty()) { + int ret = xdnn::conv2d_transpose( + ctx.GetRawContext(), + param.x->data(), + param.filter->data(), + param.output->mutable_data(TARGET(kXPU)), + in_dims[0], + in_dims[1], + in_dims[2], + in_dims[3], + out_dims[1], + std::vector{static_cast(w_dims[2]), + static_cast(w_dims[3])}, + strides, + paddings, + dilations, + groups, + nullptr, + nullptr, + nullptr, + true); + CHECK_EQ(ret, 0); + } else { + int n = in_dims[0]; + int yc = in_dims[1]; + int yh = in_dims[2]; + int yw = in_dims[3]; + int xc = out_dims[1]; + int xh = out_dims[2]; + int xw = out_dims[3]; + int kh = w_dims[2]; + int kw = w_dims[3]; + float* x_trans = nullptr; + XPU_CALL(xpu_malloc(reinterpret_cast(&x_trans), + (param.x->numel()) * sizeof(float))); + float* x_col_before_concat = nullptr; + XPU_CALL(xpu_malloc(reinterpret_cast(&x_col_before_concat), + (n * yh * yw * kh * kw * xc) * sizeof(float))); + float* x_col = nullptr; + XPU_CALL(xpu_malloc(reinterpret_cast(&x_col), + (n * yh * yw * kh * kw * xc) * sizeof(float))); + const float* weight = param.filter->data(); + int ret = xdnn::transpose(ctx.GetRawContext(), + param.x->data(), + x_trans, + {n, groups, yc / groups, yh, yw}, + {1, 0, 3, 4, 2}); + CHECK_EQ(ret, 0); + for (int g = 0; g < groups; g++) { + const float* curr_y = x_trans + g * n * yh * yw * (yc / groups); + const float* curr_w = + weight + g * (yc / groups) * (xc / groups) * kh * kw; + float* curr_x = + x_col_before_concat + g * n * yh * yw * (xc / groups) * kh * kw; + int mac_m = n * yh * yw; + int mac_k = yc / groups; + int mac_n = xc / groups * kh * kw; + ret = xdnn::fc(ctx.GetRawContext(), + curr_y, + curr_w, + curr_x, + mac_m, + mac_n, + mac_k, + false, + false, + nullptr, + nullptr, + nullptr); + CHECK_EQ(ret, 0); + } + ret = xdnn::transpose(ctx.GetRawContext(), + x_col_before_concat, + x_col, + {groups, n * yh * yw, (xc / groups) * kh * kw}, + {1, 0, 2}); + CHECK_EQ(ret, 0); + + ret = xdnn::col2im(ctx.GetRawContext(), + x_col, + param.output->mutable_data(TARGET(kXPU)), + n, + xc, + xh, + xw, + std::vector{static_cast(w_dims[2]), + static_cast(w_dims[3])}, + strides, + paddings, + dilations, + true); + CHECK_EQ(ret, 0); + XPU_CALL(xpu_free(x_trans)); + XPU_CALL(xpu_free(x_col_before_concat)); + XPU_CALL(xpu_free(x_col)); + } } } // namespace xpu diff --git a/lite/kernels/xpu/conv3d_compute.cc b/lite/kernels/xpu/conv3d_compute.cc new file mode 100644 index 00000000000..1589a14b892 --- /dev/null +++ b/lite/kernels/xpu/conv3d_compute.cc @@ -0,0 +1,75 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/xpu/conv3d_compute.h" +#include +#include "lite/backends/xpu/xpu_header_sitter.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +template <> +void Conv3DCompute::Run() { + auto& param = this->Param(); + auto& ctx = this->ctx_->As(); + + auto& x_dims = param.x->dims(); + auto& w_dims = param.filter->dims(); + int groups = param.groups; + auto& strides = param.strides; + auto paddings = *param.paddings; + auto dilations = *param.dilations; + + int r = xdnn::conv3d( + ctx.GetRawContext(), /* context */ + param.x->data(), + param.filter->data(), /* weight */ + param.output->mutable_data(TARGET(kXPU)), + x_dims[0], /* input_n */ + x_dims[1], /* input_c */ + x_dims[2], /* input_d */ + x_dims[3], /* input_h */ + x_dims[4], /* input_w */ + w_dims[0], /* num_filter */ + std::vector{static_cast(w_dims[2]), + static_cast(w_dims[3]), + static_cast(w_dims[4])}, /* kernel size*/ + strides, + paddings, + dilations, + groups, + nullptr, + nullptr, + nullptr, + true /*is_ncdhw*/); + CHECK_EQ(r, 0); +} + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle + +namespace xpu = paddle::lite::kernels::xpu; +using Conv3dFp32 = xpu::Conv3DCompute; + +REGISTER_LITE_KERNEL(conv3d, kXPU, kFloat, kNCHW, Conv3dFp32, def) + .BindInput("Input", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("Output", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); diff --git a/lite/kernels/xpu/conv3d_compute.h b/lite/kernels/xpu/conv3d_compute.h new file mode 100644 index 00000000000..caadb82a1e8 --- /dev/null +++ b/lite/kernels/xpu/conv3d_compute.h @@ -0,0 +1,37 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "lite/core/kernel.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +template +class Conv3DCompute : public KernelLite { + public: + using param_t = operators::ConvParam; + + void Run() override; + + virtual ~Conv3DCompute() = default; +}; + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/xpu/instance_norm_compute.cc b/lite/kernels/xpu/instance_norm_compute.cc index 2f57478ed41..00fddb6f416 100644 --- a/lite/kernels/xpu/instance_norm_compute.cc +++ b/lite/kernels/xpu/instance_norm_compute.cc @@ -25,12 +25,32 @@ void InstanceNormCompute::Run() { auto& param = this->Param(); auto& ctx = this->ctx_->As(); auto x_dims = param.x->dims(); - CHECK_EQ(x_dims.size(), 4); + CHECK(x_dims.size() == 4 || x_dims.size() == 5) + << "Not support x_dims_rank = " << x_dims.size(); + int n = x_dims[0]; int c = x_dims[1]; int h = x_dims[2]; int w = x_dims[3]; + if (x_dims.size() == 5) { + h = x_dims[2] * x_dims[3]; + w = x_dims[4]; + } + float* xpu_scale = nullptr; + if (param.scale == nullptr) { + XPU_CALL( + xpu_malloc(reinterpret_cast(&xpu_scale), c * sizeof(float))); + int ret = xdnn::constant(ctx.GetRawContext(), xpu_scale, c, 1.0f); + CHECK_EQ(ret, 0); + } + float* xpu_bias = nullptr; + if (param.bias == nullptr) { + XPU_CALL( + xpu_malloc(reinterpret_cast(&xpu_bias), c * sizeof(float))); + int ret = xdnn::constant(ctx.GetRawContext(), xpu_bias, c, 0.0f); + CHECK_EQ(ret, 0); + } int ret = xdnn::instance_norm( ctx.GetRawContext(), param.x->data(), @@ -40,13 +60,18 @@ void InstanceNormCompute::Run() { h, w, param.epsilon, - param.scale->data(), - param.bias->data(), + (param.scale == nullptr) ? xpu_scale : param.scale->data(), + (param.bias == nullptr) ? xpu_bias : param.bias->data(), param.saved_mean->mutable_data(TARGET(kXPU)), param.saved_variance->mutable_data(TARGET(kXPU)), true); - CHECK_EQ(ret, 0); + if (xpu_scale != nullptr) { + XPU_CALL(xpu_free(xpu_scale)); + } + if (xpu_bias != nullptr) { + XPU_CALL(xpu_free(xpu_bias)); + } } } // namespace xpu diff --git a/lite/operators/conv_op.cc b/lite/operators/conv_op.cc index fa18a384fb1..6aa8f1b9eab 100644 --- a/lite/operators/conv_op.cc +++ b/lite/operators/conv_op.cc @@ -34,7 +34,7 @@ bool ConvOpLite::CheckShape() const { CHECK_EQ_OR_FALSE(in_dims.size(), filter_dims.size()); CHECK_OR_FALSE(in_dims.size() - param_.strides.size() == 2U); - CHECK_EQ_OR_FALSE(filter_dims.size(), 4UL); + CHECK_OR_FALSE(filter_dims.size() == 4UL || filter_dims.size() == 5UL); return true; } @@ -115,4 +115,5 @@ bool ConvOpLite::InferShapeImpl() const { } // namespace paddle REGISTER_LITE_OP(conv2d, paddle::lite::operators::ConvOpLite); +REGISTER_LITE_OP(conv3d, paddle::lite::operators::ConvOpLite); REGISTER_LITE_OP(depthwise_conv2d, paddle::lite::operators::ConvOpLite); diff --git a/lite/operators/conv_op.h b/lite/operators/conv_op.h old mode 100755 new mode 100644 index 51f13366fa9..275c14ff8f1 --- a/lite/operators/conv_op.h +++ b/lite/operators/conv_op.h @@ -215,14 +215,14 @@ class ConvOpLite : public OpLite { } #endif - // 2-pad to 4-pad - if (paddings.size() == 2L) { + // conv3d: 3-pad to 6-pad, or conv2d: 2-pad to 4-pad + if (paddings.size() == 2L || paddings.size() == 3L) { for (size_t i = 0; i < param_.strides.size(); ++i) { int copy_pad = *(paddings.begin() + 2 * i); paddings.insert(paddings.begin() + 2 * i + 1, copy_pad); } } else { - if (paddings.size() != 4L) { + if (paddings.size() != 4L && paddings.size() != 6L) { LOG(FATAL) << "Paddings size should be the same or twice as the input size."; } diff --git a/lite/tests/kernels/conv_transpose_compute_test.cc b/lite/tests/kernels/conv_transpose_compute_test.cc index ab3defd8c85..adf4ec80f6b 100644 --- a/lite/tests/kernels/conv_transpose_compute_test.cc +++ b/lite/tests/kernels/conv_transpose_compute_test.cc @@ -425,6 +425,15 @@ TEST(Conv_transpose, precision) { #elif defined(LITE_WITH_NPU) place = TARGET(kNPU); abs_error = 5e-2; // Using fp16 in NPU +#elif defined(LITE_WITH_XPU) && !defined(LITE_WITH_XTCL) + place = TARGET(kXPU); + abs_error = 5e-4; + TestConvTransposeKsize(place, abs_error); + TestConvTransposeStrides(place, abs_error); + TestConvTransposePaddings(place, abs_error); + TestConvTransposeGroups(place, abs_error); + TestConvTransposeOutputPadding(place, abs_error); + return; #elif defined(LITE_WITH_ARM) place = TARGET(kARM); TestConvTransposeOutputPadding(place, abs_error); diff --git a/lite/tests/kernels/instance_norm_compute_test.cc b/lite/tests/kernels/instance_norm_compute_test.cc index 759ecd5e635..bfe96419bbb 100644 --- a/lite/tests/kernels/instance_norm_compute_test.cc +++ b/lite/tests/kernels/instance_norm_compute_test.cc @@ -187,6 +187,8 @@ TEST(InstanceNorm, precision) { place = TARGET(kNPU); abs_error = 1e-2; // Using fp16 in NPU ignored_outs = {"saved_mean", "saved_variance"}; +#elif defined(LITE_WITH_XPU) && !defined(LITE_WITH_XTCL) + place = TARGET(kXPU); #elif defined(LITE_WITH_ARM) place = TARGET(kARM); #elif defined(LITE_WITH_X86)