Skip to content

Commit

Permalink
[Cherry-Pick][XPU] add conv3d, fix instance_norm, fix conv2d_transpose (
Browse files Browse the repository at this point in the history
PaddlePaddle#7642), test=develop
  • Loading branch information
wangleilei001 committed Jan 24, 2022
1 parent 07e5d99 commit 67f0e6e
Show file tree
Hide file tree
Showing 10 changed files with 257 additions and 29 deletions.
3 changes: 3 additions & 0 deletions lite/kernels/x86/instance_norm_compute.cc
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,9 @@ void InstanceNormCompute::Run() {
int c = param.x->dims()[1];
int height = param.x->dims()[2];
int width = param.x->dims()[3];
if (param.x->dims().size() == 5) {
width = param.x->dims()[3] * param.x->dims()[4];
}

lite::x86::math::instance_norm(in,
out,
Expand Down
1 change: 1 addition & 0 deletions lite/kernels/xpu/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ else()
# basic
add_kernel(rnn_compute_xpu XPU basic SRCS rnn_compute.cc)
add_kernel(conv_compute_xpu XPU basic SRCS conv_compute.cc)
add_kernel(conv3d_compute_xpu XPU basic SRCS conv3d_compute.cc)
add_kernel(conv2d_transpose_compute_xpu XPU basic SRCS conv2d_transpose_compute.cc)
add_kernel(calib_compute_xpu XPU basic SRCS calib_compute.cc)
add_kernel(io_copy_compute_xpu XPU basic SRCS io_copy_compute.cc)
Expand Down
117 changes: 96 additions & 21 deletions lite/kernels/xpu/conv2d_transpose_compute.cc
Original file line number Diff line number Diff line change
Expand Up @@ -36,27 +36,102 @@ void Conv2dTransposeCompute<PRECISION(kFloat)>::Run() {
auto paddings = *param.paddings;
auto dilations = *param.dilations;

int ret = xdnn::conv2d_transpose<float, float, float, int16_t>(
ctx.GetRawContext(),
param.x->data<float>(),
param.filter->data<float>(),
param.output->mutable_data<float>(TARGET(kXPU)),
in_dims[0],
in_dims[1],
in_dims[2],
in_dims[3],
out_dims[1],
std::vector<int>{static_cast<int>(w_dims[2]),
static_cast<int>(w_dims[3])},
strides,
paddings,
dilations,
groups,
nullptr,
nullptr,
nullptr,
true);
CHECK_EQ(ret, 0);
if (param.output_padding.empty()) {
int ret = xdnn::conv2d_transpose<float, float, float, int16_t>(
ctx.GetRawContext(),
param.x->data<float>(),
param.filter->data<float>(),
param.output->mutable_data<float>(TARGET(kXPU)),
in_dims[0],
in_dims[1],
in_dims[2],
in_dims[3],
out_dims[1],
std::vector<int>{static_cast<int>(w_dims[2]),
static_cast<int>(w_dims[3])},
strides,
paddings,
dilations,
groups,
nullptr,
nullptr,
nullptr,
true);
CHECK_EQ(ret, 0);
} else {
int n = in_dims[0];
int yc = in_dims[1];
int yh = in_dims[2];
int yw = in_dims[3];
int xc = out_dims[1];
int xh = out_dims[2];
int xw = out_dims[3];
int kh = w_dims[2];
int kw = w_dims[3];
float* x_trans = nullptr;
XPU_CALL(xpu_malloc(reinterpret_cast<void**>(&x_trans),
(param.x->numel()) * sizeof(float)));
float* x_col_before_concat = nullptr;
XPU_CALL(xpu_malloc(reinterpret_cast<void**>(&x_col_before_concat),
(n * yh * yw * kh * kw * xc) * sizeof(float)));
float* x_col = nullptr;
XPU_CALL(xpu_malloc(reinterpret_cast<void**>(&x_col),
(n * yh * yw * kh * kw * xc) * sizeof(float)));
const float* weight = param.filter->data<float>();
int ret = xdnn::transpose<float>(ctx.GetRawContext(),
param.x->data<float>(),
x_trans,
{n, groups, yc / groups, yh, yw},
{1, 0, 3, 4, 2});
CHECK_EQ(ret, 0);
for (int g = 0; g < groups; g++) {
const float* curr_y = x_trans + g * n * yh * yw * (yc / groups);
const float* curr_w =
weight + g * (yc / groups) * (xc / groups) * kh * kw;
float* curr_x =
x_col_before_concat + g * n * yh * yw * (xc / groups) * kh * kw;
int mac_m = n * yh * yw;
int mac_k = yc / groups;
int mac_n = xc / groups * kh * kw;
ret = xdnn::fc<float, float, float, int16_t>(ctx.GetRawContext(),
curr_y,
curr_w,
curr_x,
mac_m,
mac_n,
mac_k,
false,
false,
nullptr,
nullptr,
nullptr);
CHECK_EQ(ret, 0);
}
ret = xdnn::transpose<float>(ctx.GetRawContext(),
x_col_before_concat,
x_col,
{groups, n * yh * yw, (xc / groups) * kh * kw},
{1, 0, 2});
CHECK_EQ(ret, 0);

ret = xdnn::col2im<float>(ctx.GetRawContext(),
x_col,
param.output->mutable_data<float>(TARGET(kXPU)),
n,
xc,
xh,
xw,
std::vector<int>{static_cast<int>(w_dims[2]),
static_cast<int>(w_dims[3])},
strides,
paddings,
dilations,
true);
CHECK_EQ(ret, 0);
XPU_CALL(xpu_free(x_trans));
XPU_CALL(xpu_free(x_col_before_concat));
XPU_CALL(xpu_free(x_col));
}
}

} // namespace xpu
Expand Down
75 changes: 75 additions & 0 deletions lite/kernels/xpu/conv3d_compute.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "lite/kernels/xpu/conv3d_compute.h"
#include <vector>
#include "lite/backends/xpu/xpu_header_sitter.h"
#include "lite/core/op_registry.h"

namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {

template <>
void Conv3DCompute<PRECISION(kFloat)>::Run() {
  // Dispatch a 3-D convolution (NCDHW layout) to the XPU DNN library.
  auto& param = this->Param<param_t>();
  auto& ctx = this->ctx_->As<XPUContext>();

  const auto& input_dims = param.x->dims();
  const auto& filter_dims = param.filter->dims();

  // Kernel spatial extent taken from the filter shape: {depth, height, width}.
  std::vector<int> kernel_size{static_cast<int>(filter_dims[2]),
                               static_cast<int>(filter_dims[3]),
                               static_cast<int>(filter_dims[4])};

  int ret = xdnn::conv3d<float, float, float, int16_t>(
      ctx.GetRawContext(),
      param.x->data<float>(),
      param.filter->data<float>(), /* weight */
      param.output->mutable_data<float>(TARGET(kXPU)),
      input_dims[0], /* input_n */
      input_dims[1], /* input_c */
      input_dims[2], /* input_d */
      input_dims[3], /* input_h */
      input_dims[4], /* input_w */
      filter_dims[0], /* num_filter */
      kernel_size,
      param.strides,
      *param.paddings,
      *param.dilations,
      param.groups,
      nullptr,
      nullptr,
      nullptr,
      true /* is_ncdhw */);
  CHECK_EQ(ret, 0);
}

} // namespace xpu
} // namespace kernels
} // namespace lite
} // namespace paddle

namespace xpu = paddle::lite::kernels::xpu;
using Conv3dFp32 = xpu::Conv3DCompute<PRECISION(kFloat)>;

// Register the FP32 conv3d kernel for the XPU target. All bound tensors
// (Input, Bias, Filter, Output) are declared to live in XPU device memory.
REGISTER_LITE_KERNEL(conv3d, kXPU, kFloat, kNCHW, Conv3dFp32, def)
    .BindInput("Input", {LiteType::GetTensorTy(TARGET(kXPU))})
    .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kXPU))})
    .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kXPU))})
    .BindOutput("Output", {LiteType::GetTensorTy(TARGET(kXPU))})
    .Finalize();
37 changes: 37 additions & 0 deletions lite/kernels/xpu/conv3d_compute.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include "lite/core/kernel.h"

namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {

// XPU kernel for the `conv3d` operator.
// FilterPtype selects the precision this instantiation handles
// (e.g. PRECISION(kFloat)); Run() is specialized per precision in the .cc.
template <PrecisionType FilterPtype>
class Conv3DCompute : public KernelLite<TARGET(kXPU), FilterPtype> {
 public:
  using param_t = operators::ConvParam;

  // Executes the 3-D convolution described by param_t on the XPU device.
  void Run() override;

  virtual ~Conv3DCompute() = default;
};

} // namespace xpu
} // namespace kernels
} // namespace lite
} // namespace paddle
33 changes: 29 additions & 4 deletions lite/kernels/xpu/instance_norm_compute.cc
Original file line number Diff line number Diff line change
Expand Up @@ -25,12 +25,32 @@ void InstanceNormCompute::Run() {
auto& param = this->Param<param_t>();
auto& ctx = this->ctx_->As<XPUContext>();
auto x_dims = param.x->dims();
CHECK_EQ(x_dims.size(), 4);
CHECK(x_dims.size() == 4 || x_dims.size() == 5)
<< "Not support x_dims_rank = " << x_dims.size();

int n = x_dims[0];
int c = x_dims[1];
int h = x_dims[2];
int w = x_dims[3];
if (x_dims.size() == 5) {
h = x_dims[2] * x_dims[3];
w = x_dims[4];
}

float* xpu_scale = nullptr;
if (param.scale == nullptr) {
XPU_CALL(
xpu_malloc(reinterpret_cast<void**>(&xpu_scale), c * sizeof(float)));
int ret = xdnn::constant<float>(ctx.GetRawContext(), xpu_scale, c, 1.0f);
CHECK_EQ(ret, 0);
}
float* xpu_bias = nullptr;
if (param.bias == nullptr) {
XPU_CALL(
xpu_malloc(reinterpret_cast<void**>(&xpu_bias), c * sizeof(float)));
int ret = xdnn::constant<float>(ctx.GetRawContext(), xpu_bias, c, 0.0f);
CHECK_EQ(ret, 0);
}
int ret = xdnn::instance_norm<float>(
ctx.GetRawContext(),
param.x->data<float>(),
Expand All @@ -40,13 +60,18 @@ void InstanceNormCompute::Run() {
h,
w,
param.epsilon,
param.scale->data<float>(),
param.bias->data<float>(),
(param.scale == nullptr) ? xpu_scale : param.scale->data<float>(),
(param.bias == nullptr) ? xpu_bias : param.bias->data<float>(),
param.saved_mean->mutable_data<float>(TARGET(kXPU)),
param.saved_variance->mutable_data<float>(TARGET(kXPU)),
true);

CHECK_EQ(ret, 0);
if (xpu_scale != nullptr) {
XPU_CALL(xpu_free(xpu_scale));
}
if (xpu_bias != nullptr) {
XPU_CALL(xpu_free(xpu_bias));
}
}

} // namespace xpu
Expand Down
3 changes: 2 additions & 1 deletion lite/operators/conv_op.cc
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ bool ConvOpLite::CheckShape() const {

CHECK_EQ_OR_FALSE(in_dims.size(), filter_dims.size());
CHECK_OR_FALSE(in_dims.size() - param_.strides.size() == 2U);
CHECK_EQ_OR_FALSE(filter_dims.size(), 4UL);
CHECK_OR_FALSE(filter_dims.size() == 4UL || filter_dims.size() == 5UL);

return true;
}
Expand Down Expand Up @@ -115,4 +115,5 @@ bool ConvOpLite::InferShapeImpl() const {
} // namespace paddle

REGISTER_LITE_OP(conv2d, paddle::lite::operators::ConvOpLite);
REGISTER_LITE_OP(conv3d, paddle::lite::operators::ConvOpLite);
REGISTER_LITE_OP(depthwise_conv2d, paddle::lite::operators::ConvOpLite);
6 changes: 3 additions & 3 deletions lite/operators/conv_op.h
100755 → 100644
Original file line number Diff line number Diff line change
Expand Up @@ -215,14 +215,14 @@ class ConvOpLite : public OpLite {
}
#endif

// 2-pad to 4-pad
if (paddings.size() == 2L) {
// conv3d: 3-pad to 6-pad, or conv2d: 2-pad to 4-pad
if (paddings.size() == 2L || paddings.size() == 3L) {
for (size_t i = 0; i < param_.strides.size(); ++i) {
int copy_pad = *(paddings.begin() + 2 * i);
paddings.insert(paddings.begin() + 2 * i + 1, copy_pad);
}
} else {
if (paddings.size() != 4L) {
if (paddings.size() != 4L && paddings.size() != 6L) {
LOG(FATAL)
<< "Paddings size should be the same or twice as the input size.";
}
Expand Down
9 changes: 9 additions & 0 deletions lite/tests/kernels/conv_transpose_compute_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -425,6 +425,15 @@ TEST(Conv_transpose, precision) {
#elif defined(LITE_WITH_NPU)
place = TARGET(kNPU);
abs_error = 5e-2; // Using fp16 in NPU
#elif defined(LITE_WITH_XPU) && !defined(LITE_WITH_XTCL)
place = TARGET(kXPU);
abs_error = 5e-4;
TestConvTransposeKsize(place, abs_error);
TestConvTransposeStrides(place, abs_error);
TestConvTransposePaddings(place, abs_error);
TestConvTransposeGroups(place, abs_error);
TestConvTransposeOutputPadding(place, abs_error);
return;
#elif defined(LITE_WITH_ARM)
place = TARGET(kARM);
TestConvTransposeOutputPadding(place, abs_error);
Expand Down
2 changes: 2 additions & 0 deletions lite/tests/kernels/instance_norm_compute_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -187,6 +187,8 @@ TEST(InstanceNorm, precision) {
place = TARGET(kNPU);
abs_error = 1e-2; // Using fp16 in NPU
ignored_outs = {"saved_mean", "saved_variance"};
#elif defined(LITE_WITH_XPU) && !defined(LITE_WITH_XTCL)
place = TARGET(kXPU);
#elif defined(LITE_WITH_ARM)
place = TARGET(kARM);
#elif defined(LITE_WITH_X86)
Expand Down

0 comments on commit 67f0e6e

Please sign in to comment.