From 67f0e6e8e503187dfd9ae2328b56a19e01dce7b2 Mon Sep 17 00:00:00 2001
From: wangleilei001 <wangll4397@gmail.com>
Date: Mon, 10 Jan 2022 13:58:18 +0800
Subject: [PATCH] [Cherry-Pick][XPU] add conv3d, fix instance_norm, fix
 conv2d_transpose (#7642), test=develop

---
 lite/kernels/x86/instance_norm_compute.cc     |   3 +
 lite/kernels/xpu/CMakeLists.txt               |   1 +
 lite/kernels/xpu/conv2d_transpose_compute.cc  | 117 ++++++++++++++----
 lite/kernels/xpu/conv3d_compute.cc            |  75 +++++++++++
 lite/kernels/xpu/conv3d_compute.h             |  37 ++++++
 lite/kernels/xpu/instance_norm_compute.cc     |  33 ++++-
 lite/operators/conv_op.cc                     |   3 +-
 lite/operators/conv_op.h                      |   6 +-
 .../kernels/conv_transpose_compute_test.cc    |   9 ++
 .../kernels/instance_norm_compute_test.cc     |   2 +
 10 files changed, 257 insertions(+), 29 deletions(-)
 create mode 100644 lite/kernels/xpu/conv3d_compute.cc
 create mode 100644 lite/kernels/xpu/conv3d_compute.h
 mode change 100755 => 100644 lite/operators/conv_op.h
diff --git a/lite/kernels/x86/instance_norm_compute.cc b/lite/kernels/x86/instance_norm_compute.cc
index 00b50df374b..10375688b96 100644
--- a/lite/kernels/x86/instance_norm_compute.cc
+++ b/lite/kernels/x86/instance_norm_compute.cc
@@ -42,6 +42,9 @@ void InstanceNormCompute::Run() {
   int c = param.x->dims()[1];
   int height = param.x->dims()[2];
   int width = param.x->dims()[3];
+  if (param.x->dims().size() == 5) {
+    width = param.x->dims()[3] * param.x->dims()[4];
+  }
 
   lite::x86::math::instance_norm(in,
                                  out,
diff --git a/lite/kernels/xpu/CMakeLists.txt b/lite/kernels/xpu/CMakeLists.txt
index 3d3720ba7f0..ceabed4fa7a 100644
--- a/lite/kernels/xpu/CMakeLists.txt
+++ b/lite/kernels/xpu/CMakeLists.txt
@@ -19,6 +19,7 @@ else()
   # basic
   add_kernel(rnn_compute_xpu XPU basic SRCS rnn_compute.cc)
   add_kernel(conv_compute_xpu XPU basic SRCS conv_compute.cc)
+  add_kernel(conv3d_compute_xpu XPU basic SRCS conv3d_compute.cc)
   add_kernel(conv2d_transpose_compute_xpu XPU basic SRCS conv2d_transpose_compute.cc)
   add_kernel(calib_compute_xpu XPU basic SRCS calib_compute.cc)
   add_kernel(io_copy_compute_xpu XPU basic SRCS io_copy_compute.cc)
diff --git a/lite/kernels/xpu/conv2d_transpose_compute.cc b/lite/kernels/xpu/conv2d_transpose_compute.cc
index f9f1a7c4847..7dc6abf2b8f 100644
--- a/lite/kernels/xpu/conv2d_transpose_compute.cc
+++ b/lite/kernels/xpu/conv2d_transpose_compute.cc
@@ -36,27 +36,102 @@ void Conv2dTransposeCompute<PRECISION(kFloat)>::Run() {
   auto paddings = *param.paddings;
   auto dilations = *param.dilations;
 
-  int ret = xdnn::conv2d_transpose<float, float, float, int16_t>(
-      ctx.GetRawContext(),
-      param.x->data<float>(),
-      param.filter->data<float>(),
-      param.output->mutable_data<float>(TARGET(kXPU)),
-      in_dims[0],
-      in_dims[1],
-      in_dims[2],
-      in_dims[3],
-      out_dims[1],
-      std::vector<int>{static_cast<int>(w_dims[2]),
-                       static_cast<int>(w_dims[3])},
-      strides,
-      paddings,
-      dilations,
-      groups,
-      nullptr,
-      nullptr,
-      nullptr,
-      true);
-  CHECK_EQ(ret, 0);
+  if (param.output_padding.empty()) {
+    int ret = xdnn::conv2d_transpose<float, float, float, int16_t>(
+        ctx.GetRawContext(),
+        param.x->data<float>(),
+        param.filter->data<float>(),
+        param.output->mutable_data<float>(TARGET(kXPU)),
+        in_dims[0],
+        in_dims[1],
+        in_dims[2],
+        in_dims[3],
+        out_dims[1],
+        std::vector<int>{static_cast<int>(w_dims[2]),
+                         static_cast<int>(w_dims[3])},
+        strides,
+        paddings,
+        dilations,
+        groups,
+        nullptr,
+        nullptr,
+        nullptr,
+        true);
+    CHECK_EQ(ret, 0);
+  } else {
+    int n = in_dims[0];
+    int yc = in_dims[1];
+    int yh = in_dims[2];
+    int yw = in_dims[3];
+    int xc = out_dims[1];
+    int xh = out_dims[2];
+    int xw = out_dims[3];
+    int kh = w_dims[2];
+    int kw = w_dims[3];
+    float* x_trans = nullptr;
+    XPU_CALL(xpu_malloc(reinterpret_cast<void**>(&x_trans),
+                        (param.x->numel()) * sizeof(float)));
+    float* x_col_before_concat = nullptr;
+    XPU_CALL(xpu_malloc(reinterpret_cast<void**>(&x_col_before_concat),
+                        (n * yh * yw * kh * kw * xc) * sizeof(float)));
+    float* x_col = nullptr;
+    XPU_CALL(xpu_malloc(reinterpret_cast<void**>(&x_col),
+                        (n * yh * yw * kh * kw * xc) * sizeof(float)));
+    const float* weight = param.filter->data<float>();
+    int ret = xdnn::transpose<float>(ctx.GetRawContext(),
+                                     param.x->data<float>(),
+                                     x_trans,
+                                     {n, groups, yc / groups, yh, yw},
+                                     {1, 0, 3, 4, 2});
+    CHECK_EQ(ret, 0);
+    for (int g = 0; g < groups; g++) {
+      const float* curr_y = x_trans + g * n * yh * yw * (yc / groups);
+      const float* curr_w =
+          weight + g * (yc / groups) * (xc / groups) * kh * kw;
+      float* curr_x =
+          x_col_before_concat + g * n * yh * yw * (xc / groups) * kh * kw;
+      int mac_m = n * yh * yw;
+      int mac_k = yc / groups;
+      int mac_n = xc / groups * kh * kw;
+      ret = xdnn::fc<float, float, float, int16_t>(ctx.GetRawContext(),
+                                                   curr_y,
+                                                   curr_w,
+                                                   curr_x,
+                                                   mac_m,
+                                                   mac_n,
+                                                   mac_k,
+                                                   false,
+                                                   false,
+                                                   nullptr,
+                                                   nullptr,
+                                                   nullptr);
+      CHECK_EQ(ret, 0);
+    }
+    ret = xdnn::transpose<float>(ctx.GetRawContext(),
+                                 x_col_before_concat,
+                                 x_col,
+                                 {groups, n * yh * yw, (xc / groups) * kh * kw},
+                                 {1, 0, 2});
+    CHECK_EQ(ret, 0);
+
+    ret = xdnn::col2im<float>(ctx.GetRawContext(),
+                              x_col,
+                              param.output->mutable_data<float>(TARGET(kXPU)),
+                              n,
+                              xc,
+                              xh,
+                              xw,
+                              std::vector<int>{static_cast<int>(w_dims[2]),
+                                               static_cast<int>(w_dims[3])},
+                              strides,
+                              paddings,
+                              dilations,
+                              true);
+    CHECK_EQ(ret, 0);
+    XPU_CALL(xpu_free(x_trans));
+    XPU_CALL(xpu_free(x_col_before_concat));
+    XPU_CALL(xpu_free(x_col));
+  }
 }
 
 }  // namespace xpu
diff --git a/lite/kernels/xpu/conv3d_compute.cc b/lite/kernels/xpu/conv3d_compute.cc
new file mode 100644
index 00000000000..1589a14b892
--- /dev/null
+++ b/lite/kernels/xpu/conv3d_compute.cc
@@ -0,0 +1,75 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/kernels/xpu/conv3d_compute.h"
+#include <vector>
+#include "lite/backends/xpu/xpu_header_sitter.h"
+#include "lite/core/op_registry.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace xpu {
+
+// Runs conv3d on XPU via xdnn::conv3d<float, float, float, int16_t>. The
+// "Bias" input bound at registration is not read by this kernel.
+template <>
+void Conv3DCompute<PRECISION(kFloat)>::Run() {
+  auto& param = this->Param<param_t>();
+  auto& ctx = this->ctx_->As<XPUContext>();
+  auto& x_dims = param.x->dims();
+  auto& w_dims = param.filter->dims();
+  // ConvOpLite::CheckShape also admits rank 4; guard the dims[4] reads below.
+  CHECK(x_dims.size() == 5 && w_dims.size() == 5)
+      << "conv3d expects 5-D input and filter";
+
+  int r = xdnn::conv3d<float, float, float, int16_t>(
+      ctx.GetRawContext(),
+      param.x->data<float>(),
+      param.filter->data<float>(),
+      param.output->mutable_data<float>(TARGET(kXPU)),
+      x_dims[0], /* input_n */
+      x_dims[1], /* input_c */
+      x_dims[2], /* input_d */
+      x_dims[3], /* input_h */
+      x_dims[4], /* input_w */
+      w_dims[0], /* num_filter */
+      std::vector<int>{static_cast<int>(w_dims[2]),
+                       static_cast<int>(w_dims[3]),
+                       static_cast<int>(w_dims[4])}, /* kernel size */
+      param.strides,    /* passed straight from param, no local copies */
+      *param.paddings,
+      *param.dilations,
+      param.groups,
+      nullptr, /* the three trailing pointer arguments are left unset */
+      nullptr,
+      nullptr,
+      true /* is_ncdhw */);
+  CHECK_EQ(r, 0);  // a non-zero return from xdnn::conv3d is treated as fatal
+}
+
+}  // namespace xpu
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
+
+namespace xpu = paddle::lite::kernels::xpu;
+using Conv3dFp32 = xpu::Conv3DCompute<PRECISION(kFloat)>;
+// Register the float conv3d XPU kernel; every bound tensor uses target kXPU.
+REGISTER_LITE_KERNEL(conv3d, kXPU, kFloat, kNCHW, Conv3dFp32, def)
+    .BindInput("Input", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .BindOutput("Output", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .Finalize();
diff --git a/lite/kernels/xpu/conv3d_compute.h b/lite/kernels/xpu/conv3d_compute.h
new file mode 100644
index 00000000000..caadb82a1e8
--- /dev/null
+++ b/lite/kernels/xpu/conv3d_compute.h
@@ -0,0 +1,37 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "lite/core/kernel.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace xpu {
+
+// XPU conv3d kernel over ConvParam; Run() is supplied by explicit
+// specialization per precision (see conv3d_compute.cc).
+template <PrecisionType Ptype>
+class Conv3DCompute : public KernelLite<TARGET(kXPU), Ptype> {
+ public:
+  using param_t = operators::ConvParam;
+  virtual ~Conv3DCompute() = default;
+  void Run() override;
+};
+
+}  // namespace xpu
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/kernels/xpu/instance_norm_compute.cc b/lite/kernels/xpu/instance_norm_compute.cc
index 2f57478ed41..00fddb6f416 100644
--- a/lite/kernels/xpu/instance_norm_compute.cc
+++ b/lite/kernels/xpu/instance_norm_compute.cc
@@ -25,12 +25,32 @@ void InstanceNormCompute::Run() {
   auto& param = this->Param<param_t>();
   auto& ctx = this->ctx_->As<XPUContext>();
   auto x_dims = param.x->dims();
-  CHECK_EQ(x_dims.size(), 4);
+  CHECK(x_dims.size() == 4 || x_dims.size() == 5)
+      << "Not support x_dims_rank = " << x_dims.size();
+
   int n = x_dims[0];
   int c = x_dims[1];
   int h = x_dims[2];
   int w = x_dims[3];
+  if (x_dims.size() == 5) {
+    h = x_dims[2] * x_dims[3];
+    w = x_dims[4];
+  }
 
+  float* xpu_scale = nullptr;
+  if (param.scale == nullptr) {
+    XPU_CALL(
+        xpu_malloc(reinterpret_cast<void**>(&xpu_scale), c * sizeof(float)));
+    int ret = xdnn::constant<float>(ctx.GetRawContext(), xpu_scale, c, 1.0f);
+    CHECK_EQ(ret, 0);
+  }
+  float* xpu_bias = nullptr;
+  if (param.bias == nullptr) {
+    XPU_CALL(
+        xpu_malloc(reinterpret_cast<void**>(&xpu_bias), c * sizeof(float)));
+    int ret = xdnn::constant<float>(ctx.GetRawContext(), xpu_bias, c, 0.0f);
+    CHECK_EQ(ret, 0);
+  }
   int ret = xdnn::instance_norm<float>(
       ctx.GetRawContext(),
       param.x->data<float>(),
@@ -40,13 +60,18 @@ void InstanceNormCompute::Run() {
       h,
       w,
       param.epsilon,
-      param.scale->data<float>(),
-      param.bias->data<float>(),
+      (param.scale == nullptr) ? xpu_scale : param.scale->data<float>(),
+      (param.bias == nullptr) ? xpu_bias : param.bias->data<float>(),
       param.saved_mean->mutable_data<float>(TARGET(kXPU)),
       param.saved_variance->mutable_data<float>(TARGET(kXPU)),
       true);
-
   CHECK_EQ(ret, 0);
+  if (xpu_scale != nullptr) {
+    XPU_CALL(xpu_free(xpu_scale));
+  }
+  if (xpu_bias != nullptr) {
+    XPU_CALL(xpu_free(xpu_bias));
+  }
 }
 
 }  // namespace xpu
diff --git a/lite/operators/conv_op.cc b/lite/operators/conv_op.cc
index fa18a384fb1..6aa8f1b9eab 100644
--- a/lite/operators/conv_op.cc
+++ b/lite/operators/conv_op.cc
@@ -34,7 +34,7 @@ bool ConvOpLite::CheckShape() const {
 
   CHECK_EQ_OR_FALSE(in_dims.size(), filter_dims.size());
   CHECK_OR_FALSE(in_dims.size() - param_.strides.size() == 2U);
-  CHECK_EQ_OR_FALSE(filter_dims.size(), 4UL);
+  CHECK_OR_FALSE(filter_dims.size() == 4UL || filter_dims.size() == 5UL);
 
   return true;
 }
@@ -115,4 +115,5 @@ bool ConvOpLite::InferShapeImpl() const {
 }  // namespace paddle
 
 REGISTER_LITE_OP(conv2d, paddle::lite::operators::ConvOpLite);
+REGISTER_LITE_OP(conv3d, paddle::lite::operators::ConvOpLite);
 REGISTER_LITE_OP(depthwise_conv2d, paddle::lite::operators::ConvOpLite);
diff --git a/lite/operators/conv_op.h b/lite/operators/conv_op.h
old mode 100755
new mode 100644
index 51f13366fa9..275c14ff8f1
--- a/lite/operators/conv_op.h
+++ b/lite/operators/conv_op.h
@@ -215,14 +215,14 @@ class ConvOpLite : public OpLite {
     }
 #endif
 
-    // 2-pad to 4-pad
-    if (paddings.size() == 2L) {
+    // conv3d: 3-pad to 6-pad, or conv2d: 2-pad to 4-pad
+    if (paddings.size() == 2L || paddings.size() == 3L) {
       for (size_t i = 0; i < param_.strides.size(); ++i) {
         int copy_pad = *(paddings.begin() + 2 * i);
         paddings.insert(paddings.begin() + 2 * i + 1, copy_pad);
       }
     } else {
-      if (paddings.size() != 4L) {
+      if (paddings.size() != 4L && paddings.size() != 6L) {
         LOG(FATAL)
             << "Paddings size should be the same or twice as the input size.";
       }
diff --git a/lite/tests/kernels/conv_transpose_compute_test.cc b/lite/tests/kernels/conv_transpose_compute_test.cc
index ab3defd8c85..adf4ec80f6b 100644
--- a/lite/tests/kernels/conv_transpose_compute_test.cc
+++ b/lite/tests/kernels/conv_transpose_compute_test.cc
@@ -425,6 +425,15 @@ TEST(Conv_transpose, precision) {
 #elif defined(LITE_WITH_NPU)
   place = TARGET(kNPU);
   abs_error = 5e-2;  // Using fp16 in NPU
+#elif defined(LITE_WITH_XPU) && !defined(LITE_WITH_XTCL)
+  place = TARGET(kXPU);
+  abs_error = 5e-4;
+  TestConvTransposeKsize(place, abs_error);
+  TestConvTransposeStrides(place, abs_error);
+  TestConvTransposePaddings(place, abs_error);
+  TestConvTransposeGroups(place, abs_error);
+  TestConvTransposeOutputPadding(place, abs_error);
+  return;
 #elif defined(LITE_WITH_ARM)
   place = TARGET(kARM);
   TestConvTransposeOutputPadding(place, abs_error);
diff --git a/lite/tests/kernels/instance_norm_compute_test.cc b/lite/tests/kernels/instance_norm_compute_test.cc
index 759ecd5e635..bfe96419bbb 100644
--- a/lite/tests/kernels/instance_norm_compute_test.cc
+++ b/lite/tests/kernels/instance_norm_compute_test.cc
@@ -187,6 +187,8 @@ TEST(InstanceNorm, precision) {
   place = TARGET(kNPU);
   abs_error = 1e-2;  // Using fp16 in NPU
   ignored_outs = {"saved_mean", "saved_variance"};
+#elif defined(LITE_WITH_XPU) && !defined(LITE_WITH_XTCL)
+  place = TARGET(kXPU);
 #elif defined(LITE_WITH_ARM)
   place = TARGET(kARM);
 #elif defined(LITE_WITH_X86)