Skip to content

Commit

Permalink
Fixed a bug in the tile op for large inputs and added an XPU implementation.
Browse files Browse the repository at this point in the history
test=develop
  • Loading branch information
wangbingnan03 committed Jun 8, 2022
1 parent c9d1213 commit 4402f36
Show file tree
Hide file tree
Showing 6 changed files with 128 additions and 3 deletions.
7 changes: 4 additions & 3 deletions lite/kernels/host/tile_compute.cc
Original file line number Diff line number Diff line change
Expand Up @@ -85,9 +85,10 @@ void TileCompute<T, PType>::Run() {
int dst_stride = in_stride[i + 1] * right;
for (int m = 0; m < num; m++) {
for (int j = 0; j < bcast_dims[i]; j++) {
std::memcpy(tmp_dst + j * dst_stride / bcast_dims[i] + m * dst_stride,
tmp_src + m * dst_stride / bcast_dims[i],
dst_stride / bcast_dims[i] * sizeof(T));
std::memcpy(
tmp_dst + j * (dst_stride / bcast_dims[i]) + m * dst_stride,
tmp_src + m * (dst_stride / bcast_dims[i]),
dst_stride / bcast_dims[i] * sizeof(T));
}
}
tmp_src_tensor.CopyDataFrom(tmp_dst_tensor);
Expand Down
1 change: 1 addition & 0 deletions lite/kernels/xpu/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ add_kernel(gru_compute_xpu XPU basic SRCS gru_compute.cc)
add_kernel(gru_unit_compute_xpu XPU basic SRCS gru_unit_compute.cc)
add_kernel(stack_compute_xpu XPU basic SRCS stack_compute.cc)
add_kernel(slice_compute_xpu XPU basic SRCS slice_compute.cc)
# New in this change: registers the XPU tile kernel (lite/kernels/xpu/tile_compute.cc).
add_kernel(tile_compute_xpu XPU basic SRCS tile_compute.cc)
add_kernel(cast_compute_xpu XPU basic SRCS cast_compute.cc)
add_kernel(sequence_topk_avg_pooling_compute_xpu XPU basic SRCS sequence_topk_avg_pooling_compute.cc)
add_kernel(concat_compute_xpu XPU basic SRCS concat_compute.cc)
Expand Down
80 changes: 80 additions & 0 deletions lite/kernels/xpu/tile_compute.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "lite/kernels/xpu/tile_compute.h"
#include <vector>
#include "lite/backends/xpu/xpu_header_sitter.h"
#include "lite/core/op_registry.h"

namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {

// Tile kernel for the XPU target: repeats the input tensor along each
// dimension by the corresponding repeat factor, delegating the actual
// data movement to xdnn::broadcast.
template <typename T, PrecisionType PType>
void TileCompute<T, PType>::Run() {
  auto& param = this->template Param<param_t>();
  auto& ctx = this->ctx_->template As<XPUContext>();

  // Gather repeat factors: start from the attribute, then append factors
  // supplied at runtime through the RepeatTimes tensor, or failing that
  // the repeat_times_tensor list (same precedence as the host kernel).
  auto repeat_times = param.repeat_times;
  if (param.RepeatTimes) {
    auto repeat_times_size = param.RepeatTimes->data_size();
    // Hoist the data pointer out of the loop; use size_t to match
    // data_size() and avoid a signed/unsigned comparison.
    const auto* rt_data = param.RepeatTimes->template data<int>();
    for (size_t i = 0; i < repeat_times_size; i++) {
      repeat_times.push_back(rt_data[i]);
    }
  } else if (!param.repeat_times_tensor.empty()) {
    for (size_t i = 0; i < param.repeat_times_tensor.size(); i++) {
      auto* tensor = param.repeat_times_tensor[i];
      repeat_times.push_back(*(tensor->template data<int>()));
    }
  }

  auto in_dims = param.X->dims();
  auto vec_in_dims = in_dims.Vectorize();
  // Left-pad the shorter of {input dims, repeat factors} with 1s so both
  // have the same rank before computing the output shape.
  if (repeat_times.size() < vec_in_dims.size()) {
    int diff = vec_in_dims.size() - repeat_times.size();
    repeat_times.insert(repeat_times.begin(), diff, 1);
  } else {
    int diff = repeat_times.size() - vec_in_dims.size();
    vec_in_dims.insert(vec_in_dims.begin(), diff, 1);
  }

  std::vector<int> new_in_dims(vec_in_dims.begin(), vec_in_dims.end());
  std::vector<int> out_dims(new_in_dims);
  // Output dim i is input dim i tiled repeat_times[i] times.
  for (size_t i = 0; i < repeat_times.size(); ++i) {
    out_dims[i] *= repeat_times[i];
  }

  int r = xdnn::broadcast<T>(ctx.GetRawContext(),
                             param.X->template data<T>(),
                             param.Out->template mutable_data<T>(TARGET(kXPU)),
                             new_in_dims,
                             out_dims);
  // Surface the xdnn error code instead of failing silently with no context.
  CHECK_EQ(r, 0) << "XPU tile (xdnn::broadcast) failed, error code: " << r;
}

}  // namespace xpu
}  // namespace kernels
}  // namespace lite
}  // namespace paddle

// Register the float32 NCHW tile kernel for the XPU target.
using tile_float =
    paddle::lite::kernels::xpu::TileCompute<float, PRECISION(kFloat)>;
REGISTER_LITE_KERNEL(tile, kXPU, kFloat, kNCHW, tile_float, def)
    // Input data tensor lives on the XPU device.
    .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
    // Repeat factors are int32 tensors bound on the host.
    .BindInput("RepeatTimes",
               {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt32))})
    .BindInput("repeat_times_tensor",
               {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt32))})
    // Output tensor is produced on the XPU device.
    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))})
    .Finalize();
36 changes: 36 additions & 0 deletions lite/kernels/xpu/tile_compute.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once
#include "lite/core/kernel.h"

namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {

/// Tile kernel for the XPU target. Repeats the input tensor along each
/// dimension according to the repeat factors carried in
/// operators::TileParam (attribute, RepeatTimes tensor, or
/// repeat_times_tensor list). Implementation lives in tile_compute.cc.
template <typename T, PrecisionType PType>
class TileCompute : public KernelLite<TARGET(kXPU), PType> {
 public:
  using param_t = operators::TileParam;

  // `override` (instead of re-declaring `virtual`) lets the compiler
  // verify this actually overrides KernelLite::Run.
  void Run() override;

  ~TileCompute() override = default;
};

}  // namespace xpu
}  // namespace kernels
}  // namespace lite
}  // namespace paddle
1 change: 1 addition & 0 deletions lite/operators/tile_op.cc
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,7 @@ bool TileOp::AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) {
} else if (opdesc.HasInput("repeat_times_tensor") &&
(opdesc.Input("repeat_times_tensor").size() != 0)) {
auto temp = opdesc.Input("repeat_times_tensor");
param_.repeat_times_tensor.clear();
for (auto var : temp) {
param_.repeat_times_tensor.push_back(
scope->FindVar(var)->GetMutable<lite::Tensor>());
Expand Down
6 changes: 6 additions & 0 deletions lite/tests/unittest_py/op/test_tile_op.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,12 @@ def __init__(self, *args, **kwargs):
]
self.enable_testing_on_place(places=host_places, thread=[1, 4])

xpu_places = [
Place(TargetType.XPU, PrecisionType.FP32, DataLayoutType.NCHW),
Place(TargetType.Host, PrecisionType.FP32)
]
self.enable_testing_on_place(places=xpu_places)

def is_program_valid(self,
program_config: ProgramConfig,
predictor_config: CxxConfig) -> bool:
Expand Down

0 comments on commit 4402f36

Please sign in to comment.