Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[OpenCL][Buffer]Add reshape buffer opencl #9193

Merged
merged 4 commits into from
Aug 11, 2022
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions lite/kernels/opencl/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -178,6 +178,7 @@ add_kernel(slice_opencl_buffer OPENCL basic SRCS slice_buffer_compute.cc)
add_kernel(yolo_box_opencl_buffer OPENCL basic SRCS yolo_box_buffer_compute.cc)
add_kernel(squeeze_unsqueeze_opencl_buffer OPENCL basic SRCS squeeze_unsqueeze_buffer_compute.cc)
add_kernel(matmul_opencl_buffer OPENCL basic SRCS matmul_buffer_compute.cc)
add_kernel(reshape_opencl_buffer OPENCL basic SRCS reshape_buffer_compute.cc)
# extra
# wait to add ...

Expand Down Expand Up @@ -215,3 +216,6 @@ lite_cc_test(test_fc_buffer_opencl SRCS fc_buffer_compute_test.cc

lite_cc_test(test_io_copy_buffer_opencl SRCS io_copy_buffer_compute_test.cc
DEPS kernels core)

lite_cc_test(test_reshape_buffer_opencl SRCS reshape_buffer_compute_test.cc
DEPS kernels core)
9 changes: 7 additions & 2 deletions lite/kernels/opencl/fc_image_compute.cc
Original file line number Diff line number Diff line change
Expand Up @@ -155,8 +155,13 @@ class FcImageCompute : public KernelLite<TARGET(kOpenCL),
CHECK_GE(w_dims.size(), 2UL);
CHECK_LE(param.output->dims().size(), 4UL);

m_ = x_dims.Slice(0, param.in_num_col_dims).production();
k_ = x_dims.Slice(param.in_num_col_dims, x_dims.size()).production();
int in_num_col_dims = param.in_num_col_dims;
std::string op_type = param.op_type;
if (op_type == "matmul" || op_type == "matmul_v2") {
in_num_col_dims = x_dims.size() - 1;
}
m_ = x_dims.Slice(0, in_num_col_dims).production();
k_ = x_dims.Slice(in_num_col_dims, x_dims.size()).production();
n_ = w_dims[1];
CHECK_EQ(k_, static_cast<int>(w_dims[0]));
k_blks_ = UP_DIV(k_, 4);
Expand Down
114 changes: 114 additions & 0 deletions lite/kernels/opencl/reshape_buffer_compute.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "lite/backends/opencl/cl_half.h"
#include "lite/backends/opencl/cl_include.h"
#include "lite/core/kernel.h"
#include "lite/core/op_registry.h"
#include "lite/kernels/opencl/image_helper.h"
#include "lite/operators/op_params.h"
#include "lite/utils/log/logging.h"
#include "lite/utils/replace_stl/stream.h"
#ifdef LITE_WITH_PROFILE
#include "lite/core/profile/profiler.h"
#endif
#include "lite/backends/opencl/cl_utility.h"

#undef LITE_WITH_LOG

namespace paddle {
namespace lite {
namespace kernels {
namespace opencl {

class ReshapeComputeFloatBuffer
: public KernelLite<TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)> {
public:
using param_t = operators::ReshapeParam;

// Reshape on buffers is metadata-only, so no CL kernel needs to be built.
// The As<OpenCLContext>() call is kept (without the unused binding) because
// it may lazily initialize the context variant — TODO confirm it is needed.
void PrepareForRun() override { ctx_->As<OpenCLContext>(); }

// Aliases (inplace) or copies the input buffer into the output, then
// restores the output's dims/lod, which ShareDataWith/CopyDataFrom clobber
// with the input tensor's metadata.
void Run() override {
auto& param = *param_.get_mutable<param_t>();
const Tensor* const x = param.x;
Tensor* output = param.output;

// Save the already-inferred output shape/lod before they are overwritten.
auto output_dims = output->dims();
auto output_lod = output->lod();
if (param.inplace) {
// Zero-copy path: output shares the input's device buffer.
output->ShareDataWith(*x);
} else {
output->CopyDataFrom(*x);
}
// Restore the reshaped dims and lod on the output tensor.
output->Resize(output_dims);
output->set_lod(output_lod);

#ifdef LITE_WITH_LOG
VLOG(4) << TargetToStr(x->target());
VLOG(4) << TargetToStr(param.output->target());
#endif
}

private:
std::string time_stamp_{GetTimeStamp()};
};

} // namespace opencl
} // namespace kernels
} // namespace lite
} // namespace paddle

// Register the buffer-based reshape kernel. Shape inputs live on the host
// (kHost/kInt32); X and Out are OpenCL device buffers in NCHW layout.
REGISTER_LITE_KERNEL(reshape,
kOpenCL,
kFloat,
kNCHW,
paddle::lite::kernels::opencl::ReshapeComputeFloatBuffer,
def)
.BindInput("X",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kAny),
DATALAYOUT(kNCHW))})
.BindInput("ShapeTensor",
{LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt32))})
.BindInput("Shape",
{LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt32))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kAny),
DATALAYOUT(kNCHW))})
.Finalize();

// reshape2 variant: identical to reshape but additionally emits XShape
// (the input's original shape) as a host-side int32 tensor.
REGISTER_LITE_KERNEL(reshape2,
kOpenCL,
kFloat,
kNCHW,
paddle::lite::kernels::opencl::ReshapeComputeFloatBuffer,
def)
.BindInput("X",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kAny),
DATALAYOUT(kNCHW))})
.BindInput("ShapeTensor",
{LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt32))})
.BindInput("Shape",
{LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt32))})
.BindOutput("XShape",
{LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt32))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kOpenCL),
PRECISION(kAny),
DATALAYOUT(kNCHW))})
.Finalize();

#define LITE_WITH_LOG
221 changes: 221 additions & 0 deletions lite/kernels/opencl/reshape_buffer_compute_test.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,221 @@
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <gtest/gtest.h>
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

opencl 的单测能复用lite/tests/kernels下面的吗? (Translation: Can the OpenCL unit tests reuse the ones under lite/tests/kernels?)

#include <random>
#include "lite/backends/opencl/target_wrapper.h"
#include "lite/core/op_registry.h"
#include "lite/core/tensor.h"
#include "lite/kernels/opencl/test_helper.h"
#include "lite/operators/reshape_op.h"
#include "lite/utils/log/logging.h"

#define FP16_MAX_DIFF (5e-1)

namespace paddle {
namespace lite {
namespace kernels {
namespace opencl {
// Resolve the concrete output dims of a reshape from the raw `shape`
// attribute and the input dims (mirrors the reshape op's shape inference):
//   * -1 marks at most one dimension inferred from the remaining capacity;
//   *  0 copies the dimension at the same index from the input shape;
//   *  any other value must be positive and is used verbatim.
static DDim ValidateShape(const std::vector<int>& shape,
const DDim& input_dims) {
const lite::DDim::value_type input_size = input_dims.production();
auto input_shape = input_dims.Vectorize();
bool all_positive = std::all_of(
input_shape.cbegin(), input_shape.cend(), [](lite::DDim::value_type i) {
return i > 0;
});
// only one dimension can be set to -1, whose size will be automatically
// inferred.
const int unk_dim_val = -1;
const int copy_dim_val = 0;

std::vector<lite::DDim::value_type> output_shape(shape.size(), 0);
lite::DDim::value_type capacity = 1;
int unk_dim_idx = -1;
for (size_t i = 0; i < shape.size(); ++i) {
if (shape[i] == unk_dim_val) {
CHECK_EQ(unk_dim_idx, -1)
<< "Only one input dimension of Attr(shape) can be unknown.";
// Explicit cast: i is size_t, unk_dim_idx is int (no narrowing warning).
unk_dim_idx = static_cast<int>(i);
} else if (shape[i] == copy_dim_val) {
// Cast both sides to int to avoid a signed/unsigned comparison.
CHECK_LT(static_cast<int>(i), static_cast<int>(input_shape.size()))
<< "The index of dimension to copy from input shape must be less "
"than the size of input shape.";
} else {
CHECK_GT(shape[i], 0) << "Each input dimension of Attr(shape) must not "
"be negative except one unknown dimension.";
}

// 0 contributes the input dim at this index; -1 contributes its raw value
// so `capacity` is (-1) * product of the known output dims.
capacity *= (shape[i] ? static_cast<lite::DDim::value_type>(shape[i])
: input_shape[i]);
output_shape[i] = (shape[i] ? static_cast<lite::DDim::value_type>(shape[i])
: input_shape[i]);
}

if (unk_dim_idx != -1) {
if (all_positive) {
// input_size < 0 and is un-determinate in compile time, skip the check,
// for example, input_dims = [-1, 8, 1, 1], shape = [-1, 3, 8],
// capacity = -24, input_size = -8, output_shape[0] = 0
// the following check will fail.
output_shape[unk_dim_idx] = -input_size / capacity;
CHECK_EQ(output_shape[unk_dim_idx] * capacity, -input_size)
<< "Invalid shape is given.";
} else {
output_shape[unk_dim_idx] = -1;
}
} else {
CHECK_EQ(capacity, input_size) << "Invalid shape is given.";
}
return lite::DDim(output_shape);
}

// End-to-end check of the buffer reshape kernel: fill a [15,1,2,3] input,
// reshape to [1,15,6] via shape_tensor, and verify the output buffer holds
// the same elements in the same order (reshape is a pure metadata op).
TEST(reshape_opencl, compute) {
LOG(INFO) << "to get kernel ...";
auto kernels = KernelRegistry::Global().Create(
"reshape", TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW));
ASSERT_FALSE(kernels.empty());
auto kernel = std::move(kernels.front());
lite_api::CLPrecisionType p = lite_api::CLPrecisionType::CL_PRECISION_FP16;
CLRuntime::Global()->set_precision(p);
const bool fp16_flag = (p == lite_api::CLPrecisionType::CL_PRECISION_FP16);
LOG(INFO) << "created reshape kernel";

LOG(INFO) << "prepare kernel ------";

int64_t batch_size = 15;
int64_t ic = 1;
int64_t ih = 2;
int64_t iw = 3;

lite::Tensor input, output, input_h;

operators::ReshapeParam param;

// Target shape [1, 15, 6] — same element count (90) as the input.
Tensor shape_tensor;
shape_tensor.Resize({3});
auto* shape_tensor_data = shape_tensor.mutable_data<int>();
shape_tensor_data[0] = 1;
shape_tensor_data[1] = 15;
shape_tensor_data[2] = 6;

// fp16 runs feed the half tensor, fp32 runs the float tensor; the rest of
// the param setup is identical, so collapse the duplicated branches.
param.x = fp16_flag ? &input_h : &input;
param.shape_tensor = &shape_tensor;  // use shape_tensor
param.inplace = true;
param.output = &output;

const DDim input_dim =
lite::DDim{std::vector<int64_t>({batch_size, ic, ih, iw})};
input.Resize(input_dim);
input_h.Resize(input_dim);

std::vector<int> final_shape = std::vector<int>(
shape_tensor_data, shape_tensor_data + shape_tensor.numel());
LOG(INFO) << "shape_tensor.numel() " << shape_tensor.numel();
auto out_dim = ValidateShape(final_shape, input_dim);
param.output->Resize(out_dim);
LOG(INFO) << " out_dim------" << out_dim;

LOG(INFO) << "prepare kernel SetParam------";
kernel->SetParam(param);
std::unique_ptr<KernelContext> context(new KernelContext);
context->As<OpenCLContext>().InitOnce();
kernel->SetContext(std::move(context));

auto* input_data_h =
input_h.mutable_data<half_t, cl::Buffer>(TARGET(kOpenCL));
auto* input_data = input.mutable_data<float, cl::Buffer>(TARGET(kOpenCL));

std::default_random_engine engine;
std::uniform_real_distribution<float> dist(-5, 5);
LOG(INFO) << "gen input ...";
std::vector<float> x_source(input_dim.production());
std::vector<half_t> x_source_half(input_dim.production());
// Integer-valued samples so fp16 round-trips are exact. int64_t index
// matches production()'s type (no signed/unsigned comparison).
for (int64_t i = 0; i < input_dim.production(); ++i) {
x_source[i] = static_cast<int>(dist(engine));
x_source_half[i] = Float2Half(x_source[i]);
}

size_t x_size = input_dim.production() * sizeof(float);
if (fp16_flag) {
x_size = input_dim.production() * sizeof(half_t);
TargetWrapperCL::MemcpySync(
input_data_h, x_source_half.data(), x_size, IoDirection::HtoD);
} else {
TargetWrapperCL::MemcpySync(
input_data, x_source.data(), x_size, IoDirection::HtoD);
}

kernel->Launch();
CLRuntime::Global()->command_queue().finish();
auto* y_buffer = fp16_flag ? output.data<half_t, cl::Buffer>()
: output.data<float, cl::Buffer>();
std::vector<float> out_data_from_gpu(out_dim.production());
std::vector<float> output_half2float(out_dim.production());
std::vector<half_t> out_data_from_gpu_half(out_dim.production());
if (fp16_flag) {
TargetWrapperCL::MemcpySync(out_data_from_gpu_half.data(),
y_buffer,
out_data_from_gpu_half.size() * sizeof(half_t),
IoDirection::DtoH);
// Convert only when fp16 data was actually read back; the original ran
// this loop unconditionally over a zero-filled buffer in fp32 mode.
for (int64_t eidx = 0; eidx < out_dim.production(); ++eidx) {
output_half2float[eidx] = Half2Float(out_data_from_gpu_half[eidx]);
}
} else {
TargetWrapperCL::MemcpySync(out_data_from_gpu.data(),
y_buffer,
out_data_from_gpu.size() * sizeof(float),
IoDirection::DtoH);
}

// check output dims: must match the requested shape element-wise.
for (size_t i = 0; i < output.dims().size(); i++) {
CHECK_EQ(output.dims()[i], shape_tensor_data[i]);
}

// check output data: reshape must not alter element values or order.
for (int64_t i = 0; i < output.numel(); i++) {
auto out_gpu_data =
fp16_flag ? output_half2float[i] : out_data_from_gpu[i];
// std::fabs: unqualified abs() can bind to the C int overload and
// truncate the fractional part of the difference.
auto abs_diff = std::fabs(out_gpu_data - x_source[i]);
auto relative_diff = COMPUTE_RELATIVE_DIFF(out_gpu_data, x_source[i]);
EXPECT_EQ((relative_diff <= FP16_MAX_DIFF) || (abs_diff <= FP16_MAX_DIFF),
true);
if ((relative_diff > FP16_MAX_DIFF) && (abs_diff > FP16_MAX_DIFF)) {
LOG(ERROR) << "error idx:" << i << " out_gpu_data[" << i
<< "]:" << out_gpu_data << " "
"input_data["
<< i << "]:" << x_source[i] << " abs_diff:" << abs_diff
<< " relative_diff:" << relative_diff
<< " FP16_MAX_DIFF:" << FP16_MAX_DIFF;
}
}
}

} // namespace opencl
} // namespace kernels
} // namespace lite
} // namespace paddle

// Force-link the buffer reshape kernels so the registry Create() call in
// this test can find them.
USE_LITE_KERNEL(reshape, kOpenCL, kFloat, kNCHW, def);
USE_LITE_KERNEL(reshape2, kOpenCL, kFloat, kNCHW, def);