[OpenCL][Kernel] Support OpenCL layer_norm #8648

Merged 2 commits on Mar 16, 2022
128 changes: 128 additions & 0 deletions lite/backends/opencl/cl_kernel/image/layer_norm_kernel.cl
@@ -0,0 +1,128 @@
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include <cl_common.h>

__kernel void layer_norm_batch(__read_only image2d_t input,
                               __write_only image2d_t output,
#ifdef SCALE
                               __global const CL_DTYPE* scale,
#endif
#ifdef BIAS
                               __global const CL_DTYPE* bias,
#endif
                               __private const int batch_size,
                               __private const int feature_size,
                               __private const int height,
                               __private const int image_w,
                               __private const int width,
                               __private const float epsilon) {
  const int out_c = get_global_id(0);
  const int out_w = get_global_id(1);
  const int out_nh = get_global_id(2);

  int out_h = out_nh % height;
  int out_n = out_nh / height;

  // Accumulate the sum and squared sum over the whole C*H*W slice of
  // sample out_n; each texel packs four consecutive channel values.
  CL_DTYPE avg = (CL_DTYPE)(0.0f);
  CL_DTYPE var = (CL_DTYPE)(0.0f);
  for (int h = 0; h < height; ++h) {
    for (int w = 0; w < image_w; ++w) {
      CL_DTYPE4 in = READ_IMG_TYPE(
          CL_DTYPE_CHAR, input, SAMPLER, (int2)(w, out_n * height + h));
      avg += (in.x + in.y + in.z + in.w);
      var += (in.x * in.x + in.y * in.y + in.z * in.z + in.w * in.w);
    }
  }
  // Single-pass moments: Var[x] = E[x^2] - (E[x])^2.
  avg /= feature_size;
  var = var / feature_size - avg * avg;
  // Reuse var to hold the standard deviation.
  var = sqrt(var + epsilon);

  int2 pos;
  pos.x = out_c * width + out_w;
  pos.y = out_nh;
  CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, SAMPLER, pos);
  CL_DTYPE4 out = (in - avg) / var;
#ifdef SCALE
  // scale/bias are CHW buffers; gather the four channel values that are
  // packed into this texel.
  int index0 = out_c * 4 * height * width + out_h * width + out_w;
  int index1 = (out_c * 4 + 1) * height * width + out_h * width + out_w;
  int index2 = (out_c * 4 + 2) * height * width + out_h * width + out_w;
  int index3 = (out_c * 4 + 3) * height * width + out_h * width + out_w;
  out *=
      (CL_DTYPE4)(scale[index0], scale[index1], scale[index2], scale[index3]);
#endif
#ifdef BIAS
  int ind0 = out_c * 4 * height * width + out_h * width + out_w;
  int ind1 = (out_c * 4 + 1) * height * width + out_h * width + out_w;
  int ind2 = (out_c * 4 + 2) * height * width + out_h * width + out_w;
  int ind3 = (out_c * 4 + 3) * height * width + out_h * width + out_w;
  out += (CL_DTYPE4)(bias[ind0], bias[ind1], bias[ind2], bias[ind3]);
#endif
  WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, pos, out);
}

__kernel void layer_norm_chann(__read_only image2d_t input,
                               __write_only image2d_t output,
#ifdef SCALE
                               __global const CL_DTYPE* scale,
#endif
#ifdef BIAS
                               __global const CL_DTYPE* bias,
#endif
                               __private const int batch_size,
                               __private const int feature_size,
                               __private const int height,
                               __private const int image_w,
                               __private const int width,
                               __private const float epsilon) {
  const int out_c = get_global_id(0);
  const int out_w = get_global_id(1);
  const int out_nh = get_global_id(2);

  int out_h = out_nh % height;
  int out_n = out_nh / height;

  // Normalize over the H*W slice of each channel (begin_norm_axis == 2);
  // each lane of the CL_DTYPE4 accumulator tracks one channel of the block.
  CL_DTYPE4 avg = (CL_DTYPE4)(0.0f);
  CL_DTYPE4 var = (CL_DTYPE4)(0.0f);
  for (int h = 0; h < height; ++h) {
    for (int w = 0; w < width; ++w) {
      CL_DTYPE4 in =
          READ_IMG_TYPE(CL_DTYPE_CHAR,
                        input,
                        SAMPLER,
                        (int2)(out_c * width + w, out_n * height + h));
      avg += in;
      var += in * in;
    }
  }
  // Single-pass moments per lane: Var[x] = E[x^2] - (E[x])^2.
  avg /= feature_size;
  var = var / feature_size - avg * avg;
  var = sqrt(var + epsilon);

  int2 pos;
  pos.x = out_c * width + out_w;
  pos.y = out_nh;
  CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, SAMPLER, pos);
  CL_DTYPE4 out = (in - avg) / var;
#ifdef SCALE
  // scale/bias hold one value per spatial position, broadcast across lanes.
  out *= (CL_DTYPE4)(scale[out_h * width + out_w]);
#endif
#ifdef BIAS
  out += (CL_DTYPE4)(bias[out_h * width + out_w]);
#endif
  WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, pos, out);
}
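
Both kernels estimate the moments in a single pass using Var[x] = E[x^2] - (E[x])^2, then apply (x - mean) / sqrt(var + epsilon) with the optional affine scale and bias. For comparison, here is a minimal host-side reference of the same computation; `LayerNormRef` is a hypothetical helper, not part of this PR, and assumes float input flattened to [outer, feature_size] with feature_size matching the kernel argument (C*H*W for begin_norm_axis == 1, H*W per channel for begin_norm_axis == 2):

#include <cmath>
#include <cstddef>

// Illustrative reference only. scale/bias may be null, mirroring the
// -DSCALE / -DBIAS build options of the kernels above.
void LayerNormRef(const float* x, float* y, int outer, int feature_size,
                  const float* scale, const float* bias, float epsilon) {
  for (int i = 0; i < outer; ++i) {
    const float* in = x + static_cast<size_t>(i) * feature_size;
    float* out = y + static_cast<size_t>(i) * feature_size;
    // Single-pass moments, same identity as the kernels:
    // Var[x] = E[x^2] - (E[x])^2.
    float sum = 0.f, sq_sum = 0.f;
    for (int j = 0; j < feature_size; ++j) {
      sum += in[j];
      sq_sum += in[j] * in[j];
    }
    const float mean = sum / feature_size;
    const float var = sq_sum / feature_size - mean * mean;
    const float inv_std = 1.f / std::sqrt(var + epsilon);
    for (int j = 0; j < feature_size; ++j) {
      float v = (in[j] - mean) * inv_std;
      if (scale) v *= scale[j];
      if (bias) v += bias[j];
      out[j] = v;
    }
  }
}

Note that the single-pass E[x^2] - (E[x])^2 form is less numerically stable than a two-pass mean-then-variance computation, which matters at kFP16 precision for large feature_size.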
1 change: 1 addition & 0 deletions lite/kernels/opencl/CMakeLists.txt
@@ -54,6 +54,7 @@ add_kernel(fc_opencl_image OPENCL basic SRCS fc_image_compute.cc)
add_kernel(argmax_opencl_image OPENCL basic SRCS argmax_image_compute.cc)
add_kernel(max_opencl_image OPENCL basic SRCS max_image_compute.cc)
add_kernel(matmul_opencl_image OPENCL basic SRCS matmul_image_compute.cc)
add_kernel(layer_norm_opencl_image OPENCL basic SRCS layer_norm_image_compute.cc)
# extra
# wait to add ...

227 changes: 227 additions & 0 deletions lite/kernels/opencl/layer_norm_image_compute.cc
@@ -0,0 +1,227 @@
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <memory>
#include <string>
#include <vector>
#include "lite/backends/opencl/cl_half.h"
#include "lite/backends/opencl/cl_image_converter.h"
#include "lite/backends/opencl/cl_include.h"
#include "lite/core/kernel.h"
#include "lite/core/op_registry.h"
#include "lite/kernels/opencl/image_helper.h"
#include "lite/operators/op_params.h"
#include "lite/utils/log/logging.h"
#include "lite/utils/replace_stl/stream.h"
#ifdef LITE_WITH_PROFILE
#include "lite/core/profile/profiler.h"
#endif
#include "lite/backends/opencl/cl_utility.h"

namespace paddle {
namespace lite {
namespace kernels {
namespace opencl {

class LayerNormImageCompute : public KernelLite<TARGET(kOpenCL),
                                                PRECISION(kFP16),
                                                DATALAYOUT(kImageDefault)> {
 public:
  using param_t = operators::LayerNormParam;

  std::string doc() const override {
    return "LayerNorm using cl::Image2D(ImageDefault/RGBA), kFP16";
  }

  void PrepareForRun() override {
    layer_norm_param_ = param_.get_mutable<param_t>();
    auto& context = ctx_->As<OpenCLContext>();

    // Pick the kernel variant from the normalization axis: axis 1
    // normalizes over C*H*W per sample, axis 2 over H*W per channel.
    begin_norm_axis = layer_norm_param_->begin_norm_axis;
    VLOG(4) << "begin_norm_axis: " << begin_norm_axis;
    if (begin_norm_axis == 1) {
      kernel_func_name_ = "layer_norm_batch";
    } else if (begin_norm_axis == 2) {
      kernel_func_name_ = "layer_norm_chann";
    } else {
      LOG(FATAL) << "Unsupported begin_norm_axis: " << begin_norm_axis;
    }
    auto* scale = layer_norm_param_->Scale;
    auto* bias = layer_norm_param_->Bias;
    build_options_ += (scale != nullptr) ? " -DSCALE " : "";
    build_options_ += (bias != nullptr) ? " -DBIAS " : "";
    context.cl_context()->AddKernel(kernel_func_name_,
                                    "image/layer_norm_kernel.cl",
                                    build_options_,
                                    time_stamp_);
    VLOG(1) << "kernel_func_name_: " << kernel_func_name_;
    STL::stringstream kernel_key;
    kernel_key << kernel_func_name_ << build_options_ << time_stamp_;
    kernel_ = context.cl_context()->GetKernel(kernel_key.str());

    // Copy scale/bias host weights to device buffers once, at prepare time.
    if (scale != nullptr) {
      VLOG(4) << "scale_dims: " << scale->dims();
      auto* scale_cpu = scale->data<float>();
      scale_gpu_t_ = std::unique_ptr<Tensor>(new Tensor);
      auto* scale_gpu_data =
          scale_gpu_t_->mutable_data(TARGET(kOpenCL), scale->memory_size());
      TargetWrapperCL::MemcpySync(
          scale_gpu_data, scale_cpu, scale->memory_size(), IoDirection::HtoD);
      has_scale_ = true;
    }
    if (bias != nullptr) {
      VLOG(4) << "bias_dims: " << bias->dims();
      auto* bias_cpu = bias->data<float>();
      bias_gpu_t_ = std::unique_ptr<Tensor>(new Tensor);
      auto* bias_gpu_data =
          bias_gpu_t_->mutable_data(TARGET(kOpenCL), bias->memory_size());
      TargetWrapperCL::MemcpySync(
          bias_gpu_data, bias_cpu, bias->memory_size(), IoDirection::HtoD);
      has_bias_ = true;
    }
  }

  void Run() override {
    auto& context = ctx_->As<OpenCLContext>();
    CHECK(context.cl_context() != nullptr);
    auto* x = layer_norm_param_->X;
    auto x_dims = x->dims();
    auto* x_img = GET_DATA_GPU(x);

    auto* y = layer_norm_param_->Y;
    auto* mean = layer_norm_param_->Mean;
    auto* var = layer_norm_param_->Variance;

    auto y_dims = y->dims();
    auto y_image_shape = InitImageDimInfoWith(y_dims);
    auto* y_img = MUTABLE_DATA_GPU(
        y, y_image_shape["width"], y_image_shape["height"], nullptr);

    VLOG(4) << "x_dims: " << x_dims;
    VLOG(4) << "y_dims: " << y_dims;
    VLOG(4) << "mean_dims: " << mean->dims();
    VLOG(4) << "var_dims: " << var->dims();

    // Flatten NCHW to [batch_size, feature_size] around begin_norm_axis;
    // feature_size is the element count each mean/variance pair covers.
    auto matrix_dim = x_dims.Flatten2D(begin_norm_axis);
    int batch_size = matrix_dim[0];
    int feature_size = matrix_dim[1];
    epsilon = layer_norm_param_->epsilon;
    int height = x_dims[2];
    int width = x_dims[3];

    int arg_idx = 0;
    auto default_work_size = DefaultGlobalWorkSize(
        y_dims,
        DDim(std::vector<DDim::value_type>{
            static_cast<int64_t>(y_image_shape["width"]),
            static_cast<int64_t>(y_image_shape["height"])}));
#ifdef LITE_WITH_LOG
    VLOG(4) << "default_work_size: " << default_work_size[0] << ", "
            << default_work_size[1] << ", " << default_work_size[2];
#endif
    auto kernel = kernel_;
    cl_int status = kernel.setArg(arg_idx++, *x_img);
    CL_CHECK_FATAL(status);
    status = kernel.setArg(arg_idx++, *y_img);
    CL_CHECK_FATAL(status);
    // The scale/bias arguments exist only when the kernel was built with
    // -DSCALE / -DBIAS, so this order must mirror the kernel signature.
    if (has_scale_) {
      auto* scale_buffer_p = GET_BUFFER_GPU(scale_gpu_t_);
      status = kernel.setArg(arg_idx++, *scale_buffer_p);
      CL_CHECK_FATAL(status);
    }
    if (has_bias_) {
      auto* bias_buffer_p = GET_BUFFER_GPU(bias_gpu_t_);
      status = kernel.setArg(arg_idx++, *bias_buffer_p);
      CL_CHECK_FATAL(status);
    }
    status = kernel.setArg(arg_idx++, batch_size);
    CL_CHECK_FATAL(status);
    status = kernel.setArg(arg_idx++, feature_size);
    CL_CHECK_FATAL(status);
    status = kernel.setArg(arg_idx++, height);
    CL_CHECK_FATAL(status);
    // The kernel declares image_w as a 32-bit int, so narrow explicitly.
    status =
        kernel.setArg(arg_idx++, static_cast<int>(y_image_shape["width"]));
    CL_CHECK_FATAL(status);
    status = kernel.setArg(arg_idx++, width);
    CL_CHECK_FATAL(status);
    status = kernel.setArg(arg_idx++, epsilon);
    CL_CHECK_FATAL(status);

    auto global_work_size =
        cl::NDRange{static_cast<cl::size_type>(default_work_size[0]),
                    static_cast<cl::size_type>(default_work_size[1]),
                    static_cast<cl::size_type>(default_work_size[2])};

    status = EnqueueNDRangeKernel(context,
                                  kernel,
                                  cl::NullRange,
                                  global_work_size,
                                  cl::NullRange,
                                  nullptr,
                                  event_);
    CL_CHECK_FATAL(status);
#ifdef LITE_WITH_LOG
    VLOG(4) << "global_work_size[3D]: " << global_work_size[0] << " "
            << global_work_size[1] << " " << global_work_size[2];
#endif
  }

#ifdef LITE_WITH_PROFILE
  void SetProfileRuntimeKernelInfo(paddle::lite::profile::OpCharacter* ch) {
    ch->kernel_func_name = kernel_func_name_;
    ch->cl_event =
        event_;  // `event_` defined in `kernel.h`, valid after kernel::Run
  }
#endif

 protected:
  param_t* layer_norm_param_{nullptr};
  cl::Kernel kernel_;
  float epsilon{1e-5f};
  int begin_norm_axis{1};
  std::string kernel_func_name_{""};
  std::string build_options_{""};
  std::string time_stamp_{GetTimeStamp()};
  bool has_scale_{false};
  bool has_bias_{false};
  std::unique_ptr<Tensor> bias_gpu_t_{nullptr};
  std::unique_ptr<Tensor> scale_gpu_t_{nullptr};
};

} // namespace opencl
} // namespace kernels
} // namespace lite
} // namespace paddle

namespace ocl = paddle::lite::kernels::opencl;
REGISTER_LITE_KERNEL(layer_norm,
                     kOpenCL,
                     kFP16,
                     kImageDefault,
                     ocl::LayerNormImageCompute,
                     ImageDefault)
    .BindInput("X",
               {LiteType::GetTensorTy(TARGET(kOpenCL),
                                      PRECISION(kFP16),
                                      DATALAYOUT(kImageDefault))})
    .BindInput("Scale", {LiteType::GetTensorTy(TARGET(kHost))})
    .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kHost))})
    .BindOutput("Y",
                {LiteType::GetTensorTy(TARGET(kOpenCL),
                                       PRECISION(kFP16),
                                       DATALAYOUT(kImageDefault))})
    .BindOutput("Mean", {LiteType::GetTensorTy(TARGET(kHost))})
    .BindOutput("Variance", {LiteType::GetTensorTy(TARGET(kHost))})
    .Finalize();
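
For readers unfamiliar with the ImageDefault layout the bindings above assume: NCHW tensors are packed into a 2-D RGBA image with four channels per texel, which is why the kernels receive both the tensor width and the image width in texels (here image_w = ceil(C/4) * W, assuming Paddle-Lite's default CLImageConverter). A sketch of the coordinate mapping the kernels rely on; `ImagePos` is an illustrative helper, not part of the PR:

#include <utility>

// Maps an NCHW element to its texel under the ImageDefault layout
// (image width = ceil(C/4) * W, image height = N * H); the component
// index within the texel is c % 4.
std::pair<int, int> ImagePos(int n, int c, int h, int w, int H, int W) {
  const int texel_x = (c / 4) * W + w;  // out_c * width + out_w in the kernels
  const int texel_y = n * H + h;        // out_n * height + out_h (== out_nh)
  return {texel_x, texel_y};
}

This matches pos.x = out_c * width + out_w and pos.y = out_nh in both kernels, and explains why layer_norm_batch sweeps w over image_w (all channel blocks) while layer_norm_chann sweeps w over width within a single channel block.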