Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[OpenCL]add adreno conv1x1 h1w2c1 and h2w2c1 #6112

Merged
merged 1 commit into from
May 20, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
224 changes: 224 additions & 0 deletions lite/backends/opencl/cl_kernel/image/conv2d_1x1_opt_kernel.cl
Original file line number Diff line number Diff line change
Expand Up @@ -637,6 +637,106 @@ CL_DTYPE4 alpha0,alpha1,alpha2,alpha3;
}
}

__kernel void conv2d_1x1_h1w2c1(
__private const int global_size_dim0,
__private const int global_size_dim1,
__private const int global_size_dim2,
__read_only image2d_t input_image,
__read_only image2d_t filter,
__read_only image2d_t bias,
__write_only image2d_t output_image,
__private const int stride,
__private const int offset,
__private const int input_c,
__private const int input_c_origin,
__private const int dilation,
__private const int input_width, /* of one block */
__private const int input_height, /* of one block */
__private const int output_width,
__private const int output_height,
__private const int old_w,
__read_only image2d_t prelu_alpha) {
const int out_c = get_global_id(0);
const int out_w = get_global_id(1);
const int out_nh = get_global_id(2);

int out_w0 = out_w;
int out_w1 = out_w + global_size_dim1;

int outpos_main = mul24(out_c, old_w);
int2 output_pos0 = (int2)(outpos_main + out_w0, out_nh);
int2 output_pos1 = (int2)(outpos_main + out_w1, out_nh);

int in_pos_x0 = out_w0 * stride + offset;
int in_pos_x1 = out_w1 * stride + offset;
int in_pos_y = out_nh * stride + offset;

#ifdef BIASE_CH
CL_DTYPE4 output0 = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, SAMPLER, (int2)(out_c, 0));
CL_DTYPE4 output1 = output0;
#else
CL_DTYPE4 output0 = 0.0f;
CL_DTYPE4 output1 = 0.0f;
#endif

for (int i = 0; i < input_c; ++i) {
CL_DTYPE4 weight0 = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, SAMPLER, (int2)(out_c, (i << 2)));
CL_DTYPE4 weight1 = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, SAMPLER, (int2)(out_c, (i << 2) + 1));
CL_DTYPE4 weight2 = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, SAMPLER, (int2)(out_c, (i << 2) + 2));
CL_DTYPE4 weight3 = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, SAMPLER, (int2)(out_c, (i << 2) + 3));

CL_DTYPE4 input0 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, (int2)(i * input_width + in_pos_x0, in_pos_y));
CL_DTYPE4 input1 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, (int2)(i * input_width + in_pos_x1, in_pos_y));

output0 = mad(input0.x, weight0, output0);
output0 = mad(input0.y, weight1, output0);
output0 = mad(input0.z, weight2, output0);
output0 = mad(input0.w, weight3, output0);

output1 = mad(input1.x, weight0, output1);
output1 = mad(input1.y, weight1, output1);
output1 = mad(input1.z, weight2, output1);
output1 = mad(input1.w, weight3, output1);
}

CL_DTYPE4 alpha0,alpha1;
#ifdef PRELU_CH //{
alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(out_c, 0));
alpha1 = alpha0;
//}
#elif defined(PRELU_ELE) //{
if (out_w0 < old_w) {
alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, output_pos0);
}
if (out_w1 < old_w) {
alpha1 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, output_pos1);
}
//}
#elif defined(PRELU_ALL) //{
alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(0, 0));
alpha0.y = alpha0.x;
alpha0.z = alpha0.x;
alpha0.w = alpha0.x;
alpha1 = alpha0;
//}
#endif
output0 = activation_type4(output0, alpha0);
output1 = activation_type4(output1, alpha1);

#ifdef SCALE_ACTIVATION
output0 = fuse_scale(output0, 1.f, 0.f, 0.f);
output1 = fuse_scale(output1, 1.f, 0.f, 0.f);
#endif

if (out_w0 < old_w) {
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos0, output0);
}

if (out_w1 < old_w) {
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos1, output1);
}
}

__kernel void conv2d_1x1_h1w5c1(
__private const int global_size_dim0,
__private const int global_size_dim1,
Expand Down Expand Up @@ -1013,6 +1113,130 @@ CL_DTYPE4 alpha0,alpha1,alpha2,alpha3, alpha4, alpha5, alpha6;
}
}

__kernel void conv2d_1x1_h2w2c1(
__private const int global_size_dim0,
__private const int global_size_dim1,
__private const int global_size_dim2,
__read_only image2d_t input_image,
__read_only image2d_t filter,
__read_only image2d_t bias,
__write_only image2d_t output_image,
__private const int stride,
__private const int offset,
__private const int input_c,
__private const int input_c_origin,
__private const int dilation,
__private const int input_width, /* of one block */
__private const int input_height, /* of one block */
__private const int output_width,
__private const int output_height,
__private const int old_w,
__read_only image2d_t prelu_alpha) {
const int out_c = get_global_id(0);
const int out_w = get_global_id(1) * 2;
const int out_nh = get_global_id(2) * 2;

int in_pos_w0 = out_w * stride + offset;
int in_pos_w1 = (out_w + 1) * stride + offset;
int in_pos_h0 = out_nh * stride + offset;
int in_pos_h1 = (out_nh + 1) * stride + offset;

#ifdef BIASE_CH
CL_DTYPE4 out_w0_h0_c0 = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, SAMPLER, (int2)(out_c, 0));
CL_DTYPE4 out_w1_h0_c0 = out_w0_h0_c0;
CL_DTYPE4 out_w0_h1_c0 = out_w0_h0_c0;
CL_DTYPE4 out_w1_h1_c0 = out_w0_h0_c0;
#else
CL_DTYPE4 out_w0_h0_c0 = 0.0f;
CL_DTYPE4 out_w1_h0_c0 = 0.0f;
CL_DTYPE4 out_w0_h1_c0 = 0.0f;
CL_DTYPE4 out_w1_h1_c0 = 0.0f;
#endif
if (out_w >= output_width || out_nh >= output_height || out_c >= global_size_dim0) {
return;
}
for (int i = 0; i < input_c; ++i) {
CL_DTYPE4 f0 = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, SAMPLER, (int2)(out_c, (i << 2)));
CL_DTYPE4 f1 = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, SAMPLER, (int2)(out_c, (i << 2) + 1));
CL_DTYPE4 f2 = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, SAMPLER, (int2)(out_c, (i << 2) + 2));
CL_DTYPE4 f3 = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, SAMPLER, (int2)(out_c, (i << 2) + 3));

CL_DTYPE4 src_w0_h0 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, (int2)(i * input_width + in_pos_w0, in_pos_h0));
CL_DTYPE4 src_w1_h0 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, (int2)(i * input_width + in_pos_w1, in_pos_h0));
CL_DTYPE4 src_w0_h1 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, (int2)(i * input_width + in_pos_w0, in_pos_h1));
CL_DTYPE4 src_w1_h1 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, (int2)(i * input_width + in_pos_w1, in_pos_h1));

out_w0_h0_c0 = mad(f0, src_w0_h0.x, out_w0_h0_c0);
out_w1_h0_c0 = mad(f0, src_w1_h0.x, out_w1_h0_c0);
out_w0_h1_c0 = mad(f0, src_w0_h1.x, out_w0_h1_c0);
out_w1_h1_c0 = mad(f0, src_w1_h1.x, out_w1_h1_c0);
out_w0_h0_c0 = mad(f1, src_w0_h0.y, out_w0_h0_c0);
out_w1_h0_c0 = mad(f1, src_w1_h0.y, out_w1_h0_c0);
out_w0_h1_c0 = mad(f1, src_w0_h1.y, out_w0_h1_c0);
out_w1_h1_c0 = mad(f1, src_w1_h1.y, out_w1_h1_c0);
out_w0_h0_c0 = mad(f2, src_w0_h0.z, out_w0_h0_c0);
out_w1_h0_c0 = mad(f2, src_w1_h0.z, out_w1_h0_c0);
out_w0_h1_c0 = mad(f2, src_w0_h1.z, out_w0_h1_c0);
out_w1_h1_c0 = mad(f2, src_w1_h1.z, out_w1_h1_c0);
out_w0_h0_c0 = mad(f3, src_w0_h0.w, out_w0_h0_c0);
out_w1_h0_c0 = mad(f3, src_w1_h0.w, out_w1_h0_c0);
out_w0_h1_c0 = mad(f3, src_w0_h1.w, out_w0_h1_c0);
out_w1_h1_c0 = mad(f3, src_w1_h1.w, out_w1_h1_c0);
}

CL_DTYPE4 alpha0, alpha1, alpha2, alpha3, alpha4, alpha5, alpha6, alpha7;
#ifdef PRELU_CH //{
alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(out_c, 0));
alpha1 = alpha0;
alpha2 = alpha0;
alpha3 = alpha0;
//}
#elif defined(PRELU_ELE) //{
alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(out_c * old_w + out_w, out_nh));
if (out_w + 1 < output_width) {
alpha1 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(out_c * old_w + out_w + 1, out_nh));
}
if (out_nh + 1 < output_height) {
alpha2 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(out_c * old_w + out_w , out_nh + 1));
}
if (out_w + 1 < output_width && out_nh + 1 < output_height) {
alpha3 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(out_c * old_w + out_w + 1, out_nh + 1));
}
//}
#elif defined(PRELU_ALL) //{
alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(0, 0));
alpha0.y = alpha0.x;
alpha0.z = alpha0.x;
alpha0.w = alpha0.x;
alpha1 = alpha0;
alpha2 = alpha0;
alpha3 = alpha0;
//}
#endif
out_w0_h0_c0 = activation_type4(out_w0_h0_c0, alpha0);
out_w1_h0_c0 = activation_type4(out_w1_h0_c0, alpha1);
out_w0_h1_c0 = activation_type4(out_w0_h1_c0, alpha2);
out_w1_h1_c0 = activation_type4(out_w1_h1_c0, alpha3);

#ifdef SCALE_ACTIVATION
out_w0_h0_c0 = fuse_scale(out_w0_h0_c0, 1.f, 0.f, 0.f);
out_w1_h0_c0 = fuse_scale(out_w1_h0_c0, 1.f, 0.f, 0.f);
out_w0_h1_c0 = fuse_scale(out_w0_h1_c0, 1.f, 0.f, 0.f);
out_w1_h1_c0 = fuse_scale(out_w1_h1_c0, 1.f, 0.f, 0.f);
#endif

WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, (int2)(out_c * old_w + out_w, out_nh), out_w0_h0_c0);
if (out_w + 1 < output_width) {
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, (int2)(out_c * old_w + out_w + 1, out_nh), out_w1_h0_c0);
}
if (out_nh + 1 < output_height) {
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, (int2)(out_c * old_w + out_w, out_nh + 1), out_w0_h1_c0);
}
if (out_w + 1 < output_width && out_nh + 1 < output_height) {
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, (int2)(out_c * old_w + out_w + 1, out_nh + 1), out_w1_h1_c0);
}
}

__kernel void conv2d_1x1_h2w2c2(
__private const int global_size_dim0,
__private const int global_size_dim1,
Expand Down
27 changes: 26 additions & 1 deletion lite/kernels/opencl/conv_image_compute.cc
Original file line number Diff line number Diff line change
Expand Up @@ -688,7 +688,7 @@ void ConvImageCompute::SetLocalWorkSize(size_t repeats /*=4*/) {
double final_lws_time = DBL_MAX;
auto& context = ctx_->As<OpenCLContext>();
std::stringstream kernel_key;
for (size_t i = 0; i < 4; i++) {
for (size_t i = 0; i < 6; i++) {
if (i == 1) {
kernel_func_names_[0] = "conv2d_1x1_h1w5c1";
global_work_size_ =
Expand Down Expand Up @@ -722,6 +722,28 @@ void ConvImageCompute::SetLocalWorkSize(size_t repeats /*=4*/) {
build_options_[0],
time_stamp_);
}
if (i == 4) {
kernel_func_names_[0] = "conv2d_1x1_h1w2c1";
global_work_size_ =
cl::NDRange{static_cast<size_t>(default_c_blk_),
static_cast<size_t>(UP_DIV(default_w_blk_, 2)),
static_cast<size_t>(default_nh_blk_)};
context.cl_context()->AddKernel(kernel_func_names_[0],
kernel_func_paths_[0],
build_options_[0],
time_stamp_);
}
if (i == 5) {
kernel_func_names_[0] = "conv2d_1x1_h2w2c1";
global_work_size_ =
cl::NDRange{static_cast<size_t>(default_c_blk_),
static_cast<size_t>(UP_DIV(default_w_blk_, 2)),
static_cast<size_t>(UP_DIV(default_nh_blk_, 2))};
context.cl_context()->AddKernel(kernel_func_names_[0],
kernel_func_paths_[0],
build_options_[0],
time_stamp_);
}
kernel_key.str("");
kernel_key << kernel_func_names_[0] << build_options_[0] << time_stamp_;
kernel_ = context.cl_context()->GetKernel(kernel_key.str());
Expand Down Expand Up @@ -777,6 +799,9 @@ void ConvImageCompute::SetLocalWorkSize(size_t repeats /*=4*/) {
if (kernel_func_names_[0] == "conv2d_1x1_h1w7c1") {
w_blk_ = UP_DIV(default_w_blk_, 7);
}
if (kernel_func_names_[0] == "conv2d_1x1_h1w2c1") {
w_blk_ = UP_DIV(default_w_blk_, 2);
}
// CLRuntime::Global()->SetTunedLocalWorkSizeMap(tuned_map_key,local_work_size_);
} else {
auto& context = ctx_->As<OpenCLContext>();
Expand Down