Skip to content

Commit

Permalink
[OpenCL] Add adreno conv1x1 h1w2c1 and h2w2c1 (#6112) (#6256)
Browse files Browse the repository at this point in the history
  • Loading branch information
daming5432 authored Jun 21, 2021
1 parent 11f92f6 commit 7ddb6e9
Show file tree
Hide file tree
Showing 2 changed files with 250 additions and 1 deletion.
224 changes: 224 additions & 0 deletions lite/backends/opencl/cl_kernel/image/conv2d_1x1_opt_kernel.cl
Original file line number Diff line number Diff line change
Expand Up @@ -637,6 +637,106 @@ CL_DTYPE4 alpha0,alpha1,alpha2,alpha3;
}
}

__kernel void conv2d_1x1_h1w2c1(
__private const int global_size_dim0,
__private const int global_size_dim1,
__private const int global_size_dim2,
__read_only image2d_t input_image,
__read_only image2d_t filter,
__read_only image2d_t bias,
__write_only image2d_t output_image,
__private const int stride,
__private const int offset,
__private const int input_c,
__private const int input_c_origin,
__private const int dilation,
__private const int input_width, /* of one block */
__private const int input_height, /* of one block */
__private const int output_width,
__private const int output_height,
__private const int old_w,
__read_only image2d_t prelu_alpha) {
const int out_c = get_global_id(0);
const int out_w = get_global_id(1);
const int out_nh = get_global_id(2);

int out_w0 = out_w;
int out_w1 = out_w + global_size_dim1;

int outpos_main = mul24(out_c, old_w);
int2 output_pos0 = (int2)(outpos_main + out_w0, out_nh);
int2 output_pos1 = (int2)(outpos_main + out_w1, out_nh);

int in_pos_x0 = out_w0 * stride + offset;
int in_pos_x1 = out_w1 * stride + offset;
int in_pos_y = out_nh * stride + offset;

#ifdef BIASE_CH
CL_DTYPE4 output0 = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, SAMPLER, (int2)(out_c, 0));
CL_DTYPE4 output1 = output0;
#else
CL_DTYPE4 output0 = 0.0f;
CL_DTYPE4 output1 = 0.0f;
#endif

for (int i = 0; i < input_c; ++i) {
CL_DTYPE4 weight0 = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, SAMPLER, (int2)(out_c, (i << 2)));
CL_DTYPE4 weight1 = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, SAMPLER, (int2)(out_c, (i << 2) + 1));
CL_DTYPE4 weight2 = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, SAMPLER, (int2)(out_c, (i << 2) + 2));
CL_DTYPE4 weight3 = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, SAMPLER, (int2)(out_c, (i << 2) + 3));

CL_DTYPE4 input0 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, (int2)(i * input_width + in_pos_x0, in_pos_y));
CL_DTYPE4 input1 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, (int2)(i * input_width + in_pos_x1, in_pos_y));

output0 = mad(input0.x, weight0, output0);
output0 = mad(input0.y, weight1, output0);
output0 = mad(input0.z, weight2, output0);
output0 = mad(input0.w, weight3, output0);

output1 = mad(input1.x, weight0, output1);
output1 = mad(input1.y, weight1, output1);
output1 = mad(input1.z, weight2, output1);
output1 = mad(input1.w, weight3, output1);
}

CL_DTYPE4 alpha0,alpha1;
#ifdef PRELU_CH //{
alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(out_c, 0));
alpha1 = alpha0;
//}
#elif defined(PRELU_ELE) //{
if (out_w0 < old_w) {
alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, output_pos0);
}
if (out_w1 < old_w) {
alpha1 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, output_pos1);
}
//}
#elif defined(PRELU_ALL) //{
alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(0, 0));
alpha0.y = alpha0.x;
alpha0.z = alpha0.x;
alpha0.w = alpha0.x;
alpha1 = alpha0;
//}
#endif
output0 = activation_type4(output0, alpha0);
output1 = activation_type4(output1, alpha1);

#ifdef SCALE_ACTIVATION
output0 = fuse_scale(output0, 1.f, 0.f, 0.f);
output1 = fuse_scale(output1, 1.f, 0.f, 0.f);
#endif

if (out_w0 < old_w) {
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos0, output0);
}

if (out_w1 < old_w) {
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos1, output1);
}
}

__kernel void conv2d_1x1_h1w5c1(
__private const int global_size_dim0,
__private const int global_size_dim1,
Expand Down Expand Up @@ -1013,6 +1113,130 @@ CL_DTYPE4 alpha0,alpha1,alpha2,alpha3, alpha4, alpha5, alpha6;
}
}

__kernel void conv2d_1x1_h2w2c1(
__private const int global_size_dim0,
__private const int global_size_dim1,
__private const int global_size_dim2,
__read_only image2d_t input_image,
__read_only image2d_t filter,
__read_only image2d_t bias,
__write_only image2d_t output_image,
__private const int stride,
__private const int offset,
__private const int input_c,
__private const int input_c_origin,
__private const int dilation,
__private const int input_width, /* of one block */
__private const int input_height, /* of one block */
__private const int output_width,
__private const int output_height,
__private const int old_w,
__read_only image2d_t prelu_alpha) {
const int out_c = get_global_id(0);
const int out_w = get_global_id(1) * 2;
const int out_nh = get_global_id(2) * 2;

int in_pos_w0 = out_w * stride + offset;
int in_pos_w1 = (out_w + 1) * stride + offset;
int in_pos_h0 = out_nh * stride + offset;
int in_pos_h1 = (out_nh + 1) * stride + offset;

#ifdef BIASE_CH
CL_DTYPE4 out_w0_h0_c0 = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, SAMPLER, (int2)(out_c, 0));
CL_DTYPE4 out_w1_h0_c0 = out_w0_h0_c0;
CL_DTYPE4 out_w0_h1_c0 = out_w0_h0_c0;
CL_DTYPE4 out_w1_h1_c0 = out_w0_h0_c0;
#else
CL_DTYPE4 out_w0_h0_c0 = 0.0f;
CL_DTYPE4 out_w1_h0_c0 = 0.0f;
CL_DTYPE4 out_w0_h1_c0 = 0.0f;
CL_DTYPE4 out_w1_h1_c0 = 0.0f;
#endif
if (out_w >= output_width || out_nh >= output_height || out_c >= global_size_dim0) {
return;
}
for (int i = 0; i < input_c; ++i) {
CL_DTYPE4 f0 = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, SAMPLER, (int2)(out_c, (i << 2)));
CL_DTYPE4 f1 = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, SAMPLER, (int2)(out_c, (i << 2) + 1));
CL_DTYPE4 f2 = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, SAMPLER, (int2)(out_c, (i << 2) + 2));
CL_DTYPE4 f3 = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, SAMPLER, (int2)(out_c, (i << 2) + 3));

CL_DTYPE4 src_w0_h0 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, (int2)(i * input_width + in_pos_w0, in_pos_h0));
CL_DTYPE4 src_w1_h0 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, (int2)(i * input_width + in_pos_w1, in_pos_h0));
CL_DTYPE4 src_w0_h1 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, (int2)(i * input_width + in_pos_w0, in_pos_h1));
CL_DTYPE4 src_w1_h1 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, (int2)(i * input_width + in_pos_w1, in_pos_h1));

out_w0_h0_c0 = mad(f0, src_w0_h0.x, out_w0_h0_c0);
out_w1_h0_c0 = mad(f0, src_w1_h0.x, out_w1_h0_c0);
out_w0_h1_c0 = mad(f0, src_w0_h1.x, out_w0_h1_c0);
out_w1_h1_c0 = mad(f0, src_w1_h1.x, out_w1_h1_c0);
out_w0_h0_c0 = mad(f1, src_w0_h0.y, out_w0_h0_c0);
out_w1_h0_c0 = mad(f1, src_w1_h0.y, out_w1_h0_c0);
out_w0_h1_c0 = mad(f1, src_w0_h1.y, out_w0_h1_c0);
out_w1_h1_c0 = mad(f1, src_w1_h1.y, out_w1_h1_c0);
out_w0_h0_c0 = mad(f2, src_w0_h0.z, out_w0_h0_c0);
out_w1_h0_c0 = mad(f2, src_w1_h0.z, out_w1_h0_c0);
out_w0_h1_c0 = mad(f2, src_w0_h1.z, out_w0_h1_c0);
out_w1_h1_c0 = mad(f2, src_w1_h1.z, out_w1_h1_c0);
out_w0_h0_c0 = mad(f3, src_w0_h0.w, out_w0_h0_c0);
out_w1_h0_c0 = mad(f3, src_w1_h0.w, out_w1_h0_c0);
out_w0_h1_c0 = mad(f3, src_w0_h1.w, out_w0_h1_c0);
out_w1_h1_c0 = mad(f3, src_w1_h1.w, out_w1_h1_c0);
}

CL_DTYPE4 alpha0, alpha1, alpha2, alpha3, alpha4, alpha5, alpha6, alpha7;
#ifdef PRELU_CH //{
alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(out_c, 0));
alpha1 = alpha0;
alpha2 = alpha0;
alpha3 = alpha0;
//}
#elif defined(PRELU_ELE) //{
alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(out_c * old_w + out_w, out_nh));
if (out_w + 1 < output_width) {
alpha1 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(out_c * old_w + out_w + 1, out_nh));
}
if (out_nh + 1 < output_height) {
alpha2 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(out_c * old_w + out_w , out_nh + 1));
}
if (out_w + 1 < output_width && out_nh + 1 < output_height) {
alpha3 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(out_c * old_w + out_w + 1, out_nh + 1));
}
//}
#elif defined(PRELU_ALL) //{
alpha0 = READ_IMG_TYPE(CL_DTYPE_CHAR, prelu_alpha, SAMPLER, (int2)(0, 0));
alpha0.y = alpha0.x;
alpha0.z = alpha0.x;
alpha0.w = alpha0.x;
alpha1 = alpha0;
alpha2 = alpha0;
alpha3 = alpha0;
//}
#endif
out_w0_h0_c0 = activation_type4(out_w0_h0_c0, alpha0);
out_w1_h0_c0 = activation_type4(out_w1_h0_c0, alpha1);
out_w0_h1_c0 = activation_type4(out_w0_h1_c0, alpha2);
out_w1_h1_c0 = activation_type4(out_w1_h1_c0, alpha3);

#ifdef SCALE_ACTIVATION
out_w0_h0_c0 = fuse_scale(out_w0_h0_c0, 1.f, 0.f, 0.f);
out_w1_h0_c0 = fuse_scale(out_w1_h0_c0, 1.f, 0.f, 0.f);
out_w0_h1_c0 = fuse_scale(out_w0_h1_c0, 1.f, 0.f, 0.f);
out_w1_h1_c0 = fuse_scale(out_w1_h1_c0, 1.f, 0.f, 0.f);
#endif

WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, (int2)(out_c * old_w + out_w, out_nh), out_w0_h0_c0);
if (out_w + 1 < output_width) {
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, (int2)(out_c * old_w + out_w + 1, out_nh), out_w1_h0_c0);
}
if (out_nh + 1 < output_height) {
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, (int2)(out_c * old_w + out_w, out_nh + 1), out_w0_h1_c0);
}
if (out_w + 1 < output_width && out_nh + 1 < output_height) {
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, (int2)(out_c * old_w + out_w + 1, out_nh + 1), out_w1_h1_c0);
}
}

__kernel void conv2d_1x1_h2w2c2(
__private const int global_size_dim0,
__private const int global_size_dim1,
Expand Down
27 changes: 26 additions & 1 deletion lite/kernels/opencl/conv_image_compute.cc
Original file line number Diff line number Diff line change
Expand Up @@ -721,7 +721,7 @@ void ConvImageCompute::SetLocalWorkSize(size_t repeats /*=4*/) {
double final_lws_time = DBL_MAX;
auto& context = ctx_->As<OpenCLContext>();
std::stringstream kernel_key;
for (size_t i = 0; i < 4; i++) {
for (size_t i = 0; i < 6; i++) {
if (i == 1) {
kernel_func_names_[0] = "conv2d_1x1_h1w5c1";
global_work_size_ =
Expand Down Expand Up @@ -755,6 +755,28 @@ void ConvImageCompute::SetLocalWorkSize(size_t repeats /*=4*/) {
build_options_[0],
time_stamp_);
}
if (i == 4) {
kernel_func_names_[0] = "conv2d_1x1_h1w2c1";
global_work_size_ =
cl::NDRange{static_cast<size_t>(default_c_blk_),
static_cast<size_t>(UP_DIV(default_w_blk_, 2)),
static_cast<size_t>(default_nh_blk_)};
context.cl_context()->AddKernel(kernel_func_names_[0],
kernel_func_paths_[0],
build_options_[0],
time_stamp_);
}
if (i == 5) {
kernel_func_names_[0] = "conv2d_1x1_h2w2c1";
global_work_size_ =
cl::NDRange{static_cast<size_t>(default_c_blk_),
static_cast<size_t>(UP_DIV(default_w_blk_, 2)),
static_cast<size_t>(UP_DIV(default_nh_blk_, 2))};
context.cl_context()->AddKernel(kernel_func_names_[0],
kernel_func_paths_[0],
build_options_[0],
time_stamp_);
}
kernel_key.str("");
kernel_key << kernel_func_names_[0] << build_options_[0] << time_stamp_;
kernel_ = context.cl_context()->GetKernel(kernel_key.str());
Expand Down Expand Up @@ -810,6 +832,9 @@ void ConvImageCompute::SetLocalWorkSize(size_t repeats /*=4*/) {
if (kernel_func_names_[0] == "conv2d_1x1_h1w7c1") {
w_blk_ = UP_DIV(default_w_blk_, 7);
}
if (kernel_func_names_[0] == "conv2d_1x1_h1w2c1") {
w_blk_ = UP_DIV(default_w_blk_, 2);
}
// CLRuntime::Global()->SetTunedLocalWorkSizeMap(tuned_map_key,local_work_size_);
} else if (is_wino_) {
auto& context = ctx_->As<OpenCLContext>();
Expand Down

0 comments on commit 7ddb6e9

Please sign in to comment.