Skip to content

Commit

Permalink
[OpenCL][kernel]refactoring_transpose_kernel test=develop (#8341)
Browse files Browse the repository at this point in the history
  • Loading branch information
zhenlin-work authored Feb 8, 2022
1 parent 5af0f82 commit 5cd9b89
Show file tree
Hide file tree
Showing 6 changed files with 1,898 additions and 343 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -14,26 +14,143 @@ limitations under the License. */

#include <cl_common.h>

__kernel void transpose_general_buffer(__global const CL_DTYPE* src,
__global CL_DTYPE* dst,
__global const int* out_idxs,
__private const int out_tensor_n,
__private const int out_tensor_c,
__private const int out_tensor_h,
__private const int out_tensor_w,
__private const int out_tensor_hw) {
int hidx = get_global_id(0); // [0, h) columns of dst
int widx = get_global_id(1); // [0, w) rows of dst
int chidx = get_global_id(2); // [0, ch) channels of dst

for (int i = 0; i < out_tensor_n; ++i) {
int glb_off = i * out_tensor_c * out_tensor_hw;
const int idx = mad((CL_DTYPE)chidx,
(CL_DTYPE)out_tensor_hw,
(CL_DTYPE)(mul24(hidx, out_tensor_w) + widx)) +
glb_off;
dst[out_idxs[idx]] = src[idx];
__kernel void transpose_4d_perm0132(__read_only image2d_t input_image,
__write_only image2d_t output_image,
__private const int out_C,
__private const int out_H,
__private const int out_W,
__private const int in_W,
__private const int in_H) {
const int out_c = get_global_id(0);
const int out_w = get_global_id(1);
const int out_nh = get_global_id(2);
const int out_n = out_nh / out_H;
const int out_h = out_nh % out_H;

const int in_n = out_n;
const int in_c = out_c;
const int in_h = out_w;
const int in_w = out_h;

int2 output_pos;
output_pos.x = out_c * out_W + out_w;
output_pos.y = out_nh;

int2 input_pos0;
input_pos0.x = in_W * in_c + in_w;
input_pos0.y = in_n * in_H + in_h;

CL_DTYPE4 input0;
CL_DTYPE4 output;
input0 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos0);

output = input0;
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output);
}

__kernel void transpose_4d_perm0213(__read_only image2d_t input_image,
__write_only image2d_t output_image,
__private const int out_C,
__private const int out_H,
__private const int out_W,
__private const int in_W,
__private const int in_H) {
const int out_c = get_global_id(0);
const int out_w = get_global_id(1);
const int out_nh = get_global_id(2);
const int out_n = out_nh / out_H;
const int out_h = out_nh % out_H;

const int in_n = out_n;
const int in_c = out_h / 4;
const int in_h0 = out_c * 4;
const int in_h1 = out_c * 4 + 1;
const int in_h2 = out_c * 4 + 2;
const int in_h3 = out_c * 4 + 3;
const int in_w = out_w;

int2 output_pos;
output_pos.x = out_c * out_W + out_w;
output_pos.y = out_nh;

int2 input_pos0;
int2 input_pos1;
int2 input_pos2;
int2 input_pos3;
input_pos0.x = in_W * in_c + in_w;
input_pos0.y = in_n * in_H + in_h0;

input_pos1.x = in_W * in_c + in_w;
input_pos1.y = in_n * in_H + in_h1;

input_pos2.x = in_W * in_c + in_w;
input_pos2.y = in_n * in_H + in_h2;

input_pos3.x = in_W * in_c + in_w;
input_pos3.y = in_n * in_H + in_h3;

CL_DTYPE4 input0;
CL_DTYPE4 input1;
CL_DTYPE4 input2;
CL_DTYPE4 input3;
CL_DTYPE4 output;
input0 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos0);
if (out_h % 4 == 0) {
output.x = input0.x;
} else if (out_h % 4 == 1) {
output.x = input0.y;
} else if (out_h % 4 == 2) {
output.x = input0.z;
} else {
output.x = input0.w;
}

if (out_C - out_c * 4 >= 2) {
input1 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos1);
if (out_h % 4 == 0) {
output.y = input1.x;
} else if (out_h % 4 == 1) {
output.y = input1.y;
} else if (out_h % 4 == 2) {
output.y = input1.z;
} else {
output.y = input1.w;
}
} else {
output.y = 0.0f;
}

if (out_C - out_c * 4 >= 3) {
input2 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos2);
if (out_h % 4 == 0) {
output.z = input2.x;
} else if (out_h % 4 == 1) {
output.z = input2.y;
} else if (out_h % 4 == 2) {
output.z = input2.z;
} else {
output.z = input2.w;
}
} else {
output.z = 0.0f;
}

if (out_C - out_c * 4 >= 4) {
input3 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos3);
if (out_h % 4 == 0) {
output.w = input3.x;
} else if (out_h % 4 == 1) {
output.w = input3.y;
} else if (out_h % 4 == 2) {
output.w = input3.z;
} else {
output.w = input3.w;
}
} else {
output.w = 0.0f;
}

WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output);
}

__kernel void transpose_4d_perm0231(__read_only image2d_t input_image,
Expand All @@ -48,17 +165,13 @@ __kernel void transpose_4d_perm0231(__read_only image2d_t input_image,
const int out_nh = get_global_id(2);
const int out_n = out_nh / out_H;
const int out_h = out_nh % out_H;
const int out_c0 = out_c * 4;
const int out_c1 = out_c * 4 + 1;
const int out_c2 = out_c * 4 + 2;
const int out_c3 = out_c * 4 + 3;

const int in_n = out_n;
const int in_c = out_w * 0.25;
const int in_h0 = out_c0;
const int in_h1 = out_c1;
const int in_h2 = out_c2;
const int in_h3 = out_c3;
const int in_c = out_w / 4;
const int in_h0 = out_c * 4;
const int in_h1 = out_c * 4 + 1;
const int in_h2 = out_c * 4 + 2;
const int in_h3 = out_c * 4 + 3;
const int in_w = out_h;

int2 output_pos;
Expand All @@ -69,7 +182,6 @@ __kernel void transpose_4d_perm0231(__read_only image2d_t input_image,
int2 input_pos1;
int2 input_pos2;
int2 input_pos3;

input_pos0.x = in_W * in_c + in_w;
input_pos0.y = in_n * in_H + in_h0;

Expand All @@ -88,7 +200,6 @@ __kernel void transpose_4d_perm0231(__read_only image2d_t input_image,
CL_DTYPE4 input3;
CL_DTYPE4 output;
input0 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos0);

if (out_w % 4 == 0) {
output.x = input0.x;
} else if (out_w % 4 == 1) {
Expand All @@ -98,6 +209,7 @@ __kernel void transpose_4d_perm0231(__read_only image2d_t input_image,
} else {
output.x = input0.w;
}

if (out_C - out_c * 4 >= 2) {
input1 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos1);
if (out_w % 4 == 0) {
Expand Down Expand Up @@ -146,7 +258,7 @@ __kernel void transpose_4d_perm0231(__read_only image2d_t input_image,
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output);
}

__kernel void transpose_4d_perm0213(__read_only image2d_t input_image,
__kernel void transpose_4d_perm0312(__read_only image2d_t input_image,
__write_only image2d_t output_image,
__private const int out_C,
__private const int out_H,
Expand All @@ -158,18 +270,14 @@ __kernel void transpose_4d_perm0213(__read_only image2d_t input_image,
const int out_nh = get_global_id(2);
const int out_n = out_nh / out_H;
const int out_h = out_nh % out_H;
const int out_c0 = out_c * 4;
const int out_c1 = out_c * 4 + 1;
const int out_c2 = out_c * 4 + 2;
const int out_c3 = out_c * 4 + 3;

const int in_n = out_n;
const int in_c = out_h / 4;
const int in_h0 = out_c0;
const int in_h1 = out_c1;
const int in_h2 = out_c2;
const int in_h3 = out_c3;
const int in_w = out_w;
const int in_h = out_w;
const int in_w0 = out_c * 4;
const int in_w1 = out_c * 4 + 1;
const int in_w2 = out_c * 4 + 2;
const int in_w3 = out_c * 4 + 3;

int2 output_pos;
output_pos.x = out_c * out_W + out_w;
Expand All @@ -179,26 +287,24 @@ __kernel void transpose_4d_perm0213(__read_only image2d_t input_image,
int2 input_pos1;
int2 input_pos2;
int2 input_pos3;
input_pos0.x = in_W * in_c + in_w0;
input_pos0.y = in_n * in_H + in_h;

input_pos0.x = in_W * in_c + in_w;
input_pos0.y = in_n * in_H + in_h0;

input_pos1.x = in_W * in_c + in_w;
input_pos1.y = in_n * in_H + in_h1;
input_pos1.x = in_W * in_c + in_w1;
input_pos1.y = in_n * in_H + in_h;

input_pos2.x = in_W * in_c + in_w;
input_pos2.y = in_n * in_H + in_h2;
input_pos2.x = in_W * in_c + in_w2;
input_pos2.y = in_n * in_H + in_h;

input_pos3.x = in_W * in_c + in_w;
input_pos3.y = in_n * in_H + in_h3;
input_pos3.x = in_W * in_c + in_w3;
input_pos3.y = in_n * in_H + in_h;

CL_DTYPE4 input0;
CL_DTYPE4 input1;
CL_DTYPE4 input2;
CL_DTYPE4 input3;
CL_DTYPE4 output;
input0 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos0);

if (out_h % 4 == 0) {
output.x = input0.x;
} else if (out_h % 4 == 1) {
Expand All @@ -208,6 +314,7 @@ __kernel void transpose_4d_perm0213(__read_only image2d_t input_image,
} else {
output.x = input0.w;
}

if (out_C - out_c * 4 >= 2) {
input1 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos1);
if (out_h % 4 == 0) {
Expand Down Expand Up @@ -268,17 +375,13 @@ __kernel void transpose_4d_perm0321(__read_only image2d_t input_image,
const int out_nh = get_global_id(2);
const int out_n = out_nh / out_H;
const int out_h = out_nh % out_H;
const int out_c0 = out_c * 4;
const int out_c1 = out_c * 4 + 1;
const int out_c2 = out_c * 4 + 2;
const int out_c3 = out_c * 4 + 3;

const int in_n = out_n;
const int in_c = out_w / 4;
const int in_w0 = out_c0;
const int in_w1 = out_c1;
const int in_w2 = out_c2;
const int in_w3 = out_c3;
const int in_w0 = out_c * 4;
const int in_w1 = out_c * 4 + 1;
const int in_w2 = out_c * 4 + 2;
const int in_w3 = out_c * 4 + 3;
const int in_h = out_h;
const int in_w = out_w;

Expand All @@ -290,7 +393,6 @@ __kernel void transpose_4d_perm0321(__read_only image2d_t input_image,
int2 input_pos1;
int2 input_pos2;
int2 input_pos3;

input_pos0.x = in_W * in_c + in_w0;
input_pos0.y = out_nh;

Expand All @@ -309,7 +411,6 @@ __kernel void transpose_4d_perm0321(__read_only image2d_t input_image,
CL_DTYPE4 input3;
CL_DTYPE4 output;
input0 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos0);

if (out_w % 4 == 0) {
output.x = input0.x;
} else if (out_w % 4 == 1) {
Expand All @@ -319,6 +420,7 @@ __kernel void transpose_4d_perm0321(__read_only image2d_t input_image,
} else {
output.x = input0.w;
}

if (out_C - out_c * 4 >= 2) {
input1 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos1);
if (out_w % 4 == 0) {
Expand Down Expand Up @@ -367,40 +469,6 @@ __kernel void transpose_4d_perm0321(__read_only image2d_t input_image,
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output);
}

__kernel void transpose_4d_perm0132(__read_only image2d_t input_image,
__write_only image2d_t output_image,
__private const int out_C,
__private const int out_H,
__private const int out_W,
__private const int in_W,
__private const int in_H) {
const int out_c = get_global_id(0);
const int out_w = get_global_id(1);
const int out_nh = get_global_id(2);
const int out_n = out_nh / out_H;
const int out_h = out_nh % out_H;

const int in_n = out_n;
const int in_c = out_c;
const int in_h = out_w;
const int in_w = out_h;

int2 output_pos;
output_pos.x = out_c * out_W + out_w;
output_pos.y = out_nh;

int2 input_pos0;
input_pos0.x = in_W * in_c + in_w;
input_pos0.y = in_n * in_H + in_h;

CL_DTYPE4 input0;
CL_DTYPE4 output;
input0 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos0);

output = input0;
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output);
}

__kernel void transpose_2d(__read_only image2d_t input_image,
__write_only image2d_t output_image,
__private const int out_C,
Expand Down
Loading

0 comments on commit 5cd9b89

Please sign in to comment.