From 60f94549467d747bcefe540be3aa95ee6f9c3c4c Mon Sep 17 00:00:00 2001 From: zhenlin-work Date: Mon, 24 Jan 2022 11:33:03 +0000 Subject: [PATCH] refactoring_transpose_kernel test=develop --- .../cl_kernel/image/transpose_fixb1_kernel.cl | 504 ++++ .../cl_kernel/image/transpose_fixb2_kernel.cl | 645 +++++ .../cl_kernel/image/transpose_fixb3_kernel.cl | 503 ++++ .../cl_kernel/image/transpose_fixb4_kernel.cl | 503 ++++ .../cl_kernel/image/transpose_kernel.cl | 2110 ----------------- .../kernels/opencl/transpose_image_compute.cc | 197 +- 6 files changed, 2212 insertions(+), 2250 deletions(-) create mode 100644 lite/backends/opencl/cl_kernel/image/transpose_fixb1_kernel.cl create mode 100644 lite/backends/opencl/cl_kernel/image/transpose_fixb2_kernel.cl create mode 100644 lite/backends/opencl/cl_kernel/image/transpose_fixb3_kernel.cl create mode 100644 lite/backends/opencl/cl_kernel/image/transpose_fixb4_kernel.cl delete mode 100644 lite/backends/opencl/cl_kernel/image/transpose_kernel.cl diff --git a/lite/backends/opencl/cl_kernel/image/transpose_fixb1_kernel.cl b/lite/backends/opencl/cl_kernel/image/transpose_fixb1_kernel.cl new file mode 100644 index 00000000000..6e993b68330 --- /dev/null +++ b/lite/backends/opencl/cl_kernel/image/transpose_fixb1_kernel.cl @@ -0,0 +1,504 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include + +__kernel void transpose_4d_perm0132(__read_only image2d_t input_image, + __write_only image2d_t output_image, + __private const int out_C, + __private const int out_H, + __private const int out_W, + __private const int in_W, + __private const int in_H) { + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); + const int out_nh = get_global_id(2); + const int out_n = out_nh / out_H; + const int out_h = out_nh % out_H; + + const int in_n = out_n; + const int in_c = out_c; + const int in_h = out_w; + const int in_w = out_h; + + int2 output_pos; + output_pos.x = out_c * out_W + out_w; + output_pos.y = out_nh; + + int2 input_pos0; + input_pos0.x = in_W * in_c + in_w; + input_pos0.y = in_n * in_H + in_h; + + CL_DTYPE4 input0; + CL_DTYPE4 output; + input0 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos0); + + output = input0; + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output); +} + +__kernel void transpose_4d_perm0213(__read_only image2d_t input_image, + __write_only image2d_t output_image, + __private const int out_C, + __private const int out_H, + __private const int out_W, + __private const int in_W, + __private const int in_H) { + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); + const int out_nh = get_global_id(2); + const int out_n = out_nh / out_H; + const int out_h = out_nh % out_H; + + const int in_n = out_n; + const int in_c = out_h / 4; + const int in_h0 = out_c * 4; + const int in_h1 = out_c * 4 + 1; + const int in_h2 = out_c * 4 + 2; + const int in_h3 = out_c * 4 + 3; + const int in_w = out_w; + + int2 output_pos; + output_pos.x = out_c * out_W + out_w; + output_pos.y = out_nh; + + int2 input_pos0; + int2 input_pos1; + int2 input_pos2; + int2 input_pos3; + input_pos0.x = in_W * in_c + in_w; + input_pos0.y = in_n * in_H + in_h0; + + input_pos1.x = in_W * in_c + in_w; + input_pos1.y = in_n * in_H + in_h1; + + input_pos2.x = in_W * in_c + in_w; + input_pos2.y = in_n * in_H + in_h2; + + input_pos3.x = in_W * in_c + in_w; + input_pos3.y = in_n * in_H + in_h3; + + CL_DTYPE4 input0; + CL_DTYPE4 input1; + CL_DTYPE4 input2; + CL_DTYPE4 input3; + CL_DTYPE4 output; + input0 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos0); + if (out_h % 4 == 0) { + output.x = input0.x; + } else if (out_h % 4 == 1) { + output.x = input0.y; + } else if (out_h % 4 == 2) { + output.x = input0.z; + } else { + output.x = input0.w; + } + + if (out_C - out_c * 4 >= 2) { + input1 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos1); + if (out_h % 4 == 0) { + output.y = input1.x; + } else if (out_h % 4 == 1) { + output.y = input1.y; + } else if (out_h % 4 == 2) { + output.y = input1.z; + } else { + output.y = input1.w; + } + } else { + output.y = 0.0f; + } + + if (out_C - out_c * 4 >= 3) { + input2 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos2); + if (out_h % 4 == 0) { + output.z = input2.x; + } else if (out_h % 4 == 1) { + output.z = input2.y; + } else if (out_h % 4 == 2) { + output.z = input2.z; + } else { + output.z = input2.w; + } + } else { + output.z = 0.0f; + } + + if (out_C - out_c * 4 >= 4) { + input3 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos3); + if (out_h % 4 == 0) { + output.w = input3.x; + } else if (out_h % 4 == 1) { + output.w = input3.y; + } else if (out_h % 4 == 2) { + output.w = input3.z; + } else { + output.w = input3.w; + } + } else { + output.w = 0.0f; + } + + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output); +} + +__kernel void transpose_4d_perm0231(__read_only image2d_t input_image, + __write_only image2d_t output_image, + __private const int out_C, + __private const int out_H, + __private const int out_W, + __private const int in_W, + __private const int in_H) { + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); + const int out_nh = get_global_id(2); + const int out_n = out_nh / out_H; + const int out_h = out_nh % out_H; + + const int in_n = out_n; + const int in_c = out_w / 4; + const int in_h0 = out_c * 4; + const int in_h1 = out_c * 4 + 1; + const int in_h2 = out_c * 4 + 2; + const int in_h3 = out_c * 4 + 3; + const int in_w = out_h; + + int2 output_pos; + output_pos.x = out_c * out_W + out_w; + output_pos.y = out_nh; + + int2 input_pos0; + int2 input_pos1; + int2 input_pos2; + int2 input_pos3; + input_pos0.x = in_W * in_c + in_w; + input_pos0.y = in_n * in_H + in_h0; + + input_pos1.x = in_W * in_c + in_w; + input_pos1.y = in_n * in_H + in_h1; + + input_pos2.x = in_W * in_c + in_w; + input_pos2.y = in_n * in_H + in_h2; + + input_pos3.x = in_W * in_c + in_w; + input_pos3.y = in_n * in_H + in_h3; + + CL_DTYPE4 input0; + CL_DTYPE4 input1; + CL_DTYPE4 input2; + CL_DTYPE4 input3; + CL_DTYPE4 output; + input0 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos0); + if (out_w % 4 == 0) { + output.x = input0.x; + } else if (out_w % 4 == 1) { + output.x = input0.y; + } else if (out_w % 4 == 2) { + output.x = input0.z; + } else { + output.x = input0.w; + } + + if (out_C - out_c * 4 >= 2) { + input1 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos1); + if (out_w % 4 == 0) { + output.y = input1.x; + } else if (out_w % 4 == 1) { + output.y = input1.y; + } else if (out_w % 4 == 2) { + output.y = input1.z; + } else { + output.y = input1.w; + } + } else { + output.y = 0.0f; + } + + if (out_C - out_c * 4 >= 3) { + input2 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos2); + if (out_w % 4 == 0) { + output.z = input2.x; + } else if (out_w % 4 == 1) { + output.z = input2.y; + } else if (out_w % 4 == 2) { + output.z = input2.z; + } else { + output.z = input2.w; + } + } else { + output.z = 0.0f; + } + + if (out_C - out_c * 4 >= 4) { + input3 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos3); + if (out_w % 4 == 0) { + output.w = input3.x; + } else if (out_w % 4 == 1) { + output.w = input3.y; + } else if (out_w % 4 == 2) { + output.w = input3.z; + } else { + output.w = input3.w; + } + } else { + output.w = 0.0f; + } + + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output); +} + +__kernel void transpose_4d_perm0312(__read_only image2d_t input_image, + __write_only image2d_t output_image, + __private const int out_C, + __private const int out_H, + __private const int out_W, + __private const int in_W, + __private const int in_H) { + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); + const int out_nh = get_global_id(2); + const int out_n = out_nh / out_H; + const int out_h = out_nh % out_H; + + const int in_n = out_n; + const int in_c = out_h / 4; + const int in_h = out_w; + const int in_w0 = out_c * 4; + const int in_w1 = out_c * 4 + 1; + const int in_w2 = out_c * 4 + 2; + const int in_w3 = out_c * 4 + 3; + + int2 output_pos; + output_pos.x = out_c * out_W + out_w; + output_pos.y = out_nh; + + int2 input_pos0; + int2 input_pos1; + int2 input_pos2; + int2 input_pos3; + input_pos0.x = in_W * in_c + in_w0; + input_pos0.y = in_n * in_H + in_h; + + input_pos1.x = in_W * in_c + in_w1; + input_pos1.y = in_n * in_H + in_h; + + input_pos2.x = in_W * in_c + in_w2; + input_pos2.y = in_n * in_H + in_h; + + input_pos3.x = in_W * in_c + in_w3; + input_pos3.y = in_n * in_H + in_h; + + CL_DTYPE4 input0; + CL_DTYPE4 input1; + CL_DTYPE4 input2; + CL_DTYPE4 input3; + CL_DTYPE4 output; + input0 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos0); + if (out_h % 4 == 0) { + output.x = input0.x; + } else if (out_h % 4 == 1) { + output.x = input0.y; + } else if (out_h % 4 == 2) { + output.x = input0.z; + } else { + output.x = input0.w; + } + + if (out_C - out_c * 4 >= 2) { + input1 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos1); + if (out_h % 4 == 0) { + output.y = input1.x; + } else if (out_h % 4 == 1) { + output.y = input1.y; + } else if (out_h % 4 == 2) { + output.y = input1.z; + } else { + output.y = input1.w; + } + } else { + output.y = 0.0f; + } + + if (out_C - out_c * 4 >= 3) { + input2 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos2); + if (out_h % 4 == 0) { + output.z = input2.x; + } else if (out_h % 4 == 1) { + output.z = input2.y; + } else if (out_h % 4 == 2) { + output.z = input2.z; + } else { + output.z = input2.w; + } + } else { + output.z = 0.0f; + } + + if (out_C - out_c * 4 >= 4) { + input3 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos3); + if (out_h % 4 == 0) { + output.w = input3.x; + } else if (out_h % 4 == 1) { + output.w = input3.y; + } else if (out_h % 4 == 2) { + output.w = input3.z; + } else { + output.w = input3.w; + } + } else { + output.w = 0.0f; + } + + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output); +} + +__kernel void transpose_4d_perm0321(__read_only image2d_t input_image, + __write_only image2d_t output_image, + __private const int out_C, + __private const int out_H, + __private const int out_W, + __private const int in_W, + __private const int in_H) { + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); + const int out_nh = get_global_id(2); + const int out_n = out_nh / out_H; + const int out_h = out_nh % out_H; + + const int in_n = out_n; + const int in_c = out_w / 4; + const int in_w0 = out_c * 4; + const int in_w1 = out_c * 4 + 1; + const int in_w2 = out_c * 4 + 2; + const int in_w3 = out_c * 4 + 3; + const int in_h = out_h; + const int in_w = out_w; + + int2 output_pos; + output_pos.x = out_c * out_W + out_w; + output_pos.y = out_nh; + + int2 input_pos0; + int2 input_pos1; + int2 input_pos2; + int2 input_pos3; + input_pos0.x = in_W * in_c + in_w0; + input_pos0.y = out_nh; + + input_pos1.x = in_W * in_c + in_w1; + input_pos1.y = out_nh; + + input_pos2.x = in_W * in_c + in_w2; + input_pos2.y = out_nh; + + input_pos3.x = in_W * in_c + in_w3; + input_pos3.y = out_nh; + + CL_DTYPE4 input0; + CL_DTYPE4 input1; + CL_DTYPE4 input2; + CL_DTYPE4 input3; + CL_DTYPE4 output; + input0 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos0); + if (out_w % 4 == 0) { + output.x = input0.x; + } else if (out_w % 4 == 1) { + output.x = input0.y; + } else if (out_w % 4 == 2) { + output.x = input0.z; + } else { + output.x = input0.w; + } + + if (out_C - out_c * 4 >= 2) { + input1 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos1); + if (out_w % 4 == 0) { + output.y = input1.x; + } else if (out_w % 4 == 1) { + output.y = input1.y; + } else if (out_w % 4 == 2) { + output.y = input1.z; + } else { + output.y = input1.w; + } + } else { + output.y = 0.0f; + } + + if (out_C - out_c * 4 >= 3) { + input2 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos2); + if (out_w % 4 == 0) { + output.z = input2.x; + } else if (out_w % 4 == 1) { + output.z = input2.y; + } else if (out_w % 4 == 2) { + output.z = input2.z; + } else { + output.z = input2.w; + } + } else { + output.z = 0.0f; + } + + if (out_C - out_c * 4 >= 4) { + input3 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos3); + if (out_w % 4 == 0) { + output.w = input3.x; + } else if (out_w % 4 == 1) { + output.w = input3.y; + } else if (out_w % 4 == 2) { + output.w = input3.z; + } else { + output.w = input3.w; + } + } else { + output.w = 0.0f; + } + + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output); +} + +__kernel void transpose_2d(__read_only image2d_t input_image, + __write_only image2d_t output_image, + __private const int out_C, + __private const int out_H, + __private const int out_W, + __private const int in_W, + __private const int in_H) { + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); + const int out_nh = get_global_id(2); + const int out_n = 1; + const int out_h = out_nh % out_H; + + const int in_n = 1; + const int in_c = out_c; + const int in_w = out_h; + const int in_h = out_w; + + int2 input_pos; + int2 output_pos; + input_pos.x = in_c * in_W + in_w; + input_pos.y = in_n * in_h; + + output_pos.x = out_c * out_W + out_w; + output_pos.y = out_n * out_h; + + CL_DTYPE4 input; + CL_DTYPE4 output; + input = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos); + + output = input; + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, input); +} diff --git a/lite/backends/opencl/cl_kernel/image/transpose_fixb2_kernel.cl b/lite/backends/opencl/cl_kernel/image/transpose_fixb2_kernel.cl new file mode 100644 index 00000000000..dbdd4ca7dd4 --- /dev/null +++ b/lite/backends/opencl/cl_kernel/image/transpose_fixb2_kernel.cl @@ -0,0 +1,645 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include + +__kernel void transpose_4d_perm1023(__read_only image2d_t input_image, + __write_only image2d_t output_image, + __private const int out_C, + __private const int out_H, + __private const int out_W, + __private const int in_W, + __private const int in_H) { + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); + const int out_nh = get_global_id(2); + const int out_n = out_nh / out_H; + const int out_h = out_nh % out_H; + + const int in_n0 = out_c * 4; + const int in_n1 = out_c * 4 + 1; + const int in_n2 = out_c * 4 + 2; + const int in_n3 = out_c * 4 + 3; + const int in_c = out_n / 4; + const int in_h = out_h; + const int in_w = out_w; + + int2 output_pos; + output_pos.x = out_c * out_W + out_w; + output_pos.y = out_nh; + + int2 input_pos0; + int2 input_pos1; + int2 input_pos2; + int2 input_pos3; + input_pos0.x = in_W * in_c + in_w; + input_pos0.y = in_n0 * in_H + in_h; + + input_pos1.x = in_W * in_c + in_w; + input_pos1.y = in_n1 * in_H + in_h; + + input_pos2.x = in_W * in_c + in_w; + input_pos2.y = in_n2 * in_H + in_h; + + input_pos3.x = in_W * in_c + in_w; + input_pos3.y = in_n3 * in_H + in_h; + + CL_DTYPE4 input0; + CL_DTYPE4 input1; + CL_DTYPE4 input2; + CL_DTYPE4 input3; + CL_DTYPE4 output; + input0 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos0); + if (out_n % 4 == 0) { + output.x = input0.x; + } else if (out_n % 4 == 1) { + output.x = input0.y; + } else if (out_n % 4 == 2) { + output.x = input0.z; + } else { + output.x = input0.w; + } + + if (out_C - out_c * 4 >= 2) { + input1 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos1); + if (out_n % 4 == 0) { + output.y = input1.x; + } else if (out_n % 4 == 1) { + output.y = input1.y; + } else if (out_n % 4 == 2) { + output.y = input1.z; + } else { + output.y = input1.w; + } + } else { + output.y = 0.0f; + } + + if (out_C - out_c * 4 >= 3) { + input2 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos2); + if (out_n % 4 == 0) { + output.z = input2.x; + } else if (out_n % 4 == 1) { + output.z = input2.y; + } else if (out_n % 4 == 2) { + output.z = input2.z; + } else { + output.z = input2.w; + } + } else { + output.z = 0.0f; + } + + if (out_C - out_c * 4 >= 4) { + input3 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos3); + if (out_n % 4 == 0) { + output.w = input3.x; + } else if (out_n % 4 == 1) { + output.w = input3.y; + } else if (out_n % 4 == 2) { + output.w = input3.z; + } else { + output.w = input3.w; + } + } else { + output.w = 0.0f; + } + + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output); +} + +__kernel void transpose_4d_perm1032(__read_only image2d_t input_image, + __write_only image2d_t output_image, + __private const int out_C, + __private const int out_H, + __private const int out_W, + __private const int in_W, + __private const int in_H) { + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); + const int out_nh = get_global_id(2); + const int out_n = out_nh / out_H; + const int out_h = out_nh % out_H; + + const int in_n0 = out_c * 4; + const int in_n1 = out_c * 4 + 1; + const int in_n2 = out_c * 4 + 2; + const int in_n3 = out_c * 4 + 3; + const int in_c = out_n / 4; + const int in_h = out_w; + const int in_w = out_h; + + int2 output_pos; + output_pos.x = out_c * out_W + out_w; + output_pos.y = out_nh; + + int2 input_pos0; + int2 input_pos1; + int2 input_pos2; + int2 input_pos3; + input_pos0.x = in_W * in_c + in_w; + input_pos0.y = in_n0 * in_H + in_h; + + input_pos1.x = in_W * in_c + in_w; + input_pos1.y = in_n1 * in_H + in_h; + + input_pos2.x = in_W * in_c + in_w; + input_pos2.y = in_n2 * in_H + in_h; + + input_pos3.x = in_W * in_c + in_w; + input_pos3.y = in_n3 * in_H + in_h; + + CL_DTYPE4 input0; + CL_DTYPE4 input1; + CL_DTYPE4 input2; + CL_DTYPE4 input3; + CL_DTYPE4 output; + input0 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos0); + if (out_n % 4 == 0) { + output.x = input0.x; + } else if (out_n % 4 == 1) { + output.x = input0.y; + } else if (out_n % 4 == 2) { + output.x = input0.z; + } else { + output.x = input0.w; + } + + if (out_C - out_c * 4 >= 2) { + input1 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos1); + if (out_n % 4 == 0) { + output.y = input1.x; + } else if (out_n % 4 == 1) { + output.y = input1.y; + } else if (out_n % 4 == 2) { + output.y = input1.z; + } else { + output.y = input1.w; + } + } else { + output.y = 0.0f; + } + + if (out_C - out_c * 4 >= 3) { + input2 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos2); + if (out_n % 4 == 0) { + output.z = input2.x; + } else if (out_n % 4 == 1) { + output.z = input2.y; + } else if (out_n % 4 == 2) { + output.z = input2.z; + } else { + output.z = input2.w; + } + } else { + output.z = 0.0f; + } + + if (out_C - out_c * 4 >= 4) { + input3 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos3); + if (out_n % 4 == 0) { + output.w = input3.x; + } else if (out_n % 4 == 1) { + output.w = input3.y; + } else if (out_n % 4 == 2) { + output.w = input3.z; + } else { + output.w = input3.w; + } + } else { + output.w = 0.0f; + } + + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output); +} + +__kernel void transpose_4d_perm1203(__read_only image2d_t input_image, + __write_only image2d_t output_image, + __private const int out_C, + __private const int out_H, + __private const int out_W, + __private const int in_W, + __private const int in_H) { + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); + const int out_nh = get_global_id(2); + const int out_n = out_nh / out_H; + const int out_h = out_nh % out_H; + + const int in_n = out_h; + const int in_c = out_n / 4; + const int in_h0 = out_c * 4; + const int in_h1 = out_c * 4 + 1; + const int in_h2 = out_c * 4 + 2; + const int in_h3 = out_c * 4 + 3; + const int in_w = out_w; + + int2 output_pos; + output_pos.x = out_c * out_W + out_w; + output_pos.y = out_nh; + + int2 input_pos0; + int2 input_pos1; + int2 input_pos2; + int2 input_pos3; + input_pos0.x = in_W * in_c + in_w; + input_pos0.y = in_n * in_H + in_h0; + + input_pos1.x = in_W * in_c + in_w; + input_pos1.y = in_n * in_H + in_h1; + + input_pos2.x = in_W * in_c + in_w; + input_pos2.y = in_n * in_H + in_h2; + + input_pos3.x = in_W * in_c + in_w; + input_pos3.y = in_n * in_H + in_h3; + + CL_DTYPE4 input0; + CL_DTYPE4 input1; + CL_DTYPE4 input2; + CL_DTYPE4 input3; + CL_DTYPE4 output; + input0 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos0); + if (out_n % 4 == 0) { + output.x = input0.x; + } else if (out_n % 4 == 1) { + output.x = input0.y; + } else if (out_n % 4 == 2) { + output.x = input0.z; + } else { + output.x = input0.w; + } + + if (out_C - out_c * 4 >= 2) { + input1 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos1); + if (out_n % 4 == 0) { + output.y = input1.x; + } else if (out_n % 4 == 1) { + output.y = input1.y; + } else if (out_n % 4 == 2) { + output.y = input1.z; + } else { + output.y = input1.w; + } + } else { + output.y = 0.0f; + } + + if (out_C - out_c * 4 >= 3) { + input2 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos2); + if (out_n % 4 == 0) { + output.z = input2.x; + } else if (out_n % 4 == 1) { + output.z = input2.y; + } else if (out_n % 4 == 2) { + output.z = input2.z; + } else { + output.z = input2.w; + } + } else { + output.z = 0.0f; + } + + if (out_C - out_c * 4 >= 4) { + input3 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos3); + if (out_n % 4 == 0) { + output.w = input3.x; + } else if (out_n % 4 == 1) { + output.w = input3.y; + } else if (out_n % 4 == 2) { + output.w = input3.z; + } else { + output.w = input3.w; + } + } else { + output.w = 0.0f; + } + + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output); +} + +__kernel void transpose_4d_perm1230(__read_only image2d_t input_image, + __write_only image2d_t output_image, + __private const int out_C, + __private const int out_H, + __private const int out_W, + __private const int in_W, + __private const int in_H) { + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); + const int out_nh = get_global_id(2); + const int out_n = out_nh / out_H; + const int out_h = out_nh % out_H; + + const int in_n = out_w; + const int in_c = out_n / 4; + const int in_h0 = out_c * 4; + const int in_h1 = out_c * 4 + 1; + const int in_h2 = out_c * 4 + 2; + const int in_h3 = out_c * 4 + 3; + const int in_w = out_h; + + int2 output_pos; + output_pos.x = out_c * out_W + out_w; + output_pos.y = out_nh; + + int2 input_pos0; + int2 input_pos1; + int2 input_pos2; + int2 input_pos3; + input_pos0.x = in_W * in_c + in_w; + input_pos0.y = in_n * in_H + in_h0; + + input_pos1.x = in_W * in_c + in_w; + input_pos1.y = in_n * in_H + in_h1; + + input_pos2.x = in_W * in_c + in_w; + input_pos2.y = in_n * in_H + in_h2; + + input_pos3.x = in_W * in_c + in_w; + input_pos3.y = in_n * in_H + in_h3; + + CL_DTYPE4 input0; + CL_DTYPE4 input1; + CL_DTYPE4 input2; + CL_DTYPE4 input3; + CL_DTYPE4 output; + input0 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos0); + if (out_n % 4 == 0) { + output.x = input0.x; + } else if (out_n % 4 == 1) { + output.x = input0.y; + } else if (out_n % 4 == 2) { + output.x = input0.z; + } else { + output.x = input0.w; + } + + if (out_C - out_c * 4 >= 2) { + input1 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos1); + if (out_n % 4 == 0) { + output.y = input1.x; + } else if (out_n % 4 == 1) { + output.y = input1.y; + } else if (out_n % 4 == 2) { + output.y = input1.z; + } else { + output.y = input1.w; + } + } else { + output.y = 0.0f; + } + + if (out_C - out_c * 4 >= 3) { + input2 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos2); + if (out_n % 4 == 0) { + output.z = input2.x; + } else if (out_n % 4 == 1) { + output.z = input2.y; + } else if (out_n % 4 == 2) { + output.z = input2.z; + } else { + output.z = input2.w; + } + } else { + output.z = 0.0f; + } + + if (out_C - out_c * 4 >= 4) { + input3 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos3); + if (out_n % 4 == 0) { + output.w = input3.x; + } else if (out_n % 4 == 1) { + output.w = input3.y; + } else if (out_n % 4 == 2) { + output.w = input3.z; + } else { + output.w = input3.w; + } + } else { + output.w = 0.0f; + } + + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output); +} + +__kernel void transpose_4d_perm1302(__read_only image2d_t input_image, + __write_only image2d_t output_image, + __private const int out_C, + __private const int out_H, + __private const int out_W, + __private const int in_W, + __private const int in_H) { + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); + const int out_nh = get_global_id(2); + const int out_n = out_nh / out_H; + const int out_h = out_nh % out_H; + + const int in_n = out_h; + const int in_c = out_n / 4; + const int in_h = out_w; + const int in_w0 = out_c * 4; + const int in_w1 = out_c * 4 + 1; + const int in_w2 = out_c * 4 + 2; + const int in_w3 = out_c * 4 + 3; + + int2 output_pos; + output_pos.x = out_c * out_W + out_w; + output_pos.y = out_nh; + + int2 input_pos0; + int2 input_pos1; + int2 input_pos2; + int2 input_pos3; + input_pos0.x = in_W * in_c + in_w0; + input_pos0.y = in_n * in_H + in_h; + + input_pos1.x = in_W * in_c + in_w1; + input_pos1.y = in_n * in_H + in_h; + + input_pos2.x = in_W * in_c + in_w2; + input_pos2.y = in_n * in_H + in_h; + + input_pos3.x = in_W * in_c + in_w3; + input_pos3.y = in_n * in_H + in_h; + + CL_DTYPE4 input0; + CL_DTYPE4 input1; + CL_DTYPE4 input2; + CL_DTYPE4 input3; + CL_DTYPE4 output; + input0 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos0); + if (out_n % 4 == 0) { + output.x = input0.x; + } else if (out_n % 4 == 1) { + output.x = input0.y; + } else if (out_n % 4 == 2) { + output.x = input0.z; + } else { + output.x = input0.w; + } + + if (out_C - out_c * 4 >= 2) { + input1 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos1); + if (out_n % 4 == 0) { + output.y = input1.x; + } else if (out_n % 4 == 1) { + output.y = input1.y; + } else if (out_n % 4 == 2) { + output.y = input1.z; + } else { + output.y = input1.w; + } + } else { + output.y = 0.0f; + } + + if (out_C - out_c * 4 >= 3) { + input2 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos2); + if (out_n % 4 == 0) { + output.z = input2.x; + } else if (out_n % 4 == 1) { + output.z = input2.y; + } else if (out_n % 4 == 2) { + output.z = input2.z; + } else { + output.z = input2.w; + } + } else { + output.z = 0.0f; + } + + if (out_C - out_c * 4 >= 4) { + input3 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos3); + if (out_n % 4 == 0) { + output.w = input3.x; + } else if (out_n % 4 == 1) { + output.w = input3.y; + } else if (out_n % 4 == 2) { + output.w = input3.z; + } else { + output.w = input3.w; + } + } else { + output.w = 0.0f; + } + + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output); +} + +__kernel void transpose_4d_perm1320(__read_only image2d_t input_image, + __write_only image2d_t output_image, + __private const int out_C, + __private const int out_H, + __private const int out_W, + __private const int in_W, + __private const int in_H) { + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); + const int out_nh = get_global_id(2); + const int out_n = out_nh / out_H; + const int out_h = out_nh % out_H; + + const int in_n = out_w; + const int in_c = out_n / 4; + const int in_h = out_h; + const int in_w0 = out_c * 4; + const int in_w1 = out_c * 4 + 1; + const int in_w2 = out_c * 4 + 2; + const int in_w3 = out_c * 4 + 3; + + int2 output_pos; + output_pos.x = out_c * out_W + out_w; + output_pos.y = out_nh; + + int2 input_pos0; + int2 input_pos1; + int2 input_pos2; + int2 input_pos3; + input_pos0.x = in_W * in_c + in_w0; + input_pos0.y = in_n * in_H + in_h; + + input_pos1.x = in_W * in_c + in_w1; + input_pos1.y = in_n * in_H + in_h; + + input_pos2.x = in_W * in_c + in_w2; + input_pos2.y = in_n * in_H + in_h; + + input_pos3.x = in_W * in_c + in_w3; + input_pos3.y = in_n * in_H + in_h; + + CL_DTYPE4 input0; + CL_DTYPE4 input1; + CL_DTYPE4 input2; + CL_DTYPE4 input3; + CL_DTYPE4 output; + input0 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos0); + if (out_n % 4 == 0) { + output.x = input0.x; + } else if (out_n % 4 == 1) { + output.x = input0.y; + } else if (out_n % 4 == 2) { + output.x = input0.z; + } else { + output.x = input0.w; + } + + if (out_C - out_c * 4 >= 2) { + input1 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos1); + if (out_n % 4 == 0) { + output.y = input1.x; + } else if (out_n % 4 == 1) { + output.y = input1.y; + } else if (out_n % 4 == 2) { + output.y = input1.z; + } else { + output.y = input1.w; + } + } else { + output.y = 0.0f; + } + + if (out_C - out_c * 4 >= 3) { + input2 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos2); + if (out_n % 4 == 0) { + output.z = input2.x; + } else if (out_n % 4 == 1) { + output.z = input2.y; + } else if (out_n % 4 == 2) { + output.z = input2.z; + } else { + output.z = input2.w; + } + } else { + output.z = 0.0f; + } + + if (out_C - out_c * 4 >= 4) { + input3 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos3); + if (out_n % 4 == 0) { + output.w = input3.x; + } else if (out_n % 4 == 1) { + output.w = input3.y; + } else if (out_n % 4 == 2) { + output.w = input3.z; + } else { + output.w = input3.w; + } + } else { + output.w = 0.0f; + } + + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output); +} diff --git a/lite/backends/opencl/cl_kernel/image/transpose_fixb3_kernel.cl b/lite/backends/opencl/cl_kernel/image/transpose_fixb3_kernel.cl new file mode 100644 index 00000000000..1b4f9a3e86b --- /dev/null +++ b/lite/backends/opencl/cl_kernel/image/transpose_fixb3_kernel.cl @@ -0,0 +1,503 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include + +__kernel void transpose_4d_perm2013(__read_only image2d_t input_image, + __write_only image2d_t output_image, + __private const int out_C, + __private const int out_H, + __private const int out_W, + __private const int in_W, + __private const int in_H) { + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); + const int out_nh = get_global_id(2); + const int out_n = out_nh / out_H; + const int out_h = out_nh % out_H; + + const int in_n0 = out_c * 4; + const int in_n1 = out_c * 4 + 1; + const int in_n2 = out_c * 4 + 2; + const int in_n3 = out_c * 4 + 3; + const int in_c = out_h / 4; + const int in_h = out_n; + const int in_w = out_w; + + int2 output_pos; + output_pos.x = out_c * out_W + out_w; + output_pos.y = out_nh; + + int2 input_pos0; + int2 input_pos1; + int2 input_pos2; + int2 input_pos3; + input_pos0.x = in_W * in_c + in_w; + input_pos0.y = in_n0 * in_H + in_h; + + input_pos1.x = in_W * in_c + in_w; + input_pos1.y = in_n1 * in_H + in_h; + + input_pos2.x = in_W * in_c + in_w; + input_pos2.y = in_n2 * in_H + in_h; + + input_pos3.x = in_W * in_c + in_w; + input_pos3.y = in_n3 * in_H + in_h; + + CL_DTYPE4 input0; + CL_DTYPE4 input1; + CL_DTYPE4 input2; + CL_DTYPE4 input3; + CL_DTYPE4 output; + input0 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos0); + if (out_h % 4 == 0) { + output.x = input0.x; + } else if (out_h % 4 == 1) { + output.x = input0.y; + } else if (out_h % 4 == 2) { + output.x = input0.z; + } else { + output.x = input0.w; + } + + if (out_C - out_c * 4 >= 2) { + input1 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos1); + if (out_h % 4 == 0) { + output.y = input1.x; + } else if (out_h % 4 == 1) { + output.y = input1.y; + } else if (out_h % 4 == 2) { + output.y = input1.z; + } else { + output.y = input1.w; + } + } else { + output.y = 0.0f; + } + + if (out_C - out_c * 4 >= 3) { + input2 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos2); + if (out_h % 4 == 0) { + output.z = input2.x; + } else if (out_h % 4 == 1) { + output.z = input2.y; + } else if (out_h % 4 == 2) { + output.z = input2.z; + } else { + output.z = input2.w; + } + } else { + output.z = 0.0f; + } + + if (out_C - out_c * 4 >= 4) { + input3 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos3); + if (out_h % 4 == 0) { + output.w = input3.x; + } else if (out_h % 4 == 1) { + output.w = input3.y; + } else if (out_h % 4 == 2) { + output.w = input3.z; + } else { + output.w = input3.w; + } + } else { + output.w = 0.0f; + } + + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output); +} + +__kernel void transpose_4d_perm2031(__read_only image2d_t input_image, + __write_only image2d_t output_image, + __private const int out_C, + __private const int out_H, + __private const int out_W, + __private const int in_W, + __private const int in_H) { + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); + const int out_nh = get_global_id(2); + const int out_n = out_nh / out_H; + const int out_h = out_nh % out_H; + + const int in_n0 = out_c * 4; + const int in_n1 = out_c * 4 + 1; + const int in_n2 = out_c * 4 + 2; + const int in_n3 = out_c * 4 + 3; + const int in_c = out_w / 4; + const int in_h = out_n; + const int in_w = out_h; + + int2 output_pos; + output_pos.x = out_c * out_W + out_w; + output_pos.y = out_nh; + + int2 input_pos0; + int2 input_pos1; + int2 input_pos2; + int2 input_pos3; + input_pos0.x = in_W * in_c + in_w; + input_pos0.y = in_n0 * in_H + in_h; + + input_pos1.x = in_W * in_c + in_w; + input_pos1.y = in_n1 * in_H + in_h; + + input_pos2.x = in_W * in_c + in_w; + input_pos2.y = in_n2 * in_H + in_h; + + input_pos3.x = in_W * in_c + in_w; + input_pos3.y = in_n3 * in_H + in_h; + + CL_DTYPE4 input0; + CL_DTYPE4 input1; + CL_DTYPE4 input2; + CL_DTYPE4 input3; + CL_DTYPE4 output; + input0 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos0); + if (out_w % 4 == 0) { + output.x = input0.x; + } else if (out_w % 4 == 1) { + output.x = input0.y; + } else if (out_w % 4 == 2) { + output.x = input0.z; + } else { + output.x = input0.w; + } + + if (out_C - out_c * 4 >= 2) { + input1 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos1); + if (out_w % 4 == 0) { + output.y = input1.x; + } else if (out_w % 4 == 1) { + output.y = input1.y; + } else if (out_w % 4 == 2) { + output.y = input1.z; + } else { + output.y = input1.w; + } + } else { + output.y = 0.0f; + } + + if (out_C - out_c * 4 >= 3) { + input2 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos2); + if (out_w % 4 == 0) { + output.z = input2.x; + } else if (out_w % 4 == 1) { + output.z = input2.y; + } else if (out_w % 4 == 2) { + output.z = input2.z; + } else { + output.z = input2.w; + } + } else { + output.z = 0.0f; + } + + if (out_C - out_c * 4 >= 4) { + input3 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos3); + if (out_w % 4 == 0) { + output.w = input3.x; + } else if (out_w % 4 == 1) { + output.w = input3.y; + } else if (out_w % 4 == 2) { + output.w = input3.z; + } else { + output.w = input3.w; + } + } else { + output.w = 0.0f; + } + + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output); +} + +__kernel void transpose_4d_perm2103(__read_only image2d_t input_image, + __write_only image2d_t output_image, + __private const int out_C, + __private const int out_H, + __private const int out_W, + __private const int in_W, + __private const int in_H) { + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); + const int out_nh = get_global_id(2); + const int out_n = out_nh / out_H; + const int out_h = out_nh % out_H; + + const int in_n = out_h; + const int in_c = out_c; + const int in_h = out_n; + const int in_w = out_w; + + int2 output_pos; + output_pos.x = out_c * out_W + out_w; + output_pos.y = out_nh; + + int2 input_pos0; + input_pos0.x = in_W * in_c + in_w; + input_pos0.y = in_n * in_H + in_h; + + CL_DTYPE4 input0; + CL_DTYPE4 output; + input0 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos0); + + output = input0; + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output); +} + +__kernel void transpose_4d_perm2130(__read_only image2d_t input_image, + __write_only image2d_t output_image, + __private const int out_C, + __private const int out_H, + __private const int out_W, + __private const int in_W, + __private const int in_H) { + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); + const int out_nh = get_global_id(2); + const int out_n = out_nh / out_H; + const int out_h = out_nh % out_H; + + const int in_n = out_w; + const int in_c = out_c; + const int in_h = out_n; + const int in_w = out_h; + + int2 output_pos; + output_pos.x = out_c * out_W + out_w; + output_pos.y = out_nh; + + int2 input_pos0; + input_pos0.x = in_W * in_c + in_w; + input_pos0.y = in_n * in_H + in_h; + + CL_DTYPE4 input0; + CL_DTYPE4 output; + input0 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos0); + + output = input0; + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output); +} + +__kernel void transpose_4d_perm2301(__read_only image2d_t input_image, + __write_only image2d_t output_image, + __private const int out_C, + __private const int out_H, + __private const int out_W, + __private const int in_W, + __private const int in_H) { + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); + const int out_nh = get_global_id(2); + const int out_n = out_nh / out_H; + const int out_h = out_nh % out_H; + + const int in_n = out_h; + const int in_c = out_w / 4; + const int in_h = out_n; + const int in_w0 = out_c * 4; + const int in_w1 = out_c * 4 + 1; + const int in_w2 = out_c * 4 + 2; + const int in_w3 = out_c * 4 + 3; + + int2 output_pos; + output_pos.x = out_c * out_W + out_w; + output_pos.y = out_nh; + + int2 input_pos0; + int2 input_pos1; + int2 input_pos2; + int2 input_pos3; + input_pos0.x = in_W * in_c + in_w0; + input_pos0.y = in_n * in_H + in_h; + + input_pos1.x = in_W * in_c + in_w1; + input_pos1.y = in_n * in_H + in_h; + + input_pos2.x = in_W * in_c + in_w2; + input_pos2.y = in_n * in_H + in_h; + + input_pos3.x = in_W * in_c + in_w3; + input_pos3.y = in_n * in_H + in_h; + + CL_DTYPE4 input0; + CL_DTYPE4 input1; + CL_DTYPE4 input2; + CL_DTYPE4 input3; + CL_DTYPE4 output; + input0 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos0); + if (out_w % 4 == 0) { + output.x = input0.x; + } else if (out_w % 4 == 1) { + output.x = input0.y; + } else if (out_w % 4 == 2) { + output.x = input0.z; + } else { + output.x = input0.w; + } + + if (out_C - out_c * 4 >= 2) { + input1 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos1); + if (out_w % 4 == 0) { + output.y = input1.x; + } else if (out_w % 4 == 1) { + output.y = input1.y; + } else if (out_w % 4 == 2) { + output.y = input1.z; + } else { + output.y = input1.w; + } + } else { + output.y = 0.0f; + } + + if (out_C - out_c * 4 >= 3) { + input2 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos2); + if (out_w % 4 == 0) { + output.z = input2.x; + } else if (out_w % 4 == 1) { + output.z = input2.y; + } else if (out_w % 4 == 2) { + output.z = input2.z; + } else { + output.z = input2.w; + } + } else { + output.z = 0.0f; + } + + if (out_C - out_c * 4 >= 4) { + input3 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos3); + if (out_w % 4 == 0) { + output.w = input3.x; + } else if (out_w % 4 == 1) { + output.w = input3.y; + } else if (out_w % 4 == 2) { + output.w = input3.z; + } else { + output.w = input3.w; + } + } else { + output.w = 0.0f; + } + + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output); +} + +__kernel void transpose_4d_perm2310(__read_only image2d_t input_image, + __write_only image2d_t output_image, + __private const int out_C, + __private const int out_H, + __private const int out_W, + __private const int in_W, + __private const int in_H) { + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); + const int out_nh = get_global_id(2); + const int out_n = out_nh / out_H; + const int out_h = out_nh % out_H; + + const int in_n = out_w; + const int in_c = out_h / 4; + const int in_h = out_n; + const int in_w0 = out_c * 4; + const int in_w1 = out_c * 4 + 1; + const int in_w2 = out_c * 4 + 2; + const int in_w3 = out_c * 4 + 3; + + int2 output_pos; + output_pos.x = out_c * out_W + out_w; + output_pos.y = out_nh; + + int2 input_pos0; + int2 input_pos1; + int2 input_pos2; + int2 input_pos3; + input_pos0.x = in_W * in_c + in_w0; + input_pos0.y = in_n * in_H + in_h; + + input_pos1.x = in_W * in_c + in_w1; + input_pos1.y = in_n * in_H + in_h; + + input_pos2.x = in_W * in_c + in_w2; + input_pos2.y = in_n * in_H + in_h; + + input_pos3.x = in_W * in_c + in_w3; + input_pos3.y = in_n * in_H + in_h; + + CL_DTYPE4 input0; + CL_DTYPE4 input1; + CL_DTYPE4 input2; + CL_DTYPE4 input3; + CL_DTYPE4 output; + input0 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos0); + if (out_h % 4 == 0) { + output.x = input0.x; + } else if (out_h % 4 == 1) { + output.x = input0.y; + } else if (out_h % 4 == 2) { + output.x = input0.z; + } else { + output.x = input0.w; + } + + if (out_C - out_c * 4 >= 2) { + input1 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos1); + if (out_h % 4 == 0) { + output.y = input1.x; + } else if (out_h % 4 == 1) { + output.y = input1.y; + } else if (out_h % 4 == 2) { + output.y = input1.z; + } else { + output.y = input1.w; + } + } else { + output.y = 0.0f; + } + + if (out_C - out_c * 4 >= 3) { + input2 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos2); + if (out_h % 4 == 0) { + output.z = input2.x; + } else if (out_h % 4 == 1) { + output.z = input2.y; + } else if (out_h % 4 == 2) { + output.z = input2.z; + } else { + output.z = input2.w; + } + } else { + output.z = 0.0f; + } + + if (out_C - out_c * 4 >= 4) { + input3 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos3); + if (out_h % 4 == 0) { + output.w = input3.x; + } else if (out_h % 4 == 1) { + output.w = input3.y; + } else if (out_h % 4 == 2) { + output.w = input3.z; + } else { + output.w = input3.w; + } + } else { + output.w = 0.0f; + } + + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output); +} diff --git a/lite/backends/opencl/cl_kernel/image/transpose_fixb4_kernel.cl b/lite/backends/opencl/cl_kernel/image/transpose_fixb4_kernel.cl new file mode 100644 index 00000000000..c5a600fe271 --- /dev/null +++ b/lite/backends/opencl/cl_kernel/image/transpose_fixb4_kernel.cl @@ -0,0 +1,503 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include + +__kernel void transpose_4d_perm3012(__read_only image2d_t input_image, + __write_only image2d_t output_image, + __private const int out_C, + __private const int out_H, + __private const int out_W, + __private const int in_W, + __private const int in_H) { + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); + const int out_nh = get_global_id(2); + const int out_n = out_nh / out_H; + const int out_h = out_nh % out_H; + + const int in_n0 = out_c * 4; + const int in_n1 = out_c * 4 + 1; + const int in_n2 = out_c * 4 + 2; + const int in_n3 = out_c * 4 + 3; + const int in_c = out_h / 4; + const int in_h = out_w; + const int in_w = out_n; + + int2 output_pos; + output_pos.x = out_c * out_W + out_w; + output_pos.y = out_nh; + + int2 input_pos0; + int2 input_pos1; + int2 input_pos2; + int2 input_pos3; + input_pos0.x = in_W * in_c + in_w; + input_pos0.y = in_n0 * in_H + in_h; + + input_pos1.x = in_W * in_c + in_w; + input_pos1.y = in_n1 * in_H + in_h; + + input_pos2.x = in_W * in_c + in_w; + input_pos2.y = in_n2 * in_H + in_h; + + input_pos3.x = in_W * in_c + in_w; + input_pos3.y = in_n3 * in_H + in_h; + + CL_DTYPE4 input0; + CL_DTYPE4 input1; + CL_DTYPE4 input2; + CL_DTYPE4 input3; + CL_DTYPE4 output; + input0 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos0); + if (out_h % 4 == 0) { + output.x = input0.x; + } else if (out_h % 4 == 1) { + output.x = input0.y; + } else if (out_h % 4 == 2) { + output.x = input0.z; + } else { + output.x = input0.w; + } + + if (out_C - out_c * 4 >= 2) { + input1 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos1); + if (out_h % 4 == 0) { + output.y = input1.x; + } else if (out_h % 4 == 1) { + output.y = input1.y; + } else if (out_h % 4 == 2) { + output.y = input1.z; + } else { + output.y = input1.w; + } + } else { + output.y = 0.0f; + } + + if (out_C - out_c * 4 >= 3) { + input2 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos2); + if (out_h % 4 == 0) { + output.z = input2.x; + } else if (out_h % 4 == 1) { + output.z = input2.y; + } else if (out_h % 4 == 2) { + output.z = input2.z; + } else { + output.z = input2.w; + } + } else { + output.z = 0.0f; + } + + if (out_C - out_c * 4 >= 4) { + input3 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos3); + if (out_h % 4 == 0) { + output.w = input3.x; + } else if (out_h % 4 == 1) { + output.w = input3.y; + } else if (out_h % 4 == 2) { + output.w = input3.z; + } else { + output.w = input3.w; + } + } else { + output.w = 0.0f; + } + + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output); +} + +__kernel void transpose_4d_perm3021(__read_only image2d_t input_image, + __write_only image2d_t output_image, + __private const int out_C, + __private const int out_H, + __private const int out_W, + __private const int in_W, + __private const int in_H) { + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); + const int out_nh = get_global_id(2); + const int out_n = out_nh / out_H; + const int out_h = out_nh % out_H; + + const int in_n0 = out_c * 4; + const int in_n1 = out_c * 4 + 1; + const int in_n2 = out_c * 4 + 2; + const int in_n3 = out_c * 4 + 3; + const int in_c = out_w / 4; + const int in_h = out_h; + const int in_w = out_n; + + int2 output_pos; + output_pos.x = out_c * out_W + out_w; + output_pos.y = out_nh; + + int2 input_pos0; + int2 input_pos1; + int2 input_pos2; + int2 input_pos3; + input_pos0.x = in_W * in_c + in_w; + input_pos0.y = in_n0 * in_H + in_h; + + input_pos1.x = in_W * in_c + in_w; + input_pos1.y = in_n1 * in_H + in_h; + + input_pos2.x = in_W * in_c + in_w; + input_pos2.y = in_n2 * in_H + in_h; + + input_pos3.x = in_W * in_c + in_w; + input_pos3.y = in_n3 * in_H + in_h; + + CL_DTYPE4 input0; + CL_DTYPE4 input1; + CL_DTYPE4 input2; + CL_DTYPE4 input3; + CL_DTYPE4 output; + input0 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos0); + if (out_w % 4 == 0) { + output.x = input0.x; + } else if (out_w % 4 == 1) { + output.x = input0.y; + } else if (out_w % 4 == 2) { + output.x = input0.z; + } else { + output.x = input0.w; + } + + if (out_C - out_c * 4 >= 2) { + input1 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos1); + if (out_w % 4 == 0) { + output.y = input1.x; + } else if (out_w % 4 == 1) { + output.y = input1.y; + } else if (out_w % 4 == 2) { + output.y = input1.z; + } else { + output.y = input1.w; + } + } else { + output.y = 0.0f; + } + + if (out_C - out_c * 4 >= 3) { + input2 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos2); + if (out_w % 4 == 0) { + output.z = input2.x; + } else if (out_w % 4 == 1) { + output.z = input2.y; + } else if (out_w % 4 == 2) { + output.z = input2.z; + } else { + output.z = input2.w; + } + } else { + output.z = 0.0f; + } + + if (out_C - out_c * 4 >= 4) { + input3 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos3); + if (out_w % 4 == 0) { + output.w = input3.x; + } else if (out_w % 4 == 1) { + output.w = input3.y; + } else if (out_w % 4 == 2) { + output.w = input3.z; + } else { + output.w = input3.w; + } + } else { + output.w = 0.0f; + } + + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output); +} + +__kernel void transpose_4d_perm3102(__read_only image2d_t input_image, + __write_only image2d_t output_image, + __private const int out_C, + __private const int out_H, + __private const int out_W, + __private const int in_W, + __private const int in_H) { + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); + const int out_nh = get_global_id(2); + const int out_n = out_nh / out_H; + const int out_h = out_nh % out_H; + + const int in_n = out_h; + const int in_c = out_c; + const int in_h = out_w; + const int in_w = out_n; + + int2 output_pos; + output_pos.x = out_c * out_W + out_w; + output_pos.y = out_nh; + + int2 input_pos0; + input_pos0.x = in_W * in_c + in_w; + input_pos0.y = in_n * in_H + in_h; + + CL_DTYPE4 input0; + CL_DTYPE4 output; + input0 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos0); + + output = input0; + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output); +} + +__kernel void transpose_4d_perm3120(__read_only image2d_t input_image, + __write_only image2d_t output_image, + __private const int out_C, + __private const int out_H, + __private const int out_W, + __private const int in_W, + __private const int in_H) { + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); + const int out_nh = get_global_id(2); + const int out_n = out_nh / out_H; + const int out_h = out_nh % out_H; + + const int in_n = out_w; + const int in_c = out_c; + const int in_h = out_h; + const int in_w = out_n; + + int2 output_pos; + output_pos.x = out_c * out_W + out_w; + output_pos.y = out_nh; + + int2 input_pos0; + input_pos0.x = in_W * in_c + in_w; + input_pos0.y = in_n * in_H + in_h; + + CL_DTYPE4 input0; + CL_DTYPE4 output; + input0 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos0); + + output = input0; + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output); +} + +__kernel void transpose_4d_perm3201(__read_only image2d_t input_image, + __write_only image2d_t output_image, + __private const int out_C, + __private const int out_H, + __private const int out_W, + __private const int in_W, + __private const int in_H) { + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); + const int out_nh = get_global_id(2); + const int out_n = out_nh / out_H; + const int out_h = out_nh % out_H; + + const int in_n = out_h; + const int in_c = out_w / 4; + const int in_h0 = out_c * 4; + const int in_h1 = out_c * 4 + 1; + const int in_h2 = out_c * 4 + 2; + const int in_h3 = out_c * 4 + 3; + const int in_w = out_n; + + int2 output_pos; + output_pos.x = out_c * out_W + out_w; + output_pos.y = out_nh; + + int2 input_pos0; + int2 input_pos1; + int2 input_pos2; + int2 input_pos3; + input_pos0.x = in_W * in_c + in_w; + input_pos0.y = in_n * in_H + in_h0; + + input_pos1.x = in_W * in_c + in_w; + input_pos1.y = in_n * in_H + in_h1; + + input_pos2.x = in_W * in_c + in_w; + input_pos2.y = in_n * in_H + in_h2; + + input_pos3.x = in_W * in_c + in_w; + input_pos3.y = in_n * in_H + in_h3; + + CL_DTYPE4 input0; + CL_DTYPE4 input1; + CL_DTYPE4 input2; + CL_DTYPE4 input3; + CL_DTYPE4 output; + input0 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos0); + if (out_w % 4 == 0) { + output.x = input0.x; + } else if (out_w % 4 == 1) { + output.x = input0.y; + } else if (out_w % 4 == 2) { + output.x = input0.z; + } else { + output.x = input0.w; + } + + if (out_C - out_c * 4 >= 2) { + input1 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos1); + if (out_w % 4 == 0) { + output.y = input1.x; + } else if (out_w % 4 == 1) { + output.y = input1.y; + } else if (out_w % 4 == 2) { + output.y = input1.z; + } else { + output.y = input1.w; + } + } else { + output.y = 0.0f; + } + + if (out_C - out_c * 4 >= 3) { + input2 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos2); + if (out_w % 4 == 0) { + output.z = input2.x; + } else if (out_w % 4 == 1) { + output.z = input2.y; + } else if (out_w % 4 == 2) { + output.z = input2.z; + } else { + output.z = input2.w; + } + } else { + output.z = 0.0f; + } + + if (out_C - out_c * 4 >= 4) { + input3 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos3); + if (out_w % 4 == 0) { + output.w = input3.x; + } else if (out_w % 4 == 1) { + output.w = input3.y; + } else if (out_w % 4 == 2) { + output.w = input3.z; + } else { + output.w = input3.w; + } + } else { + output.w = 0.0f; + } + + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output); +} + +__kernel void transpose_4d_perm3210(__read_only image2d_t input_image, + __write_only image2d_t output_image, + __private const int out_C, + __private const int out_H, + __private const int out_W, + __private const int in_W, + __private const int in_H) { + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); + const int out_nh = get_global_id(2); + const int out_n = out_nh / out_H; + const int out_h = out_nh % out_H; + + const int in_n = out_w; + const int in_c = out_h / 4; + const int in_h0 = out_c * 4; + const int in_h1 = out_c * 4 + 1; + const int in_h2 = out_c * 4 + 2; + const int in_h3 = out_c * 4 + 3; + const int in_w = out_n; + + int2 output_pos; + output_pos.x = out_c * out_W + out_w; + output_pos.y = out_nh; + + int2 input_pos0; + int2 input_pos1; + int2 input_pos2; + int2 input_pos3; + input_pos0.x = in_W * in_c + in_w; + input_pos0.y = in_n * in_H + in_h0; + + input_pos1.x = in_W * in_c + in_w; + input_pos1.y = in_n * in_H + in_h1; + + input_pos2.x = in_W * in_c + in_w; + input_pos2.y = in_n * in_H + in_h2; + + input_pos3.x = in_W * in_c + in_w; + input_pos3.y = in_n * in_H + in_h3; + + CL_DTYPE4 input0; + CL_DTYPE4 input1; + CL_DTYPE4 input2; + CL_DTYPE4 input3; + CL_DTYPE4 output; + input0 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos0); + if (out_h % 4 == 0) { + output.x = input0.x; + } else if (out_h % 4 == 1) { + output.x = input0.y; + } else if (out_h % 4 == 2) { + output.x = input0.z; + } else { + output.x = input0.w; + } + + if (out_C - out_c * 4 >= 2) { + input1 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos1); + if (out_h % 4 == 0) { + output.y = input1.x; + } else if (out_h % 4 == 1) { + output.y = input1.y; + } else if (out_h % 4 == 2) { + output.y = input1.z; + } else { + output.y = input1.w; + } + } else { + output.y = 0.0f; + } + + if (out_C - out_c * 4 >= 3) { + input2 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos2); + if (out_h % 4 == 0) { + output.z = input2.x; + } else if (out_h % 4 == 1) { + output.z = input2.y; + } else if (out_h % 4 == 2) { + output.z = input2.z; + } else { + output.z = input2.w; + } + } else { + output.z = 0.0f; + } + + if (out_C - out_c * 4 >= 4) { + input3 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos3); + if (out_h % 4 == 0) { + output.w = input3.x; + } else if (out_h % 4 == 1) { + output.w = input3.y; + } else if (out_h % 4 == 2) { + output.w = input3.z; + } else { + output.w = input3.w; + } + } else { + output.w = 0.0f; + } + + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output); +} diff --git a/lite/backends/opencl/cl_kernel/image/transpose_kernel.cl b/lite/backends/opencl/cl_kernel/image/transpose_kernel.cl deleted file mode 100644 index 9d9082eeeef..00000000000 --- a/lite/backends/opencl/cl_kernel/image/transpose_kernel.cl +++ /dev/null @@ -1,2110 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include - -__kernel void transpose_4d_perm0132(__read_only image2d_t input_image, - __write_only image2d_t output_image, - __private const int out_C, - __private const int out_H, - __private const int out_W, - __private const int in_W, - __private const int in_H) { - const int out_c = get_global_id(0); - const int out_w = get_global_id(1); - const int out_nh = get_global_id(2); - const int out_n = out_nh / out_H; - const int out_h = out_nh % out_H; - - const int in_n = out_n; - const int in_c = out_c; - const int in_h = out_w; - const int in_w = out_h; - - int2 output_pos; - output_pos.x = out_c * out_W + out_w; - output_pos.y = out_nh; - - int2 input_pos0; - input_pos0.x = in_W * in_c + in_w; - input_pos0.y = in_n * in_H + in_h; - - CL_DTYPE4 input0; - CL_DTYPE4 output; - input0 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos0); - - output = input0; - WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output); -} - -__kernel void transpose_4d_perm0213(__read_only image2d_t input_image, - __write_only image2d_t output_image, - __private const int out_C, - __private const int out_H, - __private const int out_W, - __private const int in_W, - __private const int in_H) { - const int out_c = get_global_id(0); - const int out_w = get_global_id(1); - const int out_nh = get_global_id(2); - const int out_n = out_nh / out_H; - const int out_h = out_nh % out_H; - - const int in_n = out_n; - const int in_c = out_h / 4; - const int in_h0 = out_c * 4; - const int in_h1 = out_c * 4 + 1; - const int in_h2 = out_c * 4 + 2; - const int in_h3 = out_c * 4 + 3; - const int in_w = out_w; - - int2 output_pos; - output_pos.x = out_c * out_W + out_w; - output_pos.y = out_nh; - - int2 input_pos0; - int2 input_pos1; - int2 input_pos2; - int2 input_pos3; - input_pos0.x = in_W * in_c + in_w; - input_pos0.y = in_n * in_H + in_h0; - - input_pos1.x = in_W * in_c + in_w; - input_pos1.y = in_n * in_H + in_h1; - - input_pos2.x = in_W * in_c + in_w; - input_pos2.y = in_n * in_H + in_h2; - - input_pos3.x = in_W * in_c + in_w; - input_pos3.y = in_n * in_H + in_h3; - - CL_DTYPE4 input0; - CL_DTYPE4 input1; - CL_DTYPE4 input2; - CL_DTYPE4 input3; - CL_DTYPE4 output; - input0 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos0); - if (out_h % 4 == 0) { - output.x = input0.x; - } else if (out_h % 4 == 1) { - output.x = input0.y; - } else if (out_h % 4 == 2) { - output.x = input0.z; - } else { - output.x = input0.w; - } - - if (out_C - out_c * 4 >= 2) { - input1 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos1); - if (out_h % 4 == 0) { - output.y = input1.x; - } else if (out_h % 4 == 1) { - output.y = input1.y; - } else if (out_h % 4 == 2) { - output.y = input1.z; - } else { - output.y = input1.w; - } - } else { - output.y = 0.0f; - } - - if (out_C - out_c * 4 >= 3) { - input2 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos2); - if (out_h % 4 == 0) { - output.z = input2.x; - } else if (out_h % 4 == 1) { - output.z = input2.y; - } else if (out_h % 4 == 2) { - output.z = input2.z; - } else { - output.z = input2.w; - } - } else { - output.z = 0.0f; - } - - if (out_C - out_c * 4 >= 4) { - input3 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos3); - if (out_h % 4 == 0) { - output.w = input3.x; - } else if (out_h % 4 == 1) { - output.w = input3.y; - } else if (out_h % 4 == 2) { - output.w = input3.z; - } else { - output.w = input3.w; - } - } else { - output.w = 0.0f; - } - - WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output); -} - -__kernel void transpose_4d_perm0231(__read_only image2d_t input_image, - __write_only image2d_t output_image, - __private const int out_C, - __private const int out_H, - __private const int out_W, - __private const int in_W, - __private const int in_H) { - const int out_c = get_global_id(0); - const int out_w = get_global_id(1); - const int out_nh = get_global_id(2); - const int out_n = out_nh / out_H; - const int out_h = out_nh % out_H; - - const int in_n = out_n; - const int in_c = out_w / 4; - const int in_h0 = out_c * 4; - const int in_h1 = out_c * 4 + 1; - const int in_h2 = out_c * 4 + 2; - const int in_h3 = out_c * 4 + 3; - const int in_w = out_h; - - int2 output_pos; - output_pos.x = out_c * out_W + out_w; - output_pos.y = out_nh; - - int2 input_pos0; - int2 input_pos1; - int2 input_pos2; - int2 input_pos3; - input_pos0.x = in_W * in_c + in_w; - input_pos0.y = in_n * in_H + in_h0; - - input_pos1.x = in_W * in_c + in_w; - input_pos1.y = in_n * in_H + in_h1; - - input_pos2.x = in_W * in_c + in_w; - input_pos2.y = in_n * in_H + in_h2; - - input_pos3.x = in_W * in_c + in_w; - input_pos3.y = in_n * in_H + in_h3; - - CL_DTYPE4 input0; - CL_DTYPE4 input1; - CL_DTYPE4 input2; - CL_DTYPE4 input3; - CL_DTYPE4 output; - input0 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos0); - if (out_w % 4 == 0) { - output.x = input0.x; - } else if (out_w % 4 == 1) { - output.x = input0.y; - } else if (out_w % 4 == 2) { - output.x = input0.z; - } else { - output.x = input0.w; - } - - if (out_C - out_c * 4 >= 2) { - input1 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos1); - if (out_w % 4 == 0) { - output.y = input1.x; - } else if (out_w % 4 == 1) { - output.y = input1.y; - } else if (out_w % 4 == 2) { - output.y = input1.z; - } else { - output.y = input1.w; - } - } else { - output.y = 0.0f; - } - - if (out_C - out_c * 4 >= 3) { - input2 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos2); - if (out_w % 4 == 0) { - output.z = input2.x; - } else if (out_w % 4 == 1) { - output.z = input2.y; - } else if (out_w % 4 == 2) { - output.z = input2.z; - } else { - output.z = input2.w; - } - } else { - output.z = 0.0f; - } - - if (out_C - out_c * 4 >= 4) { - input3 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos3); - if (out_w % 4 == 0) { - output.w = input3.x; - } else if (out_w % 4 == 1) { - output.w = input3.y; - } else if (out_w % 4 == 2) { - output.w = input3.z; - } else { - output.w = input3.w; - } - } else { - output.w = 0.0f; - } - - WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output); -} - -__kernel void transpose_4d_perm0312(__read_only image2d_t input_image, - __write_only image2d_t output_image, - __private const int out_C, - __private const int out_H, - __private const int out_W, - __private const int in_W, - __private const int in_H) { - const int out_c = get_global_id(0); - const int out_w = get_global_id(1); - const int out_nh = get_global_id(2); - const int out_n = out_nh / out_H; - const int out_h = out_nh % out_H; - - const int in_n = out_n; - const int in_c = out_h / 4; - const int in_h = out_w; - const int in_w0 = out_c * 4; - const int in_w1 = out_c * 4 + 1; - const int in_w2 = out_c * 4 + 2; - const int in_w3 = out_c * 4 + 3; - - int2 output_pos; - output_pos.x = out_c * out_W + out_w; - output_pos.y = out_nh; - - int2 input_pos0; - int2 input_pos1; - int2 input_pos2; - int2 input_pos3; - input_pos0.x = in_W * in_c + in_w0; - input_pos0.y = in_n * in_H + in_h; - - input_pos1.x = in_W * in_c + in_w1; - input_pos1.y = in_n * in_H + in_h; - - input_pos2.x = in_W * in_c + in_w2; - input_pos2.y = in_n * in_H + in_h; - - input_pos3.x = in_W * in_c + in_w3; - input_pos3.y = in_n * in_H + in_h; - - CL_DTYPE4 input0; - CL_DTYPE4 input1; - CL_DTYPE4 input2; - CL_DTYPE4 input3; - CL_DTYPE4 output; - input0 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos0); - if (out_h % 4 == 0) { - output.x = input0.x; - } else if (out_h % 4 == 1) { - output.x = input0.y; - } else if (out_h % 4 == 2) { - output.x = input0.z; - } else { - output.x = input0.w; - } - - if (out_C - out_c * 4 >= 2) { - input1 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos1); - if (out_h % 4 == 0) { - output.y = input1.x; - } else if (out_h % 4 == 1) { - output.y = input1.y; - } else if (out_h % 4 == 2) { - output.y = input1.z; - } else { - output.y = input1.w; - } - } else { - output.y = 0.0f; - } - - if (out_C - out_c * 4 >= 3) { - input2 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos2); - if (out_h % 4 == 0) { - output.z = input2.x; - } else if (out_h % 4 == 1) { - output.z = input2.y; - } else if (out_h % 4 == 2) { - output.z = input2.z; - } else { - output.z = input2.w; - } - } else { - output.z = 0.0f; - } - - if (out_C - out_c * 4 >= 4) { - input3 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos3); - if (out_h % 4 == 0) { - output.w = input3.x; - } else if (out_h % 4 == 1) { - output.w = input3.y; - } else if (out_h % 4 == 2) { - output.w = input3.z; - } else { - output.w = input3.w; - } - } else { - output.w = 0.0f; - } - - WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output); -} - -__kernel void transpose_4d_perm0321(__read_only image2d_t input_image, - __write_only image2d_t output_image, - __private const int out_C, - __private const int out_H, - __private const int out_W, - __private const int in_W, - __private const int in_H) { - const int out_c = get_global_id(0); - const int out_w = get_global_id(1); - const int out_nh = get_global_id(2); - const int out_n = out_nh / out_H; - const int out_h = out_nh % out_H; - - const int in_n = out_n; - const int in_c = out_w / 4; - const int in_w0 = out_c * 4; - const int in_w1 = out_c * 4 + 1; - const int in_w2 = out_c * 4 + 2; - const int in_w3 = out_c * 4 + 3; - const int in_h = out_h; - const int in_w = out_w; - - int2 output_pos; - output_pos.x = out_c * out_W + out_w; - output_pos.y = out_nh; - - int2 input_pos0; - int2 input_pos1; - int2 input_pos2; - int2 input_pos3; - input_pos0.x = in_W * in_c + in_w0; - input_pos0.y = out_nh; - - input_pos1.x = in_W * in_c + in_w1; - input_pos1.y = out_nh; - - input_pos2.x = in_W * in_c + in_w2; - input_pos2.y = out_nh; - - input_pos3.x = in_W * in_c + in_w3; - input_pos3.y = out_nh; - - CL_DTYPE4 input0; - CL_DTYPE4 input1; - CL_DTYPE4 input2; - CL_DTYPE4 input3; - CL_DTYPE4 output; - input0 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos0); - if (out_w % 4 == 0) { - output.x = input0.x; - } else if (out_w % 4 == 1) { - output.x = input0.y; - } else if (out_w % 4 == 2) { - output.x = input0.z; - } else { - output.x = input0.w; - } - - if (out_C - out_c * 4 >= 2) { - input1 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos1); - if (out_w % 4 == 0) { - output.y = input1.x; - } else if (out_w % 4 == 1) { - output.y = input1.y; - } else if (out_w % 4 == 2) { - output.y = input1.z; - } else { - output.y = input1.w; - } - } else { - output.y = 0.0f; - } - - if (out_C - out_c * 4 >= 3) { - input2 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos2); - if (out_w % 4 == 0) { - output.z = input2.x; - } else if (out_w % 4 == 1) { - output.z = input2.y; - } else if (out_w % 4 == 2) { - output.z = input2.z; - } else { - output.z = input2.w; - } - } else { - output.z = 0.0f; - } - - if (out_C - out_c * 4 >= 4) { - input3 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos3); - if (out_w % 4 == 0) { - output.w = input3.x; - } else if (out_w % 4 == 1) { - output.w = input3.y; - } else if (out_w % 4 == 2) { - output.w = input3.z; - } else { - output.w = input3.w; - } - } else { - output.w = 0.0f; - } - - WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output); -} - -__kernel void transpose_4d_perm1023(__read_only image2d_t input_image, - __write_only image2d_t output_image, - __private const int out_C, - __private const int out_H, - __private const int out_W, - __private const int in_W, - __private const int in_H) { - const int out_c = get_global_id(0); - const int out_w = get_global_id(1); - const int out_nh = get_global_id(2); - const int out_n = out_nh / out_H; - const int out_h = out_nh % out_H; - - const int in_n0 = out_c * 4; - const int in_n1 = out_c * 4 + 1; - const int in_n2 = out_c * 4 + 2; - const int in_n3 = out_c * 4 + 3; - const int in_c = out_n / 4; - const int in_h = out_h; - const int in_w = out_w; - - int2 output_pos; - output_pos.x = out_c * out_W + out_w; - output_pos.y = out_nh; - - int2 input_pos0; - int2 input_pos1; - int2 input_pos2; - int2 input_pos3; - input_pos0.x = in_W * in_c + in_w; - input_pos0.y = in_n0 * in_H + in_h; - - input_pos1.x = in_W * in_c + in_w; - input_pos1.y = in_n1 * in_H + in_h; - - input_pos2.x = in_W * in_c + in_w; - input_pos2.y = in_n2 * in_H + in_h; - - input_pos3.x = in_W * in_c + in_w; - input_pos3.y = in_n3 * in_H + in_h; - - CL_DTYPE4 input0; - CL_DTYPE4 input1; - CL_DTYPE4 input2; - CL_DTYPE4 input3; - CL_DTYPE4 output; - input0 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos0); - if (out_n % 4 == 0) { - output.x = input0.x; - } else if (out_n % 4 == 1) { - output.x = input0.y; - } else if (out_n % 4 == 2) { - output.x = input0.z; - } else { - output.x = input0.w; - } - - if (out_C - out_c * 4 >= 2) { - input1 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos1); - if (out_n % 4 == 0) { - output.y = input1.x; - } else if (out_n % 4 == 1) { - output.y = input1.y; - } else if (out_n % 4 == 2) { - output.y = input1.z; - } else { - output.y = input1.w; - } - } else { - output.y = 0.0f; - } - - if (out_C - out_c * 4 >= 3) { - input2 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos2); - if (out_n % 4 == 0) { - output.z = input2.x; - } else if (out_n % 4 == 1) { - output.z = input2.y; - } else if (out_n % 4 == 2) { - output.z = input2.z; - } else { - output.z = input2.w; - } - } else { - output.z = 0.0f; - } - - if (out_C - out_c * 4 >= 4) { - input3 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos3); - if (out_n % 4 == 0) { - output.w = input3.x; - } else if (out_n % 4 == 1) { - output.w = input3.y; - } else if (out_n % 4 == 2) { - output.w = input3.z; - } else { - output.w = input3.w; - } - } else { - output.w = 0.0f; - } - - WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output); -} - -__kernel void transpose_4d_perm1032(__read_only image2d_t input_image, - __write_only image2d_t output_image, - __private const int out_C, - __private const int out_H, - __private const int out_W, - __private const int in_W, - __private const int in_H) { - const int out_c = get_global_id(0); - const int out_w = get_global_id(1); - const int out_nh = get_global_id(2); - const int out_n = out_nh / out_H; - const int out_h = out_nh % out_H; - - const int in_n0 = out_c * 4; - const int in_n1 = out_c * 4 + 1; - const int in_n2 = out_c * 4 + 2; - const int in_n3 = out_c * 4 + 3; - const int in_c = out_n / 4; - const int in_h = out_w; - const int in_w = out_h; - - int2 output_pos; - output_pos.x = out_c * out_W + out_w; - output_pos.y = out_nh; - - int2 input_pos0; - int2 input_pos1; - int2 input_pos2; - int2 input_pos3; - input_pos0.x = in_W * in_c + in_w; - input_pos0.y = in_n0 * in_H + in_h; - - input_pos1.x = in_W * in_c + in_w; - input_pos1.y = in_n1 * in_H + in_h; - - input_pos2.x = in_W * in_c + in_w; - input_pos2.y = in_n2 * in_H + in_h; - - input_pos3.x = in_W * in_c + in_w; - input_pos3.y = in_n3 * in_H + in_h; - - CL_DTYPE4 input0; - CL_DTYPE4 input1; - CL_DTYPE4 input2; - CL_DTYPE4 input3; - CL_DTYPE4 output; - input0 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos0); - if (out_n % 4 == 0) { - output.x = input0.x; - } else if (out_n % 4 == 1) { - output.x = input0.y; - } else if (out_n % 4 == 2) { - output.x = input0.z; - } else { - output.x = input0.w; - } - - if (out_C - out_c * 4 >= 2) { - input1 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos1); - if (out_n % 4 == 0) { - output.y = input1.x; - } else if (out_n % 4 == 1) { - output.y = input1.y; - } else if (out_n % 4 == 2) { - output.y = input1.z; - } else { - output.y = input1.w; - } - } else { - output.y = 0.0f; - } - - if (out_C - out_c * 4 >= 3) { - input2 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos2); - if (out_n % 4 == 0) { - output.z = input2.x; - } else if (out_n % 4 == 1) { - output.z = input2.y; - } else if (out_n % 4 == 2) { - output.z = input2.z; - } else { - output.z = input2.w; - } - } else { - output.z = 0.0f; - } - - if (out_C - out_c * 4 >= 4) { - input3 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos3); - if (out_n % 4 == 0) { - output.w = input3.x; - } else if (out_n % 4 == 1) { - output.w = input3.y; - } else if (out_n % 4 == 2) { - output.w = input3.z; - } else { - output.w = input3.w; - } - } else { - output.w = 0.0f; - } - - WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output); -} - -__kernel void transpose_4d_perm1203(__read_only image2d_t input_image, - __write_only image2d_t output_image, - __private const int out_C, - __private const int out_H, - __private const int out_W, - __private const int in_W, - __private const int in_H) { - const int out_c = get_global_id(0); - const int out_w = get_global_id(1); - const int out_nh = get_global_id(2); - const int out_n = out_nh / out_H; - const int out_h = out_nh % out_H; - - const int in_n = out_h; - const int in_c = out_n / 4; - const int in_h0 = out_c * 4; - const int in_h1 = out_c * 4 + 1; - const int in_h2 = out_c * 4 + 2; - const int in_h3 = out_c * 4 + 3; - const int in_w = out_w; - - int2 output_pos; - output_pos.x = out_c * out_W + out_w; - output_pos.y = out_nh; - - int2 input_pos0; - int2 input_pos1; - int2 input_pos2; - int2 input_pos3; - input_pos0.x = in_W * in_c + in_w; - input_pos0.y = in_n * in_H + in_h0; - - input_pos1.x = in_W * in_c + in_w; - input_pos1.y = in_n * in_H + in_h1; - - input_pos2.x = in_W * in_c + in_w; - input_pos2.y = in_n * in_H + in_h2; - - input_pos3.x = in_W * in_c + in_w; - input_pos3.y = in_n * in_H + in_h3; - - CL_DTYPE4 input0; - CL_DTYPE4 input1; - CL_DTYPE4 input2; - CL_DTYPE4 input3; - CL_DTYPE4 output; - input0 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos0); - if (out_n % 4 == 0) { - output.x = input0.x; - } else if (out_n % 4 == 1) { - output.x = input0.y; - } else if (out_n % 4 == 2) { - output.x = input0.z; - } else { - output.x = input0.w; - } - - if (out_C - out_c * 4 >= 2) { - input1 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos1); - if (out_n % 4 == 0) { - output.y = input1.x; - } else if (out_n % 4 == 1) { - output.y = input1.y; - } else if (out_n % 4 == 2) { - output.y = input1.z; - } else { - output.y = input1.w; - } - } else { - output.y = 0.0f; - } - - if (out_C - out_c * 4 >= 3) { - input2 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos2); - if (out_n % 4 == 0) { - output.z = input2.x; - } else if (out_n % 4 == 1) { - output.z = input2.y; - } else if (out_n % 4 == 2) { - output.z = input2.z; - } else { - output.z = input2.w; - } - } else { - output.z = 0.0f; - } - - if (out_C - out_c * 4 >= 4) { - input3 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos3); - if (out_n % 4 == 0) { - output.w = input3.x; - } else if (out_n % 4 == 1) { - output.w = input3.y; - } else if (out_n % 4 == 2) { - output.w = input3.z; - } else { - output.w = input3.w; - } - } else { - output.w = 0.0f; - } - - WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output); -} - -__kernel void transpose_4d_perm1230(__read_only image2d_t input_image, - __write_only image2d_t output_image, - __private const int out_C, - __private const int out_H, - __private const int out_W, - __private const int in_W, - __private const int in_H) { - const int out_c = get_global_id(0); - const int out_w = get_global_id(1); - const int out_nh = get_global_id(2); - const int out_n = out_nh / out_H; - const int out_h = out_nh % out_H; - - const int in_n = out_w; - const int in_c = out_n / 4; - const int in_h0 = out_c * 4; - const int in_h1 = out_c * 4 + 1; - const int in_h2 = out_c * 4 + 2; - const int in_h3 = out_c * 4 + 3; - const int in_w = out_h; - - int2 output_pos; - output_pos.x = out_c * out_W + out_w; - output_pos.y = out_nh; - - int2 input_pos0; - int2 input_pos1; - int2 input_pos2; - int2 input_pos3; - input_pos0.x = in_W * in_c + in_w; - input_pos0.y = in_n * in_H + in_h0; - - input_pos1.x = in_W * in_c + in_w; - input_pos1.y = in_n * in_H + in_h1; - - input_pos2.x = in_W * in_c + in_w; - input_pos2.y = in_n * in_H + in_h2; - - input_pos3.x = in_W * in_c + in_w; - input_pos3.y = in_n * in_H + in_h3; - - CL_DTYPE4 input0; - CL_DTYPE4 input1; - CL_DTYPE4 input2; - CL_DTYPE4 input3; - CL_DTYPE4 output; - input0 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos0); - if (out_n % 4 == 0) { - output.x = input0.x; - } else if (out_n % 4 == 1) { - output.x = input0.y; - } else if (out_n % 4 == 2) { - output.x = input0.z; - } else { - output.x = input0.w; - } - - if (out_C - out_c * 4 >= 2) { - input1 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos1); - if (out_n % 4 == 0) { - output.y = input1.x; - } else if (out_n % 4 == 1) { - output.y = input1.y; - } else if (out_n % 4 == 2) { - output.y = input1.z; - } else { - output.y = input1.w; - } - } else { - output.y = 0.0f; - } - - if (out_C - out_c * 4 >= 3) { - input2 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos2); - if (out_n % 4 == 0) { - output.z = input2.x; - } else if (out_n % 4 == 1) { - output.z = input2.y; - } else if (out_n % 4 == 2) { - output.z = input2.z; - } else { - output.z = input2.w; - } - } else { - output.z = 0.0f; - } - - if (out_C - out_c * 4 >= 4) { - input3 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos3); - if (out_n % 4 == 0) { - output.w = input3.x; - } else if (out_n % 4 == 1) { - output.w = input3.y; - } else if (out_n % 4 == 2) { - output.w = input3.z; - } else { - output.w = input3.w; - } - } else { - output.w = 0.0f; - } - - WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output); -} - -__kernel void transpose_4d_perm1302(__read_only image2d_t input_image, - __write_only image2d_t output_image, - __private const int out_C, - __private const int out_H, - __private const int out_W, - __private const int in_W, - __private const int in_H) { - const int out_c = get_global_id(0); - const int out_w = get_global_id(1); - const int out_nh = get_global_id(2); - const int out_n = out_nh / out_H; - const int out_h = out_nh % out_H; - - const int in_n = out_h; - const int in_c = out_n / 4; - const int in_h = out_w; - const int in_w0 = out_c * 4; - const int in_w1 = out_c * 4 + 1; - const int in_w2 = out_c * 4 + 2; - const int in_w3 = out_c * 4 + 3; - - int2 output_pos; - output_pos.x = out_c * out_W + out_w; - output_pos.y = out_nh; - - int2 input_pos0; - int2 input_pos1; - int2 input_pos2; - int2 input_pos3; - input_pos0.x = in_W * in_c + in_w0; - input_pos0.y = in_n * in_H + in_h; - - input_pos1.x = in_W * in_c + in_w1; - input_pos1.y = in_n * in_H + in_h; - - input_pos2.x = in_W * in_c + in_w2; - input_pos2.y = in_n * in_H + in_h; - - input_pos3.x = in_W * in_c + in_w3; - input_pos3.y = in_n * in_H + in_h; - - CL_DTYPE4 input0; - CL_DTYPE4 input1; - CL_DTYPE4 input2; - CL_DTYPE4 input3; - CL_DTYPE4 output; - input0 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos0); - if (out_n % 4 == 0) { - output.x = input0.x; - } else if (out_n % 4 == 1) { - output.x = input0.y; - } else if (out_n % 4 == 2) { - output.x = input0.z; - } else { - output.x = input0.w; - } - - if (out_C - out_c * 4 >= 2) { - input1 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos1); - if (out_n % 4 == 0) { - output.y = input1.x; - } else if (out_n % 4 == 1) { - output.y = input1.y; - } else if (out_n % 4 == 2) { - output.y = input1.z; - } else { - output.y = input1.w; - } - } else { - output.y = 0.0f; - } - - if (out_C - out_c * 4 >= 3) { - input2 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos2); - if (out_n % 4 == 0) { - output.z = input2.x; - } else if (out_n % 4 == 1) { - output.z = input2.y; - } else if (out_n % 4 == 2) { - output.z = input2.z; - } else { - output.z = input2.w; - } - } else { - output.z = 0.0f; - } - - if (out_C - out_c * 4 >= 4) { - input3 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos3); - if (out_n % 4 == 0) { - output.w = input3.x; - } else if (out_n % 4 == 1) { - output.w = input3.y; - } else if (out_n % 4 == 2) { - output.w = input3.z; - } else { - output.w = input3.w; - } - } else { - output.w = 0.0f; - } - - WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output); -} - -__kernel void transpose_4d_perm1320(__read_only image2d_t input_image, - __write_only image2d_t output_image, - __private const int out_C, - __private const int out_H, - __private const int out_W, - __private const int in_W, - __private const int in_H) { - const int out_c = get_global_id(0); - const int out_w = get_global_id(1); - const int out_nh = get_global_id(2); - const int out_n = out_nh / out_H; - const int out_h = out_nh % out_H; - - const int in_n = out_w; - const int in_c = out_n / 4; - const int in_h = out_h; - const int in_w0 = out_c * 4; - const int in_w1 = out_c * 4 + 1; - const int in_w2 = out_c * 4 + 2; - const int in_w3 = out_c * 4 + 3; - - int2 output_pos; - output_pos.x = out_c * out_W + out_w; - output_pos.y = out_nh; - - int2 input_pos0; - int2 input_pos1; - int2 input_pos2; - int2 input_pos3; - input_pos0.x = in_W * in_c + in_w0; - input_pos0.y = in_n * in_H + in_h; - - input_pos1.x = in_W * in_c + in_w1; - input_pos1.y = in_n * in_H + in_h; - - input_pos2.x = in_W * in_c + in_w2; - input_pos2.y = in_n * in_H + in_h; - - input_pos3.x = in_W * in_c + in_w3; - input_pos3.y = in_n * in_H + in_h; - - CL_DTYPE4 input0; - CL_DTYPE4 input1; - CL_DTYPE4 input2; - CL_DTYPE4 input3; - CL_DTYPE4 output; - input0 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos0); - if (out_n % 4 == 0) { - output.x = input0.x; - } else if (out_n % 4 == 1) { - output.x = input0.y; - } else if (out_n % 4 == 2) { - output.x = input0.z; - } else { - output.x = input0.w; - } - - if (out_C - out_c * 4 >= 2) { - input1 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos1); - if (out_n % 4 == 0) { - output.y = input1.x; - } else if (out_n % 4 == 1) { - output.y = input1.y; - } else if (out_n % 4 == 2) { - output.y = input1.z; - } else { - output.y = input1.w; - } - } else { - output.y = 0.0f; - } - - if (out_C - out_c * 4 >= 3) { - input2 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos2); - if (out_n % 4 == 0) { - output.z = input2.x; - } else if (out_n % 4 == 1) { - output.z = input2.y; - } else if (out_n % 4 == 2) { - output.z = input2.z; - } else { - output.z = input2.w; - } - } else { - output.z = 0.0f; - } - - if (out_C - out_c * 4 >= 4) { - input3 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos3); - if (out_n % 4 == 0) { - output.w = input3.x; - } else if (out_n % 4 == 1) { - output.w = input3.y; - } else if (out_n % 4 == 2) { - output.w = input3.z; - } else { - output.w = input3.w; - } - } else { - output.w = 0.0f; - } - - WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output); -} - -__kernel void transpose_4d_perm2013(__read_only image2d_t input_image, - __write_only image2d_t output_image, - __private const int out_C, - __private const int out_H, - __private const int out_W, - __private const int in_W, - __private const int in_H) { - const int out_c = get_global_id(0); - const int out_w = get_global_id(1); - const int out_nh = get_global_id(2); - const int out_n = out_nh / out_H; - const int out_h = out_nh % out_H; - - const int in_n0 = out_c * 4; - const int in_n1 = out_c * 4 + 1; - const int in_n2 = out_c * 4 + 2; - const int in_n3 = out_c * 4 + 3; - const int in_c = out_h / 4; - const int in_h = out_n; - const int in_w = out_w; - - int2 output_pos; - output_pos.x = out_c * out_W + out_w; - output_pos.y = out_nh; - - int2 input_pos0; - int2 input_pos1; - int2 input_pos2; - int2 input_pos3; - input_pos0.x = in_W * in_c + in_w; - input_pos0.y = in_n0 * in_H + in_h; - - input_pos1.x = in_W * in_c + in_w; - input_pos1.y = in_n1 * in_H + in_h; - - input_pos2.x = in_W * in_c + in_w; - input_pos2.y = in_n2 * in_H + in_h; - - input_pos3.x = in_W * in_c + in_w; - input_pos3.y = in_n3 * in_H + in_h; - - CL_DTYPE4 input0; - CL_DTYPE4 input1; - CL_DTYPE4 input2; - CL_DTYPE4 input3; - CL_DTYPE4 output; - input0 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos0); - if (out_h % 4 == 0) { - output.x = input0.x; - } else if (out_h % 4 == 1) { - output.x = input0.y; - } else if (out_h % 4 == 2) { - output.x = input0.z; - } else { - output.x = input0.w; - } - - if (out_C - out_c * 4 >= 2) { - input1 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos1); - if (out_h % 4 == 0) { - output.y = input1.x; - } else if (out_h % 4 == 1) { - output.y = input1.y; - } else if (out_h % 4 == 2) { - output.y = input1.z; - } else { - output.y = input1.w; - } - } else { - output.y = 0.0f; - } - - if (out_C - out_c * 4 >= 3) { - input2 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos2); - if (out_h % 4 == 0) { - output.z = input2.x; - } else if (out_h % 4 == 1) { - output.z = input2.y; - } else if (out_h % 4 == 2) { - output.z = input2.z; - } else { - output.z = input2.w; - } - } else { - output.z = 0.0f; - } - - if (out_C - out_c * 4 >= 4) { - input3 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos3); - if (out_h % 4 == 0) { - output.w = input3.x; - } else if (out_h % 4 == 1) { - output.w = input3.y; - } else if (out_h % 4 == 2) { - output.w = input3.z; - } else { - output.w = input3.w; - } - } else { - output.w = 0.0f; - } - - WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output); -} - -__kernel void transpose_4d_perm2031(__read_only image2d_t input_image, - __write_only image2d_t output_image, - __private const int out_C, - __private const int out_H, - __private const int out_W, - __private const int in_W, - __private const int in_H) { - const int out_c = get_global_id(0); - const int out_w = get_global_id(1); - const int out_nh = get_global_id(2); - const int out_n = out_nh / out_H; - const int out_h = out_nh % out_H; - - const int in_n0 = out_c * 4; - const int in_n1 = out_c * 4 + 1; - const int in_n2 = out_c * 4 + 2; - const int in_n3 = out_c * 4 + 3; - const int in_c = out_w / 4; - const int in_h = out_n; - const int in_w = out_h; - - int2 output_pos; - output_pos.x = out_c * out_W + out_w; - output_pos.y = out_nh; - - int2 input_pos0; - int2 input_pos1; - int2 input_pos2; - int2 input_pos3; - input_pos0.x = in_W * in_c + in_w; - input_pos0.y = in_n0 * in_H + in_h; - - input_pos1.x = in_W * in_c + in_w; - input_pos1.y = in_n1 * in_H + in_h; - - input_pos2.x = in_W * in_c + in_w; - input_pos2.y = in_n2 * in_H + in_h; - - input_pos3.x = in_W * in_c + in_w; - input_pos3.y = in_n3 * in_H + in_h; - - CL_DTYPE4 input0; - CL_DTYPE4 input1; - CL_DTYPE4 input2; - CL_DTYPE4 input3; - CL_DTYPE4 output; - input0 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos0); - if (out_w % 4 == 0) { - output.x = input0.x; - } else if (out_w % 4 == 1) { - output.x = input0.y; - } else if (out_w % 4 == 2) { - output.x = input0.z; - } else { - output.x = input0.w; - } - - if (out_C - out_c * 4 >= 2) { - input1 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos1); - if (out_w % 4 == 0) { - output.y = input1.x; - } else if (out_w % 4 == 1) { - output.y = input1.y; - } else if (out_w % 4 == 2) { - output.y = input1.z; - } else { - output.y = input1.w; - } - } else { - output.y = 0.0f; - } - - if (out_C - out_c * 4 >= 3) { - input2 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos2); - if (out_w % 4 == 0) { - output.z = input2.x; - } else if (out_w % 4 == 1) { - output.z = input2.y; - } else if (out_w % 4 == 2) { - output.z = input2.z; - } else { - output.z = input2.w; - } - } else { - output.z = 0.0f; - } - - if (out_C - out_c * 4 >= 4) { - input3 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos3); - if (out_w % 4 == 0) { - output.w = input3.x; - } else if (out_w % 4 == 1) { - output.w = input3.y; - } else if (out_w % 4 == 2) { - output.w = input3.z; - } else { - output.w = input3.w; - } - } else { - output.w = 0.0f; - } - - WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output); -} - -__kernel void transpose_4d_perm2103(__read_only image2d_t input_image, - __write_only image2d_t output_image, - __private const int out_C, - __private const int out_H, - __private const int out_W, - __private const int in_W, - __private const int in_H) { - const int out_c = get_global_id(0); - const int out_w = get_global_id(1); - const int out_nh = get_global_id(2); - const int out_n = out_nh / out_H; - const int out_h = out_nh % out_H; - - const int in_n = out_h; - const int in_c = out_c; - const int in_h = out_n; - const int in_w = out_w; - - int2 output_pos; - output_pos.x = out_c * out_W + out_w; - output_pos.y = out_nh; - - int2 input_pos0; - input_pos0.x = in_W * in_c + in_w; - input_pos0.y = in_n * in_H + in_h; - - CL_DTYPE4 input0; - CL_DTYPE4 output; - input0 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos0); - - output = input0; - WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output); -} - -__kernel void transpose_4d_perm2130(__read_only image2d_t input_image, - __write_only image2d_t output_image, - __private const int out_C, - __private const int out_H, - __private const int out_W, - __private const int in_W, - __private const int in_H) { - const int out_c = get_global_id(0); - const int out_w = get_global_id(1); - const int out_nh = get_global_id(2); - const int out_n = out_nh / out_H; - const int out_h = out_nh % out_H; - - const int in_n = out_w; - const int in_c = out_c; - const int in_h = out_n; - const int in_w = out_h; - - int2 output_pos; - output_pos.x = out_c * out_W + out_w; - output_pos.y = out_nh; - - int2 input_pos0; - input_pos0.x = in_W * in_c + in_w; - input_pos0.y = in_n * in_H + in_h; - - CL_DTYPE4 input0; - CL_DTYPE4 output; - input0 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos0); - - output = input0; - WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output); -} - -__kernel void transpose_4d_perm2301(__read_only image2d_t input_image, - __write_only image2d_t output_image, - __private const int out_C, - __private const int out_H, - __private const int out_W, - __private const int in_W, - __private const int in_H) { - const int out_c = get_global_id(0); - const int out_w = get_global_id(1); - const int out_nh = get_global_id(2); - const int out_n = out_nh / out_H; - const int out_h = out_nh % out_H; - - const int in_n = out_h; - const int in_c = out_w / 4; - const int in_h = out_n; - const int in_w0 = out_c * 4; - const int in_w1 = out_c * 4 + 1; - const int in_w2 = out_c * 4 + 2; - const int in_w3 = out_c * 4 + 3; - - int2 output_pos; - output_pos.x = out_c * out_W + out_w; - output_pos.y = out_nh; - - int2 input_pos0; - int2 input_pos1; - int2 input_pos2; - int2 input_pos3; - input_pos0.x = in_W * in_c + in_w0; - input_pos0.y = in_n * in_H + in_h; - - input_pos1.x = in_W * in_c + in_w1; - input_pos1.y = in_n * in_H + in_h; - - input_pos2.x = in_W * in_c + in_w2; - input_pos2.y = in_n * in_H + in_h; - - input_pos3.x = in_W * in_c + in_w3; - input_pos3.y = in_n * in_H + in_h; - - CL_DTYPE4 input0; - CL_DTYPE4 input1; - CL_DTYPE4 input2; - CL_DTYPE4 input3; - CL_DTYPE4 output; - input0 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos0); - if (out_w % 4 == 0) { - output.x = input0.x; - } else if (out_w % 4 == 1) { - output.x = input0.y; - } else if (out_w % 4 == 2) { - output.x = input0.z; - } else { - output.x = input0.w; - } - - if (out_C - out_c * 4 >= 2) { - input1 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos1); - if (out_w % 4 == 0) { - output.y = input1.x; - } else if (out_w % 4 == 1) { - output.y = input1.y; - } else if (out_w % 4 == 2) { - output.y = input1.z; - } else { - output.y = input1.w; - } - } else { - output.y = 0.0f; - } - - if (out_C - out_c * 4 >= 3) { - input2 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos2); - if (out_w % 4 == 0) { - output.z = input2.x; - } else if (out_w % 4 == 1) { - output.z = input2.y; - } else if (out_w % 4 == 2) { - output.z = input2.z; - } else { - output.z = input2.w; - } - } else { - output.z = 0.0f; - } - - if (out_C - out_c * 4 >= 4) { - input3 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos3); - if (out_w % 4 == 0) { - output.w = input3.x; - } else if (out_w % 4 == 1) { - output.w = input3.y; - } else if (out_w % 4 == 2) { - output.w = input3.z; - } else { - output.w = input3.w; - } - } else { - output.w = 0.0f; - } - - WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output); -} - -__kernel void transpose_4d_perm2310(__read_only image2d_t input_image, - __write_only image2d_t output_image, - __private const int out_C, - __private const int out_H, - __private const int out_W, - __private const int in_W, - __private const int in_H) { - const int out_c = get_global_id(0); - const int out_w = get_global_id(1); - const int out_nh = get_global_id(2); - const int out_n = out_nh / out_H; - const int out_h = out_nh % out_H; - - const int in_n = out_w; - const int in_c = out_h / 4; - const int in_h = out_n; - const int in_w0 = out_c * 4; - const int in_w1 = out_c * 4 + 1; - const int in_w2 = out_c * 4 + 2; - const int in_w3 = out_c * 4 + 3; - - int2 output_pos; - output_pos.x = out_c * out_W + out_w; - output_pos.y = out_nh; - - int2 input_pos0; - int2 input_pos1; - int2 input_pos2; - int2 input_pos3; - input_pos0.x = in_W * in_c + in_w0; - input_pos0.y = in_n * in_H + in_h; - - input_pos1.x = in_W * in_c + in_w1; - input_pos1.y = in_n * in_H + in_h; - - input_pos2.x = in_W * in_c + in_w2; - input_pos2.y = in_n * in_H + in_h; - - input_pos3.x = in_W * in_c + in_w3; - input_pos3.y = in_n * in_H + in_h; - - CL_DTYPE4 input0; - CL_DTYPE4 input1; - CL_DTYPE4 input2; - CL_DTYPE4 input3; - CL_DTYPE4 output; - input0 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos0); - if (out_h % 4 == 0) { - output.x = input0.x; - } else if (out_h % 4 == 1) { - output.x = input0.y; - } else if (out_h % 4 == 2) { - output.x = input0.z; - } else { - output.x = input0.w; - } - - if (out_C - out_c * 4 >= 2) { - input1 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos1); - if (out_h % 4 == 0) { - output.y = input1.x; - } else if (out_h % 4 == 1) { - output.y = input1.y; - } else if (out_h % 4 == 2) { - output.y = input1.z; - } else { - output.y = input1.w; - } - } else { - output.y = 0.0f; - } - - if (out_C - out_c * 4 >= 3) { - input2 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos2); - if (out_h % 4 == 0) { - output.z = input2.x; - } else if (out_h % 4 == 1) { - output.z = input2.y; - } else if (out_h % 4 == 2) { - output.z = input2.z; - } else { - output.z = input2.w; - } - } else { - output.z = 0.0f; - } - - if (out_C - out_c * 4 >= 4) { - input3 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos3); - if (out_h % 4 == 0) { - output.w = input3.x; - } else if (out_h % 4 == 1) { - output.w = input3.y; - } else if (out_h % 4 == 2) { - output.w = input3.z; - } else { - output.w = input3.w; - } - } else { - output.w = 0.0f; - } - - WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output); -} - -__kernel void transpose_4d_perm3012(__read_only image2d_t input_image, - __write_only image2d_t output_image, - __private const int out_C, - __private const int out_H, - __private const int out_W, - __private const int in_W, - __private const int in_H) { - const int out_c = get_global_id(0); - const int out_w = get_global_id(1); - const int out_nh = get_global_id(2); - const int out_n = out_nh / out_H; - const int out_h = out_nh % out_H; - - const int in_n0 = out_c * 4; - const int in_n1 = out_c * 4 + 1; - const int in_n2 = out_c * 4 + 2; - const int in_n3 = out_c * 4 + 3; - const int in_c = out_h / 4; - const int in_h = out_w; - const int in_w = out_n; - - int2 output_pos; - output_pos.x = out_c * out_W + out_w; - output_pos.y = out_nh; - - int2 input_pos0; - int2 input_pos1; - int2 input_pos2; - int2 input_pos3; - input_pos0.x = in_W * in_c + in_w; - input_pos0.y = in_n0 * in_H + in_h; - - input_pos1.x = in_W * in_c + in_w; - input_pos1.y = in_n1 * in_H + in_h; - - input_pos2.x = in_W * in_c + in_w; - input_pos2.y = in_n2 * in_H + in_h; - - input_pos3.x = in_W * in_c + in_w; - input_pos3.y = in_n3 * in_H + in_h; - - CL_DTYPE4 input0; - CL_DTYPE4 input1; - CL_DTYPE4 input2; - CL_DTYPE4 input3; - CL_DTYPE4 output; - input0 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos0); - if (out_h % 4 == 0) { - output.x = input0.x; - } else if (out_h % 4 == 1) { - output.x = input0.y; - } else if (out_h % 4 == 2) { - output.x = input0.z; - } else { - output.x = input0.w; - } - - if (out_C - out_c * 4 >= 2) { - input1 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos1); - if (out_h % 4 == 0) { - output.y = input1.x; - } else if (out_h % 4 == 1) { - output.y = input1.y; - } else if (out_h % 4 == 2) { - output.y = input1.z; - } else { - output.y = input1.w; - } - } else { - output.y = 0.0f; - } - - if (out_C - out_c * 4 >= 3) { - input2 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos2); - if (out_h % 4 == 0) { - output.z = input2.x; - } else if (out_h % 4 == 1) { - output.z = input2.y; - } else if (out_h % 4 == 2) { - output.z = input2.z; - } else { - output.z = input2.w; - } - } else { - output.z = 0.0f; - } - - if (out_C - out_c * 4 >= 4) { - input3 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos3); - if (out_h % 4 == 0) { - output.w = input3.x; - } else if (out_h % 4 == 1) { - output.w = input3.y; - } else if (out_h % 4 == 2) { - output.w = input3.z; - } else { - output.w = input3.w; - } - } else { - output.w = 0.0f; - } - - WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output); -} - -__kernel void transpose_4d_perm3021(__read_only image2d_t input_image, - __write_only image2d_t output_image, - __private const int out_C, - __private const int out_H, - __private const int out_W, - __private const int in_W, - __private const int in_H) { - const int out_c = get_global_id(0); - const int out_w = get_global_id(1); - const int out_nh = get_global_id(2); - const int out_n = out_nh / out_H; - const int out_h = out_nh % out_H; - - const int in_n0 = out_c * 4; - const int in_n1 = out_c * 4 + 1; - const int in_n2 = out_c * 4 + 2; - const int in_n3 = out_c * 4 + 3; - const int in_c = out_w / 4; - const int in_h = out_h; - const int in_w = out_n; - - int2 output_pos; - output_pos.x = out_c * out_W + out_w; - output_pos.y = out_nh; - - int2 input_pos0; - int2 input_pos1; - int2 input_pos2; - int2 input_pos3; - input_pos0.x = in_W * in_c + in_w; - input_pos0.y = in_n0 * in_H + in_h; - - input_pos1.x = in_W * in_c + in_w; - input_pos1.y = in_n1 * in_H + in_h; - - input_pos2.x = in_W * in_c + in_w; - input_pos2.y = in_n2 * in_H + in_h; - - input_pos3.x = in_W * in_c + in_w; - input_pos3.y = in_n3 * in_H + in_h; - - CL_DTYPE4 input0; - CL_DTYPE4 input1; - CL_DTYPE4 input2; - CL_DTYPE4 input3; - CL_DTYPE4 output; - input0 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos0); - if (out_w % 4 == 0) { - output.x = input0.x; - } else if (out_w % 4 == 1) { - output.x = input0.y; - } else if (out_w % 4 == 2) { - output.x = input0.z; - } else { - output.x = input0.w; - } - - if (out_C - out_c * 4 >= 2) { - input1 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos1); - if (out_w % 4 == 0) { - output.y = input1.x; - } else if (out_w % 4 == 1) { - output.y = input1.y; - } else if (out_w % 4 == 2) { - output.y = input1.z; - } else { - output.y = input1.w; - } - } else { - output.y = 0.0f; - } - - if (out_C - out_c * 4 >= 3) { - input2 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos2); - if (out_w % 4 == 0) { - output.z = input2.x; - } else if (out_w % 4 == 1) { - output.z = input2.y; - } else if (out_w % 4 == 2) { - output.z = input2.z; - } else { - output.z = input2.w; - } - } else { - output.z = 0.0f; - } - - if (out_C - out_c * 4 >= 4) { - input3 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos3); - if (out_w % 4 == 0) { - output.w = input3.x; - } else if (out_w % 4 == 1) { - output.w = input3.y; - } else if (out_w % 4 == 2) { - output.w = input3.z; - } else { - output.w = input3.w; - } - } else { - output.w = 0.0f; - } - - WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output); -} - -__kernel void transpose_4d_perm3102(__read_only image2d_t input_image, - __write_only image2d_t output_image, - __private const int out_C, - __private const int out_H, - __private const int out_W, - __private const int in_W, - __private const int in_H) { - const int out_c = get_global_id(0); - const int out_w = get_global_id(1); - const int out_nh = get_global_id(2); - const int out_n = out_nh / out_H; - const int out_h = out_nh % out_H; - - const int in_n = out_h; - const int in_c = out_c; - const int in_h = out_w; - const int in_w = out_n; - - int2 output_pos; - output_pos.x = out_c * out_W + out_w; - output_pos.y = out_nh; - - int2 input_pos0; - input_pos0.x = in_W * in_c + in_w; - input_pos0.y = in_n * in_H + in_h; - - CL_DTYPE4 input0; - CL_DTYPE4 output; - input0 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos0); - - output = input0; - WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output); -} - -__kernel void transpose_4d_perm3120(__read_only image2d_t input_image, - __write_only image2d_t output_image, - __private const int out_C, - __private const int out_H, - __private const int out_W, - __private const int in_W, - __private const int in_H) { - const int out_c = get_global_id(0); - const int out_w = get_global_id(1); - const int out_nh = get_global_id(2); - const int out_n = out_nh / out_H; - const int out_h = out_nh % out_H; - - const int in_n = out_w; - const int in_c = out_c; - const int in_h = out_h; - const int in_w = out_n; - - int2 output_pos; - output_pos.x = out_c * out_W + out_w; - output_pos.y = out_nh; - - int2 input_pos0; - input_pos0.x = in_W * in_c + in_w; - input_pos0.y = in_n * in_H + in_h; - - CL_DTYPE4 input0; - CL_DTYPE4 output; - input0 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos0); - - output = input0; - WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output); -} - -__kernel void transpose_4d_perm3201(__read_only image2d_t input_image, - __write_only image2d_t output_image, - __private const int out_C, - __private const int out_H, - __private const int out_W, - __private const int in_W, - __private const int in_H) { - const int out_c = get_global_id(0); - const int out_w = get_global_id(1); - const int out_nh = get_global_id(2); - const int out_n = out_nh / out_H; - const int out_h = out_nh % out_H; - - const int in_n = out_h; - const int in_c = out_w / 4; - const int in_h0 = out_c * 4; - const int in_h1 = out_c * 4 + 1; - const int in_h2 = out_c * 4 + 2; - const int in_h3 = out_c * 4 + 3; - const int in_w = out_n; - - int2 output_pos; - output_pos.x = out_c * out_W + out_w; - output_pos.y = out_nh; - - int2 input_pos0; - int2 input_pos1; - int2 input_pos2; - int2 input_pos3; - input_pos0.x = in_W * in_c + in_w; - input_pos0.y = in_n * in_H + in_h0; - - input_pos1.x = in_W * in_c + in_w; - input_pos1.y = in_n * in_H + in_h1; - - input_pos2.x = in_W * in_c + in_w; - input_pos2.y = in_n * in_H + in_h2; - - input_pos3.x = in_W * in_c + in_w; - input_pos3.y = in_n * in_H + in_h3; - - CL_DTYPE4 input0; - CL_DTYPE4 input1; - CL_DTYPE4 input2; - CL_DTYPE4 input3; - CL_DTYPE4 output; - input0 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos0); - if (out_w % 4 == 0) { - output.x = input0.x; - } else if (out_w % 4 == 1) { - output.x = input0.y; - } else if (out_w % 4 == 2) { - output.x = input0.z; - } else { - output.x = input0.w; - } - - if (out_C - out_c * 4 >= 2) { - input1 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos1); - if (out_w % 4 == 0) { - output.y = input1.x; - } else if (out_w % 4 == 1) { - output.y = input1.y; - } else if (out_w % 4 == 2) { - output.y = input1.z; - } else { - output.y = input1.w; - } - } else { - output.y = 0.0f; - } - - if (out_C - out_c * 4 >= 3) { - input2 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos2); - if (out_w % 4 == 0) { - output.z = input2.x; - } else if (out_w % 4 == 1) { - output.z = input2.y; - } else if (out_w % 4 == 2) { - output.z = input2.z; - } else { - output.z = input2.w; - } - } else { - output.z = 0.0f; - } - - if (out_C - out_c * 4 >= 4) { - input3 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos3); - if (out_w % 4 == 0) { - output.w = input3.x; - } else if (out_w % 4 == 1) { - output.w = input3.y; - } else if (out_w % 4 == 2) { - output.w = input3.z; - } else { - output.w = input3.w; - } - } else { - output.w = 0.0f; - } - - WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output); -} - -__kernel void transpose_4d_perm3210(__read_only image2d_t input_image, - __write_only image2d_t output_image, - __private const int out_C, - __private const int out_H, - __private const int out_W, - __private const int in_W, - __private const int in_H) { - const int out_c = get_global_id(0); - const int out_w = get_global_id(1); - const int out_nh = get_global_id(2); - const int out_n = out_nh / out_H; - const int out_h = out_nh % out_H; - - const int in_n = out_w; - const int in_c = out_h / 4; - const int in_h0 = out_c * 4; - const int in_h1 = out_c * 4 + 1; - const int in_h2 = out_c * 4 + 2; - const int in_h3 = out_c * 4 + 3; - const int in_w = out_n; - - int2 output_pos; - output_pos.x = out_c * out_W + out_w; - output_pos.y = out_nh; - - int2 input_pos0; - int2 input_pos1; - int2 input_pos2; - int2 input_pos3; - input_pos0.x = in_W * in_c + in_w; - input_pos0.y = in_n * in_H + in_h0; - - input_pos1.x = in_W * in_c + in_w; - input_pos1.y = in_n * in_H + in_h1; - - input_pos2.x = in_W * in_c + in_w; - input_pos2.y = in_n * in_H + in_h2; - - input_pos3.x = in_W * in_c + in_w; - input_pos3.y = in_n * in_H + in_h3; - - CL_DTYPE4 input0; - CL_DTYPE4 input1; - CL_DTYPE4 input2; - CL_DTYPE4 input3; - CL_DTYPE4 output; - input0 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos0); - if (out_h % 4 == 0) { - output.x = input0.x; - } else if (out_h % 4 == 1) { - output.x = input0.y; - } else if (out_h % 4 == 2) { - output.x = input0.z; - } else { - output.x = input0.w; - } - - if (out_C - out_c * 4 >= 2) { - input1 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos1); - if (out_h % 4 == 0) { - output.y = input1.x; - } else if (out_h % 4 == 1) { - output.y = input1.y; - } else if (out_h % 4 == 2) { - output.y = input1.z; - } else { - output.y = input1.w; - } - } else { - output.y = 0.0f; - } - - if (out_C - out_c * 4 >= 3) { - input2 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos2); - if (out_h % 4 == 0) { - output.z = input2.x; - } else if (out_h % 4 == 1) { - output.z = input2.y; - } else if (out_h % 4 == 2) { - output.z = input2.z; - } else { - output.z = input2.w; - } - } else { - output.z = 0.0f; - } - - if (out_C - out_c * 4 >= 4) { - input3 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos3); - if (out_h % 4 == 0) { - output.w = input3.x; - } else if (out_h % 4 == 1) { - output.w = input3.y; - } else if (out_h % 4 == 2) { - output.w = input3.z; - } else { - output.w = input3.w; - } - } else { - output.w = 0.0f; - } - - WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output); -} - -__kernel void transpose_2d(__read_only image2d_t input_image, - __write_only image2d_t output_image, - __private const int out_C, - __private const int out_H, - __private const int out_W, - __private const int in_W, - __private const int in_H) { - const int out_c = get_global_id(0); - const int out_w = get_global_id(1); - const int out_nh = get_global_id(2); - const int out_n = 1; - const int out_h = out_nh % out_H; - - const int in_n = 1; - const int in_c = out_c; - const int in_w = out_h; - const int in_h = out_w; - - int2 input_pos; - int2 output_pos; - input_pos.x = in_c * in_W + in_w; - input_pos.y = in_n * in_h; - - output_pos.x = out_c * out_W + out_w; - output_pos.y = out_n * out_h; - - CL_DTYPE4 input; - CL_DTYPE4 output; - input = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, SAMPLER, input_pos); - - output = input; - WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, input); -} diff --git a/lite/kernels/opencl/transpose_image_compute.cc b/lite/kernels/opencl/transpose_image_compute.cc index af4d159a52c..bf548ffa292 100644 --- a/lite/kernels/opencl/transpose_image_compute.cc +++ b/lite/kernels/opencl/transpose_image_compute.cc @@ -50,6 +50,7 @@ class TransposeComputeFloatImage output_image_h_ = output_image_shape.at("height"); output_image_w_ = output_image_shape.at("width"); VLOG(4) << "x_tensor_dims_: " << x_tensor_dims_; + if (axis_.size() == 3) { VLOG(4) << "Extend CHW to 1CHW"; axis_.insert(axis_.begin(), 0); // extend batch dim is 1 @@ -58,58 +59,31 @@ class TransposeComputeFloatImage } } if (axis_.size() == 4) { - if (axis_ == std::vector({0, 1, 3, 2})) { - kernel_func_name_ = "transpose_4d_perm0132"; - } else if (axis_ == std::vector({0, 2, 1, 3})) { - kernel_func_name_ = "transpose_4d_perm0213"; - } else if (axis_ == std::vector({0, 2, 3, 1})) { // for NHWC - kernel_func_name_ = "transpose_4d_perm0231"; - } else if (axis_ == std::vector({0, 3, 1, 2})) { - kernel_func_name_ = "transpose_4d_perm0312"; - } else if (axis_ == std::vector({0, 3, 2, 1})) { - kernel_func_name_ = "transpose_4d_perm0321"; - } else if (axis_ == std::vector({1, 0, 2, 3})) { - kernel_func_name_ = "transpose_4d_perm1023"; - } else if (axis_ == std::vector({1, 0, 3, 2})) { - kernel_func_name_ = "transpose_4d_perm1032"; - } else if (axis_ == std::vector({1, 2, 0, 3})) { - kernel_func_name_ = "transpose_4d_perm1203"; - } else if (axis_ == std::vector({1, 2, 3, 0})) { - kernel_func_name_ = "transpose_4d_perm1230"; - } else if (axis_ == std::vector({1, 3, 0, 2})) { - kernel_func_name_ = "transpose_4d_perm1302"; - } else if (axis_ == std::vector({1, 3, 2, 0})) { - kernel_func_name_ = "transpose_4d_perm1320"; - } else if (axis_ == std::vector({2, 0, 1, 3})) { - kernel_func_name_ = "transpose_4d_perm2013"; - } else if (axis_ == std::vector({2, 0, 3, 1})) { - kernel_func_name_ = "transpose_4d_perm2031"; - } else if (axis_ == std::vector({2, 1, 0, 3})) { - kernel_func_name_ = "transpose_4d_perm2103"; - } else if (axis_ == std::vector({2, 1, 3, 0})) { - kernel_func_name_ = "transpose_4d_perm2130"; - } else if (axis_ == std::vector({2, 3, 0, 1})) { - kernel_func_name_ = "transpose_4d_perm2301"; - } else if (axis_ == std::vector({2, 3, 1, 0})) { - kernel_func_name_ = "transpose_4d_perm2310"; - } else if (axis_ == std::vector({3, 0, 1, 2})) { - kernel_func_name_ = "transpose_4d_perm3012"; - } else if (axis_ == std::vector({3, 0, 2, 1})) { - kernel_func_name_ = "transpose_4d_perm3021"; - } else if (axis_ == std::vector({3, 1, 0, 2})) { - kernel_func_name_ = "transpose_4d_perm3102"; - } else if (axis_ == std::vector({3, 1, 2, 0})) { - kernel_func_name_ = "transpose_4d_perm3120"; - } else if (axis_ == std::vector({3, 2, 0, 1})) { - kernel_func_name_ = "transpose_4d_perm3201"; - } else if (axis_ == std::vector({3, 2, 1, 0})) { - kernel_func_name_ = "transpose_4d_perm3210"; + std::vector tmp = axis_; + sort(tmp.begin(), tmp.end()); + if (tmp == std::vector({0, 1, 2, 3}) && + axis_ != std::vector({0, 1, 2, 3})) { + kernel_func_name_ = "transpose_4d_perm"; + for (int i = 0; i < axis_.size(); ++i) { + kernel_func_name_ += to_string(axis_[i]); + } + kernel_path = + "image/transpose_fixb" + to_string(axis_[0] + 1) + "_kernel.cl"; } else { LOG(FATAL) << "Unsupported axis permutation for current lite OpenCL " "kernel! "; } } else if (axis_.size() == 2) { - kernel_func_name_ = "transpose_2d"; + std::vector tmp = axis_; + sort(tmp.begin(), tmp.end()); + if (tmp == std::vector({0, 1}) && + axis_ != std::vector({0, 1})) { + kernel_func_name_ = "transpose_2d"; + kernel_path_ = "image/transpose_fixb1_kernel.cl"; + } else { + LOG(FATAL) << "Unsupported axis permutation for current lite OpenCL " + "kernel! "; + } } else { LOG(FATAL) << "Unsupported axis permutation for current lite OpenCL " "kernel! "; @@ -136,10 +110,8 @@ class TransposeComputeFloatImage auto& context = ctx_->As(); VLOG(1) << "kernel_func_name_:" << kernel_func_name_; - context.cl_context()->AddKernel(kernel_func_name_, - "image/transpose_kernel.cl", - build_options_, - time_stamp_); + context.cl_context()->AddKernel( + kernel_func_name_, kernel_path_, build_options_, time_stamp_); STL::stringstream kernel_key; kernel_key << kernel_func_name_ << build_options_ << time_stamp_; kernel_ = context.cl_context()->GetKernel(kernel_key.str()); @@ -154,42 +126,14 @@ class TransposeComputeFloatImage #endif void GetGlobalWorkSize() { - if (kernel_func_name_ == "transpose_4d_perm0132" || - kernel_func_name_ == "transpose_4d_perm0213" || - kernel_func_name_ == "transpose_4d_perm0231" || - kernel_func_name_ == "transpose_4d_perm0312" || - kernel_func_name_ == "transpose_4d_perm0321" || - kernel_func_name_ == "transpose_4d_perm1023" || - kernel_func_name_ == "transpose_4d_perm1032" || - kernel_func_name_ == "transpose_4d_perm1203" || - kernel_func_name_ == "transpose_4d_perm1230" || - kernel_func_name_ == "transpose_4d_perm1302" || - kernel_func_name_ == "transpose_4d_perm1320" || - kernel_func_name_ == "transpose_4d_perm2013" || - kernel_func_name_ == "transpose_4d_perm2031" || - kernel_func_name_ == "transpose_4d_perm2103" || - kernel_func_name_ == "transpose_4d_perm2130" || - kernel_func_name_ == "transpose_4d_perm2301" || - kernel_func_name_ == "transpose_4d_perm2310" || - kernel_func_name_ == "transpose_4d_perm3012" || - kernel_func_name_ == "transpose_4d_perm3021" || - kernel_func_name_ == "transpose_4d_perm3102" || - kernel_func_name_ == "transpose_4d_perm3120" || - kernel_func_name_ == "transpose_4d_perm3201" || - kernel_func_name_ == "transpose_4d_perm3210" || - kernel_func_name_ == "transpose_2d") { - const std::vector& ws = - DefaultGlobalWorkSize(output_tensor_dims_, - DDim(std::vector{ - static_cast(output_image_w_), - static_cast(output_image_h_)})); - global_work_size_ = cl::NDRange{static_cast(ws[0]), - static_cast(ws[1]), - static_cast(ws[2])}; - } else { - LOG(FATAL) << "Unsupported get global work size for kernel function: " - << kernel_func_name_; - } + const std::vector& ws = + DefaultGlobalWorkSize(output_tensor_dims_, + DDim(std::vector{ + static_cast(output_image_w_), + static_cast(output_image_h_)})); + global_work_size_ = cl::NDRange{static_cast(ws[0]), + static_cast(ws[1]), + static_cast(ws[2])}; } void Run() override { @@ -200,62 +144,35 @@ class TransposeComputeFloatImage auto& context = ctx_->As(); auto kernel = kernel_; cl_int status; - if (kernel_func_name_ == "transpose_4d_perm0132" || - kernel_func_name_ == "transpose_4d_perm0213" || - kernel_func_name_ == "transpose_4d_perm0231" || - kernel_func_name_ == "transpose_4d_perm0312" || - kernel_func_name_ == "transpose_4d_perm0321" || - kernel_func_name_ == "transpose_4d_perm1023" || - kernel_func_name_ == "transpose_4d_perm1032" || - kernel_func_name_ == "transpose_4d_perm1203" || - kernel_func_name_ == "transpose_4d_perm1230" || - kernel_func_name_ == "transpose_4d_perm1302" || - kernel_func_name_ == "transpose_4d_perm1320" || - kernel_func_name_ == "transpose_4d_perm2013" || - kernel_func_name_ == "transpose_4d_perm2031" || - kernel_func_name_ == "transpose_4d_perm2103" || - kernel_func_name_ == "transpose_4d_perm2130" || - kernel_func_name_ == "transpose_4d_perm2301" || - kernel_func_name_ == "transpose_4d_perm2310" || - kernel_func_name_ == "transpose_4d_perm3012" || - kernel_func_name_ == "transpose_4d_perm3021" || - kernel_func_name_ == "transpose_4d_perm3102" || - kernel_func_name_ == "transpose_4d_perm3120" || - kernel_func_name_ == "transpose_4d_perm3201" || - kernel_func_name_ == "transpose_4d_perm3210" || - kernel_func_name_ == "transpose_2d") { - status = kernel.setArg(0, *x_image); - CL_CHECK_FATAL(status); - status = kernel.setArg(1, *output_image); - CL_CHECK_FATAL(status); - status = kernel.setArg(2, output_tensor_c_); - CL_CHECK_FATAL(status); - status = kernel.setArg(3, output_tensor_h_); - CL_CHECK_FATAL(status); - status = kernel.setArg(4, output_tensor_w_); - CL_CHECK_FATAL(status); - status = kernel.setArg(5, x_tensor_w_); - CL_CHECK_FATAL(status); - status = kernel.setArg(6, x_tensor_h_); - CL_CHECK_FATAL(status); - - GetGlobalWorkSize(); - status = EnqueueNDRangeKernel(context, - kernel, - cl::NullRange, - global_work_size_, - cl::NullRange, - nullptr, - event_); - CL_CHECK_FATAL(status); - - } else { - LOG(FATAL) << "Unsupported kernel function: " << kernel_func_name_; - } + status = kernel.setArg(0, *x_image); + CL_CHECK_FATAL(status); + status = kernel.setArg(1, *output_image); + CL_CHECK_FATAL(status); + status = kernel.setArg(2, output_tensor_c_); + CL_CHECK_FATAL(status); + status = kernel.setArg(3, output_tensor_h_); + CL_CHECK_FATAL(status); + status = kernel.setArg(4, output_tensor_w_); + CL_CHECK_FATAL(status); + status = kernel.setArg(5, x_tensor_w_); + CL_CHECK_FATAL(status); + status = kernel.setArg(6, x_tensor_h_); + CL_CHECK_FATAL(status); + + GetGlobalWorkSize(); + status = EnqueueNDRangeKernel(context, + kernel, + cl::NullRange, + global_work_size_, + cl::NullRange, + nullptr, + event_); + CL_CHECK_FATAL(status); } private: - std::string kernel_func_name_{"transpose"}; + std::string kernel_func_name_{""}; + std::string kernel_path_{""}; std::string build_options_{""}; std::string time_stamp_{GetTimeStamp()};