From a934a96aa77b3516dd10c74501967749f1336290 Mon Sep 17 00:00:00 2001 From: FhqTreap <676807370@qq.com> Date: Wed, 11 Oct 2023 23:21:25 +0800 Subject: [PATCH] re commit --- src/layer/vulkan/convolution1d_vulkan.cpp | 429 ++++++++++++++++++ src/layer/vulkan/convolution1d_vulkan.h | 53 +++ src/layer/vulkan/shader/convolution1d.comp | 177 ++++++++ .../vulkan/shader/convolution1d_pack1to4.comp | 177 ++++++++ .../vulkan/shader/convolution1d_pack1to8.comp | 186 ++++++++ .../vulkan/shader/convolution1d_pack4.comp | 208 +++++++++ .../vulkan/shader/convolution1d_pack4to1.comp | 177 ++++++++ .../vulkan/shader/convolution1d_pack4to8.comp | 270 +++++++++++ .../vulkan/shader/convolution1d_pack8.comp | 270 +++++++++++ .../vulkan/shader/convolution1d_pack8to1.comp | 178 ++++++++ .../vulkan/shader/convolution1d_pack8to4.comp | 220 +++++++++ 11 files changed, 2345 insertions(+) create mode 100644 src/layer/vulkan/convolution1d_vulkan.cpp create mode 100644 src/layer/vulkan/convolution1d_vulkan.h create mode 100644 src/layer/vulkan/shader/convolution1d.comp create mode 100644 src/layer/vulkan/shader/convolution1d_pack1to4.comp create mode 100644 src/layer/vulkan/shader/convolution1d_pack1to8.comp create mode 100644 src/layer/vulkan/shader/convolution1d_pack4.comp create mode 100644 src/layer/vulkan/shader/convolution1d_pack4to1.comp create mode 100644 src/layer/vulkan/shader/convolution1d_pack4to8.comp create mode 100644 src/layer/vulkan/shader/convolution1d_pack8.comp create mode 100644 src/layer/vulkan/shader/convolution1d_pack8to1.comp create mode 100644 src/layer/vulkan/shader/convolution1d_pack8to4.comp diff --git a/src/layer/vulkan/convolution1d_vulkan.cpp b/src/layer/vulkan/convolution1d_vulkan.cpp new file mode 100644 index 00000000000..445e29345b8 --- /dev/null +++ b/src/layer/vulkan/convolution1d_vulkan.cpp @@ -0,0 +1,429 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "convolution1d_vulkan.h" + +#include "layer_shader_type.h" +#include "layer_type.h" + +namespace ncnn { + +Convolution1D_vulkan::Convolution1D_vulkan() +{ + support_vulkan = true; + support_image_storage = true; + + padding = 0; + + pipeline_convolution1d = 0; +} + +int Convolution1D_vulkan::create_pipeline(const Option& _opt) +{ + if (dynamic_weight) + { + support_vulkan = false; + support_image_storage = false; + return 0; + } + + Option opt = _opt; + + const int maxk = kernel_w; + int num_input = weight_data_size / maxk / num_output; + + int elempack = opt.use_shader_pack8 && num_input % 8 == 0 ? 8 : num_input % 4 == 0 ? 4 : 1; + int out_elempack = opt.use_shader_pack8 && num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 
4 : 1; + + { + padding = ncnn::create_layer(ncnn::LayerType::Padding); + padding->vkdev = vkdev; + + ncnn::ParamDict pd; + pd.set(0, 0); + pd.set(1, 0); + pd.set(2, pad_left); + pd.set(3, pad_right); + pd.set(4, 0); + pd.set(5, pad_value); + + padding->load_param(pd); + + padding->create_pipeline(opt); + } + + { + Mat weight_data_r2 = weight_data.reshape(maxk, num_input, num_output); + + weight_data_packed.create(maxk, num_input / elempack, num_output / out_elempack, (size_t)4 * elempack * out_elempack, elempack * out_elempack); + + for (int q = 0; q + (out_elempack - 1) < num_output; q += out_elempack) + { + float* g00 = weight_data_packed.channel(q / out_elempack); + + for (int p = 0; p + (elempack - 1) < num_input; p += elempack) + { + for (int k = 0; k < maxk; k++) + { + for (int i = 0; i < out_elempack; i++) + { + const Mat k0 = weight_data_r2.channel(q + i); + + for (int j = 0; j < elempack; j++) + { + const float* k00 = k0.row(p + j); + g00[0] = k00[k]; + g00++; + } + } + } + } + } + } + + if (bias_term) + { + convert_packing(bias_data, bias_data_packed, out_elempack, opt); + } + + { + std::vector<vk_specialization_type> specializations(7 + 10); + specializations[0].i = kernel_w; + specializations[1].i = dilation_w; + specializations[2].i = stride_w; + specializations[3].i = bias_term; + specializations[4].i = activation_type; + specializations[5].f = activation_params.w >= 1 ? activation_params[0] : 0.f; + specializations[6].f = activation_params.w == 2 ? activation_params[1] : 0.f; + specializations[7 + 0].i = 0; + specializations[7 + 1].i = 0; + specializations[7 + 2].i = 0; + specializations[7 + 3].i = 0; + specializations[7 + 4].i = 0; + specializations[7 + 5].i = 0; + specializations[7 + 6].i = 0; + specializations[7 + 7].i = 0; + specializations[7 + 8].i = 0; + specializations[7 + 9].i = 0; + + int shader_type_index = -1; + if (elempack == 1 && out_elempack == 1) shader_type_index = LayerShaderType::convolution1d; + if (elempack == 4 && out_elempack == 4) shader_type_index = LayerShaderType::convolution1d_pack4; + if (elempack == 1 && out_elempack == 4) shader_type_index = LayerShaderType::convolution1d_pack1to4; + if (elempack == 4 && out_elempack == 1) shader_type_index = LayerShaderType::convolution1d_pack4to1; + if (elempack == 8 && out_elempack == 8) shader_type_index = LayerShaderType::convolution1d_pack8; + if (elempack == 1 && out_elempack == 8) shader_type_index = LayerShaderType::convolution1d_pack1to8; + if (elempack == 8 && out_elempack == 1) shader_type_index = LayerShaderType::convolution1d_pack8to1; + if (elempack == 4 && out_elempack == 8) shader_type_index = LayerShaderType::convolution1d_pack4to8; + if (elempack == 8 && out_elempack == 4) shader_type_index = LayerShaderType::convolution1d_pack8to4; + + pipeline_convolution1d = new Pipeline(vkdev); + pipeline_convolution1d->set_optimal_local_size_xyz(1, 1, 1); + pipeline_convolution1d->create(shader_type_index, opt, specializations); + } + + return 0; +} + +int Convolution1D_vulkan::destroy_pipeline(const Option& opt) +{ + if (padding) + { + padding->destroy_pipeline(opt); + delete padding; + padding = 0; + } + + delete pipeline_convolution1d; + pipeline_convolution1d = 0; + + return 0; +} + +int Convolution1D_vulkan::upload_model(VkTransfer& cmd, const Option& opt) +{ + if (padding) + { + padding->upload_model(cmd, opt); + } + + if (support_image_storage && opt.use_image_storage) + { + cmd.record_upload(weight_data_packed, weight_data_gpu_image, opt); + } + else + { + cmd.record_upload(weight_data_packed, weight_data_gpu, opt); + } 
+ + weight_data_packed.release(); + + if (bias_term) + { + if (support_image_storage && opt.use_image_storage) + { + cmd.record_upload(bias_data_packed, bias_data_gpu_image, opt); + } + else + { + cmd.record_upload(bias_data_packed, bias_data_gpu, opt); + } + + bias_data_packed.release(); + } + + return 0; +} + +int Convolution1D_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const +{ + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1; + + + VkMat bottom_blob_bordered = bottom_blob; + if (pad_left > 0 || pad_right > 0) + { + Option opt_pad = opt; + opt_pad.blob_vkallocator = opt.workspace_vkallocator; + + padding->forward(bottom_blob, bottom_blob_bordered, cmd, opt_pad); + } + else if (pad_left == -233 && pad_right == -233) + { + int wpad = kernel_extent_w + (w - 1) / stride_w * stride_w - w; + if (wpad > 0) + { + Option opt_pad = opt; + opt_pad.blob_vkallocator = opt.workspace_vkallocator; + + VkMat padding_param_blob(6, (size_t)4u, 1, opt.staging_vkallocator); + int* padding_params = padding_param_blob.mapped(); + + padding_params[0] = 0; + padding_params[1] = 0; + padding_params[2] = wpad / 2; + padding_params[3] = wpad - wpad / 2; + padding_params[4] = 0; + padding_params[5] = 0; + std::vector<VkMat> padding_inputs(2); + padding_inputs[0] = bottom_blob; + padding_inputs[1] = padding_param_blob; + + std::vector<VkMat> padding_outputs(1); + padding->forward(padding_inputs, padding_outputs, cmd, opt_pad); + bottom_blob_bordered = padding_outputs[0]; + } + } + else if (pad_left == -234 && pad_right == -234) + { + int wpad = kernel_extent_w + (w - 1) / stride_w * stride_w - w; + if (wpad > 0) + { + Option opt_pad = opt; + opt_pad.blob_vkallocator = opt.workspace_vkallocator; + + VkMat padding_param_blob(6, (size_t)4u, 1, opt.staging_vkallocator); + int* padding_params = padding_param_blob.mapped(); + + padding_params[0] = 0; + padding_params[1] = 0; + padding_params[2] = wpad - wpad / 2; + padding_params[3] = wpad / 2; + padding_params[4] = 0; + padding_params[5] = 0; + + std::vector<VkMat> padding_inputs(2); + padding_inputs[0] = bottom_blob; + padding_inputs[1] = padding_param_blob; + + std::vector<VkMat> padding_outputs(1); + padding->forward(padding_inputs, padding_outputs, cmd, opt_pad); + bottom_blob_bordered = padding_outputs[0]; + } + } + + + int outw = (bottom_blob_bordered.w - kernel_extent_w) / stride_w + 1; + + int out_elempack = opt.use_shader_pack8 && num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 
4 : 1; + + size_t out_elemsize = elemsize / elempack * out_elempack; + + if (opt.use_fp16_packed && !opt.use_fp16_storage) + { + if (out_elempack == 8) out_elemsize = 8 * 2u; + if (out_elempack == 4) out_elemsize = 4 * 2u; + if (out_elempack == 1) out_elemsize = 4u; + } + + top_blob.create(outw, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator); + if (top_blob.empty()) + return -100; + + + std::vector<VkMat> bindings(4); + bindings[0] = bottom_blob_bordered; + bindings[1] = top_blob; + bindings[2] = weight_data_gpu; + bindings[3] = bias_data_gpu; + + std::vector<vk_constant_type> constants(10); + constants[0].i = bottom_blob_bordered.dims; + constants[1].i = bottom_blob_bordered.w; + constants[2].i = bottom_blob_bordered.h; + constants[3].i = bottom_blob_bordered.c; + constants[4].i = bottom_blob_bordered.cstep; + constants[5].i = top_blob.dims; + constants[6].i = top_blob.w; + constants[7].i = top_blob.h; + constants[8].i = top_blob.c; + constants[9].i = top_blob.cstep; + + VkMat dispatcher; + dispatcher.w = (top_blob.w + 1) / 2; + dispatcher.h = (top_blob.h + 1) / 2; + dispatcher.c = (top_blob.c + 1) / 2; + + cmd.record_pipeline(pipeline_convolution1d, bindings, constants, dispatcher); + + return 0; +} + +int Convolution1D_vulkan::forward(const VkImageMat& bottom_blob, VkImageMat& top_blob, VkCompute& cmd, const Option& opt) const +{ + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1; + + + VkImageMat bottom_blob_bordered = bottom_blob; + if (pad_left > 0 || pad_right > 0) + { + Option opt_pad = opt; + opt_pad.blob_vkallocator = opt.workspace_vkallocator; + + padding->forward(bottom_blob, bottom_blob_bordered, cmd, opt_pad); + } + else if (pad_left == -233 && pad_right == -233) + { + int wpad = kernel_extent_w + (w - 1) / stride_w * stride_w - w; + if (wpad > 0) + { + Option opt_pad = opt; + opt_pad.blob_vkallocator = opt.workspace_vkallocator; + + VkImageMat padding_param_blob(6, (size_t)4u, 1, opt.staging_vkallocator); + int* padding_params = padding_param_blob.mapped(); + + padding_params[0] = 0; + padding_params[1] = 0; + padding_params[2] = wpad / 2; + padding_params[3] = wpad - wpad / 2; + padding_params[4] = 0; + padding_params[5] = 0; + std::vector<VkImageMat> padding_inputs(2); + padding_inputs[0] = bottom_blob; + padding_inputs[1] = padding_param_blob; + + std::vector<VkImageMat> padding_outputs(1); + padding->forward(padding_inputs, padding_outputs, cmd, opt_pad); + bottom_blob_bordered = padding_outputs[0]; + } + } + else if (pad_left == -234 && pad_right == -234) + { + int wpad = kernel_extent_w + (w - 1) / stride_w * stride_w - w; + if (wpad > 0) + { + Option opt_pad = opt; + opt_pad.blob_vkallocator = opt.workspace_vkallocator; + + VkImageMat padding_param_blob(6, (size_t)4u, 1, opt.staging_vkallocator); + int* padding_params = padding_param_blob.mapped(); + + padding_params[0] = 0; + padding_params[1] = 0; + padding_params[2] = wpad - wpad / 2; + padding_params[3] = wpad / 2; + padding_params[4] = 0; + padding_params[5] = 0; + + std::vector<VkImageMat> padding_inputs(2); + padding_inputs[0] = bottom_blob; + padding_inputs[1] = padding_param_blob; + + std::vector<VkImageMat> padding_outputs(1); + padding->forward(padding_inputs, padding_outputs, cmd, opt_pad); + bottom_blob_bordered = padding_outputs[0]; + } + } + + + int outw = (bottom_blob_bordered.w - kernel_extent_w) / stride_w + 1; + + int out_elempack = opt.use_shader_pack8 && num_output % 
8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1; + + size_t out_elemsize = elemsize / elempack * out_elempack; + + if (opt.use_fp16_packed && !opt.use_fp16_storage) + { + if (out_elempack == 8) out_elemsize = 8 * 2u; + if (out_elempack == 4) out_elemsize = 4 * 2u; + if (out_elempack == 1) out_elemsize = 4u; + } + + top_blob.create(outw, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator); + if (top_blob.empty()) + return -100; + + + std::vector<VkImageMat> bindings(4); + bindings[0] = bottom_blob_bordered; + bindings[1] = top_blob; + bindings[2] = weight_data_gpu_image; + bindings[3] = bias_data_gpu_image; + + std::vector<vk_constant_type> constants(10); + constants[0].i = bottom_blob_bordered.dims; + constants[1].i = bottom_blob_bordered.w; + constants[2].i = bottom_blob_bordered.h; + constants[3].i = bottom_blob_bordered.c; + constants[4].i = 0; //bottom_blob_bordered.cstep; + constants[5].i = top_blob.dims; + constants[6].i = top_blob.w; + constants[7].i = top_blob.h; + constants[8].i = top_blob.c; + constants[9].i = 0; //top_blob.cstep; + + VkImageMat dispatcher; + dispatcher.w = (top_blob.w + 1) / 2; + dispatcher.h = (top_blob.h + 1) / 2; + dispatcher.c = (top_blob.c + 1) / 2; + + cmd.record_pipeline(pipeline_convolution1d, bindings, constants, dispatcher); + + return 0; +} + +} // namespace ncnn \ No newline at end of file diff --git a/src/layer/vulkan/convolution1d_vulkan.h b/src/layer/vulkan/convolution1d_vulkan.h new file mode 100644 index 00000000000..0356d1948cb --- /dev/null +++ b/src/layer/vulkan/convolution1d_vulkan.h @@ -0,0 +1,53 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +#ifndef LAYER_CONVOLUTION1D_VULKAN_H +#define LAYER_CONVOLUTION1D_VULKAN_H + +#include "convolution1d.h" + +namespace ncnn { + +class Convolution1D_vulkan : virtual public Convolution1D +{ +public: + Convolution1D_vulkan(); + + virtual int create_pipeline(const Option& opt); + virtual int destroy_pipeline(const Option& opt); + + virtual int upload_model(VkTransfer& cmd, const Option& opt); + + using Convolution1D::forward; + virtual int forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const; + virtual int forward(const VkImageMat& bottom_blob, VkImageMat& top_blob, VkCompute& cmd, const Option& opt) const; + +public: + ncnn::Layer* padding; + + Mat weight_data_packed; + Mat bias_data_packed; + + VkMat weight_data_gpu; + VkMat bias_data_gpu; + + VkImageMat weight_data_gpu_image; + VkImageMat bias_data_gpu_image; + + Pipeline* pipeline_convolution1d; +}; + +} // namespace ncnn + +#endif // LAYER_CONVOLUTION1D_VULKAN_H diff --git a/src/layer/vulkan/shader/convolution1d.comp b/src/layer/vulkan/shader/convolution1d.comp new file mode 100644 index 00000000000..3403a50f193 --- /dev/null +++ b/src/layer/vulkan/shader/convolution1d.comp @@ -0,0 +1,177 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +#extension GL_GOOGLE_include_directive: enable +#include "vulkan_activation.comp" + +layout (constant_id = 0) const int kernel_w = 1; +layout (constant_id = 1) const int dilation_w = 1; +layout (constant_id = 2) const int stride_w = 1; +layout (constant_id = 3) const int bias_term = 0; +layout (constant_id = 4) const int activation_type = 0; +layout (constant_id = 5) const float activation_param_0 = 0; +layout (constant_id = 6) const float activation_param_1 = 0; + +#define shape_constant_id_offset 7 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc1) writeonly uniform unfp image3D top_blob; +layout (binding = 2) uniform unfp sampler3D weight_blob; +layout (binding = 3) uniform unfp sampler3D bias_blob; +#else +layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; }; +layout (binding = 2) readonly buffer weight_blob { sfp weight_data[]; }; +layout (binding = 3) readonly buffer bias_blob { sfp bias_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x) * 2; + int gy = int(gl_GlobalInvocationID.y) * 2; + + if (gx >= psc(outw) || gy >= psc(outh)) + return; + + const ivec2 gx2 = gx + ivec2(0, 1); + const ivec2 gy2 = gy + ivec2(0, 1); + + afp sum0 = afp(0.0f); + afp sum1 = afp(0.0f); + afp sum2 = afp(0.0f); + afp sum3 = afp(0.0f); + + if (bias_term == 1) + { +#if NCNN_image_shader + sum0 = image3d_ld1(bias_blob, ivec3(gy2.x, 0, 0)); + sum2 = image3d_ld1(bias_blob, ivec3(gy2.y, 0, 0)); +#else + sum0 = buffer_ld1(bias_data, gy2.x); + sum2 = buffer_ld1(bias_data, gy2.y); +#endif + sum1 = sum0; + sum3 = sum2; + } + +#if NCNN_image_shader + + ivec2 v_offset = gx2 * stride_w; + + for (int y = 0; y < psc(h); y++) + { + int wx = 0; + + for (int x = 0; x < kernel_w; x++) + { + afp v0 = image3d_ld1(bottom_blob, ivec3(v_offset.x + x * dilation_w, y, 0)); + afp v1 = image3d_ld1(bottom_blob, ivec3(v_offset.y + x * dilation_w, y, 0)); + + afp k0 = image3d_ld1(weight_blob, ivec3(wx, y, gy2.x)); + afp k1 = image3d_ld1(weight_blob, ivec3(wx, y, gy2.y)); + + sum0 += v0 * k0; + sum1 += v1 * k0; + sum2 += v0 * k1; + sum3 += v1 * k1; + + wx += 1; + } + } + +#else + + ivec2 v_offset = gx2 * stride_w; + ivec2 w_offset = gy2 * psc(h) * kernel_w; + + for (int y = 0; y < psc(h); y++) + { + for (int x = 0; x < kernel_w; x++) + { + afp v0 = 
buffer_ld1(bottom_blob_data, v_offset.x + x * dilation_w); + afp v1 = buffer_ld1(bottom_blob_data, v_offset.y + x * dilation_w); + + afp k0 = buffer_ld1(weight_data, w_offset.x + x); + afp k1 = buffer_ld1(weight_data, w_offset.y + x); + + sum0 += v0 * k0; + sum1 += v1 * k0; + sum2 += v0 * k1; + sum3 += v1 * k1; + } + v_offset += psc(w); + w_offset += kernel_w; + } + +#endif + + sum0 = activation_afp(sum0, activation_type, activation_param_0, activation_param_1); + sum1 = activation_afp(sum1, activation_type, activation_param_0, activation_param_1); + sum2 = activation_afp(sum2, activation_type, activation_param_0, activation_param_1); + sum3 = activation_afp(sum3, activation_type, activation_param_0, activation_param_1); + +#if NCNN_image_shader + + image3d_st1(top_blob, ivec3(gx2.x, gy2.x, 0), sum0); + image3d_st1(top_blob, ivec3(gx2.y, gy2.x, 0), sum1); + image3d_st1(top_blob, ivec3(gx2.x, gy2.y, 0), sum2); + image3d_st1(top_blob, ivec3(gx2.y, gy2.y, 0), sum3); + +#else + + const int gi = gy * psc(outw) + gx; + + buffer_st1(top_blob_data, gi, sum0); + if (gx + 1 < psc(outw)) buffer_st1(top_blob_data, gi + 1, sum1); + if (gy + 1 < psc(outh)) buffer_st1(top_blob_data, gi + psc(outw), sum2); + if (gy + 1 < psc(outh) && gx + 1 < psc(outw)) buffer_st1(top_blob_data, gi + psc(outw) + 1, sum3); + +#endif +} \ No newline at end of file diff --git a/src/layer/vulkan/shader/convolution1d_pack1to4.comp b/src/layer/vulkan/shader/convolution1d_pack1to4.comp new file mode 100644 index 00000000000..98e6fadd3c1 --- /dev/null +++ b/src/layer/vulkan/shader/convolution1d_pack1to4.comp @@ -0,0 +1,177 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +#extension GL_GOOGLE_include_directive: enable +#include "vulkan_activation.comp" + +layout (constant_id = 0) const int kernel_w = 1; +layout (constant_id = 1) const int dilation_w = 1; +layout (constant_id = 2) const int stride_w = 1; +layout (constant_id = 3) const int bias_term = 0; +layout (constant_id = 4) const int activation_type = 0; +layout (constant_id = 5) const float activation_param_0 = 0; +layout (constant_id = 6) const float activation_param_1 = 0; + +#define shape_constant_id_offset 7 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob; +layout (binding = 2) uniform unfp sampler3D weight_blob; +layout (binding = 3) uniform unfp sampler3D bias_blob; +#else +layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; +layout (binding = 2) readonly buffer weight_blob { sfpvec4 weight_data[]; }; +layout (binding = 3) readonly buffer bias_blob { sfpvec4 bias_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x) * 2; + int gy = int(gl_GlobalInvocationID.y) * 2; + + if (gx >= psc(outw) || gy >= psc(outh)) + return; + + const ivec2 gx2 = gx + ivec2(0, 1); + const ivec2 gy2 = gy + ivec2(0, 1); + + afpvec4 sum0 = afpvec4(0.0f); + afpvec4 sum1 = afpvec4(0.0f); + afpvec4 sum2 = afpvec4(0.0f); + afpvec4 sum3 = afpvec4(0.0f); + + if (bias_term == 1) + { +#if NCNN_image_shader + sum0 = image3d_ld4(bias_blob, ivec3(gy2.x, 0, 0)); + sum2 = image3d_ld4(bias_blob, ivec3(gy2.y, 0, 0)); +#else + sum0 = buffer_ld4(bias_data, gy2.x); + sum2 = buffer_ld4(bias_data, gy2.y); +#endif + sum1 = sum0; + sum3 = sum2; + } + +#if NCNN_image_shader + + ivec2 v_offset = gx2 * stride_w; + + for (int y = 0; y < psc(h); y++) + { + int wx = 0; + + for (int x = 0; x < kernel_w; x++) + { + afp v0 = image3d_ld1(bottom_blob, ivec3(v_offset.x + x * dilation_w, y, 0)); + afp v1 = image3d_ld1(bottom_blob, ivec3(v_offset.y + x * dilation_w, y, 0)); + + afpvec4 k0 = image3d_ld4(weight_blob, ivec3(wx, y, gy2.x)); + afpvec4 k1 = image3d_ld4(weight_blob, ivec3(wx, y, gy2.y)); + + sum0 += v0 * k0; + sum1 += v1 * k0; + sum2 += v0 * k1; + sum3 += v1 * k1; + + wx += 1; + } + } + +#else + + ivec2 v_offset = gx2 * stride_w; + ivec2 w_offset = gy2 * psc(h) * kernel_w; + + for (int y = 0; y < psc(h); y++) + { + for (int x = 0; 
x < kernel_w; x++) + { + afp v0 = buffer_ld1(bottom_blob_data, v_offset.x + x * dilation_w); + afp v1 = buffer_ld1(bottom_blob_data, v_offset.y + x * dilation_w); + + afpvec4 k0 = buffer_ld4(weight_data, w_offset.x + x); + afpvec4 k1 = buffer_ld4(weight_data, w_offset.y + x); + + sum0 += v0 * k0; + sum1 += v1 * k0; + sum2 += v0 * k1; + sum3 += v1 * k1; + } + v_offset += psc(w); + w_offset += kernel_w; + } + +#endif + + sum0 = activation_afpvec4(sum0, activation_type, activation_param_0, activation_param_1); + sum1 = activation_afpvec4(sum1, activation_type, activation_param_0, activation_param_1); + sum2 = activation_afpvec4(sum2, activation_type, activation_param_0, activation_param_1); + sum3 = activation_afpvec4(sum3, activation_type, activation_param_0, activation_param_1); + +#if NCNN_image_shader + + image3d_st4(top_blob, ivec3(gx2.x, gy2.x, 0), sum0); + image3d_st4(top_blob, ivec3(gx2.y, gy2.x, 0), sum1); + image3d_st4(top_blob, ivec3(gx2.x, gy2.y, 0), sum2); + image3d_st4(top_blob, ivec3(gx2.y, gy2.y, 0), sum3); + +#else + + const int gi = gy * psc(outw) + gx; + + buffer_st4(top_blob_data, gi, sum0); + if (gx + 1 < psc(outw)) buffer_st4(top_blob_data, gi + 1, sum1); + if (gy + 1 < psc(outh)) buffer_st4(top_blob_data, gi + psc(outw), sum2); + if (gy + 1 < psc(outh) && gx + 1 < psc(outw)) buffer_st4(top_blob_data, gi + psc(outw) + 1, sum3); + +#endif +} \ No newline at end of file diff --git a/src/layer/vulkan/shader/convolution1d_pack1to8.comp b/src/layer/vulkan/shader/convolution1d_pack1to8.comp new file mode 100644 index 00000000000..c32bc2114e5 --- /dev/null +++ b/src/layer/vulkan/shader/convolution1d_pack1to8.comp @@ -0,0 +1,186 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +#extension GL_GOOGLE_include_directive: enable +#include "vulkan_activation.comp" + +layout (constant_id = 0) const int kernel_w = 1; +layout (constant_id = 1) const int dilation_w = 1; +layout (constant_id = 2) const int stride_w = 1; +layout (constant_id = 3) const int bias_term = 0; +layout (constant_id = 4) const int activation_type = 0; +layout (constant_id = 5) const float activation_param_0 = 0; +layout (constant_id = 6) const float activation_param_1 = 0; + +#define shape_constant_id_offset 7 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob; +layout (binding = 2) uniform unfp sampler3D weight_blob; +layout (binding = 3) uniform unfp sampler3D bias_blob; +#else +layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; }; +layout (binding = 2) readonly buffer weight_blob { sfpvec8 weight_data[]; }; +layout (binding = 3) readonly buffer bias_blob { sfpvec8 bias_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x) * 2; + int gy = int(gl_GlobalInvocationID.y) * 2; + + if (gx >= psc(outw) || gy >= psc(outh)) + return; + + const ivec2 gx2 = gx + ivec2(0, 1); + const ivec2 gy2 = gy + ivec2(0, 1); + + afpvec8 sum0 = afpvec8(afpvec4(0.0f), afpvec4(0.0f)); + afpvec8 sum1 = afpvec8(afpvec4(0.0f), afpvec4(0.0f)); + afpvec8 sum2 = afpvec8(afpvec4(0.0f), afpvec4(0.0f)); + afpvec8 sum3 = afpvec8(afpvec4(0.0f), afpvec4(0.0f)); + + if (bias_term == 1) + { +#if NCNN_image_shader + sum0 = image3d_ld8(bias_blob, ivec3(gy2.x, 0, 0)); + sum2 = image3d_ld8(bias_blob, ivec3(gy2.y, 0, 0)); +#else + sum0 = buffer_ld8(bias_data, gy2.x); + sum2 = buffer_ld8(bias_data, gy2.y); +#endif + sum1 = sum0; + sum3 = sum2; + } + +#if NCNN_image_shader + + ivec2 v_offset = gx2 * stride_w; + + for (int y = 0; y < psc(h); y++) + { + int wx = 0; + + for (int x = 0; x < kernel_w; x++) + { + afp v0 = image3d_ld1(bottom_blob, ivec3(v_offset.x + x * dilation_w, y, 0)); + afp v1 = image3d_ld1(bottom_blob, ivec3(v_offset.y + x * dilation_w, y, 0)); + + afpvec8 k0 = image3d_ld8(weight_blob, ivec3(wx, y, gy2.x)); + afpvec8 k1 = image3d_ld8(weight_blob, ivec3(wx, y, gy2.y)); + + sum0[0] += v0 * k0[0]; + sum0[1] += v0 * k0[1]; + sum1[0] += v1 * k0[0]; + sum1[1] += v1 * k0[1]; + 
sum2[0] += v0 * k1[0]; + sum2[1] += v0 * k1[1]; + sum3[0] += v1 * k1[0]; + sum3[1] += v1 * k1[1]; + + wx += 1; + } + } + +#else + + ivec2 v_offset = gx2 * stride_w; + ivec2 w_offset = gy2 * psc(h) * kernel_w; + + for (int y = 0; y < psc(h); y++) + { + for (int x = 0; x < kernel_w; x++) + { + afp v0 = buffer_ld1(bottom_blob_data, v_offset.x + x * dilation_w); + afp v1 = buffer_ld1(bottom_blob_data, v_offset.y + x * dilation_w); + + afpvec8 k0 = buffer_ld8(weight_data, w_offset.x + x); + afpvec8 k1 = buffer_ld8(weight_data, w_offset.y + x); + + sum0[0] += v0 * k0[0]; + sum0[1] += v0 * k0[1]; + sum1[0] += v1 * k0[0]; + sum1[1] += v1 * k0[1]; + sum2[0] += v0 * k1[0]; + sum2[1] += v0 * k1[1]; + sum3[0] += v1 * k1[0]; + sum3[1] += v1 * k1[1]; + } + v_offset += psc(w); + w_offset += kernel_w; + } + +#endif + + sum0 = activation_afpvec8(sum0, activation_type, activation_param_0, activation_param_1); + sum1 = activation_afpvec8(sum1, activation_type, activation_param_0, activation_param_1); + sum2 = activation_afpvec8(sum2, activation_type, activation_param_0, activation_param_1); + sum3 = activation_afpvec8(sum3, activation_type, activation_param_0, activation_param_1); + +#if NCNN_image_shader + + image3d_st8(top_blob, ivec3(gx2.x, gy2.x, 0), sum0); + image3d_st8(top_blob, ivec3(gx2.y, gy2.x, 0), sum1); + image3d_st8(top_blob, ivec3(gx2.x, gy2.y, 0), sum2); + image3d_st8(top_blob, ivec3(gx2.y, gy2.y, 0), sum3); + +#else + + const int gi = gy * psc(outw) + gx; + + buffer_st8(top_blob_data, gi, sum0); + if (gx + 1 < psc(outw)) buffer_st8(top_blob_data, gi + 1, sum1); + if (gy + 1 < psc(outh)) buffer_st8(top_blob_data, gi + psc(outw), sum2); + if (gy + 1 < psc(outh) && gx + 1 < psc(outw)) buffer_st8(top_blob_data, gi + psc(outw) + 1, sum3); + +#endif +} \ No newline at end of file diff --git a/src/layer/vulkan/shader/convolution1d_pack4.comp b/src/layer/vulkan/shader/convolution1d_pack4.comp new file mode 100644 index 00000000000..f1e12586785 --- /dev/null +++ b/src/layer/vulkan/shader/convolution1d_pack4.comp @@ -0,0 +1,208 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +#extension GL_GOOGLE_include_directive: enable +#include "vulkan_activation.comp" + +layout (constant_id = 0) const int kernel_w = 1; +layout (constant_id = 1) const int dilation_w = 1; +layout (constant_id = 2) const int stride_w = 1; +layout (constant_id = 3) const int bias_term = 0; +layout (constant_id = 4) const int activation_type = 0; +layout (constant_id = 5) const float activation_param_0 = 0; +layout (constant_id = 6) const float activation_param_1 = 0; + +#define shape_constant_id_offset 7 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob; +layout (binding = 2) uniform unfp sampler3D weight_blob; +layout (binding = 3) uniform unfp sampler3D bias_blob; +#else +layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; +#if NCNN_fp16_packed || (NCNN_fp16_storage && !NCNN_fp16_arithmetic) +// GL_EXT_shader_16bit_storage does not define f16mat4 type :( +layout (binding = 2) readonly buffer weight_blob { sfpvec4 weight_data[]; }; +#else +layout (binding = 2) readonly buffer weight_blob { sfpmat4 weight_data[]; }; +#endif +layout (binding = 3) readonly buffer bias_blob { sfpvec4 bias_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x) * 2; + int gy = int(gl_GlobalInvocationID.y) * 2; + + if (gx >= psc(outw) || gy >= psc(outh)) + return; + + const ivec2 gx2 = gx + ivec2(0, 1); + const ivec2 gy2 = gy + ivec2(0, 1); + + afpvec4 sum0 = afpvec4(0.0f); + afpvec4 sum1 = afpvec4(0.0f); + afpvec4 sum2 = afpvec4(0.0f); + afpvec4 sum3 = afpvec4(0.0f); + + if (bias_term == 1) + { +#if NCNN_image_shader + sum0 = image3d_ld4(bias_blob, ivec3(gy2.x, 0, 0)); + sum2 = image3d_ld4(bias_blob, ivec3(gy2.y, 0, 0)); +#else + sum0 = buffer_ld4(bias_data, gy2.x); + sum2 = buffer_ld4(bias_data, gy2.y); +#endif + sum1 = sum0; + sum3 = sum2; + } + +#if NCNN_image_shader + + ivec2 v_offset = gx2 * stride_w; + + for (int y = 0; y < psc(h); y++) + { + int wx = 0; + + for (int x = 0; x < kernel_w; x++) + { + afpvec4 v0 = image3d_ld4(bottom_blob, ivec3(v_offset.x + x * dilation_w, y, 0)); + afpvec4 v1 = image3d_ld4(bottom_blob, ivec3(v_offset.y + x * dilation_w, y, 0)); + + afpmat4 k0 = afpmat4( + image3d_ld4(weight_blob, ivec3(wx + 0, y, gy2.x)), + image3d_ld4(weight_blob, ivec3(wx + 1, y, gy2.x)), + 
image3d_ld4(weight_blob, ivec3(wx + 2, y, gy2.x)), + image3d_ld4(weight_blob, ivec3(wx + 3, y, gy2.x)) + ); + afpmat4 k1 = afpmat4( + image3d_ld4(weight_blob, ivec3(wx + 0, y, gy2.y)), + image3d_ld4(weight_blob, ivec3(wx + 1, y, gy2.y)), + image3d_ld4(weight_blob, ivec3(wx + 2, y, gy2.y)), + image3d_ld4(weight_blob, ivec3(wx + 3, y, gy2.y)) + ); + + sum0 += v0 * k0; + sum1 += v1 * k0; + sum2 += v0 * k1; + sum3 += v1 * k1; + + wx += 4; + } + } + +#else + + ivec2 v_offset = gx2 * stride_w; + ivec2 w_offset = gy2 * psc(h) * kernel_w; + + for (int y = 0; y < psc(h); y++) + { + for (int x = 0; x < kernel_w; x++) + { + afpvec4 v0 = buffer_ld4(bottom_blob_data, v_offset.x + x * dilation_w); + afpvec4 v1 = buffer_ld4(bottom_blob_data, v_offset.y + x * dilation_w); + +#if NCNN_fp16_packed || (NCNN_fp16_storage && !NCNN_fp16_arithmetic) + // GL_EXT_shader_16bit_storage does not define f16mat4 type :( + afpmat4 k0 = afpmat4( + buffer_ld4(weight_data, (w_offset.x + x) * 4 + 0), + buffer_ld4(weight_data, (w_offset.x + x) * 4 + 1), + buffer_ld4(weight_data, (w_offset.x + x) * 4 + 2), + buffer_ld4(weight_data, (w_offset.x + x) * 4 + 3) + ); + afpmat4 k1 = afpmat4( + buffer_ld4(weight_data, (w_offset.y + x) * 4 + 0), + buffer_ld4(weight_data, (w_offset.y + x) * 4 + 1), + buffer_ld4(weight_data, (w_offset.y + x) * 4 + 2), + buffer_ld4(weight_data, (w_offset.y + x) * 4 + 3) + ); +#else + afpmat4 k0 = sfp2afpmat4(weight_data[w_offset.x + x]); + afpmat4 k1 = sfp2afpmat4(weight_data[w_offset.y + x]); +#endif + + sum0 += v0 * k0; + sum1 += v1 * k0; + sum2 += v0 * k1; + sum3 += v1 * k1; + } + v_offset += psc(w); + w_offset += kernel_w; + } + +#endif + + sum0 = activation_afpvec4(sum0, activation_type, activation_param_0, activation_param_1); + sum1 = activation_afpvec4(sum1, activation_type, activation_param_0, activation_param_1); + sum2 = activation_afpvec4(sum2, activation_type, activation_param_0, activation_param_1); + sum3 = activation_afpvec4(sum3, activation_type, activation_param_0, activation_param_1); + +#if NCNN_image_shader + + image3d_st4(top_blob, ivec3(gx2.x, gy2.x, 0), sum0); + image3d_st4(top_blob, ivec3(gx2.y, gy2.x, 0), sum1); + image3d_st4(top_blob, ivec3(gx2.x, gy2.y, 0), sum2); + image3d_st4(top_blob, ivec3(gx2.y, gy2.y, 0), sum3); + +#else + + const int gi = gy * psc(outw) + gx; + + buffer_st4(top_blob_data, gi, sum0); + if (gx + 1 < psc(outw)) buffer_st4(top_blob_data, gi + 1, sum1); + if (gy + 1 < psc(outh)) buffer_st4(top_blob_data, gi + psc(outw), sum2); + if (gy + 1 < psc(outh) && gx + 1 < psc(outw)) buffer_st4(top_blob_data, gi + psc(outw) + 1, sum3); + +#endif +} \ No newline at end of file diff --git a/src/layer/vulkan/shader/convolution1d_pack4to1.comp b/src/layer/vulkan/shader/convolution1d_pack4to1.comp new file mode 100644 index 00000000000..1f5c87e1835 --- /dev/null +++ b/src/layer/vulkan/shader/convolution1d_pack4to1.comp @@ -0,0 +1,177 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +#extension GL_GOOGLE_include_directive: enable +#include "vulkan_activation.comp" + +layout (constant_id = 0) const int kernel_w = 1; +layout (constant_id = 1) const int dilation_w = 1; +layout (constant_id = 2) const int stride_w = 1; +layout (constant_id = 3) const int bias_term = 0; +layout (constant_id = 4) const int activation_type = 0; +layout (constant_id = 5) const float activation_param_0 = 0; +layout (constant_id = 6) const float activation_param_1 = 0; + +#define shape_constant_id_offset 7 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc1) writeonly uniform unfp image3D top_blob; +layout (binding = 2) uniform unfp sampler3D weight_blob; +layout (binding = 3) uniform unfp sampler3D bias_blob; +#else +layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; }; +layout (binding = 2) readonly buffer weight_blob { sfpvec4 weight_data[]; }; +layout (binding = 3) readonly buffer bias_blob { sfp bias_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x) * 2; + int gy = int(gl_GlobalInvocationID.y) * 2; + + if (gx >= psc(outw) || gy >= psc(outh)) + return; + + const ivec2 gx2 = gx + ivec2(0, 1); + const ivec2 gy2 = gy + ivec2(0, 1); + + afp sum0 = afp(0.0f); + afp sum1 = afp(0.0f); + afp sum2 = afp(0.0f); + afp sum3 = afp(0.0f); + + if (bias_term == 1) + { +#if NCNN_image_shader + sum0 = image3d_ld1(bias_blob, ivec3(gy2.x, 0, 0)); + sum2 = image3d_ld1(bias_blob, ivec3(gy2.y, 0, 0)); +#else + sum0 = buffer_ld1(bias_data, gy2.x); + sum2 = buffer_ld1(bias_data, gy2.y); +#endif + sum1 = sum0; + sum3 = sum2; + } + +#if NCNN_image_shader + + ivec2 v_offset = gx2 * stride_w; + + for (int y = 0; y < psc(h); y++) + { + int wx = 0; + + for (int x = 0; x < kernel_w; x++) + { + afpvec4 v0 = image3d_ld4(bottom_blob, ivec3(v_offset.x + x * dilation_w, y, 0)); + afpvec4 v1 = image3d_ld4(bottom_blob, ivec3(v_offset.y + x * dilation_w, y, 0)); + + afpvec4 k0 = image3d_ld4(weight_blob, ivec3(wx, y, gy2.x)); + afpvec4 k1 = image3d_ld4(weight_blob, ivec3(wx, y, gy2.y)); + + sum0 += dot(v0, k0); + sum1 += dot(v1, k0); + sum2 += dot(v0, k1); + sum3 += dot(v1, k1); + + wx += 1; + } + } + +#else + + ivec2 v_offset = gx2 * stride_w; + ivec2 
w_offset = gy2 * psc(h) * kernel_w; + + for (int y = 0; y < psc(h); y++) + { + for (int x = 0; x < kernel_w; x++) + { + afpvec4 v0 = buffer_ld4(bottom_blob_data, v_offset.x + x * dilation_w); + afpvec4 v1 = buffer_ld4(bottom_blob_data, v_offset.y + x * dilation_w); + + afpvec4 k0 = buffer_ld4(weight_data, w_offset.x + x); + afpvec4 k1 = buffer_ld4(weight_data, w_offset.y + x); + + sum0 += dot(v0, k0); + sum1 += dot(v1, k0); + sum2 += dot(v0, k1); + sum3 += dot(v1, k1); + } + v_offset += psc(w); + w_offset += kernel_w; + } + +#endif + + sum0 = activation_afp(sum0, activation_type, activation_param_0, activation_param_1); + sum1 = activation_afp(sum1, activation_type, activation_param_0, activation_param_1); + sum2 = activation_afp(sum2, activation_type, activation_param_0, activation_param_1); + sum3 = activation_afp(sum3, activation_type, activation_param_0, activation_param_1); + +#if NCNN_image_shader + + image3d_st1(top_blob, ivec3(gx2.x, gy2.x, 0), sum0); + image3d_st1(top_blob, ivec3(gx2.y, gy2.x, 0), sum1); + image3d_st1(top_blob, ivec3(gx2.x, gy2.y, 0), sum2); + image3d_st1(top_blob, ivec3(gx2.y, gy2.y, 0), sum3); + +#else + + const int gi = gy * psc(outw) + gx; + + buffer_st1(top_blob_data, gi, sum0); + if (gx + 1 < psc(outw)) buffer_st1(top_blob_data, gi + 1, sum1); + if (gy + 1 < psc(outh)) buffer_st1(top_blob_data, gi + psc(outw), sum2); + if (gy + 1 < psc(outh) && gx + 1 < psc(outw)) buffer_st1(top_blob_data, gi + psc(outw) + 1, sum3); + +#endif +} \ No newline at end of file diff --git a/src/layer/vulkan/shader/convolution1d_pack4to8.comp b/src/layer/vulkan/shader/convolution1d_pack4to8.comp new file mode 100644 index 00000000000..1133b097ac0 --- /dev/null +++ b/src/layer/vulkan/shader/convolution1d_pack4to8.comp @@ -0,0 +1,270 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +#extension GL_GOOGLE_include_directive: enable +#include "vulkan_activation.comp" + +layout (constant_id = 0) const int kernel_w = 1; +layout (constant_id = 1) const int dilation_w = 1; +layout (constant_id = 2) const int stride_w = 1; +layout (constant_id = 3) const int bias_term = 0; +layout (constant_id = 4) const int activation_type = 0; +layout (constant_id = 5) const float activation_param_0 = 0; +layout (constant_id = 6) const float activation_param_1 = 0; + +#define shape_constant_id_offset 7 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob; +layout (binding = 2) uniform unfp sampler3D weight_blob; +layout (binding = 3) uniform unfp sampler3D bias_blob; +#else +layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; }; +layout (binding = 2) readonly buffer weight_blob { sfpvec4 weight_data[]; }; +layout (binding = 3) readonly buffer bias_blob { sfpvec8 bias_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x) * 2; + int gy = int(gl_GlobalInvocationID.y) * 2; + + if (gx >= psc(outw) || gy >= psc(outh)) + return; + + const ivec2 gx2 = gx + ivec2(0, 1); + const ivec2 gy2 = gy + ivec2(0, 1); + + afpvec8 sum0 = afpvec8(afpvec4(0.0f), afpvec4(0.0f)); + afpvec8 sum1 = afpvec8(afpvec4(0.0f), afpvec4(0.0f)); + afpvec8 sum2 = afpvec8(afpvec4(0.0f), afpvec4(0.0f)); + afpvec8 sum3 = afpvec8(afpvec4(0.0f), afpvec4(0.0f)); + + if (bias_term == 1) + { +#if NCNN_image_shader + sum0 = image3d_ld8(bias_blob, ivec3(gy2.x, 0, 0)); + sum2 = image3d_ld8(bias_blob, ivec3(gy2.y, 0, 0)); +#else + sum0 = buffer_ld8(bias_data, gy2.x); + sum2 = buffer_ld8(bias_data, gy2.y); +#endif + sum1 = sum0; + sum3 = sum2; + } + +#if NCNN_image_shader + + ivec2 v_offset = gx2 * stride_w; + + for (int y = 0; y < psc(h); y++) + { + int wx = 0; + + for (int x = 0; x < kernel_w; x++) + { + afpvec4 v0 = image3d_ld4(bottom_blob, ivec3(v_offset.x + x * dilation_w, y, 0)); + afpvec4 v1 = image3d_ld4(bottom_blob, ivec3(v_offset.y + x * dilation_w, y, 0)); + + afpvec4 k0 = image3d_ld4(weight_blob, ivec3(wx + 0, y, gy2.x)); + afpvec4 k1 = image3d_ld4(weight_blob, ivec3(wx + 1, y, gy2.x)); + afpvec4 k2 = image3d_ld4(weight_blob, ivec3(wx + 2, y, gy2.x)); + afpvec4 k3 = 
image3d_ld4(weight_blob, ivec3(wx + 3, y, gy2.x)); + afpvec4 k4 = image3d_ld4(weight_blob, ivec3(wx + 4, y, gy2.x)); + afpvec4 k5 = image3d_ld4(weight_blob, ivec3(wx + 5, y, gy2.x)); + afpvec4 k6 = image3d_ld4(weight_blob, ivec3(wx + 6, y, gy2.x)); + afpvec4 k7 = image3d_ld4(weight_blob, ivec3(wx + 7, y, gy2.x)); + + afpvec4 k8 = image3d_ld4(weight_blob, ivec3(wx + 0, y, gy2.y)); + afpvec4 k9 = image3d_ld4(weight_blob, ivec3(wx + 1, y, gy2.y)); + afpvec4 ka = image3d_ld4(weight_blob, ivec3(wx + 2, y, gy2.y)); + afpvec4 kb = image3d_ld4(weight_blob, ivec3(wx + 3, y, gy2.y)); + afpvec4 kc = image3d_ld4(weight_blob, ivec3(wx + 4, y, gy2.y)); + afpvec4 kd = image3d_ld4(weight_blob, ivec3(wx + 5, y, gy2.y)); + afpvec4 ke = image3d_ld4(weight_blob, ivec3(wx + 6, y, gy2.y)); + afpvec4 kf = image3d_ld4(weight_blob, ivec3(wx + 7, y, gy2.y)); + + sum0[0].r += dot(v0, k0); + sum0[0].g += dot(v0, k1); + sum0[0].b += dot(v0, k2); + sum0[0].a += dot(v0, k3); + sum0[1].r += dot(v0, k4); + sum0[1].g += dot(v0, k5); + sum0[1].b += dot(v0, k6); + sum0[1].a += dot(v0, k7); + + sum1[0].r += dot(v1, k0); + sum1[0].g += dot(v1, k1); + sum1[0].b += dot(v1, k2); + sum1[0].a += dot(v1, k3); + sum1[1].r += dot(v1, k4); + sum1[1].g += dot(v1, k5); + sum1[1].b += dot(v1, k6); + sum1[1].a += dot(v1, k7); + + sum2[0].r += dot(v0, k8); + sum2[0].g += dot(v0, k9); + sum2[0].b += dot(v0, ka); + sum2[0].a += dot(v0, kb); + sum2[1].r += dot(v0, kc); + sum2[1].g += dot(v0, kd); + sum2[1].b += dot(v0, ke); + sum2[1].a += dot(v0, kf); + + sum3[0].r += dot(v1, k8); + sum3[0].g += dot(v1, k9); + sum3[0].b += dot(v1, ka); + sum3[0].a += dot(v1, kb); + sum3[1].r += dot(v1, kc); + sum3[1].g += dot(v1, kd); + sum3[1].b += dot(v1, ke); + sum3[1].a += dot(v1, kf); + + wx += 8; + } + } + +#else + + ivec2 v_offset = gx2 * stride_w; + ivec2 w_offset = gy2 * psc(h) * kernel_w; + + for (int y = 0; y < psc(h); y++) + { + for (int x = 0; x < kernel_w; x++) + { + afpvec4 v0 = buffer_ld4(bottom_blob_data, v_offset.x + x * dilation_w); + afpvec4 v1 = buffer_ld4(bottom_blob_data, v_offset.y + x * dilation_w); + + afpvec4 k0 = buffer_ld4(weight_data, (w_offset.x + x) * 8 + 0); + afpvec4 k1 = buffer_ld4(weight_data, (w_offset.x + x) * 8 + 1); + afpvec4 k2 = buffer_ld4(weight_data, (w_offset.x + x) * 8 + 2); + afpvec4 k3 = buffer_ld4(weight_data, (w_offset.x + x) * 8 + 3); + afpvec4 k4 = buffer_ld4(weight_data, (w_offset.x + x) * 8 + 4); + afpvec4 k5 = buffer_ld4(weight_data, (w_offset.x + x) * 8 + 5); + afpvec4 k6 = buffer_ld4(weight_data, (w_offset.x + x) * 8 + 6); + afpvec4 k7 = buffer_ld4(weight_data, (w_offset.x + x) * 8 + 7); + + afpvec4 k8 = buffer_ld4(weight_data, (w_offset.y + x) * 8 + 0); + afpvec4 k9 = buffer_ld4(weight_data, (w_offset.y + x) * 8 + 1); + afpvec4 ka = buffer_ld4(weight_data, (w_offset.y + x) * 8 + 2); + afpvec4 kb = buffer_ld4(weight_data, (w_offset.y + x) * 8 + 3); + afpvec4 kc = buffer_ld4(weight_data, (w_offset.y + x) * 8 + 4); + afpvec4 kd = buffer_ld4(weight_data, (w_offset.y + x) * 8 + 5); + afpvec4 ke = buffer_ld4(weight_data, (w_offset.y + x) * 8 + 6); + afpvec4 kf = buffer_ld4(weight_data, (w_offset.y + x) * 8 + 7); + + sum0[0].r += dot(v0, k0); + sum0[0].g += dot(v0, k1); + sum0[0].b += dot(v0, k2); + sum0[0].a += dot(v0, k3); + sum0[1].r += dot(v0, k4); + sum0[1].g += dot(v0, k5); + sum0[1].b += dot(v0, k6); + sum0[1].a += dot(v0, k7); + + sum1[0].r += dot(v1, k0); + sum1[0].g += dot(v1, k1); + sum1[0].b += dot(v1, k2); + sum1[0].a += dot(v1, k3); + sum1[1].r += dot(v1, k4); + sum1[1].g += dot(v1, k5); + 
sum1[1].b += dot(v1, k6); + sum1[1].a += dot(v1, k7); + + sum2[0].r += dot(v0, k8); + sum2[0].g += dot(v0, k9); + sum2[0].b += dot(v0, ka); + sum2[0].a += dot(v0, kb); + sum2[1].r += dot(v0, kc); + sum2[1].g += dot(v0, kd); + sum2[1].b += dot(v0, ke); + sum2[1].a += dot(v0, kf); + + sum3[0].r += dot(v1, k8); + sum3[0].g += dot(v1, k9); + sum3[0].b += dot(v1, ka); + sum3[0].a += dot(v1, kb); + sum3[1].r += dot(v1, kc); + sum3[1].g += dot(v1, kd); + sum3[1].b += dot(v1, ke); + sum3[1].a += dot(v1, kf); + } + v_offset += psc(w); + w_offset += kernel_w; + } + +#endif + + sum0 = activation_afpvec8(sum0, activation_type, activation_param_0, activation_param_1); + sum1 = activation_afpvec8(sum1, activation_type, activation_param_0, activation_param_1); + sum2 = activation_afpvec8(sum2, activation_type, activation_param_0, activation_param_1); + sum3 = activation_afpvec8(sum3, activation_type, activation_param_0, activation_param_1); + +#if NCNN_image_shader + + image3d_st8(top_blob, ivec3(gx2.x, gy2.x, 0), sum0); + image3d_st8(top_blob, ivec3(gx2.y, gy2.x, 0), sum1); + image3d_st8(top_blob, ivec3(gx2.x, gy2.y, 0), sum2); + image3d_st8(top_blob, ivec3(gx2.y, gy2.y, 0), sum3); + +#else + + const int gi = gy * psc(outw) + gx; + + buffer_st8(top_blob_data, gi, sum0); + if (gx + 1 < psc(outw)) buffer_st8(top_blob_data, gi + 1, sum1); + if (gy + 1 < psc(outh)) buffer_st8(top_blob_data, gi + psc(outw), sum2); + if (gy + 1 < psc(outh) && gx + 1 < psc(outw)) buffer_st8(top_blob_data, gi + psc(outw) + 1, sum3); + +#endif +} \ No newline at end of file diff --git a/src/layer/vulkan/shader/convolution1d_pack8.comp b/src/layer/vulkan/shader/convolution1d_pack8.comp new file mode 100644 index 00000000000..fff72ade829 --- /dev/null +++ b/src/layer/vulkan/shader/convolution1d_pack8.comp @@ -0,0 +1,270 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +#extension GL_GOOGLE_include_directive: enable +#include "vulkan_activation.comp" + +layout (constant_id = 0) const int kernel_w = 1; +layout (constant_id = 1) const int dilation_w = 1; +layout (constant_id = 2) const int stride_w = 1; +layout (constant_id = 3) const int bias_term = 0; +layout (constant_id = 4) const int activation_type = 0; +layout (constant_id = 5) const float activation_param_0 = 0; +layout (constant_id = 6) const float activation_param_1 = 0; + +#define shape_constant_id_offset 7 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob; +layout (binding = 2) uniform unfp sampler3D weight_blob; +layout (binding = 3) uniform unfp sampler3D bias_blob; +#else +layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; }; +layout (binding = 2) readonly buffer weight_blob { sfpvec8 weight_data[]; }; +layout (binding = 3) readonly buffer bias_blob { sfpvec8 bias_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x) * 2; + int gy = int(gl_GlobalInvocationID.y) * 2; + + if (gx >= psc(outw) || gy >= psc(outh)) + return; + + const ivec2 gx2 = gx + ivec2(0, 1); + const ivec2 gy2 = gy + ivec2(0, 1); + + afpvec8 sum0 = afpvec8(afpvec4(0.0f), afpvec4(0.0f)); + afpvec8 sum1 = afpvec8(afpvec4(0.0f), afpvec4(0.0f)); + afpvec8 sum2 = afpvec8(afpvec4(0.0f), afpvec4(0.0f)); + afpvec8 sum3 = afpvec8(afpvec4(0.0f), afpvec4(0.0f)); + + if (bias_term == 1) + { +#if NCNN_image_shader + sum0 = image3d_ld8(bias_blob, ivec3(gy2.x, 0, 0)); + sum2 = image3d_ld8(bias_blob, ivec3(gy2.y, 0, 0)); +#else + sum0 = buffer_ld8(bias_data, gy2.x); + sum2 = buffer_ld8(bias_data, gy2.y); +#endif + sum1 = sum0; + sum3 = sum2; + } + +#if NCNN_image_shader + + ivec2 v_offset = gx2 * stride_w; + + for (int y = 0; y < psc(h); y++) + { + int wx = 0; + + for (int x = 0; x < kernel_w; x++) + { + afpvec8 v0 = image3d_ld8(bottom_blob, ivec3(v_offset.x + x * dilation_w, y, 0)); + afpvec8 v1 = image3d_ld8(bottom_blob, ivec3(v_offset.y + x * dilation_w, y, 0)); + + afpvec8 k0 = image3d_ld8(weight_blob, ivec3(wx + 0, y, gy2.x)); + afpvec8 k1 = image3d_ld8(weight_blob, ivec3(wx + 1, y, gy2.x)); + afpvec8 k2 = image3d_ld8(weight_blob, ivec3(wx + 2, y, gy2.x)); + afpvec8 k3 = 
image3d_ld8(weight_blob, ivec3(wx + 3, y, gy2.x)); + afpvec8 k4 = image3d_ld8(weight_blob, ivec3(wx + 4, y, gy2.x)); + afpvec8 k5 = image3d_ld8(weight_blob, ivec3(wx + 5, y, gy2.x)); + afpvec8 k6 = image3d_ld8(weight_blob, ivec3(wx + 6, y, gy2.x)); + afpvec8 k7 = image3d_ld8(weight_blob, ivec3(wx + 7, y, gy2.x)); + + afpvec8 k8 = image3d_ld8(weight_blob, ivec3(wx + 0, y, gy2.y)); + afpvec8 k9 = image3d_ld8(weight_blob, ivec3(wx + 1, y, gy2.y)); + afpvec8 ka = image3d_ld8(weight_blob, ivec3(wx + 2, y, gy2.y)); + afpvec8 kb = image3d_ld8(weight_blob, ivec3(wx + 3, y, gy2.y)); + afpvec8 kc = image3d_ld8(weight_blob, ivec3(wx + 4, y, gy2.y)); + afpvec8 kd = image3d_ld8(weight_blob, ivec3(wx + 5, y, gy2.y)); + afpvec8 ke = image3d_ld8(weight_blob, ivec3(wx + 6, y, gy2.y)); + afpvec8 kf = image3d_ld8(weight_blob, ivec3(wx + 7, y, gy2.y)); + + sum0[0].r += dot(v0[0], k0[0]) + dot(v0[1], k0[1]); + sum0[0].g += dot(v0[0], k1[0]) + dot(v0[1], k1[1]); + sum0[0].b += dot(v0[0], k2[0]) + dot(v0[1], k2[1]); + sum0[0].a += dot(v0[0], k3[0]) + dot(v0[1], k3[1]); + sum0[1].r += dot(v0[0], k4[0]) + dot(v0[1], k4[1]); + sum0[1].g += dot(v0[0], k5[0]) + dot(v0[1], k5[1]); + sum0[1].b += dot(v0[0], k6[0]) + dot(v0[1], k6[1]); + sum0[1].a += dot(v0[0], k7[0]) + dot(v0[1], k7[1]); + + sum1[0].r += dot(v1[0], k0[0]) + dot(v1[1], k0[1]); + sum1[0].g += dot(v1[0], k1[0]) + dot(v1[1], k1[1]); + sum1[0].b += dot(v1[0], k2[0]) + dot(v1[1], k2[1]); + sum1[0].a += dot(v1[0], k3[0]) + dot(v1[1], k3[1]); + sum1[1].r += dot(v1[0], k4[0]) + dot(v1[1], k4[1]); + sum1[1].g += dot(v1[0], k5[0]) + dot(v1[1], k5[1]); + sum1[1].b += dot(v1[0], k6[0]) + dot(v1[1], k6[1]); + sum1[1].a += dot(v1[0], k7[0]) + dot(v1[1], k7[1]); + + sum2[0].r += dot(v0[0], k8[0]) + dot(v0[1], k8[1]); + sum2[0].g += dot(v0[0], k9[0]) + dot(v0[1], k9[1]); + sum2[0].b += dot(v0[0], ka[0]) + dot(v0[1], ka[1]); + sum2[0].a += dot(v0[0], kb[0]) + dot(v0[1], kb[1]); + sum2[1].r += dot(v0[0], kc[0]) + dot(v0[1], kc[1]); + sum2[1].g += dot(v0[0], kd[0]) + dot(v0[1], kd[1]); + sum2[1].b += dot(v0[0], ke[0]) + dot(v0[1], ke[1]); + sum2[1].a += dot(v0[0], kf[0]) + dot(v0[1], kf[1]); + + sum3[0].r += dot(v1[0], k8[0]) + dot(v1[1], k8[1]); + sum3[0].g += dot(v1[0], k9[0]) + dot(v1[1], k9[1]); + sum3[0].b += dot(v1[0], ka[0]) + dot(v1[1], ka[1]); + sum3[0].a += dot(v1[0], kb[0]) + dot(v1[1], kb[1]); + sum3[1].r += dot(v1[0], kc[0]) + dot(v1[1], kc[1]); + sum3[1].g += dot(v1[0], kd[0]) + dot(v1[1], kd[1]); + sum3[1].b += dot(v1[0], ke[0]) + dot(v1[1], ke[1]); + sum3[1].a += dot(v1[0], kf[0]) + dot(v1[1], kf[1]); + + wx += 8; + } + } + +#else + + ivec2 v_offset = gx2 * stride_w; + ivec2 w_offset = gy2 * psc(h) * kernel_w; + + for (int y = 0; y < psc(h); y++) + { + for (int x = 0; x < kernel_w; x++) + { + afpvec8 v0 = buffer_ld8(bottom_blob_data, v_offset.x + x * dilation_w); + afpvec8 v1 = buffer_ld8(bottom_blob_data, v_offset.y + x * dilation_w); + + afpvec8 k0 = buffer_ld8(weight_data, (w_offset.x + x) * 8 + 0); + afpvec8 k1 = buffer_ld8(weight_data, (w_offset.x + x) * 8 + 1); + afpvec8 k2 = buffer_ld8(weight_data, (w_offset.x + x) * 8 + 2); + afpvec8 k3 = buffer_ld8(weight_data, (w_offset.x + x) * 8 + 3); + afpvec8 k4 = buffer_ld8(weight_data, (w_offset.x + x) * 8 + 4); + afpvec8 k5 = buffer_ld8(weight_data, (w_offset.x + x) * 8 + 5); + afpvec8 k6 = buffer_ld8(weight_data, (w_offset.x + x) * 8 + 6); + afpvec8 k7 = buffer_ld8(weight_data, (w_offset.x + x) * 8 + 7); + + afpvec8 k8 = buffer_ld8(weight_data, (w_offset.y + x) * 8 + 0); + afpvec8 k9 = 
buffer_ld8(weight_data, (w_offset.y + x) * 8 + 1); + afpvec8 ka = buffer_ld8(weight_data, (w_offset.y + x) * 8 + 2); + afpvec8 kb = buffer_ld8(weight_data, (w_offset.y + x) * 8 + 3); + afpvec8 kc = buffer_ld8(weight_data, (w_offset.y + x) * 8 + 4); + afpvec8 kd = buffer_ld8(weight_data, (w_offset.y + x) * 8 + 5); + afpvec8 ke = buffer_ld8(weight_data, (w_offset.y + x) * 8 + 6); + afpvec8 kf = buffer_ld8(weight_data, (w_offset.y + x) * 8 + 7); + + sum0[0].r += dot(v0[0], k0[0]) + dot(v0[1], k0[1]); + sum0[0].g += dot(v0[0], k1[0]) + dot(v0[1], k1[1]); + sum0[0].b += dot(v0[0], k2[0]) + dot(v0[1], k2[1]); + sum0[0].a += dot(v0[0], k3[0]) + dot(v0[1], k3[1]); + sum0[1].r += dot(v0[0], k4[0]) + dot(v0[1], k4[1]); + sum0[1].g += dot(v0[0], k5[0]) + dot(v0[1], k5[1]); + sum0[1].b += dot(v0[0], k6[0]) + dot(v0[1], k6[1]); + sum0[1].a += dot(v0[0], k7[0]) + dot(v0[1], k7[1]); + + sum1[0].r += dot(v1[0], k0[0]) + dot(v1[1], k0[1]); + sum1[0].g += dot(v1[0], k1[0]) + dot(v1[1], k1[1]); + sum1[0].b += dot(v1[0], k2[0]) + dot(v1[1], k2[1]); + sum1[0].a += dot(v1[0], k3[0]) + dot(v1[1], k3[1]); + sum1[1].r += dot(v1[0], k4[0]) + dot(v1[1], k4[1]); + sum1[1].g += dot(v1[0], k5[0]) + dot(v1[1], k5[1]); + sum1[1].b += dot(v1[0], k6[0]) + dot(v1[1], k6[1]); + sum1[1].a += dot(v1[0], k7[0]) + dot(v1[1], k7[1]); + + sum2[0].r += dot(v0[0], k8[0]) + dot(v0[1], k8[1]); + sum2[0].g += dot(v0[0], k9[0]) + dot(v0[1], k9[1]); + sum2[0].b += dot(v0[0], ka[0]) + dot(v0[1], ka[1]); + sum2[0].a += dot(v0[0], kb[0]) + dot(v0[1], kb[1]); + sum2[1].r += dot(v0[0], kc[0]) + dot(v0[1], kc[1]); + sum2[1].g += dot(v0[0], kd[0]) + dot(v0[1], kd[1]); + sum2[1].b += dot(v0[0], ke[0]) + dot(v0[1], ke[1]); + sum2[1].a += dot(v0[0], kf[0]) + dot(v0[1], kf[1]); + + sum3[0].r += dot(v1[0], k8[0]) + dot(v1[1], k8[1]); + sum3[0].g += dot(v1[0], k9[0]) + dot(v1[1], k9[1]); + sum3[0].b += dot(v1[0], ka[0]) + dot(v1[1], ka[1]); + sum3[0].a += dot(v1[0], kb[0]) + dot(v1[1], kb[1]); + sum3[1].r += dot(v1[0], kc[0]) + dot(v1[1], kc[1]); + sum3[1].g += dot(v1[0], kd[0]) + dot(v1[1], kd[1]); + sum3[1].b += dot(v1[0], ke[0]) + dot(v1[1], ke[1]); + sum3[1].a += dot(v1[0], kf[0]) + dot(v1[1], kf[1]); + } + v_offset += psc(w); + w_offset += kernel_w; + } + +#endif + + sum0 = activation_afpvec8(sum0, activation_type, activation_param_0, activation_param_1); + sum1 = activation_afpvec8(sum1, activation_type, activation_param_0, activation_param_1); + sum2 = activation_afpvec8(sum2, activation_type, activation_param_0, activation_param_1); + sum3 = activation_afpvec8(sum3, activation_type, activation_param_0, activation_param_1); + +#if NCNN_image_shader + + image3d_st8(top_blob, ivec3(gx2.x, gy2.x, 0), sum0); + image3d_st8(top_blob, ivec3(gx2.y, gy2.x, 0), sum1); + image3d_st8(top_blob, ivec3(gx2.x, gy2.y, 0), sum2); + image3d_st8(top_blob, ivec3(gx2.y, gy2.y, 0), sum3); + +#else + + const int gi = gy * psc(outw) + gx; + + buffer_st8(top_blob_data, gi, sum0); + if (gx + 1 < psc(outw)) buffer_st8(top_blob_data, gi + 1, sum1); + if (gy + 1 < psc(outh)) buffer_st8(top_blob_data, gi + psc(outw), sum2); + if (gy + 1 < psc(outh) && gx + 1 < psc(outw)) buffer_st8(top_blob_data, gi + psc(outw) + 1, sum3); + +#endif +} \ No newline at end of file diff --git a/src/layer/vulkan/shader/convolution1d_pack8to1.comp b/src/layer/vulkan/shader/convolution1d_pack8to1.comp new file mode 100644 index 00000000000..9d08d3b11af --- /dev/null +++ b/src/layer/vulkan/shader/convolution1d_pack8to1.comp @@ -0,0 +1,178 @@ +// Tencent is pleased to support the open source 
community by making ncnn available. +// +// Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +#extension GL_GOOGLE_include_directive: enable +#include "vulkan_activation.comp" + +layout (constant_id = 0) const int kernel_w = 1; +layout (constant_id = 1) const int dilation_w = 1; +layout (constant_id = 2) const int stride_w = 1; +layout (constant_id = 3) const int bias_term = 0; +layout (constant_id = 4) const int activation_type = 0; +layout (constant_id = 5) const float activation_param_0 = 0; +layout (constant_id = 6) const float activation_param_1 = 0; + +#define shape_constant_id_offset 7 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc1) writeonly uniform unfp image3D top_blob; +layout (binding = 2) uniform unfp sampler3D weight_blob; +layout (binding = 3) uniform unfp sampler3D bias_blob; +#else +layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; }; +layout (binding = 2) readonly buffer weight_blob { sfpvec8 weight_data[]; }; +layout (binding = 3) readonly buffer bias_blob { sfp bias_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x) * 2; + int gy = int(gl_GlobalInvocationID.y) * 2; + + if (gx >= psc(outw) || gy >= psc(outh)) + return; + + const ivec2 gx2 = gx + ivec2(0, 1); + const ivec2 gy2 = gy + ivec2(0, 1); + + afp sum0 = afp(0.0f); + afp sum1 = afp(0.0f); + afp sum2 = afp(0.0f); + afp sum3 = afp(0.0f); + + if (bias_term == 1) + { +#if NCNN_image_shader + sum0 = image3d_ld1(bias_blob, ivec3(gy2.x, 0, 0)); + sum2 = image3d_ld1(bias_blob, ivec3(gy2.y, 0, 0)); +#else + sum0 = buffer_ld1(bias_data, gy2.x); + sum2 = buffer_ld1(bias_data, gy2.y); +#endif + sum1 = sum0; + sum3 = sum2; + } + +#if 
NCNN_image_shader + + ivec2 v_offset = gx2 * stride_w; + + for (int y = 0; y < psc(h); y++) + { + int wx = 0; + + for (int x = 0; x < kernel_w; x++) + { + afpvec8 v0 = image3d_ld8(bottom_blob, ivec3(v_offset.x + x * dilation_w, y, 0)); + afpvec8 v1 = image3d_ld8(bottom_blob, ivec3(v_offset.y + x * dilation_w, y, 0)); + + afpvec8 k0 = image3d_ld8(weight_blob, ivec3(wx, y, gy2.x)); + afpvec8 k1 = image3d_ld8(weight_blob, ivec3(wx, y, gy2.y)); + + sum0 += dot(v0[0], k0[0]) + dot(v0[1], k0[1]); + sum1 += dot(v1[0], k0[0]) + dot(v1[1], k0[1]); + sum2 += dot(v0[0], k1[0]) + dot(v0[1], k1[1]); + sum3 += dot(v1[0], k1[0]) + dot(v1[1], k1[1]); + + wx += 1; + } + } + +#else + + ivec2 v_offset = gx2 * stride_w; + ivec2 w_offset = gy2 * psc(h) * kernel_w; + + for (int y = 0; y < psc(h); y++) + { + for (int x = 0; x < kernel_w; x++) + { + afpvec8 v0 = buffer_ld8(bottom_blob_data, v_offset.x + x * dilation_w); + afpvec8 v1 = buffer_ld8(bottom_blob_data, v_offset.y + x * dilation_w); + + afpvec8 k0 = buffer_ld8(weight_data, w_offset.x + x); + afpvec8 k1 = buffer_ld8(weight_data, w_offset.y + x); + + sum0 += dot(v0[0], k0[0]) + dot(v0[1], k0[1]); + sum1 += dot(v1[0], k0[0]) + dot(v1[1], k0[1]); + sum2 += dot(v0[0], k1[0]) + dot(v0[1], k1[1]); + sum3 += dot(v1[0], k1[0]) + dot(v1[1], k1[1]); + } + v_offset += psc(w); + w_offset += kernel_w; + } + +#endif + + sum0 = activation_afp(sum0, activation_type, activation_param_0, activation_param_1); + sum1 = activation_afp(sum1, activation_type, activation_param_0, activation_param_1); + sum2 = activation_afp(sum2, activation_type, activation_param_0, activation_param_1); + sum3 = activation_afp(sum3, activation_type, activation_param_0, activation_param_1); + +#if NCNN_image_shader + + image3d_st1(top_blob, ivec3(gx2.x, gy2.x, 0), sum0); + image3d_st1(top_blob, ivec3(gx2.y, gy2.x, 0), sum1); + image3d_st1(top_blob, ivec3(gx2.x, gy2.y, 0), sum2); + image3d_st1(top_blob, ivec3(gx2.y, gy2.y, 0), sum3); + +#else + + const int gi = gy * psc(outw) + gx; + + buffer_st1(top_blob_data, gi, sum0); + if (gx + 1 < psc(outw)) buffer_st1(top_blob_data, gi + 1, sum1); + if (gy + 1 < psc(outh)) buffer_st1(top_blob_data, gi + psc(outw), sum2); + if (gy + 1 < psc(outh) && gx + 1 < psc(outw)) buffer_st1(top_blob_data, gi + psc(outw) + 1, sum3); + +#endif +} \ No newline at end of file diff --git a/src/layer/vulkan/shader/convolution1d_pack8to4.comp b/src/layer/vulkan/shader/convolution1d_pack8to4.comp new file mode 100644 index 00000000000..86ca696d584 --- /dev/null +++ b/src/layer/vulkan/shader/convolution1d_pack8to4.comp @@ -0,0 +1,220 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +#extension GL_GOOGLE_include_directive: enable +#include "vulkan_activation.comp" + +layout (constant_id = 0) const int kernel_w = 1; +layout (constant_id = 1) const int dilation_w = 1; +layout (constant_id = 2) const int stride_w = 1; +layout (constant_id = 3) const int bias_term = 0; +layout (constant_id = 4) const int activation_type = 0; +layout (constant_id = 5) const float activation_param_0 = 0; +layout (constant_id = 6) const float activation_param_1 = 0; + +#define shape_constant_id_offset 7 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob; +layout (binding = 2) uniform unfp sampler3D weight_blob; +layout (binding = 3) uniform unfp sampler3D bias_blob; +#else +layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; +layout (binding = 2) readonly buffer weight_blob { sfpvec8 weight_data[]; }; +layout (binding = 3) readonly buffer bias_blob { sfpvec4 bias_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x) * 2; + int gy = int(gl_GlobalInvocationID.y) * 2; + + if (gx >= psc(outw) || gy >= psc(outh)) + return; + + const ivec2 gx2 = gx + ivec2(0, 1); + const ivec2 gy2 = gy + ivec2(0, 1); + + afpvec4 sum0 = afpvec4(0.0f); + afpvec4 sum1 = afpvec4(0.0f); + afpvec4 sum2 = afpvec4(0.0f); + afpvec4 sum3 = afpvec4(0.0f); + + if (bias_term == 1) + { +#if NCNN_image_shader + sum0 = image3d_ld4(bias_blob, ivec3(gy2.x, 0, 0)); + sum2 = image3d_ld4(bias_blob, ivec3(gy2.y, 0, 0)); +#else + sum0 = buffer_ld4(bias_data, gy2.x); + sum2 = buffer_ld4(bias_data, gy2.y); +#endif + sum1 = sum0; + sum3 = sum2; + } + +#if NCNN_image_shader + + ivec2 v_offset = gx2 * stride_w; + + for (int y = 0; y < psc(h); y++) + { + int wx = 0; + + for (int x = 0; x < kernel_w; x++) + { + afpvec8 v0 = image3d_ld8(bottom_blob, ivec3(v_offset.x + x * dilation_w, y, 0)); + afpvec8 v1 = image3d_ld8(bottom_blob, ivec3(v_offset.y + x * dilation_w, y, 0)); + + afpvec8 k0 = image3d_ld8(weight_blob, ivec3(wx + 0, y, gy2.x)); + afpvec8 k1 = image3d_ld8(weight_blob, ivec3(wx + 1, y, gy2.x)); + afpvec8 k2 = image3d_ld8(weight_blob, ivec3(wx + 2, y, gy2.x)); + afpvec8 k3 = image3d_ld8(weight_blob, ivec3(wx + 3, y, gy2.x)); + afpvec8 k4 = image3d_ld8(weight_blob, 
ivec3(wx + 0, y, gy2.y)); + afpvec8 k5 = image3d_ld8(weight_blob, ivec3(wx + 1, y, gy2.y)); + afpvec8 k6 = image3d_ld8(weight_blob, ivec3(wx + 2, y, gy2.y)); + afpvec8 k7 = image3d_ld8(weight_blob, ivec3(wx + 3, y, gy2.y)); + + sum0.r += dot(v0[0], k0[0]) + dot(v0[1], k0[1]); + sum0.g += dot(v0[0], k1[0]) + dot(v0[1], k1[1]); + sum0.b += dot(v0[0], k2[0]) + dot(v0[1], k2[1]); + sum0.a += dot(v0[0], k3[0]) + dot(v0[1], k3[1]); + + sum1.r += dot(v1[0], k0[0]) + dot(v1[1], k0[1]); + sum1.g += dot(v1[0], k1[0]) + dot(v1[1], k1[1]); + sum1.b += dot(v1[0], k2[0]) + dot(v1[1], k2[1]); + sum1.a += dot(v1[0], k3[0]) + dot(v1[1], k3[1]); + + sum2.r += dot(v0[0], k4[0]) + dot(v0[1], k4[1]); + sum2.g += dot(v0[0], k5[0]) + dot(v0[1], k5[1]); + sum2.b += dot(v0[0], k6[0]) + dot(v0[1], k6[1]); + sum2.a += dot(v0[0], k7[0]) + dot(v0[1], k7[1]); + + sum3.r += dot(v1[0], k4[0]) + dot(v1[1], k4[1]); + sum3.g += dot(v1[0], k5[0]) + dot(v1[1], k5[1]); + sum3.b += dot(v1[0], k6[0]) + dot(v1[1], k6[1]); + sum3.a += dot(v1[0], k7[0]) + dot(v1[1], k7[1]); + + wx += 4; + } + } + +#else + + ivec2 v_offset = gx2 * stride_w; + ivec2 w_offset = gy2 * psc(h) * kernel_w; + + for (int y = 0; y < psc(h); y++) + { + for (int x = 0; x < kernel_w; x++) + { + afpvec8 v0 = buffer_ld8(bottom_blob_data, v_offset.x + x * dilation_w); + afpvec8 v1 = buffer_ld8(bottom_blob_data, v_offset.y + x * dilation_w); + + afpvec8 k0 = buffer_ld8(weight_data, (w_offset.x + x) * 4 + 0); + afpvec8 k1 = buffer_ld8(weight_data, (w_offset.x + x) * 4 + 1); + afpvec8 k2 = buffer_ld8(weight_data, (w_offset.x + x) * 4 + 2); + afpvec8 k3 = buffer_ld8(weight_data, (w_offset.x + x) * 4 + 3); + afpvec8 k4 = buffer_ld8(weight_data, (w_offset.y + x) * 4 + 0); + afpvec8 k5 = buffer_ld8(weight_data, (w_offset.y + x) * 4 + 1); + afpvec8 k6 = buffer_ld8(weight_data, (w_offset.y + x) * 4 + 2); + afpvec8 k7 = buffer_ld8(weight_data, (w_offset.y + x) * 4 + 3); + + sum0.r += dot(v0[0], k0[0]) + dot(v0[1], k0[1]); + sum0.g += dot(v0[0], k1[0]) + dot(v0[1], k1[1]); + sum0.b += dot(v0[0], k2[0]) + dot(v0[1], k2[1]); + sum0.a += dot(v0[0], k3[0]) + dot(v0[1], k3[1]); + + sum1.r += dot(v1[0], k0[0]) + dot(v1[1], k0[1]); + sum1.g += dot(v1[0], k1[0]) + dot(v1[1], k1[1]); + sum1.b += dot(v1[0], k2[0]) + dot(v1[1], k2[1]); + sum1.a += dot(v1[0], k3[0]) + dot(v1[1], k3[1]); + + sum2.r += dot(v0[0], k4[0]) + dot(v0[1], k4[1]); + sum2.g += dot(v0[0], k5[0]) + dot(v0[1], k5[1]); + sum2.b += dot(v0[0], k6[0]) + dot(v0[1], k6[1]); + sum2.a += dot(v0[0], k7[0]) + dot(v0[1], k7[1]); + + sum3.r += dot(v1[0], k4[0]) + dot(v1[1], k4[1]); + sum3.g += dot(v1[0], k5[0]) + dot(v1[1], k5[1]); + sum3.b += dot(v1[0], k6[0]) + dot(v1[1], k6[1]); + sum3.a += dot(v1[0], k7[0]) + dot(v1[1], k7[1]); + } + v_offset += psc(w); + w_offset += kernel_w; + } + +#endif + + sum0 = activation_afpvec4(sum0, activation_type, activation_param_0, activation_param_1); + sum1 = activation_afpvec4(sum1, activation_type, activation_param_0, activation_param_1); + sum2 = activation_afpvec4(sum2, activation_type, activation_param_0, activation_param_1); + sum3 = activation_afpvec4(sum3, activation_type, activation_param_0, activation_param_1); + +#if NCNN_image_shader + + image3d_st4(top_blob, ivec3(gx2.x, gy2.x, 0), sum0); + image3d_st4(top_blob, ivec3(gx2.y, gy2.x, 0), sum1); + image3d_st4(top_blob, ivec3(gx2.x, gy2.y, 0), sum2); + image3d_st4(top_blob, ivec3(gx2.y, gy2.y, 0), sum3); + +#else + + const int gi = gy * psc(outw) + gx; + + buffer_st4(top_blob_data, gi, sum0); + if (gx + 1 < psc(outw)) 
buffer_st4(top_blob_data, gi + 1, sum1); + if (gy + 1 < psc(outh)) buffer_st4(top_blob_data, gi + psc(outw), sum2); + if (gy + 1 < psc(outh) && gx + 1 < psc(outw)) buffer_st4(top_blob_data, gi + psc(outw) + 1, sum3); + +#endif +} \ No newline at end of file
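
All of the shader variants above compute the same Convolution1D: each invocation produces a 2x2 tile of outputs (two output positions gx2, two output channel groups gy2), accumulates dot products of packed input vectors against packed weights over all input rows and kernel taps, applies the fused activation, and guards the three extra stores against the right and bottom edges. For orientation, here is a minimal scalar C++ sketch of that arithmetic, assuming plain float storage with no packing and padding already applied; the function and parameter names are illustrative only, not ncnn API:

// Scalar reference for the Convolution1D the shaders above implement.
// Assumptions (illustrative, not ncnn API): bottom is h x w row-major floats
// with padding already applied; weight is num_output x h x kernel_w;
// bias has num_output entries; the activation step is omitted.
#include <vector>
#include <cstdio>

static std::vector<float> convolution1d_ref(const std::vector<float>& bottom, int w, int h,
                                            const std::vector<float>& weight,
                                            const std::vector<float>& bias,
                                            int num_output, int kernel_w,
                                            int dilation_w, int stride_w)
{
    const int outw = (w - dilation_w * (kernel_w - 1) - 1) / stride_w + 1;
    std::vector<float> top(num_output * outw, 0.f);

    for (int q = 0; q < num_output; q++)
    {
        for (int ox = 0; ox < outw; ox++)
        {
            float sum = bias.empty() ? 0.f : bias[q];

            // same loop nest as the shaders: input rows (h) outer, kernel taps inner
            for (int y = 0; y < h; y++)
            {
                for (int x = 0; x < kernel_w; x++)
                {
                    const float v = bottom[y * w + ox * stride_w + x * dilation_w];
                    const float k = weight[(q * h + y) * kernel_w + x];
                    sum += v * k;
                }
            }

            top[q * outw + ox] = sum;
        }
    }

    return top;
}

int main()
{
    // 1 input row of 8 samples, 1 output channel, kernel 3, stride 1, dilation 1
    std::vector<float> bottom = {1, 2, 3, 4, 5, 6, 7, 8};
    std::vector<float> weight = {1, 0, -1};
    std::vector<float> bias = {0.5f};

    std::vector<float> top = convolution1d_ref(bottom, 8, 1, weight, bias, 1, 3, 1, 1);
    for (float v : top)
        printf("%f\n", v); // each window gives a - c = -2, plus bias -> -1.5
    return 0;
}

The GPU kernels parallelize the q loop as gy (packed by 4 or 8 channels per element) and the ox loop as gx, and the per-invocation 2x2 tile lets each weight load feed two output positions, which is why every variant reads k0..kN once but accumulates into sum0..sum3.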