From a934a96aa77b3516dd10c74501967749f1336290 Mon Sep 17 00:00:00 2001 From: FhqTreap <676807370@qq.com> Date: Wed, 11 Oct 2023 23:21:25 +0800 Subject: [PATCH] re commit --- src/layer/vulkan/convolution1d_vulkan.cpp | 429 ++++++++++++++++++ src/layer/vulkan/convolution1d_vulkan.h | 53 +++ src/layer/vulkan/shader/convolution1d.comp | 177 ++++++++ .../vulkan/shader/convolution1d_pack1to4.comp | 177 ++++++++ .../vulkan/shader/convolution1d_pack1to8.comp | 186 ++++++++ .../vulkan/shader/convolution1d_pack4.comp | 208 +++++++++ .../vulkan/shader/convolution1d_pack4to1.comp | 177 ++++++++ .../vulkan/shader/convolution1d_pack4to8.comp | 270 +++++++++++ .../vulkan/shader/convolution1d_pack8.comp | 270 +++++++++++ .../vulkan/shader/convolution1d_pack8to1.comp | 178 ++++++++ .../vulkan/shader/convolution1d_pack8to4.comp | 220 +++++++++ 11 files changed, 2345 insertions(+) create mode 100644 src/layer/vulkan/convolution1d_vulkan.cpp create mode 100644 src/layer/vulkan/convolution1d_vulkan.h create mode 100644 src/layer/vulkan/shader/convolution1d.comp create mode 100644 src/layer/vulkan/shader/convolution1d_pack1to4.comp create mode 100644 src/layer/vulkan/shader/convolution1d_pack1to8.comp create mode 100644 src/layer/vulkan/shader/convolution1d_pack4.comp create mode 100644 src/layer/vulkan/shader/convolution1d_pack4to1.comp create mode 100644 src/layer/vulkan/shader/convolution1d_pack4to8.comp create mode 100644 src/layer/vulkan/shader/convolution1d_pack8.comp create mode 100644 src/layer/vulkan/shader/convolution1d_pack8to1.comp create mode 100644 src/layer/vulkan/shader/convolution1d_pack8to4.comp diff --git a/src/layer/vulkan/convolution1d_vulkan.cpp b/src/layer/vulkan/convolution1d_vulkan.cpp new file mode 100644 index 00000000000..445e29345b8 --- /dev/null +++ b/src/layer/vulkan/convolution1d_vulkan.cpp @@ -0,0 +1,429 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "convolution1d_vulkan.h" + +#include "layer_shader_type.h" +#include "layer_type.h" + +namespace ncnn { + +Convolution1D_vulkan::Convolution1D_vulkan() +{ + support_vulkan = true; + support_image_storage = true; + + padding = 0; + + pipeline_convolution1d = 0; +} + +int Convolution1D_vulkan::create_pipeline(const Option& _opt) +{ + if (dynamic_weight) + { + support_vulkan = false; + support_image_storage = false; + return 0; + } + + Option opt = _opt; + + const int maxk = kernel_w; + int num_input = weight_data_size / maxk / num_output; + + int elempack = opt.use_shader_pack8 && num_input % 8 == 0 ? 8 : num_input % 4 == 0 ? 4 : 1; + int out_elempack = opt.use_shader_pack8 && num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 
4 : 1; + + { + padding = ncnn::create_layer(ncnn::LayerType::Padding); + padding->vkdev = vkdev; + + ncnn::ParamDict pd; + pd.set(0, 0); + pd.set(1, 0); + pd.set(2, pad_left); + pd.set(3, pad_right); + pd.set(4, 0); + pd.set(5, pad_value); + + padding->load_param(pd); + + padding->create_pipeline(opt); + } + + { + Mat weight_data_r2 = weight_data.reshape(maxk, num_input, num_output); + + weight_data_packed.create(maxk, num_input / elempack, num_output / out_elempack, (size_t)4 * elempack * out_elempack, elempack * out_elempack); + + for (int q = 0; q + (out_elempack - 1) < num_output; q += out_elempack) + { + float* g00 = weight_data_packed.channel(q / out_elempack); + + for (int p = 0; p + (elempack - 1) < num_input; p += elempack) + { + for (int k = 0; k < maxk; k++) + { + for (int i = 0; i < out_elempack; i++) + { + const Mat k0 = weight_data_r2.channel(q + i); + + for (int j = 0; j < elempack; j++) + { + const float* k00 = k0.row(p + j); + g00[0] = k00[k]; + g00++; + } + } + } + } + } + } + + if (bias_term) + { + convert_packing(bias_data, bias_data_packed, out_elempack, opt); + } + + { + std::vector<vk_specialization_type> specializations(7 + 10); + specializations[0].i = kernel_w; + specializations[1].i = dilation_w; + specializations[2].i = stride_w; + specializations[3].i = bias_term; + specializations[4].i = activation_type; + specializations[5].f = activation_params.w >= 1 ? activation_params[0] : 0.f; + specializations[6].f = activation_params.w == 2 ? activation_params[1] : 0.f; + specializations[7 + 0].i = 0; + specializations[7 + 1].i = 0; + specializations[7 + 2].i = 0; + specializations[7 + 3].i = 0; + specializations[7 + 4].i = 0; + specializations[7 + 5].i = 0; + specializations[7 + 6].i = 0; + specializations[7 + 7].i = 0; + specializations[7 + 8].i = 0; + specializations[7 + 9].i = 0; + + int shader_type_index = -1; + if (elempack == 1 && out_elempack == 1) shader_type_index = LayerShaderType::convolution1d; + if (elempack == 4 && out_elempack == 4) shader_type_index = LayerShaderType::convolution1d_pack4; + if (elempack == 1 && out_elempack == 4) shader_type_index = LayerShaderType::convolution1d_pack1to4; + if (elempack == 4 && out_elempack == 1) shader_type_index = LayerShaderType::convolution1d_pack4to1; + if (elempack == 8 && out_elempack == 8) shader_type_index = LayerShaderType::convolution1d_pack8; + if (elempack == 1 && out_elempack == 8) shader_type_index = LayerShaderType::convolution1d_pack1to8; + if (elempack == 8 && out_elempack == 1) shader_type_index = LayerShaderType::convolution1d_pack8to1; + if (elempack == 4 && out_elempack == 8) shader_type_index = LayerShaderType::convolution1d_pack4to8; + if (elempack == 8 && out_elempack == 4) shader_type_index = LayerShaderType::convolution1d_pack8to4; + + pipeline_convolution1d = new Pipeline(vkdev); + pipeline_convolution1d->set_optimal_local_size_xyz(1, 1, 1); + pipeline_convolution1d->create(shader_type_index, opt, specializations); + } + + return 0; +} + +int Convolution1D_vulkan::destroy_pipeline(const Option& opt) +{ + if (padding) + { + padding->destroy_pipeline(opt); + delete padding; + padding = 0; + } + + delete pipeline_convolution1d; + pipeline_convolution1d = 0; + + return 0; +} + +int Convolution1D_vulkan::upload_model(VkTransfer& cmd, const Option& opt) +{ + if (padding) + { + padding->upload_model(cmd, opt); + } + + if (support_image_storage && opt.use_image_storage) + { + cmd.record_upload(weight_data_packed, weight_data_gpu_image, opt); + } + else + { + cmd.record_upload(weight_data_packed, weight_data_gpu, opt); + } 
+ + weight_data_packed.release(); + + if (bias_term) + { + if (support_image_storage && opt.use_image_storage) + { + cmd.record_upload(bias_data_packed, bias_data_gpu_image, opt); + } + else + { + cmd.record_upload(bias_data_packed, bias_data_gpu, opt); + } + + bias_data_packed.release(); + } + + return 0; +} + +int Convolution1D_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const +{ + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1; + + + VkMat bottom_blob_bordered = bottom_blob; + if (pad_left > 0 || pad_right > 0) + { + Option opt_pad = opt; + opt_pad.blob_vkallocator = opt.workspace_vkallocator; + + padding->forward(bottom_blob, bottom_blob_bordered, cmd, opt_pad); + } + else if (pad_left == -233 && pad_right == -233) + { + int wpad = kernel_extent_w + (w - 1) / stride_w * stride_w - w; + if (wpad > 0) + { + Option opt_pad = opt; + opt_pad.blob_vkallocator = opt.workspace_vkallocator; + + VkMat padding_param_blob(6, (size_t)4u, 1, opt.staging_vkallocator); + int* padding_params = padding_param_blob.mapped(); + + padding_params[0] = 0; + padding_params[1] = 0; + padding_params[2] = wpad / 2; + padding_params[3] = wpad - wpad / 2; + padding_params[4] = 0; + padding_params[5] = 0; + std::vector<VkMat> padding_inputs(2); + padding_inputs[0] = bottom_blob; + padding_inputs[1] = padding_param_blob; + + std::vector<VkMat> padding_outputs(1); + padding->forward(padding_inputs, padding_outputs, cmd, opt_pad); + bottom_blob_bordered = padding_outputs[0]; + } + } + else if (pad_left == -234 && pad_right == -234) + { + int wpad = kernel_extent_w + (w - 1) / stride_w * stride_w - w; + if (wpad > 0) + { + Option opt_pad = opt; + opt_pad.blob_vkallocator = opt.workspace_vkallocator; + + VkMat padding_param_blob(6, (size_t)4u, 1, opt.staging_vkallocator); + int* padding_params = padding_param_blob.mapped(); + + padding_params[0] = 0; + padding_params[1] = 0; + padding_params[2] = wpad - wpad / 2; + padding_params[3] = wpad / 2; + padding_params[4] = 0; + padding_params[5] = 0; + + std::vector<VkMat> padding_inputs(2); + padding_inputs[0] = bottom_blob; + padding_inputs[1] = padding_param_blob; + + std::vector<VkMat> padding_outputs(1); + padding->forward(padding_inputs, padding_outputs, cmd, opt_pad); + bottom_blob_bordered = padding_outputs[0]; + } + } + + + int outw = (bottom_blob_bordered.w - kernel_extent_w) / stride_w + 1; + + int out_elempack = opt.use_shader_pack8 && num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 
4 : 1; + + size_t out_elemsize = elemsize / elempack * out_elempack; + + if (opt.use_fp16_packed && !opt.use_fp16_storage) + { + if (out_elempack == 8) out_elemsize = 8 * 2u; + if (out_elempack == 4) out_elemsize = 4 * 2u; + if (out_elempack == 1) out_elemsize = 4u; + } + + top_blob.create(outw, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator); + if (top_blob.empty()) + return -100; + + + std::vector<VkMat> bindings(4); + bindings[0] = bottom_blob_bordered; + bindings[1] = top_blob; + bindings[2] = weight_data_gpu; + bindings[3] = bias_data_gpu; + + std::vector<vk_constant_type> constants(10); + constants[0].i = bottom_blob_bordered.dims; + constants[1].i = bottom_blob_bordered.w; + constants[2].i = bottom_blob_bordered.h; + constants[3].i = bottom_blob_bordered.c; + constants[4].i = bottom_blob_bordered.cstep; + constants[5].i = top_blob.dims; + constants[6].i = top_blob.w; + constants[7].i = top_blob.h; + constants[8].i = top_blob.c; + constants[9].i = top_blob.cstep; + + VkMat dispatcher; + dispatcher.w = (top_blob.w + 1) / 2; + dispatcher.h = (top_blob.h + 1) / 2; + dispatcher.c = (top_blob.c + 1) / 2; + + cmd.record_pipeline(pipeline_convolution1d, bindings, constants, dispatcher); + + return 0; +} + +int Convolution1D_vulkan::forward(const VkImageMat& bottom_blob, VkImageMat& top_blob, VkCompute& cmd, const Option& opt) const +{ + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1; + + + VkImageMat bottom_blob_bordered = bottom_blob; + if (pad_left > 0 || pad_right > 0) + { + Option opt_pad = opt; + opt_pad.blob_vkallocator = opt.workspace_vkallocator; + + padding->forward(bottom_blob, bottom_blob_bordered, cmd, opt_pad); + } + else if (pad_left == -233 && pad_right == -233) + { + int wpad = kernel_extent_w + (w - 1) / stride_w * stride_w - w; + if (wpad > 0) + { + Option opt_pad = opt; + opt_pad.blob_vkallocator = opt.workspace_vkallocator; + + VkImageMat padding_param_blob(6, (size_t)4u, 1, opt.staging_vkallocator); + int* padding_params = padding_param_blob.mapped(); + + padding_params[0] = 0; + padding_params[1] = 0; + padding_params[2] = wpad / 2; + padding_params[3] = wpad - wpad / 2; + padding_params[4] = 0; + padding_params[5] = 0; + std::vector<VkImageMat> padding_inputs(2); + padding_inputs[0] = bottom_blob; + padding_inputs[1] = padding_param_blob; + + std::vector<VkImageMat> padding_outputs(1); + padding->forward(padding_inputs, padding_outputs, cmd, opt_pad); + bottom_blob_bordered = padding_outputs[0]; + } + } + else if (pad_left == -234 && pad_right == -234) + { + int wpad = kernel_extent_w + (w - 1) / stride_w * stride_w - w; + if (wpad > 0) + { + Option opt_pad = opt; + opt_pad.blob_vkallocator = opt.workspace_vkallocator; + + VkImageMat padding_param_blob(6, (size_t)4u, 1, opt.staging_vkallocator); + int* padding_params = padding_param_blob.mapped(); + + padding_params[0] = 0; + padding_params[1] = 0; + padding_params[2] = wpad - wpad / 2; + padding_params[3] = wpad / 2; + padding_params[4] = 0; + padding_params[5] = 0; + + std::vector<VkImageMat> padding_inputs(2); + padding_inputs[0] = bottom_blob; + padding_inputs[1] = padding_param_blob; + + std::vector<VkImageMat> padding_outputs(1); + padding->forward(padding_inputs, padding_outputs, cmd, opt_pad); + bottom_blob_bordered = padding_outputs[0]; + } + } + + + int outw = (bottom_blob_bordered.w - kernel_extent_w) / stride_w + 1; + + int out_elempack = opt.use_shader_pack8 && num_output % 
8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1; + + size_t out_elemsize = elemsize / elempack * out_elempack; + + if (opt.use_fp16_packed && !opt.use_fp16_storage) + { + if (out_elempack == 8) out_elemsize = 8 * 2u; + if (out_elempack == 4) out_elemsize = 4 * 2u; + if (out_elempack == 1) out_elemsize = 4u; + } + + top_blob.create(outw, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator); + if (top_blob.empty()) + return -100; + + + std::vector<VkImageMat> bindings(4); + bindings[0] = bottom_blob_bordered; + bindings[1] = top_blob; + bindings[2] = weight_data_gpu_image; + bindings[3] = bias_data_gpu_image; + + std::vector<vk_constant_type> constants(10); + constants[0].i = bottom_blob_bordered.dims; + constants[1].i = bottom_blob_bordered.w; + constants[2].i = bottom_blob_bordered.h; + constants[3].i = bottom_blob_bordered.c; + constants[4].i = 0; //bottom_blob_bordered.cstep; + constants[5].i = top_blob.dims; + constants[6].i = top_blob.w; + constants[7].i = top_blob.h; + constants[8].i = top_blob.c; + constants[9].i = 0; //top_blob.cstep; + + VkImageMat dispatcher; + dispatcher.w = (top_blob.w + 1) / 2; + dispatcher.h = (top_blob.h + 1) / 2; + dispatcher.c = (top_blob.c + 1) / 2; + + cmd.record_pipeline(pipeline_convolution1d, bindings, constants, dispatcher); + + return 0; +} + +} // namespace ncnn \ No newline at end of file diff --git a/src/layer/vulkan/convolution1d_vulkan.h b/src/layer/vulkan/convolution1d_vulkan.h new file mode 100644 index 00000000000..0356d1948cb --- /dev/null +++ b/src/layer/vulkan/convolution1d_vulkan.h @@ -0,0 +1,53 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +#ifndef LAYER_CONVOLUTION1D_VULKAN_H +#define LAYER_CONVOLUTION1D_VULKAN_H + +#include "convolution1d.h" + +namespace ncnn { + +class Convolution1D_vulkan : virtual public Convolution1D +{ +public: + Convolution1D_vulkan(); + + virtual int create_pipeline(const Option& opt); + virtual int destroy_pipeline(const Option& opt); + + virtual int upload_model(VkTransfer& cmd, const Option& opt); + + using Convolution1D::forward; + virtual int forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const; + virtual int forward(const VkImageMat& bottom_blob, VkImageMat& top_blob, VkCompute& cmd, const Option& opt) const; + +public: + ncnn::Layer* padding; + + Mat weight_data_packed; + Mat bias_data_packed; + + VkMat weight_data_gpu; + VkMat bias_data_gpu; + + VkImageMat weight_data_gpu_image; + VkImageMat bias_data_gpu_image; + + Pipeline* pipeline_convolution1d; +}; + +} // namespace ncnn + +#endif // LAYER_CONVOLUTION1D_VULKAN_H diff --git a/src/layer/vulkan/shader/convolution1d.comp b/src/layer/vulkan/shader/convolution1d.comp new file mode 100644 index 00000000000..3403a50f193 --- /dev/null +++ b/src/layer/vulkan/shader/convolution1d.comp @@ -0,0 +1,177 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +#extension GL_GOOGLE_include_directive: enable +#include "vulkan_activation.comp" + +layout (constant_id = 0) const int kernel_w = 1; +layout (constant_id = 1) const int dilation_w = 1; +layout (constant_id = 2) const int stride_w = 1; +layout (constant_id = 3) const int bias_term = 0; +layout (constant_id = 4) const int activation_type = 0; +layout (constant_id = 5) const float activation_param_0 = 0; +layout (constant_id = 6) const float activation_param_1 = 0; + +#define shape_constant_id_offset 7 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc1) writeonly uniform unfp image3D top_blob; +layout (binding = 2) uniform unfp sampler3D weight_blob; +layout (binding = 3) uniform unfp sampler3D bias_blob; +#else +layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; }; +layout (binding = 2) readonly buffer weight_blob { sfp weight_data[]; }; +layout (binding = 3) readonly buffer bias_blob { sfp bias_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x) * 2; + int gy = int(gl_GlobalInvocationID.y) * 2; + + if (gx >= psc(outw) || gy >= psc(outh)) + return; + + const ivec2 gx2 = gx + ivec2(0, 1); + const ivec2 gy2 = gy + ivec2(0, 1); + + afp sum0 = afp(0.0f); + afp sum1 = afp(0.0f); + afp sum2 = afp(0.0f); + afp sum3 = afp(0.0f); + + if (bias_term == 1) + { +#if NCNN_image_shader + sum0 = image3d_ld1(bias_blob, ivec3(gy2.x, 0, 0)); + sum2 = image3d_ld1(bias_blob, ivec3(gy2.y, 0, 0)); +#else + sum0 = buffer_ld1(bias_data, gy2.x); + sum2 = buffer_ld1(bias_data, gy2.y); +#endif + sum1 = sum0; + sum3 = sum2; + } + +#if NCNN_image_shader + + ivec2 v_offset = gx2 * stride_w; + + for (int y = 0; y < psc(h); y++) + { + int wx = 0; + + for (int x = 0; x < kernel_w; x++) + { + afp v0 = image3d_ld1(bottom_blob, ivec3(v_offset.x + x * dilation_w, y, 0)); + afp v1 = image3d_ld1(bottom_blob, ivec3(v_offset.y + x * dilation_w, y, 0)); + + afp k0 = image3d_ld1(weight_blob, ivec3(wx, y, gy2.x)); + afp k1 = image3d_ld1(weight_blob, ivec3(wx, y, gy2.y)); + + sum0 += v0 * k0; + sum1 += v1 * k0; + sum2 += v0 * k1; + sum3 += v1 * k1; + + wx += 1; + } + } + +#else + + ivec2 v_offset = gx2 * stride_w; + ivec2 w_offset = gy2 * psc(h) * kernel_w; + + for (int y = 0; y < psc(h); y++) + { + for (int x = 0; x < kernel_w; x++) + { + afp v0 = 
buffer_ld1(bottom_blob_data, v_offset.x + x * dilation_w); + afp v1 = buffer_ld1(bottom_blob_data, v_offset.y + x * dilation_w); + + afp k0 = buffer_ld1(weight_data, w_offset.x + x); + afp k1 = buffer_ld1(weight_data, w_offset.y + x); + + sum0 += v0 * k0; + sum1 += v1 * k0; + sum2 += v0 * k1; + sum3 += v1 * k1; + } + v_offset += psc(w); + w_offset += kernel_w; + } + +#endif + + sum0 = activation_afp(sum0, activation_type, activation_param_0, activation_param_1); + sum1 = activation_afp(sum1, activation_type, activation_param_0, activation_param_1); + sum2 = activation_afp(sum2, activation_type, activation_param_0, activation_param_1); + sum3 = activation_afp(sum3, activation_type, activation_param_0, activation_param_1); + +#if NCNN_image_shader + + image3d_st1(top_blob, ivec3(gx2.x, gy2.x, 0), sum0); + image3d_st1(top_blob, ivec3(gx2.y, gy2.x, 0), sum1); + image3d_st1(top_blob, ivec3(gx2.x, gy2.y, 0), sum2); + image3d_st1(top_blob, ivec3(gx2.y, gy2.y, 0), sum3); + +#else + + const int gi = gy * psc(outw) + gx; + + buffer_st1(top_blob_data, gi, sum0); + if (gx + 1 < psc(outw)) buffer_st1(top_blob_data, gi + 1, sum1); + if (gy + 1 < psc(outh)) buffer_st1(top_blob_data, gi + psc(outw), sum2); + if (gy + 1 < psc(outh) && gx + 1 < psc(outw)) buffer_st1(top_blob_data, gi + psc(outw) + 1, sum3); + +#endif +} \ No newline at end of file diff --git a/src/layer/vulkan/shader/convolution1d_pack1to4.comp b/src/layer/vulkan/shader/convolution1d_pack1to4.comp new file mode 100644 index 00000000000..98e6fadd3c1 --- /dev/null +++ b/src/layer/vulkan/shader/convolution1d_pack1to4.comp @@ -0,0 +1,177 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +#extension GL_GOOGLE_include_directive: enable +#include "vulkan_activation.comp" + +layout (constant_id = 0) const int kernel_w = 1; +layout (constant_id = 1) const int dilation_w = 1; +layout (constant_id = 2) const int stride_w = 1; +layout (constant_id = 3) const int bias_term = 0; +layout (constant_id = 4) const int activation_type = 0; +layout (constant_id = 5) const float activation_param_0 = 0; +layout (constant_id = 6) const float activation_param_1 = 0; + +#define shape_constant_id_offset 7 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob; +layout (binding = 2) uniform unfp sampler3D weight_blob; +layout (binding = 3) uniform unfp sampler3D bias_blob; +#else +layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; +layout (binding = 2) readonly buffer weight_blob { sfpvec4 weight_data[]; }; +layout (binding = 3) readonly buffer bias_blob { sfpvec4 bias_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x) * 2; + int gy = int(gl_GlobalInvocationID.y) * 2; + + if (gx >= psc(outw) || gy >= psc(outh)) + return; + + const ivec2 gx2 = gx + ivec2(0, 1); + const ivec2 gy2 = gy + ivec2(0, 1); + + afpvec4 sum0 = afpvec4(0.0f); + afpvec4 sum1 = afpvec4(0.0f); + afpvec4 sum2 = afpvec4(0.0f); + afpvec4 sum3 = afpvec4(0.0f); + + if (bias_term == 1) + { +#if NCNN_image_shader + sum0 = image3d_ld4(bias_blob, ivec3(gy2.x, 0, 0)); + sum2 = image3d_ld4(bias_blob, ivec3(gy2.y, 0, 0)); +#else + sum0 = buffer_ld4(bias_data, gy2.x); + sum2 = buffer_ld4(bias_data, gy2.y); +#endif + sum1 = sum0; + sum3 = sum2; + } + +#if NCNN_image_shader + + ivec2 v_offset = gx2 * stride_w; + + for (int y = 0; y < psc(h); y++) + { + int wx = 0; + + for (int x = 0; x < kernel_w; x++) + { + afp v0 = image3d_ld1(bottom_blob, ivec3(v_offset.x + x * dilation_w, y, 0)); + afp v1 = image3d_ld1(bottom_blob, ivec3(v_offset.y + x * dilation_w, y, 0)); + + afpvec4 k0 = image3d_ld4(weight_blob, ivec3(wx, y, gy2.x)); + afpvec4 k1 = image3d_ld4(weight_blob, ivec3(wx, y, gy2.y)); + + sum0 += v0 * k0; + sum1 += v1 * k0; + sum2 += v0 * k1; + sum3 += v1 * k1; + + wx += 1; + } + } + +#else + + ivec2 v_offset = gx2 * stride_w; + ivec2 w_offset = gy2 * psc(h) * kernel_w; + + for (int y = 0; y < psc(h); y++) + { + for (int x = 0; 
x < kernel_w; x++) + { + afp v0 = buffer_ld1(bottom_blob_data, v_offset.x + x * dilation_w); + afp v1 = buffer_ld1(bottom_blob_data, v_offset.y + x * dilation_w); + + afpvec4 k0 = buffer_ld4(weight_data, w_offset.x + x); + afpvec4 k1 = buffer_ld4(weight_data, w_offset.y + x); + + sum0 += v0 * k0; + sum1 += v1 * k0; + sum2 += v0 * k1; + sum3 += v1 * k1; + } + v_offset += psc(w); + w_offset += kernel_w; + } + +#endif + + sum0 = activation_afpvec4(sum0, activation_type, activation_param_0, activation_param_1); + sum1 = activation_afpvec4(sum1, activation_type, activation_param_0, activation_param_1); + sum2 = activation_afpvec4(sum2, activation_type, activation_param_0, activation_param_1); + sum3 = activation_afpvec4(sum3, activation_type, activation_param_0, activation_param_1); + +#if NCNN_image_shader + + image3d_st4(top_blob, ivec3(gx2.x, gy2.x, 0), sum0); + image3d_st4(top_blob, ivec3(gx2.y, gy2.x, 0), sum1); + image3d_st4(top_blob, ivec3(gx2.x, gy2.y, 0), sum2); + image3d_st4(top_blob, ivec3(gx2.y, gy2.y, 0), sum3); + +#else + + const int gi = gy * psc(outw) + gx; + + buffer_st4(top_blob_data, gi, sum0); + if (gx + 1 < psc(outw)) buffer_st4(top_blob_data, gi + 1, sum1); + if (gy + 1 < psc(outh)) buffer_st4(top_blob_data, gi + psc(outw), sum2); + if (gy + 1 < psc(outh) && gx + 1 < psc(outw)) buffer_st4(top_blob_data, gi + psc(outw) + 1, sum3); + +#endif +} \ No newline at end of file diff --git a/src/layer/vulkan/shader/convolution1d_pack1to8.comp b/src/layer/vulkan/shader/convolution1d_pack1to8.comp new file mode 100644 index 00000000000..c32bc2114e5 --- /dev/null +++ b/src/layer/vulkan/shader/convolution1d_pack1to8.comp @@ -0,0 +1,186 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +#extension GL_GOOGLE_include_directive: enable +#include "vulkan_activation.comp" + +layout (constant_id = 0) const int kernel_w = 1; +layout (constant_id = 1) const int dilation_w = 1; +layout (constant_id = 2) const int stride_w = 1; +layout (constant_id = 3) const int bias_term = 0; +layout (constant_id = 4) const int activation_type = 0; +layout (constant_id = 5) const float activation_param_0 = 0; +layout (constant_id = 6) const float activation_param_1 = 0; + +#define shape_constant_id_offset 7 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob; +layout (binding = 2) uniform unfp sampler3D weight_blob; +layout (binding = 3) uniform unfp sampler3D bias_blob; +#else +layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; }; +layout (binding = 2) readonly buffer weight_blob { sfpvec8 weight_data[]; }; +layout (binding = 3) readonly buffer bias_blob { sfpvec8 bias_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x) * 2; + int gy = int(gl_GlobalInvocationID.y) * 2; + + if (gx >= psc(outw) || gy >= psc(outh)) + return; + + const ivec2 gx2 = gx + ivec2(0, 1); + const ivec2 gy2 = gy + ivec2(0, 1); + + afpvec8 sum0 = afpvec8(afpvec4(0.0f), afpvec4(0.0f)); + afpvec8 sum1 = afpvec8(afpvec4(0.0f), afpvec4(0.0f)); + afpvec8 sum2 = afpvec8(afpvec4(0.0f), afpvec4(0.0f)); + afpvec8 sum3 = afpvec8(afpvec4(0.0f), afpvec4(0.0f)); + + if (bias_term == 1) + { +#if NCNN_image_shader + sum0 = image3d_ld8(bias_blob, ivec3(gy2.x, 0, 0)); + sum2 = image3d_ld8(bias_blob, ivec3(gy2.y, 0, 0)); +#else + sum0 = buffer_ld8(bias_data, gy2.x); + sum2 = buffer_ld8(bias_data, gy2.y); +#endif + sum1 = sum0; + sum3 = sum2; + } + +#if NCNN_image_shader + + ivec2 v_offset = gx2 * stride_w; + + for (int y = 0; y < psc(h); y++) + { + int wx = 0; + + for (int x = 0; x < kernel_w; x++) + { + afp v0 = image3d_ld1(bottom_blob, ivec3(v_offset.x + x * dilation_w, y, 0)); + afp v1 = image3d_ld1(bottom_blob, ivec3(v_offset.y + x * dilation_w, y, 0)); + + afpvec8 k0 = image3d_ld8(weight_blob, ivec3(wx, y, gy2.x)); + afpvec8 k1 = image3d_ld8(weight_blob, ivec3(wx, y, gy2.y)); + + sum0[0] += v0 * k0[0]; + sum0[1] += v0 * k0[1]; + sum1[0] += v1 * k0[0]; + sum1[1] += v1 * k0[1]; + 
sum2[0] += v0 * k1[0]; + sum2[1] += v0 * k1[1]; + sum3[0] += v1 * k1[0]; + sum3[1] += v1 * k1[1]; + + wx += 1; + } + } + +#else + + ivec2 v_offset = gx2 * stride_w; + ivec2 w_offset = gy2 * psc(h) * kernel_w; + + for (int y = 0; y < psc(h); y++) + { + for (int x = 0; x < kernel_w; x++) + { + afp v0 = buffer_ld1(bottom_blob_data, v_offset.x + x * dilation_w); + afp v1 = buffer_ld1(bottom_blob_data, v_offset.y + x * dilation_w); + + afpvec8 k0 = buffer_ld8(weight_data, w_offset.x + x); + afpvec8 k1 = buffer_ld8(weight_data, w_offset.y + x); + + sum0[0] += v0 * k0[0]; + sum0[1] += v0 * k0[1]; + sum1[0] += v1 * k0[0]; + sum1[1] += v1 * k0[1]; + sum2[0] += v0 * k1[0]; + sum2[1] += v0 * k1[1]; + sum3[0] += v1 * k1[0]; + sum3[1] += v1 * k1[1]; + } + v_offset += psc(w); + w_offset += kernel_w; + } + +#endif + + sum0 = activation_afpvec8(sum0, activation_type, activation_param_0, activation_param_1); + sum1 = activation_afpvec8(sum1, activation_type, activation_param_0, activation_param_1); + sum2 = activation_afpvec8(sum2, activation_type, activation_param_0, activation_param_1); + sum3 = activation_afpvec8(sum3, activation_type, activation_param_0, activation_param_1); + +#if NCNN_image_shader + + image3d_st8(top_blob, ivec3(gx2.x, gy2.x, 0), sum0); + image3d_st8(top_blob, ivec3(gx2.y, gy2.x, 0), sum1); + image3d_st8(top_blob, ivec3(gx2.x, gy2.y, 0), sum2); + image3d_st8(top_blob, ivec3(gx2.y, gy2.y, 0), sum3); + +#else + + const int gi = gy * psc(outw) + gx; + + buffer_st8(top_blob_data, gi, sum0); + if (gx + 1 < psc(outw)) buffer_st8(top_blob_data, gi + 1, sum1); + if (gy + 1 < psc(outh)) buffer_st8(top_blob_data, gi + psc(outw), sum2); + if (gy + 1 < psc(outh) && gx + 1 < psc(outw)) buffer_st8(top_blob_data, gi + psc(outw) + 1, sum3); + +#endif +} \ No newline at end of file diff --git a/src/layer/vulkan/shader/convolution1d_pack4.comp b/src/layer/vulkan/shader/convolution1d_pack4.comp new file mode 100644 index 00000000000..f1e12586785 --- /dev/null +++ b/src/layer/vulkan/shader/convolution1d_pack4.comp @@ -0,0 +1,208 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +#extension GL_GOOGLE_include_directive: enable +#include "vulkan_activation.comp" + +layout (constant_id = 0) const int kernel_w = 1; +layout (constant_id = 1) const int dilation_w = 1; +layout (constant_id = 2) const int stride_w = 1; +layout (constant_id = 3) const int bias_term = 0; +layout (constant_id = 4) const int activation_type = 0; +layout (constant_id = 5) const float activation_param_0 = 0; +layout (constant_id = 6) const float activation_param_1 = 0; + +#define shape_constant_id_offset 7 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob; +layout (binding = 2) uniform unfp sampler3D weight_blob; +layout (binding = 3) uniform unfp sampler3D bias_blob; +#else +layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; +#if NCNN_fp16_packed || (NCNN_fp16_storage && !NCNN_fp16_arithmetic) +// GL_EXT_shader_16bit_storage does not define f16mat4 type :( +layout (binding = 2) readonly buffer weight_blob { sfpvec4 weight_data[]; }; +#else +layout (binding = 2) readonly buffer weight_blob { sfpmat4 weight_data[]; }; +#endif +layout (binding = 3) readonly buffer bias_blob { sfpvec4 bias_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x) * 2; + int gy = int(gl_GlobalInvocationID.y) * 2; + + if (gx >= psc(outw) || gy >= psc(outh)) + return; + + const ivec2 gx2 = gx + ivec2(0, 1); + const ivec2 gy2 = gy + ivec2(0, 1); + + afpvec4 sum0 = afpvec4(0.0f); + afpvec4 sum1 = afpvec4(0.0f); + afpvec4 sum2 = afpvec4(0.0f); + afpvec4 sum3 = afpvec4(0.0f); + + if (bias_term == 1) + { +#if NCNN_image_shader + sum0 = image3d_ld4(bias_blob, ivec3(gy2.x, 0, 0)); + sum2 = image3d_ld4(bias_blob, ivec3(gy2.y, 0, 0)); +#else + sum0 = buffer_ld4(bias_data, gy2.x); + sum2 = buffer_ld4(bias_data, gy2.y); +#endif + sum1 = sum0; + sum3 = sum2; + } + +#if NCNN_image_shader + + ivec2 v_offset = gx2 * stride_w; + + for (int y = 0; y < psc(h); y++) + { + int wx = 0; + + for (int x = 0; x < kernel_w; x++) + { + afpvec4 v0 = image3d_ld4(bottom_blob, ivec3(v_offset.x + x * dilation_w, y, 0)); + afpvec4 v1 = image3d_ld4(bottom_blob, ivec3(v_offset.y + x * dilation_w, y, 0)); + + afpmat4 k0 = afpmat4( + image3d_ld4(weight_blob, ivec3(wx + 0, y, gy2.x)), + image3d_ld4(weight_blob, ivec3(wx + 1, y, gy2.x)), + 
image3d_ld4(weight_blob, ivec3(wx + 2, y, gy2.x)), + image3d_ld4(weight_blob, ivec3(wx + 3, y, gy2.x)) + ); + afpmat4 k1 = afpmat4( + image3d_ld4(weight_blob, ivec3(wx + 0, y, gy2.y)), + image3d_ld4(weight_blob, ivec3(wx + 1, y, gy2.y)), + image3d_ld4(weight_blob, ivec3(wx + 2, y, gy2.y)), + image3d_ld4(weight_blob, ivec3(wx + 3, y, gy2.y)) + ); + + sum0 += v0 * k0; + sum1 += v1 * k0; + sum2 += v0 * k1; + sum3 += v1 * k1; + + wx += 4; + } + } + +#else + + ivec2 v_offset = gx2 * stride_w; + ivec2 w_offset = gy2 * psc(h) * kernel_w; + + for (int y = 0; y < psc(h); y++) + { + for (int x = 0; x < kernel_w; x++) + { + afpvec4 v0 = buffer_ld4(bottom_blob_data, v_offset.x + x * dilation_w); + afpvec4 v1 = buffer_ld4(bottom_blob_data, v_offset.y + x * dilation_w); + +#if NCNN_fp16_packed || (NCNN_fp16_storage && !NCNN_fp16_arithmetic) + // GL_EXT_shader_16bit_storage does not define f16mat4 type :( + afpmat4 k0 = afpmat4( + buffer_ld4(weight_data, (w_offset.x + x) * 4 + 0), + buffer_ld4(weight_data, (w_offset.x + x) * 4 + 1), + buffer_ld4(weight_data, (w_offset.x + x) * 4 + 2), + buffer_ld4(weight_data, (w_offset.x + x) * 4 + 3) + ); + afpmat4 k1 = afpmat4( + buffer_ld4(weight_data, (w_offset.y + x) * 4 + 0), + buffer_ld4(weight_data, (w_offset.y + x) * 4 + 1), + buffer_ld4(weight_data, (w_offset.y + x) * 4 + 2), + buffer_ld4(weight_data, (w_offset.y + x) * 4 + 3) + ); +#else + afpmat4 k0 = sfp2afpmat4(weight_data[w_offset.x + x]); + afpmat4 k1 = sfp2afpmat4(weight_data[w_offset.y + x]); +#endif + + sum0 += v0 * k0; + sum1 += v1 * k0; + sum2 += v0 * k1; + sum3 += v1 * k1; + } + v_offset += psc(w); + w_offset += kernel_w; + } + +#endif + + sum0 = activation_afpvec4(sum0, activation_type, activation_param_0, activation_param_1); + sum1 = activation_afpvec4(sum1, activation_type, activation_param_0, activation_param_1); + sum2 = activation_afpvec4(sum2, activation_type, activation_param_0, activation_param_1); + sum3 = activation_afpvec4(sum3, activation_type, activation_param_0, activation_param_1); + +#if NCNN_image_shader + + image3d_st4(top_blob, ivec3(gx2.x, gy2.x, 0), sum0); + image3d_st4(top_blob, ivec3(gx2.y, gy2.x, 0), sum1); + image3d_st4(top_blob, ivec3(gx2.x, gy2.y, 0), sum2); + image3d_st4(top_blob, ivec3(gx2.y, gy2.y, 0), sum3); + +#else + + const int gi = gy * psc(outw) + gx; + + buffer_st4(top_blob_data, gi, sum0); + if (gx + 1 < psc(outw)) buffer_st4(top_blob_data, gi + 1, sum1); + if (gy + 1 < psc(outh)) buffer_st4(top_blob_data, gi + psc(outw), sum2); + if (gy + 1 < psc(outh) && gx + 1 < psc(outw)) buffer_st4(top_blob_data, gi + psc(outw) + 1, sum3); + +#endif +} \ No newline at end of file diff --git a/src/layer/vulkan/shader/convolution1d_pack4to1.comp b/src/layer/vulkan/shader/convolution1d_pack4to1.comp new file mode 100644 index 00000000000..1f5c87e1835 --- /dev/null +++ b/src/layer/vulkan/shader/convolution1d_pack4to1.comp @@ -0,0 +1,177 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +#extension GL_GOOGLE_include_directive: enable +#include "vulkan_activation.comp" + +layout (constant_id = 0) const int kernel_w = 1; +layout (constant_id = 1) const int dilation_w = 1; +layout (constant_id = 2) const int stride_w = 1; +layout (constant_id = 3) const int bias_term = 0; +layout (constant_id = 4) const int activation_type = 0; +layout (constant_id = 5) const float activation_param_0 = 0; +layout (constant_id = 6) const float activation_param_1 = 0; + +#define shape_constant_id_offset 7 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc1) writeonly uniform unfp image3D top_blob; +layout (binding = 2) uniform unfp sampler3D weight_blob; +layout (binding = 3) uniform unfp sampler3D bias_blob; +#else +layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; }; +layout (binding = 2) readonly buffer weight_blob { sfpvec4 weight_data[]; }; +layout (binding = 3) readonly buffer bias_blob { sfp bias_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x) * 2; + int gy = int(gl_GlobalInvocationID.y) * 2; + + if (gx >= psc(outw) || gy >= psc(outh)) + return; + + const ivec2 gx2 = gx + ivec2(0, 1); + const ivec2 gy2 = gy + ivec2(0, 1); + + afp sum0 = afp(0.0f); + afp sum1 = afp(0.0f); + afp sum2 = afp(0.0f); + afp sum3 = afp(0.0f); + + if (bias_term == 1) + { +#if NCNN_image_shader + sum0 = image3d_ld1(bias_blob, ivec3(gy2.x, 0, 0)); + sum2 = image3d_ld1(bias_blob, ivec3(gy2.y, 0, 0)); +#else + sum0 = buffer_ld1(bias_data, gy2.x); + sum2 = buffer_ld1(bias_data, gy2.y); +#endif + sum1 = sum0; + sum3 = sum2; + } + +#if NCNN_image_shader + + ivec2 v_offset = gx2 * stride_w; + + for (int y = 0; y < psc(h); y++) + { + int wx = 0; + + for (int x = 0; x < kernel_w; x++) + { + afpvec4 v0 = image3d_ld4(bottom_blob, ivec3(v_offset.x + x * dilation_w, y, 0)); + afpvec4 v1 = image3d_ld4(bottom_blob, ivec3(v_offset.y + x * dilation_w, y, 0)); + + afpvec4 k0 = image3d_ld4(weight_blob, ivec3(wx, y, gy2.x)); + afpvec4 k1 = image3d_ld4(weight_blob, ivec3(wx, y, gy2.y)); + + sum0 += dot(v0, k0); + sum1 += dot(v1, k0); + sum2 += dot(v0, k1); + sum3 += dot(v1, k1); + + wx += 1; + } + } + +#else + + ivec2 v_offset = gx2 * stride_w; + ivec2 
w_offset = gy2 * psc(h) * kernel_w; + + for (int y = 0; y < psc(h); y++) + { + for (int x = 0; x < kernel_w; x++) + { + afpvec4 v0 = buffer_ld4(bottom_blob_data, v_offset.x + x * dilation_w); + afpvec4 v1 = buffer_ld4(bottom_blob_data, v_offset.y + x * dilation_w); + + afpvec4 k0 = buffer_ld4(weight_data, w_offset.x + x); + afpvec4 k1 = buffer_ld4(weight_data, w_offset.y + x); + + sum0 += dot(v0, k0); + sum1 += dot(v1, k0); + sum2 += dot(v0, k1); + sum3 += dot(v1, k1); + } + v_offset += psc(w); + w_offset += kernel_w; + } + +#endif + + sum0 = activation_afp(sum0, activation_type, activation_param_0, activation_param_1); + sum1 = activation_afp(sum1, activation_type, activation_param_0, activation_param_1); + sum2 = activation_afp(sum2, activation_type, activation_param_0, activation_param_1); + sum3 = activation_afp(sum3, activation_type, activation_param_0, activation_param_1); + +#if NCNN_image_shader + + image3d_st1(top_blob, ivec3(gx2.x, gy2.x, 0), sum0); + image3d_st1(top_blob, ivec3(gx2.y, gy2.x, 0), sum1); + image3d_st1(top_blob, ivec3(gx2.x, gy2.y, 0), sum2); + image3d_st1(top_blob, ivec3(gx2.y, gy2.y, 0), sum3); + +#else + + const int gi = gy * psc(outw) + gx; + + buffer_st1(top_blob_data, gi, sum0); + if (gx + 1 < psc(outw)) buffer_st1(top_blob_data, gi + 1, sum1); + if (gy + 1 < psc(outh)) buffer_st1(top_blob_data, gi + psc(outw), sum2); + if (gy + 1 < psc(outh) && gx + 1 < psc(outw)) buffer_st1(top_blob_data, gi + psc(outw) + 1, sum3); + +#endif +} \ No newline at end of file diff --git a/src/layer/vulkan/shader/convolution1d_pack4to8.comp b/src/layer/vulkan/shader/convolution1d_pack4to8.comp new file mode 100644 index 00000000000..1133b097ac0 --- /dev/null +++ b/src/layer/vulkan/shader/convolution1d_pack4to8.comp @@ -0,0 +1,270 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +#extension GL_GOOGLE_include_directive: enable +#include "vulkan_activation.comp" + +layout (constant_id = 0) const int kernel_w = 1; +layout (constant_id = 1) const int dilation_w = 1; +layout (constant_id = 2) const int stride_w = 1; +layout (constant_id = 3) const int bias_term = 0; +layout (constant_id = 4) const int activation_type = 0; +layout (constant_id = 5) const float activation_param_0 = 0; +layout (constant_id = 6) const float activation_param_1 = 0; + +#define shape_constant_id_offset 7 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob; +layout (binding = 2) uniform unfp sampler3D weight_blob; +layout (binding = 3) uniform unfp sampler3D bias_blob; +#else +layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; }; +layout (binding = 2) readonly buffer weight_blob { sfpvec4 weight_data[]; }; +layout (binding = 3) readonly buffer bias_blob { sfpvec8 bias_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x) * 2; + int gy = int(gl_GlobalInvocationID.y) * 2; + + if (gx >= psc(outw) || gy >= psc(outh)) + return; + + const ivec2 gx2 = gx + ivec2(0, 1); + const ivec2 gy2 = gy + ivec2(0, 1); + + afpvec8 sum0 = afpvec8(afpvec4(0.0f), afpvec4(0.0f)); + afpvec8 sum1 = afpvec8(afpvec4(0.0f), afpvec4(0.0f)); + afpvec8 sum2 = afpvec8(afpvec4(0.0f), afpvec4(0.0f)); + afpvec8 sum3 = afpvec8(afpvec4(0.0f), afpvec4(0.0f)); + + if (bias_term == 1) + { +#if NCNN_image_shader + sum0 = image3d_ld8(bias_blob, ivec3(gy2.x, 0, 0)); + sum2 = image3d_ld8(bias_blob, ivec3(gy2.y, 0, 0)); +#else + sum0 = buffer_ld8(bias_data, gy2.x); + sum2 = buffer_ld8(bias_data, gy2.y); +#endif + sum1 = sum0; + sum3 = sum2; + } + +#if NCNN_image_shader + + ivec2 v_offset = gx2 * stride_w; + + for (int y = 0; y < psc(h); y++) + { + int wx = 0; + + for (int x = 0; x < kernel_w; x++) + { + afpvec4 v0 = image3d_ld4(bottom_blob, ivec3(v_offset.x + x * dilation_w, y, 0)); + afpvec4 v1 = image3d_ld4(bottom_blob, ivec3(v_offset.y + x * dilation_w, y, 0)); + + afpvec4 k0 = image3d_ld4(weight_blob, ivec3(wx + 0, y, gy2.x)); + afpvec4 k1 = image3d_ld4(weight_blob, ivec3(wx + 1, y, gy2.x)); + afpvec4 k2 = image3d_ld4(weight_blob, ivec3(wx + 2, y, gy2.x)); + afpvec4 k3 = 
image3d_ld4(weight_blob, ivec3(wx + 3, y, gy2.x)); + afpvec4 k4 = image3d_ld4(weight_blob, ivec3(wx + 4, y, gy2.x)); + afpvec4 k5 = image3d_ld4(weight_blob, ivec3(wx + 5, y, gy2.x)); + afpvec4 k6 = image3d_ld4(weight_blob, ivec3(wx + 6, y, gy2.x)); + afpvec4 k7 = image3d_ld4(weight_blob, ivec3(wx + 7, y, gy2.x)); + + afpvec4 k8 = image3d_ld4(weight_blob, ivec3(wx + 0, y, gy2.y)); + afpvec4 k9 = image3d_ld4(weight_blob, ivec3(wx + 1, y, gy2.y)); + afpvec4 ka = image3d_ld4(weight_blob, ivec3(wx + 2, y, gy2.y)); + afpvec4 kb = image3d_ld4(weight_blob, ivec3(wx + 3, y, gy2.y)); + afpvec4 kc = image3d_ld4(weight_blob, ivec3(wx + 4, y, gy2.y)); + afpvec4 kd = image3d_ld4(weight_blob, ivec3(wx + 5, y, gy2.y)); + afpvec4 ke = image3d_ld4(weight_blob, ivec3(wx + 6, y, gy2.y)); + afpvec4 kf = image3d_ld4(weight_blob, ivec3(wx + 7, y, gy2.y)); + + sum0[0].r += dot(v0, k0); + sum0[0].g += dot(v0, k1); + sum0[0].b += dot(v0, k2); + sum0[0].a += dot(v0, k3); + sum0[1].r += dot(v0, k4); + sum0[1].g += dot(v0, k5); + sum0[1].b += dot(v0, k6); + sum0[1].a += dot(v0, k7); + + sum1[0].r += dot(v1, k0); + sum1[0].g += dot(v1, k1); + sum1[0].b += dot(v1, k2); + sum1[0].a += dot(v1, k3); + sum1[1].r += dot(v1, k4); + sum1[1].g += dot(v1, k5); + sum1[1].b += dot(v1, k6); + sum1[1].a += dot(v1, k7); + + sum2[0].r += dot(v0, k8); + sum2[0].g += dot(v0, k9); + sum2[0].b += dot(v0, ka); + sum2[0].a += dot(v0, kb); + sum2[1].r += dot(v0, kc); + sum2[1].g += dot(v0, kd); + sum2[1].b += dot(v0, ke); + sum2[1].a += dot(v0, kf); + + sum3[0].r += dot(v1, k8); + sum3[0].g += dot(v1, k9); + sum3[0].b += dot(v1, ka); + sum3[0].a += dot(v1, kb); + sum3[1].r += dot(v1, kc); + sum3[1].g += dot(v1, kd); + sum3[1].b += dot(v1, ke); + sum3[1].a += dot(v1, kf); + + wx += 8; + } + } + +#else + + ivec2 v_offset = gx2 * stride_w; + ivec2 w_offset = gy2 * psc(h) * kernel_w; + + for (int y = 0; y < psc(h); y++) + { + for (int x = 0; x < kernel_w; x++) + { + afpvec4 v0 = buffer_ld4(bottom_blob_data, v_offset.x + x * dilation_w); + afpvec4 v1 = buffer_ld4(bottom_blob_data, v_offset.y + x * dilation_w); + + afpvec4 k0 = buffer_ld4(weight_data, (w_offset.x + x) * 8 + 0); + afpvec4 k1 = buffer_ld4(weight_data, (w_offset.x + x) * 8 + 1); + afpvec4 k2 = buffer_ld4(weight_data, (w_offset.x + x) * 8 + 2); + afpvec4 k3 = buffer_ld4(weight_data, (w_offset.x + x) * 8 + 3); + afpvec4 k4 = buffer_ld4(weight_data, (w_offset.x + x) * 8 + 4); + afpvec4 k5 = buffer_ld4(weight_data, (w_offset.x + x) * 8 + 5); + afpvec4 k6 = buffer_ld4(weight_data, (w_offset.x + x) * 8 + 6); + afpvec4 k7 = buffer_ld4(weight_data, (w_offset.x + x) * 8 + 7); + + afpvec4 k8 = buffer_ld4(weight_data, (w_offset.y + x) * 8 + 0); + afpvec4 k9 = buffer_ld4(weight_data, (w_offset.y + x) * 8 + 1); + afpvec4 ka = buffer_ld4(weight_data, (w_offset.y + x) * 8 + 2); + afpvec4 kb = buffer_ld4(weight_data, (w_offset.y + x) * 8 + 3); + afpvec4 kc = buffer_ld4(weight_data, (w_offset.y + x) * 8 + 4); + afpvec4 kd = buffer_ld4(weight_data, (w_offset.y + x) * 8 + 5); + afpvec4 ke = buffer_ld4(weight_data, (w_offset.y + x) * 8 + 6); + afpvec4 kf = buffer_ld4(weight_data, (w_offset.y + x) * 8 + 7); + + sum0[0].r += dot(v0, k0); + sum0[0].g += dot(v0, k1); + sum0[0].b += dot(v0, k2); + sum0[0].a += dot(v0, k3); + sum0[1].r += dot(v0, k4); + sum0[1].g += dot(v0, k5); + sum0[1].b += dot(v0, k6); + sum0[1].a += dot(v0, k7); + + sum1[0].r += dot(v1, k0); + sum1[0].g += dot(v1, k1); + sum1[0].b += dot(v1, k2); + sum1[0].a += dot(v1, k3); + sum1[1].r += dot(v1, k4); + sum1[1].g += dot(v1, k5); + 
sum1[1].b += dot(v1, k6); + sum1[1].a += dot(v1, k7); + + sum2[0].r += dot(v0, k8); + sum2[0].g += dot(v0, k9); + sum2[0].b += dot(v0, ka); + sum2[0].a += dot(v0, kb); + sum2[1].r += dot(v0, kc); + sum2[1].g += dot(v0, kd); + sum2[1].b += dot(v0, ke); + sum2[1].a += dot(v0, kf); + + sum3[0].r += dot(v1, k8); + sum3[0].g += dot(v1, k9); + sum3[0].b += dot(v1, ka); + sum3[0].a += dot(v1, kb); + sum3[1].r += dot(v1, kc); + sum3[1].g += dot(v1, kd); + sum3[1].b += dot(v1, ke); + sum3[1].a += dot(v1, kf); + } + v_offset += psc(w); + w_offset += kernel_w; + } + +#endif + + sum0 = activation_afpvec8(sum0, activation_type, activation_param_0, activation_param_1); + sum1 = activation_afpvec8(sum1, activation_type, activation_param_0, activation_param_1); + sum2 = activation_afpvec8(sum2, activation_type, activation_param_0, activation_param_1); + sum3 = activation_afpvec8(sum3, activation_type, activation_param_0, activation_param_1); + +#if NCNN_image_shader + + image3d_st8(top_blob, ivec3(gx2.x, gy2.x, 0), sum0); + image3d_st8(top_blob, ivec3(gx2.y, gy2.x, 0), sum1); + image3d_st8(top_blob, ivec3(gx2.x, gy2.y, 0), sum2); + image3d_st8(top_blob, ivec3(gx2.y, gy2.y, 0), sum3); + +#else + + const int gi = gy * psc(outw) + gx; + + buffer_st8(top_blob_data, gi, sum0); + if (gx + 1 < psc(outw)) buffer_st8(top_blob_data, gi + 1, sum1); + if (gy + 1 < psc(outh)) buffer_st8(top_blob_data, gi + psc(outw), sum2); + if (gy + 1 < psc(outh) && gx + 1 < psc(outw)) buffer_st8(top_blob_data, gi + psc(outw) + 1, sum3); + +#endif +} \ No newline at end of file diff --git a/src/layer/vulkan/shader/convolution1d_pack8.comp b/src/layer/vulkan/shader/convolution1d_pack8.comp new file mode 100644 index 00000000000..fff72ade829 --- /dev/null +++ b/src/layer/vulkan/shader/convolution1d_pack8.comp @@ -0,0 +1,270 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +#extension GL_GOOGLE_include_directive: enable +#include "vulkan_activation.comp" + +layout (constant_id = 0) const int kernel_w = 1; +layout (constant_id = 1) const int dilation_w = 1; +layout (constant_id = 2) const int stride_w = 1; +layout (constant_id = 3) const int bias_term = 0; +layout (constant_id = 4) const int activation_type = 0; +layout (constant_id = 5) const float activation_param_0 = 0; +layout (constant_id = 6) const float activation_param_1 = 0; + +#define shape_constant_id_offset 7 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob; +layout (binding = 2) uniform unfp sampler3D weight_blob; +layout (binding = 3) uniform unfp sampler3D bias_blob; +#else +layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; }; +layout (binding = 2) readonly buffer weight_blob { sfpvec8 weight_data[]; }; +layout (binding = 3) readonly buffer bias_blob { sfpvec8 bias_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x) * 2; + int gy = int(gl_GlobalInvocationID.y) * 2; + + if (gx >= psc(outw) || gy >= psc(outh)) + return; + + const ivec2 gx2 = gx + ivec2(0, 1); + const ivec2 gy2 = gy + ivec2(0, 1); + + afpvec8 sum0 = afpvec8(afpvec4(0.0f), afpvec4(0.0f)); + afpvec8 sum1 = afpvec8(afpvec4(0.0f), afpvec4(0.0f)); + afpvec8 sum2 = afpvec8(afpvec4(0.0f), afpvec4(0.0f)); + afpvec8 sum3 = afpvec8(afpvec4(0.0f), afpvec4(0.0f)); + + if (bias_term == 1) + { +#if NCNN_image_shader + sum0 = image3d_ld8(bias_blob, ivec3(gy2.x, 0, 0)); + sum2 = image3d_ld8(bias_blob, ivec3(gy2.y, 0, 0)); +#else + sum0 = buffer_ld8(bias_data, gy2.x); + sum2 = buffer_ld8(bias_data, gy2.y); +#endif + sum1 = sum0; + sum3 = sum2; + } + +#if NCNN_image_shader + + ivec2 v_offset = gx2 * stride_w; + + for (int y = 0; y < psc(h); y++) + { + int wx = 0; + + for (int x = 0; x < kernel_w; x++) + { + afpvec8 v0 = image3d_ld8(bottom_blob, ivec3(v_offset.x + x * dilation_w, y, 0)); + afpvec8 v1 = image3d_ld8(bottom_blob, ivec3(v_offset.y + x * dilation_w, y, 0)); + + afpvec8 k0 = image3d_ld8(weight_blob, ivec3(wx + 0, y, gy2.x)); + afpvec8 k1 = image3d_ld8(weight_blob, ivec3(wx + 1, y, gy2.x)); + afpvec8 k2 = image3d_ld8(weight_blob, ivec3(wx + 2, y, gy2.x)); + afpvec8 k3 = 
image3d_ld8(weight_blob, ivec3(wx + 3, y, gy2.x)); + afpvec8 k4 = image3d_ld8(weight_blob, ivec3(wx + 4, y, gy2.x)); + afpvec8 k5 = image3d_ld8(weight_blob, ivec3(wx + 5, y, gy2.x)); + afpvec8 k6 = image3d_ld8(weight_blob, ivec3(wx + 6, y, gy2.x)); + afpvec8 k7 = image3d_ld8(weight_blob, ivec3(wx + 7, y, gy2.x)); + + afpvec8 k8 = image3d_ld8(weight_blob, ivec3(wx + 0, y, gy2.y)); + afpvec8 k9 = image3d_ld8(weight_blob, ivec3(wx + 1, y, gy2.y)); + afpvec8 ka = image3d_ld8(weight_blob, ivec3(wx + 2, y, gy2.y)); + afpvec8 kb = image3d_ld8(weight_blob, ivec3(wx + 3, y, gy2.y)); + afpvec8 kc = image3d_ld8(weight_blob, ivec3(wx + 4, y, gy2.y)); + afpvec8 kd = image3d_ld8(weight_blob, ivec3(wx + 5, y, gy2.y)); + afpvec8 ke = image3d_ld8(weight_blob, ivec3(wx + 6, y, gy2.y)); + afpvec8 kf = image3d_ld8(weight_blob, ivec3(wx + 7, y, gy2.y)); + + sum0[0].r += dot(v0[0], k0[0]) + dot(v0[1], k0[1]); + sum0[0].g += dot(v0[0], k1[0]) + dot(v0[1], k1[1]); + sum0[0].b += dot(v0[0], k2[0]) + dot(v0[1], k2[1]); + sum0[0].a += dot(v0[0], k3[0]) + dot(v0[1], k3[1]); + sum0[1].r += dot(v0[0], k4[0]) + dot(v0[1], k4[1]); + sum0[1].g += dot(v0[0], k5[0]) + dot(v0[1], k5[1]); + sum0[1].b += dot(v0[0], k6[0]) + dot(v0[1], k6[1]); + sum0[1].a += dot(v0[0], k7[0]) + dot(v0[1], k7[1]); + + sum1[0].r += dot(v1[0], k0[0]) + dot(v1[1], k0[1]); + sum1[0].g += dot(v1[0], k1[0]) + dot(v1[1], k1[1]); + sum1[0].b += dot(v1[0], k2[0]) + dot(v1[1], k2[1]); + sum1[0].a += dot(v1[0], k3[0]) + dot(v1[1], k3[1]); + sum1[1].r += dot(v1[0], k4[0]) + dot(v1[1], k4[1]); + sum1[1].g += dot(v1[0], k5[0]) + dot(v1[1], k5[1]); + sum1[1].b += dot(v1[0], k6[0]) + dot(v1[1], k6[1]); + sum1[1].a += dot(v1[0], k7[0]) + dot(v1[1], k7[1]); + + sum2[0].r += dot(v0[0], k8[0]) + dot(v0[1], k8[1]); + sum2[0].g += dot(v0[0], k9[0]) + dot(v0[1], k9[1]); + sum2[0].b += dot(v0[0], ka[0]) + dot(v0[1], ka[1]); + sum2[0].a += dot(v0[0], kb[0]) + dot(v0[1], kb[1]); + sum2[1].r += dot(v0[0], kc[0]) + dot(v0[1], kc[1]); + sum2[1].g += dot(v0[0], kd[0]) + dot(v0[1], kd[1]); + sum2[1].b += dot(v0[0], ke[0]) + dot(v0[1], ke[1]); + sum2[1].a += dot(v0[0], kf[0]) + dot(v0[1], kf[1]); + + sum3[0].r += dot(v1[0], k8[0]) + dot(v1[1], k8[1]); + sum3[0].g += dot(v1[0], k9[0]) + dot(v1[1], k9[1]); + sum3[0].b += dot(v1[0], ka[0]) + dot(v1[1], ka[1]); + sum3[0].a += dot(v1[0], kb[0]) + dot(v1[1], kb[1]); + sum3[1].r += dot(v1[0], kc[0]) + dot(v1[1], kc[1]); + sum3[1].g += dot(v1[0], kd[0]) + dot(v1[1], kd[1]); + sum3[1].b += dot(v1[0], ke[0]) + dot(v1[1], ke[1]); + sum3[1].a += dot(v1[0], kf[0]) + dot(v1[1], kf[1]); + + wx += 8; + } + } + +#else + + ivec2 v_offset = gx2 * stride_w; + ivec2 w_offset = gy2 * psc(h) * kernel_w; + + for (int y = 0; y < psc(h); y++) + { + for (int x = 0; x < kernel_w; x++) + { + afpvec8 v0 = buffer_ld8(bottom_blob_data, v_offset.x + x * dilation_w); + afpvec8 v1 = buffer_ld8(bottom_blob_data, v_offset.y + x * dilation_w); + + afpvec8 k0 = buffer_ld8(weight_data, (w_offset.x + x) * 8 + 0); + afpvec8 k1 = buffer_ld8(weight_data, (w_offset.x + x) * 8 + 1); + afpvec8 k2 = buffer_ld8(weight_data, (w_offset.x + x) * 8 + 2); + afpvec8 k3 = buffer_ld8(weight_data, (w_offset.x + x) * 8 + 3); + afpvec8 k4 = buffer_ld8(weight_data, (w_offset.x + x) * 8 + 4); + afpvec8 k5 = buffer_ld8(weight_data, (w_offset.x + x) * 8 + 5); + afpvec8 k6 = buffer_ld8(weight_data, (w_offset.x + x) * 8 + 6); + afpvec8 k7 = buffer_ld8(weight_data, (w_offset.x + x) * 8 + 7); + + afpvec8 k8 = buffer_ld8(weight_data, (w_offset.y + x) * 8 + 0); + afpvec8 k9 = 
buffer_ld8(weight_data, (w_offset.y + x) * 8 + 1); + afpvec8 ka = buffer_ld8(weight_data, (w_offset.y + x) * 8 + 2); + afpvec8 kb = buffer_ld8(weight_data, (w_offset.y + x) * 8 + 3); + afpvec8 kc = buffer_ld8(weight_data, (w_offset.y + x) * 8 + 4); + afpvec8 kd = buffer_ld8(weight_data, (w_offset.y + x) * 8 + 5); + afpvec8 ke = buffer_ld8(weight_data, (w_offset.y + x) * 8 + 6); + afpvec8 kf = buffer_ld8(weight_data, (w_offset.y + x) * 8 + 7); + + sum0[0].r += dot(v0[0], k0[0]) + dot(v0[1], k0[1]); + sum0[0].g += dot(v0[0], k1[0]) + dot(v0[1], k1[1]); + sum0[0].b += dot(v0[0], k2[0]) + dot(v0[1], k2[1]); + sum0[0].a += dot(v0[0], k3[0]) + dot(v0[1], k3[1]); + sum0[1].r += dot(v0[0], k4[0]) + dot(v0[1], k4[1]); + sum0[1].g += dot(v0[0], k5[0]) + dot(v0[1], k5[1]); + sum0[1].b += dot(v0[0], k6[0]) + dot(v0[1], k6[1]); + sum0[1].a += dot(v0[0], k7[0]) + dot(v0[1], k7[1]); + + sum1[0].r += dot(v1[0], k0[0]) + dot(v1[1], k0[1]); + sum1[0].g += dot(v1[0], k1[0]) + dot(v1[1], k1[1]); + sum1[0].b += dot(v1[0], k2[0]) + dot(v1[1], k2[1]); + sum1[0].a += dot(v1[0], k3[0]) + dot(v1[1], k3[1]); + sum1[1].r += dot(v1[0], k4[0]) + dot(v1[1], k4[1]); + sum1[1].g += dot(v1[0], k5[0]) + dot(v1[1], k5[1]); + sum1[1].b += dot(v1[0], k6[0]) + dot(v1[1], k6[1]); + sum1[1].a += dot(v1[0], k7[0]) + dot(v1[1], k7[1]); + + sum2[0].r += dot(v0[0], k8[0]) + dot(v0[1], k8[1]); + sum2[0].g += dot(v0[0], k9[0]) + dot(v0[1], k9[1]); + sum2[0].b += dot(v0[0], ka[0]) + dot(v0[1], ka[1]); + sum2[0].a += dot(v0[0], kb[0]) + dot(v0[1], kb[1]); + sum2[1].r += dot(v0[0], kc[0]) + dot(v0[1], kc[1]); + sum2[1].g += dot(v0[0], kd[0]) + dot(v0[1], kd[1]); + sum2[1].b += dot(v0[0], ke[0]) + dot(v0[1], ke[1]); + sum2[1].a += dot(v0[0], kf[0]) + dot(v0[1], kf[1]); + + sum3[0].r += dot(v1[0], k8[0]) + dot(v1[1], k8[1]); + sum3[0].g += dot(v1[0], k9[0]) + dot(v1[1], k9[1]); + sum3[0].b += dot(v1[0], ka[0]) + dot(v1[1], ka[1]); + sum3[0].a += dot(v1[0], kb[0]) + dot(v1[1], kb[1]); + sum3[1].r += dot(v1[0], kc[0]) + dot(v1[1], kc[1]); + sum3[1].g += dot(v1[0], kd[0]) + dot(v1[1], kd[1]); + sum3[1].b += dot(v1[0], ke[0]) + dot(v1[1], ke[1]); + sum3[1].a += dot(v1[0], kf[0]) + dot(v1[1], kf[1]); + } + v_offset += psc(w); + w_offset += kernel_w; + } + +#endif + + sum0 = activation_afpvec8(sum0, activation_type, activation_param_0, activation_param_1); + sum1 = activation_afpvec8(sum1, activation_type, activation_param_0, activation_param_1); + sum2 = activation_afpvec8(sum2, activation_type, activation_param_0, activation_param_1); + sum3 = activation_afpvec8(sum3, activation_type, activation_param_0, activation_param_1); + +#if NCNN_image_shader + + image3d_st8(top_blob, ivec3(gx2.x, gy2.x, 0), sum0); + image3d_st8(top_blob, ivec3(gx2.y, gy2.x, 0), sum1); + image3d_st8(top_blob, ivec3(gx2.x, gy2.y, 0), sum2); + image3d_st8(top_blob, ivec3(gx2.y, gy2.y, 0), sum3); + +#else + + const int gi = gy * psc(outw) + gx; + + buffer_st8(top_blob_data, gi, sum0); + if (gx + 1 < psc(outw)) buffer_st8(top_blob_data, gi + 1, sum1); + if (gy + 1 < psc(outh)) buffer_st8(top_blob_data, gi + psc(outw), sum2); + if (gy + 1 < psc(outh) && gx + 1 < psc(outw)) buffer_st8(top_blob_data, gi + psc(outw) + 1, sum3); + +#endif +} \ No newline at end of file diff --git a/src/layer/vulkan/shader/convolution1d_pack8to1.comp b/src/layer/vulkan/shader/convolution1d_pack8to1.comp new file mode 100644 index 00000000000..9d08d3b11af --- /dev/null +++ b/src/layer/vulkan/shader/convolution1d_pack8to1.comp @@ -0,0 +1,178 @@ +// Tencent is pleased to support the open source 
community by making ncnn available. +// +// Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +#extension GL_GOOGLE_include_directive: enable +#include "vulkan_activation.comp" + +layout (constant_id = 0) const int kernel_w = 1; +layout (constant_id = 1) const int dilation_w = 1; +layout (constant_id = 2) const int stride_w = 1; +layout (constant_id = 3) const int bias_term = 0; +layout (constant_id = 4) const int activation_type = 0; +layout (constant_id = 5) const float activation_param_0 = 0; +layout (constant_id = 6) const float activation_param_1 = 0; + +#define shape_constant_id_offset 7 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc1) writeonly uniform unfp image3D top_blob; +layout (binding = 2) uniform unfp sampler3D weight_blob; +layout (binding = 3) uniform unfp sampler3D bias_blob; +#else +layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; }; +layout (binding = 2) readonly buffer weight_blob { sfpvec8 weight_data[]; }; +layout (binding = 3) readonly buffer bias_blob { sfp bias_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x) * 2; + int gy = int(gl_GlobalInvocationID.y) * 2; + + if (gx >= psc(outw) || gy >= psc(outh)) + return; + + const ivec2 gx2 = gx + ivec2(0, 1); + const ivec2 gy2 = gy + ivec2(0, 1); + + afp sum0 = afp(0.0f); + afp sum1 = afp(0.0f); + afp sum2 = afp(0.0f); + afp sum3 = afp(0.0f); + + if (bias_term == 1) + { +#if NCNN_image_shader + sum0 = image3d_ld1(bias_blob, ivec3(gy2.x, 0, 0)); + sum2 = image3d_ld1(bias_blob, ivec3(gy2.y, 0, 0)); +#else + sum0 = buffer_ld1(bias_data, gy2.x); + sum2 = buffer_ld1(bias_data, gy2.y); +#endif + sum1 = sum0; + sum3 = sum2; + } + +#if 
NCNN_image_shader + + ivec2 v_offset = gx2 * stride_w; + + for (int y = 0; y < psc(h); y++) + { + int wx = 0; + + for (int x = 0; x < kernel_w; x++) + { + afpvec8 v0 = image3d_ld8(bottom_blob, ivec3(v_offset.x + x * dilation_w, y, 0)); + afpvec8 v1 = image3d_ld8(bottom_blob, ivec3(v_offset.y + x * dilation_w, y, 0)); + + afpvec8 k0 = image3d_ld8(weight_blob, ivec3(wx, y, gy2.x)); + afpvec8 k1 = image3d_ld8(weight_blob, ivec3(wx, y, gy2.y)); + + sum0 += dot(v0[0], k0[0]) + dot(v0[1], k0[1]); + sum1 += dot(v1[0], k0[0]) + dot(v1[1], k0[1]); + sum2 += dot(v0[0], k1[0]) + dot(v0[1], k1[1]); + sum3 += dot(v1[0], k1[0]) + dot(v1[1], k1[1]); + + wx += 1; + } + } + +#else + + ivec2 v_offset = gx2 * stride_w; + ivec2 w_offset = gy2 * psc(h) * kernel_w; + + for (int y = 0; y < psc(h); y++) + { + for (int x = 0; x < kernel_w; x++) + { + afpvec8 v0 = buffer_ld8(bottom_blob_data, v_offset.x + x * dilation_w); + afpvec8 v1 = buffer_ld8(bottom_blob_data, v_offset.y + x * dilation_w); + + afpvec8 k0 = buffer_ld8(weight_data, w_offset.x + x); + afpvec8 k1 = buffer_ld8(weight_data, w_offset.y + x); + + sum0 += dot(v0[0], k0[0]) + dot(v0[1], k0[1]); + sum1 += dot(v1[0], k0[0]) + dot(v1[1], k0[1]); + sum2 += dot(v0[0], k1[0]) + dot(v0[1], k1[1]); + sum3 += dot(v1[0], k1[0]) + dot(v1[1], k1[1]); + } + v_offset += psc(w); + w_offset += kernel_w; + } + +#endif + + sum0 = activation_afp(sum0, activation_type, activation_param_0, activation_param_1); + sum1 = activation_afp(sum1, activation_type, activation_param_0, activation_param_1); + sum2 = activation_afp(sum2, activation_type, activation_param_0, activation_param_1); + sum3 = activation_afp(sum3, activation_type, activation_param_0, activation_param_1); + +#if NCNN_image_shader + + image3d_st1(top_blob, ivec3(gx2.x, gy2.x, 0), sum0); + image3d_st1(top_blob, ivec3(gx2.y, gy2.x, 0), sum1); + image3d_st1(top_blob, ivec3(gx2.x, gy2.y, 0), sum2); + image3d_st1(top_blob, ivec3(gx2.y, gy2.y, 0), sum3); + +#else + + const int gi = gy * psc(outw) + gx; + + buffer_st1(top_blob_data, gi, sum0); + if (gx + 1 < psc(outw)) buffer_st1(top_blob_data, gi + 1, sum1); + if (gy + 1 < psc(outh)) buffer_st1(top_blob_data, gi + psc(outw), sum2); + if (gy + 1 < psc(outh) && gx + 1 < psc(outw)) buffer_st1(top_blob_data, gi + psc(outw) + 1, sum3); + +#endif +} \ No newline at end of file diff --git a/src/layer/vulkan/shader/convolution1d_pack8to4.comp b/src/layer/vulkan/shader/convolution1d_pack8to4.comp new file mode 100644 index 00000000000..86ca696d584 --- /dev/null +++ b/src/layer/vulkan/shader/convolution1d_pack8to4.comp @@ -0,0 +1,220 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +#extension GL_GOOGLE_include_directive: enable +#include "vulkan_activation.comp" + +layout (constant_id = 0) const int kernel_w = 1; +layout (constant_id = 1) const int dilation_w = 1; +layout (constant_id = 2) const int stride_w = 1; +layout (constant_id = 3) const int bias_term = 0; +layout (constant_id = 4) const int activation_type = 0; +layout (constant_id = 5) const float activation_param_0 = 0; +layout (constant_id = 6) const float activation_param_1 = 0; + +#define shape_constant_id_offset 7 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob; +layout (binding = 2) uniform unfp sampler3D weight_blob; +layout (binding = 3) uniform unfp sampler3D bias_blob; +#else +layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; +layout (binding = 2) readonly buffer weight_blob { sfpvec8 weight_data[]; }; +layout (binding = 3) readonly buffer bias_blob { sfpvec4 bias_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x) * 2; + int gy = int(gl_GlobalInvocationID.y) * 2; + + if (gx >= psc(outw) || gy >= psc(outh)) + return; + + const ivec2 gx2 = gx + ivec2(0, 1); + const ivec2 gy2 = gy + ivec2(0, 1); + + afpvec4 sum0 = afpvec4(0.0f); + afpvec4 sum1 = afpvec4(0.0f); + afpvec4 sum2 = afpvec4(0.0f); + afpvec4 sum3 = afpvec4(0.0f); + + if (bias_term == 1) + { +#if NCNN_image_shader + sum0 = image3d_ld4(bias_blob, ivec3(gy2.x, 0, 0)); + sum2 = image3d_ld4(bias_blob, ivec3(gy2.y, 0, 0)); +#else + sum0 = buffer_ld4(bias_data, gy2.x); + sum2 = buffer_ld4(bias_data, gy2.y); +#endif + sum1 = sum0; + sum3 = sum2; + } + +#if NCNN_image_shader + + ivec2 v_offset = gx2 * stride_w; + + for (int y = 0; y < psc(h); y++) + { + int wx = 0; + + for (int x = 0; x < kernel_w; x++) + { + afpvec8 v0 = image3d_ld8(bottom_blob, ivec3(v_offset.x + x * dilation_w, y, 0)); + afpvec8 v1 = image3d_ld8(bottom_blob, ivec3(v_offset.y + x * dilation_w, y, 0)); + + afpvec8 k0 = image3d_ld8(weight_blob, ivec3(wx + 0, y, gy2.x)); + afpvec8 k1 = image3d_ld8(weight_blob, ivec3(wx + 1, y, gy2.x)); + afpvec8 k2 = image3d_ld8(weight_blob, ivec3(wx + 2, y, gy2.x)); + afpvec8 k3 = image3d_ld8(weight_blob, ivec3(wx + 3, y, gy2.x)); + afpvec8 k4 = image3d_ld8(weight_blob, 
ivec3(wx + 0, y, gy2.y)); + afpvec8 k5 = image3d_ld8(weight_blob, ivec3(wx + 1, y, gy2.y)); + afpvec8 k6 = image3d_ld8(weight_blob, ivec3(wx + 2, y, gy2.y)); + afpvec8 k7 = image3d_ld8(weight_blob, ivec3(wx + 3, y, gy2.y)); + + sum0.r += dot(v0[0], k0[0]) + dot(v0[1], k0[1]); + sum0.g += dot(v0[0], k1[0]) + dot(v0[1], k1[1]); + sum0.b += dot(v0[0], k2[0]) + dot(v0[1], k2[1]); + sum0.a += dot(v0[0], k3[0]) + dot(v0[1], k3[1]); + + sum1.r += dot(v1[0], k0[0]) + dot(v1[1], k0[1]); + sum1.g += dot(v1[0], k1[0]) + dot(v1[1], k1[1]); + sum1.b += dot(v1[0], k2[0]) + dot(v1[1], k2[1]); + sum1.a += dot(v1[0], k3[0]) + dot(v1[1], k3[1]); + + sum2.r += dot(v0[0], k4[0]) + dot(v0[1], k4[1]); + sum2.g += dot(v0[0], k5[0]) + dot(v0[1], k5[1]); + sum2.b += dot(v0[0], k6[0]) + dot(v0[1], k6[1]); + sum2.a += dot(v0[0], k7[0]) + dot(v0[1], k7[1]); + + sum3.r += dot(v1[0], k4[0]) + dot(v1[1], k4[1]); + sum3.g += dot(v1[0], k5[0]) + dot(v1[1], k5[1]); + sum3.b += dot(v1[0], k6[0]) + dot(v1[1], k6[1]); + sum3.a += dot(v1[0], k7[0]) + dot(v1[1], k7[1]); + + wx += 4; + } + } + +#else + + ivec2 v_offset = gx2 * stride_w; + ivec2 w_offset = gy2 * psc(h) * kernel_w; + + for (int y = 0; y < psc(h); y++) + { + for (int x = 0; x < kernel_w; x++) + { + afpvec8 v0 = buffer_ld8(bottom_blob_data, v_offset.x + x * dilation_w); + afpvec8 v1 = buffer_ld8(bottom_blob_data, v_offset.y + x * dilation_w); + + afpvec8 k0 = buffer_ld8(weight_data, (w_offset.x + x) * 4 + 0); + afpvec8 k1 = buffer_ld8(weight_data, (w_offset.x + x) * 4 + 1); + afpvec8 k2 = buffer_ld8(weight_data, (w_offset.x + x) * 4 + 2); + afpvec8 k3 = buffer_ld8(weight_data, (w_offset.x + x) * 4 + 3); + afpvec8 k4 = buffer_ld8(weight_data, (w_offset.y + x) * 4 + 0); + afpvec8 k5 = buffer_ld8(weight_data, (w_offset.y + x) * 4 + 1); + afpvec8 k6 = buffer_ld8(weight_data, (w_offset.y + x) * 4 + 2); + afpvec8 k7 = buffer_ld8(weight_data, (w_offset.y + x) * 4 + 3); + + sum0.r += dot(v0[0], k0[0]) + dot(v0[1], k0[1]); + sum0.g += dot(v0[0], k1[0]) + dot(v0[1], k1[1]); + sum0.b += dot(v0[0], k2[0]) + dot(v0[1], k2[1]); + sum0.a += dot(v0[0], k3[0]) + dot(v0[1], k3[1]); + + sum1.r += dot(v1[0], k0[0]) + dot(v1[1], k0[1]); + sum1.g += dot(v1[0], k1[0]) + dot(v1[1], k1[1]); + sum1.b += dot(v1[0], k2[0]) + dot(v1[1], k2[1]); + sum1.a += dot(v1[0], k3[0]) + dot(v1[1], k3[1]); + + sum2.r += dot(v0[0], k4[0]) + dot(v0[1], k4[1]); + sum2.g += dot(v0[0], k5[0]) + dot(v0[1], k5[1]); + sum2.b += dot(v0[0], k6[0]) + dot(v0[1], k6[1]); + sum2.a += dot(v0[0], k7[0]) + dot(v0[1], k7[1]); + + sum3.r += dot(v1[0], k4[0]) + dot(v1[1], k4[1]); + sum3.g += dot(v1[0], k5[0]) + dot(v1[1], k5[1]); + sum3.b += dot(v1[0], k6[0]) + dot(v1[1], k6[1]); + sum3.a += dot(v1[0], k7[0]) + dot(v1[1], k7[1]); + } + v_offset += psc(w); + w_offset += kernel_w; + } + +#endif + + sum0 = activation_afpvec4(sum0, activation_type, activation_param_0, activation_param_1); + sum1 = activation_afpvec4(sum1, activation_type, activation_param_0, activation_param_1); + sum2 = activation_afpvec4(sum2, activation_type, activation_param_0, activation_param_1); + sum3 = activation_afpvec4(sum3, activation_type, activation_param_0, activation_param_1); + +#if NCNN_image_shader + + image3d_st4(top_blob, ivec3(gx2.x, gy2.x, 0), sum0); + image3d_st4(top_blob, ivec3(gx2.y, gy2.x, 0), sum1); + image3d_st4(top_blob, ivec3(gx2.x, gy2.y, 0), sum2); + image3d_st4(top_blob, ivec3(gx2.y, gy2.y, 0), sum3); + +#else + + const int gi = gy * psc(outw) + gx; + + buffer_st4(top_blob_data, gi, sum0); + if (gx + 1 < psc(outw)) 
buffer_st4(top_blob_data, gi + 1, sum1); + if (gy + 1 < psc(outh)) buffer_st4(top_blob_data, gi + psc(outw), sum2); + if (gy + 1 < psc(outh) && gx + 1 < psc(outw)) buffer_st4(top_blob_data, gi + psc(outw) + 1, sum3); + +#endif +} \ No newline at end of file
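
All of the shader variants above compute the same Convolution1D: each invocation produces a 2x2 tile of outputs (two output positions gx2, two output channel groups gy2), accumulates dot products of packed input vectors against packed weights over all input rows and kernel taps, applies the fused activation, and guards the three extra stores against the right and bottom edges. For orientation, here is a minimal scalar C++ sketch of that arithmetic, assuming plain float storage with no packing and padding already applied; the function and parameter names are illustrative only, not ncnn API:

// Scalar reference for the Convolution1D the shaders above implement.
// Assumptions (illustrative, not ncnn API): bottom is h x w row-major floats
// with padding already applied; weight is num_output x h x kernel_w;
// bias has num_output entries; the activation step is omitted.
#include <vector>
#include <cstdio>

static std::vector<float> convolution1d_ref(const std::vector<float>& bottom, int w, int h,
                                            const std::vector<float>& weight,
                                            const std::vector<float>& bias,
                                            int num_output, int kernel_w,
                                            int dilation_w, int stride_w)
{
    const int outw = (w - dilation_w * (kernel_w - 1) - 1) / stride_w + 1;
    std::vector<float> top(num_output * outw, 0.f);

    for (int q = 0; q < num_output; q++)
    {
        for (int ox = 0; ox < outw; ox++)
        {
            float sum = bias.empty() ? 0.f : bias[q];

            // same loop nest as the shaders: input rows (h) outer, kernel taps inner
            for (int y = 0; y < h; y++)
            {
                for (int x = 0; x < kernel_w; x++)
                {
                    const float v = bottom[y * w + ox * stride_w + x * dilation_w];
                    const float k = weight[(q * h + y) * kernel_w + x];
                    sum += v * k;
                }
            }

            top[q * outw + ox] = sum;
        }
    }

    return top;
}

int main()
{
    // 1 input row of 8 samples, 1 output channel, kernel 3, stride 1, dilation 1
    std::vector<float> bottom = {1, 2, 3, 4, 5, 6, 7, 8};
    std::vector<float> weight = {1, 0, -1};
    std::vector<float> bias = {0.5f};

    std::vector<float> top = convolution1d_ref(bottom, 8, 1, weight, bias, 1, 3, 1, 1);
    for (float v : top)
        printf("%f\n", v); // each window gives a - c = -2, plus bias -> -1.5
    return 0;
}

The GPU kernels parallelize the q loop as gy (packed by 4 or 8 channels per element) and the ox loop as gx, and the per-invocation 2x2 tile lets each weight load feed two output positions, which is why every variant reads k0..kN once but accumulates into sum0..sum3.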