From c45c01c7c10dc2482ee973f602580ecfaaa102d7 Mon Sep 17 00:00:00 2001 From: nihui Date: Thu, 27 Jul 2023 19:21:13 +0800 Subject: [PATCH] enable VK_KHR_cooperative_matrix (#4823) * enable VK_KHR_cooperative_matrix * add khr cm shader * update glslang * print matrix info --- .ci/test-coverage.yml | 20 +- glslang | 2 +- src/gpu.cpp | 193 ++++++++-- src/gpu.h | 6 + src/layer/vulkan/convolution_vulkan.cpp | 290 ++++++++++++--- src/layer/vulkan/deconvolution_vulkan.cpp | 72 +++- .../convolution_pack4_1x1s1d1_cm_16_8_8.comp | 257 -------------- ...olution_pack4_1x1s1d1_khr_cm_16_16_16.comp | 239 +++++++++++++ ...nvolution_pack4_1x1s1d1_khr_cm_16_8_8.comp | 298 ++++++++++++++++ ...volution_pack4_1x1s1d1_nv_cm_16_16_16.comp | 239 +++++++++++++ ...onvolution_pack4_1x1s1d1_nv_cm_16_8_8.comp | 298 ++++++++++++++++ ...pack4_3x3s1d1_winograd_gemm_cm_16_8_8.comp | 210 ----------- ...3x3s1d1_winograd_gemm_khr_cm_16_16_16.comp | 196 ++++++++++ ...4_3x3s1d1_winograd_gemm_khr_cm_16_8_8.comp | 241 +++++++++++++ ..._3x3s1d1_winograd_gemm_nv_cm_16_16_16.comp | 196 ++++++++++ ...k4_3x3s1d1_winograd_gemm_nv_cm_16_8_8.comp | 241 +++++++++++++ .../convolution_pack4_gemm_cm_16_8_8.comp | 294 --------------- ...onvolution_pack4_gemm_khr_cm_16_16_16.comp | 276 +++++++++++++++ .../convolution_pack4_gemm_khr_cm_16_8_8.comp | 335 ++++++++++++++++++ ...convolution_pack4_gemm_nv_cm_16_16_16.comp | 276 +++++++++++++++ .../convolution_pack4_gemm_nv_cm_16_8_8.comp | 335 ++++++++++++++++++ .../deconvolution_pack4_gemm_cm_16_8_8.comp | 211 ----------- ...onvolution_pack4_gemm_khr_cm_16_16_16.comp | 195 ++++++++++ ...econvolution_pack4_gemm_khr_cm_16_8_8.comp | 239 +++++++++++++ ...convolution_pack4_gemm_nv_cm_16_16_16.comp | 195 ++++++++++ ...deconvolution_pack4_gemm_nv_cm_16_8_8.comp | 239 +++++++++++++ src/vulkan_header_fix.h | 57 +++ tests/testutil.h | 6 +- 28 files changed, 4580 insertions(+), 1076 deletions(-) delete mode 100644 src/layer/vulkan/shader/convolution_pack4_1x1s1d1_cm_16_8_8.comp 
create mode 100644 src/layer/vulkan/shader/convolution_pack4_1x1s1d1_khr_cm_16_16_16.comp create mode 100644 src/layer/vulkan/shader/convolution_pack4_1x1s1d1_khr_cm_16_8_8.comp create mode 100644 src/layer/vulkan/shader/convolution_pack4_1x1s1d1_nv_cm_16_16_16.comp create mode 100644 src/layer/vulkan/shader/convolution_pack4_1x1s1d1_nv_cm_16_8_8.comp delete mode 100644 src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd_gemm_cm_16_8_8.comp create mode 100644 src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd_gemm_khr_cm_16_16_16.comp create mode 100644 src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd_gemm_khr_cm_16_8_8.comp create mode 100644 src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd_gemm_nv_cm_16_16_16.comp create mode 100644 src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd_gemm_nv_cm_16_8_8.comp delete mode 100644 src/layer/vulkan/shader/convolution_pack4_gemm_cm_16_8_8.comp create mode 100644 src/layer/vulkan/shader/convolution_pack4_gemm_khr_cm_16_16_16.comp create mode 100644 src/layer/vulkan/shader/convolution_pack4_gemm_khr_cm_16_8_8.comp create mode 100644 src/layer/vulkan/shader/convolution_pack4_gemm_nv_cm_16_16_16.comp create mode 100644 src/layer/vulkan/shader/convolution_pack4_gemm_nv_cm_16_8_8.comp delete mode 100644 src/layer/vulkan/shader/deconvolution_pack4_gemm_cm_16_8_8.comp create mode 100644 src/layer/vulkan/shader/deconvolution_pack4_gemm_khr_cm_16_16_16.comp create mode 100644 src/layer/vulkan/shader/deconvolution_pack4_gemm_khr_cm_16_8_8.comp create mode 100644 src/layer/vulkan/shader/deconvolution_pack4_gemm_nv_cm_16_16_16.comp create mode 100644 src/layer/vulkan/shader/deconvolution_pack4_gemm_nv_cm_16_8_8.comp diff --git a/.ci/test-coverage.yml b/.ci/test-coverage.yml index 9e5a054a847..a693f415883 100644 --- a/.ci/test-coverage.yml +++ b/.ci/test-coverage.yml @@ -52,7 +52,7 @@ jobs: uses: cache@1.* with: cachePaths: swiftshader-install - cacheKey: swiftshader-linux-install-20230420 + 
cacheKey: swiftshader-linux-install-20230420-1 - name: checkout-swiftshader if: steps.cache-swiftshader.outputs.cacheHit != 'true' @@ -126,7 +126,7 @@ jobs: uses: cache@1.* with: cachePaths: lavapipe-install - cacheKey: lavapipe-linux-install-20211127-3 + cacheKey: lavapipe-linux-install-20211127-4 - name: checkout-lavapipe if: steps.cache-lavapipe.outputs.cacheHit != 'true' @@ -280,7 +280,7 @@ jobs: uses: cache@1.* with: cachePaths: qemu-install - cacheKey: qemu-arm-install-20220831 + cacheKey: qemu-arm-install-20220831-1 - name: checkout-qemu if: steps.cache-qemu.outputs.cacheHit != 'true' @@ -386,7 +386,7 @@ jobs: uses: cache@1.* with: cachePaths: qemu-install - cacheKey: qemu-aarch64-install-20220831 + cacheKey: qemu-aarch64-install-20220831-1 - name: checkout-qemu if: steps.cache-qemu.outputs.cacheHit != 'true' @@ -473,7 +473,7 @@ jobs: uses: cache@1.* with: cachePaths: qemu-install - cacheKey: qemu-mipsel-install-20220831 + cacheKey: qemu-mipsel-install-20220831-1 - name: checkout-qemu if: steps.cache-qemu.outputs.cacheHit != 'true' @@ -552,7 +552,7 @@ jobs: uses: cache@1.* with: cachePaths: qemu-install - cacheKey: qemu-mips64el-install-20220831 + cacheKey: qemu-mips64el-install-20220831-1 - name: checkout-qemu if: steps.cache-qemu.outputs.cacheHit != 'true' @@ -631,7 +631,7 @@ jobs: uses: cache@1.* with: cachePaths: qemu-install - cacheKey: qemu-riscv64-install-20230624 + cacheKey: qemu-riscv64-install-20230624-1 - name: checkout-qemu if: steps.cache-qemu.outputs.cacheHit != 'true' @@ -716,7 +716,7 @@ jobs: uses: cache@1.* with: cachePaths: qemu-install - cacheKey: qemu-riscv64-install-20230624 + cacheKey: qemu-riscv64-install-20230624-1 - name: checkout-qemu if: steps.cache-qemu.outputs.cacheHit != 'true' @@ -751,7 +751,7 @@ jobs: uses: cache@1.* with: cachePaths: rv64gcv-install - cacheKey: rv64gcv-linux-install-20221029 + cacheKey: rv64gcv-linux-install-20221029-1 - name: checkout-riscv-gnu-toolchain if: steps.cache-rv64gcv.outputs.cacheHit != 'true' @@ 
-861,7 +861,7 @@ jobs: uses: cache@1.* with: cachePaths: qemu-install - cacheKey: qemu-loongarch64-install-20230524 + cacheKey: qemu-loongarch64-install-20230524-1 - name: checkout-qemu if: steps.cache-qemu.outputs.cacheHit != 'true' diff --git a/glslang b/glslang index 88fd417b0bb..4420f9b33ba 160000 --- a/glslang +++ b/glslang @@ -1 +1 @@ -Subproject commit 88fd417b0bb7d91755961c70e846d274c182f2b0 +Subproject commit 4420f9b33ba44928d5c82d9eae0c3bb4d5674c05 diff --git a/src/gpu.cpp b/src/gpu.cpp index c743d42c90b..88c44d53f9b 100644 --- a/src/gpu.cpp +++ b/src/gpu.cpp @@ -110,6 +110,9 @@ int support_VK_EXT_validation_flags = 0; int support_VK_KHR_android_surface = 0; #endif // __ANDROID_API__ >= 26 +// VK_KHR_cooperative_matrix +PFN_vkGetPhysicalDeviceCooperativeMatrixPropertiesKHR vkGetPhysicalDeviceCooperativeMatrixPropertiesKHR = 0; + // VK_KHR_external_memory_capabilities PFN_vkGetPhysicalDeviceExternalBufferPropertiesKHR vkGetPhysicalDeviceExternalBufferPropertiesKHR = 0; @@ -223,6 +226,8 @@ class GpuInfoPrivate // cooperative matrix bool support_cooperative_matrix; bool support_cooperative_matrix_16_8_8; + bool support_cooperative_matrix_16_8_16; + bool support_cooperative_matrix_16_16_16; // extension capability int support_VK_KHR_8bit_storage; @@ -230,6 +235,7 @@ class GpuInfoPrivate int support_VK_KHR_bind_memory2; int support_VK_KHR_buffer_device_address; int support_VK_KHR_create_renderpass2; + int support_VK_KHR_cooperative_matrix; int support_VK_KHR_dedicated_allocation; int support_VK_KHR_descriptor_update_template; int support_VK_KHR_external_memory; @@ -527,6 +533,16 @@ bool GpuInfo::support_cooperative_matrix_16_8_8() const return d->support_cooperative_matrix_16_8_8; } +bool GpuInfo::support_cooperative_matrix_16_8_16() const +{ + return d->support_cooperative_matrix_16_8_16; +} + +bool GpuInfo::support_cooperative_matrix_16_16_16() const +{ + return d->support_cooperative_matrix_16_16_16; +} + int GpuInfo::support_VK_KHR_8bit_storage() const { 
return d->support_VK_KHR_8bit_storage; @@ -552,6 +568,11 @@ int GpuInfo::support_VK_KHR_create_renderpass2() const return d->support_VK_KHR_create_renderpass2; } +int GpuInfo::support_VK_KHR_cooperative_matrix() const +{ + return d->support_VK_KHR_cooperative_matrix; +} + int GpuInfo::support_VK_KHR_dedicated_allocation() const { return d->support_VK_KHR_dedicated_allocation; @@ -709,6 +730,11 @@ static int init_instance_extension() } #endif // __ANDROID_API__ >= 26 + // VK_KHR_cooperative_matrix + { + vkGetPhysicalDeviceCooperativeMatrixPropertiesKHR = (PFN_vkGetPhysicalDeviceCooperativeMatrixPropertiesKHR)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceCooperativeMatrixPropertiesKHR"); + } + // VK_NV_cooperative_matrix { vkGetPhysicalDeviceCooperativeMatrixPropertiesNV = (PFN_vkGetPhysicalDeviceCooperativeMatrixPropertiesNV)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceCooperativeMatrixPropertiesNV"); @@ -1399,6 +1425,7 @@ int create_gpu_instance() gpu_info.support_VK_KHR_bind_memory2 = 0; gpu_info.support_VK_KHR_buffer_device_address = 0; gpu_info.support_VK_KHR_create_renderpass2 = 0; + gpu_info.support_VK_KHR_cooperative_matrix = 0; gpu_info.support_VK_KHR_dedicated_allocation = 0; gpu_info.support_VK_KHR_descriptor_update_template = 0; gpu_info.support_VK_KHR_external_memory = 0; @@ -1439,6 +1466,8 @@ int create_gpu_instance() gpu_info.support_VK_KHR_buffer_device_address = exp.specVersion; else if (strcmp(exp.extensionName, "VK_KHR_create_renderpass2") == 0) gpu_info.support_VK_KHR_create_renderpass2 = exp.specVersion; + else if (strcmp(exp.extensionName, "VK_KHR_cooperative_matrix") == 0) + gpu_info.support_VK_KHR_cooperative_matrix = exp.specVersion; else if (strcmp(exp.extensionName, "VK_KHR_dedicated_allocation") == 0) gpu_info.support_VK_KHR_dedicated_allocation = exp.specVersion; else if (strcmp(exp.extensionName, "VK_KHR_descriptor_update_template") == 0) @@ -1495,6 +1524,12 @@ int create_gpu_instance() 
gpu_info.support_VK_EXT_buffer_device_address = 0; } + if (gpu_info.support_VK_KHR_cooperative_matrix) + { + // we prefer khr extension + gpu_info.support_VK_NV_cooperative_matrix = 0; + } + // check features gpu_info.support_fp16_packed = true; gpu_info.support_fp16_storage = false; @@ -1505,6 +1540,8 @@ int create_gpu_instance() gpu_info.support_ycbcr_conversion = false; gpu_info.support_cooperative_matrix = false; gpu_info.support_cooperative_matrix_16_8_8 = false; + gpu_info.support_cooperative_matrix_16_8_16 = false; + gpu_info.support_cooperative_matrix_16_16_16 = false; if (support_VK_KHR_get_physical_device_properties2) { void* queryExtensionFeatures = 0; @@ -1550,14 +1587,22 @@ int create_gpu_instance() } // query cooperative_matrix - VkPhysicalDeviceCooperativeMatrixFeaturesNV queryCooperativeMatrixFeatures; - queryCooperativeMatrixFeatures.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_MATRIX_FEATURES_NV; + VkPhysicalDeviceCooperativeMatrixFeaturesKHR queryCooperativeMatrixFeatures; + queryCooperativeMatrixFeatures.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_MATRIX_FEATURES_KHR; queryCooperativeMatrixFeatures.pNext = 0; - if (gpu_info.support_VK_NV_cooperative_matrix) + VkPhysicalDeviceCooperativeMatrixFeaturesNV queryCooperativeMatrixFeaturesNV; + queryCooperativeMatrixFeaturesNV.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_MATRIX_FEATURES_NV; + queryCooperativeMatrixFeaturesNV.pNext = 0; + if (gpu_info.support_VK_KHR_cooperative_matrix) { queryCooperativeMatrixFeatures.pNext = queryExtensionFeatures; queryExtensionFeatures = &queryCooperativeMatrixFeatures; } + else if (gpu_info.support_VK_NV_cooperative_matrix) + { + queryCooperativeMatrixFeaturesNV.pNext = queryExtensionFeatures; + queryExtensionFeatures = &queryCooperativeMatrixFeaturesNV; + } VkPhysicalDeviceFeatures2KHR queryFeatures; queryFeatures.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2_KHR; @@ -1583,10 +1628,14 @@ int create_gpu_instance() { 
gpu_info.support_ycbcr_conversion = querySamplerYcbcrConversionFeatures.samplerYcbcrConversion; } - if (gpu_info.support_VK_NV_cooperative_matrix) + if (gpu_info.support_VK_KHR_cooperative_matrix) { gpu_info.support_cooperative_matrix = queryCooperativeMatrixFeatures.cooperativeMatrix; } + else if (gpu_info.support_VK_NV_cooperative_matrix) + { + gpu_info.support_cooperative_matrix = queryCooperativeMatrixFeaturesNV.cooperativeMatrix; + } } else { @@ -1622,36 +1671,97 @@ int create_gpu_instance() if (gpu_info.support_cooperative_matrix) { // query supported cooperative matrix types and operations - uint32_t propertyCount = 0; - ret = vkGetPhysicalDeviceCooperativeMatrixPropertiesNV(physicalDevice, &propertyCount, 0); - if (ret != VK_SUCCESS) + if (gpu_info.support_VK_KHR_cooperative_matrix) { - NCNN_LOGE("vkGetPhysicalDeviceCooperativeMatrixPropertiesNV failed %d", ret); - } + uint32_t propertyCount = 0; + ret = vkGetPhysicalDeviceCooperativeMatrixPropertiesKHR(physicalDevice, &propertyCount, 0); + if (ret != VK_SUCCESS) + { + NCNN_LOGE("vkGetPhysicalDeviceCooperativeMatrixPropertiesKHR failed %d", ret); + } - std::vector properties(propertyCount); - for (uint32_t j = 0; j < properties.size(); j++) - { - properties[j].sType = VK_STRUCTURE_TYPE_COOPERATIVE_MATRIX_PROPERTIES_NV; - properties[j].pNext = 0; + std::vector properties(propertyCount); + ret = vkGetPhysicalDeviceCooperativeMatrixPropertiesKHR(physicalDevice, &propertyCount, properties.data()); + if (ret != VK_SUCCESS) + { + NCNN_LOGE("vkGetPhysicalDeviceCooperativeMatrixPropertiesKHR failed %d", ret); + } + + for (uint32_t j = 0; j < properties.size(); j++) + { + const VkCooperativeMatrixPropertiesKHR& cmp = properties[j]; + // NCNN_LOGE("cpm %2d %2d %2d %d %d %d %d %d", cmp.MSize, cmp.NSize, cmp.KSize, cmp.AType, cmp.BType, cmp.CType, cmp.ResultType, cmp.scope); + + if (cmp.MSize == 16 && cmp.NSize == 8 && cmp.KSize == 8 + && cmp.AType == VK_COMPONENT_TYPE_FLOAT16_KHR && cmp.BType == 
VK_COMPONENT_TYPE_FLOAT16_KHR + && cmp.CType == VK_COMPONENT_TYPE_FLOAT32_KHR && cmp.ResultType == VK_COMPONENT_TYPE_FLOAT32_KHR + && cmp.scope == VK_SCOPE_SUBGROUP_KHR) + { + gpu_info.support_cooperative_matrix_16_8_8 = true; + } + if (cmp.MSize == 16 && cmp.NSize == 8 && cmp.KSize == 16 + && cmp.AType == VK_COMPONENT_TYPE_FLOAT16_KHR && cmp.BType == VK_COMPONENT_TYPE_FLOAT16_KHR + && cmp.CType == VK_COMPONENT_TYPE_FLOAT32_KHR && cmp.ResultType == VK_COMPONENT_TYPE_FLOAT32_KHR + && cmp.scope == VK_SCOPE_SUBGROUP_KHR) + { + gpu_info.support_cooperative_matrix_16_8_16 = true; + } + if (cmp.MSize == 16 && cmp.NSize == 16 && cmp.KSize == 16 + && cmp.AType == VK_COMPONENT_TYPE_FLOAT16_KHR && cmp.BType == VK_COMPONENT_TYPE_FLOAT16_KHR + && cmp.CType == VK_COMPONENT_TYPE_FLOAT32_KHR && cmp.ResultType == VK_COMPONENT_TYPE_FLOAT32_KHR + && cmp.scope == VK_SCOPE_SUBGROUP_KHR) + { + gpu_info.support_cooperative_matrix_16_16_16 = true; + } + } } - ret = vkGetPhysicalDeviceCooperativeMatrixPropertiesNV(physicalDevice, &propertyCount, properties.data()); - if (ret != VK_SUCCESS) + else { - NCNN_LOGE("vkGetPhysicalDeviceCooperativeMatrixPropertiesNV failed %d", ret); - } + uint32_t propertyCount = 0; + ret = vkGetPhysicalDeviceCooperativeMatrixPropertiesNV(physicalDevice, &propertyCount, 0); + if (ret != VK_SUCCESS) + { + NCNN_LOGE("vkGetPhysicalDeviceCooperativeMatrixPropertiesNV failed %d", ret); + } - for (uint32_t j = 0; j < properties.size(); j++) - { - const VkCooperativeMatrixPropertiesNV& cmp = properties[j]; - // NCNN_LOGE("cpm %2d %2d %2d %d %d %d %d %d", cmp.MSize, cmp.NSize, cmp.KSize, cmp.AType, cmp.BType, cmp.CType, cmp.DType, cmp.scope); + std::vector properties(propertyCount); + for (uint32_t j = 0; j < properties.size(); j++) + { + properties[j].sType = VK_STRUCTURE_TYPE_COOPERATIVE_MATRIX_PROPERTIES_NV; + properties[j].pNext = 0; + } + ret = vkGetPhysicalDeviceCooperativeMatrixPropertiesNV(physicalDevice, &propertyCount, properties.data()); + if (ret != 
VK_SUCCESS) + { + NCNN_LOGE("vkGetPhysicalDeviceCooperativeMatrixPropertiesNV failed %d", ret); + } - if (cmp.MSize == 16 && cmp.NSize == 8 && cmp.KSize == 8 - && cmp.AType == VK_COMPONENT_TYPE_FLOAT16_NV && cmp.BType == VK_COMPONENT_TYPE_FLOAT16_NV - && cmp.CType == VK_COMPONENT_TYPE_FLOAT32_NV && cmp.DType == VK_COMPONENT_TYPE_FLOAT32_NV - && cmp.scope == VK_SCOPE_SUBGROUP_NV) + for (uint32_t j = 0; j < properties.size(); j++) { - gpu_info.support_cooperative_matrix_16_8_8 = true; + const VkCooperativeMatrixPropertiesNV& cmp = properties[j]; + // NCNN_LOGE("cpm %2d %2d %2d %d %d %d %d %d", cmp.MSize, cmp.NSize, cmp.KSize, cmp.AType, cmp.BType, cmp.CType, cmp.DType, cmp.scope); + + if (cmp.MSize == 16 && cmp.NSize == 8 && cmp.KSize == 8 + && cmp.AType == VK_COMPONENT_TYPE_FLOAT16_NV && cmp.BType == VK_COMPONENT_TYPE_FLOAT16_NV + && cmp.CType == VK_COMPONENT_TYPE_FLOAT32_NV && cmp.DType == VK_COMPONENT_TYPE_FLOAT32_NV + && cmp.scope == VK_SCOPE_SUBGROUP_NV) + { + gpu_info.support_cooperative_matrix_16_8_8 = true; + } + if (cmp.MSize == 16 && cmp.NSize == 8 && cmp.KSize == 16 + && cmp.AType == VK_COMPONENT_TYPE_FLOAT16_NV && cmp.BType == VK_COMPONENT_TYPE_FLOAT16_NV + && cmp.CType == VK_COMPONENT_TYPE_FLOAT32_NV && cmp.DType == VK_COMPONENT_TYPE_FLOAT32_NV + && cmp.scope == VK_SCOPE_SUBGROUP_NV) + { + gpu_info.support_cooperative_matrix_16_8_16 = true; + } + if (cmp.MSize == 16 && cmp.NSize == 16 && cmp.KSize == 16 + && cmp.AType == VK_COMPONENT_TYPE_FLOAT16_NV && cmp.BType == VK_COMPONENT_TYPE_FLOAT16_NV + && cmp.CType == VK_COMPONENT_TYPE_FLOAT32_NV && cmp.DType == VK_COMPONENT_TYPE_FLOAT32_NV + && cmp.scope == VK_SCOPE_SUBGROUP_NV) + { + gpu_info.support_cooperative_matrix_16_16_16 = true; + } } } } @@ -1668,10 +1778,14 @@ int create_gpu_instance() gpu_info.support_fp16_packed, gpu_info.support_fp16_storage, gpu_info.support_fp16_arithmetic, gpu_info.support_int8_packed, gpu_info.support_int8_storage, gpu_info.support_int8_arithmetic); - NCNN_LOGE("[%u %s] 
subgroup=%u basic=%d vote=%d ballot=%d shuffle=%d", i, physicalDeviceProperties.deviceName, + NCNN_LOGE("[%u %s] subgroup=%u basic/vote/ballot/shuffle=%d/%d/%d/%d", i, physicalDeviceProperties.deviceName, gpu_info.subgroup_size, gpu_info.support_subgroup_basic, gpu_info.support_subgroup_vote, gpu_info.support_subgroup_ballot, gpu_info.support_subgroup_shuffle); + NCNN_LOGE("[%u %s] fp16-matrix-16_8_8/16_8_16/16_16_16=%d/%d/%d", i, physicalDeviceProperties.deviceName, + gpu_info.support_cooperative_matrix_16_8_8, gpu_info.support_cooperative_matrix_16_8_16, + gpu_info.support_cooperative_matrix_16_16_16); + gpu_info_index++; } @@ -2038,6 +2152,8 @@ VulkanDevice::VulkanDevice(int device_index) enabledExtensions.push_back("VK_KHR_buffer_device_address"); if (info.support_VK_KHR_create_renderpass2()) enabledExtensions.push_back("VK_KHR_create_renderpass2"); + if (info.support_VK_KHR_cooperative_matrix()) + enabledExtensions.push_back("VK_KHR_cooperative_matrix"); if (info.support_VK_KHR_dedicated_allocation()) enabledExtensions.push_back("VK_KHR_dedicated_allocation"); if (info.support_VK_KHR_descriptor_update_template()) @@ -2140,15 +2256,28 @@ VulkanDevice::VulkanDevice(int device_index) } // enable cooperative matrix - VkPhysicalDeviceCooperativeMatrixFeaturesNV queryCooperativeMatrixFeatures; - queryCooperativeMatrixFeatures.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_MATRIX_FEATURES_NV; + VkPhysicalDeviceCooperativeMatrixFeaturesKHR queryCooperativeMatrixFeatures; + queryCooperativeMatrixFeatures.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_MATRIX_FEATURES_KHR; queryCooperativeMatrixFeatures.pNext = 0; queryCooperativeMatrixFeatures.cooperativeMatrix = info.support_cooperative_matrix(); queryCooperativeMatrixFeatures.cooperativeMatrixRobustBufferAccess = VK_FALSE; + VkPhysicalDeviceCooperativeMatrixFeaturesNV queryCooperativeMatrixFeaturesNV; + queryCooperativeMatrixFeaturesNV.sType = 
VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_MATRIX_FEATURES_NV; + queryCooperativeMatrixFeaturesNV.pNext = 0; + queryCooperativeMatrixFeaturesNV.cooperativeMatrix = info.support_cooperative_matrix(); + queryCooperativeMatrixFeaturesNV.cooperativeMatrixRobustBufferAccess = VK_FALSE; if (support_VK_KHR_get_physical_device_properties2 && info.support_cooperative_matrix()) { - queryCooperativeMatrixFeatures.pNext = enabledExtensionFeatures; - enabledExtensionFeatures = &queryCooperativeMatrixFeatures; + if (info.support_VK_KHR_cooperative_matrix()) + { + queryCooperativeMatrixFeatures.pNext = enabledExtensionFeatures; + enabledExtensionFeatures = &queryCooperativeMatrixFeatures; + } + else + { + queryCooperativeMatrixFeaturesNV.pNext = enabledExtensionFeatures; + enabledExtensionFeatures = &queryCooperativeMatrixFeaturesNV; + } } std::vector compute_queue_priorities(info.compute_queue_count(), 1.f); // 0.f ~ 1.f diff --git a/src/gpu.h b/src/gpu.h index 345329f7d47..1eff228e4eb 100644 --- a/src/gpu.h +++ b/src/gpu.h @@ -55,6 +55,9 @@ extern int support_VK_EXT_validation_flags; extern int support_VK_KHR_android_surface; #endif // __ANDROID_API__ >= 26 +// VK_KHR_cooperative_matrix +extern PFN_vkGetPhysicalDeviceCooperativeMatrixPropertiesKHR vkGetPhysicalDeviceCooperativeMatrixPropertiesKHR; + // VK_KHR_external_memory_capabilities extern PFN_vkGetPhysicalDeviceExternalBufferPropertiesKHR vkGetPhysicalDeviceExternalBufferPropertiesKHR; @@ -176,6 +179,8 @@ class NCNN_EXPORT GpuInfo // cooperative matrix feature bool support_cooperative_matrix() const; bool support_cooperative_matrix_16_8_8() const; + bool support_cooperative_matrix_16_8_16() const; + bool support_cooperative_matrix_16_16_16() const; // extension capability int support_VK_KHR_8bit_storage() const; @@ -183,6 +188,7 @@ class NCNN_EXPORT GpuInfo int support_VK_KHR_bind_memory2() const; int support_VK_KHR_buffer_device_address() const; int support_VK_KHR_create_renderpass2() const; + int 
support_VK_KHR_cooperative_matrix() const; int support_VK_KHR_dedicated_allocation() const; int support_VK_KHR_descriptor_update_template() const; int support_VK_KHR_external_memory() const; diff --git a/src/layer/vulkan/convolution_vulkan.cpp b/src/layer/vulkan/convolution_vulkan.cpp index caab40f5000..79879e815af 100644 --- a/src/layer/vulkan/convolution_vulkan.cpp +++ b/src/layer/vulkan/convolution_vulkan.cpp @@ -180,7 +180,8 @@ int Convolution_vulkan::create_pipeline(const Option& _opt) if (opt.use_winograd_convolution && (opt.use_winograd23_convolution || opt.use_winograd43_convolution) && is_conv3x3s1d1 && num_input >= 16 && num_output >= 16) { - bool use_cooperative_matrix = vkdev->info.support_cooperative_matrix_16_8_8() && opt.use_cooperative_matrix && !opt.use_image_storage && !opt.use_shader_pack8 && opt.use_fp16_storage && num_input % 8 == 0 && num_output % 8 == 0; + bool use_cooperative_matrix_16_8_8 = vkdev->info.support_cooperative_matrix_16_8_8() && opt.use_cooperative_matrix && !opt.use_image_storage && !opt.use_shader_pack8 && opt.use_fp16_storage && num_input % 8 == 0 && num_output % 8 == 0; + bool use_cooperative_matrix_16_16_16 = vkdev->info.support_cooperative_matrix_16_16_16() && opt.use_cooperative_matrix && !opt.use_image_storage && !opt.use_shader_pack8 && opt.use_fp16_storage && num_input % 16 == 0 && num_output % 16 == 0; // winograd43 transform kernel if (opt.use_winograd43_convolution) @@ -233,7 +234,7 @@ int Convolution_vulkan::create_pipeline(const Option& _opt) } } - if (use_cooperative_matrix) + if (use_cooperative_matrix_16_8_8) { // src = 36-inch-outch // dst = 8b-8a-inch/8a-outch/8b-36 @@ -260,6 +261,33 @@ int Convolution_vulkan::create_pipeline(const Option& _opt) } } } + else if (use_cooperative_matrix_16_16_16) + { + // src = 36-inch-outch + // dst = 16b-16a-inch/16a-outch/16b-36 + weight_winograd43_data_packed.create(num_input / 16, num_output / 16, 36, (size_t)4 * 16 * 16, 16 * 16); + + for (int k = 0; k < 36; k++) + { + 
float* g00 = weight_winograd43_data_packed.channel(k); + + for (int q = 0; q + (16 - 1) < num_output; q += 16) + { + for (int p = 0; p + (16 - 1) < num_input; p += 16) + { + for (int i = 0; i < 16; i++) + { + for (int j = 0; j < 16; j++) + { + const float* k00 = weight_data_tm.channel(q + j).row(p + i); + g00[0] = k00[k]; + g00++; + } + } + } + } + } + } else { // src = 36-inch-outch @@ -375,16 +403,29 @@ int Convolution_vulkan::create_pipeline(const Option& _opt) if (elempack == 4 && out_elempack == 8) shader_type_index = LayerShaderType::convolution_pack4to8_3x3s1d1_winograd_gemm; if (elempack == 8 && out_elempack == 4) shader_type_index = LayerShaderType::convolution_pack8to4_3x3s1d1_winograd_gemm; - if (use_cooperative_matrix) + if (use_cooperative_matrix_16_8_8) + { + if (vkdev->info.support_VK_KHR_cooperative_matrix()) + shader_type_index = LayerShaderType::convolution_pack4_3x3s1d1_winograd_gemm_khr_cm_16_8_8; + else + shader_type_index = LayerShaderType::convolution_pack4_3x3s1d1_winograd_gemm_nv_cm_16_8_8; + } + else if (use_cooperative_matrix_16_16_16) { - shader_type_index = LayerShaderType::convolution_pack4_3x3s1d1_winograd_gemm_cm_16_8_8; + if (vkdev->info.support_VK_KHR_cooperative_matrix()) + shader_type_index = LayerShaderType::convolution_pack4_3x3s1d1_winograd_gemm_khr_cm_16_16_16; + else + shader_type_index = LayerShaderType::convolution_pack4_3x3s1d1_winograd_gemm_nv_cm_16_16_16; } pipeline_convolution_3x3s1d1_winograd43_gemm = new Pipeline(vkdev); - if (use_cooperative_matrix) + if (use_cooperative_matrix_16_8_8) { - // TODO proper unroll y - pipeline_convolution_3x3s1d1_winograd43_gemm->set_local_size_xyz(32, 4, 1); // 16_8_8 ly*4 + pipeline_convolution_3x3s1d1_winograd43_gemm->set_local_size_xyz(32, 1, 1); + } + else if (use_cooperative_matrix_16_16_16) + { + pipeline_convolution_3x3s1d1_winograd43_gemm->set_local_size_xyz(32, 1, 1); } else if (opt.use_shader_local_memory) { @@ -471,7 +512,7 @@ int Convolution_vulkan::create_pipeline(const 
Option& _opt) } } - if (use_cooperative_matrix) + if (use_cooperative_matrix_16_8_8) { // src = 16-inch-outch // dst = 8b-8a-inch/8a-outch/8b-16 @@ -498,6 +539,33 @@ int Convolution_vulkan::create_pipeline(const Option& _opt) } } } + else if (use_cooperative_matrix_16_16_16) + { + // src = 16-inch-outch + // dst = 16b-16a-inch/16a-outch/16b-16 + weight_winograd23_data_packed.create(num_input / 16, num_output / 16, 16, (size_t)4 * 16 * 16, 16 * 16); + + for (int k = 0; k < 16; k++) + { + float* g00 = weight_winograd23_data_packed.channel(k); + + for (int q = 0; q + (16 - 1) < num_output; q += 16) + { + for (int p = 0; p + (16 - 1) < num_input; p += 16) + { + for (int i = 0; i < 16; i++) + { + for (int j = 0; j < 16; j++) + { + const float* k00 = weight_data_tm.channel(q + j).row(p + i); + g00[0] = k00[k]; + g00++; + } + } + } + } + } + } else { // src = 16-inch-outch @@ -613,16 +681,29 @@ int Convolution_vulkan::create_pipeline(const Option& _opt) if (elempack == 4 && out_elempack == 8) shader_type_index = LayerShaderType::convolution_pack4to8_3x3s1d1_winograd_gemm; if (elempack == 8 && out_elempack == 4) shader_type_index = LayerShaderType::convolution_pack8to4_3x3s1d1_winograd_gemm; - if (use_cooperative_matrix) + if (use_cooperative_matrix_16_8_8) + { + if (vkdev->info.support_VK_KHR_cooperative_matrix()) + shader_type_index = LayerShaderType::convolution_pack4_3x3s1d1_winograd_gemm_khr_cm_16_8_8; + else + shader_type_index = LayerShaderType::convolution_pack4_3x3s1d1_winograd_gemm_nv_cm_16_8_8; + } + else if (use_cooperative_matrix_16_16_16) { - shader_type_index = LayerShaderType::convolution_pack4_3x3s1d1_winograd_gemm_cm_16_8_8; + if (vkdev->info.support_VK_KHR_cooperative_matrix()) + shader_type_index = LayerShaderType::convolution_pack4_3x3s1d1_winograd_gemm_khr_cm_16_16_16; + else + shader_type_index = LayerShaderType::convolution_pack4_3x3s1d1_winograd_gemm_nv_cm_16_16_16; } pipeline_convolution_3x3s1d1_winograd23_gemm = new Pipeline(vkdev); - if 
(use_cooperative_matrix) + if (use_cooperative_matrix_16_8_8) + { + pipeline_convolution_3x3s1d1_winograd23_gemm->set_local_size_xyz(32, 1, 1); + } + else if (use_cooperative_matrix_16_16_16) { - // TODO proper unroll y - pipeline_convolution_3x3s1d1_winograd23_gemm->set_local_size_xyz(32, 4, 1); // 16_8_8 ly*4 + pipeline_convolution_3x3s1d1_winograd23_gemm->set_local_size_xyz(32, 1, 1); } else if (opt.use_shader_local_memory) { @@ -666,11 +747,12 @@ int Convolution_vulkan::create_pipeline(const Option& _opt) // dst = pa-pb-kw-kh-inch/pa-outch/pb if (opt.use_sgemm_convolution && !is_conv1x1s1d1 && num_input >= 16 && num_output >= 16) { - bool use_cooperative_matrix = vkdev->info.support_cooperative_matrix_16_8_8() && opt.use_cooperative_matrix && !opt.use_image_storage && !opt.use_shader_pack8 && opt.use_fp16_storage && num_input % 8 == 0 && num_output % 8 == 0; - if (use_cooperative_matrix) + bool use_cooperative_matrix_16_8_8 = vkdev->info.support_cooperative_matrix_16_8_8() && opt.use_cooperative_matrix && !opt.use_image_storage && !opt.use_shader_pack8 && opt.use_fp16_storage && num_input % 8 == 0 && num_output % 8 == 0; + bool use_cooperative_matrix_16_16_16 = vkdev->info.support_cooperative_matrix_16_16_16() && opt.use_cooperative_matrix && !opt.use_image_storage && !opt.use_shader_pack8 && opt.use_fp16_storage && num_input % 16 == 0 && num_output % 16 == 0; + + if (use_cooperative_matrix_16_8_8) { // dst = 8b-8a-maxk-inch/8a-outch/8b - // dst = 16b-16a-maxk-inch/16a-outch/16b Mat weight_data_r2 = weight_data.reshape(maxk, num_input, num_output); weight_data_packed.create(maxk * num_input / 8, num_output / 8, (size_t)4 * 8 * 8, 8 * 8); @@ -696,6 +778,34 @@ int Convolution_vulkan::create_pipeline(const Option& _opt) } } } + else if (use_cooperative_matrix_16_16_16) + { + // dst = 16b-16a-maxk-inch/16a-outch/16b + Mat weight_data_r2 = weight_data.reshape(maxk, num_input, num_output); + + weight_data_packed.create(maxk * num_input / 16, num_output / 16, 
(size_t)4 * 16 * 16, 16 * 16); + + for (int q = 0; q + 15 < num_output; q += 16) + { + float* g00 = weight_data_packed.row(q / 16); + + for (int p = 0; p + 15 < num_input; p += 16) + { + for (int k = 0; k < maxk; k++) + { + for (int i = 0; i < 16; i++) + { + for (int j = 0; j < 16; j++) + { + const float* k00 = weight_data_r2.channel(q + j).row(p + i); + g00[0] = k00[k]; + g00++; + } + } + } + } + } + } else { Mat weight_data_r2 = weight_data.reshape(maxk, num_input, num_output); @@ -728,11 +838,12 @@ int Convolution_vulkan::create_pipeline(const Option& _opt) } else { - bool use_cooperative_matrix = vkdev->info.support_cooperative_matrix_16_8_8() && opt.use_cooperative_matrix && is_conv1x1s1d1 && !opt.use_image_storage && !opt.use_shader_pack8 && opt.use_fp16_storage && num_input % 8 == 0 && num_output % 8 == 0; - if (use_cooperative_matrix) + bool use_cooperative_matrix_16_8_8 = vkdev->info.support_cooperative_matrix_16_8_8() && opt.use_cooperative_matrix && is_conv1x1s1d1 && !opt.use_image_storage && !opt.use_shader_pack8 && opt.use_fp16_storage && num_input % 8 == 0 && num_output % 8 == 0; + bool use_cooperative_matrix_16_16_16 = vkdev->info.support_cooperative_matrix_16_16_16() && opt.use_cooperative_matrix && is_conv1x1s1d1 && !opt.use_image_storage && !opt.use_shader_pack8 && opt.use_fp16_storage && num_input % 16 == 0 && num_output % 16 == 0; + + if (use_cooperative_matrix_16_8_8) { // dst = 8b-8a-inch/8a-outch/8b - // dst = 16b-16a-inch/16a-outch/16b Mat weight_data_r2 = weight_data.reshape(maxk, num_input, num_output); weight_data_packed.create(maxk, num_input / 8, num_output / 8, (size_t)4 * 8 * 8, 8 * 8); @@ -758,6 +869,34 @@ int Convolution_vulkan::create_pipeline(const Option& _opt) } } } + else if (use_cooperative_matrix_16_16_16) + { + // dst = 16b-16a-inch/16a-outch/16b + Mat weight_data_r2 = weight_data.reshape(maxk, num_input, num_output); + + weight_data_packed.create(maxk, num_input / 16, num_output / 16, (size_t)4 * 16 * 16, 16 * 16); + + for 
(int q = 0; q + 15 < num_output; q += 16) + { + float* g00 = weight_data_packed.channel(q / 16); + + for (int p = 0; p + 15 < num_input; p += 16) + { + for (int k = 0; k < maxk; k++) + { + for (int i = 0; i < 16; i++) + { + for (int j = 0; j < 16; j++) + { + const float* k00 = weight_data_r2.channel(q + j).row(p + i); + g00[0] = k00[k]; + g00++; + } + } + } + } + } + } else { Mat weight_data_r2 = weight_data.reshape(maxk, num_input, num_output); @@ -801,7 +940,8 @@ int Convolution_vulkan::create_pipeline(const Option& _opt) } else if (opt.use_sgemm_convolution && !is_conv1x1s1d1 && num_input >= 16 && num_output >= 16) { - bool use_cooperative_matrix = vkdev->info.support_cooperative_matrix_16_8_8() && opt.use_cooperative_matrix && !opt.use_image_storage && !opt.use_shader_pack8 && opt.use_fp16_storage && num_input % 8 == 0 && num_output % 8 == 0; + bool use_cooperative_matrix_16_8_8 = vkdev->info.support_cooperative_matrix_16_8_8() && opt.use_cooperative_matrix && !opt.use_image_storage && !opt.use_shader_pack8 && opt.use_fp16_storage && num_input % 8 == 0 && num_output % 8 == 0; + bool use_cooperative_matrix_16_16_16 = vkdev->info.support_cooperative_matrix_16_16_16() && opt.use_cooperative_matrix && !opt.use_image_storage && !opt.use_shader_pack8 && opt.use_fp16_storage && num_input % 16 == 0 && num_output % 16 == 0; // check blob shape if (!vkdev->shape_support_image_storage(shape_bordered_packed) || !vkdev->shape_support_image_storage(out_shape_packed)) @@ -856,16 +996,29 @@ int Convolution_vulkan::create_pipeline(const Option& _opt) if (elempack == 4 && out_elempack == 8) shader_type_index = LayerShaderType::convolution_pack4to8_gemm; if (elempack == 8 && out_elempack == 4) shader_type_index = LayerShaderType::convolution_pack8to4_gemm; - if (use_cooperative_matrix) + if (use_cooperative_matrix_16_8_8) { - shader_type_index = LayerShaderType::convolution_pack4_gemm_cm_16_8_8; + if (vkdev->info.support_VK_KHR_cooperative_matrix()) + shader_type_index = 
LayerShaderType::convolution_pack4_gemm_khr_cm_16_8_8; + else + shader_type_index = LayerShaderType::convolution_pack4_gemm_nv_cm_16_8_8; + } + else if (use_cooperative_matrix_16_16_16) + { + if (vkdev->info.support_VK_KHR_cooperative_matrix()) + shader_type_index = LayerShaderType::convolution_pack4_gemm_khr_cm_16_16_16; + else + shader_type_index = LayerShaderType::convolution_pack4_gemm_nv_cm_16_16_16; } pipeline_convolution_gemm = new Pipeline(vkdev); - if (use_cooperative_matrix) + if (use_cooperative_matrix_16_8_8) { - // TODO proper unroll y - pipeline_convolution_gemm->set_local_size_xyz(32, 4, 1); // 16_8_8 ly*4 + pipeline_convolution_gemm->set_local_size_xyz(32, 1, 1); // 16_8_8 + } + else if (use_cooperative_matrix_16_16_16) + { + pipeline_convolution_gemm->set_local_size_xyz(32, 1, 1); // 16_16_16 } else if (opt.use_shader_local_memory) { @@ -879,7 +1032,8 @@ int Convolution_vulkan::create_pipeline(const Option& _opt) } else if (is_conv1x1s1d1) { - bool use_cooperative_matrix = vkdev->info.support_cooperative_matrix_16_8_8() && opt.use_cooperative_matrix && !opt.use_image_storage && !opt.use_shader_pack8 && opt.use_fp16_storage && num_input % 8 == 0 && num_output % 8 == 0; + bool use_cooperative_matrix_16_8_8 = vkdev->info.support_cooperative_matrix_16_8_8() && opt.use_cooperative_matrix && !opt.use_image_storage && !opt.use_shader_pack8 && opt.use_fp16_storage && num_input % 8 == 0 && num_output % 8 == 0; + bool use_cooperative_matrix_16_16_16 = vkdev->info.support_cooperative_matrix_16_16_16() && opt.use_cooperative_matrix && !opt.use_image_storage && !opt.use_shader_pack8 && opt.use_fp16_storage && num_input % 16 == 0 && num_output % 16 == 0; std::vector specializations(4 + 8); specializations[0].i = bias_term; @@ -906,16 +1060,29 @@ int Convolution_vulkan::create_pipeline(const Option& _opt) if (elempack == 4 && out_elempack == 8) shader_type_index = LayerShaderType::convolution_pack4to8_1x1s1d1; if (elempack == 8 && out_elempack == 4) 
shader_type_index = LayerShaderType::convolution_pack8to4_1x1s1d1; - if (use_cooperative_matrix) + if (use_cooperative_matrix_16_8_8) { - shader_type_index = LayerShaderType::convolution_pack4_1x1s1d1_cm_16_8_8; + if (vkdev->info.support_VK_KHR_cooperative_matrix()) + shader_type_index = LayerShaderType::convolution_pack4_1x1s1d1_khr_cm_16_8_8; + else + shader_type_index = LayerShaderType::convolution_pack4_1x1s1d1_nv_cm_16_8_8; + } + else if (use_cooperative_matrix_16_16_16) + { + if (vkdev->info.support_VK_KHR_cooperative_matrix()) + shader_type_index = LayerShaderType::convolution_pack4_1x1s1d1_khr_cm_16_16_16; + else + shader_type_index = LayerShaderType::convolution_pack4_1x1s1d1_nv_cm_16_16_16; } pipeline_convolution_1x1s1d1 = new Pipeline(vkdev); - if (use_cooperative_matrix) + if (use_cooperative_matrix_16_8_8) { - // TODO proper unroll y - pipeline_convolution_1x1s1d1->set_local_size_xyz(32, 4, 1); // 16_8_8 ly*4 + pipeline_convolution_1x1s1d1->set_local_size_xyz(32, 1, 1); // 16_8_8 + } + else if (use_cooperative_matrix_16_16_16) + { + pipeline_convolution_1x1s1d1->set_local_size_xyz(32, 1, 1); // 16_16_16 } else if (opt.use_shader_local_memory) { @@ -1223,7 +1390,8 @@ int Convolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCom if (opt.use_winograd_convolution && (opt.use_winograd23_convolution || opt.use_winograd43_convolution) && is_conv3x3s1d1 && channels * elempack >= 16 && num_output >= 16) { - bool use_cooperative_matrix = vkdev->info.support_cooperative_matrix_16_8_8() && opt.use_cooperative_matrix && !opt.use_image_storage && !opt.use_shader_pack8 && opt.use_fp16_storage && channels * elempack % 8 == 0 && num_output % 8 == 0; + bool use_cooperative_matrix_16_8_8 = vkdev->info.support_cooperative_matrix_16_8_8() && opt.use_cooperative_matrix && !opt.use_image_storage && !opt.use_shader_pack8 && opt.use_fp16_storage && channels * elempack % 8 == 0 && num_output % 8 == 0; + bool use_cooperative_matrix_16_16_16 = 
vkdev->info.support_cooperative_matrix_16_16_16() && opt.use_cooperative_matrix && !opt.use_image_storage && !opt.use_shader_pack8 && opt.use_fp16_storage && channels * elempack % 16 == 0 && num_output % 16 == 0; bool pre_winograd43 = opt.use_winograd43_convolution; if (opt.use_winograd23_convolution) @@ -1233,7 +1401,9 @@ int Convolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCom if (vkdev->info.type() != 0 && (w <= 12 && h <= 12)) pre_winograd43 = false; - if (use_cooperative_matrix && (w <= 18 && h <= 18)) + if (use_cooperative_matrix_16_8_8 && (w <= 18 && h <= 18)) + pre_winograd43 = false; + else if (use_cooperative_matrix_16_16_16 && (w <= 18 && h <= 18)) pre_winograd43 = false; } @@ -1295,10 +1465,16 @@ int Convolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCom dispatcher.h = top_tm_blob.h; dispatcher.c = 36; - if (use_cooperative_matrix) + if (use_cooperative_matrix_16_8_8) { - dispatcher.w = ((top_tm_blob.w + 15) / 16 + 3) / 4 * 32; - dispatcher.h = (top_tm_blob.h + 1) / 2; + dispatcher.w = ((top_tm_blob.w + 15) / 16 + 1) / 2 * 32; + dispatcher.h = ((top_tm_blob.h + 1) / 2 + 3) / 4; + dispatcher.c = 36; + } + else if (use_cooperative_matrix_16_16_16) + { + dispatcher.w = ((top_tm_blob.w + 15) / 16 + 1) / 2 * 32; + dispatcher.h = ((top_tm_blob.h + 3) / 4 + 1) / 2; dispatcher.c = 36; } @@ -1391,10 +1567,16 @@ int Convolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCom dispatcher.h = top_tm_blob.h; dispatcher.c = 16; - if (use_cooperative_matrix) + if (use_cooperative_matrix_16_8_8) + { + dispatcher.w = ((top_tm_blob.w + 15) / 16 + 1) / 2 * 32; + dispatcher.h = ((top_tm_blob.h + 1) / 2 + 3) / 4; + dispatcher.c = 16; + } + else if (use_cooperative_matrix_16_16_16) { - dispatcher.w = ((top_tm_blob.w + 15) / 16 + 3) / 4 * 32; - dispatcher.h = (top_tm_blob.h + 1) / 2; + dispatcher.w = ((top_tm_blob.w + 15) / 16 + 1) / 2 * 32; + dispatcher.h = ((top_tm_blob.h + 3) / 4 + 1) / 2; dispatcher.c = 16; } 
@@ -1434,7 +1616,8 @@ int Convolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCom } if (opt.use_sgemm_convolution && !is_conv1x1s1d1 && channels * elempack >= 16 && num_output >= 16) { - bool use_cooperative_matrix = vkdev->info.support_cooperative_matrix_16_8_8() && opt.use_cooperative_matrix && !opt.use_image_storage && !opt.use_shader_pack8 && opt.use_fp16_storage && channels * elempack % 8 == 0 && num_output % 8 == 0; + bool use_cooperative_matrix_16_8_8 = vkdev->info.support_cooperative_matrix_16_8_8() && opt.use_cooperative_matrix && !opt.use_image_storage && !opt.use_shader_pack8 && opt.use_fp16_storage && channels * elempack % 8 == 0 && num_output % 8 == 0; + bool use_cooperative_matrix_16_16_16 = vkdev->info.support_cooperative_matrix_16_16_16() && opt.use_cooperative_matrix && !opt.use_image_storage && !opt.use_shader_pack8 && opt.use_fp16_storage && channels * elempack % 16 == 0 && num_output % 16 == 0; // gemm top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator); @@ -1462,10 +1645,16 @@ int Convolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCom dispatcher.h = top_blob.c; dispatcher.c = 1; - if (use_cooperative_matrix) + if (use_cooperative_matrix_16_8_8) + { + dispatcher.w = ((top_blob.w * top_blob.h + 15) / 16 + 1) / 2 * 32; + dispatcher.h = ((top_blob.c + 1) / 2 + 3) / 4; + dispatcher.c = 1; + } + else if (use_cooperative_matrix_16_16_16) { - dispatcher.w = ((top_blob.w * top_blob.h + 15) / 16 + 3) / 4 * 32; - dispatcher.h = (top_blob.c + 1) / 2; + dispatcher.w = ((top_blob.w * top_blob.h + 15) / 16 + 1) / 2 * 32; + dispatcher.h = ((top_blob.c + 3) / 4 + 1) / 2; dispatcher.c = 1; } @@ -1475,7 +1664,8 @@ int Convolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCom } if (is_conv1x1s1d1) { - bool use_cooperative_matrix = vkdev->info.support_cooperative_matrix_16_8_8() && opt.use_cooperative_matrix && !opt.use_image_storage && 
!opt.use_shader_pack8 && opt.use_fp16_storage && channels * elempack % 8 == 0 && num_output % 8 == 0; + bool use_cooperative_matrix_16_8_8 = vkdev->info.support_cooperative_matrix_16_8_8() && opt.use_cooperative_matrix && !opt.use_image_storage && !opt.use_shader_pack8 && opt.use_fp16_storage && channels * elempack % 8 == 0 && num_output % 8 == 0; + bool use_cooperative_matrix_16_16_16 = vkdev->info.support_cooperative_matrix_16_16_16() && opt.use_cooperative_matrix && !opt.use_image_storage && !opt.use_shader_pack8 && opt.use_fp16_storage && channels * elempack % 16 == 0 && num_output % 16 == 0; top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator); if (top_blob.empty()) @@ -1502,10 +1692,16 @@ int Convolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCom dispatcher.h = top_blob.c; dispatcher.c = 1; - if (use_cooperative_matrix) + if (use_cooperative_matrix_16_8_8) + { + dispatcher.w = ((top_blob.w * top_blob.h + 15) / 16 + 1) / 2 * 32; + dispatcher.h = ((top_blob.c + 1) / 2 + 3) / 4; + dispatcher.c = 1; + } + else if (use_cooperative_matrix_16_16_16) { - dispatcher.w = ((top_blob.w * top_blob.h + 15) / 16 + 3) / 4 * 32; - dispatcher.h = (top_blob.c + 1) / 2; + dispatcher.w = ((top_blob.w * top_blob.h + 15) / 16 + 1) / 2 * 32; + dispatcher.h = ((top_blob.c + 3) / 4 + 1) / 2; dispatcher.c = 1; } diff --git a/src/layer/vulkan/deconvolution_vulkan.cpp b/src/layer/vulkan/deconvolution_vulkan.cpp index 2480c854b80..30283d211c1 100644 --- a/src/layer/vulkan/deconvolution_vulkan.cpp +++ b/src/layer/vulkan/deconvolution_vulkan.cpp @@ -146,14 +146,14 @@ int Deconvolution_vulkan::create_pipeline(const Option& _opt) if (opt.use_sgemm_convolution) { - bool use_cooperative_matrix = vkdev->info.support_cooperative_matrix_16_8_8() && opt.use_cooperative_matrix && !opt.use_image_storage && !opt.use_shader_pack8 && opt.use_fp16_storage && num_input % 8 == 0 && num_output % 8 == 0; + bool 
use_cooperative_matrix_16_8_8 = vkdev->info.support_cooperative_matrix_16_8_8() && opt.use_cooperative_matrix && !opt.use_image_storage && !opt.use_shader_pack8 && opt.use_fp16_storage && num_input % 8 == 0 && num_output % 8 == 0; + bool use_cooperative_matrix_16_16_16 = vkdev->info.support_cooperative_matrix_16_16_16() && opt.use_cooperative_matrix && !opt.use_image_storage && !opt.use_shader_pack8 && opt.use_fp16_storage && num_input % 16 == 0 && num_output % 16 == 0; // src = kw-kh-inch-outch // dst = pa-pb-inch/pa-kw-kh-outch/pb (sgemm) - if (use_cooperative_matrix) + if (use_cooperative_matrix_16_8_8) { // dst = 8a-8b-inch/8a-maxk-outch/8b - // dst = 16a-16b-inch/16a-maxk-outch/16b Mat weight_data_r2 = weight_data.reshape(maxk, num_input, num_output); weight_data_packed.create(num_input / 8, maxk * num_output / 8, (size_t)4 * 8 * 8, 8 * 8); @@ -179,6 +179,34 @@ int Deconvolution_vulkan::create_pipeline(const Option& _opt) } } } + else if (use_cooperative_matrix_16_16_16) + { + // dst = 16a-16b-inch/16a-maxk-outch/16b + Mat weight_data_r2 = weight_data.reshape(maxk, num_input, num_output); + + weight_data_packed.create(num_input / 16, maxk * num_output / 16, (size_t)4 * 16 * 16, 16 * 16); + + for (int q = 0; q + 15 < num_output; q += 16) + { + for (int k = 0; k < maxk; k++) + { + float* g00 = weight_data_packed.row(q / 16 * maxk + k); + + for (int p = 0; p + 15 < num_input; p += 16) + { + for (int i = 0; i < 16; i++) + { + for (int j = 0; j < 16; j++) + { + const float* k00 = weight_data_r2.channel(q + j).row(p + i); + g00[0] = k00[k]; + g00++; + } + } + } + } + } + } else { Mat weight_data_r2 = weight_data.reshape(maxk, num_input, num_output); @@ -253,16 +281,29 @@ int Deconvolution_vulkan::create_pipeline(const Option& _opt) if (elempack == 4 && out_elempack == 8) shader_type_index = LayerShaderType::deconvolution_pack4to8_gemm; if (elempack == 8 && out_elempack == 4) shader_type_index = LayerShaderType::deconvolution_pack8to4_gemm; - if 
(use_cooperative_matrix) + if (use_cooperative_matrix_16_8_8) { - shader_type_index = LayerShaderType::deconvolution_pack4_gemm_cm_16_8_8; + if (vkdev->info.support_VK_KHR_cooperative_matrix()) + shader_type_index = LayerShaderType::deconvolution_pack4_gemm_khr_cm_16_8_8; + else + shader_type_index = LayerShaderType::deconvolution_pack4_gemm_nv_cm_16_8_8; + } + else if (use_cooperative_matrix_16_16_16) + { + if (vkdev->info.support_VK_KHR_cooperative_matrix()) + shader_type_index = LayerShaderType::deconvolution_pack4_gemm_khr_cm_16_16_16; + else + shader_type_index = LayerShaderType::deconvolution_pack4_gemm_nv_cm_16_16_16; } pipeline_deconvolution_gemm = new Pipeline(vkdev); - if (use_cooperative_matrix) + if (use_cooperative_matrix_16_8_8) + { + pipeline_deconvolution_gemm->set_local_size_xyz(32, 1, 1); // 16_8_8 + } + else if (use_cooperative_matrix_16_16_16) { - // TODO proper unroll y - pipeline_deconvolution_gemm->set_local_size_xyz(32, 4, 1); // 16_8_8 ly*4 + pipeline_deconvolution_gemm->set_local_size_xyz(32, 1, 1); // 16_16_16 } else if (opt.use_shader_local_memory) { @@ -505,7 +546,8 @@ int Deconvolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkC VkMat top_blob_bordered; if (opt.use_sgemm_convolution) { - bool use_cooperative_matrix = vkdev->info.support_cooperative_matrix_16_8_8() && opt.use_cooperative_matrix && !opt.use_image_storage && !opt.use_shader_pack8 && opt.use_fp16_storage && channels * elempack % 8 == 0 && num_output % 8 == 0; + bool use_cooperative_matrix_16_8_8 = vkdev->info.support_cooperative_matrix_16_8_8() && opt.use_cooperative_matrix && !opt.use_image_storage && !opt.use_shader_pack8 && opt.use_fp16_storage && channels * elempack % 8 == 0 && num_output % 8 == 0; + bool use_cooperative_matrix_16_16_16 = vkdev->info.support_cooperative_matrix_16_16_16() && opt.use_cooperative_matrix && !opt.use_image_storage && !opt.use_shader_pack8 && opt.use_fp16_storage && channels * elempack % 16 == 0 && num_output % 16 == 0; 
const int maxk = kernel_w * kernel_h; @@ -534,10 +576,16 @@ int Deconvolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkC dispatcher.h = top_blob_col.h; dispatcher.c = 1; - if (use_cooperative_matrix) + if (use_cooperative_matrix_16_8_8) + { + dispatcher.w = ((top_blob_col.w + 15) / 16 + 1) / 2 * 32; + dispatcher.h = ((top_blob_col.h + 1) / 2 + 3) / 4; + dispatcher.c = 1; + } + else if (use_cooperative_matrix_16_16_16) { - dispatcher.w = ((top_blob_col.w + 15) / 16 + 3) / 4 * 32; - dispatcher.h = (top_blob_col.h + 1) / 2; + dispatcher.w = ((top_blob_col.w + 15) / 16 + 1) / 2 * 32; + dispatcher.h = ((top_blob_col.h + 3) / 4 + 1) / 2; dispatcher.c = 1; } diff --git a/src/layer/vulkan/shader/convolution_pack4_1x1s1d1_cm_16_8_8.comp b/src/layer/vulkan/shader/convolution_pack4_1x1s1d1_cm_16_8_8.comp deleted file mode 100644 index 3d9c5d442c6..00000000000 --- a/src/layer/vulkan/shader/convolution_pack4_1x1s1d1_cm_16_8_8.comp +++ /dev/null @@ -1,257 +0,0 @@ -// Tencent is pleased to support the open source community by making ncnn available. -// -// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. -// -// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except -// in compliance with the License. You may obtain a copy of the License at -// -// https://opensource.org/licenses/BSD-3-Clause -// -// Unless required by applicable law or agreed to in writing, software distributed -// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR -// CONDITIONS OF ANY KIND, either express or implied. See the License for the -// specific language governing permissions and limitations under the License. 
- -#version 450 - -#if NCNN_fp16_storage -#extension GL_EXT_shader_16bit_storage: require -#endif -#if NCNN_fp16_arithmetic -#extension GL_EXT_shader_explicit_arithmetic_types_float16: require -#endif - -#extension GL_GOOGLE_include_directive: enable -#include "vulkan_activation.comp" - -#extension GL_KHR_memory_scope_semantics: require -#extension GL_EXT_shader_explicit_arithmetic_types: require -#extension GL_EXT_shader_explicit_arithmetic_types_float16: require -#extension GL_NV_cooperative_matrix: require - -layout (constant_id = 0) const int bias_term = 0; -layout (constant_id = 1) const int activation_type = 0; -layout (constant_id = 2) const float activation_param_0 = 0; -layout (constant_id = 3) const float activation_param_1 = 0; - -#define shape_constant_id_offset 4 -layout (constant_id = shape_constant_id_offset + 0) const int w = 0; -layout (constant_id = shape_constant_id_offset + 1) const int h = 0; -layout (constant_id = shape_constant_id_offset + 2) const int c = 0; -layout (constant_id = shape_constant_id_offset + 3) const int cstep = 0; - -layout (constant_id = shape_constant_id_offset + 4) const int outw = 0; -layout (constant_id = shape_constant_id_offset + 5) const int outh = 0; -layout (constant_id = shape_constant_id_offset + 6) const int outc = 0; -layout (constant_id = shape_constant_id_offset + 7) const int outcstep = 0; - -layout (binding = 0) readonly buffer bottom_blob { uvec2 bottom_blob_data[]; }; -layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; -layout (binding = 2) readonly buffer weight_blob { uvec2 weight_data[]; }; -layout (binding = 3) readonly buffer bias_blob { uvec2 bias_data[]; }; - -layout (push_constant) uniform parameter -{ - int w; - int h; - int c; - int cstep; - - int outw; - int outh; - int outc; - int outcstep; -} p; - -#define LOCAL_SIZE_Y 4 -#define UNROLL_INCH 4 - -shared uvec2 tmp_v0[UNROLL_INCH * 16*2]; -shared uvec2 tmp_v1[UNROLL_INCH * 16*2]; -shared uvec2 tmp_v2[UNROLL_INCH * 
16*2]; -shared uvec2 tmp_v3[UNROLL_INCH * 16*2]; -shared uvec2 tmp_k[LOCAL_SIZE_Y * UNROLL_INCH * 8*2]; - -void main() -{ - int gx = int(gl_GlobalInvocationID.x) / 32 * 4 * 16; - int gy = int(gl_GlobalInvocationID.y) * 2; - - const int lx = int(gl_LocalInvocationID.x); - const int ly = int(gl_LocalInvocationID.y); - - const int lxd16 = lx / 16; // 0 1 - const int lxm16 = lx % 16; // 0 1 2 3 .... 15 - - const int lxd8 = lx / 8; // 0 1 2 3 - const int lxm8 = lx % 8; // 0 1 2 3 .... 7 - - fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum0; - fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum1; - fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum2; - fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum3; - - if (bias_term == 1) - { - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> bias; - - coopMatLoadNV(bias, bias_data, gy, 0, false); - - sum0 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias); - sum1 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias); - sum2 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias); - sum3 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias); - } - else - { - sum0 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); - sum1 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); - sum2 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); - sum3 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); - } - - int N = psc(c) / 2; - - int z = 0; - for (; z + (UNROLL_INCH - 1) < N; z += UNROLL_INCH) - { - { - int tmp_vi = ly*16*2 + lxm16*2+lxd16; - int v_offset = (z + ly) * 2 * psc(outcstep) + gx + lxd16 * psc(outcstep) + lxm16; - - tmp_v0[tmp_vi] = (gx + lxm16) < psc(outcstep) ? bottom_blob_data[v_offset] : uvec2(0); - tmp_v1[tmp_vi] = (gx + 16 + lxm16) < psc(outcstep) ? bottom_blob_data[v_offset + 16] : uvec2(0); - tmp_v2[tmp_vi] = (gx + 32 + lxm16) < psc(outcstep) ? bottom_blob_data[v_offset + 32] : uvec2(0); - tmp_v3[tmp_vi] = (gx + 48 + lxm16) < psc(outcstep) ? 
bottom_blob_data[v_offset + 48] : uvec2(0); - } - - if (lx < 16) - { - for (int z4 = 0; z4 < UNROLL_INCH; z4++) - { - int tmp_ki = ly*UNROLL_INCH*8*2 + z4*8*2 + lxm8*2+lxd8; - int w_offset = gy * psc(c) * 4 + (z + z4) * 16 + lxm8 * 2 + lxd8; - - tmp_k[tmp_ki] = weight_data[w_offset]; - } - } - - barrier(); - - for (int z4 = 0; z4 < UNROLL_INCH; z4++) - { - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A0; - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A1; - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A2; - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A3; - coopMatLoadNV(A0, tmp_v0, z4*16*2, 2, false); - coopMatLoadNV(A1, tmp_v1, z4*16*2, 2, false); - coopMatLoadNV(A2, tmp_v2, z4*16*2, 2, false); - coopMatLoadNV(A3, tmp_v3, z4*16*2, 2, false); - - fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B; - coopMatLoadNV(B, tmp_k, ly*UNROLL_INCH*8*2 + z4*8*2, 2, false); - - // sum += v * k - sum0 = coopMatMulAddNV(A0, B, sum0); - sum1 = coopMatMulAddNV(A1, B, sum1); - sum2 = coopMatMulAddNV(A2, B, sum2); - sum3 = coopMatMulAddNV(A3, B, sum3); - } - - barrier(); - } - - if (z < N) - { - const int remain = N - z; - - if (ly < remain) - { - int tmp_vi = ly*16*2 + lxm16*2+lxd16; - int v_offset = (z + ly) * 2 * psc(outcstep) + gx + lxd16 * psc(outcstep) + lxm16; - - tmp_v0[tmp_vi] = (gx + lxm16) < psc(outcstep) ? bottom_blob_data[v_offset] : uvec2(0); - tmp_v1[tmp_vi] = (gx + 16 + lxm16) < psc(outcstep) ? bottom_blob_data[v_offset + 16] : uvec2(0); - tmp_v2[tmp_vi] = (gx + 32 + lxm16) < psc(outcstep) ? bottom_blob_data[v_offset + 32] : uvec2(0); - tmp_v3[tmp_vi] = (gx + 48 + lxm16) < psc(outcstep) ? 
bottom_blob_data[v_offset + 48] : uvec2(0); - } - - if (lx < 16) - { - for (int z4 = 0; z4 < remain; z4++) - { - int tmp_ki = ly*UNROLL_INCH*8*2 + z4*8*2 + lxm8*2+lxd8; - int w_offset = gy * psc(c) * 4 + (z + z4) * 16 + lxm8 * 2 + lxd8; - - tmp_k[tmp_ki] = weight_data[w_offset]; - } - } - - barrier(); - - for (int z4 = 0; z4 < remain; z4++) - { - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A0; - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A1; - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A2; - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A3; - coopMatLoadNV(A0, tmp_v0, z4*16*2, 2, false); - coopMatLoadNV(A1, tmp_v1, z4*16*2, 2, false); - coopMatLoadNV(A2, tmp_v2, z4*16*2, 2, false); - coopMatLoadNV(A3, tmp_v3, z4*16*2, 2, false); - - fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B; - coopMatLoadNV(B, tmp_k, ly*UNROLL_INCH*8*2 + z4*8*2, 2, false); - - // sum += v * k - sum0 = coopMatMulAddNV(A0, B, sum0); - sum1 = coopMatMulAddNV(A1, B, sum1); - sum2 = coopMatMulAddNV(A2, B, sum2); - sum3 = coopMatMulAddNV(A3, B, sum3); - } - - barrier(); - } - - if (gx >= psc(outcstep) || gy >= psc(outc)) - return; - - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum0_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum0); - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum1_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum1); - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum2_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum2); - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum3_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum3); - - coopMatStoreNV(sum0_fp16, tmp_v0, ly*16*2, 2, false); - coopMatStoreNV(sum1_fp16, tmp_v1, ly*16*2, 2, false); - coopMatStoreNV(sum2_fp16, tmp_v2, ly*16*2, 2, false); - coopMatStoreNV(sum3_fp16, tmp_v3, ly*16*2, 2, false); - - barrier(); - - { - int tmp_vi = ly*16*2 + lxm16*2+lxd16; - uvec2 sum0_u2 = tmp_v0[tmp_vi]; - uvec2 sum1_u2 = tmp_v1[tmp_vi]; - uvec2 sum2_u2 = tmp_v2[tmp_vi]; - uvec2 sum3_u2 = tmp_v3[tmp_vi]; - - afpvec4 sum0 = afpvec4(unpackHalf2x16(sum0_u2.x), 
unpackHalf2x16(sum0_u2.y)); - afpvec4 sum1 = afpvec4(unpackHalf2x16(sum1_u2.x), unpackHalf2x16(sum1_u2.y)); - afpvec4 sum2 = afpvec4(unpackHalf2x16(sum2_u2.x), unpackHalf2x16(sum2_u2.y)); - afpvec4 sum3 = afpvec4(unpackHalf2x16(sum3_u2.x), unpackHalf2x16(sum3_u2.y)); - - sum0 = activation_afpvec4(sum0, activation_type, activation_param_0, activation_param_1); - sum1 = activation_afpvec4(sum1, activation_type, activation_param_0, activation_param_1); - sum2 = activation_afpvec4(sum2, activation_type, activation_param_0, activation_param_1); - sum3 = activation_afpvec4(sum3, activation_type, activation_param_0, activation_param_1); - - int gi = gy * psc(outcstep) + gx + lxd16 * psc(outcstep) + lxm16; - { - if (gx + lxm16 < psc(outcstep)) buffer_st4(top_blob_data, gi, sum0); - if (gx + 16 + lxm16 < psc(outcstep)) buffer_st4(top_blob_data, gi + 16, sum1); - if (gx + 32 + lxm16 < psc(outcstep)) buffer_st4(top_blob_data, gi + 32, sum2); - if (gx + 48 + lxm16 < psc(outcstep)) buffer_st4(top_blob_data, gi + 48, sum3); - } - } -} diff --git a/src/layer/vulkan/shader/convolution_pack4_1x1s1d1_khr_cm_16_16_16.comp b/src/layer/vulkan/shader/convolution_pack4_1x1s1d1_khr_cm_16_16_16.comp new file mode 100644 index 00000000000..79641acbc40 --- /dev/null +++ b/src/layer/vulkan/shader/convolution_pack4_1x1s1d1_khr_cm_16_16_16.comp @@ -0,0 +1,239 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +#extension GL_GOOGLE_include_directive: enable +#include "vulkan_activation.comp" + +#extension GL_KHR_memory_scope_semantics: require +#extension GL_EXT_shader_explicit_arithmetic_types: require +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#extension GL_KHR_cooperative_matrix: require + +layout (constant_id = 0) const int bias_term = 0; +layout (constant_id = 1) const int activation_type = 0; +layout (constant_id = 2) const float activation_param_0 = 0; +layout (constant_id = 3) const float activation_param_1 = 0; + +#define shape_constant_id_offset 4 +layout (constant_id = shape_constant_id_offset + 0) const int w = 0; +layout (constant_id = shape_constant_id_offset + 1) const int h = 0; +layout (constant_id = shape_constant_id_offset + 2) const int c = 0; +layout (constant_id = shape_constant_id_offset + 3) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 4) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 5) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outcstep = 0; + +layout (binding = 0) readonly buffer bottom_blob { uvec2 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; +layout (binding = 2) readonly buffer weight_blob { uvec2 weight_data[]; }; +layout (binding = 3) readonly buffer bias_blob { uvec2 bias_data[]; }; + +layout (push_constant) uniform parameter +{ + int w; + int h; + int c; + int cstep; + + int outw; + int outh; + int outc; + int outcstep; +} p; + +#define UNROLL_INCH 2 + +shared uvec2 tmp_v0[UNROLL_INCH * 16*4]; 
+shared uvec2 tmp_v1[UNROLL_INCH * 16*4]; +shared uvec2 tmp_k0[UNROLL_INCH * 16*4]; +shared uvec2 tmp_k1[UNROLL_INCH * 16*4]; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x) / 32 * 2 * 16; + int gy = int(gl_GlobalInvocationID.y) * 2 * 4; + + const int lx = int(gl_LocalInvocationID.x); + + const int lxd16 = lx / 16; // 0 1 + const int lxm16 = lx % 16; // 0 1 2 3 .... 15 + + coopmat sum0; + coopmat sum1; + coopmat sum2; + coopmat sum3; + + if (bias_term == 1) + { + coopmat bias0; + coopmat bias1; + + coopMatLoad(bias0, bias_data, gy, 0, gl_CooperativeMatrixLayoutRowMajor); + coopMatLoad(bias1, bias_data, gy + 4, 0, gl_CooperativeMatrixLayoutRowMajor); + + sum0 = coopmat(bias0); + sum1 = coopmat(bias0); + sum2 = coopmat(bias1); + sum3 = coopmat(bias1); + } + else + { + sum0 = coopmat(0.f); + sum1 = coopmat(0.f); + sum2 = coopmat(0.f); + sum3 = coopmat(0.f); + } + + const int N = psc(c) / 4; + + int z = 0; + for (; z + (UNROLL_INCH - 1) < N; z += UNROLL_INCH) + { + { + for (int j = 0; j < 4; j++) + { + const int tmp_i = lxd16*16*4 + lxm16 * 4 + j; + + const int v_offset = ((z + lxd16) * 4 + j) * psc(outcstep) + (gx + lxm16); + + tmp_v0[tmp_i] = (gx + lxm16) < psc(outcstep) ? bottom_blob_data[v_offset] : uvec2(0); + tmp_v1[tmp_i] = (gx + lxm16 + 16) < psc(outcstep) ? 
bottom_blob_data[v_offset + 16] : uvec2(0); + + const int w_offset = gy * psc(c) * 4 + (z + lxd16) * 4 * 16 + (lxm16 * 4 + j); + + tmp_k0[tmp_i] = weight_data[w_offset]; + tmp_k1[tmp_i] = weight_data[w_offset + psc(c) * 16]; + } + } + + barrier(); + + for (int z4 = 0; z4 < UNROLL_INCH; z4++) + { + coopmat A0; + coopmat A1; + coopMatLoad(A0, tmp_v0, z4*16*4, 4, gl_CooperativeMatrixLayoutRowMajor); + coopMatLoad(A1, tmp_v1, z4*16*4, 4, gl_CooperativeMatrixLayoutRowMajor); + + coopmat B0; + coopmat B1; + coopMatLoad(B0, tmp_k0, z4*16*4, 4, gl_CooperativeMatrixLayoutRowMajor); + coopMatLoad(B1, tmp_k1, z4*16*4, 4, gl_CooperativeMatrixLayoutRowMajor); + + // sum += v * k + sum0 = coopMatMulAdd(A0, B0, sum0); + sum1 = coopMatMulAdd(A1, B0, sum1); + sum2 = coopMatMulAdd(A0, B1, sum2); + sum3 = coopMatMulAdd(A1, B1, sum3); + } + + barrier(); + } + + if (z < N) + { + const int remain = N - z; + + if (lxd16 == 0) + { + for (int j = 0; j < 4; j++) + { + const int tmp_i = lxd16*16*4 + lxm16 * 4 + j; + + const int v_offset = ((z + lxd16) * 4 + j) * psc(outcstep) + (gx + lxm16); + + tmp_v0[tmp_i] = (gx + lxm16) < psc(outcstep) ? bottom_blob_data[v_offset] : uvec2(0); + tmp_v1[tmp_i] = (gx + lxm16 + 16) < psc(outcstep) ? 
bottom_blob_data[v_offset + 16] : uvec2(0); + + const int w_offset = gy * psc(c) * 4 + (z + lxd16) * 4 * 16 + (lxm16 * 4 + j); + + tmp_k0[tmp_i] = weight_data[w_offset]; + tmp_k1[tmp_i] = weight_data[w_offset + psc(c) * 16]; + } + } + + barrier(); + + for (int z4 = 0; z4 < remain; z4++) + { + coopmat A0; + coopmat A1; + coopMatLoad(A0, tmp_v0, z4*16*4, 4, gl_CooperativeMatrixLayoutRowMajor); + coopMatLoad(A1, tmp_v1, z4*16*4, 4, gl_CooperativeMatrixLayoutRowMajor); + + coopmat B0; + coopmat B1; + coopMatLoad(B0, tmp_k0, z4*16*4, 4, gl_CooperativeMatrixLayoutRowMajor); + coopMatLoad(B1, tmp_k1, z4*16*4, 4, gl_CooperativeMatrixLayoutRowMajor); + + // sum += v * k + sum0 = coopMatMulAdd(A0, B0, sum0); + sum1 = coopMatMulAdd(A1, B0, sum1); + sum2 = coopMatMulAdd(A0, B1, sum2); + sum3 = coopMatMulAdd(A1, B1, sum3); + } + + barrier(); + } + + if (gx >= psc(outcstep) || gy >= psc(outc)) + return; + + coopmat sum0_fp16 = coopmat(sum0); + coopmat sum1_fp16 = coopmat(sum1); + coopmat sum2_fp16 = coopmat(sum2); + coopmat sum3_fp16 = coopmat(sum3); + + coopMatStore(sum0_fp16, tmp_v0, 0, 4, gl_CooperativeMatrixLayoutRowMajor); + coopMatStore(sum1_fp16, tmp_v1, 0, 4, gl_CooperativeMatrixLayoutRowMajor); + coopMatStore(sum2_fp16, tmp_v0, 16*4, 4, gl_CooperativeMatrixLayoutRowMajor); + coopMatStore(sum3_fp16, tmp_v1, 16*4, 4, gl_CooperativeMatrixLayoutRowMajor); + + barrier(); + + { + for (int j = 0; j < 4; j++) + { + const int tmp_vi = lxm16 * 4 + j + lxd16*16*4; + + uvec2 sum0_u2 = tmp_v0[tmp_vi]; + uvec2 sum1_u2 = tmp_v1[tmp_vi]; + + afpvec4 sum0 = afpvec4(unpackHalf2x16(sum0_u2.x), unpackHalf2x16(sum0_u2.y)); + afpvec4 sum1 = afpvec4(unpackHalf2x16(sum1_u2.x), unpackHalf2x16(sum1_u2.y)); + + sum0 = activation_afpvec4(sum0, activation_type, activation_param_0, activation_param_1); + sum1 = activation_afpvec4(sum1, activation_type, activation_param_0, activation_param_1); + + const int gi = (gy + lxd16 * 4 + j) * psc(outcstep) + (gx + lxm16); + + if (gy + lxd16 * 4 + j < 
psc(outc)) + { + if (gx + lxm16 < psc(outcstep)) buffer_st4(top_blob_data, gi, sum0); + if (gx + lxm16 + 16 < psc(outcstep)) buffer_st4(top_blob_data, gi + 16, sum1); + } + } + } +} diff --git a/src/layer/vulkan/shader/convolution_pack4_1x1s1d1_khr_cm_16_8_8.comp b/src/layer/vulkan/shader/convolution_pack4_1x1s1d1_khr_cm_16_8_8.comp new file mode 100644 index 00000000000..3c82d995202 --- /dev/null +++ b/src/layer/vulkan/shader/convolution_pack4_1x1s1d1_khr_cm_16_8_8.comp @@ -0,0 +1,298 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +#extension GL_GOOGLE_include_directive: enable +#include "vulkan_activation.comp" + +#extension GL_KHR_memory_scope_semantics: require +#extension GL_EXT_shader_explicit_arithmetic_types: require +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#extension GL_KHR_cooperative_matrix: require + +layout (constant_id = 0) const int bias_term = 0; +layout (constant_id = 1) const int activation_type = 0; +layout (constant_id = 2) const float activation_param_0 = 0; +layout (constant_id = 3) const float activation_param_1 = 0; + +#define shape_constant_id_offset 4 +layout (constant_id = shape_constant_id_offset + 0) const int w = 0; +layout (constant_id = shape_constant_id_offset + 1) const int h = 0; +layout (constant_id = shape_constant_id_offset + 2) const int c = 0; +layout (constant_id = shape_constant_id_offset + 3) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 4) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 5) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outcstep = 0; + +layout (binding = 0) readonly buffer bottom_blob { uvec2 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; +layout (binding = 2) readonly buffer weight_blob { uvec2 weight_data[]; }; +layout (binding = 3) readonly buffer bias_blob { uvec2 bias_data[]; }; + +layout (push_constant) uniform parameter +{ + int w; + int h; + int c; + int cstep; + + int outw; + int outh; + int outc; + int outcstep; +} p; + +#define UNROLL_INCH 4 + +shared uvec2 tmp_v0[UNROLL_INCH * 16*2]; +shared uvec2 tmp_v1[UNROLL_INCH * 16*2]; +shared uvec2 tmp_k0[UNROLL_INCH * 8*2]; +shared uvec2 
tmp_k1[UNROLL_INCH * 8*2]; +shared uvec2 tmp_k2[UNROLL_INCH * 8*2]; +shared uvec2 tmp_k3[UNROLL_INCH * 8*2]; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x) / 32 * 2 * 16; + int gy = int(gl_GlobalInvocationID.y) * 2 * 4; + + const int lx = int(gl_LocalInvocationID.x); + + const int lxd8 = lx / 8; // 0 1 2 3 + const int lxm8 = lx % 8; // 0 1 2 3 .... 7 + + coopmat sum0; + coopmat sum1; + coopmat sum2; + coopmat sum3; + coopmat sum4; + coopmat sum5; + coopmat sum6; + coopmat sum7; + + if (bias_term == 1) + { + coopmat bias0; + coopmat bias1; + coopmat bias2; + coopmat bias3; + + coopMatLoad(bias0, bias_data, gy, 0, gl_CooperativeMatrixLayoutRowMajor); + coopMatLoad(bias1, bias_data, gy + 2, 0, gl_CooperativeMatrixLayoutRowMajor); + coopMatLoad(bias2, bias_data, gy + 4, 0, gl_CooperativeMatrixLayoutRowMajor); + coopMatLoad(bias3, bias_data, gy + 6, 0, gl_CooperativeMatrixLayoutRowMajor); + + sum0 = coopmat(bias0); + sum1 = coopmat(bias0); + sum2 = coopmat(bias1); + sum3 = coopmat(bias1); + sum4 = coopmat(bias2); + sum5 = coopmat(bias2); + sum6 = coopmat(bias3); + sum7 = coopmat(bias3); + } + else + { + sum0 = coopmat(0.f); + sum1 = coopmat(0.f); + sum2 = coopmat(0.f); + sum3 = coopmat(0.f); + sum4 = coopmat(0.f); + sum5 = coopmat(0.f); + sum6 = coopmat(0.f); + sum7 = coopmat(0.f); + } + + const int N = psc(c) / 2; + + int z = 0; + for (; z + (UNROLL_INCH - 1) < N; z += UNROLL_INCH) + { + { + for (int j = 0; j < 2; j++) + { + const int tmp_vi = lxd8*16*2 + lxm8 * 2 + j; + + int v_offset = ((z + lxd8) * 2 + j) * psc(outcstep) + (gx + lxm8); + + tmp_v0[tmp_vi] = (gx + lxm8) < psc(outcstep) ? bottom_blob_data[v_offset] : uvec2(0); + tmp_v0[tmp_vi + 16] = (gx + lxm8 + 8) < psc(outcstep) ? bottom_blob_data[v_offset + 8] : uvec2(0); + tmp_v1[tmp_vi] = (gx + lxm8 + 16) < psc(outcstep) ? bottom_blob_data[v_offset + 16] : uvec2(0); + tmp_v1[tmp_vi + 16] = (gx + lxm8 + 24) < psc(outcstep) ? 
bottom_blob_data[v_offset + 24] : uvec2(0); + + const int tmp_ki = lxd8*8*2 + lxm8 * 2 + j; + + int w_offset = gy * psc(c) * 4 + (z + lxd8) * 2 * 8 + (lxm8 * 2 + j); + + tmp_k0[tmp_ki] = weight_data[w_offset]; + tmp_k1[tmp_ki] = weight_data[w_offset + psc(c) * 8]; + tmp_k2[tmp_ki] = weight_data[w_offset + psc(c) * 16]; + tmp_k3[tmp_ki] = weight_data[w_offset + psc(c) * 24]; + } + } + + barrier(); + + for (int z4 = 0; z4 < UNROLL_INCH; z4++) + { + coopmat A0; + coopmat A1; + coopMatLoad(A0, tmp_v0, z4*16*2, 2, gl_CooperativeMatrixLayoutRowMajor); + coopMatLoad(A1, tmp_v1, z4*16*2, 2, gl_CooperativeMatrixLayoutRowMajor); + + coopmat B0; + coopmat B1; + coopmat B2; + coopmat B3; + coopMatLoad(B0, tmp_k0, z4*8*2, 2, gl_CooperativeMatrixLayoutRowMajor); + coopMatLoad(B1, tmp_k1, z4*8*2, 2, gl_CooperativeMatrixLayoutRowMajor); + coopMatLoad(B2, tmp_k2, z4*8*2, 2, gl_CooperativeMatrixLayoutRowMajor); + coopMatLoad(B3, tmp_k3, z4*8*2, 2, gl_CooperativeMatrixLayoutRowMajor); + + // sum += v * k + sum0 = coopMatMulAdd(A0, B0, sum0); + sum1 = coopMatMulAdd(A1, B0, sum1); + sum2 = coopMatMulAdd(A0, B1, sum2); + sum3 = coopMatMulAdd(A1, B1, sum3); + sum4 = coopMatMulAdd(A0, B2, sum4); + sum5 = coopMatMulAdd(A1, B2, sum5); + sum6 = coopMatMulAdd(A0, B3, sum6); + sum7 = coopMatMulAdd(A1, B3, sum7); + } + + barrier(); + } + + if (z < N) + { + const int remain = N - z; + + if (lxd8 < remain) + { + for (int j = 0; j < 2; j++) + { + const int tmp_vi = lxd8*16*2 + lxm8 * 2 + j; + + int v_offset = ((z + lxd8) * 2 + j) * psc(outcstep) + (gx + lxm8); + + tmp_v0[tmp_vi] = (gx + lxm8) < psc(outcstep) ? bottom_blob_data[v_offset] : uvec2(0); + tmp_v0[tmp_vi + 16] = (gx + lxm8 + 8) < psc(outcstep) ? bottom_blob_data[v_offset + 8] : uvec2(0); + tmp_v1[tmp_vi] = (gx + lxm8 + 16) < psc(outcstep) ? bottom_blob_data[v_offset + 16] : uvec2(0); + tmp_v1[tmp_vi + 16] = (gx + lxm8 + 24) < psc(outcstep) ? 
bottom_blob_data[v_offset + 24] : uvec2(0); + + const int tmp_ki = lxd8*8*2 + lxm8 * 2 + j; + + int w_offset = gy * psc(c) * 4 + (z + lxd8) * 2 * 8 + (lxm8 * 2 + j); + + tmp_k0[tmp_ki] = weight_data[w_offset]; + tmp_k1[tmp_ki] = weight_data[w_offset + psc(c) * 8]; + tmp_k2[tmp_ki] = weight_data[w_offset + psc(c) * 16]; + tmp_k3[tmp_ki] = weight_data[w_offset + psc(c) * 24]; + } + } + + barrier(); + + for (int z4 = 0; z4 < remain; z4++) + { + coopmat A0; + coopmat A1; + coopMatLoad(A0, tmp_v0, z4*16*2, 2, gl_CooperativeMatrixLayoutRowMajor); + coopMatLoad(A1, tmp_v1, z4*16*2, 2, gl_CooperativeMatrixLayoutRowMajor); + + coopmat B0; + coopmat B1; + coopmat B2; + coopmat B3; + coopMatLoad(B0, tmp_k0, z4*8*2, 2, gl_CooperativeMatrixLayoutRowMajor); + coopMatLoad(B1, tmp_k1, z4*8*2, 2, gl_CooperativeMatrixLayoutRowMajor); + coopMatLoad(B2, tmp_k2, z4*8*2, 2, gl_CooperativeMatrixLayoutRowMajor); + coopMatLoad(B3, tmp_k3, z4*8*2, 2, gl_CooperativeMatrixLayoutRowMajor); + + // sum += v * k + sum0 = coopMatMulAdd(A0, B0, sum0); + sum1 = coopMatMulAdd(A1, B0, sum1); + sum2 = coopMatMulAdd(A0, B1, sum2); + sum3 = coopMatMulAdd(A1, B1, sum3); + sum4 = coopMatMulAdd(A0, B2, sum4); + sum5 = coopMatMulAdd(A1, B2, sum5); + sum6 = coopMatMulAdd(A0, B3, sum6); + sum7 = coopMatMulAdd(A1, B3, sum7); + } + + barrier(); + } + + if (gx >= psc(outcstep) || gy >= psc(outc)) + return; + + coopmat sum0_fp16 = coopmat(sum0); + coopmat sum1_fp16 = coopmat(sum1); + coopmat sum2_fp16 = coopmat(sum2); + coopmat sum3_fp16 = coopmat(sum3); + coopmat sum4_fp16 = coopmat(sum4); + coopmat sum5_fp16 = coopmat(sum5); + coopmat sum6_fp16 = coopmat(sum6); + coopmat sum7_fp16 = coopmat(sum7); + + coopMatStore(sum0_fp16, tmp_v0, 0, 2, gl_CooperativeMatrixLayoutRowMajor); + coopMatStore(sum1_fp16, tmp_v1, 0, 2, gl_CooperativeMatrixLayoutRowMajor); + coopMatStore(sum2_fp16, tmp_v0, 16*2, 2, gl_CooperativeMatrixLayoutRowMajor); + coopMatStore(sum3_fp16, tmp_v1, 16*2, 2, gl_CooperativeMatrixLayoutRowMajor); + 
coopMatStore(sum4_fp16, tmp_v0, 16*4, 2, gl_CooperativeMatrixLayoutRowMajor); + coopMatStore(sum5_fp16, tmp_v1, 16*4, 2, gl_CooperativeMatrixLayoutRowMajor); + coopMatStore(sum6_fp16, tmp_v0, 16*6, 2, gl_CooperativeMatrixLayoutRowMajor); + coopMatStore(sum7_fp16, tmp_v1, 16*6, 2, gl_CooperativeMatrixLayoutRowMajor); + + barrier(); + + const int lxd16 = lx / 16; // 0 1 + const int lxm16 = lx % 16; // 0 1 2 3 .... 15 + + { + for (int j = 0; j < 4; j++) + { + const int tmp_vi = lxm16 * 2 + lxd16 + j*16*2; + const int gi = (gy + lxd16 + j*2) * psc(outcstep) + (gx + lxm16); + + if (gy + j * 2 + lxd16 < psc(outc)) + { + if (gx + lxm16 < psc(outcstep)) + { + uvec2 sum0_u2 = tmp_v0[tmp_vi]; + afpvec4 sum0 = afpvec4(unpackHalf2x16(sum0_u2.x), unpackHalf2x16(sum0_u2.y)); + sum0 = activation_afpvec4(sum0, activation_type, activation_param_0, activation_param_1); + buffer_st4(top_blob_data, gi, sum0); + } + if (gx + lxm16 + 16 < psc(outcstep)) + { + uvec2 sum1_u2 = tmp_v1[tmp_vi]; + afpvec4 sum1 = afpvec4(unpackHalf2x16(sum1_u2.x), unpackHalf2x16(sum1_u2.y)); + sum1 = activation_afpvec4(sum1, activation_type, activation_param_0, activation_param_1); + buffer_st4(top_blob_data, gi + 16, sum1); + } + } + } + } +} diff --git a/src/layer/vulkan/shader/convolution_pack4_1x1s1d1_nv_cm_16_16_16.comp b/src/layer/vulkan/shader/convolution_pack4_1x1s1d1_nv_cm_16_16_16.comp new file mode 100644 index 00000000000..2c0f57e708c --- /dev/null +++ b/src/layer/vulkan/shader/convolution_pack4_1x1s1d1_nv_cm_16_16_16.comp @@ -0,0 +1,239 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. 
You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +#extension GL_GOOGLE_include_directive: enable +#include "vulkan_activation.comp" + +#extension GL_KHR_memory_scope_semantics: require +#extension GL_EXT_shader_explicit_arithmetic_types: require +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#extension GL_NV_cooperative_matrix: require + +layout (constant_id = 0) const int bias_term = 0; +layout (constant_id = 1) const int activation_type = 0; +layout (constant_id = 2) const float activation_param_0 = 0; +layout (constant_id = 3) const float activation_param_1 = 0; + +#define shape_constant_id_offset 4 +layout (constant_id = shape_constant_id_offset + 0) const int w = 0; +layout (constant_id = shape_constant_id_offset + 1) const int h = 0; +layout (constant_id = shape_constant_id_offset + 2) const int c = 0; +layout (constant_id = shape_constant_id_offset + 3) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 4) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 5) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outcstep = 0; + +layout (binding = 0) readonly buffer bottom_blob { uvec2 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; +layout (binding = 2) readonly buffer weight_blob 
{ uvec2 weight_data[]; }; +layout (binding = 3) readonly buffer bias_blob { uvec2 bias_data[]; }; + +layout (push_constant) uniform parameter +{ + int w; + int h; + int c; + int cstep; + + int outw; + int outh; + int outc; + int outcstep; +} p; + +#define UNROLL_INCH 2 + +shared uvec2 tmp_v0[UNROLL_INCH * 16*4]; +shared uvec2 tmp_v1[UNROLL_INCH * 16*4]; +shared uvec2 tmp_k0[UNROLL_INCH * 16*4]; +shared uvec2 tmp_k1[UNROLL_INCH * 16*4]; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x) / 32 * 2 * 16; + int gy = int(gl_GlobalInvocationID.y) * 2 * 4; + + const int lx = int(gl_LocalInvocationID.x); + + const int lxd16 = lx / 16; // 0 1 + const int lxm16 = lx % 16; // 0 1 2 3 .... 15 + + fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum0; + fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum1; + fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum2; + fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum3; + + if (bias_term == 1) + { + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> bias0; + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> bias1; + + coopMatLoadNV(bias0, bias_data, gy, 0, false); + coopMatLoadNV(bias1, bias_data, gy + 4, 0, false); + + sum0 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(bias0); + sum1 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(bias0); + sum2 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(bias1); + sum3 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(bias1); + } + else + { + sum0 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f); + sum1 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f); + sum2 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f); + sum3 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f); + } + + const int N = psc(c) / 4; + + int z = 0; + for (; z + (UNROLL_INCH - 1) < N; z += UNROLL_INCH) + { + { + for (int j = 0; j < 4; j++) + { + const int tmp_i = lxd16*16*4 + lxm16 * 4 + j; + + const int v_offset = ((z + lxd16) * 4 + j) * psc(outcstep) + (gx + lxm16); + + tmp_v0[tmp_i] = (gx + lxm16) < psc(outcstep) ? 
bottom_blob_data[v_offset] : uvec2(0); + tmp_v1[tmp_i] = (gx + lxm16 + 16) < psc(outcstep) ? bottom_blob_data[v_offset + 16] : uvec2(0); + + const int w_offset = gy * psc(c) * 4 + (z + lxd16) * 4 * 16 + (lxm16 * 4 + j); + + tmp_k0[tmp_i] = weight_data[w_offset]; + tmp_k1[tmp_i] = weight_data[w_offset + psc(c) * 16]; + } + } + + barrier(); + + for (int z4 = 0; z4 < UNROLL_INCH; z4++) + { + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> A0; + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> A1; + coopMatLoadNV(A0, tmp_v0, z4*16*4, 4, false); + coopMatLoadNV(A1, tmp_v1, z4*16*4, 4, false); + + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> B0; + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> B1; + coopMatLoadNV(B0, tmp_k0, z4*16*4, 4, false); + coopMatLoadNV(B1, tmp_k1, z4*16*4, 4, false); + + // sum += v * k + sum0 = coopMatMulAddNV(A0, B0, sum0); + sum1 = coopMatMulAddNV(A1, B0, sum1); + sum2 = coopMatMulAddNV(A0, B1, sum2); + sum3 = coopMatMulAddNV(A1, B1, sum3); + } + + barrier(); + } + + if (z < N) + { + const int remain = N - z; + + if (lxd16 == 0) + { + for (int j = 0; j < 4; j++) + { + const int tmp_i = lxd16*16*4 + lxm16 * 4 + j; + + const int v_offset = ((z + lxd16) * 4 + j) * psc(outcstep) + (gx + lxm16); + + tmp_v0[tmp_i] = (gx + lxm16) < psc(outcstep) ? bottom_blob_data[v_offset] : uvec2(0); + tmp_v1[tmp_i] = (gx + lxm16 + 16) < psc(outcstep) ? 
bottom_blob_data[v_offset + 16] : uvec2(0); + + const int w_offset = gy * psc(c) * 4 + (z + lxd16) * 4 * 16 + (lxm16 * 4 + j); + + tmp_k0[tmp_i] = weight_data[w_offset]; + tmp_k1[tmp_i] = weight_data[w_offset + psc(c) * 16]; + } + } + + barrier(); + + for (int z4 = 0; z4 < remain; z4++) + { + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> A0; + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> A1; + coopMatLoadNV(A0, tmp_v0, z4*16*4, 4, false); + coopMatLoadNV(A1, tmp_v1, z4*16*4, 4, false); + + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> B0; + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> B1; + coopMatLoadNV(B0, tmp_k0, z4*16*4, 4, false); + coopMatLoadNV(B1, tmp_k1, z4*16*4, 4, false); + + // sum += v * k + sum0 = coopMatMulAddNV(A0, B0, sum0); + sum1 = coopMatMulAddNV(A1, B0, sum1); + sum2 = coopMatMulAddNV(A0, B1, sum2); + sum3 = coopMatMulAddNV(A1, B1, sum3); + } + + barrier(); + } + + if (gx >= psc(outcstep) || gy >= psc(outc)) + return; + + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum0_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum0); + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum1_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum1); + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum2_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum2); + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum3_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum3); + + coopMatStoreNV(sum0_fp16, tmp_v0, 0, 4, false); + coopMatStoreNV(sum1_fp16, tmp_v1, 0, 4, false); + coopMatStoreNV(sum2_fp16, tmp_v0, 16*4, 4, false); + coopMatStoreNV(sum3_fp16, tmp_v1, 16*4, 4, false); + + barrier(); + + { + for (int j = 0; j < 4; j++) + { + const int tmp_vi = lxm16 * 4 + j + lxd16*16*4; + + uvec2 sum0_u2 = tmp_v0[tmp_vi]; + uvec2 sum1_u2 = tmp_v1[tmp_vi]; + + afpvec4 sum0 = afpvec4(unpackHalf2x16(sum0_u2.x), unpackHalf2x16(sum0_u2.y)); + afpvec4 sum1 = afpvec4(unpackHalf2x16(sum1_u2.x), unpackHalf2x16(sum1_u2.y)); + + sum0 = activation_afpvec4(sum0, activation_type, activation_param_0, activation_param_1); + sum1 = 
activation_afpvec4(sum1, activation_type, activation_param_0, activation_param_1); + + const int gi = (gy + lxd16 * 4 + j) * psc(outcstep) + (gx + lxm16); + + if (gy + lxd16 * 4 + j < psc(outc)) + { + if (gx + lxm16 < psc(outcstep)) buffer_st4(top_blob_data, gi, sum0); + if (gx + lxm16 + 16 < psc(outcstep)) buffer_st4(top_blob_data, gi + 16, sum1); + } + } + } +} diff --git a/src/layer/vulkan/shader/convolution_pack4_1x1s1d1_nv_cm_16_8_8.comp b/src/layer/vulkan/shader/convolution_pack4_1x1s1d1_nv_cm_16_8_8.comp new file mode 100644 index 00000000000..97322e6ed9e --- /dev/null +++ b/src/layer/vulkan/shader/convolution_pack4_1x1s1d1_nv_cm_16_8_8.comp @@ -0,0 +1,298 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +#extension GL_GOOGLE_include_directive: enable +#include "vulkan_activation.comp" + +#extension GL_KHR_memory_scope_semantics: require +#extension GL_EXT_shader_explicit_arithmetic_types: require +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#extension GL_NV_cooperative_matrix: require + +layout (constant_id = 0) const int bias_term = 0; +layout (constant_id = 1) const int activation_type = 0; +layout (constant_id = 2) const float activation_param_0 = 0; +layout (constant_id = 3) const float activation_param_1 = 0; + +#define shape_constant_id_offset 4 +layout (constant_id = shape_constant_id_offset + 0) const int w = 0; +layout (constant_id = shape_constant_id_offset + 1) const int h = 0; +layout (constant_id = shape_constant_id_offset + 2) const int c = 0; +layout (constant_id = shape_constant_id_offset + 3) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 4) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 5) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outcstep = 0; + +layout (binding = 0) readonly buffer bottom_blob { uvec2 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; +layout (binding = 2) readonly buffer weight_blob { uvec2 weight_data[]; }; +layout (binding = 3) readonly buffer bias_blob { uvec2 bias_data[]; }; + +layout (push_constant) uniform parameter +{ + int w; + int h; + int c; + int cstep; + + int outw; + int outh; + int outc; + int outcstep; +} p; + +#define UNROLL_INCH 4 + +shared uvec2 tmp_v0[UNROLL_INCH * 16*2]; +shared uvec2 tmp_v1[UNROLL_INCH * 16*2]; +shared uvec2 tmp_k0[UNROLL_INCH * 8*2]; +shared uvec2 
tmp_k1[UNROLL_INCH * 8*2]; +shared uvec2 tmp_k2[UNROLL_INCH * 8*2]; +shared uvec2 tmp_k3[UNROLL_INCH * 8*2]; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x) / 32 * 2 * 16; + int gy = int(gl_GlobalInvocationID.y) * 2 * 4; + + const int lx = int(gl_LocalInvocationID.x); + + const int lxd8 = lx / 8; // 0 1 2 3 + const int lxm8 = lx % 8; // 0 1 2 3 .... 7 + + fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum0; + fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum1; + fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum2; + fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum3; + fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum4; + fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum5; + fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum6; + fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum7; + + if (bias_term == 1) + { + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> bias0; + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> bias1; + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> bias2; + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> bias3; + + coopMatLoadNV(bias0, bias_data, gy, 0, false); + coopMatLoadNV(bias1, bias_data, gy + 2, 0, false); + coopMatLoadNV(bias2, bias_data, gy + 4, 0, false); + coopMatLoadNV(bias3, bias_data, gy + 6, 0, false); + + sum0 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias0); + sum1 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias0); + sum2 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias1); + sum3 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias1); + sum4 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias2); + sum5 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias2); + sum6 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias3); + sum7 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias3); + } + else + { + sum0 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); + sum1 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); + sum2 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); + sum3 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); + sum4 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); + sum5 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); + sum6 = 
fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); + sum7 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); + } + + const int N = psc(c) / 2; + + int z = 0; + for (; z + (UNROLL_INCH - 1) < N; z += UNROLL_INCH) + { + { + for (int j = 0; j < 2; j++) + { + const int tmp_vi = lxd8*16*2 + lxm8 * 2 + j; + + int v_offset = ((z + lxd8) * 2 + j) * psc(outcstep) + (gx + lxm8); + + tmp_v0[tmp_vi] = (gx + lxm8) < psc(outcstep) ? bottom_blob_data[v_offset] : uvec2(0); + tmp_v0[tmp_vi + 16] = (gx + lxm8 + 8) < psc(outcstep) ? bottom_blob_data[v_offset + 8] : uvec2(0); + tmp_v1[tmp_vi] = (gx + lxm8 + 16) < psc(outcstep) ? bottom_blob_data[v_offset + 16] : uvec2(0); + tmp_v1[tmp_vi + 16] = (gx + lxm8 + 24) < psc(outcstep) ? bottom_blob_data[v_offset + 24] : uvec2(0); + + const int tmp_ki = lxd8*8*2 + lxm8 * 2 + j; + + int w_offset = gy * psc(c) * 4 + (z + lxd8) * 2 * 8 + (lxm8 * 2 + j); + + tmp_k0[tmp_ki] = weight_data[w_offset]; + tmp_k1[tmp_ki] = weight_data[w_offset + psc(c) * 8]; + tmp_k2[tmp_ki] = weight_data[w_offset + psc(c) * 16]; + tmp_k3[tmp_ki] = weight_data[w_offset + psc(c) * 24]; + } + } + + barrier(); + + for (int z4 = 0; z4 < UNROLL_INCH; z4++) + { + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A0; + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A1; + coopMatLoadNV(A0, tmp_v0, z4*16*2, 2, false); + coopMatLoadNV(A1, tmp_v1, z4*16*2, 2, false); + + fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B0; + fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B1; + fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B2; + fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B3; + coopMatLoadNV(B0, tmp_k0, z4*8*2, 2, false); + coopMatLoadNV(B1, tmp_k1, z4*8*2, 2, false); + coopMatLoadNV(B2, tmp_k2, z4*8*2, 2, false); + coopMatLoadNV(B3, tmp_k3, z4*8*2, 2, false); + + // sum += v * k + sum0 = coopMatMulAddNV(A0, B0, sum0); + sum1 = coopMatMulAddNV(A1, B0, sum1); + sum2 = coopMatMulAddNV(A0, B1, sum2); + sum3 = coopMatMulAddNV(A1, B1, sum3); + sum4 = coopMatMulAddNV(A0, B2, sum4); + sum5 = coopMatMulAddNV(A1, B2, sum5); + sum6 = 
coopMatMulAddNV(A0, B3, sum6); + sum7 = coopMatMulAddNV(A1, B3, sum7); + } + + barrier(); + } + + if (z < N) + { + const int remain = N - z; + + if (lxd8 < remain) + { + for (int j = 0; j < 2; j++) + { + const int tmp_vi = lxd8*16*2 + lxm8 * 2 + j; + + int v_offset = ((z + lxd8) * 2 + j) * psc(outcstep) + (gx + lxm8); + + tmp_v0[tmp_vi] = (gx + lxm8) < psc(outcstep) ? bottom_blob_data[v_offset] : uvec2(0); + tmp_v0[tmp_vi + 16] = (gx + lxm8 + 8) < psc(outcstep) ? bottom_blob_data[v_offset + 8] : uvec2(0); + tmp_v1[tmp_vi] = (gx + lxm8 + 16) < psc(outcstep) ? bottom_blob_data[v_offset + 16] : uvec2(0); + tmp_v1[tmp_vi + 16] = (gx + lxm8 + 24) < psc(outcstep) ? bottom_blob_data[v_offset + 24] : uvec2(0); + + const int tmp_ki = lxd8*8*2 + lxm8 * 2 + j; + + int w_offset = gy * psc(c) * 4 + (z + lxd8) * 2 * 8 + (lxm8 * 2 + j); + + tmp_k0[tmp_ki] = weight_data[w_offset]; + tmp_k1[tmp_ki] = weight_data[w_offset + psc(c) * 8]; + tmp_k2[tmp_ki] = weight_data[w_offset + psc(c) * 16]; + tmp_k3[tmp_ki] = weight_data[w_offset + psc(c) * 24]; + } + } + + barrier(); + + for (int z4 = 0; z4 < remain; z4++) + { + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A0; + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A1; + coopMatLoadNV(A0, tmp_v0, z4*16*2, 2, false); + coopMatLoadNV(A1, tmp_v1, z4*16*2, 2, false); + + fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B0; + fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B1; + fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B2; + fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B3; + coopMatLoadNV(B0, tmp_k0, z4*8*2, 2, false); + coopMatLoadNV(B1, tmp_k1, z4*8*2, 2, false); + coopMatLoadNV(B2, tmp_k2, z4*8*2, 2, false); + coopMatLoadNV(B3, tmp_k3, z4*8*2, 2, false); + + // sum += v * k + sum0 = coopMatMulAddNV(A0, B0, sum0); + sum1 = coopMatMulAddNV(A1, B0, sum1); + sum2 = coopMatMulAddNV(A0, B1, sum2); + sum3 = coopMatMulAddNV(A1, B1, sum3); + sum4 = coopMatMulAddNV(A0, B2, sum4); + sum5 = coopMatMulAddNV(A1, B2, sum5); + sum6 = coopMatMulAddNV(A0, B3, sum6); + sum7 = coopMatMulAddNV(A1, 
B3, sum7); + } + + barrier(); + } + + if (gx >= psc(outcstep) || gy >= psc(outc)) + return; + + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum0_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum0); + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum1_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum1); + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum2_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum2); + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum3_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum3); + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum4_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum4); + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum5_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum5); + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum6_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum6); + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum7_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum7); + + coopMatStoreNV(sum0_fp16, tmp_v0, 0, 2, false); + coopMatStoreNV(sum1_fp16, tmp_v1, 0, 2, false); + coopMatStoreNV(sum2_fp16, tmp_v0, 16*2, 2, false); + coopMatStoreNV(sum3_fp16, tmp_v1, 16*2, 2, false); + coopMatStoreNV(sum4_fp16, tmp_v0, 16*4, 2, false); + coopMatStoreNV(sum5_fp16, tmp_v1, 16*4, 2, false); + coopMatStoreNV(sum6_fp16, tmp_v0, 16*6, 2, false); + coopMatStoreNV(sum7_fp16, tmp_v1, 16*6, 2, false); + + barrier(); + + const int lxd16 = lx / 16; // 0 1 + const int lxm16 = lx % 16; // 0 1 2 3 .... 
15 + + { + for (int j = 0; j < 4; j++) + { + const int tmp_vi = lxm16 * 2 + lxd16 + j*16*2; + const int gi = (gy + lxd16 + j*2) * psc(outcstep) + (gx + lxm16); + + if (gy + j * 2 + lxd16 < psc(outc)) + { + if (gx + lxm16 < psc(outcstep)) + { + uvec2 sum0_u2 = tmp_v0[tmp_vi]; + afpvec4 sum0 = afpvec4(unpackHalf2x16(sum0_u2.x), unpackHalf2x16(sum0_u2.y)); + sum0 = activation_afpvec4(sum0, activation_type, activation_param_0, activation_param_1); + buffer_st4(top_blob_data, gi, sum0); + } + if (gx + lxm16 + 16 < psc(outcstep)) + { + uvec2 sum1_u2 = tmp_v1[tmp_vi]; + afpvec4 sum1 = afpvec4(unpackHalf2x16(sum1_u2.x), unpackHalf2x16(sum1_u2.y)); + sum1 = activation_afpvec4(sum1, activation_type, activation_param_0, activation_param_1); + buffer_st4(top_blob_data, gi + 16, sum1); + } + } + } + } +} diff --git a/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd_gemm_cm_16_8_8.comp b/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd_gemm_cm_16_8_8.comp deleted file mode 100644 index bb72fc76d9e..00000000000 --- a/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd_gemm_cm_16_8_8.comp +++ /dev/null @@ -1,210 +0,0 @@ -// Tencent is pleased to support the open source community by making ncnn available. -// -// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. -// -// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except -// in compliance with the License. You may obtain a copy of the License at -// -// https://opensource.org/licenses/BSD-3-Clause -// -// Unless required by applicable law or agreed to in writing, software distributed -// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR -// CONDITIONS OF ANY KIND, either express or implied. See the License for the -// specific language governing permissions and limitations under the License. 
- -#version 450 - -#if NCNN_fp16_storage -#extension GL_EXT_shader_16bit_storage: require -#endif -#if NCNN_fp16_arithmetic -#extension GL_EXT_shader_explicit_arithmetic_types_float16: require -#endif - -#extension GL_KHR_memory_scope_semantics: require -#extension GL_EXT_shader_explicit_arithmetic_types: require -#extension GL_EXT_shader_explicit_arithmetic_types_float16: require -#extension GL_NV_cooperative_matrix: require - -layout (constant_id = 0) const int batch = 1; - -#define shape_constant_id_offset 1 -layout (constant_id = shape_constant_id_offset + 0) const int c = 0; -layout (constant_id = shape_constant_id_offset + 1) const int cstep = 0; - -layout (constant_id = shape_constant_id_offset + 2) const int outw = 0; -layout (constant_id = shape_constant_id_offset + 3) const int outc = 0; -layout (constant_id = shape_constant_id_offset + 4) const int outcstep = 0; - -layout (binding = 0) readonly buffer bottom_tm_blob { uvec2 bottom_tm_blob_data[]; }; -layout (binding = 1) writeonly buffer top_tm_blob { uvec2 top_tm_blob_data[]; }; -layout (binding = 2) readonly buffer weight_tm_blob { uvec2 weight_tm_data[]; }; - -layout (push_constant) uniform parameter -{ - int c; - int cstep; - - int outw; - int outc; - int outcstep; -} p; - -#define LOCAL_SIZE_Y 4 -#define UNROLL_INCH 4 - -shared uvec2 tmp_v0[UNROLL_INCH * 16*2]; -shared uvec2 tmp_v1[UNROLL_INCH * 16*2]; -shared uvec2 tmp_v2[UNROLL_INCH * 16*2]; -shared uvec2 tmp_v3[UNROLL_INCH * 16*2]; -shared uvec2 tmp_k[LOCAL_SIZE_Y * UNROLL_INCH * 8*2]; - -void main() -{ - int gx = int(gl_GlobalInvocationID.x) / 32 * 4 * 16; - int gy = int(gl_GlobalInvocationID.y) * 2; - int gz = int(gl_GlobalInvocationID.z); - - fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum0 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); - fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum1 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); - fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum2 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); - fcoopmatNV<32, 
gl_ScopeSubgroup, 16, 8> sum3 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); - - const int lx = int(gl_LocalInvocationID.x); - const int ly = int(gl_LocalInvocationID.y); - - const int lxd16 = lx / 16; // 0 1 - const int lxm16 = lx % 16; // 0 1 2 3 .... 15 - - const int lxd8 = lx / 8; // 0 1 2 3 - const int lxm8 = lx % 8; // 0 1 2 3 .... 7 - - int N = psc(c) / 2; - - int z = 0; - for (; z + (UNROLL_INCH - 1) < N; z += UNROLL_INCH) - { - { - int tmp_vi = ly*16*2 + lxm16*2+lxd16; - int v_offset = gz * psc(cstep) + (z + ly) * 2 * psc(outw) + gx + lxd16 * psc(outw) + lxm16; - - tmp_v0[tmp_vi] = (gx + lxm16) < psc(outw) ? bottom_tm_blob_data[v_offset] : uvec2(0); - tmp_v1[tmp_vi] = (gx + 16 + lxm16) < psc(outw) ? bottom_tm_blob_data[v_offset + 16] : uvec2(0); - tmp_v2[tmp_vi] = (gx + 32 + lxm16) < psc(outw) ? bottom_tm_blob_data[v_offset + 32] : uvec2(0); - tmp_v3[tmp_vi] = (gx + 48 + lxm16) < psc(outw) ? bottom_tm_blob_data[v_offset + 48] : uvec2(0); - } - - if (lx < 16) - { - for (int z4 = 0; z4 < UNROLL_INCH; z4++) - { - int tmp_ki = ly*UNROLL_INCH*8*2 + z4*8*2 + lxm8*2+lxd8; - int w_offset = gz * psc(c) * psc(outc) * 4 + gy * psc(c) * 4 + (z + z4) * 16 + lxm8 * 2 + lxd8; - - tmp_k[tmp_ki] = weight_tm_data[w_offset]; - } - } - - barrier(); - - for (int z4 = 0; z4 < UNROLL_INCH; z4++) - { - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A0; - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A1; - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A2; - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A3; - coopMatLoadNV(A0, tmp_v0, z4*16*2, 2, false); - coopMatLoadNV(A1, tmp_v1, z4*16*2, 2, false); - coopMatLoadNV(A2, tmp_v2, z4*16*2, 2, false); - coopMatLoadNV(A3, tmp_v3, z4*16*2, 2, false); - - fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B; - coopMatLoadNV(B, tmp_k, ly*UNROLL_INCH*8*2 + z4*8*2, 2, false); - - // sum += v * k - sum0 = coopMatMulAddNV(A0, B, sum0); - sum1 = coopMatMulAddNV(A1, B, sum1); - sum2 = coopMatMulAddNV(A2, B, sum2); - sum3 = coopMatMulAddNV(A3, B, sum3); - } - - barrier(); - } - - 
if (z < N) - { - const int remain = N - z; - - if (ly < remain) - { - int tmp_vi = ly*16*2 + lxm16*2+lxd16; - int v_offset = gz * psc(cstep) + (z + ly) * 2 * psc(outw) + gx + lxd16 * psc(outw) + lxm16; - - tmp_v0[tmp_vi] = (gx + lxm16) < psc(outw) ? bottom_tm_blob_data[v_offset] : uvec2(0); - tmp_v1[tmp_vi] = (gx + 16 + lxm16) < psc(outw) ? bottom_tm_blob_data[v_offset + 16] : uvec2(0); - tmp_v2[tmp_vi] = (gx + 32 + lxm16) < psc(outw) ? bottom_tm_blob_data[v_offset + 32] : uvec2(0); - tmp_v3[tmp_vi] = (gx + 48 + lxm16) < psc(outw) ? bottom_tm_blob_data[v_offset + 48] : uvec2(0); - } - - if (lx < 16) - { - for (int z4 = 0; z4 < remain; z4++) - { - int tmp_ki = ly*UNROLL_INCH*8*2 + z4*8*2 + lxm8*2+lxd8; - int w_offset = gz * psc(c) * psc(outc) * 4 + gy * psc(c) * 4 + (z + z4) * 16 + lxm8 * 2 + lxd8; - - tmp_k[tmp_ki] = weight_tm_data[w_offset]; - } - } - - barrier(); - - for (int z4 = 0; z4 < remain; z4++) - { - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A0; - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A1; - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A2; - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A3; - coopMatLoadNV(A0, tmp_v0, z4*16*2, 2, false); - coopMatLoadNV(A1, tmp_v1, z4*16*2, 2, false); - coopMatLoadNV(A2, tmp_v2, z4*16*2, 2, false); - coopMatLoadNV(A3, tmp_v3, z4*16*2, 2, false); - - fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B; - coopMatLoadNV(B, tmp_k, ly*UNROLL_INCH*8*2 + z4*8*2, 2, false); - - // sum += v * k - sum0 = coopMatMulAddNV(A0, B, sum0); - sum1 = coopMatMulAddNV(A1, B, sum1); - sum2 = coopMatMulAddNV(A2, B, sum2); - sum3 = coopMatMulAddNV(A3, B, sum3); - } - - barrier(); - } - - if (gx >= psc(outw) || gy >= psc(outc) || gz >= batch) - return; - - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum0_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum0); - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum1_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum1); - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum2_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum2); - fcoopmatNV<16, 
gl_ScopeSubgroup, 16, 8> sum3_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum3); - - coopMatStoreNV(sum0_fp16, tmp_v0, ly*16*2, 2, false); - coopMatStoreNV(sum1_fp16, tmp_v1, ly*16*2, 2, false); - coopMatStoreNV(sum2_fp16, tmp_v2, ly*16*2, 2, false); - coopMatStoreNV(sum3_fp16, tmp_v3, ly*16*2, 2, false); - - barrier(); - - { - int tmp_vi = ly*16*2 + lxm16*2+lxd16; - int gi = gz * psc(outcstep) + gy * psc(outw) + gx + lxd16 * psc(outw) + lxm16; - - if (gx + lxm16 < psc(outw)) top_tm_blob_data[gi] = tmp_v0[tmp_vi]; - if (gx + 16 + lxm16 < psc(outw)) top_tm_blob_data[gi + 16] = tmp_v1[tmp_vi]; - if (gx + 32 + lxm16 < psc(outw)) top_tm_blob_data[gi + 32] = tmp_v2[tmp_vi]; - if (gx + 48 + lxm16 < psc(outw)) top_tm_blob_data[gi + 48] = tmp_v3[tmp_vi]; - } -} diff --git a/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd_gemm_khr_cm_16_16_16.comp b/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd_gemm_khr_cm_16_16_16.comp new file mode 100644 index 00000000000..c4a494e917a --- /dev/null +++ b/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd_gemm_khr_cm_16_16_16.comp @@ -0,0 +1,196 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +#extension GL_KHR_memory_scope_semantics: require +#extension GL_EXT_shader_explicit_arithmetic_types: require +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#extension GL_KHR_cooperative_matrix: require + +layout (constant_id = 0) const int batch = 1; + +#define shape_constant_id_offset 1 +layout (constant_id = shape_constant_id_offset + 0) const int c = 0; +layout (constant_id = shape_constant_id_offset + 1) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 2) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 3) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 4) const int outcstep = 0; + +layout (binding = 0) readonly buffer bottom_tm_blob { uvec2 bottom_tm_blob_data[]; }; +layout (binding = 1) writeonly buffer top_tm_blob { uvec2 top_tm_blob_data[]; }; +layout (binding = 2) readonly buffer weight_tm_blob { uvec2 weight_tm_data[]; }; + +layout (push_constant) uniform parameter +{ + int c; + int cstep; + + int outw; + int outc; + int outcstep; +} p; + +#define UNROLL_INCH 2 + +shared uvec2 tmp_v0[UNROLL_INCH * 16*4]; +shared uvec2 tmp_v1[UNROLL_INCH * 16*4]; +shared uvec2 tmp_k0[UNROLL_INCH * 16*4]; +shared uvec2 tmp_k1[UNROLL_INCH * 16*4]; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x) / 32 * 2 * 16; + int gy = int(gl_GlobalInvocationID.y) * 2 * 4; + int gz = int(gl_GlobalInvocationID.z); + + const int lx = int(gl_LocalInvocationID.x); + + const int lxd16 = lx / 16; // 0 1 + const int lxm16 = lx % 16; // 0 1 2 3 .... 
15 + + coopmat sum0 = coopmat(0.f); + coopmat sum1 = coopmat(0.f); + coopmat sum2 = coopmat(0.f); + coopmat sum3 = coopmat(0.f); + + const int N = psc(c) / 4; + + int z = 0; + for (; z + (UNROLL_INCH - 1) < N; z += UNROLL_INCH) + { + { + for (int j = 0; j < 4; j++) + { + const int tmp_i = lxd16*16*4 + lxm16 * 4 + j; + + const int v_offset = gz * psc(cstep) + ((z + lxd16) * 4 + j) * psc(outw) + (gx + lxm16); + + tmp_v0[tmp_i] = (gx + lxm16) < psc(outw) ? bottom_tm_blob_data[v_offset] : uvec2(0); + tmp_v1[tmp_i] = (gx + lxm16 + 16) < psc(outw) ? bottom_tm_blob_data[v_offset + 16] : uvec2(0); + + const int w_offset = gz * psc(outc) * psc(c) * 4 + gy * psc(c) * 4 + (z + lxd16) * 4 * 16 + (lxm16 * 4 + j); + + tmp_k0[tmp_i] = weight_tm_data[w_offset]; + tmp_k1[tmp_i] = weight_tm_data[w_offset + psc(c) * 16]; + } + } + + barrier(); + + for (int z4 = 0; z4 < UNROLL_INCH; z4++) + { + coopmat A0; + coopmat A1; + coopMatLoad(A0, tmp_v0, z4*16*4, 4, gl_CooperativeMatrixLayoutRowMajor); + coopMatLoad(A1, tmp_v1, z4*16*4, 4, gl_CooperativeMatrixLayoutRowMajor); + + coopmat B0; + coopmat B1; + coopMatLoad(B0, tmp_k0, z4*16*4, 4, gl_CooperativeMatrixLayoutRowMajor); + coopMatLoad(B1, tmp_k1, z4*16*4, 4, gl_CooperativeMatrixLayoutRowMajor); + + // sum += v * k + sum0 = coopMatMulAdd(A0, B0, sum0); + sum1 = coopMatMulAdd(A1, B0, sum1); + sum2 = coopMatMulAdd(A0, B1, sum2); + sum3 = coopMatMulAdd(A1, B1, sum3); + } + + barrier(); + } + + if (z < N) + { + const int remain = N - z; + + if (lxd16 == 0) + { + for (int j = 0; j < 4; j++) + { + const int tmp_i = lxd16*16*4 + lxm16 * 4 + j; + + const int v_offset = gz * psc(cstep) + ((z + lxd16) * 4 + j) * psc(outw) + (gx + lxm16); + + tmp_v0[tmp_i] = (gx + lxm16) < psc(outw) ? bottom_tm_blob_data[v_offset] : uvec2(0); + tmp_v1[tmp_i] = (gx + lxm16 + 16) < psc(outw) ? 
bottom_tm_blob_data[v_offset + 16] : uvec2(0); + + const int w_offset = gz * psc(outc) * psc(c) * 4 + gy * psc(c) * 4 + (z + lxd16) * 4 * 16 + (lxm16 * 4 + j); + + tmp_k0[tmp_i] = weight_tm_data[w_offset]; + tmp_k1[tmp_i] = weight_tm_data[w_offset + psc(c) * 16]; + } + } + + barrier(); + + for (int z4 = 0; z4 < remain; z4++) + { + coopmat A0; + coopmat A1; + coopMatLoad(A0, tmp_v0, z4*16*4, 4, gl_CooperativeMatrixLayoutRowMajor); + coopMatLoad(A1, tmp_v1, z4*16*4, 4, gl_CooperativeMatrixLayoutRowMajor); + + coopmat B0; + coopmat B1; + coopMatLoad(B0, tmp_k0, z4*16*4, 4, gl_CooperativeMatrixLayoutRowMajor); + coopMatLoad(B1, tmp_k1, z4*16*4, 4, gl_CooperativeMatrixLayoutRowMajor); + + // sum += v * k + sum0 = coopMatMulAdd(A0, B0, sum0); + sum1 = coopMatMulAdd(A1, B0, sum1); + sum2 = coopMatMulAdd(A0, B1, sum2); + sum3 = coopMatMulAdd(A1, B1, sum3); + } + + barrier(); + } + + if (gx >= psc(outw) || gy >= psc(outc) || gz >= batch) + return; + + coopmat sum0_fp16 = coopmat(sum0); + coopmat sum1_fp16 = coopmat(sum1); + coopmat sum2_fp16 = coopmat(sum2); + coopmat sum3_fp16 = coopmat(sum3); + + coopMatStore(sum0_fp16, tmp_v0, 0, 4, gl_CooperativeMatrixLayoutRowMajor); + coopMatStore(sum1_fp16, tmp_v1, 0, 4, gl_CooperativeMatrixLayoutRowMajor); + coopMatStore(sum2_fp16, tmp_v0, 16*4, 4, gl_CooperativeMatrixLayoutRowMajor); + coopMatStore(sum3_fp16, tmp_v1, 16*4, 4, gl_CooperativeMatrixLayoutRowMajor); + + barrier(); + + { + for (int j = 0; j < 4; j++) + { + const int tmp_vi = lxm16 * 4 + j + lxd16*16*4; + const int gi = gz * psc(outcstep) + (gy + lxd16 * 4 + j) * psc(outw) + (gx + lxm16); + + if (gy + lxd16 * 4 + j < psc(outc)) + { + if (gx + lxm16 < psc(outw)) top_tm_blob_data[gi] = tmp_v0[tmp_vi]; + if (gx + lxm16 + 16 < psc(outw)) top_tm_blob_data[gi + 16] = tmp_v1[tmp_vi]; + } + } + } +} diff --git a/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd_gemm_khr_cm_16_8_8.comp b/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd_gemm_khr_cm_16_8_8.comp 
new file mode 100644 index 00000000000..785c917bbf4 --- /dev/null +++ b/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd_gemm_khr_cm_16_8_8.comp @@ -0,0 +1,241 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +#extension GL_KHR_memory_scope_semantics: require +#extension GL_EXT_shader_explicit_arithmetic_types: require +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#extension GL_KHR_cooperative_matrix: require + +layout (constant_id = 0) const int batch = 1; + +#define shape_constant_id_offset 1 +layout (constant_id = shape_constant_id_offset + 0) const int c = 0; +layout (constant_id = shape_constant_id_offset + 1) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 2) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 3) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 4) const int outcstep = 0; + +layout (binding = 0) readonly buffer bottom_tm_blob { uvec2 bottom_tm_blob_data[]; }; +layout (binding = 1) writeonly buffer top_tm_blob { uvec2 top_tm_blob_data[]; }; +layout (binding = 2) readonly buffer 
weight_tm_blob { uvec2 weight_tm_data[]; }; + +layout (push_constant) uniform parameter +{ + int c; + int cstep; + + int outw; + int outc; + int outcstep; +} p; + +#define UNROLL_INCH 4 + +shared uvec2 tmp_v0[UNROLL_INCH * 16*2]; +shared uvec2 tmp_v1[UNROLL_INCH * 16*2]; +shared uvec2 tmp_k0[UNROLL_INCH * 8*2]; +shared uvec2 tmp_k1[UNROLL_INCH * 8*2]; +shared uvec2 tmp_k2[UNROLL_INCH * 8*2]; +shared uvec2 tmp_k3[UNROLL_INCH * 8*2]; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x) / 32 * 2 * 16; + int gy = int(gl_GlobalInvocationID.y) * 2 * 4; + int gz = int(gl_GlobalInvocationID.z); + + const int lx = int(gl_LocalInvocationID.x); + + const int lxd8 = lx / 8; // 0 1 2 3 + const int lxm8 = lx % 8; // 0 1 2 3 .... 7 + + coopmat sum0 = coopmat(0.f); + coopmat sum1 = coopmat(0.f); + coopmat sum2 = coopmat(0.f); + coopmat sum3 = coopmat(0.f); + coopmat sum4 = coopmat(0.f); + coopmat sum5 = coopmat(0.f); + coopmat sum6 = coopmat(0.f); + coopmat sum7 = coopmat(0.f); + + const int N = psc(c) / 2; + + int z = 0; + for (; z + (UNROLL_INCH - 1) < N; z += UNROLL_INCH) + { + { + for (int j = 0; j < 2; j++) + { + const int tmp_vi = lxd8*16*2 + lxm8 * 2 + j; + + int v_offset = gz * psc(cstep) + ((z + lxd8) * 2 + j) * psc(outw) + (gx + lxm8); + + tmp_v0[tmp_vi] = (gx + lxm8) < psc(outw) ? bottom_tm_blob_data[v_offset] : uvec2(0); + tmp_v0[tmp_vi + 16] = (gx + lxm8 + 8) < psc(outw) ? bottom_tm_blob_data[v_offset + 8] : uvec2(0); + tmp_v1[tmp_vi] = (gx + lxm8 + 16) < psc(outw) ? bottom_tm_blob_data[v_offset + 16] : uvec2(0); + tmp_v1[tmp_vi + 16] = (gx + lxm8 + 24) < psc(outw) ? 
bottom_tm_blob_data[v_offset + 24] : uvec2(0); + + const int tmp_ki = lxd8*8*2 + lxm8 * 2 + j; + + int w_offset = gz * psc(outc) * psc(c) * 4 + gy * psc(c) * 4 + (z + lxd8) * 2 * 8 + (lxm8 * 2 + j); + + tmp_k0[tmp_ki] = weight_tm_data[w_offset]; + tmp_k1[tmp_ki] = weight_tm_data[w_offset + psc(c) * 8]; + tmp_k2[tmp_ki] = weight_tm_data[w_offset + psc(c) * 16]; + tmp_k3[tmp_ki] = weight_tm_data[w_offset + psc(c) * 24]; + } + } + + barrier(); + + for (int z4 = 0; z4 < UNROLL_INCH; z4++) + { + coopmat A0; + coopmat A1; + coopMatLoad(A0, tmp_v0, z4*16*2, 2, gl_CooperativeMatrixLayoutRowMajor); + coopMatLoad(A1, tmp_v1, z4*16*2, 2, gl_CooperativeMatrixLayoutRowMajor); + + coopmat B0; + coopmat B1; + coopmat B2; + coopmat B3; + coopMatLoad(B0, tmp_k0, z4*8*2, 2, gl_CooperativeMatrixLayoutRowMajor); + coopMatLoad(B1, tmp_k1, z4*8*2, 2, gl_CooperativeMatrixLayoutRowMajor); + coopMatLoad(B2, tmp_k2, z4*8*2, 2, gl_CooperativeMatrixLayoutRowMajor); + coopMatLoad(B3, tmp_k3, z4*8*2, 2, gl_CooperativeMatrixLayoutRowMajor); + + // sum += v * k + sum0 = coopMatMulAdd(A0, B0, sum0); + sum1 = coopMatMulAdd(A1, B0, sum1); + sum2 = coopMatMulAdd(A0, B1, sum2); + sum3 = coopMatMulAdd(A1, B1, sum3); + sum4 = coopMatMulAdd(A0, B2, sum4); + sum5 = coopMatMulAdd(A1, B2, sum5); + sum6 = coopMatMulAdd(A0, B3, sum6); + sum7 = coopMatMulAdd(A1, B3, sum7); + } + + barrier(); + } + + if (z < N) + { + const int remain = N - z; + + if (lxd8 < remain) + { + for (int j = 0; j < 2; j++) + { + const int tmp_vi = lxd8*16*2 + lxm8 * 2 + j; + + int v_offset = gz * psc(cstep) + ((z + lxd8) * 2 + j) * psc(outw) + (gx + lxm8); + + tmp_v0[tmp_vi] = (gx + lxm8) < psc(outw) ? bottom_tm_blob_data[v_offset] : uvec2(0); + tmp_v0[tmp_vi + 16] = (gx + lxm8 + 8) < psc(outw) ? bottom_tm_blob_data[v_offset + 8] : uvec2(0); + tmp_v1[tmp_vi] = (gx + lxm8 + 16) < psc(outw) ? bottom_tm_blob_data[v_offset + 16] : uvec2(0); + tmp_v1[tmp_vi + 16] = (gx + lxm8 + 24) < psc(outw) ? 
bottom_tm_blob_data[v_offset + 24] : uvec2(0); + + const int tmp_ki = lxd8*8*2 + lxm8 * 2 + j; + + int w_offset = gz * psc(outc) * psc(c) * 4 + gy * psc(c) * 4 + (z + lxd8) * 2 * 8 + (lxm8 * 2 + j); + + tmp_k0[tmp_ki] = weight_tm_data[w_offset]; + tmp_k1[tmp_ki] = weight_tm_data[w_offset + psc(c) * 8]; + tmp_k2[tmp_ki] = weight_tm_data[w_offset + psc(c) * 16]; + tmp_k3[tmp_ki] = weight_tm_data[w_offset + psc(c) * 24]; + } + } + + barrier(); + + for (int z4 = 0; z4 < remain; z4++) + { + coopmat A0; + coopmat A1; + coopMatLoad(A0, tmp_v0, z4*16*2, 2, gl_CooperativeMatrixLayoutRowMajor); + coopMatLoad(A1, tmp_v1, z4*16*2, 2, gl_CooperativeMatrixLayoutRowMajor); + + coopmat B0; + coopmat B1; + coopmat B2; + coopmat B3; + coopMatLoad(B0, tmp_k0, z4*8*2, 2, gl_CooperativeMatrixLayoutRowMajor); + coopMatLoad(B1, tmp_k1, z4*8*2, 2, gl_CooperativeMatrixLayoutRowMajor); + coopMatLoad(B2, tmp_k2, z4*8*2, 2, gl_CooperativeMatrixLayoutRowMajor); + coopMatLoad(B3, tmp_k3, z4*8*2, 2, gl_CooperativeMatrixLayoutRowMajor); + + // sum += v * k + sum0 = coopMatMulAdd(A0, B0, sum0); + sum1 = coopMatMulAdd(A1, B0, sum1); + sum2 = coopMatMulAdd(A0, B1, sum2); + sum3 = coopMatMulAdd(A1, B1, sum3); + sum4 = coopMatMulAdd(A0, B2, sum4); + sum5 = coopMatMulAdd(A1, B2, sum5); + sum6 = coopMatMulAdd(A0, B3, sum6); + sum7 = coopMatMulAdd(A1, B3, sum7); + } + + barrier(); + } + + if (gx >= psc(outw) || gy >= psc(outc) || gz >= batch) + return; + + coopmat sum0_fp16 = coopmat(sum0); + coopmat sum1_fp16 = coopmat(sum1); + coopmat sum2_fp16 = coopmat(sum2); + coopmat sum3_fp16 = coopmat(sum3); + coopmat sum4_fp16 = coopmat(sum4); + coopmat sum5_fp16 = coopmat(sum5); + coopmat sum6_fp16 = coopmat(sum6); + coopmat sum7_fp16 = coopmat(sum7); + + coopMatStore(sum0_fp16, tmp_v0, 0, 2, gl_CooperativeMatrixLayoutRowMajor); + coopMatStore(sum1_fp16, tmp_v1, 0, 2, gl_CooperativeMatrixLayoutRowMajor); + coopMatStore(sum2_fp16, tmp_v0, 16*2, 2, gl_CooperativeMatrixLayoutRowMajor); + coopMatStore(sum3_fp16, 
tmp_v1, 16*2, 2, gl_CooperativeMatrixLayoutRowMajor); + coopMatStore(sum4_fp16, tmp_v0, 16*4, 2, gl_CooperativeMatrixLayoutRowMajor); + coopMatStore(sum5_fp16, tmp_v1, 16*4, 2, gl_CooperativeMatrixLayoutRowMajor); + coopMatStore(sum6_fp16, tmp_v0, 16*6, 2, gl_CooperativeMatrixLayoutRowMajor); + coopMatStore(sum7_fp16, tmp_v1, 16*6, 2, gl_CooperativeMatrixLayoutRowMajor); + + barrier(); + + const int lxd16 = lx / 16; // 0 1 + const int lxm16 = lx % 16; // 0 1 2 3 .... 15 + + { + for (int j = 0; j < 4; j++) + { + const int tmp_vi = lxm16 * 2 + lxd16 + j*16*2; + const int gi = gz * psc(outcstep) + (gy + lxd16 + j*2) * psc(outw) + (gx + lxm16); + + if (gy + j * 2 + lxd16 < psc(outc)) + { + if (gx + lxm16 < psc(outw)) top_tm_blob_data[gi] = tmp_v0[tmp_vi]; + if (gx + lxm16 + 16 < psc(outw)) top_tm_blob_data[gi + 16] = tmp_v1[tmp_vi]; + } + } + } +} diff --git a/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd_gemm_nv_cm_16_16_16.comp b/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd_gemm_nv_cm_16_16_16.comp new file mode 100644 index 00000000000..bcca39eb615 --- /dev/null +++ b/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd_gemm_nv_cm_16_16_16.comp @@ -0,0 +1,196 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +#extension GL_KHR_memory_scope_semantics: require +#extension GL_EXT_shader_explicit_arithmetic_types: require +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#extension GL_NV_cooperative_matrix: require + +layout (constant_id = 0) const int batch = 1; + +#define shape_constant_id_offset 1 +layout (constant_id = shape_constant_id_offset + 0) const int c = 0; +layout (constant_id = shape_constant_id_offset + 1) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 2) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 3) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 4) const int outcstep = 0; + +layout (binding = 0) readonly buffer bottom_tm_blob { uvec2 bottom_tm_blob_data[]; }; +layout (binding = 1) writeonly buffer top_tm_blob { uvec2 top_tm_blob_data[]; }; +layout (binding = 2) readonly buffer weight_tm_blob { uvec2 weight_tm_data[]; }; + +layout (push_constant) uniform parameter +{ + int c; + int cstep; + + int outw; + int outc; + int outcstep; +} p; + +#define UNROLL_INCH 2 + +shared uvec2 tmp_v0[UNROLL_INCH * 16*4]; +shared uvec2 tmp_v1[UNROLL_INCH * 16*4]; +shared uvec2 tmp_k0[UNROLL_INCH * 16*4]; +shared uvec2 tmp_k1[UNROLL_INCH * 16*4]; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x) / 32 * 2 * 16; + int gy = int(gl_GlobalInvocationID.y) * 2 * 4; + int gz = int(gl_GlobalInvocationID.z); + + const int lx = int(gl_LocalInvocationID.x); + + const int lxd16 = lx / 16; // 0 1 + const int lxm16 = lx % 16; // 0 1 2 3 .... 
15 + + fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum0 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f); + fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum1 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f); + fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum2 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f); + fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum3 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f); + + const int N = psc(c) / 4; + + int z = 0; + for (; z + (UNROLL_INCH - 1) < N; z += UNROLL_INCH) + { + { + for (int j = 0; j < 4; j++) + { + const int tmp_i = lxd16*16*4 + lxm16 * 4 + j; + + const int v_offset = gz * psc(cstep) + ((z + lxd16) * 4 + j) * psc(outw) + (gx + lxm16); + + tmp_v0[tmp_i] = (gx + lxm16) < psc(outw) ? bottom_tm_blob_data[v_offset] : uvec2(0); + tmp_v1[tmp_i] = (gx + lxm16 + 16) < psc(outw) ? bottom_tm_blob_data[v_offset + 16] : uvec2(0); + + const int w_offset = gz * psc(outc) * psc(c) * 4 + gy * psc(c) * 4 + (z + lxd16) * 4 * 16 + (lxm16 * 4 + j); + + tmp_k0[tmp_i] = weight_tm_data[w_offset]; + tmp_k1[tmp_i] = weight_tm_data[w_offset + psc(c) * 16]; + } + } + + barrier(); + + for (int z4 = 0; z4 < UNROLL_INCH; z4++) + { + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> A0; + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> A1; + coopMatLoadNV(A0, tmp_v0, z4*16*4, 4, false); + coopMatLoadNV(A1, tmp_v1, z4*16*4, 4, false); + + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> B0; + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> B1; + coopMatLoadNV(B0, tmp_k0, z4*16*4, 4, false); + coopMatLoadNV(B1, tmp_k1, z4*16*4, 4, false); + + // sum += v * k + sum0 = coopMatMulAddNV(A0, B0, sum0); + sum1 = coopMatMulAddNV(A1, B0, sum1); + sum2 = coopMatMulAddNV(A0, B1, sum2); + sum3 = coopMatMulAddNV(A1, B1, sum3); + } + + barrier(); + } + + if (z < N) + { + const int remain = N - z; + + if (lxd16 == 0) + { + for (int j = 0; j < 4; j++) + { + const int tmp_i = lxd16*16*4 + lxm16 * 4 + j; + + const int v_offset = gz * psc(cstep) + ((z + lxd16) * 4 + j) * psc(outw) + (gx + lxm16); + + tmp_v0[tmp_i] 
= (gx + lxm16) < psc(outw) ? bottom_tm_blob_data[v_offset] : uvec2(0); + tmp_v1[tmp_i] = (gx + lxm16 + 16) < psc(outw) ? bottom_tm_blob_data[v_offset + 16] : uvec2(0); + + const int w_offset = gz * psc(outc) * psc(c) * 4 + gy * psc(c) * 4 + (z + lxd16) * 4 * 16 + (lxm16 * 4 + j); + + tmp_k0[tmp_i] = weight_tm_data[w_offset]; + tmp_k1[tmp_i] = weight_tm_data[w_offset + psc(c) * 16]; + } + } + + barrier(); + + for (int z4 = 0; z4 < remain; z4++) + { + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> A0; + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> A1; + coopMatLoadNV(A0, tmp_v0, z4*16*4, 4, false); + coopMatLoadNV(A1, tmp_v1, z4*16*4, 4, false); + + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> B0; + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> B1; + coopMatLoadNV(B0, tmp_k0, z4*16*4, 4, false); + coopMatLoadNV(B1, tmp_k1, z4*16*4, 4, false); + + // sum += v * k + sum0 = coopMatMulAddNV(A0, B0, sum0); + sum1 = coopMatMulAddNV(A1, B0, sum1); + sum2 = coopMatMulAddNV(A0, B1, sum2); + sum3 = coopMatMulAddNV(A1, B1, sum3); + } + + barrier(); + } + + if (gx >= psc(outw) || gy >= psc(outc) || gz >= batch) + return; + + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum0_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum0); + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum1_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum1); + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum2_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum2); + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum3_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum3); + + coopMatStoreNV(sum0_fp16, tmp_v0, 0, 4, false); + coopMatStoreNV(sum1_fp16, tmp_v1, 0, 4, false); + coopMatStoreNV(sum2_fp16, tmp_v0, 16*4, 4, false); + coopMatStoreNV(sum3_fp16, tmp_v1, 16*4, 4, false); + + barrier(); + + { + for (int j = 0; j < 4; j++) + { + const int tmp_vi = lxm16 * 4 + j + lxd16*16*4; + const int gi = gz * psc(outcstep) + (gy + lxd16 * 4 + j) * psc(outw) + (gx + lxm16); + + if (gy + lxd16 * 4 + j < psc(outc)) + { + if (gx + lxm16 < psc(outw)) 
top_tm_blob_data[gi] = tmp_v0[tmp_vi]; + if (gx + lxm16 + 16 < psc(outw)) top_tm_blob_data[gi + 16] = tmp_v1[tmp_vi]; + } + } + } +} diff --git a/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd_gemm_nv_cm_16_8_8.comp b/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd_gemm_nv_cm_16_8_8.comp new file mode 100644 index 00000000000..35d3b4faba5 --- /dev/null +++ b/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd_gemm_nv_cm_16_8_8.comp @@ -0,0 +1,241 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +#extension GL_KHR_memory_scope_semantics: require +#extension GL_EXT_shader_explicit_arithmetic_types: require +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#extension GL_NV_cooperative_matrix: require + +layout (constant_id = 0) const int batch = 1; + +#define shape_constant_id_offset 1 +layout (constant_id = shape_constant_id_offset + 0) const int c = 0; +layout (constant_id = shape_constant_id_offset + 1) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 2) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 3) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 4) const int outcstep = 0; + +layout (binding = 0) readonly buffer bottom_tm_blob { uvec2 bottom_tm_blob_data[]; }; +layout (binding = 1) writeonly buffer top_tm_blob { uvec2 top_tm_blob_data[]; }; +layout (binding = 2) readonly buffer weight_tm_blob { uvec2 weight_tm_data[]; }; + +layout (push_constant) uniform parameter +{ + int c; + int cstep; + + int outw; + int outc; + int outcstep; +} p; + +#define UNROLL_INCH 4 + +shared uvec2 tmp_v0[UNROLL_INCH * 16*2]; +shared uvec2 tmp_v1[UNROLL_INCH * 16*2]; +shared uvec2 tmp_k0[UNROLL_INCH * 8*2]; +shared uvec2 tmp_k1[UNROLL_INCH * 8*2]; +shared uvec2 tmp_k2[UNROLL_INCH * 8*2]; +shared uvec2 tmp_k3[UNROLL_INCH * 8*2]; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x) / 32 * 2 * 16; + int gy = int(gl_GlobalInvocationID.y) * 2 * 4; + int gz = int(gl_GlobalInvocationID.z); + + const int lx = int(gl_LocalInvocationID.x); + + const int lxd8 = lx / 8; // 0 1 2 3 + const int lxm8 = lx % 8; // 0 1 2 3 .... 
7 + + fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum0 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); + fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum1 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); + fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum2 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); + fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum3 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); + fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum4 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); + fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum5 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); + fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum6 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); + fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum7 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); + + const int N = psc(c) / 2; + + int z = 0; + for (; z + (UNROLL_INCH - 1) < N; z += UNROLL_INCH) + { + { + for (int j = 0; j < 2; j++) + { + const int tmp_vi = lxd8*16*2 + lxm8 * 2 + j; + + int v_offset = gz * psc(cstep) + ((z + lxd8) * 2 + j) * psc(outw) + (gx + lxm8); + + tmp_v0[tmp_vi] = (gx + lxm8) < psc(outw) ? bottom_tm_blob_data[v_offset] : uvec2(0); + tmp_v0[tmp_vi + 16] = (gx + lxm8 + 8) < psc(outw) ? bottom_tm_blob_data[v_offset + 8] : uvec2(0); + tmp_v1[tmp_vi] = (gx + lxm8 + 16) < psc(outw) ? bottom_tm_blob_data[v_offset + 16] : uvec2(0); + tmp_v1[tmp_vi + 16] = (gx + lxm8 + 24) < psc(outw) ? 
bottom_tm_blob_data[v_offset + 24] : uvec2(0); + + const int tmp_ki = lxd8*8*2 + lxm8 * 2 + j; + + int w_offset = gz * psc(outc) * psc(c) * 4 + gy * psc(c) * 4 + (z + lxd8) * 2 * 8 + (lxm8 * 2 + j); + + tmp_k0[tmp_ki] = weight_tm_data[w_offset]; + tmp_k1[tmp_ki] = weight_tm_data[w_offset + psc(c) * 8]; + tmp_k2[tmp_ki] = weight_tm_data[w_offset + psc(c) * 16]; + tmp_k3[tmp_ki] = weight_tm_data[w_offset + psc(c) * 24]; + } + } + + barrier(); + + for (int z4 = 0; z4 < UNROLL_INCH; z4++) + { + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A0; + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A1; + coopMatLoadNV(A0, tmp_v0, z4*16*2, 2, false); + coopMatLoadNV(A1, tmp_v1, z4*16*2, 2, false); + + fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B0; + fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B1; + fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B2; + fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B3; + coopMatLoadNV(B0, tmp_k0, z4*8*2, 2, false); + coopMatLoadNV(B1, tmp_k1, z4*8*2, 2, false); + coopMatLoadNV(B2, tmp_k2, z4*8*2, 2, false); + coopMatLoadNV(B3, tmp_k3, z4*8*2, 2, false); + + // sum += v * k + sum0 = coopMatMulAddNV(A0, B0, sum0); + sum1 = coopMatMulAddNV(A1, B0, sum1); + sum2 = coopMatMulAddNV(A0, B1, sum2); + sum3 = coopMatMulAddNV(A1, B1, sum3); + sum4 = coopMatMulAddNV(A0, B2, sum4); + sum5 = coopMatMulAddNV(A1, B2, sum5); + sum6 = coopMatMulAddNV(A0, B3, sum6); + sum7 = coopMatMulAddNV(A1, B3, sum7); + } + + barrier(); + } + + if (z < N) + { + const int remain = N - z; + + if (lxd8 < remain) + { + for (int j = 0; j < 2; j++) + { + const int tmp_vi = lxd8*16*2 + lxm8 * 2 + j; + + int v_offset = gz * psc(cstep) + ((z + lxd8) * 2 + j) * psc(outw) + (gx + lxm8); + + tmp_v0[tmp_vi] = (gx + lxm8) < psc(outw) ? bottom_tm_blob_data[v_offset] : uvec2(0); + tmp_v0[tmp_vi + 16] = (gx + lxm8 + 8) < psc(outw) ? bottom_tm_blob_data[v_offset + 8] : uvec2(0); + tmp_v1[tmp_vi] = (gx + lxm8 + 16) < psc(outw) ? bottom_tm_blob_data[v_offset + 16] : uvec2(0); + tmp_v1[tmp_vi + 16] = (gx + lxm8 + 24) < psc(outw) ? 
bottom_tm_blob_data[v_offset + 24] : uvec2(0); + + const int tmp_ki = lxd8*8*2 + lxm8 * 2 + j; + + int w_offset = gz * psc(outc) * psc(c) * 4 + gy * psc(c) * 4 + (z + lxd8) * 2 * 8 + (lxm8 * 2 + j); + + tmp_k0[tmp_ki] = weight_tm_data[w_offset]; + tmp_k1[tmp_ki] = weight_tm_data[w_offset + psc(c) * 8]; + tmp_k2[tmp_ki] = weight_tm_data[w_offset + psc(c) * 16]; + tmp_k3[tmp_ki] = weight_tm_data[w_offset + psc(c) * 24]; + } + } + + barrier(); + + for (int z4 = 0; z4 < remain; z4++) + { + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A0; + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A1; + coopMatLoadNV(A0, tmp_v0, z4*16*2, 2, false); + coopMatLoadNV(A1, tmp_v1, z4*16*2, 2, false); + + fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B0; + fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B1; + fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B2; + fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B3; + coopMatLoadNV(B0, tmp_k0, z4*8*2, 2, false); + coopMatLoadNV(B1, tmp_k1, z4*8*2, 2, false); + coopMatLoadNV(B2, tmp_k2, z4*8*2, 2, false); + coopMatLoadNV(B3, tmp_k3, z4*8*2, 2, false); + + // sum += v * k + sum0 = coopMatMulAddNV(A0, B0, sum0); + sum1 = coopMatMulAddNV(A1, B0, sum1); + sum2 = coopMatMulAddNV(A0, B1, sum2); + sum3 = coopMatMulAddNV(A1, B1, sum3); + sum4 = coopMatMulAddNV(A0, B2, sum4); + sum5 = coopMatMulAddNV(A1, B2, sum5); + sum6 = coopMatMulAddNV(A0, B3, sum6); + sum7 = coopMatMulAddNV(A1, B3, sum7); + } + + barrier(); + } + + if (gx >= psc(outw) || gy >= psc(outc) || gz >= batch) + return; + + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum0_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum0); + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum1_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum1); + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum2_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum2); + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum3_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum3); + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum4_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum4); + 
fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum5_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum5); + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum6_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum6); + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum7_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum7); + + coopMatStoreNV(sum0_fp16, tmp_v0, 0, 2, false); + coopMatStoreNV(sum1_fp16, tmp_v1, 0, 2, false); + coopMatStoreNV(sum2_fp16, tmp_v0, 16*2, 2, false); + coopMatStoreNV(sum3_fp16, tmp_v1, 16*2, 2, false); + coopMatStoreNV(sum4_fp16, tmp_v0, 16*4, 2, false); + coopMatStoreNV(sum5_fp16, tmp_v1, 16*4, 2, false); + coopMatStoreNV(sum6_fp16, tmp_v0, 16*6, 2, false); + coopMatStoreNV(sum7_fp16, tmp_v1, 16*6, 2, false); + + barrier(); + + const int lxd16 = lx / 16; // 0 1 + const int lxm16 = lx % 16; // 0 1 2 3 .... 15 + + { + for (int j = 0; j < 4; j++) + { + const int tmp_vi = lxm16 * 2 + lxd16 + j*16*2; + const int gi = gz * psc(outcstep) + (gy + lxd16 + j*2) * psc(outw) + (gx + lxm16); + + if (gy + j * 2 + lxd16 < psc(outc)) + { + if (gx + lxm16 < psc(outw)) top_tm_blob_data[gi] = tmp_v0[tmp_vi]; + if (gx + lxm16 + 16 < psc(outw)) top_tm_blob_data[gi + 16] = tmp_v1[tmp_vi]; + } + } + } +} diff --git a/src/layer/vulkan/shader/convolution_pack4_gemm_cm_16_8_8.comp b/src/layer/vulkan/shader/convolution_pack4_gemm_cm_16_8_8.comp deleted file mode 100644 index 80a0463a9b1..00000000000 --- a/src/layer/vulkan/shader/convolution_pack4_gemm_cm_16_8_8.comp +++ /dev/null @@ -1,294 +0,0 @@ -// Tencent is pleased to support the open source community by making ncnn available. -// -// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. -// -// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except -// in compliance with the License. 
You may obtain a copy of the License at -// -// https://opensource.org/licenses/BSD-3-Clause -// -// Unless required by applicable law or agreed to in writing, software distributed -// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR -// CONDITIONS OF ANY KIND, either express or implied. See the License for the -// specific language governing permissions and limitations under the License. - -#version 450 - -#if NCNN_fp16_storage -#extension GL_EXT_shader_16bit_storage: require -#endif -#if NCNN_fp16_arithmetic -#extension GL_EXT_shader_explicit_arithmetic_types_float16: require -#endif - -#extension GL_GOOGLE_include_directive: enable -#include "vulkan_activation.comp" - -#extension GL_KHR_memory_scope_semantics: require -#extension GL_EXT_shader_explicit_arithmetic_types: require -#extension GL_EXT_shader_explicit_arithmetic_types_float16: require -#extension GL_NV_cooperative_matrix: require - -layout (constant_id = 0) const int kernel_w = 1; -layout (constant_id = 1) const int kernel_h = 1; -layout (constant_id = 2) const int dilation_w = 1; -layout (constant_id = 3) const int dilation_h = 1; -layout (constant_id = 4) const int stride_w = 1; -layout (constant_id = 5) const int stride_h = 1; -layout (constant_id = 6) const int bias_term = 0; -layout (constant_id = 7) const int activation_type = 0; -layout (constant_id = 8) const float activation_param_0 = 0; -layout (constant_id = 9) const float activation_param_1 = 0; - -#define shape_constant_id_offset 10 -layout (constant_id = shape_constant_id_offset + 0) const int w = 0; -layout (constant_id = shape_constant_id_offset + 1) const int h = 0; -layout (constant_id = shape_constant_id_offset + 2) const int c = 0; -layout (constant_id = shape_constant_id_offset + 3) const int cstep = 0; - -layout (constant_id = shape_constant_id_offset + 4) const int outw = 0; -layout (constant_id = shape_constant_id_offset + 5) const int outh = 0; -layout (constant_id = shape_constant_id_offset + 6) 
const int outc = 0; -layout (constant_id = shape_constant_id_offset + 7) const int outcstep = 0; - -layout (binding = 0) readonly buffer bottom_blob { uvec2 bottom_blob_data[]; }; -layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; -layout (binding = 2) readonly buffer weight_blob { uvec2 weight_data[]; }; -layout (binding = 3) readonly buffer bias_blob { uvec2 bias_data[]; }; - -layout (push_constant) uniform parameter -{ - int w; - int h; - int c; - int cstep; - - int outw; - int outh; - int outc; - int outcstep; -} p; - -#define LOCAL_SIZE_Y 4 -#define UNROLL_INCH 4 - -shared uvec2 tmp_v0[UNROLL_INCH * 16*2]; -shared uvec2 tmp_v1[UNROLL_INCH * 16*2]; -shared uvec2 tmp_v2[UNROLL_INCH * 16*2]; -shared uvec2 tmp_v3[UNROLL_INCH * 16*2]; -shared uvec2 tmp_k[LOCAL_SIZE_Y * UNROLL_INCH * 8*2]; - -void main() -{ - int gx = int(gl_GlobalInvocationID.x) / 32 * 4 * 16; - int gy = int(gl_GlobalInvocationID.y) * 2; - - const int outsize = psc(outw) * psc(outh); - - const int lx = int(gl_LocalInvocationID.x); - const int ly = int(gl_LocalInvocationID.y); - - const int lxd16 = lx / 16; // 0 1 - const int lxm16 = lx % 16; // 0 1 2 3 .... 15 - - const int lxd8 = lx / 8; // 0 1 2 3 - const int lxm8 = lx % 8; // 0 1 2 3 .... 
7 - - fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum0; - fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum1; - fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum2; - fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum3; - - if (bias_term == 1) - { - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> bias; - - coopMatLoadNV(bias, bias_data, gy, 0, false); - - sum0 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias); - sum1 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias); - sum2 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias); - sum3 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias); - } - else - { - sum0 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); - sum1 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); - sum2 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); - sum3 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); - } - - const int maxk = kernel_w * kernel_h; - const int N = psc(c) / 2 * maxk; - - int z = 0; - for (; z + (UNROLL_INCH - 1) < N; z += UNROLL_INCH) - { - { - const int sz = (z + ly) / maxk * 2; - const int kk = (z + ly) % maxk; - - const int ky = kk / kernel_w; - const int kx = kk % kernel_w; - - const ivec4 gx16 = gx + ivec4(0, 16, 32, 48) + lxm16; - - const ivec4 sy16 = gx16 / psc(outw); - const ivec4 sx16 = gx16 % psc(outw); - - const ivec4 sxs16 = sx16 * stride_w; - const ivec4 sys16 = sy16 * stride_h; - - int tmp_vi = ly*16*2 + lxm16*2+lxd16; - const ivec4 v_offset = sz * psc(cstep) + (sys16 + ky * dilation_h) * psc(w) + sxs16 + kx * dilation_w + lxd16 * psc(cstep); - - tmp_v0[tmp_vi] = gx16.r < outsize ? bottom_blob_data[v_offset.r] : uvec2(0); - tmp_v1[tmp_vi] = gx16.g < outsize ? bottom_blob_data[v_offset.g] : uvec2(0); - tmp_v2[tmp_vi] = gx16.b < outsize ? bottom_blob_data[v_offset.b] : uvec2(0); - tmp_v3[tmp_vi] = gx16.a < outsize ? 
bottom_blob_data[v_offset.a] : uvec2(0); - } - - if (lx < 16) - { - for (int z4 = 0; z4 < UNROLL_INCH; z4++) - { - int tmp_ki = ly*UNROLL_INCH*8*2 + z4*8*2 + lxm8*2+lxd8; - int w_offset = gy * psc(c) * maxk * 4 + (z + z4) * 16 + lxm8* 2 + lxd8; - - tmp_k[tmp_ki] = weight_data[w_offset]; - } - } - - barrier(); - - for (int z4 = 0; z4 < UNROLL_INCH; z4++) - { - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A0; - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A1; - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A2; - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A3; - coopMatLoadNV(A0, tmp_v0, z4*16*2, 2, false); - coopMatLoadNV(A1, tmp_v1, z4*16*2, 2, false); - coopMatLoadNV(A2, tmp_v2, z4*16*2, 2, false); - coopMatLoadNV(A3, tmp_v3, z4*16*2, 2, false); - - fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B; - coopMatLoadNV(B, tmp_k, ly*UNROLL_INCH*8*2 + z4*8*2, 2, false); - - // sum += v * k - sum0 = coopMatMulAddNV(A0, B, sum0); - sum1 = coopMatMulAddNV(A1, B, sum1); - sum2 = coopMatMulAddNV(A2, B, sum2); - sum3 = coopMatMulAddNV(A3, B, sum3); - } - - barrier(); - } - - if (z < N) - { - const int remain = N - z; - - if (ly < remain) - { - const int sz = (z + ly) / maxk * 2; - const int kk = (z + ly) % maxk; - - const int ky = kk / kernel_w; - const int kx = kk % kernel_w; - - const ivec4 gx16 = gx + ivec4(0, 16, 32, 48) + lxm16; - - const ivec4 sy16 = gx16 / psc(outw); - const ivec4 sx16 = gx16 % psc(outw); - - const ivec4 sxs16 = sx16 * stride_w; - const ivec4 sys16 = sy16 * stride_h; - - int tmp_vi = ly*16*2 + lxm16*2+lxd16; - const ivec4 v_offset = sz * psc(cstep) + (sys16 + ky * dilation_h) * psc(w) + sxs16 + kx * dilation_w + lxd16 * psc(cstep); - - tmp_v0[tmp_vi] = gx16.r < outsize ? bottom_blob_data[v_offset.r] : uvec2(0); - tmp_v1[tmp_vi] = gx16.g < outsize ? bottom_blob_data[v_offset.g] : uvec2(0); - tmp_v2[tmp_vi] = gx16.b < outsize ? bottom_blob_data[v_offset.b] : uvec2(0); - tmp_v3[tmp_vi] = gx16.a < outsize ? 
bottom_blob_data[v_offset.a] : uvec2(0); - } - - if (lx < 16) - { - for (int z4 = 0; z4 < remain; z4++) - { - int tmp_ki = ly*UNROLL_INCH*8*2 + z4*8*2 + lxm8*2+lxd8; - int w_offset = gy * psc(c) * maxk * 4 + (z + z4) * 16 + lxm8* 2 + lxd8; - - tmp_k[tmp_ki] = weight_data[w_offset]; - } - } - - barrier(); - - for (int z4 = 0; z4 < remain; z4++) - { - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A0; - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A1; - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A2; - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A3; - coopMatLoadNV(A0, tmp_v0, z4*16*2, 2, false); - coopMatLoadNV(A1, tmp_v1, z4*16*2, 2, false); - coopMatLoadNV(A2, tmp_v2, z4*16*2, 2, false); - coopMatLoadNV(A3, tmp_v3, z4*16*2, 2, false); - - fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B; - coopMatLoadNV(B, tmp_k, ly*UNROLL_INCH*8*2 + z4*8*2, 2, false); - - // sum += v * k - sum0 = coopMatMulAddNV(A0, B, sum0); - sum1 = coopMatMulAddNV(A1, B, sum1); - sum2 = coopMatMulAddNV(A2, B, sum2); - sum3 = coopMatMulAddNV(A3, B, sum3); - } - - barrier(); - } - - if (gx >= outsize || gy >= psc(outc)) - return; - - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum0_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum0); - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum1_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum1); - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum2_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum2); - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum3_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum3); - - coopMatStoreNV(sum0_fp16, tmp_v0, ly*16*2, 2, false); - coopMatStoreNV(sum1_fp16, tmp_v1, ly*16*2, 2, false); - coopMatStoreNV(sum2_fp16, tmp_v2, ly*16*2, 2, false); - coopMatStoreNV(sum3_fp16, tmp_v3, ly*16*2, 2, false); - - barrier(); - - { - int tmp_vi = ly*16*2 + lxm16*2+lxd16; - uvec2 sum0_u2 = tmp_v0[tmp_vi]; - uvec2 sum1_u2 = tmp_v1[tmp_vi]; - uvec2 sum2_u2 = tmp_v2[tmp_vi]; - uvec2 sum3_u2 = tmp_v3[tmp_vi]; - - afpvec4 sum0 = afpvec4(unpackHalf2x16(sum0_u2.x), unpackHalf2x16(sum0_u2.y)); 
- afpvec4 sum1 = afpvec4(unpackHalf2x16(sum1_u2.x), unpackHalf2x16(sum1_u2.y)); - afpvec4 sum2 = afpvec4(unpackHalf2x16(sum2_u2.x), unpackHalf2x16(sum2_u2.y)); - afpvec4 sum3 = afpvec4(unpackHalf2x16(sum3_u2.x), unpackHalf2x16(sum3_u2.y)); - - sum0 = activation_afpvec4(sum0, activation_type, activation_param_0, activation_param_1); - sum1 = activation_afpvec4(sum1, activation_type, activation_param_0, activation_param_1); - sum2 = activation_afpvec4(sum2, activation_type, activation_param_0, activation_param_1); - sum3 = activation_afpvec4(sum3, activation_type, activation_param_0, activation_param_1); - - int gi = gy * psc(outcstep) + gx + lxd16 * psc(outcstep) + lxm16; - { - if (gx + lxm16 < outsize) buffer_st4(top_blob_data, gi, sum0); - if (gx + 16 + lxm16 < outsize) buffer_st4(top_blob_data, gi + 16, sum1); - if (gx + 32 + lxm16 < outsize) buffer_st4(top_blob_data, gi + 32, sum2); - if (gx + 48 + lxm16 < outsize) buffer_st4(top_blob_data, gi + 48, sum3); - } - } -} diff --git a/src/layer/vulkan/shader/convolution_pack4_gemm_khr_cm_16_16_16.comp b/src/layer/vulkan/shader/convolution_pack4_gemm_khr_cm_16_16_16.comp new file mode 100644 index 00000000000..0e5d83acf57 --- /dev/null +++ b/src/layer/vulkan/shader/convolution_pack4_gemm_khr_cm_16_16_16.comp @@ -0,0 +1,276 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +#extension GL_GOOGLE_include_directive: enable +#include "vulkan_activation.comp" + +#extension GL_KHR_memory_scope_semantics: require +#extension GL_EXT_shader_explicit_arithmetic_types: require +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#extension GL_KHR_cooperative_matrix: require + +layout (constant_id = 0) const int kernel_w = 1; +layout (constant_id = 1) const int kernel_h = 1; +layout (constant_id = 2) const int dilation_w = 1; +layout (constant_id = 3) const int dilation_h = 1; +layout (constant_id = 4) const int stride_w = 1; +layout (constant_id = 5) const int stride_h = 1; +layout (constant_id = 6) const int bias_term = 0; +layout (constant_id = 7) const int activation_type = 0; +layout (constant_id = 8) const float activation_param_0 = 0; +layout (constant_id = 9) const float activation_param_1 = 0; + +#define shape_constant_id_offset 10 +layout (constant_id = shape_constant_id_offset + 0) const int w = 0; +layout (constant_id = shape_constant_id_offset + 1) const int h = 0; +layout (constant_id = shape_constant_id_offset + 2) const int c = 0; +layout (constant_id = shape_constant_id_offset + 3) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 4) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 5) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outcstep = 0; + +layout (binding = 0) readonly buffer bottom_blob { uvec2 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; +layout (binding = 2) readonly buffer weight_blob { uvec2 
weight_data[]; }; +layout (binding = 3) readonly buffer bias_blob { uvec2 bias_data[]; }; + +layout (push_constant) uniform parameter +{ + int w; + int h; + int c; + int cstep; + + int outw; + int outh; + int outc; + int outcstep; +} p; + +#define UNROLL_INCH 2 + +shared uvec2 tmp_v0[UNROLL_INCH * 16*4]; +shared uvec2 tmp_v1[UNROLL_INCH * 16*4]; +shared uvec2 tmp_k0[UNROLL_INCH * 16*4]; +shared uvec2 tmp_k1[UNROLL_INCH * 16*4]; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x) / 32 * 2 * 16; + int gy = int(gl_GlobalInvocationID.y) * 2 * 4; + + const int outsize = psc(outw) * psc(outh); + + const int lx = int(gl_LocalInvocationID.x); + + const int lxd16 = lx / 16; // 0 1 + const int lxm16 = lx % 16; // 0 1 2 3 .... 15 + + coopmat sum0; + coopmat sum1; + coopmat sum2; + coopmat sum3; + + if (bias_term == 1) + { + coopmat bias0; + coopmat bias1; + + coopMatLoad(bias0, bias_data, gy, 0, gl_CooperativeMatrixLayoutRowMajor); + coopMatLoad(bias1, bias_data, gy + 4, 0, gl_CooperativeMatrixLayoutRowMajor); + + sum0 = coopmat(bias0); + sum1 = coopmat(bias0); + sum2 = coopmat(bias1); + sum3 = coopmat(bias1); + } + else + { + sum0 = coopmat(0.f); + sum1 = coopmat(0.f); + sum2 = coopmat(0.f); + sum3 = coopmat(0.f); + } + + const int maxk = kernel_w * kernel_h; + const int N = psc(c) / 4 * maxk; + + int z = 0; + for (; z + (UNROLL_INCH - 1) < N; z += UNROLL_INCH) + { + { + for (int j = 0; j < 4; j++) + { + const int tmp_i = lxd16*16*4 + lxm16 * 4 + j; + + const int sz = (z + lxd16) / maxk; + const int kk = (z + lxd16) % maxk; + + const int ky = kk / kernel_w; + const int kx = kk % kernel_w; + + const ivec2 gx16 = gx + lxm16 + ivec2(0, 16); + + const ivec2 sy16 = gx16 / psc(outw); + const ivec2 sx16 = gx16 % psc(outw); + + const ivec2 sxs16 = sx16 * stride_w; + const ivec2 sys16 = sy16 * stride_h; + + const ivec2 v_offset = (sz * 4 + j) * psc(cstep) + (sys16 + ky * dilation_h) * psc(w) + sxs16 + kx * dilation_w; + + tmp_v0[tmp_i] = gx16.r < psc(outcstep) ? 
bottom_blob_data[v_offset.r] : uvec2(0); + tmp_v1[tmp_i] = gx16.g < psc(outcstep) ? bottom_blob_data[v_offset.g] : uvec2(0); + + int w_offset = gy * psc(c) * 4 * maxk + (z + lxd16) * 4 * 16 + (lxm16 * 4 + j); + + tmp_k0[tmp_i] = weight_data[w_offset]; + tmp_k1[tmp_i] = weight_data[w_offset + psc(c) * maxk * 16]; + } + } + + barrier(); + + for (int z4 = 0; z4 < UNROLL_INCH; z4++) + { + coopmat A0; + coopmat A1; + coopMatLoad(A0, tmp_v0, z4*16*4, 4, gl_CooperativeMatrixLayoutRowMajor); + coopMatLoad(A1, tmp_v1, z4*16*4, 4, gl_CooperativeMatrixLayoutRowMajor); + + coopmat B0; + coopmat B1; + coopMatLoad(B0, tmp_k0, z4*16*4, 4, gl_CooperativeMatrixLayoutRowMajor); + coopMatLoad(B1, tmp_k1, z4*16*4, 4, gl_CooperativeMatrixLayoutRowMajor); + + // sum += v * k + sum0 = coopMatMulAdd(A0, B0, sum0); + sum1 = coopMatMulAdd(A1, B0, sum1); + sum2 = coopMatMulAdd(A0, B1, sum2); + sum3 = coopMatMulAdd(A1, B1, sum3); + } + + barrier(); + } + + if (z < N) + { + const int remain = N - z; + + if (lxd16 == 0) + { + for (int j = 0; j < 4; j++) + { + const int tmp_i = lxd16*16*4 + lxm16 * 4 + j; + + const int sz = (z + lxd16) / maxk; + const int kk = (z + lxd16) % maxk; + + const int ky = kk / kernel_w; + const int kx = kk % kernel_w; + + const ivec2 gx16 = gx + lxm16 + ivec2(0, 16); + + const ivec2 sy16 = gx16 / psc(outw); + const ivec2 sx16 = gx16 % psc(outw); + + const ivec2 sxs16 = sx16 * stride_w; + const ivec2 sys16 = sy16 * stride_h; + + const ivec2 v_offset = (sz * 4 + j) * psc(cstep) + (sys16 + ky * dilation_h) * psc(w) + sxs16 + kx * dilation_w; + + tmp_v0[tmp_i] = gx16.r < psc(outcstep) ? bottom_blob_data[v_offset.r] : uvec2(0); + tmp_v1[tmp_i] = gx16.g < psc(outcstep) ? 
bottom_blob_data[v_offset.g] : uvec2(0); + + int w_offset = gy * psc(c) * 4 * maxk + (z + lxd16) * 4 * 16 + (lxm16 * 4 + j); + + tmp_k0[tmp_i] = weight_data[w_offset]; + tmp_k1[tmp_i] = weight_data[w_offset + psc(c) * maxk * 16]; + } + } + + barrier(); + + for (int z4 = 0; z4 < remain; z4++) + { + coopmat A0; + coopmat A1; + coopMatLoad(A0, tmp_v0, z4*16*4, 4, gl_CooperativeMatrixLayoutRowMajor); + coopMatLoad(A1, tmp_v1, z4*16*4, 4, gl_CooperativeMatrixLayoutRowMajor); + + coopmat B0; + coopmat B1; + coopMatLoad(B0, tmp_k0, z4*16*4, 4, gl_CooperativeMatrixLayoutRowMajor); + coopMatLoad(B1, tmp_k1, z4*16*4, 4, gl_CooperativeMatrixLayoutRowMajor); + + // sum += v * k + sum0 = coopMatMulAdd(A0, B0, sum0); + sum1 = coopMatMulAdd(A1, B0, sum1); + sum2 = coopMatMulAdd(A0, B1, sum2); + sum3 = coopMatMulAdd(A1, B1, sum3); + } + + barrier(); + } + + if (gx >= outsize || gy >= psc(outc)) + return; + + coopmat sum0_fp16 = coopmat(sum0); + coopmat sum1_fp16 = coopmat(sum1); + coopmat sum2_fp16 = coopmat(sum2); + coopmat sum3_fp16 = coopmat(sum3); + + coopMatStore(sum0_fp16, tmp_v0, 0, 4, gl_CooperativeMatrixLayoutRowMajor); + coopMatStore(sum1_fp16, tmp_v1, 0, 4, gl_CooperativeMatrixLayoutRowMajor); + coopMatStore(sum2_fp16, tmp_v0, 16*4, 4, gl_CooperativeMatrixLayoutRowMajor); + coopMatStore(sum3_fp16, tmp_v1, 16*4, 4, gl_CooperativeMatrixLayoutRowMajor); + + barrier(); + + { + for (int j = 0; j < 4; j++) + { + const int tmp_vi = lxm16 * 4 + j + lxd16*16*4; + + uvec2 sum0_u2 = tmp_v0[tmp_vi]; + uvec2 sum1_u2 = tmp_v1[tmp_vi]; + + afpvec4 sum0 = afpvec4(unpackHalf2x16(sum0_u2.x), unpackHalf2x16(sum0_u2.y)); + afpvec4 sum1 = afpvec4(unpackHalf2x16(sum1_u2.x), unpackHalf2x16(sum1_u2.y)); + + sum0 = activation_afpvec4(sum0, activation_type, activation_param_0, activation_param_1); + sum1 = activation_afpvec4(sum1, activation_type, activation_param_0, activation_param_1); + + const int gi = (gy + lxd16 * 4 + j) * psc(outcstep) + (gx + lxm16); + + if (gy + lxd16 * 4 + j < 
psc(outc)) + { + if (gx + lxm16 < psc(outcstep)) buffer_st4(top_blob_data, gi, sum0); + if (gx + lxm16 + 16 < psc(outcstep)) buffer_st4(top_blob_data, gi + 16, sum1); + } + } + } +} diff --git a/src/layer/vulkan/shader/convolution_pack4_gemm_khr_cm_16_8_8.comp b/src/layer/vulkan/shader/convolution_pack4_gemm_khr_cm_16_8_8.comp new file mode 100644 index 00000000000..2fc6d199c05 --- /dev/null +++ b/src/layer/vulkan/shader/convolution_pack4_gemm_khr_cm_16_8_8.comp @@ -0,0 +1,335 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +#extension GL_GOOGLE_include_directive: enable +#include "vulkan_activation.comp" + +#extension GL_KHR_memory_scope_semantics: require +#extension GL_EXT_shader_explicit_arithmetic_types: require +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#extension GL_KHR_cooperative_matrix: require + +layout (constant_id = 0) const int kernel_w = 1; +layout (constant_id = 1) const int kernel_h = 1; +layout (constant_id = 2) const int dilation_w = 1; +layout (constant_id = 3) const int dilation_h = 1; +layout (constant_id = 4) const int stride_w = 1; +layout (constant_id = 5) const int stride_h = 1; +layout (constant_id = 6) const int bias_term = 0; +layout (constant_id = 7) const int activation_type = 0; +layout (constant_id = 8) const float activation_param_0 = 0; +layout (constant_id = 9) const float activation_param_1 = 0; + +#define shape_constant_id_offset 10 +layout (constant_id = shape_constant_id_offset + 0) const int w = 0; +layout (constant_id = shape_constant_id_offset + 1) const int h = 0; +layout (constant_id = shape_constant_id_offset + 2) const int c = 0; +layout (constant_id = shape_constant_id_offset + 3) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 4) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 5) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outcstep = 0; + +layout (binding = 0) readonly buffer bottom_blob { uvec2 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; +layout (binding = 2) readonly buffer weight_blob { uvec2 weight_data[]; }; +layout (binding = 3) readonly buffer bias_blob { uvec2 bias_data[]; }; + +layout 
(push_constant) uniform parameter +{ + int w; + int h; + int c; + int cstep; + + int outw; + int outh; + int outc; + int outcstep; +} p; + +#define UNROLL_INCH 4 + +shared uvec2 tmp_v0[UNROLL_INCH * 16*2]; +shared uvec2 tmp_v1[UNROLL_INCH * 16*2]; +shared uvec2 tmp_k0[UNROLL_INCH * 8*2]; +shared uvec2 tmp_k1[UNROLL_INCH * 8*2]; +shared uvec2 tmp_k2[UNROLL_INCH * 8*2]; +shared uvec2 tmp_k3[UNROLL_INCH * 8*2]; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x) / 32 * 2 * 16; + int gy = int(gl_GlobalInvocationID.y) * 2 * 4; + + const int outsize = psc(outw) * psc(outh); + + const int lx = int(gl_LocalInvocationID.x); + + const int lxd8 = lx / 8; // 0 1 2 3 + const int lxm8 = lx % 8; // 0 1 2 3 .... 7 + + coopmat sum0; + coopmat sum1; + coopmat sum2; + coopmat sum3; + coopmat sum4; + coopmat sum5; + coopmat sum6; + coopmat sum7; + + if (bias_term == 1) + { + coopmat bias0; + coopmat bias1; + coopmat bias2; + coopmat bias3; + + coopMatLoad(bias0, bias_data, gy, 0, gl_CooperativeMatrixLayoutRowMajor); + coopMatLoad(bias1, bias_data, gy + 2, 0, gl_CooperativeMatrixLayoutRowMajor); + coopMatLoad(bias2, bias_data, gy + 4, 0, gl_CooperativeMatrixLayoutRowMajor); + coopMatLoad(bias3, bias_data, gy + 6, 0, gl_CooperativeMatrixLayoutRowMajor); + + sum0 = coopmat(bias0); + sum1 = coopmat(bias0); + sum2 = coopmat(bias1); + sum3 = coopmat(bias1); + sum4 = coopmat(bias2); + sum5 = coopmat(bias2); + sum6 = coopmat(bias3); + sum7 = coopmat(bias3); + } + else + { + sum0 = coopmat(0.f); + sum1 = coopmat(0.f); + sum2 = coopmat(0.f); + sum3 = coopmat(0.f); + sum4 = coopmat(0.f); + sum5 = coopmat(0.f); + sum6 = coopmat(0.f); + sum7 = coopmat(0.f); + } + + const int maxk = kernel_w * kernel_h; + const int N = psc(c) / 2 * maxk; + + int z = 0; + for (; z + (UNROLL_INCH - 1) < N; z += UNROLL_INCH) + { + { + for (int j = 0; j < 2; j++) + { + const int tmp_vi = lxd8*16*2 + lxm8 * 2 + j; + + const int sz = (z + lxd8) / maxk; + const int kk = (z + lxd8) % maxk; + + const int ky = kk / 
kernel_w; + const int kx = kk % kernel_w; + + const ivec4 gx16 = gx + lxm8 + ivec4(0, 8, 16, 24); + + const ivec4 sy16 = gx16 / psc(outw); + const ivec4 sx16 = gx16 % psc(outw); + + const ivec4 sxs16 = sx16 * stride_w; + const ivec4 sys16 = sy16 * stride_h; + + const ivec4 v_offset = (sz * 2 + j) * psc(cstep) + (sys16 + ky * dilation_h) * psc(w) + sxs16 + kx * dilation_w; + + tmp_v0[tmp_vi] = gx16.r < psc(outcstep) ? bottom_blob_data[v_offset.r] : uvec2(0); + tmp_v0[tmp_vi + 16] = gx16.g < psc(outcstep) ? bottom_blob_data[v_offset.g] : uvec2(0); + tmp_v1[tmp_vi] = gx16.b < psc(outcstep) ? bottom_blob_data[v_offset.b] : uvec2(0); + tmp_v1[tmp_vi + 16] = gx16.a < psc(outcstep) ? bottom_blob_data[v_offset.a] : uvec2(0); + + const int tmp_ki = lxd8*8*2 + lxm8 * 2 + j; + + int w_offset = gy * psc(c) * 4 * maxk + (z + lxd8) * 2 * 8 + (lxm8 * 2 + j); + + tmp_k0[tmp_ki] = weight_data[w_offset]; + tmp_k1[tmp_ki] = weight_data[w_offset + psc(c) * maxk * 8]; + tmp_k2[tmp_ki] = weight_data[w_offset + psc(c) * maxk * 16]; + tmp_k3[tmp_ki] = weight_data[w_offset + psc(c) * maxk * 24]; + } + } + + barrier(); + + for (int z4 = 0; z4 < UNROLL_INCH; z4++) + { + coopmat A0; + coopmat A1; + coopMatLoad(A0, tmp_v0, z4*16*2, 2, gl_CooperativeMatrixLayoutRowMajor); + coopMatLoad(A1, tmp_v1, z4*16*2, 2, gl_CooperativeMatrixLayoutRowMajor); + + coopmat B0; + coopmat B1; + coopmat B2; + coopmat B3; + coopMatLoad(B0, tmp_k0, z4*8*2, 2, gl_CooperativeMatrixLayoutRowMajor); + coopMatLoad(B1, tmp_k1, z4*8*2, 2, gl_CooperativeMatrixLayoutRowMajor); + coopMatLoad(B2, tmp_k2, z4*8*2, 2, gl_CooperativeMatrixLayoutRowMajor); + coopMatLoad(B3, tmp_k3, z4*8*2, 2, gl_CooperativeMatrixLayoutRowMajor); + + // sum += v * k + sum0 = coopMatMulAdd(A0, B0, sum0); + sum1 = coopMatMulAdd(A1, B0, sum1); + sum2 = coopMatMulAdd(A0, B1, sum2); + sum3 = coopMatMulAdd(A1, B1, sum3); + sum4 = coopMatMulAdd(A0, B2, sum4); + sum5 = coopMatMulAdd(A1, B2, sum5); + sum6 = coopMatMulAdd(A0, B3, sum6); + sum7 = 
coopMatMulAdd(A1, B3, sum7); + } + + barrier(); + } + + if (z < N) + { + const int remain = N - z; + + if (lxd8 < remain) + { + for (int j = 0; j < 2; j++) + { + const int tmp_vi = lxd8*16*2 + lxm8 * 2 + j; + + const int sz = (z + lxd8) / maxk; + const int kk = (z + lxd8) % maxk; + + const int ky = kk / kernel_w; + const int kx = kk % kernel_w; + + const ivec4 gx16 = gx + lxm8 + ivec4(0, 8, 16, 24); + + const ivec4 sy16 = gx16 / psc(outw); + const ivec4 sx16 = gx16 % psc(outw); + + const ivec4 sxs16 = sx16 * stride_w; + const ivec4 sys16 = sy16 * stride_h; + + const ivec4 v_offset = (sz * 2 + j) * psc(cstep) + (sys16 + ky * dilation_h) * psc(w) + sxs16 + kx * dilation_w; + + tmp_v0[tmp_vi] = gx16.r < psc(outcstep) ? bottom_blob_data[v_offset.r] : uvec2(0); + tmp_v0[tmp_vi + 16] = gx16.g < psc(outcstep) ? bottom_blob_data[v_offset.g] : uvec2(0); + tmp_v1[tmp_vi] = gx16.b < psc(outcstep) ? bottom_blob_data[v_offset.b] : uvec2(0); + tmp_v1[tmp_vi + 16] = gx16.a < psc(outcstep) ? bottom_blob_data[v_offset.a] : uvec2(0); + + const int tmp_ki = lxd8*8*2 + lxm8 * 2 + j; + + int w_offset = gy * psc(c) * 4 * maxk + (z + lxd8) * 2 * 8 + (lxm8 * 2 + j); + + tmp_k0[tmp_ki] = weight_data[w_offset]; + tmp_k1[tmp_ki] = weight_data[w_offset + psc(c) * maxk * 8]; + tmp_k2[tmp_ki] = weight_data[w_offset + psc(c) * maxk * 16]; + tmp_k3[tmp_ki] = weight_data[w_offset + psc(c) * maxk * 24]; + } + } + + barrier(); + + for (int z4 = 0; z4 < remain; z4++) + { + coopmat A0; + coopmat A1; + coopMatLoad(A0, tmp_v0, z4*16*2, 2, gl_CooperativeMatrixLayoutRowMajor); + coopMatLoad(A1, tmp_v1, z4*16*2, 2, gl_CooperativeMatrixLayoutRowMajor); + + coopmat B0; + coopmat B1; + coopmat B2; + coopmat B3; + coopMatLoad(B0, tmp_k0, z4*8*2, 2, gl_CooperativeMatrixLayoutRowMajor); + coopMatLoad(B1, tmp_k1, z4*8*2, 2, gl_CooperativeMatrixLayoutRowMajor); + coopMatLoad(B2, tmp_k2, z4*8*2, 2, gl_CooperativeMatrixLayoutRowMajor); + coopMatLoad(B3, tmp_k3, z4*8*2, 2, gl_CooperativeMatrixLayoutRowMajor); + + // 
sum += v * k + sum0 = coopMatMulAdd(A0, B0, sum0); + sum1 = coopMatMulAdd(A1, B0, sum1); + sum2 = coopMatMulAdd(A0, B1, sum2); + sum3 = coopMatMulAdd(A1, B1, sum3); + sum4 = coopMatMulAdd(A0, B2, sum4); + sum5 = coopMatMulAdd(A1, B2, sum5); + sum6 = coopMatMulAdd(A0, B3, sum6); + sum7 = coopMatMulAdd(A1, B3, sum7); + } + + barrier(); + } + + if (gx >= outsize || gy >= psc(outc)) + return; + + coopmat sum0_fp16 = coopmat(sum0); + coopmat sum1_fp16 = coopmat(sum1); + coopmat sum2_fp16 = coopmat(sum2); + coopmat sum3_fp16 = coopmat(sum3); + coopmat sum4_fp16 = coopmat(sum4); + coopmat sum5_fp16 = coopmat(sum5); + coopmat sum6_fp16 = coopmat(sum6); + coopmat sum7_fp16 = coopmat(sum7); + + coopMatStore(sum0_fp16, tmp_v0, 0, 2, gl_CooperativeMatrixLayoutRowMajor); + coopMatStore(sum1_fp16, tmp_v1, 0, 2, gl_CooperativeMatrixLayoutRowMajor); + coopMatStore(sum2_fp16, tmp_v0, 16*2, 2, gl_CooperativeMatrixLayoutRowMajor); + coopMatStore(sum3_fp16, tmp_v1, 16*2, 2, gl_CooperativeMatrixLayoutRowMajor); + coopMatStore(sum4_fp16, tmp_v0, 16*4, 2, gl_CooperativeMatrixLayoutRowMajor); + coopMatStore(sum5_fp16, tmp_v1, 16*4, 2, gl_CooperativeMatrixLayoutRowMajor); + coopMatStore(sum6_fp16, tmp_v0, 16*6, 2, gl_CooperativeMatrixLayoutRowMajor); + coopMatStore(sum7_fp16, tmp_v1, 16*6, 2, gl_CooperativeMatrixLayoutRowMajor); + + barrier(); + + const int lxd16 = lx / 16; // 0 1 + const int lxm16 = lx % 16; // 0 1 2 3 .... 
15 + + { + for (int j = 0; j < 4; j++) + { + const int tmp_vi = lxm16 * 2 + lxd16 + j*16*2; + const int gi = (gy + lxd16 + j*2) * psc(outcstep) + (gx + lxm16); + + if (gy + j * 2 + lxd16 < psc(outc)) + { + if (gx + lxm16 < psc(outcstep)) + { + uvec2 sum0_u2 = tmp_v0[tmp_vi]; + afpvec4 sum0 = afpvec4(unpackHalf2x16(sum0_u2.x), unpackHalf2x16(sum0_u2.y)); + sum0 = activation_afpvec4(sum0, activation_type, activation_param_0, activation_param_1); + buffer_st4(top_blob_data, gi, sum0); + } + if (gx + lxm16 + 16 < psc(outcstep)) + { + uvec2 sum1_u2 = tmp_v1[tmp_vi]; + afpvec4 sum1 = afpvec4(unpackHalf2x16(sum1_u2.x), unpackHalf2x16(sum1_u2.y)); + sum1 = activation_afpvec4(sum1, activation_type, activation_param_0, activation_param_1); + buffer_st4(top_blob_data, gi + 16, sum1); + } + } + } + } +} diff --git a/src/layer/vulkan/shader/convolution_pack4_gemm_nv_cm_16_16_16.comp b/src/layer/vulkan/shader/convolution_pack4_gemm_nv_cm_16_16_16.comp new file mode 100644 index 00000000000..71cef19638c --- /dev/null +++ b/src/layer/vulkan/shader/convolution_pack4_gemm_nv_cm_16_16_16.comp @@ -0,0 +1,276 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +#extension GL_GOOGLE_include_directive: enable +#include "vulkan_activation.comp" + +#extension GL_KHR_memory_scope_semantics: require +#extension GL_EXT_shader_explicit_arithmetic_types: require +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#extension GL_NV_cooperative_matrix: require + +layout (constant_id = 0) const int kernel_w = 1; +layout (constant_id = 1) const int kernel_h = 1; +layout (constant_id = 2) const int dilation_w = 1; +layout (constant_id = 3) const int dilation_h = 1; +layout (constant_id = 4) const int stride_w = 1; +layout (constant_id = 5) const int stride_h = 1; +layout (constant_id = 6) const int bias_term = 0; +layout (constant_id = 7) const int activation_type = 0; +layout (constant_id = 8) const float activation_param_0 = 0; +layout (constant_id = 9) const float activation_param_1 = 0; + +#define shape_constant_id_offset 10 +layout (constant_id = shape_constant_id_offset + 0) const int w = 0; +layout (constant_id = shape_constant_id_offset + 1) const int h = 0; +layout (constant_id = shape_constant_id_offset + 2) const int c = 0; +layout (constant_id = shape_constant_id_offset + 3) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 4) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 5) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outcstep = 0; + +layout (binding = 0) readonly buffer bottom_blob { uvec2 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; +layout (binding = 2) readonly buffer weight_blob { uvec2 weight_data[]; }; +layout (binding = 3) readonly buffer bias_blob { uvec2 bias_data[]; }; + +layout 
(push_constant) uniform parameter +{ + int w; + int h; + int c; + int cstep; + + int outw; + int outh; + int outc; + int outcstep; +} p; + +#define UNROLL_INCH 2 + +shared uvec2 tmp_v0[UNROLL_INCH * 16*4]; +shared uvec2 tmp_v1[UNROLL_INCH * 16*4]; +shared uvec2 tmp_k0[UNROLL_INCH * 16*4]; +shared uvec2 tmp_k1[UNROLL_INCH * 16*4]; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x) / 32 * 2 * 16; + int gy = int(gl_GlobalInvocationID.y) * 2 * 4; + + const int outsize = psc(outw) * psc(outh); + + const int lx = int(gl_LocalInvocationID.x); + + const int lxd16 = lx / 16; // 0 1 + const int lxm16 = lx % 16; // 0 1 2 3 .... 15 + + fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum0; + fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum1; + fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum2; + fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum3; + + if (bias_term == 1) + { + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> bias0; + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> bias1; + + coopMatLoadNV(bias0, bias_data, gy, 0, false); + coopMatLoadNV(bias1, bias_data, gy + 4, 0, false); + + sum0 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(bias0); + sum1 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(bias0); + sum2 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(bias1); + sum3 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(bias1); + } + else + { + sum0 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f); + sum1 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f); + sum2 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f); + sum3 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f); + } + + const int maxk = kernel_w * kernel_h; + const int N = psc(c) / 4 * maxk; + + int z = 0; + for (; z + (UNROLL_INCH - 1) < N; z += UNROLL_INCH) + { + { + for (int j = 0; j < 4; j++) + { + const int tmp_i = lxd16*16*4 + lxm16 * 4 + j; + + const int sz = (z + lxd16) / maxk; + const int kk = (z + lxd16) % maxk; + + const int ky = kk / kernel_w; + const int kx = kk % kernel_w; + + const ivec2 gx16 = gx + lxm16 + ivec2(0, 16); + + const ivec2 sy16 = gx16 / 
psc(outw); + const ivec2 sx16 = gx16 % psc(outw); + + const ivec2 sxs16 = sx16 * stride_w; + const ivec2 sys16 = sy16 * stride_h; + + const ivec2 v_offset = (sz * 4 + j) * psc(cstep) + (sys16 + ky * dilation_h) * psc(w) + sxs16 + kx * dilation_w; + + tmp_v0[tmp_i] = gx16.r < psc(outcstep) ? bottom_blob_data[v_offset.r] : uvec2(0); + tmp_v1[tmp_i] = gx16.g < psc(outcstep) ? bottom_blob_data[v_offset.g] : uvec2(0); + + int w_offset = gy * psc(c) * 4 * maxk + (z + lxd16) * 4 * 16 + (lxm16 * 4 + j); + + tmp_k0[tmp_i] = weight_data[w_offset]; + tmp_k1[tmp_i] = weight_data[w_offset + psc(c) * maxk * 16]; + } + } + + barrier(); + + for (int z4 = 0; z4 < UNROLL_INCH; z4++) + { + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> A0; + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> A1; + coopMatLoadNV(A0, tmp_v0, z4*16*4, 4, false); + coopMatLoadNV(A1, tmp_v1, z4*16*4, 4, false); + + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> B0; + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> B1; + coopMatLoadNV(B0, tmp_k0, z4*16*4, 4, false); + coopMatLoadNV(B1, tmp_k1, z4*16*4, 4, false); + + // sum += v * k + sum0 = coopMatMulAddNV(A0, B0, sum0); + sum1 = coopMatMulAddNV(A1, B0, sum1); + sum2 = coopMatMulAddNV(A0, B1, sum2); + sum3 = coopMatMulAddNV(A1, B1, sum3); + } + + barrier(); + } + + if (z < N) + { + const int remain = N - z; + + if (lxd16 == 0) + { + for (int j = 0; j < 4; j++) + { + const int tmp_i = lxd16*16*4 + lxm16 * 4 + j; + + const int sz = (z + lxd16) / maxk; + const int kk = (z + lxd16) % maxk; + + const int ky = kk / kernel_w; + const int kx = kk % kernel_w; + + const ivec2 gx16 = gx + lxm16 + ivec2(0, 16); + + const ivec2 sy16 = gx16 / psc(outw); + const ivec2 sx16 = gx16 % psc(outw); + + const ivec2 sxs16 = sx16 * stride_w; + const ivec2 sys16 = sy16 * stride_h; + + const ivec2 v_offset = (sz * 4 + j) * psc(cstep) + (sys16 + ky * dilation_h) * psc(w) + sxs16 + kx * dilation_w; + + tmp_v0[tmp_i] = gx16.r < psc(outcstep) ? 
bottom_blob_data[v_offset.r] : uvec2(0); + tmp_v1[tmp_i] = gx16.g < psc(outcstep) ? bottom_blob_data[v_offset.g] : uvec2(0); + + int w_offset = gy * psc(c) * 4 * maxk + (z + lxd16) * 4 * 16 + (lxm16 * 4 + j); + + tmp_k0[tmp_i] = weight_data[w_offset]; + tmp_k1[tmp_i] = weight_data[w_offset + psc(c) * maxk * 16]; + } + } + + barrier(); + + for (int z4 = 0; z4 < remain; z4++) + { + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> A0; + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> A1; + coopMatLoadNV(A0, tmp_v0, z4*16*4, 4, false); + coopMatLoadNV(A1, tmp_v1, z4*16*4, 4, false); + + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> B0; + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> B1; + coopMatLoadNV(B0, tmp_k0, z4*16*4, 4, false); + coopMatLoadNV(B1, tmp_k1, z4*16*4, 4, false); + + // sum += v * k + sum0 = coopMatMulAddNV(A0, B0, sum0); + sum1 = coopMatMulAddNV(A1, B0, sum1); + sum2 = coopMatMulAddNV(A0, B1, sum2); + sum3 = coopMatMulAddNV(A1, B1, sum3); + } + + barrier(); + } + + if (gx >= outsize || gy >= psc(outc)) + return; + + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum0_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum0); + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum1_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum1); + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum2_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum2); + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum3_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum3); + + coopMatStoreNV(sum0_fp16, tmp_v0, 0, 4, false); + coopMatStoreNV(sum1_fp16, tmp_v1, 0, 4, false); + coopMatStoreNV(sum2_fp16, tmp_v0, 16*4, 4, false); + coopMatStoreNV(sum3_fp16, tmp_v1, 16*4, 4, false); + + barrier(); + + { + for (int j = 0; j < 4; j++) + { + const int tmp_vi = lxm16 * 4 + j + lxd16*16*4; + + uvec2 sum0_u2 = tmp_v0[tmp_vi]; + uvec2 sum1_u2 = tmp_v1[tmp_vi]; + + afpvec4 sum0 = afpvec4(unpackHalf2x16(sum0_u2.x), unpackHalf2x16(sum0_u2.y)); + afpvec4 sum1 = afpvec4(unpackHalf2x16(sum1_u2.x), unpackHalf2x16(sum1_u2.y)); + + sum0 = 
activation_afpvec4(sum0, activation_type, activation_param_0, activation_param_1); + sum1 = activation_afpvec4(sum1, activation_type, activation_param_0, activation_param_1); + + const int gi = (gy + lxd16 * 4 + j) * psc(outcstep) + (gx + lxm16); + + if (gy + lxd16 * 4 + j < psc(outc)) + { + if (gx + lxm16 < psc(outcstep)) buffer_st4(top_blob_data, gi, sum0); + if (gx + lxm16 + 16 < psc(outcstep)) buffer_st4(top_blob_data, gi + 16, sum1); + } + } + } +} diff --git a/src/layer/vulkan/shader/convolution_pack4_gemm_nv_cm_16_8_8.comp b/src/layer/vulkan/shader/convolution_pack4_gemm_nv_cm_16_8_8.comp new file mode 100644 index 00000000000..4f1c1f6ed1c --- /dev/null +++ b/src/layer/vulkan/shader/convolution_pack4_gemm_nv_cm_16_8_8.comp @@ -0,0 +1,335 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +#extension GL_GOOGLE_include_directive: enable +#include "vulkan_activation.comp" + +#extension GL_KHR_memory_scope_semantics: require +#extension GL_EXT_shader_explicit_arithmetic_types: require +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#extension GL_NV_cooperative_matrix: require + +layout (constant_id = 0) const int kernel_w = 1; +layout (constant_id = 1) const int kernel_h = 1; +layout (constant_id = 2) const int dilation_w = 1; +layout (constant_id = 3) const int dilation_h = 1; +layout (constant_id = 4) const int stride_w = 1; +layout (constant_id = 5) const int stride_h = 1; +layout (constant_id = 6) const int bias_term = 0; +layout (constant_id = 7) const int activation_type = 0; +layout (constant_id = 8) const float activation_param_0 = 0; +layout (constant_id = 9) const float activation_param_1 = 0; + +#define shape_constant_id_offset 10 +layout (constant_id = shape_constant_id_offset + 0) const int w = 0; +layout (constant_id = shape_constant_id_offset + 1) const int h = 0; +layout (constant_id = shape_constant_id_offset + 2) const int c = 0; +layout (constant_id = shape_constant_id_offset + 3) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 4) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 5) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outcstep = 0; + +layout (binding = 0) readonly buffer bottom_blob { uvec2 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; +layout (binding = 2) readonly buffer weight_blob { uvec2 weight_data[]; }; +layout (binding = 3) readonly buffer bias_blob { uvec2 bias_data[]; }; + +layout 
(push_constant) uniform parameter +{ + int w; + int h; + int c; + int cstep; + + int outw; + int outh; + int outc; + int outcstep; +} p; + +#define UNROLL_INCH 4 + +shared uvec2 tmp_v0[UNROLL_INCH * 16*2]; +shared uvec2 tmp_v1[UNROLL_INCH * 16*2]; +shared uvec2 tmp_k0[UNROLL_INCH * 8*2]; +shared uvec2 tmp_k1[UNROLL_INCH * 8*2]; +shared uvec2 tmp_k2[UNROLL_INCH * 8*2]; +shared uvec2 tmp_k3[UNROLL_INCH * 8*2]; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x) / 32 * 2 * 16; + int gy = int(gl_GlobalInvocationID.y) * 2 * 4; + + const int outsize = psc(outw) * psc(outh); + + const int lx = int(gl_LocalInvocationID.x); + + const int lxd8 = lx / 8; // 0 1 2 3 + const int lxm8 = lx % 8; // 0 1 2 3 .... 7 + + fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum0; + fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum1; + fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum2; + fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum3; + fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum4; + fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum5; + fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum6; + fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum7; + + if (bias_term == 1) + { + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> bias0; + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> bias1; + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> bias2; + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> bias3; + + coopMatLoadNV(bias0, bias_data, gy, 0, false); + coopMatLoadNV(bias1, bias_data, gy + 2, 0, false); + coopMatLoadNV(bias2, bias_data, gy + 4, 0, false); + coopMatLoadNV(bias3, bias_data, gy + 6, 0, false); + + sum0 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias0); + sum1 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias0); + sum2 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias1); + sum3 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias1); + sum4 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias2); + sum5 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias2); + sum6 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias3); + sum7 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias3); + } + 
else + { + sum0 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); + sum1 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); + sum2 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); + sum3 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); + sum4 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); + sum5 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); + sum6 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); + sum7 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); + } + + const int maxk = kernel_w * kernel_h; + const int N = psc(c) / 2 * maxk; + + int z = 0; + for (; z + (UNROLL_INCH - 1) < N; z += UNROLL_INCH) + { + { + for (int j = 0; j < 2; j++) + { + const int tmp_vi = lxd8*16*2 + lxm8 * 2 + j; + + const int sz = (z + lxd8) / maxk; + const int kk = (z + lxd8) % maxk; + + const int ky = kk / kernel_w; + const int kx = kk % kernel_w; + + const ivec4 gx16 = gx + lxm8 + ivec4(0, 8, 16, 24); + + const ivec4 sy16 = gx16 / psc(outw); + const ivec4 sx16 = gx16 % psc(outw); + + const ivec4 sxs16 = sx16 * stride_w; + const ivec4 sys16 = sy16 * stride_h; + + const ivec4 v_offset = (sz * 2 + j) * psc(cstep) + (sys16 + ky * dilation_h) * psc(w) + sxs16 + kx * dilation_w; + + tmp_v0[tmp_vi] = gx16.r < psc(outcstep) ? bottom_blob_data[v_offset.r] : uvec2(0); + tmp_v0[tmp_vi + 16] = gx16.g < psc(outcstep) ? bottom_blob_data[v_offset.g] : uvec2(0); + tmp_v1[tmp_vi] = gx16.b < psc(outcstep) ? bottom_blob_data[v_offset.b] : uvec2(0); + tmp_v1[tmp_vi + 16] = gx16.a < psc(outcstep) ? 
bottom_blob_data[v_offset.a] : uvec2(0); + + const int tmp_ki = lxd8*8*2 + lxm8 * 2 + j; + + int w_offset = gy * psc(c) * 4 * maxk + (z + lxd8) * 2 * 8 + (lxm8 * 2 + j); + + tmp_k0[tmp_ki] = weight_data[w_offset]; + tmp_k1[tmp_ki] = weight_data[w_offset + psc(c) * maxk * 8]; + tmp_k2[tmp_ki] = weight_data[w_offset + psc(c) * maxk * 16]; + tmp_k3[tmp_ki] = weight_data[w_offset + psc(c) * maxk * 24]; + } + } + + barrier(); + + for (int z4 = 0; z4 < UNROLL_INCH; z4++) + { + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A0; + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A1; + coopMatLoadNV(A0, tmp_v0, z4*16*2, 2, false); + coopMatLoadNV(A1, tmp_v1, z4*16*2, 2, false); + + fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B0; + fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B1; + fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B2; + fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B3; + coopMatLoadNV(B0, tmp_k0, z4*8*2, 2, false); + coopMatLoadNV(B1, tmp_k1, z4*8*2, 2, false); + coopMatLoadNV(B2, tmp_k2, z4*8*2, 2, false); + coopMatLoadNV(B3, tmp_k3, z4*8*2, 2, false); + + // sum += v * k + sum0 = coopMatMulAddNV(A0, B0, sum0); + sum1 = coopMatMulAddNV(A1, B0, sum1); + sum2 = coopMatMulAddNV(A0, B1, sum2); + sum3 = coopMatMulAddNV(A1, B1, sum3); + sum4 = coopMatMulAddNV(A0, B2, sum4); + sum5 = coopMatMulAddNV(A1, B2, sum5); + sum6 = coopMatMulAddNV(A0, B3, sum6); + sum7 = coopMatMulAddNV(A1, B3, sum7); + } + + barrier(); + } + + if (z < N) + { + const int remain = N - z; + + if (lxd8 < remain) + { + for (int j = 0; j < 2; j++) + { + const int tmp_vi = lxd8*16*2 + lxm8 * 2 + j; + + const int sz = (z + lxd8) / maxk; + const int kk = (z + lxd8) % maxk; + + const int ky = kk / kernel_w; + const int kx = kk % kernel_w; + + const ivec4 gx16 = gx + lxm8 + ivec4(0, 8, 16, 24); + + const ivec4 sy16 = gx16 / psc(outw); + const ivec4 sx16 = gx16 % psc(outw); + + const ivec4 sxs16 = sx16 * stride_w; + const ivec4 sys16 = sy16 * stride_h; + + const ivec4 v_offset = (sz * 2 + j) * psc(cstep) + (sys16 + ky * dilation_h) * psc(w) + 
sxs16 + kx * dilation_w; + + tmp_v0[tmp_vi] = gx16.r < psc(outcstep) ? bottom_blob_data[v_offset.r] : uvec2(0); + tmp_v0[tmp_vi + 16] = gx16.g < psc(outcstep) ? bottom_blob_data[v_offset.g] : uvec2(0); + tmp_v1[tmp_vi] = gx16.b < psc(outcstep) ? bottom_blob_data[v_offset.b] : uvec2(0); + tmp_v1[tmp_vi + 16] = gx16.a < psc(outcstep) ? bottom_blob_data[v_offset.a] : uvec2(0); + + const int tmp_ki = lxd8*8*2 + lxm8 * 2 + j; + + int w_offset = gy * psc(c) * 4 * maxk + (z + lxd8) * 2 * 8 + (lxm8 * 2 + j); + + tmp_k0[tmp_ki] = weight_data[w_offset]; + tmp_k1[tmp_ki] = weight_data[w_offset + psc(c) * maxk * 8]; + tmp_k2[tmp_ki] = weight_data[w_offset + psc(c) * maxk * 16]; + tmp_k3[tmp_ki] = weight_data[w_offset + psc(c) * maxk * 24]; + } + } + + barrier(); + + for (int z4 = 0; z4 < remain; z4++) + { + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A0; + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A1; + coopMatLoadNV(A0, tmp_v0, z4*16*2, 2, false); + coopMatLoadNV(A1, tmp_v1, z4*16*2, 2, false); + + fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B0; + fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B1; + fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B2; + fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B3; + coopMatLoadNV(B0, tmp_k0, z4*8*2, 2, false); + coopMatLoadNV(B1, tmp_k1, z4*8*2, 2, false); + coopMatLoadNV(B2, tmp_k2, z4*8*2, 2, false); + coopMatLoadNV(B3, tmp_k3, z4*8*2, 2, false); + + // sum += v * k + sum0 = coopMatMulAddNV(A0, B0, sum0); + sum1 = coopMatMulAddNV(A1, B0, sum1); + sum2 = coopMatMulAddNV(A0, B1, sum2); + sum3 = coopMatMulAddNV(A1, B1, sum3); + sum4 = coopMatMulAddNV(A0, B2, sum4); + sum5 = coopMatMulAddNV(A1, B2, sum5); + sum6 = coopMatMulAddNV(A0, B3, sum6); + sum7 = coopMatMulAddNV(A1, B3, sum7); + } + + barrier(); + } + + if (gx >= outsize || gy >= psc(outc)) + return; + + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum0_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum0); + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum1_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum1); + fcoopmatNV<16, 
gl_ScopeSubgroup, 16, 8> sum2_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum2); + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum3_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum3); + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum4_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum4); + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum5_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum5); + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum6_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum6); + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum7_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum7); + + coopMatStoreNV(sum0_fp16, tmp_v0, 0, 2, false); + coopMatStoreNV(sum1_fp16, tmp_v1, 0, 2, false); + coopMatStoreNV(sum2_fp16, tmp_v0, 16*2, 2, false); + coopMatStoreNV(sum3_fp16, tmp_v1, 16*2, 2, false); + coopMatStoreNV(sum4_fp16, tmp_v0, 16*4, 2, false); + coopMatStoreNV(sum5_fp16, tmp_v1, 16*4, 2, false); + coopMatStoreNV(sum6_fp16, tmp_v0, 16*6, 2, false); + coopMatStoreNV(sum7_fp16, tmp_v1, 16*6, 2, false); + + barrier(); + + const int lxd16 = lx / 16; // 0 1 + const int lxm16 = lx % 16; // 0 1 2 3 .... 
15 + + { + for (int j = 0; j < 4; j++) + { + const int tmp_vi = lxm16 * 2 + lxd16 + j*16*2; + const int gi = (gy + lxd16 + j*2) * psc(outcstep) + (gx + lxm16); + + if (gy + j * 2 + lxd16 < psc(outc)) + { + if (gx + lxm16 < psc(outcstep)) + { + uvec2 sum0_u2 = tmp_v0[tmp_vi]; + afpvec4 sum0 = afpvec4(unpackHalf2x16(sum0_u2.x), unpackHalf2x16(sum0_u2.y)); + sum0 = activation_afpvec4(sum0, activation_type, activation_param_0, activation_param_1); + buffer_st4(top_blob_data, gi, sum0); + } + if (gx + lxm16 + 16 < psc(outcstep)) + { + uvec2 sum1_u2 = tmp_v1[tmp_vi]; + afpvec4 sum1 = afpvec4(unpackHalf2x16(sum1_u2.x), unpackHalf2x16(sum1_u2.y)); + sum1 = activation_afpvec4(sum1, activation_type, activation_param_0, activation_param_1); + buffer_st4(top_blob_data, gi + 16, sum1); + } + } + } + } +} diff --git a/src/layer/vulkan/shader/deconvolution_pack4_gemm_cm_16_8_8.comp b/src/layer/vulkan/shader/deconvolution_pack4_gemm_cm_16_8_8.comp deleted file mode 100644 index 140bea6b99c..00000000000 --- a/src/layer/vulkan/shader/deconvolution_pack4_gemm_cm_16_8_8.comp +++ /dev/null @@ -1,211 +0,0 @@ -// Tencent is pleased to support the open source community by making ncnn available. -// -// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. -// -// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except -// in compliance with the License. You may obtain a copy of the License at -// -// https://opensource.org/licenses/BSD-3-Clause -// -// Unless required by applicable law or agreed to in writing, software distributed -// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR -// CONDITIONS OF ANY KIND, either express or implied. See the License for the -// specific language governing permissions and limitations under the License. 
- -#version 450 - -#if NCNN_fp16_storage -#extension GL_EXT_shader_16bit_storage: require -#endif -#if NCNN_fp16_arithmetic -#extension GL_EXT_shader_explicit_arithmetic_types_float16: require -#endif - -#extension GL_KHR_memory_scope_semantics: require -#extension GL_EXT_shader_explicit_arithmetic_types: require -#extension GL_EXT_shader_explicit_arithmetic_types_float16: require -#extension GL_NV_cooperative_matrix: require - -layout (constant_id = 0) const int maxk = 1; - -#define shape_constant_id_offset 1 -layout (constant_id = shape_constant_id_offset + 0) const int w = 0; -layout (constant_id = shape_constant_id_offset + 1) const int h = 0; -layout (constant_id = shape_constant_id_offset + 2) const int c = 0; -layout (constant_id = shape_constant_id_offset + 3) const int cstep = 0; - -layout (constant_id = shape_constant_id_offset + 4) const int outw = 0; -layout (constant_id = shape_constant_id_offset + 5) const int outh = 0; - -layout (binding = 0) readonly buffer bottom_blob { uvec2 bottom_blob_data[]; }; -layout (binding = 1) writeonly buffer col_blob { uvec2 col_blob_data[]; }; -layout (binding = 2) readonly buffer weight_blob { uvec2 weight_data[]; }; - -layout (push_constant) uniform parameter -{ - int w; - int h; - int c; - int cstep; - - int outw; - int outh; -} p; - -#define LOCAL_SIZE_Y 4 -#define UNROLL_INCH 4 - -shared uvec2 tmp_v0[UNROLL_INCH * 16*2]; -shared uvec2 tmp_v1[UNROLL_INCH * 16*2]; -shared uvec2 tmp_v2[UNROLL_INCH * 16*2]; -shared uvec2 tmp_v3[UNROLL_INCH * 16*2]; -shared uvec2 tmp_k[LOCAL_SIZE_Y * UNROLL_INCH * 8*2]; - -void main() -{ - int gx = int(gl_GlobalInvocationID.x) / 32 * 4 * 16; - int gy = int(gl_GlobalInvocationID.y); - - const int lx = int(gl_LocalInvocationID.x); - const int ly = int(gl_LocalInvocationID.y); - - const int lxd16 = lx / 16; // 0 1 - const int lxm16 = lx % 16; // 0 1 2 3 .... 15 - - const int lxd8 = lx / 8; // 0 1 2 3 - const int lxm8 = lx % 8; // 0 1 2 3 .... 
7 - - fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum0 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); - fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum1 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); - fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum2 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); - fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum3 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); - - int N = psc(c) / 2; - - int z = 0; - for (; z + (UNROLL_INCH - 1) < N; z += UNROLL_INCH) - { - { - int tmp_vi = ly*16*2 + lxm16*2+lxd16; - int v_offset = (z + ly) * 2 * psc(cstep) + gx + lxd16 * psc(cstep) + lxm16; - - tmp_v0[tmp_vi] = (gx + lxm16) < psc(outw) ? bottom_blob_data[v_offset] : uvec2(0); - tmp_v1[tmp_vi] = (gx + 16 + lxm16) < psc(outw) ? bottom_blob_data[v_offset + 16] : uvec2(0); - tmp_v2[tmp_vi] = (gx + 32 + lxm16) < psc(outw) ? bottom_blob_data[v_offset + 32] : uvec2(0); - tmp_v3[tmp_vi] = (gx + 48 + lxm16) < psc(outw) ? bottom_blob_data[v_offset + 48] : uvec2(0); - } - - if (lx < 16) - { - for (int z4 = 0; z4 < UNROLL_INCH; z4++) - { - int tmp_ki = ly*UNROLL_INCH*8*2 + z4*8*2 + lxm8*2+lxd8; - int w_offset = gy * psc(c) * 8 + (z + z4) * 16 + lxm8 * 2 + lxd8; - - tmp_k[tmp_ki] = weight_data[w_offset]; - } - } - - barrier(); - - for (int z4 = 0; z4 < UNROLL_INCH; z4++) - { - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A0; - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A1; - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A2; - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A3; - coopMatLoadNV(A0, tmp_v0, z4*16*2, 2, false); - coopMatLoadNV(A1, tmp_v1, z4*16*2, 2, false); - coopMatLoadNV(A2, tmp_v2, z4*16*2, 2, false); - coopMatLoadNV(A3, tmp_v3, z4*16*2, 2, false); - - fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B; - coopMatLoadNV(B, tmp_k, ly*UNROLL_INCH*8*2 + z4*8*2, 2, false); - - // sum += v * k - sum0 = coopMatMulAddNV(A0, B, sum0); - sum1 = coopMatMulAddNV(A1, B, sum1); - sum2 = coopMatMulAddNV(A2, B, sum2); - sum3 = coopMatMulAddNV(A3, B, sum3); - } - - barrier(); - } - - if (z < N) - { - 
const int remain = N - z; - - if (ly < remain) - { - int tmp_vi = ly*16*2 + lxm16*2+lxd16; - int v_offset = (z + ly) * 2 * psc(cstep) + gx + lxd16 * psc(cstep) + lxm16; - - tmp_v0[tmp_vi] = (gx + lxm16) < psc(outw) ? bottom_blob_data[v_offset] : uvec2(0); - tmp_v1[tmp_vi] = (gx + 16 + lxm16) < psc(outw) ? bottom_blob_data[v_offset + 16] : uvec2(0); - tmp_v2[tmp_vi] = (gx + 32 + lxm16) < psc(outw) ? bottom_blob_data[v_offset + 32] : uvec2(0); - tmp_v3[tmp_vi] = (gx + 48 + lxm16) < psc(outw) ? bottom_blob_data[v_offset + 48] : uvec2(0); - } - - if (lx < 16) - { - for (int z4 = 0; z4 < remain; z4++) - { - int tmp_ki = ly*UNROLL_INCH*8*2 + z4*8*2 + lxm8*2+lxd8; - int w_offset = gy * psc(c) * 8 + (z + z4) * 16 + lxm8 * 2 + lxd8; - - tmp_k[tmp_ki] = weight_data[w_offset]; - } - } - - barrier(); - - for (int z4 = 0; z4 < remain; z4++) - { - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A0; - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A1; - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A2; - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A3; - coopMatLoadNV(A0, tmp_v0, z4*16*2, 2, false); - coopMatLoadNV(A1, tmp_v1, z4*16*2, 2, false); - coopMatLoadNV(A2, tmp_v2, z4*16*2, 2, false); - coopMatLoadNV(A3, tmp_v3, z4*16*2, 2, false); - - fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B; - coopMatLoadNV(B, tmp_k, ly*UNROLL_INCH*8*2 + z4*8*2, 2, false); - - // sum += v * k - sum0 = coopMatMulAddNV(A0, B, sum0); - sum1 = coopMatMulAddNV(A1, B, sum1); - sum2 = coopMatMulAddNV(A2, B, sum2); - sum3 = coopMatMulAddNV(A3, B, sum3); - } - - barrier(); - } - - if (gx >= psc(outw) || gy * 2 >= psc(outh)) - return; - - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum0_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum0); - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum1_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum1); - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum2_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum2); - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum3_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum3); - - 
coopMatStoreNV(sum0_fp16, tmp_v0, ly*16*2, 2, false); - coopMatStoreNV(sum1_fp16, tmp_v1, ly*16*2, 2, false); - coopMatStoreNV(sum2_fp16, tmp_v2, ly*16*2, 2, false); - coopMatStoreNV(sum3_fp16, tmp_v3, ly*16*2, 2, false); - - barrier(); - - { - int tmp_vi = ly*16*2 + lxm16*2+lxd16; - int gi = (gy / maxk * maxk * 2 + gy % maxk) * psc(outw) + gx + lxd16 * maxk*psc(outw) + lxm16; - - if (gx + lxm16 < psc(outw)) col_blob_data[gi] = tmp_v0[tmp_vi]; - if (gx + 16 + lxm16 < psc(outw)) col_blob_data[gi + 16] = tmp_v1[tmp_vi]; - if (gx + 32 + lxm16 < psc(outw)) col_blob_data[gi + 32] = tmp_v2[tmp_vi]; - if (gx + 48 + lxm16 < psc(outw)) col_blob_data[gi + 48] = tmp_v3[tmp_vi]; - } -} diff --git a/src/layer/vulkan/shader/deconvolution_pack4_gemm_khr_cm_16_16_16.comp b/src/layer/vulkan/shader/deconvolution_pack4_gemm_khr_cm_16_16_16.comp new file mode 100644 index 00000000000..490b8f5edb5 --- /dev/null +++ b/src/layer/vulkan/shader/deconvolution_pack4_gemm_khr_cm_16_16_16.comp @@ -0,0 +1,195 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +#extension GL_KHR_memory_scope_semantics: require +#extension GL_EXT_shader_explicit_arithmetic_types: require +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#extension GL_KHR_cooperative_matrix: require + +layout (constant_id = 0) const int maxk = 1; + +#define shape_constant_id_offset 1 +layout (constant_id = shape_constant_id_offset + 0) const int w = 0; +layout (constant_id = shape_constant_id_offset + 1) const int h = 0; +layout (constant_id = shape_constant_id_offset + 2) const int c = 0; +layout (constant_id = shape_constant_id_offset + 3) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 4) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 5) const int outh = 0; + +layout (binding = 0) readonly buffer bottom_blob { uvec2 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer col_blob { uvec2 col_blob_data[]; }; +layout (binding = 2) readonly buffer weight_blob { uvec2 weight_data[]; }; + +layout (push_constant) uniform parameter +{ + int w; + int h; + int c; + int cstep; + + int outw; + int outh; +} p; + +#define UNROLL_INCH 2 + +shared uvec2 tmp_v0[UNROLL_INCH * 16*4]; +shared uvec2 tmp_v1[UNROLL_INCH * 16*4]; +shared uvec2 tmp_k0[UNROLL_INCH * 16*4]; +shared uvec2 tmp_k1[UNROLL_INCH * 16*4]; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x) / 32 * 2 * 16; + int gy = int(gl_GlobalInvocationID.y) * 2 * 4; + + const int lx = int(gl_LocalInvocationID.x); + + const int lxd16 = lx / 16; // 0 1 + const int lxm16 = lx % 16; // 0 1 2 3 .... 
15 + + coopmat sum0 = coopmat(0.f); + coopmat sum1 = coopmat(0.f); + coopmat sum2 = coopmat(0.f); + coopmat sum3 = coopmat(0.f); + + const int N = psc(c) / 4; + + int z = 0; + for (; z + (UNROLL_INCH - 1) < N; z += UNROLL_INCH) + { + { + for (int j = 0; j < 4; j++) + { + const int tmp_i = lxd16*16*4 + lxm16 * 4 + j; + + const int v_offset = ((z + lxd16) * 4 + j) * psc(cstep) + (gx + lxm16); + + tmp_v0[tmp_i] = (gx + lxm16) < psc(outw) ? bottom_blob_data[v_offset] : uvec2(0); + tmp_v1[tmp_i] = (gx + lxm16 + 16) < psc(outw) ? bottom_blob_data[v_offset + 16] : uvec2(0); + + const int w_offset = gy * psc(c) * 4 + (z + lxd16) * 4 * 16 + (lxm16 * 4 + j); + + tmp_k0[tmp_i] = weight_data[w_offset]; + tmp_k1[tmp_i] = weight_data[w_offset + psc(c) * 16]; + } + } + + barrier(); + + for (int z4 = 0; z4 < UNROLL_INCH; z4++) + { + coopmat A0; + coopmat A1; + coopMatLoad(A0, tmp_v0, z4*16*4, 4, gl_CooperativeMatrixLayoutRowMajor); + coopMatLoad(A1, tmp_v1, z4*16*4, 4, gl_CooperativeMatrixLayoutRowMajor); + + coopmat B0; + coopmat B1; + coopMatLoad(B0, tmp_k0, z4*16*4, 4, gl_CooperativeMatrixLayoutRowMajor); + coopMatLoad(B1, tmp_k1, z4*16*4, 4, gl_CooperativeMatrixLayoutRowMajor); + + // sum += v * k + sum0 = coopMatMulAdd(A0, B0, sum0); + sum1 = coopMatMulAdd(A1, B0, sum1); + sum2 = coopMatMulAdd(A0, B1, sum2); + sum3 = coopMatMulAdd(A1, B1, sum3); + } + + barrier(); + } + + if (z < N) + { + const int remain = N - z; + + if (lxd16 == 0) + { + for (int j = 0; j < 4; j++) + { + const int tmp_i = lxd16*16*4 + lxm16 * 4 + j; + + const int v_offset = ((z + lxd16) * 4 + j) * psc(cstep) + (gx + lxm16); + + tmp_v0[tmp_i] = (gx + lxm16) < psc(outw) ? bottom_blob_data[v_offset] : uvec2(0); + tmp_v1[tmp_i] = (gx + lxm16 + 16) < psc(outw) ? 
bottom_blob_data[v_offset + 16] : uvec2(0); + + const int w_offset = gy * psc(c) * 4 + (z + lxd16) * 4 * 16 + (lxm16 * 4 + j); + + tmp_k0[tmp_i] = weight_data[w_offset]; + tmp_k1[tmp_i] = weight_data[w_offset + psc(c) * 16]; + } + } + + barrier(); + + for (int z4 = 0; z4 < remain; z4++) + { + coopmat<float16_t, gl_ScopeSubgroup, 16, 16, gl_MatrixUseA> A0; + coopmat<float16_t, gl_ScopeSubgroup, 16, 16, gl_MatrixUseA> A1; + coopMatLoad(A0, tmp_v0, z4*16*4, 4, gl_CooperativeMatrixLayoutRowMajor); + coopMatLoad(A1, tmp_v1, z4*16*4, 4, gl_CooperativeMatrixLayoutRowMajor); + + coopmat<float16_t, gl_ScopeSubgroup, 16, 16, gl_MatrixUseB> B0; + coopmat<float16_t, gl_ScopeSubgroup, 16, 16, gl_MatrixUseB> B1; + coopMatLoad(B0, tmp_k0, z4*16*4, 4, gl_CooperativeMatrixLayoutRowMajor); + coopMatLoad(B1, tmp_k1, z4*16*4, 4, gl_CooperativeMatrixLayoutRowMajor); + + // sum += v * k + sum0 = coopMatMulAdd(A0, B0, sum0); + sum1 = coopMatMulAdd(A1, B0, sum1); + sum2 = coopMatMulAdd(A0, B1, sum2); + sum3 = coopMatMulAdd(A1, B1, sum3); + } + + barrier(); + } + + if (gx >= psc(outw) || gy >= psc(outh)) + return; + + coopmat<float16_t, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator> sum0_fp16 = coopmat<float16_t, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(sum0); + coopmat<float16_t, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator> sum1_fp16 = coopmat<float16_t, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(sum1); + coopmat<float16_t, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator> sum2_fp16 = coopmat<float16_t, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(sum2); + coopmat<float16_t, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator> sum3_fp16 = coopmat<float16_t, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(sum3); + + coopMatStore(sum0_fp16, tmp_v0, 0, 4, gl_CooperativeMatrixLayoutRowMajor); + coopMatStore(sum1_fp16, tmp_v1, 0, 4, gl_CooperativeMatrixLayoutRowMajor); + coopMatStore(sum2_fp16, tmp_v0, 16*4, 4, gl_CooperativeMatrixLayoutRowMajor); + coopMatStore(sum3_fp16, tmp_v1, 16*4, 4, gl_CooperativeMatrixLayoutRowMajor); + + barrier(); + + { + for (int j = 0; j < 4; j++) + { + const int tmp_vi = lxm16 * 4 + j + lxd16*16*4; + + const int gi = ((gy / 4 + lxd16) / maxk * maxk * 4 + (gy / 4 + lxd16) % maxk) * psc(outw) + j * maxk * psc(outw) + (gx + lxm16); + + if (gx + lxm16 < psc(outw)) col_blob_data[gi] = tmp_v0[tmp_vi]; + if (gx + lxm16 + 16 < psc(outw)) col_blob_data[gi + 16] = tmp_v1[tmp_vi]; + } + } +} diff --git a/src/layer/vulkan/shader/deconvolution_pack4_gemm_khr_cm_16_8_8.comp b/src/layer/vulkan/shader/deconvolution_pack4_gemm_khr_cm_16_8_8.comp new file mode 100644 index 00000000000..82a4f75104a --- /dev/null +++
b/src/layer/vulkan/shader/deconvolution_pack4_gemm_khr_cm_16_8_8.comp @@ -0,0 +1,239 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +#extension GL_KHR_memory_scope_semantics: require +#extension GL_EXT_shader_explicit_arithmetic_types: require +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#extension GL_KHR_cooperative_matrix: require + +layout (constant_id = 0) const int maxk = 1; + +#define shape_constant_id_offset 1 +layout (constant_id = shape_constant_id_offset + 0) const int w = 0; +layout (constant_id = shape_constant_id_offset + 1) const int h = 0; +layout (constant_id = shape_constant_id_offset + 2) const int c = 0; +layout (constant_id = shape_constant_id_offset + 3) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 4) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 5) const int outh = 0; + +layout (binding = 0) readonly buffer bottom_blob { uvec2 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer col_blob { uvec2 col_blob_data[]; }; +layout (binding = 2) readonly buffer weight_blob { uvec2 weight_data[]; }; + 
+layout (push_constant) uniform parameter +{ + int w; + int h; + int c; + int cstep; + + int outw; + int outh; +} p; + +#define UNROLL_INCH 4 + +shared uvec2 tmp_v0[UNROLL_INCH * 16*2]; +shared uvec2 tmp_v1[UNROLL_INCH * 16*2]; +shared uvec2 tmp_k0[UNROLL_INCH * 8*2]; +shared uvec2 tmp_k1[UNROLL_INCH * 8*2]; +shared uvec2 tmp_k2[UNROLL_INCH * 8*2]; +shared uvec2 tmp_k3[UNROLL_INCH * 8*2]; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x) / 32 * 2 * 16; + int gy = int(gl_GlobalInvocationID.y) * 2 * 4; + + const int lx = int(gl_LocalInvocationID.x); + + const int lxd8 = lx / 8; // 0 1 2 3 + const int lxm8 = lx % 8; // 0 1 2 3 .... 7 + + coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum0 = coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f); + coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum1 = coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f); + coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum2 = coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f); + coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum3 = coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f); + coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum4 = coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f); + coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum5 = coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f); + coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum6 = coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f); + coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum7 = coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f); + + const int N = psc(c) / 2; + + int z = 0; + for (; z + (UNROLL_INCH - 1) < N; z += UNROLL_INCH) + { + { + for (int j = 0; j < 2; j++) + { + const int tmp_vi = lxd8*16*2 + lxm8 * 2 + j; + + int v_offset = ((z + lxd8) * 2 + j) * psc(cstep) + (gx + lxm8); + + tmp_v0[tmp_vi] = (gx + lxm8) < psc(outw) ? bottom_blob_data[v_offset] : uvec2(0); + tmp_v0[tmp_vi + 16] = (gx + lxm8 + 8) < psc(outw) ? bottom_blob_data[v_offset + 8] : uvec2(0); + tmp_v1[tmp_vi] = (gx + lxm8 + 16) < psc(outw) ? bottom_blob_data[v_offset + 16] : uvec2(0); + tmp_v1[tmp_vi + 16] = (gx + lxm8 + 24) < psc(outw) ?
bottom_blob_data[v_offset + 24] : uvec2(0); + + const int tmp_ki = lxd8*8*2 + lxm8 * 2 + j; + + int w_offset = gy * psc(c) * 4 + (z + lxd8) * 2 * 8 + (lxm8 * 2 + j); + + tmp_k0[tmp_ki] = weight_data[w_offset]; + tmp_k1[tmp_ki] = weight_data[w_offset + psc(c) * 8]; + tmp_k2[tmp_ki] = weight_data[w_offset + psc(c) * 16]; + tmp_k3[tmp_ki] = weight_data[w_offset + psc(c) * 24]; + } + } + + barrier(); + + for (int z4 = 0; z4 < UNROLL_INCH; z4++) + { + coopmat<float16_t, gl_ScopeSubgroup, 16, 8, gl_MatrixUseA> A0; + coopmat<float16_t, gl_ScopeSubgroup, 16, 8, gl_MatrixUseA> A1; + coopMatLoad(A0, tmp_v0, z4*16*2, 2, gl_CooperativeMatrixLayoutRowMajor); + coopMatLoad(A1, tmp_v1, z4*16*2, 2, gl_CooperativeMatrixLayoutRowMajor); + + coopmat<float16_t, gl_ScopeSubgroup, 8, 8, gl_MatrixUseB> B0; + coopmat<float16_t, gl_ScopeSubgroup, 8, 8, gl_MatrixUseB> B1; + coopmat<float16_t, gl_ScopeSubgroup, 8, 8, gl_MatrixUseB> B2; + coopmat<float16_t, gl_ScopeSubgroup, 8, 8, gl_MatrixUseB> B3; + coopMatLoad(B0, tmp_k0, z4*8*2, 2, gl_CooperativeMatrixLayoutRowMajor); + coopMatLoad(B1, tmp_k1, z4*8*2, 2, gl_CooperativeMatrixLayoutRowMajor); + coopMatLoad(B2, tmp_k2, z4*8*2, 2, gl_CooperativeMatrixLayoutRowMajor); + coopMatLoad(B3, tmp_k3, z4*8*2, 2, gl_CooperativeMatrixLayoutRowMajor); + + // sum += v * k + sum0 = coopMatMulAdd(A0, B0, sum0); + sum1 = coopMatMulAdd(A1, B0, sum1); + sum2 = coopMatMulAdd(A0, B1, sum2); + sum3 = coopMatMulAdd(A1, B1, sum3); + sum4 = coopMatMulAdd(A0, B2, sum4); + sum5 = coopMatMulAdd(A1, B2, sum5); + sum6 = coopMatMulAdd(A0, B3, sum6); + sum7 = coopMatMulAdd(A1, B3, sum7); + } + + barrier(); + } + + if (z < N) + { + const int remain = N - z; + + if (lxd8 < remain) + { + for (int j = 0; j < 2; j++) + { + const int tmp_vi = lxd8*16*2 + lxm8 * 2 + j; + + int v_offset = ((z + lxd8) * 2 + j) * psc(cstep) + (gx + lxm8); + + tmp_v0[tmp_vi] = (gx + lxm8) < psc(outw) ? bottom_blob_data[v_offset] : uvec2(0); + tmp_v0[tmp_vi + 16] = (gx + lxm8 + 8) < psc(outw) ? bottom_blob_data[v_offset + 8] : uvec2(0); + tmp_v1[tmp_vi] = (gx + lxm8 + 16) < psc(outw) ? bottom_blob_data[v_offset + 16] : uvec2(0); + tmp_v1[tmp_vi + 16] = (gx + lxm8 + 24) < psc(outw) ?
bottom_blob_data[v_offset + 24] : uvec2(0); + + const int tmp_ki = lxd8*8*2 + lxm8 * 2 + j; + + int w_offset = gy * psc(c) * 4 + (z + lxd8) * 2 * 8 + (lxm8 * 2 + j); + + tmp_k0[tmp_ki] = weight_data[w_offset]; + tmp_k1[tmp_ki] = weight_data[w_offset + psc(c) * 8]; + tmp_k2[tmp_ki] = weight_data[w_offset + psc(c) * 16]; + tmp_k3[tmp_ki] = weight_data[w_offset + psc(c) * 24]; + } + } + + barrier(); + + for (int z4 = 0; z4 < remain; z4++) + { + coopmat<float16_t, gl_ScopeSubgroup, 16, 8, gl_MatrixUseA> A0; + coopmat<float16_t, gl_ScopeSubgroup, 16, 8, gl_MatrixUseA> A1; + coopMatLoad(A0, tmp_v0, z4*16*2, 2, gl_CooperativeMatrixLayoutRowMajor); + coopMatLoad(A1, tmp_v1, z4*16*2, 2, gl_CooperativeMatrixLayoutRowMajor); + + coopmat<float16_t, gl_ScopeSubgroup, 8, 8, gl_MatrixUseB> B0; + coopmat<float16_t, gl_ScopeSubgroup, 8, 8, gl_MatrixUseB> B1; + coopmat<float16_t, gl_ScopeSubgroup, 8, 8, gl_MatrixUseB> B2; + coopmat<float16_t, gl_ScopeSubgroup, 8, 8, gl_MatrixUseB> B3; + coopMatLoad(B0, tmp_k0, z4*8*2, 2, gl_CooperativeMatrixLayoutRowMajor); + coopMatLoad(B1, tmp_k1, z4*8*2, 2, gl_CooperativeMatrixLayoutRowMajor); + coopMatLoad(B2, tmp_k2, z4*8*2, 2, gl_CooperativeMatrixLayoutRowMajor); + coopMatLoad(B3, tmp_k3, z4*8*2, 2, gl_CooperativeMatrixLayoutRowMajor); + + // sum += v * k + sum0 = coopMatMulAdd(A0, B0, sum0); + sum1 = coopMatMulAdd(A1, B0, sum1); + sum2 = coopMatMulAdd(A0, B1, sum2); + sum3 = coopMatMulAdd(A1, B1, sum3); + sum4 = coopMatMulAdd(A0, B2, sum4); + sum5 = coopMatMulAdd(A1, B2, sum5); + sum6 = coopMatMulAdd(A0, B3, sum6); + sum7 = coopMatMulAdd(A1, B3, sum7); + } + + barrier(); + } + + if (gx >= psc(outw) || gy >= psc(outh)) + return; + + coopmat<float16_t, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum0_fp16 = coopmat<float16_t, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(sum0); + coopmat<float16_t, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum1_fp16 = coopmat<float16_t, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(sum1); + coopmat<float16_t, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum2_fp16 = coopmat<float16_t, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(sum2); + coopmat<float16_t, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum3_fp16 = coopmat<float16_t, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(sum3); + coopmat<float16_t, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum4_fp16 = coopmat<float16_t, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(sum4); + coopmat<float16_t, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum5_fp16 = coopmat<float16_t, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(sum5); + coopmat<float16_t, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum6_fp16 = coopmat<float16_t, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(sum6); + coopmat<float16_t, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum7_fp16 = coopmat<float16_t, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(sum7); + + coopMatStore(sum0_fp16, tmp_v0, 0, 2, gl_CooperativeMatrixLayoutRowMajor); + coopMatStore(sum1_fp16, tmp_v1, 0, 2, gl_CooperativeMatrixLayoutRowMajor); + coopMatStore(sum2_fp16, tmp_v0, 16*2, 2, gl_CooperativeMatrixLayoutRowMajor); + coopMatStore(sum3_fp16, tmp_v1, 16*2, 2, gl_CooperativeMatrixLayoutRowMajor); +
coopMatStore(sum4_fp16, tmp_v0, 16*4, 2, gl_CooperativeMatrixLayoutRowMajor); + coopMatStore(sum5_fp16, tmp_v1, 16*4, 2, gl_CooperativeMatrixLayoutRowMajor); + coopMatStore(sum6_fp16, tmp_v0, 16*6, 2, gl_CooperativeMatrixLayoutRowMajor); + coopMatStore(sum7_fp16, tmp_v1, 16*6, 2, gl_CooperativeMatrixLayoutRowMajor); + + barrier(); + + const int lxd16 = lx / 16; // 0 1 + const int lxm16 = lx % 16; // 0 1 2 3 .... 15 + + { + for (int j = 0; j < 4; j++) + { + const int tmp_vi = lxm16 * 2 + lxd16 + j*16*2; + const int gi = ((gy / 2 + j) / maxk * maxk * 2 + (gy / 2 + j) % maxk) * psc(outw) + lxd16 * maxk * psc(outw) + (gx + lxm16); + + if (gx + lxm16 < psc(outw)) col_blob_data[gi] = tmp_v0[tmp_vi]; + if (gx + lxm16 + 16 < psc(outw)) col_blob_data[gi + 16] = tmp_v1[tmp_vi]; + } + } +} diff --git a/src/layer/vulkan/shader/deconvolution_pack4_gemm_nv_cm_16_16_16.comp b/src/layer/vulkan/shader/deconvolution_pack4_gemm_nv_cm_16_16_16.comp new file mode 100644 index 00000000000..bbeca6b301d --- /dev/null +++ b/src/layer/vulkan/shader/deconvolution_pack4_gemm_nv_cm_16_16_16.comp @@ -0,0 +1,195 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +#extension GL_KHR_memory_scope_semantics: require +#extension GL_EXT_shader_explicit_arithmetic_types: require +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#extension GL_NV_cooperative_matrix: require + +layout (constant_id = 0) const int maxk = 1; + +#define shape_constant_id_offset 1 +layout (constant_id = shape_constant_id_offset + 0) const int w = 0; +layout (constant_id = shape_constant_id_offset + 1) const int h = 0; +layout (constant_id = shape_constant_id_offset + 2) const int c = 0; +layout (constant_id = shape_constant_id_offset + 3) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 4) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 5) const int outh = 0; + +layout (binding = 0) readonly buffer bottom_blob { uvec2 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer col_blob { uvec2 col_blob_data[]; }; +layout (binding = 2) readonly buffer weight_blob { uvec2 weight_data[]; }; + +layout (push_constant) uniform parameter +{ + int w; + int h; + int c; + int cstep; + + int outw; + int outh; +} p; + +#define UNROLL_INCH 2 + +shared uvec2 tmp_v0[UNROLL_INCH * 16*4]; +shared uvec2 tmp_v1[UNROLL_INCH * 16*4]; +shared uvec2 tmp_k0[UNROLL_INCH * 16*4]; +shared uvec2 tmp_k1[UNROLL_INCH * 16*4]; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x) / 32 * 2 * 16; + int gy = int(gl_GlobalInvocationID.y) * 2 * 4; + + const int lx = int(gl_LocalInvocationID.x); + + const int lxd16 = lx / 16; // 0 1 + const int lxm16 = lx % 16; // 0 1 2 3 .... 
15 + + fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum0 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f); + fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum1 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f); + fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum2 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f); + fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum3 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f); + + const int N = psc(c) / 4; + + int z = 0; + for (; z + (UNROLL_INCH - 1) < N; z += UNROLL_INCH) + { + { + for (int j = 0; j < 4; j++) + { + const int tmp_i = lxd16*16*4 + lxm16 * 4 + j; + + const int v_offset = ((z + lxd16) * 4 + j) * psc(cstep) + (gx + lxm16); + + tmp_v0[tmp_i] = (gx + lxm16) < psc(outw) ? bottom_blob_data[v_offset] : uvec2(0); + tmp_v1[tmp_i] = (gx + lxm16 + 16) < psc(outw) ? bottom_blob_data[v_offset + 16] : uvec2(0); + + const int w_offset = gy * psc(c) * 4 + (z + lxd16) * 4 * 16 + (lxm16 * 4 + j); + + tmp_k0[tmp_i] = weight_data[w_offset]; + tmp_k1[tmp_i] = weight_data[w_offset + psc(c) * 16]; + } + } + + barrier(); + + for (int z4 = 0; z4 < UNROLL_INCH; z4++) + { + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> A0; + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> A1; + coopMatLoadNV(A0, tmp_v0, z4*16*4, 4, false); + coopMatLoadNV(A1, tmp_v1, z4*16*4, 4, false); + + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> B0; + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> B1; + coopMatLoadNV(B0, tmp_k0, z4*16*4, 4, false); + coopMatLoadNV(B1, tmp_k1, z4*16*4, 4, false); + + // sum += v * k + sum0 = coopMatMulAddNV(A0, B0, sum0); + sum1 = coopMatMulAddNV(A1, B0, sum1); + sum2 = coopMatMulAddNV(A0, B1, sum2); + sum3 = coopMatMulAddNV(A1, B1, sum3); + } + + barrier(); + } + + if (z < N) + { + const int remain = N - z; + + if (lxd16 == 0) + { + for (int j = 0; j < 4; j++) + { + const int tmp_i = lxd16*16*4 + lxm16 * 4 + j; + + const int v_offset = ((z + lxd16) * 4 + j) * psc(cstep) + (gx + lxm16); + + tmp_v0[tmp_i] = (gx + lxm16) < psc(outw) ? 
bottom_blob_data[v_offset] : uvec2(0); + tmp_v1[tmp_i] = (gx + lxm16 + 16) < psc(outw) ? bottom_blob_data[v_offset + 16] : uvec2(0); + + const int w_offset = gy * psc(c) * 4 + (z + lxd16) * 4 * 16 + (lxm16 * 4 + j); + + tmp_k0[tmp_i] = weight_data[w_offset]; + tmp_k1[tmp_i] = weight_data[w_offset + psc(c) * 16]; + } + } + + barrier(); + + for (int z4 = 0; z4 < remain; z4++) + { + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> A0; + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> A1; + coopMatLoadNV(A0, tmp_v0, z4*16*4, 4, false); + coopMatLoadNV(A1, tmp_v1, z4*16*4, 4, false); + + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> B0; + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> B1; + coopMatLoadNV(B0, tmp_k0, z4*16*4, 4, false); + coopMatLoadNV(B1, tmp_k1, z4*16*4, 4, false); + + // sum += v * k + sum0 = coopMatMulAddNV(A0, B0, sum0); + sum1 = coopMatMulAddNV(A1, B0, sum1); + sum2 = coopMatMulAddNV(A0, B1, sum2); + sum3 = coopMatMulAddNV(A1, B1, sum3); + } + + barrier(); + } + + if (gx >= psc(outw) || gy >= psc(outh)) + return; + + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum0_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum0); + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum1_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum1); + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum2_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum2); + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum3_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum3); + + coopMatStoreNV(sum0_fp16, tmp_v0, 0, 4, false); + coopMatStoreNV(sum1_fp16, tmp_v1, 0, 4, false); + coopMatStoreNV(sum2_fp16, tmp_v0, 16*4, 4, false); + coopMatStoreNV(sum3_fp16, tmp_v1, 16*4, 4, false); + + barrier(); + + { + for (int j = 0; j < 4; j++) + { + const int tmp_vi = lxm16 * 4 + j + lxd16*16*4; + + const int gi = ((gy / 4 + lxd16) / maxk * maxk * 4 + (gy / 4 + lxd16) % maxk) * psc(outw) + j * maxk * psc(outw) + (gx + lxm16); + + if (gx + lxm16 < psc(outw)) col_blob_data[gi] = tmp_v0[tmp_vi]; + if (gx + lxm16 + 16 < psc(outw)) col_blob_data[gi 
+ 16] = tmp_v1[tmp_vi]; + } + } +} diff --git a/src/layer/vulkan/shader/deconvolution_pack4_gemm_nv_cm_16_8_8.comp b/src/layer/vulkan/shader/deconvolution_pack4_gemm_nv_cm_16_8_8.comp new file mode 100644 index 00000000000..4d7a03e829f --- /dev/null +++ b/src/layer/vulkan/shader/deconvolution_pack4_gemm_nv_cm_16_8_8.comp @@ -0,0 +1,239 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +#extension GL_KHR_memory_scope_semantics: require +#extension GL_EXT_shader_explicit_arithmetic_types: require +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#extension GL_NV_cooperative_matrix: require + +layout (constant_id = 0) const int maxk = 1; + +#define shape_constant_id_offset 1 +layout (constant_id = shape_constant_id_offset + 0) const int w = 0; +layout (constant_id = shape_constant_id_offset + 1) const int h = 0; +layout (constant_id = shape_constant_id_offset + 2) const int c = 0; +layout (constant_id = shape_constant_id_offset + 3) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 4) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 5) const int outh = 0; + +layout (binding = 0) readonly buffer bottom_blob { uvec2 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer col_blob { uvec2 col_blob_data[]; }; +layout (binding = 2) readonly buffer weight_blob { uvec2 weight_data[]; }; + +layout (push_constant) uniform parameter +{ + int w; + int h; + int c; + int cstep; + + int outw; + int outh; +} p; + +#define UNROLL_INCH 4 + +shared uvec2 tmp_v0[UNROLL_INCH * 16*2]; +shared uvec2 tmp_v1[UNROLL_INCH * 16*2]; +shared uvec2 tmp_k0[UNROLL_INCH * 8*2]; +shared uvec2 tmp_k1[UNROLL_INCH * 8*2]; +shared uvec2 tmp_k2[UNROLL_INCH * 8*2]; +shared uvec2 tmp_k3[UNROLL_INCH * 8*2]; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x) / 32 * 2 * 16; + int gy = int(gl_GlobalInvocationID.y) * 2 * 4; + + const int lx = int(gl_LocalInvocationID.x); + + const int lxd8 = lx / 8; // 0 1 2 3 + const int lxm8 = lx % 8; // 0 1 2 3 .... 
7 + + fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum0 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); + fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum1 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); + fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum2 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); + fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum3 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); + fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum4 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); + fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum5 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); + fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum6 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); + fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum7 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); + + const int N = psc(c) / 2; + + int z = 0; + for (; z + (UNROLL_INCH - 1) < N; z += UNROLL_INCH) + { + { + for (int j = 0; j < 2; j++) + { + const int tmp_vi = lxd8*16*2 + lxm8 * 2 + j; + + int v_offset = ((z + lxd8) * 2 + j) * psc(cstep) + (gx + lxm8); + + tmp_v0[tmp_vi] = (gx + lxm8) < psc(outw) ? bottom_blob_data[v_offset] : uvec2(0); + tmp_v0[tmp_vi + 16] = (gx + lxm8 + 8) < psc(outw) ? bottom_blob_data[v_offset + 8] : uvec2(0); + tmp_v1[tmp_vi] = (gx + lxm8 + 16) < psc(outw) ? bottom_blob_data[v_offset + 16] : uvec2(0); + tmp_v1[tmp_vi + 16] = (gx + lxm8 + 24) < psc(outw) ? 
bottom_blob_data[v_offset + 24] : uvec2(0); + + const int tmp_ki = lxd8*8*2 + lxm8 * 2 + j; + + int w_offset = gy * psc(c) * 4 + (z + lxd8) * 2 * 8 + (lxm8 * 2 + j); + + tmp_k0[tmp_ki] = weight_data[w_offset]; + tmp_k1[tmp_ki] = weight_data[w_offset + psc(c) * 8]; + tmp_k2[tmp_ki] = weight_data[w_offset + psc(c) * 16]; + tmp_k3[tmp_ki] = weight_data[w_offset + psc(c) * 24]; + } + } + + barrier(); + + for (int z4 = 0; z4 < UNROLL_INCH; z4++) + { + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A0; + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A1; + coopMatLoadNV(A0, tmp_v0, z4*16*2, 2, false); + coopMatLoadNV(A1, tmp_v1, z4*16*2, 2, false); + + fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B0; + fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B1; + fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B2; + fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B3; + coopMatLoadNV(B0, tmp_k0, z4*8*2, 2, false); + coopMatLoadNV(B1, tmp_k1, z4*8*2, 2, false); + coopMatLoadNV(B2, tmp_k2, z4*8*2, 2, false); + coopMatLoadNV(B3, tmp_k3, z4*8*2, 2, false); + + // sum += v * k + sum0 = coopMatMulAddNV(A0, B0, sum0); + sum1 = coopMatMulAddNV(A1, B0, sum1); + sum2 = coopMatMulAddNV(A0, B1, sum2); + sum3 = coopMatMulAddNV(A1, B1, sum3); + sum4 = coopMatMulAddNV(A0, B2, sum4); + sum5 = coopMatMulAddNV(A1, B2, sum5); + sum6 = coopMatMulAddNV(A0, B3, sum6); + sum7 = coopMatMulAddNV(A1, B3, sum7); + } + + barrier(); + } + + if (z < N) + { + const int remain = N - z; + + if (lxd8 < remain) + { + for (int j = 0; j < 2; j++) + { + const int tmp_vi = lxd8*16*2 + lxm8 * 2 + j; + + int v_offset = ((z + lxd8) * 2 + j) * psc(cstep) + (gx + lxm8); + + tmp_v0[tmp_vi] = (gx + lxm8) < psc(outw) ? bottom_blob_data[v_offset] : uvec2(0); + tmp_v0[tmp_vi + 16] = (gx + lxm8 + 8) < psc(outw) ? bottom_blob_data[v_offset + 8] : uvec2(0); + tmp_v1[tmp_vi] = (gx + lxm8 + 16) < psc(outw) ? bottom_blob_data[v_offset + 16] : uvec2(0); + tmp_v1[tmp_vi + 16] = (gx + lxm8 + 24) < psc(outw) ? 
bottom_blob_data[v_offset + 24] : uvec2(0); + + const int tmp_ki = lxd8*8*2 + lxm8 * 2 + j; + + int w_offset = gy * psc(c) * 4 + (z + lxd8) * 2 * 8 + (lxm8 * 2 + j); + + tmp_k0[tmp_ki] = weight_data[w_offset]; + tmp_k1[tmp_ki] = weight_data[w_offset + psc(c) * 8]; + tmp_k2[tmp_ki] = weight_data[w_offset + psc(c) * 16]; + tmp_k3[tmp_ki] = weight_data[w_offset + psc(c) * 24]; + } + } + + barrier(); + + for (int z4 = 0; z4 < remain; z4++) + { + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A0; + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A1; + coopMatLoadNV(A0, tmp_v0, z4*16*2, 2, false); + coopMatLoadNV(A1, tmp_v1, z4*16*2, 2, false); + + fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B0; + fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B1; + fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B2; + fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B3; + coopMatLoadNV(B0, tmp_k0, z4*8*2, 2, false); + coopMatLoadNV(B1, tmp_k1, z4*8*2, 2, false); + coopMatLoadNV(B2, tmp_k2, z4*8*2, 2, false); + coopMatLoadNV(B3, tmp_k3, z4*8*2, 2, false); + + // sum += v * k + sum0 = coopMatMulAddNV(A0, B0, sum0); + sum1 = coopMatMulAddNV(A1, B0, sum1); + sum2 = coopMatMulAddNV(A0, B1, sum2); + sum3 = coopMatMulAddNV(A1, B1, sum3); + sum4 = coopMatMulAddNV(A0, B2, sum4); + sum5 = coopMatMulAddNV(A1, B2, sum5); + sum6 = coopMatMulAddNV(A0, B3, sum6); + sum7 = coopMatMulAddNV(A1, B3, sum7); + } + + barrier(); + } + + if (gx >= psc(outw) || gy >= psc(outh)) + return; + + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum0_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum0); + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum1_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum1); + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum2_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum2); + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum3_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum3); + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum4_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum4); + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum5_fp16 = fcoopmatNV<16, 
gl_ScopeSubgroup, 16, 8>(sum5); + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum6_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum6); + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum7_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum7); + + coopMatStoreNV(sum0_fp16, tmp_v0, 0, 2, false); + coopMatStoreNV(sum1_fp16, tmp_v1, 0, 2, false); + coopMatStoreNV(sum2_fp16, tmp_v0, 16*2, 2, false); + coopMatStoreNV(sum3_fp16, tmp_v1, 16*2, 2, false); + coopMatStoreNV(sum4_fp16, tmp_v0, 16*4, 2, false); + coopMatStoreNV(sum5_fp16, tmp_v1, 16*4, 2, false); + coopMatStoreNV(sum6_fp16, tmp_v0, 16*6, 2, false); + coopMatStoreNV(sum7_fp16, tmp_v1, 16*6, 2, false); + + barrier(); + + const int lxd16 = lx / 16; // 0 1 + const int lxm16 = lx % 16; // 0 1 2 3 .... 15 + + { + for (int j = 0; j < 4; j++) + { + const int tmp_vi = lxm16 * 2 + lxd16 + j*16*2; + const int gi = ((gy / 2 + j) / maxk * maxk * 2 + (gy / 2 + j) % maxk) * psc(outw) + lxd16 * maxk * psc(outw) + (gx + lxm16); + + if (gx + lxm16 < psc(outw)) col_blob_data[gi] = tmp_v0[tmp_vi]; + if (gx + lxm16 + 16 < psc(outw)) col_blob_data[gi + 16] = tmp_v1[tmp_vi]; + } + } +} diff --git a/src/vulkan_header_fix.h b/src/vulkan_header_fix.h index cd1efed46bc..0a5ea9bbd0a 100644 --- a/src/vulkan_header_fix.h +++ b/src/vulkan_header_fix.h @@ -389,4 +389,61 @@ typedef enum VkInstanceCreateFlagBits } VkInstanceCreateFlagBits; #endif // VK_HEADER_VERSION < 208 +#if VK_HEADER_VERSION < 255 +#define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_MATRIX_FEATURES_KHR (VkStructureType)1000506000 +#define VK_STRUCTURE_TYPE_COOPERATIVE_MATRIX_PROPERTIES_KHR (VkStructureType)1000506001 +#define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_MATRIX_PROPERTIES_KHR (VkStructureType)1000506002 +typedef enum VkComponentTypeKHR +{ + VK_COMPONENT_TYPE_FLOAT16_KHR = 0, + VK_COMPONENT_TYPE_FLOAT32_KHR = 1, + VK_COMPONENT_TYPE_FLOAT64_KHR = 2, + VK_COMPONENT_TYPE_SINT8_KHR = 3, + VK_COMPONENT_TYPE_SINT16_KHR = 4, + VK_COMPONENT_TYPE_SINT32_KHR = 5, + 
VK_COMPONENT_TYPE_SINT64_KHR = 6, + VK_COMPONENT_TYPE_UINT8_KHR = 7, + VK_COMPONENT_TYPE_UINT16_KHR = 8, + VK_COMPONENT_TYPE_UINT32_KHR = 9, + VK_COMPONENT_TYPE_UINT64_KHR = 10, + VK_COMPONENT_TYPE_MAX_ENUM_KHR = 0x7FFFFFFF +} VkComponentTypeKHR; +typedef enum VkScopeKHR +{ + VK_SCOPE_DEVICE_KHR = 1, + VK_SCOPE_WORKGROUP_KHR = 2, + VK_SCOPE_SUBGROUP_KHR = 3, + VK_SCOPE_QUEUE_FAMILY_KHR = 5, + VK_SCOPE_MAX_ENUM_KHR = 0x7FFFFFFF +} VkScopeKHR; +typedef struct VkCooperativeMatrixPropertiesKHR +{ + VkStructureType sType; + void* pNext; + uint32_t MSize; + uint32_t NSize; + uint32_t KSize; + VkComponentTypeKHR AType; + VkComponentTypeKHR BType; + VkComponentTypeKHR CType; + VkComponentTypeKHR ResultType; + VkBool32 saturatingAccumulation; + VkScopeKHR scope; +} VkCooperativeMatrixPropertiesKHR; +typedef struct VkPhysicalDeviceCooperativeMatrixFeaturesKHR +{ + VkStructureType sType; + void* pNext; + VkBool32 cooperativeMatrix; + VkBool32 cooperativeMatrixRobustBufferAccess; +} VkPhysicalDeviceCooperativeMatrixFeaturesKHR; +typedef struct VkPhysicalDeviceCooperativeMatrixPropertiesKHR +{ + VkStructureType sType; + void* pNext; + VkShaderStageFlags cooperativeMatrixSupportedStages; +} VkPhysicalDeviceCooperativeMatrixPropertiesKHR; +typedef VkResult(VKAPI_PTR* PFN_vkGetPhysicalDeviceCooperativeMatrixPropertiesKHR)(VkPhysicalDevice physicalDevice, uint32_t* pPropertyCount, VkCooperativeMatrixPropertiesKHR* pProperties); +#endif // VK_HEADER_VERSION < 255 + #endif // NCNN_VULKAN_HEADER_FIX_H diff --git a/tests/testutil.h b/tests/testutil.h index 1a86c639bab..b879fa527fb 100644 --- a/tests/testutil.h +++ b/tests/testutil.h @@ -1508,7 +1508,8 @@ int test_layer(const char* layer_type, const ncnn::ParamDict& pd, const std::vec {1, 0, 0, 0, 0, 0, 0}, {1, 1, 0, 0, 1, 0, 0}, {1, 0, 1, 0, 0, 1, 0}, - {1, 1, 1, 1, 0, 1, 1}, + {1, 1, 1, 1, 0, 0, 0}, + {1, 1, 1, 1, 1, 1, 1}, }; const int opt_count = sizeof(options) / sizeof(options[0]); @@ -1544,7 +1545,8 @@ int test_layer(const char* 
layer_type, const ncnn::ParamDict& pd, const std::vec {1, 0, 0, 0, 0, 0, 0}, {1, 1, 0, 0, 1, 0, 0}, {1, 0, 1, 0, 0, 1, 0}, - {1, 1, 1, 1, 0, 1, 1}, + {1, 1, 1, 1, 0, 0, 0}, + {1, 1, 1, 1, 1, 1, 1}, }; const int opt_count = sizeof(options) / sizeof(options[0]);