From c45c01c7c10dc2482ee973f602580ecfaaa102d7 Mon Sep 17 00:00:00 2001 From: nihui Date: Thu, 27 Jul 2023 19:21:13 +0800 Subject: [PATCH] enable VK_KHR_cooperative_matrix (#4823) * enable VK_KHR_cooperative_matrix * add khr cm shader * update glslang * print matrix info --- .ci/test-coverage.yml | 20 +- glslang | 2 +- src/gpu.cpp | 193 ++++++++-- src/gpu.h | 6 + src/layer/vulkan/convolution_vulkan.cpp | 290 ++++++++++++--- src/layer/vulkan/deconvolution_vulkan.cpp | 72 +++- .../convolution_pack4_1x1s1d1_cm_16_8_8.comp | 257 -------------- ...olution_pack4_1x1s1d1_khr_cm_16_16_16.comp | 239 +++++++++++++ ...nvolution_pack4_1x1s1d1_khr_cm_16_8_8.comp | 298 ++++++++++++++++ ...volution_pack4_1x1s1d1_nv_cm_16_16_16.comp | 239 +++++++++++++ ...onvolution_pack4_1x1s1d1_nv_cm_16_8_8.comp | 298 ++++++++++++++++ ...pack4_3x3s1d1_winograd_gemm_cm_16_8_8.comp | 210 ----------- ...3x3s1d1_winograd_gemm_khr_cm_16_16_16.comp | 196 ++++++++++ ...4_3x3s1d1_winograd_gemm_khr_cm_16_8_8.comp | 241 +++++++++++++ ..._3x3s1d1_winograd_gemm_nv_cm_16_16_16.comp | 196 ++++++++++ ...k4_3x3s1d1_winograd_gemm_nv_cm_16_8_8.comp | 241 +++++++++++++ .../convolution_pack4_gemm_cm_16_8_8.comp | 294 --------------- ...onvolution_pack4_gemm_khr_cm_16_16_16.comp | 276 +++++++++++++++ .../convolution_pack4_gemm_khr_cm_16_8_8.comp | 335 ++++++++++++++++++ ...convolution_pack4_gemm_nv_cm_16_16_16.comp | 276 +++++++++++++++ .../convolution_pack4_gemm_nv_cm_16_8_8.comp | 335 ++++++++++++++++++ .../deconvolution_pack4_gemm_cm_16_8_8.comp | 211 ----------- ...onvolution_pack4_gemm_khr_cm_16_16_16.comp | 195 ++++++++++ ...econvolution_pack4_gemm_khr_cm_16_8_8.comp | 239 +++++++++++++ ...convolution_pack4_gemm_nv_cm_16_16_16.comp | 195 ++++++++++ ...deconvolution_pack4_gemm_nv_cm_16_8_8.comp | 239 +++++++++++++ src/vulkan_header_fix.h | 57 +++ tests/testutil.h | 6 +- 28 files changed, 4580 insertions(+), 1076 deletions(-) delete mode 100644 src/layer/vulkan/shader/convolution_pack4_1x1s1d1_cm_16_8_8.comp 
create mode 100644 src/layer/vulkan/shader/convolution_pack4_1x1s1d1_khr_cm_16_16_16.comp create mode 100644 src/layer/vulkan/shader/convolution_pack4_1x1s1d1_khr_cm_16_8_8.comp create mode 100644 src/layer/vulkan/shader/convolution_pack4_1x1s1d1_nv_cm_16_16_16.comp create mode 100644 src/layer/vulkan/shader/convolution_pack4_1x1s1d1_nv_cm_16_8_8.comp delete mode 100644 src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd_gemm_cm_16_8_8.comp create mode 100644 src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd_gemm_khr_cm_16_16_16.comp create mode 100644 src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd_gemm_khr_cm_16_8_8.comp create mode 100644 src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd_gemm_nv_cm_16_16_16.comp create mode 100644 src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd_gemm_nv_cm_16_8_8.comp delete mode 100644 src/layer/vulkan/shader/convolution_pack4_gemm_cm_16_8_8.comp create mode 100644 src/layer/vulkan/shader/convolution_pack4_gemm_khr_cm_16_16_16.comp create mode 100644 src/layer/vulkan/shader/convolution_pack4_gemm_khr_cm_16_8_8.comp create mode 100644 src/layer/vulkan/shader/convolution_pack4_gemm_nv_cm_16_16_16.comp create mode 100644 src/layer/vulkan/shader/convolution_pack4_gemm_nv_cm_16_8_8.comp delete mode 100644 src/layer/vulkan/shader/deconvolution_pack4_gemm_cm_16_8_8.comp create mode 100644 src/layer/vulkan/shader/deconvolution_pack4_gemm_khr_cm_16_16_16.comp create mode 100644 src/layer/vulkan/shader/deconvolution_pack4_gemm_khr_cm_16_8_8.comp create mode 100644 src/layer/vulkan/shader/deconvolution_pack4_gemm_nv_cm_16_16_16.comp create mode 100644 src/layer/vulkan/shader/deconvolution_pack4_gemm_nv_cm_16_8_8.comp diff --git a/.ci/test-coverage.yml b/.ci/test-coverage.yml index 9e5a054a847..a693f415883 100644 --- a/.ci/test-coverage.yml +++ b/.ci/test-coverage.yml @@ -52,7 +52,7 @@ jobs: uses: cache@1.* with: cachePaths: swiftshader-install - cacheKey: swiftshader-linux-install-20230420 + 
cacheKey: swiftshader-linux-install-20230420-1 - name: checkout-swiftshader if: steps.cache-swiftshader.outputs.cacheHit != 'true' @@ -126,7 +126,7 @@ jobs: uses: cache@1.* with: cachePaths: lavapipe-install - cacheKey: lavapipe-linux-install-20211127-3 + cacheKey: lavapipe-linux-install-20211127-4 - name: checkout-lavapipe if: steps.cache-lavapipe.outputs.cacheHit != 'true' @@ -280,7 +280,7 @@ jobs: uses: cache@1.* with: cachePaths: qemu-install - cacheKey: qemu-arm-install-20220831 + cacheKey: qemu-arm-install-20220831-1 - name: checkout-qemu if: steps.cache-qemu.outputs.cacheHit != 'true' @@ -386,7 +386,7 @@ jobs: uses: cache@1.* with: cachePaths: qemu-install - cacheKey: qemu-aarch64-install-20220831 + cacheKey: qemu-aarch64-install-20220831-1 - name: checkout-qemu if: steps.cache-qemu.outputs.cacheHit != 'true' @@ -473,7 +473,7 @@ jobs: uses: cache@1.* with: cachePaths: qemu-install - cacheKey: qemu-mipsel-install-20220831 + cacheKey: qemu-mipsel-install-20220831-1 - name: checkout-qemu if: steps.cache-qemu.outputs.cacheHit != 'true' @@ -552,7 +552,7 @@ jobs: uses: cache@1.* with: cachePaths: qemu-install - cacheKey: qemu-mips64el-install-20220831 + cacheKey: qemu-mips64el-install-20220831-1 - name: checkout-qemu if: steps.cache-qemu.outputs.cacheHit != 'true' @@ -631,7 +631,7 @@ jobs: uses: cache@1.* with: cachePaths: qemu-install - cacheKey: qemu-riscv64-install-20230624 + cacheKey: qemu-riscv64-install-20230624-1 - name: checkout-qemu if: steps.cache-qemu.outputs.cacheHit != 'true' @@ -716,7 +716,7 @@ jobs: uses: cache@1.* with: cachePaths: qemu-install - cacheKey: qemu-riscv64-install-20230624 + cacheKey: qemu-riscv64-install-20230624-1 - name: checkout-qemu if: steps.cache-qemu.outputs.cacheHit != 'true' @@ -751,7 +751,7 @@ jobs: uses: cache@1.* with: cachePaths: rv64gcv-install - cacheKey: rv64gcv-linux-install-20221029 + cacheKey: rv64gcv-linux-install-20221029-1 - name: checkout-riscv-gnu-toolchain if: steps.cache-rv64gcv.outputs.cacheHit != 'true' @@ 
-861,7 +861,7 @@ jobs: uses: cache@1.* with: cachePaths: qemu-install - cacheKey: qemu-loongarch64-install-20230524 + cacheKey: qemu-loongarch64-install-20230524-1 - name: checkout-qemu if: steps.cache-qemu.outputs.cacheHit != 'true' diff --git a/glslang b/glslang index 88fd417b0bb..4420f9b33ba 160000 --- a/glslang +++ b/glslang @@ -1 +1 @@ -Subproject commit 88fd417b0bb7d91755961c70e846d274c182f2b0 +Subproject commit 4420f9b33ba44928d5c82d9eae0c3bb4d5674c05 diff --git a/src/gpu.cpp b/src/gpu.cpp index c743d42c90b..88c44d53f9b 100644 --- a/src/gpu.cpp +++ b/src/gpu.cpp @@ -110,6 +110,9 @@ int support_VK_EXT_validation_flags = 0; int support_VK_KHR_android_surface = 0; #endif // __ANDROID_API__ >= 26 +// VK_KHR_cooperative_matrix +PFN_vkGetPhysicalDeviceCooperativeMatrixPropertiesKHR vkGetPhysicalDeviceCooperativeMatrixPropertiesKHR = 0; + // VK_KHR_external_memory_capabilities PFN_vkGetPhysicalDeviceExternalBufferPropertiesKHR vkGetPhysicalDeviceExternalBufferPropertiesKHR = 0; @@ -223,6 +226,8 @@ class GpuInfoPrivate // cooperative matrix bool support_cooperative_matrix; bool support_cooperative_matrix_16_8_8; + bool support_cooperative_matrix_16_8_16; + bool support_cooperative_matrix_16_16_16; // extension capability int support_VK_KHR_8bit_storage; @@ -230,6 +235,7 @@ class GpuInfoPrivate int support_VK_KHR_bind_memory2; int support_VK_KHR_buffer_device_address; int support_VK_KHR_create_renderpass2; + int support_VK_KHR_cooperative_matrix; int support_VK_KHR_dedicated_allocation; int support_VK_KHR_descriptor_update_template; int support_VK_KHR_external_memory; @@ -527,6 +533,16 @@ bool GpuInfo::support_cooperative_matrix_16_8_8() const return d->support_cooperative_matrix_16_8_8; } +bool GpuInfo::support_cooperative_matrix_16_8_16() const +{ + return d->support_cooperative_matrix_16_8_16; +} + +bool GpuInfo::support_cooperative_matrix_16_16_16() const +{ + return d->support_cooperative_matrix_16_16_16; +} + int GpuInfo::support_VK_KHR_8bit_storage() const { 
return d->support_VK_KHR_8bit_storage; @@ -552,6 +568,11 @@ int GpuInfo::support_VK_KHR_create_renderpass2() const return d->support_VK_KHR_create_renderpass2; } +int GpuInfo::support_VK_KHR_cooperative_matrix() const +{ + return d->support_VK_KHR_cooperative_matrix; +} + int GpuInfo::support_VK_KHR_dedicated_allocation() const { return d->support_VK_KHR_dedicated_allocation; @@ -709,6 +730,11 @@ static int init_instance_extension() } #endif // __ANDROID_API__ >= 26 + // VK_KHR_cooperative_matrix + { + vkGetPhysicalDeviceCooperativeMatrixPropertiesKHR = (PFN_vkGetPhysicalDeviceCooperativeMatrixPropertiesKHR)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceCooperativeMatrixPropertiesKHR"); + } + // VK_NV_cooperative_matrix { vkGetPhysicalDeviceCooperativeMatrixPropertiesNV = (PFN_vkGetPhysicalDeviceCooperativeMatrixPropertiesNV)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceCooperativeMatrixPropertiesNV"); @@ -1399,6 +1425,7 @@ int create_gpu_instance() gpu_info.support_VK_KHR_bind_memory2 = 0; gpu_info.support_VK_KHR_buffer_device_address = 0; gpu_info.support_VK_KHR_create_renderpass2 = 0; + gpu_info.support_VK_KHR_cooperative_matrix = 0; gpu_info.support_VK_KHR_dedicated_allocation = 0; gpu_info.support_VK_KHR_descriptor_update_template = 0; gpu_info.support_VK_KHR_external_memory = 0; @@ -1439,6 +1466,8 @@ int create_gpu_instance() gpu_info.support_VK_KHR_buffer_device_address = exp.specVersion; else if (strcmp(exp.extensionName, "VK_KHR_create_renderpass2") == 0) gpu_info.support_VK_KHR_create_renderpass2 = exp.specVersion; + else if (strcmp(exp.extensionName, "VK_KHR_cooperative_matrix") == 0) + gpu_info.support_VK_KHR_cooperative_matrix = exp.specVersion; else if (strcmp(exp.extensionName, "VK_KHR_dedicated_allocation") == 0) gpu_info.support_VK_KHR_dedicated_allocation = exp.specVersion; else if (strcmp(exp.extensionName, "VK_KHR_descriptor_update_template") == 0) @@ -1495,6 +1524,12 @@ int create_gpu_instance() 
gpu_info.support_VK_EXT_buffer_device_address = 0; } + if (gpu_info.support_VK_KHR_cooperative_matrix) + { + // we prefer khr extension + gpu_info.support_VK_NV_cooperative_matrix = 0; + } + // check features gpu_info.support_fp16_packed = true; gpu_info.support_fp16_storage = false; @@ -1505,6 +1540,8 @@ int create_gpu_instance() gpu_info.support_ycbcr_conversion = false; gpu_info.support_cooperative_matrix = false; gpu_info.support_cooperative_matrix_16_8_8 = false; + gpu_info.support_cooperative_matrix_16_8_16 = false; + gpu_info.support_cooperative_matrix_16_16_16 = false; if (support_VK_KHR_get_physical_device_properties2) { void* queryExtensionFeatures = 0; @@ -1550,14 +1587,22 @@ int create_gpu_instance() } // query cooperative_matrix - VkPhysicalDeviceCooperativeMatrixFeaturesNV queryCooperativeMatrixFeatures; - queryCooperativeMatrixFeatures.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_MATRIX_FEATURES_NV; + VkPhysicalDeviceCooperativeMatrixFeaturesKHR queryCooperativeMatrixFeatures; + queryCooperativeMatrixFeatures.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_MATRIX_FEATURES_KHR; queryCooperativeMatrixFeatures.pNext = 0; - if (gpu_info.support_VK_NV_cooperative_matrix) + VkPhysicalDeviceCooperativeMatrixFeaturesNV queryCooperativeMatrixFeaturesNV; + queryCooperativeMatrixFeaturesNV.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_MATRIX_FEATURES_NV; + queryCooperativeMatrixFeaturesNV.pNext = 0; + if (gpu_info.support_VK_KHR_cooperative_matrix) { queryCooperativeMatrixFeatures.pNext = queryExtensionFeatures; queryExtensionFeatures = &queryCooperativeMatrixFeatures; } + else if (gpu_info.support_VK_NV_cooperative_matrix) + { + queryCooperativeMatrixFeaturesNV.pNext = queryExtensionFeatures; + queryExtensionFeatures = &queryCooperativeMatrixFeaturesNV; + } VkPhysicalDeviceFeatures2KHR queryFeatures; queryFeatures.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2_KHR; @@ -1583,10 +1628,14 @@ int create_gpu_instance() { 
gpu_info.support_ycbcr_conversion = querySamplerYcbcrConversionFeatures.samplerYcbcrConversion; } - if (gpu_info.support_VK_NV_cooperative_matrix) + if (gpu_info.support_VK_KHR_cooperative_matrix) { gpu_info.support_cooperative_matrix = queryCooperativeMatrixFeatures.cooperativeMatrix; } + else if (gpu_info.support_VK_NV_cooperative_matrix) + { + gpu_info.support_cooperative_matrix = queryCooperativeMatrixFeaturesNV.cooperativeMatrix; + } } else { @@ -1622,36 +1671,97 @@ int create_gpu_instance() if (gpu_info.support_cooperative_matrix) { // query supported cooperative matrix types and operations - uint32_t propertyCount = 0; - ret = vkGetPhysicalDeviceCooperativeMatrixPropertiesNV(physicalDevice, &propertyCount, 0); - if (ret != VK_SUCCESS) + if (gpu_info.support_VK_KHR_cooperative_matrix) { - NCNN_LOGE("vkGetPhysicalDeviceCooperativeMatrixPropertiesNV failed %d", ret); - } + uint32_t propertyCount = 0; + ret = vkGetPhysicalDeviceCooperativeMatrixPropertiesKHR(physicalDevice, &propertyCount, 0); + if (ret != VK_SUCCESS) + { + NCNN_LOGE("vkGetPhysicalDeviceCooperativeMatrixPropertiesKHR failed %d", ret); + } - std::vector properties(propertyCount); - for (uint32_t j = 0; j < properties.size(); j++) - { - properties[j].sType = VK_STRUCTURE_TYPE_COOPERATIVE_MATRIX_PROPERTIES_NV; - properties[j].pNext = 0; + std::vector properties(propertyCount); + ret = vkGetPhysicalDeviceCooperativeMatrixPropertiesKHR(physicalDevice, &propertyCount, properties.data()); + if (ret != VK_SUCCESS) + { + NCNN_LOGE("vkGetPhysicalDeviceCooperativeMatrixPropertiesKHR failed %d", ret); + } + + for (uint32_t j = 0; j < properties.size(); j++) + { + const VkCooperativeMatrixPropertiesKHR& cmp = properties[j]; + // NCNN_LOGE("cpm %2d %2d %2d %d %d %d %d %d", cmp.MSize, cmp.NSize, cmp.KSize, cmp.AType, cmp.BType, cmp.CType, cmp.ResultType, cmp.scope); + + if (cmp.MSize == 16 && cmp.NSize == 8 && cmp.KSize == 8 + && cmp.AType == VK_COMPONENT_TYPE_FLOAT16_KHR && cmp.BType == 
VK_COMPONENT_TYPE_FLOAT16_KHR + && cmp.CType == VK_COMPONENT_TYPE_FLOAT32_KHR && cmp.ResultType == VK_COMPONENT_TYPE_FLOAT32_KHR + && cmp.scope == VK_SCOPE_SUBGROUP_KHR) + { + gpu_info.support_cooperative_matrix_16_8_8 = true; + } + if (cmp.MSize == 16 && cmp.NSize == 8 && cmp.KSize == 16 + && cmp.AType == VK_COMPONENT_TYPE_FLOAT16_KHR && cmp.BType == VK_COMPONENT_TYPE_FLOAT16_KHR + && cmp.CType == VK_COMPONENT_TYPE_FLOAT32_KHR && cmp.ResultType == VK_COMPONENT_TYPE_FLOAT32_KHR + && cmp.scope == VK_SCOPE_SUBGROUP_KHR) + { + gpu_info.support_cooperative_matrix_16_8_16 = true; + } + if (cmp.MSize == 16 && cmp.NSize == 16 && cmp.KSize == 16 + && cmp.AType == VK_COMPONENT_TYPE_FLOAT16_KHR && cmp.BType == VK_COMPONENT_TYPE_FLOAT16_KHR + && cmp.CType == VK_COMPONENT_TYPE_FLOAT32_KHR && cmp.ResultType == VK_COMPONENT_TYPE_FLOAT32_KHR + && cmp.scope == VK_SCOPE_SUBGROUP_KHR) + { + gpu_info.support_cooperative_matrix_16_16_16 = true; + } + } } - ret = vkGetPhysicalDeviceCooperativeMatrixPropertiesNV(physicalDevice, &propertyCount, properties.data()); - if (ret != VK_SUCCESS) + else { - NCNN_LOGE("vkGetPhysicalDeviceCooperativeMatrixPropertiesNV failed %d", ret); - } + uint32_t propertyCount = 0; + ret = vkGetPhysicalDeviceCooperativeMatrixPropertiesNV(physicalDevice, &propertyCount, 0); + if (ret != VK_SUCCESS) + { + NCNN_LOGE("vkGetPhysicalDeviceCooperativeMatrixPropertiesNV failed %d", ret); + } - for (uint32_t j = 0; j < properties.size(); j++) - { - const VkCooperativeMatrixPropertiesNV& cmp = properties[j]; - // NCNN_LOGE("cpm %2d %2d %2d %d %d %d %d %d", cmp.MSize, cmp.NSize, cmp.KSize, cmp.AType, cmp.BType, cmp.CType, cmp.DType, cmp.scope); + std::vector properties(propertyCount); + for (uint32_t j = 0; j < properties.size(); j++) + { + properties[j].sType = VK_STRUCTURE_TYPE_COOPERATIVE_MATRIX_PROPERTIES_NV; + properties[j].pNext = 0; + } + ret = vkGetPhysicalDeviceCooperativeMatrixPropertiesNV(physicalDevice, &propertyCount, properties.data()); + if (ret != 
VK_SUCCESS) + { + NCNN_LOGE("vkGetPhysicalDeviceCooperativeMatrixPropertiesNV failed %d", ret); + } - if (cmp.MSize == 16 && cmp.NSize == 8 && cmp.KSize == 8 - && cmp.AType == VK_COMPONENT_TYPE_FLOAT16_NV && cmp.BType == VK_COMPONENT_TYPE_FLOAT16_NV - && cmp.CType == VK_COMPONENT_TYPE_FLOAT32_NV && cmp.DType == VK_COMPONENT_TYPE_FLOAT32_NV - && cmp.scope == VK_SCOPE_SUBGROUP_NV) + for (uint32_t j = 0; j < properties.size(); j++) { - gpu_info.support_cooperative_matrix_16_8_8 = true; + const VkCooperativeMatrixPropertiesNV& cmp = properties[j]; + // NCNN_LOGE("cpm %2d %2d %2d %d %d %d %d %d", cmp.MSize, cmp.NSize, cmp.KSize, cmp.AType, cmp.BType, cmp.CType, cmp.DType, cmp.scope); + + if (cmp.MSize == 16 && cmp.NSize == 8 && cmp.KSize == 8 + && cmp.AType == VK_COMPONENT_TYPE_FLOAT16_NV && cmp.BType == VK_COMPONENT_TYPE_FLOAT16_NV + && cmp.CType == VK_COMPONENT_TYPE_FLOAT32_NV && cmp.DType == VK_COMPONENT_TYPE_FLOAT32_NV + && cmp.scope == VK_SCOPE_SUBGROUP_NV) + { + gpu_info.support_cooperative_matrix_16_8_8 = true; + } + if (cmp.MSize == 16 && cmp.NSize == 8 && cmp.KSize == 16 + && cmp.AType == VK_COMPONENT_TYPE_FLOAT16_NV && cmp.BType == VK_COMPONENT_TYPE_FLOAT16_NV + && cmp.CType == VK_COMPONENT_TYPE_FLOAT32_NV && cmp.DType == VK_COMPONENT_TYPE_FLOAT32_NV + && cmp.scope == VK_SCOPE_SUBGROUP_NV) + { + gpu_info.support_cooperative_matrix_16_8_16 = true; + } + if (cmp.MSize == 16 && cmp.NSize == 16 && cmp.KSize == 16 + && cmp.AType == VK_COMPONENT_TYPE_FLOAT16_NV && cmp.BType == VK_COMPONENT_TYPE_FLOAT16_NV + && cmp.CType == VK_COMPONENT_TYPE_FLOAT32_NV && cmp.DType == VK_COMPONENT_TYPE_FLOAT32_NV + && cmp.scope == VK_SCOPE_SUBGROUP_NV) + { + gpu_info.support_cooperative_matrix_16_16_16 = true; + } } } } @@ -1668,10 +1778,14 @@ int create_gpu_instance() gpu_info.support_fp16_packed, gpu_info.support_fp16_storage, gpu_info.support_fp16_arithmetic, gpu_info.support_int8_packed, gpu_info.support_int8_storage, gpu_info.support_int8_arithmetic); - NCNN_LOGE("[%u %s] 
subgroup=%u basic=%d vote=%d ballot=%d shuffle=%d", i, physicalDeviceProperties.deviceName, + NCNN_LOGE("[%u %s] subgroup=%u basic/vote/ballot/shuffle=%d/%d/%d/%d", i, physicalDeviceProperties.deviceName, gpu_info.subgroup_size, gpu_info.support_subgroup_basic, gpu_info.support_subgroup_vote, gpu_info.support_subgroup_ballot, gpu_info.support_subgroup_shuffle); + NCNN_LOGE("[%u %s] fp16-matrix-16_8_8/16_8_16/16_16_16=%d/%d/%d", i, physicalDeviceProperties.deviceName, + gpu_info.support_cooperative_matrix_16_8_8, gpu_info.support_cooperative_matrix_16_8_16, + gpu_info.support_cooperative_matrix_16_16_16); + gpu_info_index++; } @@ -2038,6 +2152,8 @@ VulkanDevice::VulkanDevice(int device_index) enabledExtensions.push_back("VK_KHR_buffer_device_address"); if (info.support_VK_KHR_create_renderpass2()) enabledExtensions.push_back("VK_KHR_create_renderpass2"); + if (info.support_VK_KHR_cooperative_matrix()) + enabledExtensions.push_back("VK_KHR_cooperative_matrix"); if (info.support_VK_KHR_dedicated_allocation()) enabledExtensions.push_back("VK_KHR_dedicated_allocation"); if (info.support_VK_KHR_descriptor_update_template()) @@ -2140,15 +2256,28 @@ VulkanDevice::VulkanDevice(int device_index) } // enable cooperative matrix - VkPhysicalDeviceCooperativeMatrixFeaturesNV queryCooperativeMatrixFeatures; - queryCooperativeMatrixFeatures.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_MATRIX_FEATURES_NV; + VkPhysicalDeviceCooperativeMatrixFeaturesKHR queryCooperativeMatrixFeatures; + queryCooperativeMatrixFeatures.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_MATRIX_FEATURES_KHR; queryCooperativeMatrixFeatures.pNext = 0; queryCooperativeMatrixFeatures.cooperativeMatrix = info.support_cooperative_matrix(); queryCooperativeMatrixFeatures.cooperativeMatrixRobustBufferAccess = VK_FALSE; + VkPhysicalDeviceCooperativeMatrixFeaturesNV queryCooperativeMatrixFeaturesNV; + queryCooperativeMatrixFeaturesNV.sType = 
VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_MATRIX_FEATURES_NV; + queryCooperativeMatrixFeaturesNV.pNext = 0; + queryCooperativeMatrixFeaturesNV.cooperativeMatrix = info.support_cooperative_matrix(); + queryCooperativeMatrixFeaturesNV.cooperativeMatrixRobustBufferAccess = VK_FALSE; if (support_VK_KHR_get_physical_device_properties2 && info.support_cooperative_matrix()) { - queryCooperativeMatrixFeatures.pNext = enabledExtensionFeatures; - enabledExtensionFeatures = &queryCooperativeMatrixFeatures; + if (info.support_VK_KHR_cooperative_matrix()) + { + queryCooperativeMatrixFeatures.pNext = enabledExtensionFeatures; + enabledExtensionFeatures = &queryCooperativeMatrixFeatures; + } + else + { + queryCooperativeMatrixFeaturesNV.pNext = enabledExtensionFeatures; + enabledExtensionFeatures = &queryCooperativeMatrixFeaturesNV; + } } std::vector compute_queue_priorities(info.compute_queue_count(), 1.f); // 0.f ~ 1.f diff --git a/src/gpu.h b/src/gpu.h index 345329f7d47..1eff228e4eb 100644 --- a/src/gpu.h +++ b/src/gpu.h @@ -55,6 +55,9 @@ extern int support_VK_EXT_validation_flags; extern int support_VK_KHR_android_surface; #endif // __ANDROID_API__ >= 26 +// VK_KHR_cooperative_matrix +extern PFN_vkGetPhysicalDeviceCooperativeMatrixPropertiesKHR vkGetPhysicalDeviceCooperativeMatrixPropertiesKHR; + // VK_KHR_external_memory_capabilities extern PFN_vkGetPhysicalDeviceExternalBufferPropertiesKHR vkGetPhysicalDeviceExternalBufferPropertiesKHR; @@ -176,6 +179,8 @@ class NCNN_EXPORT GpuInfo // cooperative matrix feature bool support_cooperative_matrix() const; bool support_cooperative_matrix_16_8_8() const; + bool support_cooperative_matrix_16_8_16() const; + bool support_cooperative_matrix_16_16_16() const; // extension capability int support_VK_KHR_8bit_storage() const; @@ -183,6 +188,7 @@ class NCNN_EXPORT GpuInfo int support_VK_KHR_bind_memory2() const; int support_VK_KHR_buffer_device_address() const; int support_VK_KHR_create_renderpass2() const; + int 
support_VK_KHR_cooperative_matrix() const; int support_VK_KHR_dedicated_allocation() const; int support_VK_KHR_descriptor_update_template() const; int support_VK_KHR_external_memory() const; diff --git a/src/layer/vulkan/convolution_vulkan.cpp b/src/layer/vulkan/convolution_vulkan.cpp index caab40f5000..79879e815af 100644 --- a/src/layer/vulkan/convolution_vulkan.cpp +++ b/src/layer/vulkan/convolution_vulkan.cpp @@ -180,7 +180,8 @@ int Convolution_vulkan::create_pipeline(const Option& _opt) if (opt.use_winograd_convolution && (opt.use_winograd23_convolution || opt.use_winograd43_convolution) && is_conv3x3s1d1 && num_input >= 16 && num_output >= 16) { - bool use_cooperative_matrix = vkdev->info.support_cooperative_matrix_16_8_8() && opt.use_cooperative_matrix && !opt.use_image_storage && !opt.use_shader_pack8 && opt.use_fp16_storage && num_input % 8 == 0 && num_output % 8 == 0; + bool use_cooperative_matrix_16_8_8 = vkdev->info.support_cooperative_matrix_16_8_8() && opt.use_cooperative_matrix && !opt.use_image_storage && !opt.use_shader_pack8 && opt.use_fp16_storage && num_input % 8 == 0 && num_output % 8 == 0; + bool use_cooperative_matrix_16_16_16 = vkdev->info.support_cooperative_matrix_16_16_16() && opt.use_cooperative_matrix && !opt.use_image_storage && !opt.use_shader_pack8 && opt.use_fp16_storage && num_input % 16 == 0 && num_output % 16 == 0; // winograd43 transform kernel if (opt.use_winograd43_convolution) @@ -233,7 +234,7 @@ int Convolution_vulkan::create_pipeline(const Option& _opt) } } - if (use_cooperative_matrix) + if (use_cooperative_matrix_16_8_8) { // src = 36-inch-outch // dst = 8b-8a-inch/8a-outch/8b-36 @@ -260,6 +261,33 @@ int Convolution_vulkan::create_pipeline(const Option& _opt) } } } + else if (use_cooperative_matrix_16_16_16) + { + // src = 36-inch-outch + // dst = 16b-16a-inch/16a-outch/16b-36 + weight_winograd43_data_packed.create(num_input / 16, num_output / 16, 36, (size_t)4 * 16 * 16, 16 * 16); + + for (int k = 0; k < 36; k++) + { + 
float* g00 = weight_winograd43_data_packed.channel(k); + + for (int q = 0; q + (16 - 1) < num_output; q += 16) + { + for (int p = 0; p + (16 - 1) < num_input; p += 16) + { + for (int i = 0; i < 16; i++) + { + for (int j = 0; j < 16; j++) + { + const float* k00 = weight_data_tm.channel(q + j).row(p + i); + g00[0] = k00[k]; + g00++; + } + } + } + } + } + } else { // src = 36-inch-outch @@ -375,16 +403,29 @@ int Convolution_vulkan::create_pipeline(const Option& _opt) if (elempack == 4 && out_elempack == 8) shader_type_index = LayerShaderType::convolution_pack4to8_3x3s1d1_winograd_gemm; if (elempack == 8 && out_elempack == 4) shader_type_index = LayerShaderType::convolution_pack8to4_3x3s1d1_winograd_gemm; - if (use_cooperative_matrix) + if (use_cooperative_matrix_16_8_8) + { + if (vkdev->info.support_VK_KHR_cooperative_matrix()) + shader_type_index = LayerShaderType::convolution_pack4_3x3s1d1_winograd_gemm_khr_cm_16_8_8; + else + shader_type_index = LayerShaderType::convolution_pack4_3x3s1d1_winograd_gemm_nv_cm_16_8_8; + } + else if (use_cooperative_matrix_16_16_16) { - shader_type_index = LayerShaderType::convolution_pack4_3x3s1d1_winograd_gemm_cm_16_8_8; + if (vkdev->info.support_VK_KHR_cooperative_matrix()) + shader_type_index = LayerShaderType::convolution_pack4_3x3s1d1_winograd_gemm_khr_cm_16_16_16; + else + shader_type_index = LayerShaderType::convolution_pack4_3x3s1d1_winograd_gemm_nv_cm_16_16_16; } pipeline_convolution_3x3s1d1_winograd43_gemm = new Pipeline(vkdev); - if (use_cooperative_matrix) + if (use_cooperative_matrix_16_8_8) { - // TODO proper unroll y - pipeline_convolution_3x3s1d1_winograd43_gemm->set_local_size_xyz(32, 4, 1); // 16_8_8 ly*4 + pipeline_convolution_3x3s1d1_winograd43_gemm->set_local_size_xyz(32, 1, 1); + } + else if (use_cooperative_matrix_16_16_16) + { + pipeline_convolution_3x3s1d1_winograd43_gemm->set_local_size_xyz(32, 1, 1); } else if (opt.use_shader_local_memory) { @@ -471,7 +512,7 @@ int Convolution_vulkan::create_pipeline(const 
Option& _opt) } } - if (use_cooperative_matrix) + if (use_cooperative_matrix_16_8_8) { // src = 16-inch-outch // dst = 8b-8a-inch/8a-outch/8b-16 @@ -498,6 +539,33 @@ int Convolution_vulkan::create_pipeline(const Option& _opt) } } } + else if (use_cooperative_matrix_16_16_16) + { + // src = 16-inch-outch + // dst = 16b-16a-inch/16a-outch/16b-16 + weight_winograd23_data_packed.create(num_input / 16, num_output / 16, 16, (size_t)4 * 16 * 16, 16 * 16); + + for (int k = 0; k < 16; k++) + { + float* g00 = weight_winograd23_data_packed.channel(k); + + for (int q = 0; q + (16 - 1) < num_output; q += 16) + { + for (int p = 0; p + (16 - 1) < num_input; p += 16) + { + for (int i = 0; i < 16; i++) + { + for (int j = 0; j < 16; j++) + { + const float* k00 = weight_data_tm.channel(q + j).row(p + i); + g00[0] = k00[k]; + g00++; + } + } + } + } + } + } else { // src = 16-inch-outch @@ -613,16 +681,29 @@ int Convolution_vulkan::create_pipeline(const Option& _opt) if (elempack == 4 && out_elempack == 8) shader_type_index = LayerShaderType::convolution_pack4to8_3x3s1d1_winograd_gemm; if (elempack == 8 && out_elempack == 4) shader_type_index = LayerShaderType::convolution_pack8to4_3x3s1d1_winograd_gemm; - if (use_cooperative_matrix) + if (use_cooperative_matrix_16_8_8) + { + if (vkdev->info.support_VK_KHR_cooperative_matrix()) + shader_type_index = LayerShaderType::convolution_pack4_3x3s1d1_winograd_gemm_khr_cm_16_8_8; + else + shader_type_index = LayerShaderType::convolution_pack4_3x3s1d1_winograd_gemm_nv_cm_16_8_8; + } + else if (use_cooperative_matrix_16_16_16) { - shader_type_index = LayerShaderType::convolution_pack4_3x3s1d1_winograd_gemm_cm_16_8_8; + if (vkdev->info.support_VK_KHR_cooperative_matrix()) + shader_type_index = LayerShaderType::convolution_pack4_3x3s1d1_winograd_gemm_khr_cm_16_16_16; + else + shader_type_index = LayerShaderType::convolution_pack4_3x3s1d1_winograd_gemm_nv_cm_16_16_16; } pipeline_convolution_3x3s1d1_winograd23_gemm = new Pipeline(vkdev); - if 
(use_cooperative_matrix) + if (use_cooperative_matrix_16_8_8) + { + pipeline_convolution_3x3s1d1_winograd23_gemm->set_local_size_xyz(32, 1, 1); + } + else if (use_cooperative_matrix_16_16_16) { - // TODO proper unroll y - pipeline_convolution_3x3s1d1_winograd23_gemm->set_local_size_xyz(32, 4, 1); // 16_8_8 ly*4 + pipeline_convolution_3x3s1d1_winograd23_gemm->set_local_size_xyz(32, 1, 1); } else if (opt.use_shader_local_memory) { @@ -666,11 +747,12 @@ int Convolution_vulkan::create_pipeline(const Option& _opt) // dst = pa-pb-kw-kh-inch/pa-outch/pb if (opt.use_sgemm_convolution && !is_conv1x1s1d1 && num_input >= 16 && num_output >= 16) { - bool use_cooperative_matrix = vkdev->info.support_cooperative_matrix_16_8_8() && opt.use_cooperative_matrix && !opt.use_image_storage && !opt.use_shader_pack8 && opt.use_fp16_storage && num_input % 8 == 0 && num_output % 8 == 0; - if (use_cooperative_matrix) + bool use_cooperative_matrix_16_8_8 = vkdev->info.support_cooperative_matrix_16_8_8() && opt.use_cooperative_matrix && !opt.use_image_storage && !opt.use_shader_pack8 && opt.use_fp16_storage && num_input % 8 == 0 && num_output % 8 == 0; + bool use_cooperative_matrix_16_16_16 = vkdev->info.support_cooperative_matrix_16_16_16() && opt.use_cooperative_matrix && !opt.use_image_storage && !opt.use_shader_pack8 && opt.use_fp16_storage && num_input % 16 == 0 && num_output % 16 == 0; + + if (use_cooperative_matrix_16_8_8) { // dst = 8b-8a-maxk-inch/8a-outch/8b - // dst = 16b-16a-maxk-inch/16a-outch/16b Mat weight_data_r2 = weight_data.reshape(maxk, num_input, num_output); weight_data_packed.create(maxk * num_input / 8, num_output / 8, (size_t)4 * 8 * 8, 8 * 8); @@ -696,6 +778,34 @@ int Convolution_vulkan::create_pipeline(const Option& _opt) } } } + else if (use_cooperative_matrix_16_16_16) + { + // dst = 16b-16a-maxk-inch/16a-outch/16b + Mat weight_data_r2 = weight_data.reshape(maxk, num_input, num_output); + + weight_data_packed.create(maxk * num_input / 16, num_output / 16, 
(size_t)4 * 16 * 16, 16 * 16); + + for (int q = 0; q + 15 < num_output; q += 16) + { + float* g00 = weight_data_packed.row(q / 16); + + for (int p = 0; p + 15 < num_input; p += 16) + { + for (int k = 0; k < maxk; k++) + { + for (int i = 0; i < 16; i++) + { + for (int j = 0; j < 16; j++) + { + const float* k00 = weight_data_r2.channel(q + j).row(p + i); + g00[0] = k00[k]; + g00++; + } + } + } + } + } + } else { Mat weight_data_r2 = weight_data.reshape(maxk, num_input, num_output); @@ -728,11 +838,12 @@ int Convolution_vulkan::create_pipeline(const Option& _opt) } else { - bool use_cooperative_matrix = vkdev->info.support_cooperative_matrix_16_8_8() && opt.use_cooperative_matrix && is_conv1x1s1d1 && !opt.use_image_storage && !opt.use_shader_pack8 && opt.use_fp16_storage && num_input % 8 == 0 && num_output % 8 == 0; - if (use_cooperative_matrix) + bool use_cooperative_matrix_16_8_8 = vkdev->info.support_cooperative_matrix_16_8_8() && opt.use_cooperative_matrix && is_conv1x1s1d1 && !opt.use_image_storage && !opt.use_shader_pack8 && opt.use_fp16_storage && num_input % 8 == 0 && num_output % 8 == 0; + bool use_cooperative_matrix_16_16_16 = vkdev->info.support_cooperative_matrix_16_16_16() && opt.use_cooperative_matrix && is_conv1x1s1d1 && !opt.use_image_storage && !opt.use_shader_pack8 && opt.use_fp16_storage && num_input % 16 == 0 && num_output % 16 == 0; + + if (use_cooperative_matrix_16_8_8) { // dst = 8b-8a-inch/8a-outch/8b - // dst = 16b-16a-inch/16a-outch/16b Mat weight_data_r2 = weight_data.reshape(maxk, num_input, num_output); weight_data_packed.create(maxk, num_input / 8, num_output / 8, (size_t)4 * 8 * 8, 8 * 8); @@ -758,6 +869,34 @@ int Convolution_vulkan::create_pipeline(const Option& _opt) } } } + else if (use_cooperative_matrix_16_16_16) + { + // dst = 16b-16a-inch/16a-outch/16b + Mat weight_data_r2 = weight_data.reshape(maxk, num_input, num_output); + + weight_data_packed.create(maxk, num_input / 16, num_output / 16, (size_t)4 * 16 * 16, 16 * 16); + + for 
(int q = 0; q + 15 < num_output; q += 16) + { + float* g00 = weight_data_packed.channel(q / 16); + + for (int p = 0; p + 15 < num_input; p += 16) + { + for (int k = 0; k < maxk; k++) + { + for (int i = 0; i < 16; i++) + { + for (int j = 0; j < 16; j++) + { + const float* k00 = weight_data_r2.channel(q + j).row(p + i); + g00[0] = k00[k]; + g00++; + } + } + } + } + } + } else { Mat weight_data_r2 = weight_data.reshape(maxk, num_input, num_output); @@ -801,7 +940,8 @@ int Convolution_vulkan::create_pipeline(const Option& _opt) } else if (opt.use_sgemm_convolution && !is_conv1x1s1d1 && num_input >= 16 && num_output >= 16) { - bool use_cooperative_matrix = vkdev->info.support_cooperative_matrix_16_8_8() && opt.use_cooperative_matrix && !opt.use_image_storage && !opt.use_shader_pack8 && opt.use_fp16_storage && num_input % 8 == 0 && num_output % 8 == 0; + bool use_cooperative_matrix_16_8_8 = vkdev->info.support_cooperative_matrix_16_8_8() && opt.use_cooperative_matrix && !opt.use_image_storage && !opt.use_shader_pack8 && opt.use_fp16_storage && num_input % 8 == 0 && num_output % 8 == 0; + bool use_cooperative_matrix_16_16_16 = vkdev->info.support_cooperative_matrix_16_16_16() && opt.use_cooperative_matrix && !opt.use_image_storage && !opt.use_shader_pack8 && opt.use_fp16_storage && num_input % 16 == 0 && num_output % 16 == 0; // check blob shape if (!vkdev->shape_support_image_storage(shape_bordered_packed) || !vkdev->shape_support_image_storage(out_shape_packed)) @@ -856,16 +996,29 @@ int Convolution_vulkan::create_pipeline(const Option& _opt) if (elempack == 4 && out_elempack == 8) shader_type_index = LayerShaderType::convolution_pack4to8_gemm; if (elempack == 8 && out_elempack == 4) shader_type_index = LayerShaderType::convolution_pack8to4_gemm; - if (use_cooperative_matrix) + if (use_cooperative_matrix_16_8_8) { - shader_type_index = LayerShaderType::convolution_pack4_gemm_cm_16_8_8; + if (vkdev->info.support_VK_KHR_cooperative_matrix()) + shader_type_index = 
LayerShaderType::convolution_pack4_gemm_khr_cm_16_8_8; + else + shader_type_index = LayerShaderType::convolution_pack4_gemm_nv_cm_16_8_8; + } + else if (use_cooperative_matrix_16_16_16) + { + if (vkdev->info.support_VK_KHR_cooperative_matrix()) + shader_type_index = LayerShaderType::convolution_pack4_gemm_khr_cm_16_16_16; + else + shader_type_index = LayerShaderType::convolution_pack4_gemm_nv_cm_16_16_16; } pipeline_convolution_gemm = new Pipeline(vkdev); - if (use_cooperative_matrix) + if (use_cooperative_matrix_16_8_8) { - // TODO proper unroll y - pipeline_convolution_gemm->set_local_size_xyz(32, 4, 1); // 16_8_8 ly*4 + pipeline_convolution_gemm->set_local_size_xyz(32, 1, 1); // 16_8_8 + } + else if (use_cooperative_matrix_16_16_16) + { + pipeline_convolution_gemm->set_local_size_xyz(32, 1, 1); // 16_16_16 } else if (opt.use_shader_local_memory) { @@ -879,7 +1032,8 @@ int Convolution_vulkan::create_pipeline(const Option& _opt) } else if (is_conv1x1s1d1) { - bool use_cooperative_matrix = vkdev->info.support_cooperative_matrix_16_8_8() && opt.use_cooperative_matrix && !opt.use_image_storage && !opt.use_shader_pack8 && opt.use_fp16_storage && num_input % 8 == 0 && num_output % 8 == 0; + bool use_cooperative_matrix_16_8_8 = vkdev->info.support_cooperative_matrix_16_8_8() && opt.use_cooperative_matrix && !opt.use_image_storage && !opt.use_shader_pack8 && opt.use_fp16_storage && num_input % 8 == 0 && num_output % 8 == 0; + bool use_cooperative_matrix_16_16_16 = vkdev->info.support_cooperative_matrix_16_16_16() && opt.use_cooperative_matrix && !opt.use_image_storage && !opt.use_shader_pack8 && opt.use_fp16_storage && num_input % 16 == 0 && num_output % 16 == 0; std::vector specializations(4 + 8); specializations[0].i = bias_term; @@ -906,16 +1060,29 @@ int Convolution_vulkan::create_pipeline(const Option& _opt) if (elempack == 4 && out_elempack == 8) shader_type_index = LayerShaderType::convolution_pack4to8_1x1s1d1; if (elempack == 8 && out_elempack == 4) 
shader_type_index = LayerShaderType::convolution_pack8to4_1x1s1d1; - if (use_cooperative_matrix) + if (use_cooperative_matrix_16_8_8) { - shader_type_index = LayerShaderType::convolution_pack4_1x1s1d1_cm_16_8_8; + if (vkdev->info.support_VK_KHR_cooperative_matrix()) + shader_type_index = LayerShaderType::convolution_pack4_1x1s1d1_khr_cm_16_8_8; + else + shader_type_index = LayerShaderType::convolution_pack4_1x1s1d1_nv_cm_16_8_8; + } + else if (use_cooperative_matrix_16_16_16) + { + if (vkdev->info.support_VK_KHR_cooperative_matrix()) + shader_type_index = LayerShaderType::convolution_pack4_1x1s1d1_khr_cm_16_16_16; + else + shader_type_index = LayerShaderType::convolution_pack4_1x1s1d1_nv_cm_16_16_16; } pipeline_convolution_1x1s1d1 = new Pipeline(vkdev); - if (use_cooperative_matrix) + if (use_cooperative_matrix_16_8_8) { - // TODO proper unroll y - pipeline_convolution_1x1s1d1->set_local_size_xyz(32, 4, 1); // 16_8_8 ly*4 + pipeline_convolution_1x1s1d1->set_local_size_xyz(32, 1, 1); // 16_8_8 + } + else if (use_cooperative_matrix_16_16_16) + { + pipeline_convolution_1x1s1d1->set_local_size_xyz(32, 1, 1); // 16_16_16 } else if (opt.use_shader_local_memory) { @@ -1223,7 +1390,8 @@ int Convolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCom if (opt.use_winograd_convolution && (opt.use_winograd23_convolution || opt.use_winograd43_convolution) && is_conv3x3s1d1 && channels * elempack >= 16 && num_output >= 16) { - bool use_cooperative_matrix = vkdev->info.support_cooperative_matrix_16_8_8() && opt.use_cooperative_matrix && !opt.use_image_storage && !opt.use_shader_pack8 && opt.use_fp16_storage && channels * elempack % 8 == 0 && num_output % 8 == 0; + bool use_cooperative_matrix_16_8_8 = vkdev->info.support_cooperative_matrix_16_8_8() && opt.use_cooperative_matrix && !opt.use_image_storage && !opt.use_shader_pack8 && opt.use_fp16_storage && channels * elempack % 8 == 0 && num_output % 8 == 0; + bool use_cooperative_matrix_16_16_16 = 
vkdev->info.support_cooperative_matrix_16_16_16() && opt.use_cooperative_matrix && !opt.use_image_storage && !opt.use_shader_pack8 && opt.use_fp16_storage && channels * elempack % 16 == 0 && num_output % 16 == 0; bool pre_winograd43 = opt.use_winograd43_convolution; if (opt.use_winograd23_convolution) @@ -1233,7 +1401,9 @@ int Convolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCom if (vkdev->info.type() != 0 && (w <= 12 && h <= 12)) pre_winograd43 = false; - if (use_cooperative_matrix && (w <= 18 && h <= 18)) + if (use_cooperative_matrix_16_8_8 && (w <= 18 && h <= 18)) + pre_winograd43 = false; + else if (use_cooperative_matrix_16_16_16 && (w <= 18 && h <= 18)) pre_winograd43 = false; } @@ -1295,10 +1465,16 @@ int Convolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCom dispatcher.h = top_tm_blob.h; dispatcher.c = 36; - if (use_cooperative_matrix) + if (use_cooperative_matrix_16_8_8) { - dispatcher.w = ((top_tm_blob.w + 15) / 16 + 3) / 4 * 32; - dispatcher.h = (top_tm_blob.h + 1) / 2; + dispatcher.w = ((top_tm_blob.w + 15) / 16 + 1) / 2 * 32; + dispatcher.h = ((top_tm_blob.h + 1) / 2 + 3) / 4; + dispatcher.c = 36; + } + else if (use_cooperative_matrix_16_16_16) + { + dispatcher.w = ((top_tm_blob.w + 15) / 16 + 1) / 2 * 32; + dispatcher.h = ((top_tm_blob.h + 3) / 4 + 1) / 2; dispatcher.c = 36; } @@ -1391,10 +1567,16 @@ int Convolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCom dispatcher.h = top_tm_blob.h; dispatcher.c = 16; - if (use_cooperative_matrix) + if (use_cooperative_matrix_16_8_8) + { + dispatcher.w = ((top_tm_blob.w + 15) / 16 + 1) / 2 * 32; + dispatcher.h = ((top_tm_blob.h + 1) / 2 + 3) / 4; + dispatcher.c = 16; + } + else if (use_cooperative_matrix_16_16_16) { - dispatcher.w = ((top_tm_blob.w + 15) / 16 + 3) / 4 * 32; - dispatcher.h = (top_tm_blob.h + 1) / 2; + dispatcher.w = ((top_tm_blob.w + 15) / 16 + 1) / 2 * 32; + dispatcher.h = ((top_tm_blob.h + 3) / 4 + 1) / 2; dispatcher.c = 16; } 
@@ -1434,7 +1616,8 @@ int Convolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCom } if (opt.use_sgemm_convolution && !is_conv1x1s1d1 && channels * elempack >= 16 && num_output >= 16) { - bool use_cooperative_matrix = vkdev->info.support_cooperative_matrix_16_8_8() && opt.use_cooperative_matrix && !opt.use_image_storage && !opt.use_shader_pack8 && opt.use_fp16_storage && channels * elempack % 8 == 0 && num_output % 8 == 0; + bool use_cooperative_matrix_16_8_8 = vkdev->info.support_cooperative_matrix_16_8_8() && opt.use_cooperative_matrix && !opt.use_image_storage && !opt.use_shader_pack8 && opt.use_fp16_storage && channels * elempack % 8 == 0 && num_output % 8 == 0; + bool use_cooperative_matrix_16_16_16 = vkdev->info.support_cooperative_matrix_16_16_16() && opt.use_cooperative_matrix && !opt.use_image_storage && !opt.use_shader_pack8 && opt.use_fp16_storage && channels * elempack % 16 == 0 && num_output % 16 == 0; // gemm top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator); @@ -1462,10 +1645,16 @@ int Convolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCom dispatcher.h = top_blob.c; dispatcher.c = 1; - if (use_cooperative_matrix) + if (use_cooperative_matrix_16_8_8) + { + dispatcher.w = ((top_blob.w * top_blob.h + 15) / 16 + 1) / 2 * 32; + dispatcher.h = ((top_blob.c + 1) / 2 + 3) / 4; + dispatcher.c = 1; + } + else if (use_cooperative_matrix_16_16_16) { - dispatcher.w = ((top_blob.w * top_blob.h + 15) / 16 + 3) / 4 * 32; - dispatcher.h = (top_blob.c + 1) / 2; + dispatcher.w = ((top_blob.w * top_blob.h + 15) / 16 + 1) / 2 * 32; + dispatcher.h = ((top_blob.c + 3) / 4 + 1) / 2; dispatcher.c = 1; } @@ -1475,7 +1664,8 @@ int Convolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCom } if (is_conv1x1s1d1) { - bool use_cooperative_matrix = vkdev->info.support_cooperative_matrix_16_8_8() && opt.use_cooperative_matrix && !opt.use_image_storage && 
!opt.use_shader_pack8 && opt.use_fp16_storage && channels * elempack % 8 == 0 && num_output % 8 == 0; + bool use_cooperative_matrix_16_8_8 = vkdev->info.support_cooperative_matrix_16_8_8() && opt.use_cooperative_matrix && !opt.use_image_storage && !opt.use_shader_pack8 && opt.use_fp16_storage && channels * elempack % 8 == 0 && num_output % 8 == 0; + bool use_cooperative_matrix_16_16_16 = vkdev->info.support_cooperative_matrix_16_16_16() && opt.use_cooperative_matrix && !opt.use_image_storage && !opt.use_shader_pack8 && opt.use_fp16_storage && channels * elempack % 16 == 0 && num_output % 16 == 0; top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator); if (top_blob.empty()) @@ -1502,10 +1692,16 @@ int Convolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCom dispatcher.h = top_blob.c; dispatcher.c = 1; - if (use_cooperative_matrix) + if (use_cooperative_matrix_16_8_8) + { + dispatcher.w = ((top_blob.w * top_blob.h + 15) / 16 + 1) / 2 * 32; + dispatcher.h = ((top_blob.c + 1) / 2 + 3) / 4; + dispatcher.c = 1; + } + else if (use_cooperative_matrix_16_16_16) { - dispatcher.w = ((top_blob.w * top_blob.h + 15) / 16 + 3) / 4 * 32; - dispatcher.h = (top_blob.c + 1) / 2; + dispatcher.w = ((top_blob.w * top_blob.h + 15) / 16 + 1) / 2 * 32; + dispatcher.h = ((top_blob.c + 3) / 4 + 1) / 2; dispatcher.c = 1; } diff --git a/src/layer/vulkan/deconvolution_vulkan.cpp b/src/layer/vulkan/deconvolution_vulkan.cpp index 2480c854b80..30283d211c1 100644 --- a/src/layer/vulkan/deconvolution_vulkan.cpp +++ b/src/layer/vulkan/deconvolution_vulkan.cpp @@ -146,14 +146,14 @@ int Deconvolution_vulkan::create_pipeline(const Option& _opt) if (opt.use_sgemm_convolution) { - bool use_cooperative_matrix = vkdev->info.support_cooperative_matrix_16_8_8() && opt.use_cooperative_matrix && !opt.use_image_storage && !opt.use_shader_pack8 && opt.use_fp16_storage && num_input % 8 == 0 && num_output % 8 == 0; + bool 
use_cooperative_matrix_16_8_8 = vkdev->info.support_cooperative_matrix_16_8_8() && opt.use_cooperative_matrix && !opt.use_image_storage && !opt.use_shader_pack8 && opt.use_fp16_storage && num_input % 8 == 0 && num_output % 8 == 0; + bool use_cooperative_matrix_16_16_16 = vkdev->info.support_cooperative_matrix_16_16_16() && opt.use_cooperative_matrix && !opt.use_image_storage && !opt.use_shader_pack8 && opt.use_fp16_storage && num_input % 16 == 0 && num_output % 16 == 0; // src = kw-kh-inch-outch // dst = pa-pb-inch/pa-kw-kh-outch/pb (sgemm) - if (use_cooperative_matrix) + if (use_cooperative_matrix_16_8_8) { // dst = 8a-8b-inch/8a-maxk-outch/8b - // dst = 16a-16b-inch/16a-maxk-outch/16b Mat weight_data_r2 = weight_data.reshape(maxk, num_input, num_output); weight_data_packed.create(num_input / 8, maxk * num_output / 8, (size_t)4 * 8 * 8, 8 * 8); @@ -179,6 +179,34 @@ int Deconvolution_vulkan::create_pipeline(const Option& _opt) } } } + else if (use_cooperative_matrix_16_16_16) + { + // dst = 16a-16b-inch/16a-maxk-outch/16b + Mat weight_data_r2 = weight_data.reshape(maxk, num_input, num_output); + + weight_data_packed.create(num_input / 16, maxk * num_output / 16, (size_t)4 * 16 * 16, 16 * 16); + + for (int q = 0; q + 15 < num_output; q += 16) + { + for (int k = 0; k < maxk; k++) + { + float* g00 = weight_data_packed.row(q / 16 * maxk + k); + + for (int p = 0; p + 15 < num_input; p += 16) + { + for (int i = 0; i < 16; i++) + { + for (int j = 0; j < 16; j++) + { + const float* k00 = weight_data_r2.channel(q + j).row(p + i); + g00[0] = k00[k]; + g00++; + } + } + } + } + } + } else { Mat weight_data_r2 = weight_data.reshape(maxk, num_input, num_output); @@ -253,16 +281,29 @@ int Deconvolution_vulkan::create_pipeline(const Option& _opt) if (elempack == 4 && out_elempack == 8) shader_type_index = LayerShaderType::deconvolution_pack4to8_gemm; if (elempack == 8 && out_elempack == 4) shader_type_index = LayerShaderType::deconvolution_pack8to4_gemm; - if 
(use_cooperative_matrix) + if (use_cooperative_matrix_16_8_8) { - shader_type_index = LayerShaderType::deconvolution_pack4_gemm_cm_16_8_8; + if (vkdev->info.support_VK_KHR_cooperative_matrix()) + shader_type_index = LayerShaderType::deconvolution_pack4_gemm_khr_cm_16_8_8; + else + shader_type_index = LayerShaderType::deconvolution_pack4_gemm_nv_cm_16_8_8; + } + else if (use_cooperative_matrix_16_16_16) + { + if (vkdev->info.support_VK_KHR_cooperative_matrix()) + shader_type_index = LayerShaderType::deconvolution_pack4_gemm_khr_cm_16_16_16; + else + shader_type_index = LayerShaderType::deconvolution_pack4_gemm_nv_cm_16_16_16; } pipeline_deconvolution_gemm = new Pipeline(vkdev); - if (use_cooperative_matrix) + if (use_cooperative_matrix_16_8_8) + { + pipeline_deconvolution_gemm->set_local_size_xyz(32, 1, 1); // 16_8_8 + } + else if (use_cooperative_matrix_16_16_16) { - // TODO proper unroll y - pipeline_deconvolution_gemm->set_local_size_xyz(32, 4, 1); // 16_8_8 ly*4 + pipeline_deconvolution_gemm->set_local_size_xyz(32, 1, 1); // 16_16_16 } else if (opt.use_shader_local_memory) { @@ -505,7 +546,8 @@ int Deconvolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkC VkMat top_blob_bordered; if (opt.use_sgemm_convolution) { - bool use_cooperative_matrix = vkdev->info.support_cooperative_matrix_16_8_8() && opt.use_cooperative_matrix && !opt.use_image_storage && !opt.use_shader_pack8 && opt.use_fp16_storage && channels * elempack % 8 == 0 && num_output % 8 == 0; + bool use_cooperative_matrix_16_8_8 = vkdev->info.support_cooperative_matrix_16_8_8() && opt.use_cooperative_matrix && !opt.use_image_storage && !opt.use_shader_pack8 && opt.use_fp16_storage && channels * elempack % 8 == 0 && num_output % 8 == 0; + bool use_cooperative_matrix_16_16_16 = vkdev->info.support_cooperative_matrix_16_16_16() && opt.use_cooperative_matrix && !opt.use_image_storage && !opt.use_shader_pack8 && opt.use_fp16_storage && channels * elempack % 16 == 0 && num_output % 16 == 0; 
const int maxk = kernel_w * kernel_h; @@ -534,10 +576,16 @@ int Deconvolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkC dispatcher.h = top_blob_col.h; dispatcher.c = 1; - if (use_cooperative_matrix) + if (use_cooperative_matrix_16_8_8) + { + dispatcher.w = ((top_blob_col.w + 15) / 16 + 1) / 2 * 32; + dispatcher.h = ((top_blob_col.h + 1) / 2 + 3) / 4; + dispatcher.c = 1; + } + else if (use_cooperative_matrix_16_16_16) { - dispatcher.w = ((top_blob_col.w + 15) / 16 + 3) / 4 * 32; - dispatcher.h = (top_blob_col.h + 1) / 2; + dispatcher.w = ((top_blob_col.w + 15) / 16 + 1) / 2 * 32; + dispatcher.h = ((top_blob_col.h + 3) / 4 + 1) / 2; dispatcher.c = 1; } diff --git a/src/layer/vulkan/shader/convolution_pack4_1x1s1d1_cm_16_8_8.comp b/src/layer/vulkan/shader/convolution_pack4_1x1s1d1_cm_16_8_8.comp deleted file mode 100644 index 3d9c5d442c6..00000000000 --- a/src/layer/vulkan/shader/convolution_pack4_1x1s1d1_cm_16_8_8.comp +++ /dev/null @@ -1,257 +0,0 @@ -// Tencent is pleased to support the open source community by making ncnn available. -// -// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. -// -// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except -// in compliance with the License. You may obtain a copy of the License at -// -// https://opensource.org/licenses/BSD-3-Clause -// -// Unless required by applicable law or agreed to in writing, software distributed -// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR -// CONDITIONS OF ANY KIND, either express or implied. See the License for the -// specific language governing permissions and limitations under the License. 
- -#version 450 - -#if NCNN_fp16_storage -#extension GL_EXT_shader_16bit_storage: require -#endif -#if NCNN_fp16_arithmetic -#extension GL_EXT_shader_explicit_arithmetic_types_float16: require -#endif - -#extension GL_GOOGLE_include_directive: enable -#include "vulkan_activation.comp" - -#extension GL_KHR_memory_scope_semantics: require -#extension GL_EXT_shader_explicit_arithmetic_types: require -#extension GL_EXT_shader_explicit_arithmetic_types_float16: require -#extension GL_NV_cooperative_matrix: require - -layout (constant_id = 0) const int bias_term = 0; -layout (constant_id = 1) const int activation_type = 0; -layout (constant_id = 2) const float activation_param_0 = 0; -layout (constant_id = 3) const float activation_param_1 = 0; - -#define shape_constant_id_offset 4 -layout (constant_id = shape_constant_id_offset + 0) const int w = 0; -layout (constant_id = shape_constant_id_offset + 1) const int h = 0; -layout (constant_id = shape_constant_id_offset + 2) const int c = 0; -layout (constant_id = shape_constant_id_offset + 3) const int cstep = 0; - -layout (constant_id = shape_constant_id_offset + 4) const int outw = 0; -layout (constant_id = shape_constant_id_offset + 5) const int outh = 0; -layout (constant_id = shape_constant_id_offset + 6) const int outc = 0; -layout (constant_id = shape_constant_id_offset + 7) const int outcstep = 0; - -layout (binding = 0) readonly buffer bottom_blob { uvec2 bottom_blob_data[]; }; -layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; -layout (binding = 2) readonly buffer weight_blob { uvec2 weight_data[]; }; -layout (binding = 3) readonly buffer bias_blob { uvec2 bias_data[]; }; - -layout (push_constant) uniform parameter -{ - int w; - int h; - int c; - int cstep; - - int outw; - int outh; - int outc; - int outcstep; -} p; - -#define LOCAL_SIZE_Y 4 -#define UNROLL_INCH 4 - -shared uvec2 tmp_v0[UNROLL_INCH * 16*2]; -shared uvec2 tmp_v1[UNROLL_INCH * 16*2]; -shared uvec2 tmp_v2[UNROLL_INCH * 
16*2]; -shared uvec2 tmp_v3[UNROLL_INCH * 16*2]; -shared uvec2 tmp_k[LOCAL_SIZE_Y * UNROLL_INCH * 8*2]; - -void main() -{ - int gx = int(gl_GlobalInvocationID.x) / 32 * 4 * 16; - int gy = int(gl_GlobalInvocationID.y) * 2; - - const int lx = int(gl_LocalInvocationID.x); - const int ly = int(gl_LocalInvocationID.y); - - const int lxd16 = lx / 16; // 0 1 - const int lxm16 = lx % 16; // 0 1 2 3 .... 15 - - const int lxd8 = lx / 8; // 0 1 2 3 - const int lxm8 = lx % 8; // 0 1 2 3 .... 7 - - fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum0; - fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum1; - fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum2; - fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum3; - - if (bias_term == 1) - { - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> bias; - - coopMatLoadNV(bias, bias_data, gy, 0, false); - - sum0 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias); - sum1 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias); - sum2 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias); - sum3 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias); - } - else - { - sum0 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); - sum1 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); - sum2 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); - sum3 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); - } - - int N = psc(c) / 2; - - int z = 0; - for (; z + (UNROLL_INCH - 1) < N; z += UNROLL_INCH) - { - { - int tmp_vi = ly*16*2 + lxm16*2+lxd16; - int v_offset = (z + ly) * 2 * psc(outcstep) + gx + lxd16 * psc(outcstep) + lxm16; - - tmp_v0[tmp_vi] = (gx + lxm16) < psc(outcstep) ? bottom_blob_data[v_offset] : uvec2(0); - tmp_v1[tmp_vi] = (gx + 16 + lxm16) < psc(outcstep) ? bottom_blob_data[v_offset + 16] : uvec2(0); - tmp_v2[tmp_vi] = (gx + 32 + lxm16) < psc(outcstep) ? bottom_blob_data[v_offset + 32] : uvec2(0); - tmp_v3[tmp_vi] = (gx + 48 + lxm16) < psc(outcstep) ? 
bottom_blob_data[v_offset + 48] : uvec2(0); - } - - if (lx < 16) - { - for (int z4 = 0; z4 < UNROLL_INCH; z4++) - { - int tmp_ki = ly*UNROLL_INCH*8*2 + z4*8*2 + lxm8*2+lxd8; - int w_offset = gy * psc(c) * 4 + (z + z4) * 16 + lxm8 * 2 + lxd8; - - tmp_k[tmp_ki] = weight_data[w_offset]; - } - } - - barrier(); - - for (int z4 = 0; z4 < UNROLL_INCH; z4++) - { - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A0; - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A1; - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A2; - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A3; - coopMatLoadNV(A0, tmp_v0, z4*16*2, 2, false); - coopMatLoadNV(A1, tmp_v1, z4*16*2, 2, false); - coopMatLoadNV(A2, tmp_v2, z4*16*2, 2, false); - coopMatLoadNV(A3, tmp_v3, z4*16*2, 2, false); - - fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B; - coopMatLoadNV(B, tmp_k, ly*UNROLL_INCH*8*2 + z4*8*2, 2, false); - - // sum += v * k - sum0 = coopMatMulAddNV(A0, B, sum0); - sum1 = coopMatMulAddNV(A1, B, sum1); - sum2 = coopMatMulAddNV(A2, B, sum2); - sum3 = coopMatMulAddNV(A3, B, sum3); - } - - barrier(); - } - - if (z < N) - { - const int remain = N - z; - - if (ly < remain) - { - int tmp_vi = ly*16*2 + lxm16*2+lxd16; - int v_offset = (z + ly) * 2 * psc(outcstep) + gx + lxd16 * psc(outcstep) + lxm16; - - tmp_v0[tmp_vi] = (gx + lxm16) < psc(outcstep) ? bottom_blob_data[v_offset] : uvec2(0); - tmp_v1[tmp_vi] = (gx + 16 + lxm16) < psc(outcstep) ? bottom_blob_data[v_offset + 16] : uvec2(0); - tmp_v2[tmp_vi] = (gx + 32 + lxm16) < psc(outcstep) ? bottom_blob_data[v_offset + 32] : uvec2(0); - tmp_v3[tmp_vi] = (gx + 48 + lxm16) < psc(outcstep) ? 
bottom_blob_data[v_offset + 48] : uvec2(0); - } - - if (lx < 16) - { - for (int z4 = 0; z4 < remain; z4++) - { - int tmp_ki = ly*UNROLL_INCH*8*2 + z4*8*2 + lxm8*2+lxd8; - int w_offset = gy * psc(c) * 4 + (z + z4) * 16 + lxm8 * 2 + lxd8; - - tmp_k[tmp_ki] = weight_data[w_offset]; - } - } - - barrier(); - - for (int z4 = 0; z4 < remain; z4++) - { - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A0; - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A1; - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A2; - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A3; - coopMatLoadNV(A0, tmp_v0, z4*16*2, 2, false); - coopMatLoadNV(A1, tmp_v1, z4*16*2, 2, false); - coopMatLoadNV(A2, tmp_v2, z4*16*2, 2, false); - coopMatLoadNV(A3, tmp_v3, z4*16*2, 2, false); - - fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B; - coopMatLoadNV(B, tmp_k, ly*UNROLL_INCH*8*2 + z4*8*2, 2, false); - - // sum += v * k - sum0 = coopMatMulAddNV(A0, B, sum0); - sum1 = coopMatMulAddNV(A1, B, sum1); - sum2 = coopMatMulAddNV(A2, B, sum2); - sum3 = coopMatMulAddNV(A3, B, sum3); - } - - barrier(); - } - - if (gx >= psc(outcstep) || gy >= psc(outc)) - return; - - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum0_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum0); - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum1_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum1); - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum2_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum2); - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum3_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum3); - - coopMatStoreNV(sum0_fp16, tmp_v0, ly*16*2, 2, false); - coopMatStoreNV(sum1_fp16, tmp_v1, ly*16*2, 2, false); - coopMatStoreNV(sum2_fp16, tmp_v2, ly*16*2, 2, false); - coopMatStoreNV(sum3_fp16, tmp_v3, ly*16*2, 2, false); - - barrier(); - - { - int tmp_vi = ly*16*2 + lxm16*2+lxd16; - uvec2 sum0_u2 = tmp_v0[tmp_vi]; - uvec2 sum1_u2 = tmp_v1[tmp_vi]; - uvec2 sum2_u2 = tmp_v2[tmp_vi]; - uvec2 sum3_u2 = tmp_v3[tmp_vi]; - - afpvec4 sum0 = afpvec4(unpackHalf2x16(sum0_u2.x), 
unpackHalf2x16(sum0_u2.y)); - afpvec4 sum1 = afpvec4(unpackHalf2x16(sum1_u2.x), unpackHalf2x16(sum1_u2.y)); - afpvec4 sum2 = afpvec4(unpackHalf2x16(sum2_u2.x), unpackHalf2x16(sum2_u2.y)); - afpvec4 sum3 = afpvec4(unpackHalf2x16(sum3_u2.x), unpackHalf2x16(sum3_u2.y)); - - sum0 = activation_afpvec4(sum0, activation_type, activation_param_0, activation_param_1); - sum1 = activation_afpvec4(sum1, activation_type, activation_param_0, activation_param_1); - sum2 = activation_afpvec4(sum2, activation_type, activation_param_0, activation_param_1); - sum3 = activation_afpvec4(sum3, activation_type, activation_param_0, activation_param_1); - - int gi = gy * psc(outcstep) + gx + lxd16 * psc(outcstep) + lxm16; - { - if (gx + lxm16 < psc(outcstep)) buffer_st4(top_blob_data, gi, sum0); - if (gx + 16 + lxm16 < psc(outcstep)) buffer_st4(top_blob_data, gi + 16, sum1); - if (gx + 32 + lxm16 < psc(outcstep)) buffer_st4(top_blob_data, gi + 32, sum2); - if (gx + 48 + lxm16 < psc(outcstep)) buffer_st4(top_blob_data, gi + 48, sum3); - } - } -} diff --git a/src/layer/vulkan/shader/convolution_pack4_1x1s1d1_khr_cm_16_16_16.comp b/src/layer/vulkan/shader/convolution_pack4_1x1s1d1_khr_cm_16_16_16.comp new file mode 100644 index 00000000000..79641acbc40 --- /dev/null +++ b/src/layer/vulkan/shader/convolution_pack4_1x1s1d1_khr_cm_16_16_16.comp @@ -0,0 +1,239 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +#extension GL_GOOGLE_include_directive: enable +#include "vulkan_activation.comp" + +#extension GL_KHR_memory_scope_semantics: require +#extension GL_EXT_shader_explicit_arithmetic_types: require +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#extension GL_KHR_cooperative_matrix: require + +layout (constant_id = 0) const int bias_term = 0; +layout (constant_id = 1) const int activation_type = 0; +layout (constant_id = 2) const float activation_param_0 = 0; +layout (constant_id = 3) const float activation_param_1 = 0; + +#define shape_constant_id_offset 4 +layout (constant_id = shape_constant_id_offset + 0) const int w = 0; +layout (constant_id = shape_constant_id_offset + 1) const int h = 0; +layout (constant_id = shape_constant_id_offset + 2) const int c = 0; +layout (constant_id = shape_constant_id_offset + 3) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 4) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 5) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outcstep = 0; + +layout (binding = 0) readonly buffer bottom_blob { uvec2 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; +layout (binding = 2) readonly buffer weight_blob { uvec2 weight_data[]; }; +layout (binding = 3) readonly buffer bias_blob { uvec2 bias_data[]; }; + +layout (push_constant) uniform parameter +{ + int w; + int h; + int c; + int cstep; + + int outw; + int outh; + int outc; + int outcstep; +} p; + +#define UNROLL_INCH 2 + +shared uvec2 tmp_v0[UNROLL_INCH * 16*4]; 
+shared uvec2 tmp_v1[UNROLL_INCH * 16*4]; +shared uvec2 tmp_k0[UNROLL_INCH * 16*4]; +shared uvec2 tmp_k1[UNROLL_INCH * 16*4]; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x) / 32 * 2 * 16; + int gy = int(gl_GlobalInvocationID.y) * 2 * 4; + + const int lx = int(gl_LocalInvocationID.x); + + const int lxd16 = lx / 16; // 0 1 + const int lxm16 = lx % 16; // 0 1 2 3 .... 15 + + coopmat sum0; + coopmat sum1; + coopmat sum2; + coopmat sum3; + + if (bias_term == 1) + { + coopmat bias0; + coopmat bias1; + + coopMatLoad(bias0, bias_data, gy, 0, gl_CooperativeMatrixLayoutRowMajor); + coopMatLoad(bias1, bias_data, gy + 4, 0, gl_CooperativeMatrixLayoutRowMajor); + + sum0 = coopmat(bias0); + sum1 = coopmat(bias0); + sum2 = coopmat(bias1); + sum3 = coopmat(bias1); + } + else + { + sum0 = coopmat(0.f); + sum1 = coopmat(0.f); + sum2 = coopmat(0.f); + sum3 = coopmat(0.f); + } + + const int N = psc(c) / 4; + + int z = 0; + for (; z + (UNROLL_INCH - 1) < N; z += UNROLL_INCH) + { + { + for (int j = 0; j < 4; j++) + { + const int tmp_i = lxd16*16*4 + lxm16 * 4 + j; + + const int v_offset = ((z + lxd16) * 4 + j) * psc(outcstep) + (gx + lxm16); + + tmp_v0[tmp_i] = (gx + lxm16) < psc(outcstep) ? bottom_blob_data[v_offset] : uvec2(0); + tmp_v1[tmp_i] = (gx + lxm16 + 16) < psc(outcstep) ? 
bottom_blob_data[v_offset + 16] : uvec2(0); + + const int w_offset = gy * psc(c) * 4 + (z + lxd16) * 4 * 16 + (lxm16 * 4 + j); + + tmp_k0[tmp_i] = weight_data[w_offset]; + tmp_k1[tmp_i] = weight_data[w_offset + psc(c) * 16]; + } + } + + barrier(); + + for (int z4 = 0; z4 < UNROLL_INCH; z4++) + { + coopmat A0; + coopmat A1; + coopMatLoad(A0, tmp_v0, z4*16*4, 4, gl_CooperativeMatrixLayoutRowMajor); + coopMatLoad(A1, tmp_v1, z4*16*4, 4, gl_CooperativeMatrixLayoutRowMajor); + + coopmat B0; + coopmat B1; + coopMatLoad(B0, tmp_k0, z4*16*4, 4, gl_CooperativeMatrixLayoutRowMajor); + coopMatLoad(B1, tmp_k1, z4*16*4, 4, gl_CooperativeMatrixLayoutRowMajor); + + // sum += v * k + sum0 = coopMatMulAdd(A0, B0, sum0); + sum1 = coopMatMulAdd(A1, B0, sum1); + sum2 = coopMatMulAdd(A0, B1, sum2); + sum3 = coopMatMulAdd(A1, B1, sum3); + } + + barrier(); + } + + if (z < N) + { + const int remain = N - z; + + if (lxd16 == 0) + { + for (int j = 0; j < 4; j++) + { + const int tmp_i = lxd16*16*4 + lxm16 * 4 + j; + + const int v_offset = ((z + lxd16) * 4 + j) * psc(outcstep) + (gx + lxm16); + + tmp_v0[tmp_i] = (gx + lxm16) < psc(outcstep) ? bottom_blob_data[v_offset] : uvec2(0); + tmp_v1[tmp_i] = (gx + lxm16 + 16) < psc(outcstep) ? 
bottom_blob_data[v_offset + 16] : uvec2(0); + + const int w_offset = gy * psc(c) * 4 + (z + lxd16) * 4 * 16 + (lxm16 * 4 + j); + + tmp_k0[tmp_i] = weight_data[w_offset]; + tmp_k1[tmp_i] = weight_data[w_offset + psc(c) * 16]; + } + } + + barrier(); + + for (int z4 = 0; z4 < remain; z4++) + { + coopmat A0; + coopmat A1; + coopMatLoad(A0, tmp_v0, z4*16*4, 4, gl_CooperativeMatrixLayoutRowMajor); + coopMatLoad(A1, tmp_v1, z4*16*4, 4, gl_CooperativeMatrixLayoutRowMajor); + + coopmat B0; + coopmat B1; + coopMatLoad(B0, tmp_k0, z4*16*4, 4, gl_CooperativeMatrixLayoutRowMajor); + coopMatLoad(B1, tmp_k1, z4*16*4, 4, gl_CooperativeMatrixLayoutRowMajor); + + // sum += v * k + sum0 = coopMatMulAdd(A0, B0, sum0); + sum1 = coopMatMulAdd(A1, B0, sum1); + sum2 = coopMatMulAdd(A0, B1, sum2); + sum3 = coopMatMulAdd(A1, B1, sum3); + } + + barrier(); + } + + if (gx >= psc(outcstep) || gy >= psc(outc)) + return; + + coopmat sum0_fp16 = coopmat(sum0); + coopmat sum1_fp16 = coopmat(sum1); + coopmat sum2_fp16 = coopmat(sum2); + coopmat sum3_fp16 = coopmat(sum3); + + coopMatStore(sum0_fp16, tmp_v0, 0, 4, gl_CooperativeMatrixLayoutRowMajor); + coopMatStore(sum1_fp16, tmp_v1, 0, 4, gl_CooperativeMatrixLayoutRowMajor); + coopMatStore(sum2_fp16, tmp_v0, 16*4, 4, gl_CooperativeMatrixLayoutRowMajor); + coopMatStore(sum3_fp16, tmp_v1, 16*4, 4, gl_CooperativeMatrixLayoutRowMajor); + + barrier(); + + { + for (int j = 0; j < 4; j++) + { + const int tmp_vi = lxm16 * 4 + j + lxd16*16*4; + + uvec2 sum0_u2 = tmp_v0[tmp_vi]; + uvec2 sum1_u2 = tmp_v1[tmp_vi]; + + afpvec4 sum0 = afpvec4(unpackHalf2x16(sum0_u2.x), unpackHalf2x16(sum0_u2.y)); + afpvec4 sum1 = afpvec4(unpackHalf2x16(sum1_u2.x), unpackHalf2x16(sum1_u2.y)); + + sum0 = activation_afpvec4(sum0, activation_type, activation_param_0, activation_param_1); + sum1 = activation_afpvec4(sum1, activation_type, activation_param_0, activation_param_1); + + const int gi = (gy + lxd16 * 4 + j) * psc(outcstep) + (gx + lxm16); + + if (gy + lxd16 * 4 + j < 
psc(outc)) + { + if (gx + lxm16 < psc(outcstep)) buffer_st4(top_blob_data, gi, sum0); + if (gx + lxm16 + 16 < psc(outcstep)) buffer_st4(top_blob_data, gi + 16, sum1); + } + } + } +} diff --git a/src/layer/vulkan/shader/convolution_pack4_1x1s1d1_khr_cm_16_8_8.comp b/src/layer/vulkan/shader/convolution_pack4_1x1s1d1_khr_cm_16_8_8.comp new file mode 100644 index 00000000000..3c82d995202 --- /dev/null +++ b/src/layer/vulkan/shader/convolution_pack4_1x1s1d1_khr_cm_16_8_8.comp @@ -0,0 +1,298 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +#extension GL_GOOGLE_include_directive: enable +#include "vulkan_activation.comp" + +#extension GL_KHR_memory_scope_semantics: require +#extension GL_EXT_shader_explicit_arithmetic_types: require +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#extension GL_KHR_cooperative_matrix: require + +layout (constant_id = 0) const int bias_term = 0; +layout (constant_id = 1) const int activation_type = 0; +layout (constant_id = 2) const float activation_param_0 = 0; +layout (constant_id = 3) const float activation_param_1 = 0; + +#define shape_constant_id_offset 4 +layout (constant_id = shape_constant_id_offset + 0) const int w = 0; +layout (constant_id = shape_constant_id_offset + 1) const int h = 0; +layout (constant_id = shape_constant_id_offset + 2) const int c = 0; +layout (constant_id = shape_constant_id_offset + 3) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 4) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 5) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outcstep = 0; + +layout (binding = 0) readonly buffer bottom_blob { uvec2 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; +layout (binding = 2) readonly buffer weight_blob { uvec2 weight_data[]; }; +layout (binding = 3) readonly buffer bias_blob { uvec2 bias_data[]; }; + +layout (push_constant) uniform parameter +{ + int w; + int h; + int c; + int cstep; + + int outw; + int outh; + int outc; + int outcstep; +} p; + +#define UNROLL_INCH 4 + +shared uvec2 tmp_v0[UNROLL_INCH * 16*2]; +shared uvec2 tmp_v1[UNROLL_INCH * 16*2]; +shared uvec2 tmp_k0[UNROLL_INCH * 8*2]; +shared uvec2 
tmp_k1[UNROLL_INCH * 8*2]; +shared uvec2 tmp_k2[UNROLL_INCH * 8*2]; +shared uvec2 tmp_k3[UNROLL_INCH * 8*2]; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x) / 32 * 2 * 16; + int gy = int(gl_GlobalInvocationID.y) * 2 * 4; + + const int lx = int(gl_LocalInvocationID.x); + + const int lxd8 = lx / 8; // 0 1 2 3 + const int lxm8 = lx % 8; // 0 1 2 3 .... 7 + + coopmat sum0; + coopmat sum1; + coopmat sum2; + coopmat sum3; + coopmat sum4; + coopmat sum5; + coopmat sum6; + coopmat sum7; + + if (bias_term == 1) + { + coopmat bias0; + coopmat bias1; + coopmat bias2; + coopmat bias3; + + coopMatLoad(bias0, bias_data, gy, 0, gl_CooperativeMatrixLayoutRowMajor); + coopMatLoad(bias1, bias_data, gy + 2, 0, gl_CooperativeMatrixLayoutRowMajor); + coopMatLoad(bias2, bias_data, gy + 4, 0, gl_CooperativeMatrixLayoutRowMajor); + coopMatLoad(bias3, bias_data, gy + 6, 0, gl_CooperativeMatrixLayoutRowMajor); + + sum0 = coopmat(bias0); + sum1 = coopmat(bias0); + sum2 = coopmat(bias1); + sum3 = coopmat(bias1); + sum4 = coopmat(bias2); + sum5 = coopmat(bias2); + sum6 = coopmat(bias3); + sum7 = coopmat(bias3); + } + else + { + sum0 = coopmat(0.f); + sum1 = coopmat(0.f); + sum2 = coopmat(0.f); + sum3 = coopmat(0.f); + sum4 = coopmat(0.f); + sum5 = coopmat(0.f); + sum6 = coopmat(0.f); + sum7 = coopmat(0.f); + } + + const int N = psc(c) / 2; + + int z = 0; + for (; z + (UNROLL_INCH - 1) < N; z += UNROLL_INCH) + { + { + for (int j = 0; j < 2; j++) + { + const int tmp_vi = lxd8*16*2 + lxm8 * 2 + j; + + int v_offset = ((z + lxd8) * 2 + j) * psc(outcstep) + (gx + lxm8); + + tmp_v0[tmp_vi] = (gx + lxm8) < psc(outcstep) ? bottom_blob_data[v_offset] : uvec2(0); + tmp_v0[tmp_vi + 16] = (gx + lxm8 + 8) < psc(outcstep) ? bottom_blob_data[v_offset + 8] : uvec2(0); + tmp_v1[tmp_vi] = (gx + lxm8 + 16) < psc(outcstep) ? bottom_blob_data[v_offset + 16] : uvec2(0); + tmp_v1[tmp_vi + 16] = (gx + lxm8 + 24) < psc(outcstep) ? 
bottom_blob_data[v_offset + 24] : uvec2(0); + + const int tmp_ki = lxd8*8*2 + lxm8 * 2 + j; + + int w_offset = gy * psc(c) * 4 + (z + lxd8) * 2 * 8 + (lxm8 * 2 + j); + + tmp_k0[tmp_ki] = weight_data[w_offset]; + tmp_k1[tmp_ki] = weight_data[w_offset + psc(c) * 8]; + tmp_k2[tmp_ki] = weight_data[w_offset + psc(c) * 16]; + tmp_k3[tmp_ki] = weight_data[w_offset + psc(c) * 24]; + } + } + + barrier(); + + for (int z4 = 0; z4 < UNROLL_INCH; z4++) + { + coopmat A0; + coopmat A1; + coopMatLoad(A0, tmp_v0, z4*16*2, 2, gl_CooperativeMatrixLayoutRowMajor); + coopMatLoad(A1, tmp_v1, z4*16*2, 2, gl_CooperativeMatrixLayoutRowMajor); + + coopmat B0; + coopmat B1; + coopmat B2; + coopmat B3; + coopMatLoad(B0, tmp_k0, z4*8*2, 2, gl_CooperativeMatrixLayoutRowMajor); + coopMatLoad(B1, tmp_k1, z4*8*2, 2, gl_CooperativeMatrixLayoutRowMajor); + coopMatLoad(B2, tmp_k2, z4*8*2, 2, gl_CooperativeMatrixLayoutRowMajor); + coopMatLoad(B3, tmp_k3, z4*8*2, 2, gl_CooperativeMatrixLayoutRowMajor); + + // sum += v * k + sum0 = coopMatMulAdd(A0, B0, sum0); + sum1 = coopMatMulAdd(A1, B0, sum1); + sum2 = coopMatMulAdd(A0, B1, sum2); + sum3 = coopMatMulAdd(A1, B1, sum3); + sum4 = coopMatMulAdd(A0, B2, sum4); + sum5 = coopMatMulAdd(A1, B2, sum5); + sum6 = coopMatMulAdd(A0, B3, sum6); + sum7 = coopMatMulAdd(A1, B3, sum7); + } + + barrier(); + } + + if (z < N) + { + const int remain = N - z; + + if (lxd8 < remain) + { + for (int j = 0; j < 2; j++) + { + const int tmp_vi = lxd8*16*2 + lxm8 * 2 + j; + + int v_offset = ((z + lxd8) * 2 + j) * psc(outcstep) + (gx + lxm8); + + tmp_v0[tmp_vi] = (gx + lxm8) < psc(outcstep) ? bottom_blob_data[v_offset] : uvec2(0); + tmp_v0[tmp_vi + 16] = (gx + lxm8 + 8) < psc(outcstep) ? bottom_blob_data[v_offset + 8] : uvec2(0); + tmp_v1[tmp_vi] = (gx + lxm8 + 16) < psc(outcstep) ? bottom_blob_data[v_offset + 16] : uvec2(0); + tmp_v1[tmp_vi + 16] = (gx + lxm8 + 24) < psc(outcstep) ? 
bottom_blob_data[v_offset + 24] : uvec2(0); + + const int tmp_ki = lxd8*8*2 + lxm8 * 2 + j; + + int w_offset = gy * psc(c) * 4 + (z + lxd8) * 2 * 8 + (lxm8 * 2 + j); + + tmp_k0[tmp_ki] = weight_data[w_offset]; + tmp_k1[tmp_ki] = weight_data[w_offset + psc(c) * 8]; + tmp_k2[tmp_ki] = weight_data[w_offset + psc(c) * 16]; + tmp_k3[tmp_ki] = weight_data[w_offset + psc(c) * 24]; + } + } + + barrier(); + + for (int z4 = 0; z4 < remain; z4++) + { + coopmat A0; + coopmat A1; + coopMatLoad(A0, tmp_v0, z4*16*2, 2, gl_CooperativeMatrixLayoutRowMajor); + coopMatLoad(A1, tmp_v1, z4*16*2, 2, gl_CooperativeMatrixLayoutRowMajor); + + coopmat B0; + coopmat B1; + coopmat B2; + coopmat B3; + coopMatLoad(B0, tmp_k0, z4*8*2, 2, gl_CooperativeMatrixLayoutRowMajor); + coopMatLoad(B1, tmp_k1, z4*8*2, 2, gl_CooperativeMatrixLayoutRowMajor); + coopMatLoad(B2, tmp_k2, z4*8*2, 2, gl_CooperativeMatrixLayoutRowMajor); + coopMatLoad(B3, tmp_k3, z4*8*2, 2, gl_CooperativeMatrixLayoutRowMajor); + + // sum += v * k + sum0 = coopMatMulAdd(A0, B0, sum0); + sum1 = coopMatMulAdd(A1, B0, sum1); + sum2 = coopMatMulAdd(A0, B1, sum2); + sum3 = coopMatMulAdd(A1, B1, sum3); + sum4 = coopMatMulAdd(A0, B2, sum4); + sum5 = coopMatMulAdd(A1, B2, sum5); + sum6 = coopMatMulAdd(A0, B3, sum6); + sum7 = coopMatMulAdd(A1, B3, sum7); + } + + barrier(); + } + + if (gx >= psc(outcstep) || gy >= psc(outc)) + return; + + coopmat sum0_fp16 = coopmat(sum0); + coopmat sum1_fp16 = coopmat(sum1); + coopmat sum2_fp16 = coopmat(sum2); + coopmat sum3_fp16 = coopmat(sum3); + coopmat sum4_fp16 = coopmat(sum4); + coopmat sum5_fp16 = coopmat(sum5); + coopmat sum6_fp16 = coopmat(sum6); + coopmat sum7_fp16 = coopmat(sum7); + + coopMatStore(sum0_fp16, tmp_v0, 0, 2, gl_CooperativeMatrixLayoutRowMajor); + coopMatStore(sum1_fp16, tmp_v1, 0, 2, gl_CooperativeMatrixLayoutRowMajor); + coopMatStore(sum2_fp16, tmp_v0, 16*2, 2, gl_CooperativeMatrixLayoutRowMajor); + coopMatStore(sum3_fp16, tmp_v1, 16*2, 2, gl_CooperativeMatrixLayoutRowMajor); + 
coopMatStore(sum4_fp16, tmp_v0, 16*4, 2, gl_CooperativeMatrixLayoutRowMajor); + coopMatStore(sum5_fp16, tmp_v1, 16*4, 2, gl_CooperativeMatrixLayoutRowMajor); + coopMatStore(sum6_fp16, tmp_v0, 16*6, 2, gl_CooperativeMatrixLayoutRowMajor); + coopMatStore(sum7_fp16, tmp_v1, 16*6, 2, gl_CooperativeMatrixLayoutRowMajor); + + barrier(); + + const int lxd16 = lx / 16; // 0 1 + const int lxm16 = lx % 16; // 0 1 2 3 .... 15 + + { + for (int j = 0; j < 4; j++) + { + const int tmp_vi = lxm16 * 2 + lxd16 + j*16*2; + const int gi = (gy + lxd16 + j*2) * psc(outcstep) + (gx + lxm16); + + if (gy + j * 2 + lxd16 < psc(outc)) + { + if (gx + lxm16 < psc(outcstep)) + { + uvec2 sum0_u2 = tmp_v0[tmp_vi]; + afpvec4 sum0 = afpvec4(unpackHalf2x16(sum0_u2.x), unpackHalf2x16(sum0_u2.y)); + sum0 = activation_afpvec4(sum0, activation_type, activation_param_0, activation_param_1); + buffer_st4(top_blob_data, gi, sum0); + } + if (gx + lxm16 + 16 < psc(outcstep)) + { + uvec2 sum1_u2 = tmp_v1[tmp_vi]; + afpvec4 sum1 = afpvec4(unpackHalf2x16(sum1_u2.x), unpackHalf2x16(sum1_u2.y)); + sum1 = activation_afpvec4(sum1, activation_type, activation_param_0, activation_param_1); + buffer_st4(top_blob_data, gi + 16, sum1); + } + } + } + } +} diff --git a/src/layer/vulkan/shader/convolution_pack4_1x1s1d1_nv_cm_16_16_16.comp b/src/layer/vulkan/shader/convolution_pack4_1x1s1d1_nv_cm_16_16_16.comp new file mode 100644 index 00000000000..2c0f57e708c --- /dev/null +++ b/src/layer/vulkan/shader/convolution_pack4_1x1s1d1_nv_cm_16_16_16.comp @@ -0,0 +1,239 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. 
You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +#extension GL_GOOGLE_include_directive: enable +#include "vulkan_activation.comp" + +#extension GL_KHR_memory_scope_semantics: require +#extension GL_EXT_shader_explicit_arithmetic_types: require +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#extension GL_NV_cooperative_matrix: require + +layout (constant_id = 0) const int bias_term = 0; +layout (constant_id = 1) const int activation_type = 0; +layout (constant_id = 2) const float activation_param_0 = 0; +layout (constant_id = 3) const float activation_param_1 = 0; + +#define shape_constant_id_offset 4 +layout (constant_id = shape_constant_id_offset + 0) const int w = 0; +layout (constant_id = shape_constant_id_offset + 1) const int h = 0; +layout (constant_id = shape_constant_id_offset + 2) const int c = 0; +layout (constant_id = shape_constant_id_offset + 3) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 4) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 5) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outcstep = 0; + +layout (binding = 0) readonly buffer bottom_blob { uvec2 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; +layout (binding = 2) readonly buffer weight_blob 
{ uvec2 weight_data[]; }; +layout (binding = 3) readonly buffer bias_blob { uvec2 bias_data[]; }; + +layout (push_constant) uniform parameter +{ + int w; + int h; + int c; + int cstep; + + int outw; + int outh; + int outc; + int outcstep; +} p; + +#define UNROLL_INCH 2 + +shared uvec2 tmp_v0[UNROLL_INCH * 16*4]; +shared uvec2 tmp_v1[UNROLL_INCH * 16*4]; +shared uvec2 tmp_k0[UNROLL_INCH * 16*4]; +shared uvec2 tmp_k1[UNROLL_INCH * 16*4]; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x) / 32 * 2 * 16; + int gy = int(gl_GlobalInvocationID.y) * 2 * 4; + + const int lx = int(gl_LocalInvocationID.x); + + const int lxd16 = lx / 16; // 0 1 + const int lxm16 = lx % 16; // 0 1 2 3 .... 15 + + fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum0; + fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum1; + fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum2; + fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum3; + + if (bias_term == 1) + { + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> bias0; + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> bias1; + + coopMatLoadNV(bias0, bias_data, gy, 0, false); + coopMatLoadNV(bias1, bias_data, gy + 4, 0, false); + + sum0 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(bias0); + sum1 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(bias0); + sum2 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(bias1); + sum3 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(bias1); + } + else + { + sum0 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f); + sum1 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f); + sum2 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f); + sum3 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f); + } + + const int N = psc(c) / 4; + + int z = 0; + for (; z + (UNROLL_INCH - 1) < N; z += UNROLL_INCH) + { + { + for (int j = 0; j < 4; j++) + { + const int tmp_i = lxd16*16*4 + lxm16 * 4 + j; + + const int v_offset = ((z + lxd16) * 4 + j) * psc(outcstep) + (gx + lxm16); + + tmp_v0[tmp_i] = (gx + lxm16) < psc(outcstep) ? 
bottom_blob_data[v_offset] : uvec2(0); + tmp_v1[tmp_i] = (gx + lxm16 + 16) < psc(outcstep) ? bottom_blob_data[v_offset + 16] : uvec2(0); + + const int w_offset = gy * psc(c) * 4 + (z + lxd16) * 4 * 16 + (lxm16 * 4 + j); + + tmp_k0[tmp_i] = weight_data[w_offset]; + tmp_k1[tmp_i] = weight_data[w_offset + psc(c) * 16]; + } + } + + barrier(); + + for (int z4 = 0; z4 < UNROLL_INCH; z4++) + { + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> A0; + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> A1; + coopMatLoadNV(A0, tmp_v0, z4*16*4, 4, false); + coopMatLoadNV(A1, tmp_v1, z4*16*4, 4, false); + + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> B0; + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> B1; + coopMatLoadNV(B0, tmp_k0, z4*16*4, 4, false); + coopMatLoadNV(B1, tmp_k1, z4*16*4, 4, false); + + // sum += v * k + sum0 = coopMatMulAddNV(A0, B0, sum0); + sum1 = coopMatMulAddNV(A1, B0, sum1); + sum2 = coopMatMulAddNV(A0, B1, sum2); + sum3 = coopMatMulAddNV(A1, B1, sum3); + } + + barrier(); + } + + if (z < N) + { + const int remain = N - z; + + if (lxd16 == 0) + { + for (int j = 0; j < 4; j++) + { + const int tmp_i = lxd16*16*4 + lxm16 * 4 + j; + + const int v_offset = ((z + lxd16) * 4 + j) * psc(outcstep) + (gx + lxm16); + + tmp_v0[tmp_i] = (gx + lxm16) < psc(outcstep) ? bottom_blob_data[v_offset] : uvec2(0); + tmp_v1[tmp_i] = (gx + lxm16 + 16) < psc(outcstep) ? 
bottom_blob_data[v_offset + 16] : uvec2(0); + + const int w_offset = gy * psc(c) * 4 + (z + lxd16) * 4 * 16 + (lxm16 * 4 + j); + + tmp_k0[tmp_i] = weight_data[w_offset]; + tmp_k1[tmp_i] = weight_data[w_offset + psc(c) * 16]; + } + } + + barrier(); + + for (int z4 = 0; z4 < remain; z4++) + { + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> A0; + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> A1; + coopMatLoadNV(A0, tmp_v0, z4*16*4, 4, false); + coopMatLoadNV(A1, tmp_v1, z4*16*4, 4, false); + + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> B0; + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> B1; + coopMatLoadNV(B0, tmp_k0, z4*16*4, 4, false); + coopMatLoadNV(B1, tmp_k1, z4*16*4, 4, false); + + // sum += v * k + sum0 = coopMatMulAddNV(A0, B0, sum0); + sum1 = coopMatMulAddNV(A1, B0, sum1); + sum2 = coopMatMulAddNV(A0, B1, sum2); + sum3 = coopMatMulAddNV(A1, B1, sum3); + } + + barrier(); + } + + if (gx >= psc(outcstep) || gy >= psc(outc)) + return; + + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum0_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum0); + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum1_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum1); + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum2_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum2); + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum3_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum3); + + coopMatStoreNV(sum0_fp16, tmp_v0, 0, 4, false); + coopMatStoreNV(sum1_fp16, tmp_v1, 0, 4, false); + coopMatStoreNV(sum2_fp16, tmp_v0, 16*4, 4, false); + coopMatStoreNV(sum3_fp16, tmp_v1, 16*4, 4, false); + + barrier(); + + { + for (int j = 0; j < 4; j++) + { + const int tmp_vi = lxm16 * 4 + j + lxd16*16*4; + + uvec2 sum0_u2 = tmp_v0[tmp_vi]; + uvec2 sum1_u2 = tmp_v1[tmp_vi]; + + afpvec4 sum0 = afpvec4(unpackHalf2x16(sum0_u2.x), unpackHalf2x16(sum0_u2.y)); + afpvec4 sum1 = afpvec4(unpackHalf2x16(sum1_u2.x), unpackHalf2x16(sum1_u2.y)); + + sum0 = activation_afpvec4(sum0, activation_type, activation_param_0, activation_param_1); + sum1 = 
activation_afpvec4(sum1, activation_type, activation_param_0, activation_param_1); + + const int gi = (gy + lxd16 * 4 + j) * psc(outcstep) + (gx + lxm16); + + if (gy + lxd16 * 4 + j < psc(outc)) + { + if (gx + lxm16 < psc(outcstep)) buffer_st4(top_blob_data, gi, sum0); + if (gx + lxm16 + 16 < psc(outcstep)) buffer_st4(top_blob_data, gi + 16, sum1); + } + } + } +} diff --git a/src/layer/vulkan/shader/convolution_pack4_1x1s1d1_nv_cm_16_8_8.comp b/src/layer/vulkan/shader/convolution_pack4_1x1s1d1_nv_cm_16_8_8.comp new file mode 100644 index 00000000000..97322e6ed9e --- /dev/null +++ b/src/layer/vulkan/shader/convolution_pack4_1x1s1d1_nv_cm_16_8_8.comp @@ -0,0 +1,298 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +#extension GL_GOOGLE_include_directive: enable +#include "vulkan_activation.comp" + +#extension GL_KHR_memory_scope_semantics: require +#extension GL_EXT_shader_explicit_arithmetic_types: require +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#extension GL_NV_cooperative_matrix: require + +layout (constant_id = 0) const int bias_term = 0; +layout (constant_id = 1) const int activation_type = 0; +layout (constant_id = 2) const float activation_param_0 = 0; +layout (constant_id = 3) const float activation_param_1 = 0; + +#define shape_constant_id_offset 4 +layout (constant_id = shape_constant_id_offset + 0) const int w = 0; +layout (constant_id = shape_constant_id_offset + 1) const int h = 0; +layout (constant_id = shape_constant_id_offset + 2) const int c = 0; +layout (constant_id = shape_constant_id_offset + 3) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 4) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 5) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outcstep = 0; + +layout (binding = 0) readonly buffer bottom_blob { uvec2 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; +layout (binding = 2) readonly buffer weight_blob { uvec2 weight_data[]; }; +layout (binding = 3) readonly buffer bias_blob { uvec2 bias_data[]; }; + +layout (push_constant) uniform parameter +{ + int w; + int h; + int c; + int cstep; + + int outw; + int outh; + int outc; + int outcstep; +} p; + +#define UNROLL_INCH 4 + +shared uvec2 tmp_v0[UNROLL_INCH * 16*2]; +shared uvec2 tmp_v1[UNROLL_INCH * 16*2]; +shared uvec2 tmp_k0[UNROLL_INCH * 8*2]; +shared uvec2 
tmp_k1[UNROLL_INCH * 8*2]; +shared uvec2 tmp_k2[UNROLL_INCH * 8*2]; +shared uvec2 tmp_k3[UNROLL_INCH * 8*2]; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x) / 32 * 2 * 16; + int gy = int(gl_GlobalInvocationID.y) * 2 * 4; + + const int lx = int(gl_LocalInvocationID.x); + + const int lxd8 = lx / 8; // 0 1 2 3 + const int lxm8 = lx % 8; // 0 1 2 3 .... 7 + + fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum0; + fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum1; + fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum2; + fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum3; + fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum4; + fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum5; + fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum6; + fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum7; + + if (bias_term == 1) + { + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> bias0; + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> bias1; + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> bias2; + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> bias3; + + coopMatLoadNV(bias0, bias_data, gy, 0, false); + coopMatLoadNV(bias1, bias_data, gy + 2, 0, false); + coopMatLoadNV(bias2, bias_data, gy + 4, 0, false); + coopMatLoadNV(bias3, bias_data, gy + 6, 0, false); + + sum0 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias0); + sum1 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias0); + sum2 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias1); + sum3 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias1); + sum4 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias2); + sum5 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias2); + sum6 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias3); + sum7 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias3); + } + else + { + sum0 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); + sum1 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); + sum2 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); + sum3 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); + sum4 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); + sum5 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); + sum6 = 
fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); + sum7 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); + } + + const int N = psc(c) / 2; + + int z = 0; + for (; z + (UNROLL_INCH - 1) < N; z += UNROLL_INCH) + { + { + for (int j = 0; j < 2; j++) + { + const int tmp_vi = lxd8*16*2 + lxm8 * 2 + j; + + int v_offset = ((z + lxd8) * 2 + j) * psc(outcstep) + (gx + lxm8); + + tmp_v0[tmp_vi] = (gx + lxm8) < psc(outcstep) ? bottom_blob_data[v_offset] : uvec2(0); + tmp_v0[tmp_vi + 16] = (gx + lxm8 + 8) < psc(outcstep) ? bottom_blob_data[v_offset + 8] : uvec2(0); + tmp_v1[tmp_vi] = (gx + lxm8 + 16) < psc(outcstep) ? bottom_blob_data[v_offset + 16] : uvec2(0); + tmp_v1[tmp_vi + 16] = (gx + lxm8 + 24) < psc(outcstep) ? bottom_blob_data[v_offset + 24] : uvec2(0); + + const int tmp_ki = lxd8*8*2 + lxm8 * 2 + j; + + int w_offset = gy * psc(c) * 4 + (z + lxd8) * 2 * 8 + (lxm8 * 2 + j); + + tmp_k0[tmp_ki] = weight_data[w_offset]; + tmp_k1[tmp_ki] = weight_data[w_offset + psc(c) * 8]; + tmp_k2[tmp_ki] = weight_data[w_offset + psc(c) * 16]; + tmp_k3[tmp_ki] = weight_data[w_offset + psc(c) * 24]; + } + } + + barrier(); + + for (int z4 = 0; z4 < UNROLL_INCH; z4++) + { + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A0; + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A1; + coopMatLoadNV(A0, tmp_v0, z4*16*2, 2, false); + coopMatLoadNV(A1, tmp_v1, z4*16*2, 2, false); + + fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B0; + fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B1; + fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B2; + fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B3; + coopMatLoadNV(B0, tmp_k0, z4*8*2, 2, false); + coopMatLoadNV(B1, tmp_k1, z4*8*2, 2, false); + coopMatLoadNV(B2, tmp_k2, z4*8*2, 2, false); + coopMatLoadNV(B3, tmp_k3, z4*8*2, 2, false); + + // sum += v * k + sum0 = coopMatMulAddNV(A0, B0, sum0); + sum1 = coopMatMulAddNV(A1, B0, sum1); + sum2 = coopMatMulAddNV(A0, B1, sum2); + sum3 = coopMatMulAddNV(A1, B1, sum3); + sum4 = coopMatMulAddNV(A0, B2, sum4); + sum5 = coopMatMulAddNV(A1, B2, sum5); + sum6 = 
coopMatMulAddNV(A0, B3, sum6); + sum7 = coopMatMulAddNV(A1, B3, sum7); + } + + barrier(); + } + + if (z < N) + { + const int remain = N - z; + + if (lxd8 < remain) + { + for (int j = 0; j < 2; j++) + { + const int tmp_vi = lxd8*16*2 + lxm8 * 2 + j; + + int v_offset = ((z + lxd8) * 2 + j) * psc(outcstep) + (gx + lxm8); + + tmp_v0[tmp_vi] = (gx + lxm8) < psc(outcstep) ? bottom_blob_data[v_offset] : uvec2(0); + tmp_v0[tmp_vi + 16] = (gx + lxm8 + 8) < psc(outcstep) ? bottom_blob_data[v_offset + 8] : uvec2(0); + tmp_v1[tmp_vi] = (gx + lxm8 + 16) < psc(outcstep) ? bottom_blob_data[v_offset + 16] : uvec2(0); + tmp_v1[tmp_vi + 16] = (gx + lxm8 + 24) < psc(outcstep) ? bottom_blob_data[v_offset + 24] : uvec2(0); + + const int tmp_ki = lxd8*8*2 + lxm8 * 2 + j; + + int w_offset = gy * psc(c) * 4 + (z + lxd8) * 2 * 8 + (lxm8 * 2 + j); + + tmp_k0[tmp_ki] = weight_data[w_offset]; + tmp_k1[tmp_ki] = weight_data[w_offset + psc(c) * 8]; + tmp_k2[tmp_ki] = weight_data[w_offset + psc(c) * 16]; + tmp_k3[tmp_ki] = weight_data[w_offset + psc(c) * 24]; + } + } + + barrier(); + + for (int z4 = 0; z4 < remain; z4++) + { + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A0; + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A1; + coopMatLoadNV(A0, tmp_v0, z4*16*2, 2, false); + coopMatLoadNV(A1, tmp_v1, z4*16*2, 2, false); + + fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B0; + fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B1; + fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B2; + fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B3; + coopMatLoadNV(B0, tmp_k0, z4*8*2, 2, false); + coopMatLoadNV(B1, tmp_k1, z4*8*2, 2, false); + coopMatLoadNV(B2, tmp_k2, z4*8*2, 2, false); + coopMatLoadNV(B3, tmp_k3, z4*8*2, 2, false); + + // sum += v * k + sum0 = coopMatMulAddNV(A0, B0, sum0); + sum1 = coopMatMulAddNV(A1, B0, sum1); + sum2 = coopMatMulAddNV(A0, B1, sum2); + sum3 = coopMatMulAddNV(A1, B1, sum3); + sum4 = coopMatMulAddNV(A0, B2, sum4); + sum5 = coopMatMulAddNV(A1, B2, sum5); + sum6 = coopMatMulAddNV(A0, B3, sum6); + sum7 = coopMatMulAddNV(A1, 
B3, sum7); + } + + barrier(); + } + + if (gx >= psc(outcstep) || gy >= psc(outc)) + return; + + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum0_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum0); + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum1_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum1); + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum2_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum2); + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum3_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum3); + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum4_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum4); + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum5_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum5); + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum6_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum6); + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum7_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum7); + + coopMatStoreNV(sum0_fp16, tmp_v0, 0, 2, false); + coopMatStoreNV(sum1_fp16, tmp_v1, 0, 2, false); + coopMatStoreNV(sum2_fp16, tmp_v0, 16*2, 2, false); + coopMatStoreNV(sum3_fp16, tmp_v1, 16*2, 2, false); + coopMatStoreNV(sum4_fp16, tmp_v0, 16*4, 2, false); + coopMatStoreNV(sum5_fp16, tmp_v1, 16*4, 2, false); + coopMatStoreNV(sum6_fp16, tmp_v0, 16*6, 2, false); + coopMatStoreNV(sum7_fp16, tmp_v1, 16*6, 2, false); + + barrier(); + + const int lxd16 = lx / 16; // 0 1 + const int lxm16 = lx % 16; // 0 1 2 3 .... 
15 + + { + for (int j = 0; j < 4; j++) + { + const int tmp_vi = lxm16 * 2 + lxd16 + j*16*2; + const int gi = (gy + lxd16 + j*2) * psc(outcstep) + (gx + lxm16); + + if (gy + j * 2 + lxd16 < psc(outc)) + { + if (gx + lxm16 < psc(outcstep)) + { + uvec2 sum0_u2 = tmp_v0[tmp_vi]; + afpvec4 sum0 = afpvec4(unpackHalf2x16(sum0_u2.x), unpackHalf2x16(sum0_u2.y)); + sum0 = activation_afpvec4(sum0, activation_type, activation_param_0, activation_param_1); + buffer_st4(top_blob_data, gi, sum0); + } + if (gx + lxm16 + 16 < psc(outcstep)) + { + uvec2 sum1_u2 = tmp_v1[tmp_vi]; + afpvec4 sum1 = afpvec4(unpackHalf2x16(sum1_u2.x), unpackHalf2x16(sum1_u2.y)); + sum1 = activation_afpvec4(sum1, activation_type, activation_param_0, activation_param_1); + buffer_st4(top_blob_data, gi + 16, sum1); + } + } + } + } +} diff --git a/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd_gemm_cm_16_8_8.comp b/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd_gemm_cm_16_8_8.comp deleted file mode 100644 index bb72fc76d9e..00000000000 --- a/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd_gemm_cm_16_8_8.comp +++ /dev/null @@ -1,210 +0,0 @@ -// Tencent is pleased to support the open source community by making ncnn available. -// -// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. -// -// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except -// in compliance with the License. You may obtain a copy of the License at -// -// https://opensource.org/licenses/BSD-3-Clause -// -// Unless required by applicable law or agreed to in writing, software distributed -// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR -// CONDITIONS OF ANY KIND, either express or implied. See the License for the -// specific language governing permissions and limitations under the License. 
- -#version 450 - -#if NCNN_fp16_storage -#extension GL_EXT_shader_16bit_storage: require -#endif -#if NCNN_fp16_arithmetic -#extension GL_EXT_shader_explicit_arithmetic_types_float16: require -#endif - -#extension GL_KHR_memory_scope_semantics: require -#extension GL_EXT_shader_explicit_arithmetic_types: require -#extension GL_EXT_shader_explicit_arithmetic_types_float16: require -#extension GL_NV_cooperative_matrix: require - -layout (constant_id = 0) const int batch = 1; - -#define shape_constant_id_offset 1 -layout (constant_id = shape_constant_id_offset + 0) const int c = 0; -layout (constant_id = shape_constant_id_offset + 1) const int cstep = 0; - -layout (constant_id = shape_constant_id_offset + 2) const int outw = 0; -layout (constant_id = shape_constant_id_offset + 3) const int outc = 0; -layout (constant_id = shape_constant_id_offset + 4) const int outcstep = 0; - -layout (binding = 0) readonly buffer bottom_tm_blob { uvec2 bottom_tm_blob_data[]; }; -layout (binding = 1) writeonly buffer top_tm_blob { uvec2 top_tm_blob_data[]; }; -layout (binding = 2) readonly buffer weight_tm_blob { uvec2 weight_tm_data[]; }; - -layout (push_constant) uniform parameter -{ - int c; - int cstep; - - int outw; - int outc; - int outcstep; -} p; - -#define LOCAL_SIZE_Y 4 -#define UNROLL_INCH 4 - -shared uvec2 tmp_v0[UNROLL_INCH * 16*2]; -shared uvec2 tmp_v1[UNROLL_INCH * 16*2]; -shared uvec2 tmp_v2[UNROLL_INCH * 16*2]; -shared uvec2 tmp_v3[UNROLL_INCH * 16*2]; -shared uvec2 tmp_k[LOCAL_SIZE_Y * UNROLL_INCH * 8*2]; - -void main() -{ - int gx = int(gl_GlobalInvocationID.x) / 32 * 4 * 16; - int gy = int(gl_GlobalInvocationID.y) * 2; - int gz = int(gl_GlobalInvocationID.z); - - fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum0 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); - fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum1 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); - fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum2 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); - fcoopmatNV<32, 
gl_ScopeSubgroup, 16, 8> sum3 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); - - const int lx = int(gl_LocalInvocationID.x); - const int ly = int(gl_LocalInvocationID.y); - - const int lxd16 = lx / 16; // 0 1 - const int lxm16 = lx % 16; // 0 1 2 3 .... 15 - - const int lxd8 = lx / 8; // 0 1 2 3 - const int lxm8 = lx % 8; // 0 1 2 3 .... 7 - - int N = psc(c) / 2; - - int z = 0; - for (; z + (UNROLL_INCH - 1) < N; z += UNROLL_INCH) - { - { - int tmp_vi = ly*16*2 + lxm16*2+lxd16; - int v_offset = gz * psc(cstep) + (z + ly) * 2 * psc(outw) + gx + lxd16 * psc(outw) + lxm16; - - tmp_v0[tmp_vi] = (gx + lxm16) < psc(outw) ? bottom_tm_blob_data[v_offset] : uvec2(0); - tmp_v1[tmp_vi] = (gx + 16 + lxm16) < psc(outw) ? bottom_tm_blob_data[v_offset + 16] : uvec2(0); - tmp_v2[tmp_vi] = (gx + 32 + lxm16) < psc(outw) ? bottom_tm_blob_data[v_offset + 32] : uvec2(0); - tmp_v3[tmp_vi] = (gx + 48 + lxm16) < psc(outw) ? bottom_tm_blob_data[v_offset + 48] : uvec2(0); - } - - if (lx < 16) - { - for (int z4 = 0; z4 < UNROLL_INCH; z4++) - { - int tmp_ki = ly*UNROLL_INCH*8*2 + z4*8*2 + lxm8*2+lxd8; - int w_offset = gz * psc(c) * psc(outc) * 4 + gy * psc(c) * 4 + (z + z4) * 16 + lxm8 * 2 + lxd8; - - tmp_k[tmp_ki] = weight_tm_data[w_offset]; - } - } - - barrier(); - - for (int z4 = 0; z4 < UNROLL_INCH; z4++) - { - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A0; - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A1; - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A2; - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A3; - coopMatLoadNV(A0, tmp_v0, z4*16*2, 2, false); - coopMatLoadNV(A1, tmp_v1, z4*16*2, 2, false); - coopMatLoadNV(A2, tmp_v2, z4*16*2, 2, false); - coopMatLoadNV(A3, tmp_v3, z4*16*2, 2, false); - - fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B; - coopMatLoadNV(B, tmp_k, ly*UNROLL_INCH*8*2 + z4*8*2, 2, false); - - // sum += v * k - sum0 = coopMatMulAddNV(A0, B, sum0); - sum1 = coopMatMulAddNV(A1, B, sum1); - sum2 = coopMatMulAddNV(A2, B, sum2); - sum3 = coopMatMulAddNV(A3, B, sum3); - } - - barrier(); - } - - 
if (z < N) - { - const int remain = N - z; - - if (ly < remain) - { - int tmp_vi = ly*16*2 + lxm16*2+lxd16; - int v_offset = gz * psc(cstep) + (z + ly) * 2 * psc(outw) + gx + lxd16 * psc(outw) + lxm16; - - tmp_v0[tmp_vi] = (gx + lxm16) < psc(outw) ? bottom_tm_blob_data[v_offset] : uvec2(0); - tmp_v1[tmp_vi] = (gx + 16 + lxm16) < psc(outw) ? bottom_tm_blob_data[v_offset + 16] : uvec2(0); - tmp_v2[tmp_vi] = (gx + 32 + lxm16) < psc(outw) ? bottom_tm_blob_data[v_offset + 32] : uvec2(0); - tmp_v3[tmp_vi] = (gx + 48 + lxm16) < psc(outw) ? bottom_tm_blob_data[v_offset + 48] : uvec2(0); - } - - if (lx < 16) - { - for (int z4 = 0; z4 < remain; z4++) - { - int tmp_ki = ly*UNROLL_INCH*8*2 + z4*8*2 + lxm8*2+lxd8; - int w_offset = gz * psc(c) * psc(outc) * 4 + gy * psc(c) * 4 + (z + z4) * 16 + lxm8 * 2 + lxd8; - - tmp_k[tmp_ki] = weight_tm_data[w_offset]; - } - } - - barrier(); - - for (int z4 = 0; z4 < remain; z4++) - { - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A0; - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A1; - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A2; - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A3; - coopMatLoadNV(A0, tmp_v0, z4*16*2, 2, false); - coopMatLoadNV(A1, tmp_v1, z4*16*2, 2, false); - coopMatLoadNV(A2, tmp_v2, z4*16*2, 2, false); - coopMatLoadNV(A3, tmp_v3, z4*16*2, 2, false); - - fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B; - coopMatLoadNV(B, tmp_k, ly*UNROLL_INCH*8*2 + z4*8*2, 2, false); - - // sum += v * k - sum0 = coopMatMulAddNV(A0, B, sum0); - sum1 = coopMatMulAddNV(A1, B, sum1); - sum2 = coopMatMulAddNV(A2, B, sum2); - sum3 = coopMatMulAddNV(A3, B, sum3); - } - - barrier(); - } - - if (gx >= psc(outw) || gy >= psc(outc) || gz >= batch) - return; - - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum0_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum0); - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum1_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum1); - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum2_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum2); - fcoopmatNV<16, 
gl_ScopeSubgroup, 16, 8> sum3_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum3); - - coopMatStoreNV(sum0_fp16, tmp_v0, ly*16*2, 2, false); - coopMatStoreNV(sum1_fp16, tmp_v1, ly*16*2, 2, false); - coopMatStoreNV(sum2_fp16, tmp_v2, ly*16*2, 2, false); - coopMatStoreNV(sum3_fp16, tmp_v3, ly*16*2, 2, false); - - barrier(); - - { - int tmp_vi = ly*16*2 + lxm16*2+lxd16; - int gi = gz * psc(outcstep) + gy * psc(outw) + gx + lxd16 * psc(outw) + lxm16; - - if (gx + lxm16 < psc(outw)) top_tm_blob_data[gi] = tmp_v0[tmp_vi]; - if (gx + 16 + lxm16 < psc(outw)) top_tm_blob_data[gi + 16] = tmp_v1[tmp_vi]; - if (gx + 32 + lxm16 < psc(outw)) top_tm_blob_data[gi + 32] = tmp_v2[tmp_vi]; - if (gx + 48 + lxm16 < psc(outw)) top_tm_blob_data[gi + 48] = tmp_v3[tmp_vi]; - } -} diff --git a/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd_gemm_khr_cm_16_16_16.comp b/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd_gemm_khr_cm_16_16_16.comp new file mode 100644 index 00000000000..c4a494e917a --- /dev/null +++ b/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd_gemm_khr_cm_16_16_16.comp @@ -0,0 +1,196 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +#extension GL_KHR_memory_scope_semantics: require +#extension GL_EXT_shader_explicit_arithmetic_types: require +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#extension GL_KHR_cooperative_matrix: require + +layout (constant_id = 0) const int batch = 1; + +#define shape_constant_id_offset 1 +layout (constant_id = shape_constant_id_offset + 0) const int c = 0; +layout (constant_id = shape_constant_id_offset + 1) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 2) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 3) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 4) const int outcstep = 0; + +layout (binding = 0) readonly buffer bottom_tm_blob { uvec2 bottom_tm_blob_data[]; }; +layout (binding = 1) writeonly buffer top_tm_blob { uvec2 top_tm_blob_data[]; }; +layout (binding = 2) readonly buffer weight_tm_blob { uvec2 weight_tm_data[]; }; + +layout (push_constant) uniform parameter +{ + int c; + int cstep; + + int outw; + int outc; + int outcstep; +} p; + +#define UNROLL_INCH 2 + +shared uvec2 tmp_v0[UNROLL_INCH * 16*4]; +shared uvec2 tmp_v1[UNROLL_INCH * 16*4]; +shared uvec2 tmp_k0[UNROLL_INCH * 16*4]; +shared uvec2 tmp_k1[UNROLL_INCH * 16*4]; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x) / 32 * 2 * 16; + int gy = int(gl_GlobalInvocationID.y) * 2 * 4; + int gz = int(gl_GlobalInvocationID.z); + + const int lx = int(gl_LocalInvocationID.x); + + const int lxd16 = lx / 16; // 0 1 + const int lxm16 = lx % 16; // 0 1 2 3 .... 
15 + + coopmat sum0 = coopmat(0.f); + coopmat sum1 = coopmat(0.f); + coopmat sum2 = coopmat(0.f); + coopmat sum3 = coopmat(0.f); + + const int N = psc(c) / 4; + + int z = 0; + for (; z + (UNROLL_INCH - 1) < N; z += UNROLL_INCH) + { + { + for (int j = 0; j < 4; j++) + { + const int tmp_i = lxd16*16*4 + lxm16 * 4 + j; + + const int v_offset = gz * psc(cstep) + ((z + lxd16) * 4 + j) * psc(outw) + (gx + lxm16); + + tmp_v0[tmp_i] = (gx + lxm16) < psc(outw) ? bottom_tm_blob_data[v_offset] : uvec2(0); + tmp_v1[tmp_i] = (gx + lxm16 + 16) < psc(outw) ? bottom_tm_blob_data[v_offset + 16] : uvec2(0); + + const int w_offset = gz * psc(outc) * psc(c) * 4 + gy * psc(c) * 4 + (z + lxd16) * 4 * 16 + (lxm16 * 4 + j); + + tmp_k0[tmp_i] = weight_tm_data[w_offset]; + tmp_k1[tmp_i] = weight_tm_data[w_offset + psc(c) * 16]; + } + } + + barrier(); + + for (int z4 = 0; z4 < UNROLL_INCH; z4++) + { + coopmat A0; + coopmat A1; + coopMatLoad(A0, tmp_v0, z4*16*4, 4, gl_CooperativeMatrixLayoutRowMajor); + coopMatLoad(A1, tmp_v1, z4*16*4, 4, gl_CooperativeMatrixLayoutRowMajor); + + coopmat B0; + coopmat B1; + coopMatLoad(B0, tmp_k0, z4*16*4, 4, gl_CooperativeMatrixLayoutRowMajor); + coopMatLoad(B1, tmp_k1, z4*16*4, 4, gl_CooperativeMatrixLayoutRowMajor); + + // sum += v * k + sum0 = coopMatMulAdd(A0, B0, sum0); + sum1 = coopMatMulAdd(A1, B0, sum1); + sum2 = coopMatMulAdd(A0, B1, sum2); + sum3 = coopMatMulAdd(A1, B1, sum3); + } + + barrier(); + } + + if (z < N) + { + const int remain = N - z; + + if (lxd16 == 0) + { + for (int j = 0; j < 4; j++) + { + const int tmp_i = lxd16*16*4 + lxm16 * 4 + j; + + const int v_offset = gz * psc(cstep) + ((z + lxd16) * 4 + j) * psc(outw) + (gx + lxm16); + + tmp_v0[tmp_i] = (gx + lxm16) < psc(outw) ? bottom_tm_blob_data[v_offset] : uvec2(0); + tmp_v1[tmp_i] = (gx + lxm16 + 16) < psc(outw) ? 
bottom_tm_blob_data[v_offset + 16] : uvec2(0); + + const int w_offset = gz * psc(outc) * psc(c) * 4 + gy * psc(c) * 4 + (z + lxd16) * 4 * 16 + (lxm16 * 4 + j); + + tmp_k0[tmp_i] = weight_tm_data[w_offset]; + tmp_k1[tmp_i] = weight_tm_data[w_offset + psc(c) * 16]; + } + } + + barrier(); + + for (int z4 = 0; z4 < remain; z4++) + { + coopmat A0; + coopmat A1; + coopMatLoad(A0, tmp_v0, z4*16*4, 4, gl_CooperativeMatrixLayoutRowMajor); + coopMatLoad(A1, tmp_v1, z4*16*4, 4, gl_CooperativeMatrixLayoutRowMajor); + + coopmat B0; + coopmat B1; + coopMatLoad(B0, tmp_k0, z4*16*4, 4, gl_CooperativeMatrixLayoutRowMajor); + coopMatLoad(B1, tmp_k1, z4*16*4, 4, gl_CooperativeMatrixLayoutRowMajor); + + // sum += v * k + sum0 = coopMatMulAdd(A0, B0, sum0); + sum1 = coopMatMulAdd(A1, B0, sum1); + sum2 = coopMatMulAdd(A0, B1, sum2); + sum3 = coopMatMulAdd(A1, B1, sum3); + } + + barrier(); + } + + if (gx >= psc(outw) || gy >= psc(outc) || gz >= batch) + return; + + coopmat sum0_fp16 = coopmat(sum0); + coopmat sum1_fp16 = coopmat(sum1); + coopmat sum2_fp16 = coopmat(sum2); + coopmat sum3_fp16 = coopmat(sum3); + + coopMatStore(sum0_fp16, tmp_v0, 0, 4, gl_CooperativeMatrixLayoutRowMajor); + coopMatStore(sum1_fp16, tmp_v1, 0, 4, gl_CooperativeMatrixLayoutRowMajor); + coopMatStore(sum2_fp16, tmp_v0, 16*4, 4, gl_CooperativeMatrixLayoutRowMajor); + coopMatStore(sum3_fp16, tmp_v1, 16*4, 4, gl_CooperativeMatrixLayoutRowMajor); + + barrier(); + + { + for (int j = 0; j < 4; j++) + { + const int tmp_vi = lxm16 * 4 + j + lxd16*16*4; + const int gi = gz * psc(outcstep) + (gy + lxd16 * 4 + j) * psc(outw) + (gx + lxm16); + + if (gy + lxd16 * 4 + j < psc(outc)) + { + if (gx + lxm16 < psc(outw)) top_tm_blob_data[gi] = tmp_v0[tmp_vi]; + if (gx + lxm16 + 16 < psc(outw)) top_tm_blob_data[gi + 16] = tmp_v1[tmp_vi]; + } + } + } +} diff --git a/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd_gemm_khr_cm_16_8_8.comp b/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd_gemm_khr_cm_16_8_8.comp 
new file mode 100644 index 00000000000..785c917bbf4 --- /dev/null +++ b/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd_gemm_khr_cm_16_8_8.comp @@ -0,0 +1,241 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +#extension GL_KHR_memory_scope_semantics: require +#extension GL_EXT_shader_explicit_arithmetic_types: require +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#extension GL_KHR_cooperative_matrix: require + +layout (constant_id = 0) const int batch = 1; + +#define shape_constant_id_offset 1 +layout (constant_id = shape_constant_id_offset + 0) const int c = 0; +layout (constant_id = shape_constant_id_offset + 1) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 2) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 3) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 4) const int outcstep = 0; + +layout (binding = 0) readonly buffer bottom_tm_blob { uvec2 bottom_tm_blob_data[]; }; +layout (binding = 1) writeonly buffer top_tm_blob { uvec2 top_tm_blob_data[]; }; +layout (binding = 2) readonly buffer 
weight_tm_blob { uvec2 weight_tm_data[]; }; + +layout (push_constant) uniform parameter +{ + int c; + int cstep; + + int outw; + int outc; + int outcstep; +} p; + +#define UNROLL_INCH 4 + +shared uvec2 tmp_v0[UNROLL_INCH * 16*2]; +shared uvec2 tmp_v1[UNROLL_INCH * 16*2]; +shared uvec2 tmp_k0[UNROLL_INCH * 8*2]; +shared uvec2 tmp_k1[UNROLL_INCH * 8*2]; +shared uvec2 tmp_k2[UNROLL_INCH * 8*2]; +shared uvec2 tmp_k3[UNROLL_INCH * 8*2]; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x) / 32 * 2 * 16; + int gy = int(gl_GlobalInvocationID.y) * 2 * 4; + int gz = int(gl_GlobalInvocationID.z); + + const int lx = int(gl_LocalInvocationID.x); + + const int lxd8 = lx / 8; // 0 1 2 3 + const int lxm8 = lx % 8; // 0 1 2 3 .... 7 + + coopmat sum0 = coopmat(0.f); + coopmat sum1 = coopmat(0.f); + coopmat sum2 = coopmat(0.f); + coopmat sum3 = coopmat(0.f); + coopmat sum4 = coopmat(0.f); + coopmat sum5 = coopmat(0.f); + coopmat sum6 = coopmat(0.f); + coopmat sum7 = coopmat(0.f); + + const int N = psc(c) / 2; + + int z = 0; + for (; z + (UNROLL_INCH - 1) < N; z += UNROLL_INCH) + { + { + for (int j = 0; j < 2; j++) + { + const int tmp_vi = lxd8*16*2 + lxm8 * 2 + j; + + int v_offset = gz * psc(cstep) + ((z + lxd8) * 2 + j) * psc(outw) + (gx + lxm8); + + tmp_v0[tmp_vi] = (gx + lxm8) < psc(outw) ? bottom_tm_blob_data[v_offset] : uvec2(0); + tmp_v0[tmp_vi + 16] = (gx + lxm8 + 8) < psc(outw) ? bottom_tm_blob_data[v_offset + 8] : uvec2(0); + tmp_v1[tmp_vi] = (gx + lxm8 + 16) < psc(outw) ? bottom_tm_blob_data[v_offset + 16] : uvec2(0); + tmp_v1[tmp_vi + 16] = (gx + lxm8 + 24) < psc(outw) ? 
bottom_tm_blob_data[v_offset + 24] : uvec2(0); + + const int tmp_ki = lxd8*8*2 + lxm8 * 2 + j; + + int w_offset = gz * psc(outc) * psc(c) * 4 + gy * psc(c) * 4 + (z + lxd8) * 2 * 8 + (lxm8 * 2 + j); + + tmp_k0[tmp_ki] = weight_tm_data[w_offset]; + tmp_k1[tmp_ki] = weight_tm_data[w_offset + psc(c) * 8]; + tmp_k2[tmp_ki] = weight_tm_data[w_offset + psc(c) * 16]; + tmp_k3[tmp_ki] = weight_tm_data[w_offset + psc(c) * 24]; + } + } + + barrier(); + + for (int z4 = 0; z4 < UNROLL_INCH; z4++) + { + coopmat A0; + coopmat A1; + coopMatLoad(A0, tmp_v0, z4*16*2, 2, gl_CooperativeMatrixLayoutRowMajor); + coopMatLoad(A1, tmp_v1, z4*16*2, 2, gl_CooperativeMatrixLayoutRowMajor); + + coopmat B0; + coopmat B1; + coopmat B2; + coopmat B3; + coopMatLoad(B0, tmp_k0, z4*8*2, 2, gl_CooperativeMatrixLayoutRowMajor); + coopMatLoad(B1, tmp_k1, z4*8*2, 2, gl_CooperativeMatrixLayoutRowMajor); + coopMatLoad(B2, tmp_k2, z4*8*2, 2, gl_CooperativeMatrixLayoutRowMajor); + coopMatLoad(B3, tmp_k3, z4*8*2, 2, gl_CooperativeMatrixLayoutRowMajor); + + // sum += v * k + sum0 = coopMatMulAdd(A0, B0, sum0); + sum1 = coopMatMulAdd(A1, B0, sum1); + sum2 = coopMatMulAdd(A0, B1, sum2); + sum3 = coopMatMulAdd(A1, B1, sum3); + sum4 = coopMatMulAdd(A0, B2, sum4); + sum5 = coopMatMulAdd(A1, B2, sum5); + sum6 = coopMatMulAdd(A0, B3, sum6); + sum7 = coopMatMulAdd(A1, B3, sum7); + } + + barrier(); + } + + if (z < N) + { + const int remain = N - z; + + if (lxd8 < remain) + { + for (int j = 0; j < 2; j++) + { + const int tmp_vi = lxd8*16*2 + lxm8 * 2 + j; + + int v_offset = gz * psc(cstep) + ((z + lxd8) * 2 + j) * psc(outw) + (gx + lxm8); + + tmp_v0[tmp_vi] = (gx + lxm8) < psc(outw) ? bottom_tm_blob_data[v_offset] : uvec2(0); + tmp_v0[tmp_vi + 16] = (gx + lxm8 + 8) < psc(outw) ? bottom_tm_blob_data[v_offset + 8] : uvec2(0); + tmp_v1[tmp_vi] = (gx + lxm8 + 16) < psc(outw) ? bottom_tm_blob_data[v_offset + 16] : uvec2(0); + tmp_v1[tmp_vi + 16] = (gx + lxm8 + 24) < psc(outw) ? 
bottom_tm_blob_data[v_offset + 24] : uvec2(0); + + const int tmp_ki = lxd8*8*2 + lxm8 * 2 + j; + + int w_offset = gz * psc(outc) * psc(c) * 4 + gy * psc(c) * 4 + (z + lxd8) * 2 * 8 + (lxm8 * 2 + j); + + tmp_k0[tmp_ki] = weight_tm_data[w_offset]; + tmp_k1[tmp_ki] = weight_tm_data[w_offset + psc(c) * 8]; + tmp_k2[tmp_ki] = weight_tm_data[w_offset + psc(c) * 16]; + tmp_k3[tmp_ki] = weight_tm_data[w_offset + psc(c) * 24]; + } + } + + barrier(); + + for (int z4 = 0; z4 < remain; z4++) + { + coopmat A0; + coopmat A1; + coopMatLoad(A0, tmp_v0, z4*16*2, 2, gl_CooperativeMatrixLayoutRowMajor); + coopMatLoad(A1, tmp_v1, z4*16*2, 2, gl_CooperativeMatrixLayoutRowMajor); + + coopmat B0; + coopmat B1; + coopmat B2; + coopmat B3; + coopMatLoad(B0, tmp_k0, z4*8*2, 2, gl_CooperativeMatrixLayoutRowMajor); + coopMatLoad(B1, tmp_k1, z4*8*2, 2, gl_CooperativeMatrixLayoutRowMajor); + coopMatLoad(B2, tmp_k2, z4*8*2, 2, gl_CooperativeMatrixLayoutRowMajor); + coopMatLoad(B3, tmp_k3, z4*8*2, 2, gl_CooperativeMatrixLayoutRowMajor); + + // sum += v * k + sum0 = coopMatMulAdd(A0, B0, sum0); + sum1 = coopMatMulAdd(A1, B0, sum1); + sum2 = coopMatMulAdd(A0, B1, sum2); + sum3 = coopMatMulAdd(A1, B1, sum3); + sum4 = coopMatMulAdd(A0, B2, sum4); + sum5 = coopMatMulAdd(A1, B2, sum5); + sum6 = coopMatMulAdd(A0, B3, sum6); + sum7 = coopMatMulAdd(A1, B3, sum7); + } + + barrier(); + } + + if (gx >= psc(outw) || gy >= psc(outc) || gz >= batch) + return; + + coopmat sum0_fp16 = coopmat(sum0); + coopmat sum1_fp16 = coopmat(sum1); + coopmat sum2_fp16 = coopmat(sum2); + coopmat sum3_fp16 = coopmat(sum3); + coopmat sum4_fp16 = coopmat(sum4); + coopmat sum5_fp16 = coopmat(sum5); + coopmat sum6_fp16 = coopmat(sum6); + coopmat sum7_fp16 = coopmat(sum7); + + coopMatStore(sum0_fp16, tmp_v0, 0, 2, gl_CooperativeMatrixLayoutRowMajor); + coopMatStore(sum1_fp16, tmp_v1, 0, 2, gl_CooperativeMatrixLayoutRowMajor); + coopMatStore(sum2_fp16, tmp_v0, 16*2, 2, gl_CooperativeMatrixLayoutRowMajor); + coopMatStore(sum3_fp16, 
tmp_v1, 16*2, 2, gl_CooperativeMatrixLayoutRowMajor); + coopMatStore(sum4_fp16, tmp_v0, 16*4, 2, gl_CooperativeMatrixLayoutRowMajor); + coopMatStore(sum5_fp16, tmp_v1, 16*4, 2, gl_CooperativeMatrixLayoutRowMajor); + coopMatStore(sum6_fp16, tmp_v0, 16*6, 2, gl_CooperativeMatrixLayoutRowMajor); + coopMatStore(sum7_fp16, tmp_v1, 16*6, 2, gl_CooperativeMatrixLayoutRowMajor); + + barrier(); + + const int lxd16 = lx / 16; // 0 1 + const int lxm16 = lx % 16; // 0 1 2 3 .... 15 + + { + for (int j = 0; j < 4; j++) + { + const int tmp_vi = lxm16 * 2 + lxd16 + j*16*2; + const int gi = gz * psc(outcstep) + (gy + lxd16 + j*2) * psc(outw) + (gx + lxm16); + + if (gy + j * 2 + lxd16 < psc(outc)) + { + if (gx + lxm16 < psc(outw)) top_tm_blob_data[gi] = tmp_v0[tmp_vi]; + if (gx + lxm16 + 16 < psc(outw)) top_tm_blob_data[gi + 16] = tmp_v1[tmp_vi]; + } + } + } +} diff --git a/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd_gemm_nv_cm_16_16_16.comp b/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd_gemm_nv_cm_16_16_16.comp new file mode 100644 index 00000000000..bcca39eb615 --- /dev/null +++ b/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd_gemm_nv_cm_16_16_16.comp @@ -0,0 +1,196 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +#extension GL_KHR_memory_scope_semantics: require +#extension GL_EXT_shader_explicit_arithmetic_types: require +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#extension GL_NV_cooperative_matrix: require + +layout (constant_id = 0) const int batch = 1; + +#define shape_constant_id_offset 1 +layout (constant_id = shape_constant_id_offset + 0) const int c = 0; +layout (constant_id = shape_constant_id_offset + 1) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 2) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 3) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 4) const int outcstep = 0; + +layout (binding = 0) readonly buffer bottom_tm_blob { uvec2 bottom_tm_blob_data[]; }; +layout (binding = 1) writeonly buffer top_tm_blob { uvec2 top_tm_blob_data[]; }; +layout (binding = 2) readonly buffer weight_tm_blob { uvec2 weight_tm_data[]; }; + +layout (push_constant) uniform parameter +{ + int c; + int cstep; + + int outw; + int outc; + int outcstep; +} p; + +#define UNROLL_INCH 2 + +shared uvec2 tmp_v0[UNROLL_INCH * 16*4]; +shared uvec2 tmp_v1[UNROLL_INCH * 16*4]; +shared uvec2 tmp_k0[UNROLL_INCH * 16*4]; +shared uvec2 tmp_k1[UNROLL_INCH * 16*4]; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x) / 32 * 2 * 16; + int gy = int(gl_GlobalInvocationID.y) * 2 * 4; + int gz = int(gl_GlobalInvocationID.z); + + const int lx = int(gl_LocalInvocationID.x); + + const int lxd16 = lx / 16; // 0 1 + const int lxm16 = lx % 16; // 0 1 2 3 .... 
15 + + fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum0 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f); + fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum1 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f); + fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum2 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f); + fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum3 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f); + + const int N = psc(c) / 4; + + int z = 0; + for (; z + (UNROLL_INCH - 1) < N; z += UNROLL_INCH) + { + { + for (int j = 0; j < 4; j++) + { + const int tmp_i = lxd16*16*4 + lxm16 * 4 + j; + + const int v_offset = gz * psc(cstep) + ((z + lxd16) * 4 + j) * psc(outw) + (gx + lxm16); + + tmp_v0[tmp_i] = (gx + lxm16) < psc(outw) ? bottom_tm_blob_data[v_offset] : uvec2(0); + tmp_v1[tmp_i] = (gx + lxm16 + 16) < psc(outw) ? bottom_tm_blob_data[v_offset + 16] : uvec2(0); + + const int w_offset = gz * psc(outc) * psc(c) * 4 + gy * psc(c) * 4 + (z + lxd16) * 4 * 16 + (lxm16 * 4 + j); + + tmp_k0[tmp_i] = weight_tm_data[w_offset]; + tmp_k1[tmp_i] = weight_tm_data[w_offset + psc(c) * 16]; + } + } + + barrier(); + + for (int z4 = 0; z4 < UNROLL_INCH; z4++) + { + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> A0; + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> A1; + coopMatLoadNV(A0, tmp_v0, z4*16*4, 4, false); + coopMatLoadNV(A1, tmp_v1, z4*16*4, 4, false); + + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> B0; + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> B1; + coopMatLoadNV(B0, tmp_k0, z4*16*4, 4, false); + coopMatLoadNV(B1, tmp_k1, z4*16*4, 4, false); + + // sum += v * k + sum0 = coopMatMulAddNV(A0, B0, sum0); + sum1 = coopMatMulAddNV(A1, B0, sum1); + sum2 = coopMatMulAddNV(A0, B1, sum2); + sum3 = coopMatMulAddNV(A1, B1, sum3); + } + + barrier(); + } + + if (z < N) + { + const int remain = N - z; + + if (lxd16 == 0) + { + for (int j = 0; j < 4; j++) + { + const int tmp_i = lxd16*16*4 + lxm16 * 4 + j; + + const int v_offset = gz * psc(cstep) + ((z + lxd16) * 4 + j) * psc(outw) + (gx + lxm16); + + tmp_v0[tmp_i] 
= (gx + lxm16) < psc(outw) ? bottom_tm_blob_data[v_offset] : uvec2(0); + tmp_v1[tmp_i] = (gx + lxm16 + 16) < psc(outw) ? bottom_tm_blob_data[v_offset + 16] : uvec2(0); + + const int w_offset = gz * psc(outc) * psc(c) * 4 + gy * psc(c) * 4 + (z + lxd16) * 4 * 16 + (lxm16 * 4 + j); + + tmp_k0[tmp_i] = weight_tm_data[w_offset]; + tmp_k1[tmp_i] = weight_tm_data[w_offset + psc(c) * 16]; + } + } + + barrier(); + + for (int z4 = 0; z4 < remain; z4++) + { + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> A0; + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> A1; + coopMatLoadNV(A0, tmp_v0, z4*16*4, 4, false); + coopMatLoadNV(A1, tmp_v1, z4*16*4, 4, false); + + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> B0; + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> B1; + coopMatLoadNV(B0, tmp_k0, z4*16*4, 4, false); + coopMatLoadNV(B1, tmp_k1, z4*16*4, 4, false); + + // sum += v * k + sum0 = coopMatMulAddNV(A0, B0, sum0); + sum1 = coopMatMulAddNV(A1, B0, sum1); + sum2 = coopMatMulAddNV(A0, B1, sum2); + sum3 = coopMatMulAddNV(A1, B1, sum3); + } + + barrier(); + } + + if (gx >= psc(outw) || gy >= psc(outc) || gz >= batch) + return; + + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum0_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum0); + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum1_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum1); + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum2_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum2); + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum3_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum3); + + coopMatStoreNV(sum0_fp16, tmp_v0, 0, 4, false); + coopMatStoreNV(sum1_fp16, tmp_v1, 0, 4, false); + coopMatStoreNV(sum2_fp16, tmp_v0, 16*4, 4, false); + coopMatStoreNV(sum3_fp16, tmp_v1, 16*4, 4, false); + + barrier(); + + { + for (int j = 0; j < 4; j++) + { + const int tmp_vi = lxm16 * 4 + j + lxd16*16*4; + const int gi = gz * psc(outcstep) + (gy + lxd16 * 4 + j) * psc(outw) + (gx + lxm16); + + if (gy + lxd16 * 4 + j < psc(outc)) + { + if (gx + lxm16 < psc(outw)) 
top_tm_blob_data[gi] = tmp_v0[tmp_vi]; + if (gx + lxm16 + 16 < psc(outw)) top_tm_blob_data[gi + 16] = tmp_v1[tmp_vi]; + } + } + } +} diff --git a/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd_gemm_nv_cm_16_8_8.comp b/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd_gemm_nv_cm_16_8_8.comp new file mode 100644 index 00000000000..35d3b4faba5 --- /dev/null +++ b/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd_gemm_nv_cm_16_8_8.comp @@ -0,0 +1,241 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +#extension GL_KHR_memory_scope_semantics: require +#extension GL_EXT_shader_explicit_arithmetic_types: require +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#extension GL_NV_cooperative_matrix: require + +layout (constant_id = 0) const int batch = 1; + +#define shape_constant_id_offset 1 +layout (constant_id = shape_constant_id_offset + 0) const int c = 0; +layout (constant_id = shape_constant_id_offset + 1) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 2) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 3) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 4) const int outcstep = 0; + +layout (binding = 0) readonly buffer bottom_tm_blob { uvec2 bottom_tm_blob_data[]; }; +layout (binding = 1) writeonly buffer top_tm_blob { uvec2 top_tm_blob_data[]; }; +layout (binding = 2) readonly buffer weight_tm_blob { uvec2 weight_tm_data[]; }; + +layout (push_constant) uniform parameter +{ + int c; + int cstep; + + int outw; + int outc; + int outcstep; +} p; + +#define UNROLL_INCH 4 + +shared uvec2 tmp_v0[UNROLL_INCH * 16*2]; +shared uvec2 tmp_v1[UNROLL_INCH * 16*2]; +shared uvec2 tmp_k0[UNROLL_INCH * 8*2]; +shared uvec2 tmp_k1[UNROLL_INCH * 8*2]; +shared uvec2 tmp_k2[UNROLL_INCH * 8*2]; +shared uvec2 tmp_k3[UNROLL_INCH * 8*2]; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x) / 32 * 2 * 16; + int gy = int(gl_GlobalInvocationID.y) * 2 * 4; + int gz = int(gl_GlobalInvocationID.z); + + const int lx = int(gl_LocalInvocationID.x); + + const int lxd8 = lx / 8; // 0 1 2 3 + const int lxm8 = lx % 8; // 0 1 2 3 .... 
7 + + fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum0 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); + fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum1 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); + fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum2 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); + fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum3 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); + fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum4 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); + fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum5 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); + fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum6 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); + fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum7 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); + + const int N = psc(c) / 2; + + int z = 0; + for (; z + (UNROLL_INCH - 1) < N; z += UNROLL_INCH) + { + { + for (int j = 0; j < 2; j++) + { + const int tmp_vi = lxd8*16*2 + lxm8 * 2 + j; + + int v_offset = gz * psc(cstep) + ((z + lxd8) * 2 + j) * psc(outw) + (gx + lxm8); + + tmp_v0[tmp_vi] = (gx + lxm8) < psc(outw) ? bottom_tm_blob_data[v_offset] : uvec2(0); + tmp_v0[tmp_vi + 16] = (gx + lxm8 + 8) < psc(outw) ? bottom_tm_blob_data[v_offset + 8] : uvec2(0); + tmp_v1[tmp_vi] = (gx + lxm8 + 16) < psc(outw) ? bottom_tm_blob_data[v_offset + 16] : uvec2(0); + tmp_v1[tmp_vi + 16] = (gx + lxm8 + 24) < psc(outw) ? 
bottom_tm_blob_data[v_offset + 24] : uvec2(0); + + const int tmp_ki = lxd8*8*2 + lxm8 * 2 + j; + + int w_offset = gz * psc(outc) * psc(c) * 4 + gy * psc(c) * 4 + (z + lxd8) * 2 * 8 + (lxm8 * 2 + j); + + tmp_k0[tmp_ki] = weight_tm_data[w_offset]; + tmp_k1[tmp_ki] = weight_tm_data[w_offset + psc(c) * 8]; + tmp_k2[tmp_ki] = weight_tm_data[w_offset + psc(c) * 16]; + tmp_k3[tmp_ki] = weight_tm_data[w_offset + psc(c) * 24]; + } + } + + barrier(); + + for (int z4 = 0; z4 < UNROLL_INCH; z4++) + { + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A0; + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A1; + coopMatLoadNV(A0, tmp_v0, z4*16*2, 2, false); + coopMatLoadNV(A1, tmp_v1, z4*16*2, 2, false); + + fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B0; + fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B1; + fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B2; + fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B3; + coopMatLoadNV(B0, tmp_k0, z4*8*2, 2, false); + coopMatLoadNV(B1, tmp_k1, z4*8*2, 2, false); + coopMatLoadNV(B2, tmp_k2, z4*8*2, 2, false); + coopMatLoadNV(B3, tmp_k3, z4*8*2, 2, false); + + // sum += v * k + sum0 = coopMatMulAddNV(A0, B0, sum0); + sum1 = coopMatMulAddNV(A1, B0, sum1); + sum2 = coopMatMulAddNV(A0, B1, sum2); + sum3 = coopMatMulAddNV(A1, B1, sum3); + sum4 = coopMatMulAddNV(A0, B2, sum4); + sum5 = coopMatMulAddNV(A1, B2, sum5); + sum6 = coopMatMulAddNV(A0, B3, sum6); + sum7 = coopMatMulAddNV(A1, B3, sum7); + } + + barrier(); + } + + if (z < N) + { + const int remain = N - z; + + if (lxd8 < remain) + { + for (int j = 0; j < 2; j++) + { + const int tmp_vi = lxd8*16*2 + lxm8 * 2 + j; + + int v_offset = gz * psc(cstep) + ((z + lxd8) * 2 + j) * psc(outw) + (gx + lxm8); + + tmp_v0[tmp_vi] = (gx + lxm8) < psc(outw) ? bottom_tm_blob_data[v_offset] : uvec2(0); + tmp_v0[tmp_vi + 16] = (gx + lxm8 + 8) < psc(outw) ? bottom_tm_blob_data[v_offset + 8] : uvec2(0); + tmp_v1[tmp_vi] = (gx + lxm8 + 16) < psc(outw) ? bottom_tm_blob_data[v_offset + 16] : uvec2(0); + tmp_v1[tmp_vi + 16] = (gx + lxm8 + 24) < psc(outw) ? 
bottom_tm_blob_data[v_offset + 24] : uvec2(0); + + const int tmp_ki = lxd8*8*2 + lxm8 * 2 + j; + + int w_offset = gz * psc(outc) * psc(c) * 4 + gy * psc(c) * 4 + (z + lxd8) * 2 * 8 + (lxm8 * 2 + j); + + tmp_k0[tmp_ki] = weight_tm_data[w_offset]; + tmp_k1[tmp_ki] = weight_tm_data[w_offset + psc(c) * 8]; + tmp_k2[tmp_ki] = weight_tm_data[w_offset + psc(c) * 16]; + tmp_k3[tmp_ki] = weight_tm_data[w_offset + psc(c) * 24]; + } + } + + barrier(); + + for (int z4 = 0; z4 < remain; z4++) + { + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A0; + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A1; + coopMatLoadNV(A0, tmp_v0, z4*16*2, 2, false); + coopMatLoadNV(A1, tmp_v1, z4*16*2, 2, false); + + fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B0; + fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B1; + fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B2; + fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B3; + coopMatLoadNV(B0, tmp_k0, z4*8*2, 2, false); + coopMatLoadNV(B1, tmp_k1, z4*8*2, 2, false); + coopMatLoadNV(B2, tmp_k2, z4*8*2, 2, false); + coopMatLoadNV(B3, tmp_k3, z4*8*2, 2, false); + + // sum += v * k + sum0 = coopMatMulAddNV(A0, B0, sum0); + sum1 = coopMatMulAddNV(A1, B0, sum1); + sum2 = coopMatMulAddNV(A0, B1, sum2); + sum3 = coopMatMulAddNV(A1, B1, sum3); + sum4 = coopMatMulAddNV(A0, B2, sum4); + sum5 = coopMatMulAddNV(A1, B2, sum5); + sum6 = coopMatMulAddNV(A0, B3, sum6); + sum7 = coopMatMulAddNV(A1, B3, sum7); + } + + barrier(); + } + + if (gx >= psc(outw) || gy >= psc(outc) || gz >= batch) + return; + + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum0_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum0); + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum1_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum1); + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum2_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum2); + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum3_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum3); + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum4_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum4); + 
fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum5_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum5); + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum6_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum6); + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum7_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum7); + + coopMatStoreNV(sum0_fp16, tmp_v0, 0, 2, false); + coopMatStoreNV(sum1_fp16, tmp_v1, 0, 2, false); + coopMatStoreNV(sum2_fp16, tmp_v0, 16*2, 2, false); + coopMatStoreNV(sum3_fp16, tmp_v1, 16*2, 2, false); + coopMatStoreNV(sum4_fp16, tmp_v0, 16*4, 2, false); + coopMatStoreNV(sum5_fp16, tmp_v1, 16*4, 2, false); + coopMatStoreNV(sum6_fp16, tmp_v0, 16*6, 2, false); + coopMatStoreNV(sum7_fp16, tmp_v1, 16*6, 2, false); + + barrier(); + + const int lxd16 = lx / 16; // 0 1 + const int lxm16 = lx % 16; // 0 1 2 3 .... 15 + + { + for (int j = 0; j < 4; j++) + { + const int tmp_vi = lxm16 * 2 + lxd16 + j*16*2; + const int gi = gz * psc(outcstep) + (gy + lxd16 + j*2) * psc(outw) + (gx + lxm16); + + if (gy + j * 2 + lxd16 < psc(outc)) + { + if (gx + lxm16 < psc(outw)) top_tm_blob_data[gi] = tmp_v0[tmp_vi]; + if (gx + lxm16 + 16 < psc(outw)) top_tm_blob_data[gi + 16] = tmp_v1[tmp_vi]; + } + } + } +} diff --git a/src/layer/vulkan/shader/convolution_pack4_gemm_cm_16_8_8.comp b/src/layer/vulkan/shader/convolution_pack4_gemm_cm_16_8_8.comp deleted file mode 100644 index 80a0463a9b1..00000000000 --- a/src/layer/vulkan/shader/convolution_pack4_gemm_cm_16_8_8.comp +++ /dev/null @@ -1,294 +0,0 @@ -// Tencent is pleased to support the open source community by making ncnn available. -// -// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. -// -// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except -// in compliance with the License. 
You may obtain a copy of the License at -// -// https://opensource.org/licenses/BSD-3-Clause -// -// Unless required by applicable law or agreed to in writing, software distributed -// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR -// CONDITIONS OF ANY KIND, either express or implied. See the License for the -// specific language governing permissions and limitations under the License. - -#version 450 - -#if NCNN_fp16_storage -#extension GL_EXT_shader_16bit_storage: require -#endif -#if NCNN_fp16_arithmetic -#extension GL_EXT_shader_explicit_arithmetic_types_float16: require -#endif - -#extension GL_GOOGLE_include_directive: enable -#include "vulkan_activation.comp" - -#extension GL_KHR_memory_scope_semantics: require -#extension GL_EXT_shader_explicit_arithmetic_types: require -#extension GL_EXT_shader_explicit_arithmetic_types_float16: require -#extension GL_NV_cooperative_matrix: require - -layout (constant_id = 0) const int kernel_w = 1; -layout (constant_id = 1) const int kernel_h = 1; -layout (constant_id = 2) const int dilation_w = 1; -layout (constant_id = 3) const int dilation_h = 1; -layout (constant_id = 4) const int stride_w = 1; -layout (constant_id = 5) const int stride_h = 1; -layout (constant_id = 6) const int bias_term = 0; -layout (constant_id = 7) const int activation_type = 0; -layout (constant_id = 8) const float activation_param_0 = 0; -layout (constant_id = 9) const float activation_param_1 = 0; - -#define shape_constant_id_offset 10 -layout (constant_id = shape_constant_id_offset + 0) const int w = 0; -layout (constant_id = shape_constant_id_offset + 1) const int h = 0; -layout (constant_id = shape_constant_id_offset + 2) const int c = 0; -layout (constant_id = shape_constant_id_offset + 3) const int cstep = 0; - -layout (constant_id = shape_constant_id_offset + 4) const int outw = 0; -layout (constant_id = shape_constant_id_offset + 5) const int outh = 0; -layout (constant_id = shape_constant_id_offset + 6) 
const int outc = 0; -layout (constant_id = shape_constant_id_offset + 7) const int outcstep = 0; - -layout (binding = 0) readonly buffer bottom_blob { uvec2 bottom_blob_data[]; }; -layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; -layout (binding = 2) readonly buffer weight_blob { uvec2 weight_data[]; }; -layout (binding = 3) readonly buffer bias_blob { uvec2 bias_data[]; }; - -layout (push_constant) uniform parameter -{ - int w; - int h; - int c; - int cstep; - - int outw; - int outh; - int outc; - int outcstep; -} p; - -#define LOCAL_SIZE_Y 4 -#define UNROLL_INCH 4 - -shared uvec2 tmp_v0[UNROLL_INCH * 16*2]; -shared uvec2 tmp_v1[UNROLL_INCH * 16*2]; -shared uvec2 tmp_v2[UNROLL_INCH * 16*2]; -shared uvec2 tmp_v3[UNROLL_INCH * 16*2]; -shared uvec2 tmp_k[LOCAL_SIZE_Y * UNROLL_INCH * 8*2]; - -void main() -{ - int gx = int(gl_GlobalInvocationID.x) / 32 * 4 * 16; - int gy = int(gl_GlobalInvocationID.y) * 2; - - const int outsize = psc(outw) * psc(outh); - - const int lx = int(gl_LocalInvocationID.x); - const int ly = int(gl_LocalInvocationID.y); - - const int lxd16 = lx / 16; // 0 1 - const int lxm16 = lx % 16; // 0 1 2 3 .... 15 - - const int lxd8 = lx / 8; // 0 1 2 3 - const int lxm8 = lx % 8; // 0 1 2 3 .... 
7 - - fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum0; - fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum1; - fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum2; - fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum3; - - if (bias_term == 1) - { - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> bias; - - coopMatLoadNV(bias, bias_data, gy, 0, false); - - sum0 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias); - sum1 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias); - sum2 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias); - sum3 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias); - } - else - { - sum0 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); - sum1 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); - sum2 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); - sum3 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); - } - - const int maxk = kernel_w * kernel_h; - const int N = psc(c) / 2 * maxk; - - int z = 0; - for (; z + (UNROLL_INCH - 1) < N; z += UNROLL_INCH) - { - { - const int sz = (z + ly) / maxk * 2; - const int kk = (z + ly) % maxk; - - const int ky = kk / kernel_w; - const int kx = kk % kernel_w; - - const ivec4 gx16 = gx + ivec4(0, 16, 32, 48) + lxm16; - - const ivec4 sy16 = gx16 / psc(outw); - const ivec4 sx16 = gx16 % psc(outw); - - const ivec4 sxs16 = sx16 * stride_w; - const ivec4 sys16 = sy16 * stride_h; - - int tmp_vi = ly*16*2 + lxm16*2+lxd16; - const ivec4 v_offset = sz * psc(cstep) + (sys16 + ky * dilation_h) * psc(w) + sxs16 + kx * dilation_w + lxd16 * psc(cstep); - - tmp_v0[tmp_vi] = gx16.r < outsize ? bottom_blob_data[v_offset.r] : uvec2(0); - tmp_v1[tmp_vi] = gx16.g < outsize ? bottom_blob_data[v_offset.g] : uvec2(0); - tmp_v2[tmp_vi] = gx16.b < outsize ? bottom_blob_data[v_offset.b] : uvec2(0); - tmp_v3[tmp_vi] = gx16.a < outsize ? 
bottom_blob_data[v_offset.a] : uvec2(0); - } - - if (lx < 16) - { - for (int z4 = 0; z4 < UNROLL_INCH; z4++) - { - int tmp_ki = ly*UNROLL_INCH*8*2 + z4*8*2 + lxm8*2+lxd8; - int w_offset = gy * psc(c) * maxk * 4 + (z + z4) * 16 + lxm8* 2 + lxd8; - - tmp_k[tmp_ki] = weight_data[w_offset]; - } - } - - barrier(); - - for (int z4 = 0; z4 < UNROLL_INCH; z4++) - { - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A0; - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A1; - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A2; - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A3; - coopMatLoadNV(A0, tmp_v0, z4*16*2, 2, false); - coopMatLoadNV(A1, tmp_v1, z4*16*2, 2, false); - coopMatLoadNV(A2, tmp_v2, z4*16*2, 2, false); - coopMatLoadNV(A3, tmp_v3, z4*16*2, 2, false); - - fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B; - coopMatLoadNV(B, tmp_k, ly*UNROLL_INCH*8*2 + z4*8*2, 2, false); - - // sum += v * k - sum0 = coopMatMulAddNV(A0, B, sum0); - sum1 = coopMatMulAddNV(A1, B, sum1); - sum2 = coopMatMulAddNV(A2, B, sum2); - sum3 = coopMatMulAddNV(A3, B, sum3); - } - - barrier(); - } - - if (z < N) - { - const int remain = N - z; - - if (ly < remain) - { - const int sz = (z + ly) / maxk * 2; - const int kk = (z + ly) % maxk; - - const int ky = kk / kernel_w; - const int kx = kk % kernel_w; - - const ivec4 gx16 = gx + ivec4(0, 16, 32, 48) + lxm16; - - const ivec4 sy16 = gx16 / psc(outw); - const ivec4 sx16 = gx16 % psc(outw); - - const ivec4 sxs16 = sx16 * stride_w; - const ivec4 sys16 = sy16 * stride_h; - - int tmp_vi = ly*16*2 + lxm16*2+lxd16; - const ivec4 v_offset = sz * psc(cstep) + (sys16 + ky * dilation_h) * psc(w) + sxs16 + kx * dilation_w + lxd16 * psc(cstep); - - tmp_v0[tmp_vi] = gx16.r < outsize ? bottom_blob_data[v_offset.r] : uvec2(0); - tmp_v1[tmp_vi] = gx16.g < outsize ? bottom_blob_data[v_offset.g] : uvec2(0); - tmp_v2[tmp_vi] = gx16.b < outsize ? bottom_blob_data[v_offset.b] : uvec2(0); - tmp_v3[tmp_vi] = gx16.a < outsize ? 
bottom_blob_data[v_offset.a] : uvec2(0); - } - - if (lx < 16) - { - for (int z4 = 0; z4 < remain; z4++) - { - int tmp_ki = ly*UNROLL_INCH*8*2 + z4*8*2 + lxm8*2+lxd8; - int w_offset = gy * psc(c) * maxk * 4 + (z + z4) * 16 + lxm8* 2 + lxd8; - - tmp_k[tmp_ki] = weight_data[w_offset]; - } - } - - barrier(); - - for (int z4 = 0; z4 < remain; z4++) - { - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A0; - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A1; - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A2; - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A3; - coopMatLoadNV(A0, tmp_v0, z4*16*2, 2, false); - coopMatLoadNV(A1, tmp_v1, z4*16*2, 2, false); - coopMatLoadNV(A2, tmp_v2, z4*16*2, 2, false); - coopMatLoadNV(A3, tmp_v3, z4*16*2, 2, false); - - fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B; - coopMatLoadNV(B, tmp_k, ly*UNROLL_INCH*8*2 + z4*8*2, 2, false); - - // sum += v * k - sum0 = coopMatMulAddNV(A0, B, sum0); - sum1 = coopMatMulAddNV(A1, B, sum1); - sum2 = coopMatMulAddNV(A2, B, sum2); - sum3 = coopMatMulAddNV(A3, B, sum3); - } - - barrier(); - } - - if (gx >= outsize || gy >= psc(outc)) - return; - - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum0_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum0); - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum1_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum1); - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum2_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum2); - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum3_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum3); - - coopMatStoreNV(sum0_fp16, tmp_v0, ly*16*2, 2, false); - coopMatStoreNV(sum1_fp16, tmp_v1, ly*16*2, 2, false); - coopMatStoreNV(sum2_fp16, tmp_v2, ly*16*2, 2, false); - coopMatStoreNV(sum3_fp16, tmp_v3, ly*16*2, 2, false); - - barrier(); - - { - int tmp_vi = ly*16*2 + lxm16*2+lxd16; - uvec2 sum0_u2 = tmp_v0[tmp_vi]; - uvec2 sum1_u2 = tmp_v1[tmp_vi]; - uvec2 sum2_u2 = tmp_v2[tmp_vi]; - uvec2 sum3_u2 = tmp_v3[tmp_vi]; - - afpvec4 sum0 = afpvec4(unpackHalf2x16(sum0_u2.x), unpackHalf2x16(sum0_u2.y)); 
- afpvec4 sum1 = afpvec4(unpackHalf2x16(sum1_u2.x), unpackHalf2x16(sum1_u2.y)); - afpvec4 sum2 = afpvec4(unpackHalf2x16(sum2_u2.x), unpackHalf2x16(sum2_u2.y)); - afpvec4 sum3 = afpvec4(unpackHalf2x16(sum3_u2.x), unpackHalf2x16(sum3_u2.y)); - - sum0 = activation_afpvec4(sum0, activation_type, activation_param_0, activation_param_1); - sum1 = activation_afpvec4(sum1, activation_type, activation_param_0, activation_param_1); - sum2 = activation_afpvec4(sum2, activation_type, activation_param_0, activation_param_1); - sum3 = activation_afpvec4(sum3, activation_type, activation_param_0, activation_param_1); - - int gi = gy * psc(outcstep) + gx + lxd16 * psc(outcstep) + lxm16; - { - if (gx + lxm16 < outsize) buffer_st4(top_blob_data, gi, sum0); - if (gx + 16 + lxm16 < outsize) buffer_st4(top_blob_data, gi + 16, sum1); - if (gx + 32 + lxm16 < outsize) buffer_st4(top_blob_data, gi + 32, sum2); - if (gx + 48 + lxm16 < outsize) buffer_st4(top_blob_data, gi + 48, sum3); - } - } -} diff --git a/src/layer/vulkan/shader/convolution_pack4_gemm_khr_cm_16_16_16.comp b/src/layer/vulkan/shader/convolution_pack4_gemm_khr_cm_16_16_16.comp new file mode 100644 index 00000000000..0e5d83acf57 --- /dev/null +++ b/src/layer/vulkan/shader/convolution_pack4_gemm_khr_cm_16_16_16.comp @@ -0,0 +1,276 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +#extension GL_GOOGLE_include_directive: enable +#include "vulkan_activation.comp" + +#extension GL_KHR_memory_scope_semantics: require +#extension GL_EXT_shader_explicit_arithmetic_types: require +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#extension GL_KHR_cooperative_matrix: require + +layout (constant_id = 0) const int kernel_w = 1; +layout (constant_id = 1) const int kernel_h = 1; +layout (constant_id = 2) const int dilation_w = 1; +layout (constant_id = 3) const int dilation_h = 1; +layout (constant_id = 4) const int stride_w = 1; +layout (constant_id = 5) const int stride_h = 1; +layout (constant_id = 6) const int bias_term = 0; +layout (constant_id = 7) const int activation_type = 0; +layout (constant_id = 8) const float activation_param_0 = 0; +layout (constant_id = 9) const float activation_param_1 = 0; + +#define shape_constant_id_offset 10 +layout (constant_id = shape_constant_id_offset + 0) const int w = 0; +layout (constant_id = shape_constant_id_offset + 1) const int h = 0; +layout (constant_id = shape_constant_id_offset + 2) const int c = 0; +layout (constant_id = shape_constant_id_offset + 3) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 4) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 5) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outcstep = 0; + +layout (binding = 0) readonly buffer bottom_blob { uvec2 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; +layout (binding = 2) readonly buffer weight_blob { uvec2 
weight_data[]; }; +layout (binding = 3) readonly buffer bias_blob { uvec2 bias_data[]; }; + +layout (push_constant) uniform parameter +{ + int w; + int h; + int c; + int cstep; + + int outw; + int outh; + int outc; + int outcstep; +} p; + +#define UNROLL_INCH 2 + +shared uvec2 tmp_v0[UNROLL_INCH * 16*4]; +shared uvec2 tmp_v1[UNROLL_INCH * 16*4]; +shared uvec2 tmp_k0[UNROLL_INCH * 16*4]; +shared uvec2 tmp_k1[UNROLL_INCH * 16*4]; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x) / 32 * 2 * 16; + int gy = int(gl_GlobalInvocationID.y) * 2 * 4; + + const int outsize = psc(outw) * psc(outh); + + const int lx = int(gl_LocalInvocationID.x); + + const int lxd16 = lx / 16; // 0 1 + const int lxm16 = lx % 16; // 0 1 2 3 .... 15 + + coopmat sum0; + coopmat sum1; + coopmat sum2; + coopmat sum3; + + if (bias_term == 1) + { + coopmat bias0; + coopmat bias1; + + coopMatLoad(bias0, bias_data, gy, 0, gl_CooperativeMatrixLayoutRowMajor); + coopMatLoad(bias1, bias_data, gy + 4, 0, gl_CooperativeMatrixLayoutRowMajor); + + sum0 = coopmat(bias0); + sum1 = coopmat(bias0); + sum2 = coopmat(bias1); + sum3 = coopmat(bias1); + } + else + { + sum0 = coopmat(0.f); + sum1 = coopmat(0.f); + sum2 = coopmat(0.f); + sum3 = coopmat(0.f); + } + + const int maxk = kernel_w * kernel_h; + const int N = psc(c) / 4 * maxk; + + int z = 0; + for (; z + (UNROLL_INCH - 1) < N; z += UNROLL_INCH) + { + { + for (int j = 0; j < 4; j++) + { + const int tmp_i = lxd16*16*4 + lxm16 * 4 + j; + + const int sz = (z + lxd16) / maxk; + const int kk = (z + lxd16) % maxk; + + const int ky = kk / kernel_w; + const int kx = kk % kernel_w; + + const ivec2 gx16 = gx + lxm16 + ivec2(0, 16); + + const ivec2 sy16 = gx16 / psc(outw); + const ivec2 sx16 = gx16 % psc(outw); + + const ivec2 sxs16 = sx16 * stride_w; + const ivec2 sys16 = sy16 * stride_h; + + const ivec2 v_offset = (sz * 4 + j) * psc(cstep) + (sys16 + ky * dilation_h) * psc(w) + sxs16 + kx * dilation_w; + + tmp_v0[tmp_i] = gx16.r < psc(outcstep) ? 
bottom_blob_data[v_offset.r] : uvec2(0); + tmp_v1[tmp_i] = gx16.g < psc(outcstep) ? bottom_blob_data[v_offset.g] : uvec2(0); + + int w_offset = gy * psc(c) * 4 * maxk + (z + lxd16) * 4 * 16 + (lxm16 * 4 + j); + + tmp_k0[tmp_i] = weight_data[w_offset]; + tmp_k1[tmp_i] = weight_data[w_offset + psc(c) * maxk * 16]; + } + } + + barrier(); + + for (int z4 = 0; z4 < UNROLL_INCH; z4++) + { + coopmat A0; + coopmat A1; + coopMatLoad(A0, tmp_v0, z4*16*4, 4, gl_CooperativeMatrixLayoutRowMajor); + coopMatLoad(A1, tmp_v1, z4*16*4, 4, gl_CooperativeMatrixLayoutRowMajor); + + coopmat B0; + coopmat B1; + coopMatLoad(B0, tmp_k0, z4*16*4, 4, gl_CooperativeMatrixLayoutRowMajor); + coopMatLoad(B1, tmp_k1, z4*16*4, 4, gl_CooperativeMatrixLayoutRowMajor); + + // sum += v * k + sum0 = coopMatMulAdd(A0, B0, sum0); + sum1 = coopMatMulAdd(A1, B0, sum1); + sum2 = coopMatMulAdd(A0, B1, sum2); + sum3 = coopMatMulAdd(A1, B1, sum3); + } + + barrier(); + } + + if (z < N) + { + const int remain = N - z; + + if (lxd16 == 0) + { + for (int j = 0; j < 4; j++) + { + const int tmp_i = lxd16*16*4 + lxm16 * 4 + j; + + const int sz = (z + lxd16) / maxk; + const int kk = (z + lxd16) % maxk; + + const int ky = kk / kernel_w; + const int kx = kk % kernel_w; + + const ivec2 gx16 = gx + lxm16 + ivec2(0, 16); + + const ivec2 sy16 = gx16 / psc(outw); + const ivec2 sx16 = gx16 % psc(outw); + + const ivec2 sxs16 = sx16 * stride_w; + const ivec2 sys16 = sy16 * stride_h; + + const ivec2 v_offset = (sz * 4 + j) * psc(cstep) + (sys16 + ky * dilation_h) * psc(w) + sxs16 + kx * dilation_w; + + tmp_v0[tmp_i] = gx16.r < psc(outcstep) ? bottom_blob_data[v_offset.r] : uvec2(0); + tmp_v1[tmp_i] = gx16.g < psc(outcstep) ? 
bottom_blob_data[v_offset.g] : uvec2(0); + + int w_offset = gy * psc(c) * 4 * maxk + (z + lxd16) * 4 * 16 + (lxm16 * 4 + j); + + tmp_k0[tmp_i] = weight_data[w_offset]; + tmp_k1[tmp_i] = weight_data[w_offset + psc(c) * maxk * 16]; + } + } + + barrier(); + + for (int z4 = 0; z4 < remain; z4++) + { + coopmat A0; + coopmat A1; + coopMatLoad(A0, tmp_v0, z4*16*4, 4, gl_CooperativeMatrixLayoutRowMajor); + coopMatLoad(A1, tmp_v1, z4*16*4, 4, gl_CooperativeMatrixLayoutRowMajor); + + coopmat B0; + coopmat B1; + coopMatLoad(B0, tmp_k0, z4*16*4, 4, gl_CooperativeMatrixLayoutRowMajor); + coopMatLoad(B1, tmp_k1, z4*16*4, 4, gl_CooperativeMatrixLayoutRowMajor); + + // sum += v * k + sum0 = coopMatMulAdd(A0, B0, sum0); + sum1 = coopMatMulAdd(A1, B0, sum1); + sum2 = coopMatMulAdd(A0, B1, sum2); + sum3 = coopMatMulAdd(A1, B1, sum3); + } + + barrier(); + } + + if (gx >= outsize || gy >= psc(outc)) + return; + + coopmat sum0_fp16 = coopmat(sum0); + coopmat sum1_fp16 = coopmat(sum1); + coopmat sum2_fp16 = coopmat(sum2); + coopmat sum3_fp16 = coopmat(sum3); + + coopMatStore(sum0_fp16, tmp_v0, 0, 4, gl_CooperativeMatrixLayoutRowMajor); + coopMatStore(sum1_fp16, tmp_v1, 0, 4, gl_CooperativeMatrixLayoutRowMajor); + coopMatStore(sum2_fp16, tmp_v0, 16*4, 4, gl_CooperativeMatrixLayoutRowMajor); + coopMatStore(sum3_fp16, tmp_v1, 16*4, 4, gl_CooperativeMatrixLayoutRowMajor); + + barrier(); + + { + for (int j = 0; j < 4; j++) + { + const int tmp_vi = lxm16 * 4 + j + lxd16*16*4; + + uvec2 sum0_u2 = tmp_v0[tmp_vi]; + uvec2 sum1_u2 = tmp_v1[tmp_vi]; + + afpvec4 sum0 = afpvec4(unpackHalf2x16(sum0_u2.x), unpackHalf2x16(sum0_u2.y)); + afpvec4 sum1 = afpvec4(unpackHalf2x16(sum1_u2.x), unpackHalf2x16(sum1_u2.y)); + + sum0 = activation_afpvec4(sum0, activation_type, activation_param_0, activation_param_1); + sum1 = activation_afpvec4(sum1, activation_type, activation_param_0, activation_param_1); + + const int gi = (gy + lxd16 * 4 + j) * psc(outcstep) + (gx + lxm16); + + if (gy + lxd16 * 4 + j < 
psc(outc)) + { + if (gx + lxm16 < psc(outcstep)) buffer_st4(top_blob_data, gi, sum0); + if (gx + lxm16 + 16 < psc(outcstep)) buffer_st4(top_blob_data, gi + 16, sum1); + } + } + } +} diff --git a/src/layer/vulkan/shader/convolution_pack4_gemm_khr_cm_16_8_8.comp b/src/layer/vulkan/shader/convolution_pack4_gemm_khr_cm_16_8_8.comp new file mode 100644 index 00000000000..2fc6d199c05 --- /dev/null +++ b/src/layer/vulkan/shader/convolution_pack4_gemm_khr_cm_16_8_8.comp @@ -0,0 +1,335 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +#extension GL_GOOGLE_include_directive: enable +#include "vulkan_activation.comp" + +#extension GL_KHR_memory_scope_semantics: require +#extension GL_EXT_shader_explicit_arithmetic_types: require +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#extension GL_KHR_cooperative_matrix: require + +layout (constant_id = 0) const int kernel_w = 1; +layout (constant_id = 1) const int kernel_h = 1; +layout (constant_id = 2) const int dilation_w = 1; +layout (constant_id = 3) const int dilation_h = 1; +layout (constant_id = 4) const int stride_w = 1; +layout (constant_id = 5) const int stride_h = 1; +layout (constant_id = 6) const int bias_term = 0; +layout (constant_id = 7) const int activation_type = 0; +layout (constant_id = 8) const float activation_param_0 = 0; +layout (constant_id = 9) const float activation_param_1 = 0; + +#define shape_constant_id_offset 10 +layout (constant_id = shape_constant_id_offset + 0) const int w = 0; +layout (constant_id = shape_constant_id_offset + 1) const int h = 0; +layout (constant_id = shape_constant_id_offset + 2) const int c = 0; +layout (constant_id = shape_constant_id_offset + 3) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 4) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 5) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outcstep = 0; + +layout (binding = 0) readonly buffer bottom_blob { uvec2 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; +layout (binding = 2) readonly buffer weight_blob { uvec2 weight_data[]; }; +layout (binding = 3) readonly buffer bias_blob { uvec2 bias_data[]; }; + +layout 
(push_constant) uniform parameter +{ + int w; + int h; + int c; + int cstep; + + int outw; + int outh; + int outc; + int outcstep; +} p; + +#define UNROLL_INCH 4 + +shared uvec2 tmp_v0[UNROLL_INCH * 16*2]; +shared uvec2 tmp_v1[UNROLL_INCH * 16*2]; +shared uvec2 tmp_k0[UNROLL_INCH * 8*2]; +shared uvec2 tmp_k1[UNROLL_INCH * 8*2]; +shared uvec2 tmp_k2[UNROLL_INCH * 8*2]; +shared uvec2 tmp_k3[UNROLL_INCH * 8*2]; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x) / 32 * 2 * 16; + int gy = int(gl_GlobalInvocationID.y) * 2 * 4; + + const int outsize = psc(outw) * psc(outh); + + const int lx = int(gl_LocalInvocationID.x); + + const int lxd8 = lx / 8; // 0 1 2 3 + const int lxm8 = lx % 8; // 0 1 2 3 .... 7 + + coopmat sum0; + coopmat sum1; + coopmat sum2; + coopmat sum3; + coopmat sum4; + coopmat sum5; + coopmat sum6; + coopmat sum7; + + if (bias_term == 1) + { + coopmat bias0; + coopmat bias1; + coopmat bias2; + coopmat bias3; + + coopMatLoad(bias0, bias_data, gy, 0, gl_CooperativeMatrixLayoutRowMajor); + coopMatLoad(bias1, bias_data, gy + 2, 0, gl_CooperativeMatrixLayoutRowMajor); + coopMatLoad(bias2, bias_data, gy + 4, 0, gl_CooperativeMatrixLayoutRowMajor); + coopMatLoad(bias3, bias_data, gy + 6, 0, gl_CooperativeMatrixLayoutRowMajor); + + sum0 = coopmat(bias0); + sum1 = coopmat(bias0); + sum2 = coopmat(bias1); + sum3 = coopmat(bias1); + sum4 = coopmat(bias2); + sum5 = coopmat(bias2); + sum6 = coopmat(bias3); + sum7 = coopmat(bias3); + } + else + { + sum0 = coopmat(0.f); + sum1 = coopmat(0.f); + sum2 = coopmat(0.f); + sum3 = coopmat(0.f); + sum4 = coopmat(0.f); + sum5 = coopmat(0.f); + sum6 = coopmat(0.f); + sum7 = coopmat(0.f); + } + + const int maxk = kernel_w * kernel_h; + const int N = psc(c) / 2 * maxk; + + int z = 0; + for (; z + (UNROLL_INCH - 1) < N; z += UNROLL_INCH) + { + { + for (int j = 0; j < 2; j++) + { + const int tmp_vi = lxd8*16*2 + lxm8 * 2 + j; + + const int sz = (z + lxd8) / maxk; + const int kk = (z + lxd8) % maxk; + + const int ky = kk / 
kernel_w; + const int kx = kk % kernel_w; + + const ivec4 gx16 = gx + lxm8 + ivec4(0, 8, 16, 24); + + const ivec4 sy16 = gx16 / psc(outw); + const ivec4 sx16 = gx16 % psc(outw); + + const ivec4 sxs16 = sx16 * stride_w; + const ivec4 sys16 = sy16 * stride_h; + + const ivec4 v_offset = (sz * 2 + j) * psc(cstep) + (sys16 + ky * dilation_h) * psc(w) + sxs16 + kx * dilation_w; + + tmp_v0[tmp_vi] = gx16.r < psc(outcstep) ? bottom_blob_data[v_offset.r] : uvec2(0); + tmp_v0[tmp_vi + 16] = gx16.g < psc(outcstep) ? bottom_blob_data[v_offset.g] : uvec2(0); + tmp_v1[tmp_vi] = gx16.b < psc(outcstep) ? bottom_blob_data[v_offset.b] : uvec2(0); + tmp_v1[tmp_vi + 16] = gx16.a < psc(outcstep) ? bottom_blob_data[v_offset.a] : uvec2(0); + + const int tmp_ki = lxd8*8*2 + lxm8 * 2 + j; + + int w_offset = gy * psc(c) * 4 * maxk + (z + lxd8) * 2 * 8 + (lxm8 * 2 + j); + + tmp_k0[tmp_ki] = weight_data[w_offset]; + tmp_k1[tmp_ki] = weight_data[w_offset + psc(c) * maxk * 8]; + tmp_k2[tmp_ki] = weight_data[w_offset + psc(c) * maxk * 16]; + tmp_k3[tmp_ki] = weight_data[w_offset + psc(c) * maxk * 24]; + } + } + + barrier(); + + for (int z4 = 0; z4 < UNROLL_INCH; z4++) + { + coopmat A0; + coopmat A1; + coopMatLoad(A0, tmp_v0, z4*16*2, 2, gl_CooperativeMatrixLayoutRowMajor); + coopMatLoad(A1, tmp_v1, z4*16*2, 2, gl_CooperativeMatrixLayoutRowMajor); + + coopmat B0; + coopmat B1; + coopmat B2; + coopmat B3; + coopMatLoad(B0, tmp_k0, z4*8*2, 2, gl_CooperativeMatrixLayoutRowMajor); + coopMatLoad(B1, tmp_k1, z4*8*2, 2, gl_CooperativeMatrixLayoutRowMajor); + coopMatLoad(B2, tmp_k2, z4*8*2, 2, gl_CooperativeMatrixLayoutRowMajor); + coopMatLoad(B3, tmp_k3, z4*8*2, 2, gl_CooperativeMatrixLayoutRowMajor); + + // sum += v * k + sum0 = coopMatMulAdd(A0, B0, sum0); + sum1 = coopMatMulAdd(A1, B0, sum1); + sum2 = coopMatMulAdd(A0, B1, sum2); + sum3 = coopMatMulAdd(A1, B1, sum3); + sum4 = coopMatMulAdd(A0, B2, sum4); + sum5 = coopMatMulAdd(A1, B2, sum5); + sum6 = coopMatMulAdd(A0, B3, sum6); + sum7 = 
coopMatMulAdd(A1, B3, sum7); + } + + barrier(); + } + + if (z < N) + { + const int remain = N - z; + + if (lxd8 < remain) + { + for (int j = 0; j < 2; j++) + { + const int tmp_vi = lxd8*16*2 + lxm8 * 2 + j; + + const int sz = (z + lxd8) / maxk; + const int kk = (z + lxd8) % maxk; + + const int ky = kk / kernel_w; + const int kx = kk % kernel_w; + + const ivec4 gx16 = gx + lxm8 + ivec4(0, 8, 16, 24); + + const ivec4 sy16 = gx16 / psc(outw); + const ivec4 sx16 = gx16 % psc(outw); + + const ivec4 sxs16 = sx16 * stride_w; + const ivec4 sys16 = sy16 * stride_h; + + const ivec4 v_offset = (sz * 2 + j) * psc(cstep) + (sys16 + ky * dilation_h) * psc(w) + sxs16 + kx * dilation_w; + + tmp_v0[tmp_vi] = gx16.r < psc(outcstep) ? bottom_blob_data[v_offset.r] : uvec2(0); + tmp_v0[tmp_vi + 16] = gx16.g < psc(outcstep) ? bottom_blob_data[v_offset.g] : uvec2(0); + tmp_v1[tmp_vi] = gx16.b < psc(outcstep) ? bottom_blob_data[v_offset.b] : uvec2(0); + tmp_v1[tmp_vi + 16] = gx16.a < psc(outcstep) ? bottom_blob_data[v_offset.a] : uvec2(0); + + const int tmp_ki = lxd8*8*2 + lxm8 * 2 + j; + + int w_offset = gy * psc(c) * 4 * maxk + (z + lxd8) * 2 * 8 + (lxm8 * 2 + j); + + tmp_k0[tmp_ki] = weight_data[w_offset]; + tmp_k1[tmp_ki] = weight_data[w_offset + psc(c) * maxk * 8]; + tmp_k2[tmp_ki] = weight_data[w_offset + psc(c) * maxk * 16]; + tmp_k3[tmp_ki] = weight_data[w_offset + psc(c) * maxk * 24]; + } + } + + barrier(); + + for (int z4 = 0; z4 < remain; z4++) + { + coopmat A0; + coopmat A1; + coopMatLoad(A0, tmp_v0, z4*16*2, 2, gl_CooperativeMatrixLayoutRowMajor); + coopMatLoad(A1, tmp_v1, z4*16*2, 2, gl_CooperativeMatrixLayoutRowMajor); + + coopmat B0; + coopmat B1; + coopmat B2; + coopmat B3; + coopMatLoad(B0, tmp_k0, z4*8*2, 2, gl_CooperativeMatrixLayoutRowMajor); + coopMatLoad(B1, tmp_k1, z4*8*2, 2, gl_CooperativeMatrixLayoutRowMajor); + coopMatLoad(B2, tmp_k2, z4*8*2, 2, gl_CooperativeMatrixLayoutRowMajor); + coopMatLoad(B3, tmp_k3, z4*8*2, 2, gl_CooperativeMatrixLayoutRowMajor); + + // 
sum += v * k + sum0 = coopMatMulAdd(A0, B0, sum0); + sum1 = coopMatMulAdd(A1, B0, sum1); + sum2 = coopMatMulAdd(A0, B1, sum2); + sum3 = coopMatMulAdd(A1, B1, sum3); + sum4 = coopMatMulAdd(A0, B2, sum4); + sum5 = coopMatMulAdd(A1, B2, sum5); + sum6 = coopMatMulAdd(A0, B3, sum6); + sum7 = coopMatMulAdd(A1, B3, sum7); + } + + barrier(); + } + + if (gx >= outsize || gy >= psc(outc)) + return; + + coopmat sum0_fp16 = coopmat(sum0); + coopmat sum1_fp16 = coopmat(sum1); + coopmat sum2_fp16 = coopmat(sum2); + coopmat sum3_fp16 = coopmat(sum3); + coopmat sum4_fp16 = coopmat(sum4); + coopmat sum5_fp16 = coopmat(sum5); + coopmat sum6_fp16 = coopmat(sum6); + coopmat sum7_fp16 = coopmat(sum7); + + coopMatStore(sum0_fp16, tmp_v0, 0, 2, gl_CooperativeMatrixLayoutRowMajor); + coopMatStore(sum1_fp16, tmp_v1, 0, 2, gl_CooperativeMatrixLayoutRowMajor); + coopMatStore(sum2_fp16, tmp_v0, 16*2, 2, gl_CooperativeMatrixLayoutRowMajor); + coopMatStore(sum3_fp16, tmp_v1, 16*2, 2, gl_CooperativeMatrixLayoutRowMajor); + coopMatStore(sum4_fp16, tmp_v0, 16*4, 2, gl_CooperativeMatrixLayoutRowMajor); + coopMatStore(sum5_fp16, tmp_v1, 16*4, 2, gl_CooperativeMatrixLayoutRowMajor); + coopMatStore(sum6_fp16, tmp_v0, 16*6, 2, gl_CooperativeMatrixLayoutRowMajor); + coopMatStore(sum7_fp16, tmp_v1, 16*6, 2, gl_CooperativeMatrixLayoutRowMajor); + + barrier(); + + const int lxd16 = lx / 16; // 0 1 + const int lxm16 = lx % 16; // 0 1 2 3 .... 
15 + + { + for (int j = 0; j < 4; j++) + { + const int tmp_vi = lxm16 * 2 + lxd16 + j*16*2; + const int gi = (gy + lxd16 + j*2) * psc(outcstep) + (gx + lxm16); + + if (gy + j * 2 + lxd16 < psc(outc)) + { + if (gx + lxm16 < psc(outcstep)) + { + uvec2 sum0_u2 = tmp_v0[tmp_vi]; + afpvec4 sum0 = afpvec4(unpackHalf2x16(sum0_u2.x), unpackHalf2x16(sum0_u2.y)); + sum0 = activation_afpvec4(sum0, activation_type, activation_param_0, activation_param_1); + buffer_st4(top_blob_data, gi, sum0); + } + if (gx + lxm16 + 16 < psc(outcstep)) + { + uvec2 sum1_u2 = tmp_v1[tmp_vi]; + afpvec4 sum1 = afpvec4(unpackHalf2x16(sum1_u2.x), unpackHalf2x16(sum1_u2.y)); + sum1 = activation_afpvec4(sum1, activation_type, activation_param_0, activation_param_1); + buffer_st4(top_blob_data, gi + 16, sum1); + } + } + } + } +} diff --git a/src/layer/vulkan/shader/convolution_pack4_gemm_nv_cm_16_16_16.comp b/src/layer/vulkan/shader/convolution_pack4_gemm_nv_cm_16_16_16.comp new file mode 100644 index 00000000000..71cef19638c --- /dev/null +++ b/src/layer/vulkan/shader/convolution_pack4_gemm_nv_cm_16_16_16.comp @@ -0,0 +1,276 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +#extension GL_GOOGLE_include_directive: enable +#include "vulkan_activation.comp" + +#extension GL_KHR_memory_scope_semantics: require +#extension GL_EXT_shader_explicit_arithmetic_types: require +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#extension GL_NV_cooperative_matrix: require + +layout (constant_id = 0) const int kernel_w = 1; +layout (constant_id = 1) const int kernel_h = 1; +layout (constant_id = 2) const int dilation_w = 1; +layout (constant_id = 3) const int dilation_h = 1; +layout (constant_id = 4) const int stride_w = 1; +layout (constant_id = 5) const int stride_h = 1; +layout (constant_id = 6) const int bias_term = 0; +layout (constant_id = 7) const int activation_type = 0; +layout (constant_id = 8) const float activation_param_0 = 0; +layout (constant_id = 9) const float activation_param_1 = 0; + +#define shape_constant_id_offset 10 +layout (constant_id = shape_constant_id_offset + 0) const int w = 0; +layout (constant_id = shape_constant_id_offset + 1) const int h = 0; +layout (constant_id = shape_constant_id_offset + 2) const int c = 0; +layout (constant_id = shape_constant_id_offset + 3) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 4) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 5) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outcstep = 0; + +layout (binding = 0) readonly buffer bottom_blob { uvec2 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; +layout (binding = 2) readonly buffer weight_blob { uvec2 weight_data[]; }; +layout (binding = 3) readonly buffer bias_blob { uvec2 bias_data[]; }; + +layout 
(push_constant) uniform parameter +{ + int w; + int h; + int c; + int cstep; + + int outw; + int outh; + int outc; + int outcstep; +} p; + +#define UNROLL_INCH 2 + +shared uvec2 tmp_v0[UNROLL_INCH * 16*4]; +shared uvec2 tmp_v1[UNROLL_INCH * 16*4]; +shared uvec2 tmp_k0[UNROLL_INCH * 16*4]; +shared uvec2 tmp_k1[UNROLL_INCH * 16*4]; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x) / 32 * 2 * 16; + int gy = int(gl_GlobalInvocationID.y) * 2 * 4; + + const int outsize = psc(outw) * psc(outh); + + const int lx = int(gl_LocalInvocationID.x); + + const int lxd16 = lx / 16; // 0 1 + const int lxm16 = lx % 16; // 0 1 2 3 .... 15 + + fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum0; + fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum1; + fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum2; + fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum3; + + if (bias_term == 1) + { + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> bias0; + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> bias1; + + coopMatLoadNV(bias0, bias_data, gy, 0, false); + coopMatLoadNV(bias1, bias_data, gy + 4, 0, false); + + sum0 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(bias0); + sum1 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(bias0); + sum2 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(bias1); + sum3 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(bias1); + } + else + { + sum0 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f); + sum1 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f); + sum2 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f); + sum3 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f); + } + + const int maxk = kernel_w * kernel_h; + const int N = psc(c) / 4 * maxk; + + int z = 0; + for (; z + (UNROLL_INCH - 1) < N; z += UNROLL_INCH) + { + { + for (int j = 0; j < 4; j++) + { + const int tmp_i = lxd16*16*4 + lxm16 * 4 + j; + + const int sz = (z + lxd16) / maxk; + const int kk = (z + lxd16) % maxk; + + const int ky = kk / kernel_w; + const int kx = kk % kernel_w; + + const ivec2 gx16 = gx + lxm16 + ivec2(0, 16); + + const ivec2 sy16 = gx16 / 
psc(outw); + const ivec2 sx16 = gx16 % psc(outw); + + const ivec2 sxs16 = sx16 * stride_w; + const ivec2 sys16 = sy16 * stride_h; + + const ivec2 v_offset = (sz * 4 + j) * psc(cstep) + (sys16 + ky * dilation_h) * psc(w) + sxs16 + kx * dilation_w; + + tmp_v0[tmp_i] = gx16.r < psc(outcstep) ? bottom_blob_data[v_offset.r] : uvec2(0); + tmp_v1[tmp_i] = gx16.g < psc(outcstep) ? bottom_blob_data[v_offset.g] : uvec2(0); + + int w_offset = gy * psc(c) * 4 * maxk + (z + lxd16) * 4 * 16 + (lxm16 * 4 + j); + + tmp_k0[tmp_i] = weight_data[w_offset]; + tmp_k1[tmp_i] = weight_data[w_offset + psc(c) * maxk * 16]; + } + } + + barrier(); + + for (int z4 = 0; z4 < UNROLL_INCH; z4++) + { + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> A0; + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> A1; + coopMatLoadNV(A0, tmp_v0, z4*16*4, 4, false); + coopMatLoadNV(A1, tmp_v1, z4*16*4, 4, false); + + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> B0; + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> B1; + coopMatLoadNV(B0, tmp_k0, z4*16*4, 4, false); + coopMatLoadNV(B1, tmp_k1, z4*16*4, 4, false); + + // sum += v * k + sum0 = coopMatMulAddNV(A0, B0, sum0); + sum1 = coopMatMulAddNV(A1, B0, sum1); + sum2 = coopMatMulAddNV(A0, B1, sum2); + sum3 = coopMatMulAddNV(A1, B1, sum3); + } + + barrier(); + } + + if (z < N) + { + const int remain = N - z; + + if (lxd16 == 0) + { + for (int j = 0; j < 4; j++) + { + const int tmp_i = lxd16*16*4 + lxm16 * 4 + j; + + const int sz = (z + lxd16) / maxk; + const int kk = (z + lxd16) % maxk; + + const int ky = kk / kernel_w; + const int kx = kk % kernel_w; + + const ivec2 gx16 = gx + lxm16 + ivec2(0, 16); + + const ivec2 sy16 = gx16 / psc(outw); + const ivec2 sx16 = gx16 % psc(outw); + + const ivec2 sxs16 = sx16 * stride_w; + const ivec2 sys16 = sy16 * stride_h; + + const ivec2 v_offset = (sz * 4 + j) * psc(cstep) + (sys16 + ky * dilation_h) * psc(w) + sxs16 + kx * dilation_w; + + tmp_v0[tmp_i] = gx16.r < psc(outcstep) ? 
bottom_blob_data[v_offset.r] : uvec2(0); + tmp_v1[tmp_i] = gx16.g < psc(outcstep) ? bottom_blob_data[v_offset.g] : uvec2(0); + + int w_offset = gy * psc(c) * 4 * maxk + (z + lxd16) * 4 * 16 + (lxm16 * 4 + j); + + tmp_k0[tmp_i] = weight_data[w_offset]; + tmp_k1[tmp_i] = weight_data[w_offset + psc(c) * maxk * 16]; + } + } + + barrier(); + + for (int z4 = 0; z4 < remain; z4++) + { + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> A0; + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> A1; + coopMatLoadNV(A0, tmp_v0, z4*16*4, 4, false); + coopMatLoadNV(A1, tmp_v1, z4*16*4, 4, false); + + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> B0; + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> B1; + coopMatLoadNV(B0, tmp_k0, z4*16*4, 4, false); + coopMatLoadNV(B1, tmp_k1, z4*16*4, 4, false); + + // sum += v * k + sum0 = coopMatMulAddNV(A0, B0, sum0); + sum1 = coopMatMulAddNV(A1, B0, sum1); + sum2 = coopMatMulAddNV(A0, B1, sum2); + sum3 = coopMatMulAddNV(A1, B1, sum3); + } + + barrier(); + } + + if (gx >= outsize || gy >= psc(outc)) + return; + + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum0_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum0); + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum1_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum1); + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum2_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum2); + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum3_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum3); + + coopMatStoreNV(sum0_fp16, tmp_v0, 0, 4, false); + coopMatStoreNV(sum1_fp16, tmp_v1, 0, 4, false); + coopMatStoreNV(sum2_fp16, tmp_v0, 16*4, 4, false); + coopMatStoreNV(sum3_fp16, tmp_v1, 16*4, 4, false); + + barrier(); + + { + for (int j = 0; j < 4; j++) + { + const int tmp_vi = lxm16 * 4 + j + lxd16*16*4; + + uvec2 sum0_u2 = tmp_v0[tmp_vi]; + uvec2 sum1_u2 = tmp_v1[tmp_vi]; + + afpvec4 sum0 = afpvec4(unpackHalf2x16(sum0_u2.x), unpackHalf2x16(sum0_u2.y)); + afpvec4 sum1 = afpvec4(unpackHalf2x16(sum1_u2.x), unpackHalf2x16(sum1_u2.y)); + + sum0 = 
activation_afpvec4(sum0, activation_type, activation_param_0, activation_param_1); + sum1 = activation_afpvec4(sum1, activation_type, activation_param_0, activation_param_1); + + const int gi = (gy + lxd16 * 4 + j) * psc(outcstep) + (gx + lxm16); + + if (gy + lxd16 * 4 + j < psc(outc)) + { + if (gx + lxm16 < psc(outcstep)) buffer_st4(top_blob_data, gi, sum0); + if (gx + lxm16 + 16 < psc(outcstep)) buffer_st4(top_blob_data, gi + 16, sum1); + } + } + } +} diff --git a/src/layer/vulkan/shader/convolution_pack4_gemm_nv_cm_16_8_8.comp b/src/layer/vulkan/shader/convolution_pack4_gemm_nv_cm_16_8_8.comp new file mode 100644 index 00000000000..4f1c1f6ed1c --- /dev/null +++ b/src/layer/vulkan/shader/convolution_pack4_gemm_nv_cm_16_8_8.comp @@ -0,0 +1,335 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +#extension GL_GOOGLE_include_directive: enable +#include "vulkan_activation.comp" + +#extension GL_KHR_memory_scope_semantics: require +#extension GL_EXT_shader_explicit_arithmetic_types: require +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#extension GL_NV_cooperative_matrix: require + +layout (constant_id = 0) const int kernel_w = 1; +layout (constant_id = 1) const int kernel_h = 1; +layout (constant_id = 2) const int dilation_w = 1; +layout (constant_id = 3) const int dilation_h = 1; +layout (constant_id = 4) const int stride_w = 1; +layout (constant_id = 5) const int stride_h = 1; +layout (constant_id = 6) const int bias_term = 0; +layout (constant_id = 7) const int activation_type = 0; +layout (constant_id = 8) const float activation_param_0 = 0; +layout (constant_id = 9) const float activation_param_1 = 0; + +#define shape_constant_id_offset 10 +layout (constant_id = shape_constant_id_offset + 0) const int w = 0; +layout (constant_id = shape_constant_id_offset + 1) const int h = 0; +layout (constant_id = shape_constant_id_offset + 2) const int c = 0; +layout (constant_id = shape_constant_id_offset + 3) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 4) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 5) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outcstep = 0; + +layout (binding = 0) readonly buffer bottom_blob { uvec2 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; +layout (binding = 2) readonly buffer weight_blob { uvec2 weight_data[]; }; +layout (binding = 3) readonly buffer bias_blob { uvec2 bias_data[]; }; + +layout 
(push_constant) uniform parameter +{ + int w; + int h; + int c; + int cstep; + + int outw; + int outh; + int outc; + int outcstep; +} p; + +#define UNROLL_INCH 4 + +shared uvec2 tmp_v0[UNROLL_INCH * 16*2]; +shared uvec2 tmp_v1[UNROLL_INCH * 16*2]; +shared uvec2 tmp_k0[UNROLL_INCH * 8*2]; +shared uvec2 tmp_k1[UNROLL_INCH * 8*2]; +shared uvec2 tmp_k2[UNROLL_INCH * 8*2]; +shared uvec2 tmp_k3[UNROLL_INCH * 8*2]; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x) / 32 * 2 * 16; + int gy = int(gl_GlobalInvocationID.y) * 2 * 4; + + const int outsize = psc(outw) * psc(outh); + + const int lx = int(gl_LocalInvocationID.x); + + const int lxd8 = lx / 8; // 0 1 2 3 + const int lxm8 = lx % 8; // 0 1 2 3 .... 7 + + fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum0; + fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum1; + fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum2; + fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum3; + fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum4; + fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum5; + fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum6; + fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum7; + + if (bias_term == 1) + { + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> bias0; + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> bias1; + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> bias2; + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> bias3; + + coopMatLoadNV(bias0, bias_data, gy, 0, false); + coopMatLoadNV(bias1, bias_data, gy + 2, 0, false); + coopMatLoadNV(bias2, bias_data, gy + 4, 0, false); + coopMatLoadNV(bias3, bias_data, gy + 6, 0, false); + + sum0 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias0); + sum1 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias0); + sum2 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias1); + sum3 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias1); + sum4 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias2); + sum5 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias2); + sum6 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias3); + sum7 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias3); + } + 
else + { + sum0 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); + sum1 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); + sum2 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); + sum3 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); + sum4 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); + sum5 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); + sum6 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); + sum7 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); + } + + const int maxk = kernel_w * kernel_h; + const int N = psc(c) / 2 * maxk; + + int z = 0; + for (; z + (UNROLL_INCH - 1) < N; z += UNROLL_INCH) + { + { + for (int j = 0; j < 2; j++) + { + const int tmp_vi = lxd8*16*2 + lxm8 * 2 + j; + + const int sz = (z + lxd8) / maxk; + const int kk = (z + lxd8) % maxk; + + const int ky = kk / kernel_w; + const int kx = kk % kernel_w; + + const ivec4 gx16 = gx + lxm8 + ivec4(0, 8, 16, 24); + + const ivec4 sy16 = gx16 / psc(outw); + const ivec4 sx16 = gx16 % psc(outw); + + const ivec4 sxs16 = sx16 * stride_w; + const ivec4 sys16 = sy16 * stride_h; + + const ivec4 v_offset = (sz * 2 + j) * psc(cstep) + (sys16 + ky * dilation_h) * psc(w) + sxs16 + kx * dilation_w; + + tmp_v0[tmp_vi] = gx16.r < psc(outcstep) ? bottom_blob_data[v_offset.r] : uvec2(0); + tmp_v0[tmp_vi + 16] = gx16.g < psc(outcstep) ? bottom_blob_data[v_offset.g] : uvec2(0); + tmp_v1[tmp_vi] = gx16.b < psc(outcstep) ? bottom_blob_data[v_offset.b] : uvec2(0); + tmp_v1[tmp_vi + 16] = gx16.a < psc(outcstep) ? 
bottom_blob_data[v_offset.a] : uvec2(0); + + const int tmp_ki = lxd8*8*2 + lxm8 * 2 + j; + + int w_offset = gy * psc(c) * 4 * maxk + (z + lxd8) * 2 * 8 + (lxm8 * 2 + j); + + tmp_k0[tmp_ki] = weight_data[w_offset]; + tmp_k1[tmp_ki] = weight_data[w_offset + psc(c) * maxk * 8]; + tmp_k2[tmp_ki] = weight_data[w_offset + psc(c) * maxk * 16]; + tmp_k3[tmp_ki] = weight_data[w_offset + psc(c) * maxk * 24]; + } + } + + barrier(); + + for (int z4 = 0; z4 < UNROLL_INCH; z4++) + { + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A0; + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A1; + coopMatLoadNV(A0, tmp_v0, z4*16*2, 2, false); + coopMatLoadNV(A1, tmp_v1, z4*16*2, 2, false); + + fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B0; + fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B1; + fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B2; + fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B3; + coopMatLoadNV(B0, tmp_k0, z4*8*2, 2, false); + coopMatLoadNV(B1, tmp_k1, z4*8*2, 2, false); + coopMatLoadNV(B2, tmp_k2, z4*8*2, 2, false); + coopMatLoadNV(B3, tmp_k3, z4*8*2, 2, false); + + // sum += v * k + sum0 = coopMatMulAddNV(A0, B0, sum0); + sum1 = coopMatMulAddNV(A1, B0, sum1); + sum2 = coopMatMulAddNV(A0, B1, sum2); + sum3 = coopMatMulAddNV(A1, B1, sum3); + sum4 = coopMatMulAddNV(A0, B2, sum4); + sum5 = coopMatMulAddNV(A1, B2, sum5); + sum6 = coopMatMulAddNV(A0, B3, sum6); + sum7 = coopMatMulAddNV(A1, B3, sum7); + } + + barrier(); + } + + if (z < N) + { + const int remain = N - z; + + if (lxd8 < remain) + { + for (int j = 0; j < 2; j++) + { + const int tmp_vi = lxd8*16*2 + lxm8 * 2 + j; + + const int sz = (z + lxd8) / maxk; + const int kk = (z + lxd8) % maxk; + + const int ky = kk / kernel_w; + const int kx = kk % kernel_w; + + const ivec4 gx16 = gx + lxm8 + ivec4(0, 8, 16, 24); + + const ivec4 sy16 = gx16 / psc(outw); + const ivec4 sx16 = gx16 % psc(outw); + + const ivec4 sxs16 = sx16 * stride_w; + const ivec4 sys16 = sy16 * stride_h; + + const ivec4 v_offset = (sz * 2 + j) * psc(cstep) + (sys16 + ky * dilation_h) * psc(w) + 
sxs16 + kx * dilation_w; + + tmp_v0[tmp_vi] = gx16.r < psc(outcstep) ? bottom_blob_data[v_offset.r] : uvec2(0); + tmp_v0[tmp_vi + 16] = gx16.g < psc(outcstep) ? bottom_blob_data[v_offset.g] : uvec2(0); + tmp_v1[tmp_vi] = gx16.b < psc(outcstep) ? bottom_blob_data[v_offset.b] : uvec2(0); + tmp_v1[tmp_vi + 16] = gx16.a < psc(outcstep) ? bottom_blob_data[v_offset.a] : uvec2(0); + + const int tmp_ki = lxd8*8*2 + lxm8 * 2 + j; + + int w_offset = gy * psc(c) * 4 * maxk + (z + lxd8) * 2 * 8 + (lxm8 * 2 + j); + + tmp_k0[tmp_ki] = weight_data[w_offset]; + tmp_k1[tmp_ki] = weight_data[w_offset + psc(c) * maxk * 8]; + tmp_k2[tmp_ki] = weight_data[w_offset + psc(c) * maxk * 16]; + tmp_k3[tmp_ki] = weight_data[w_offset + psc(c) * maxk * 24]; + } + } + + barrier(); + + for (int z4 = 0; z4 < remain; z4++) + { + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A0; + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A1; + coopMatLoadNV(A0, tmp_v0, z4*16*2, 2, false); + coopMatLoadNV(A1, tmp_v1, z4*16*2, 2, false); + + fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B0; + fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B1; + fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B2; + fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B3; + coopMatLoadNV(B0, tmp_k0, z4*8*2, 2, false); + coopMatLoadNV(B1, tmp_k1, z4*8*2, 2, false); + coopMatLoadNV(B2, tmp_k2, z4*8*2, 2, false); + coopMatLoadNV(B3, tmp_k3, z4*8*2, 2, false); + + // sum += v * k + sum0 = coopMatMulAddNV(A0, B0, sum0); + sum1 = coopMatMulAddNV(A1, B0, sum1); + sum2 = coopMatMulAddNV(A0, B1, sum2); + sum3 = coopMatMulAddNV(A1, B1, sum3); + sum4 = coopMatMulAddNV(A0, B2, sum4); + sum5 = coopMatMulAddNV(A1, B2, sum5); + sum6 = coopMatMulAddNV(A0, B3, sum6); + sum7 = coopMatMulAddNV(A1, B3, sum7); + } + + barrier(); + } + + if (gx >= outsize || gy >= psc(outc)) + return; + + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum0_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum0); + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum1_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum1); + fcoopmatNV<16, 
gl_ScopeSubgroup, 16, 8> sum2_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum2); + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum3_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum3); + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum4_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum4); + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum5_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum5); + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum6_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum6); + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum7_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum7); + + coopMatStoreNV(sum0_fp16, tmp_v0, 0, 2, false); + coopMatStoreNV(sum1_fp16, tmp_v1, 0, 2, false); + coopMatStoreNV(sum2_fp16, tmp_v0, 16*2, 2, false); + coopMatStoreNV(sum3_fp16, tmp_v1, 16*2, 2, false); + coopMatStoreNV(sum4_fp16, tmp_v0, 16*4, 2, false); + coopMatStoreNV(sum5_fp16, tmp_v1, 16*4, 2, false); + coopMatStoreNV(sum6_fp16, tmp_v0, 16*6, 2, false); + coopMatStoreNV(sum7_fp16, tmp_v1, 16*6, 2, false); + + barrier(); + + const int lxd16 = lx / 16; // 0 1 + const int lxm16 = lx % 16; // 0 1 2 3 .... 
15 + + { + for (int j = 0; j < 4; j++) + { + const int tmp_vi = lxm16 * 2 + lxd16 + j*16*2; + const int gi = (gy + lxd16 + j*2) * psc(outcstep) + (gx + lxm16); + + if (gy + j * 2 + lxd16 < psc(outc)) + { + if (gx + lxm16 < psc(outcstep)) + { + uvec2 sum0_u2 = tmp_v0[tmp_vi]; + afpvec4 sum0 = afpvec4(unpackHalf2x16(sum0_u2.x), unpackHalf2x16(sum0_u2.y)); + sum0 = activation_afpvec4(sum0, activation_type, activation_param_0, activation_param_1); + buffer_st4(top_blob_data, gi, sum0); + } + if (gx + lxm16 + 16 < psc(outcstep)) + { + uvec2 sum1_u2 = tmp_v1[tmp_vi]; + afpvec4 sum1 = afpvec4(unpackHalf2x16(sum1_u2.x), unpackHalf2x16(sum1_u2.y)); + sum1 = activation_afpvec4(sum1, activation_type, activation_param_0, activation_param_1); + buffer_st4(top_blob_data, gi + 16, sum1); + } + } + } + } +} diff --git a/src/layer/vulkan/shader/deconvolution_pack4_gemm_cm_16_8_8.comp b/src/layer/vulkan/shader/deconvolution_pack4_gemm_cm_16_8_8.comp deleted file mode 100644 index 140bea6b99c..00000000000 --- a/src/layer/vulkan/shader/deconvolution_pack4_gemm_cm_16_8_8.comp +++ /dev/null @@ -1,211 +0,0 @@ -// Tencent is pleased to support the open source community by making ncnn available. -// -// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. -// -// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except -// in compliance with the License. You may obtain a copy of the License at -// -// https://opensource.org/licenses/BSD-3-Clause -// -// Unless required by applicable law or agreed to in writing, software distributed -// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR -// CONDITIONS OF ANY KIND, either express or implied. See the License for the -// specific language governing permissions and limitations under the License. 
- -#version 450 - -#if NCNN_fp16_storage -#extension GL_EXT_shader_16bit_storage: require -#endif -#if NCNN_fp16_arithmetic -#extension GL_EXT_shader_explicit_arithmetic_types_float16: require -#endif - -#extension GL_KHR_memory_scope_semantics: require -#extension GL_EXT_shader_explicit_arithmetic_types: require -#extension GL_EXT_shader_explicit_arithmetic_types_float16: require -#extension GL_NV_cooperative_matrix: require - -layout (constant_id = 0) const int maxk = 1; - -#define shape_constant_id_offset 1 -layout (constant_id = shape_constant_id_offset + 0) const int w = 0; -layout (constant_id = shape_constant_id_offset + 1) const int h = 0; -layout (constant_id = shape_constant_id_offset + 2) const int c = 0; -layout (constant_id = shape_constant_id_offset + 3) const int cstep = 0; - -layout (constant_id = shape_constant_id_offset + 4) const int outw = 0; -layout (constant_id = shape_constant_id_offset + 5) const int outh = 0; - -layout (binding = 0) readonly buffer bottom_blob { uvec2 bottom_blob_data[]; }; -layout (binding = 1) writeonly buffer col_blob { uvec2 col_blob_data[]; }; -layout (binding = 2) readonly buffer weight_blob { uvec2 weight_data[]; }; - -layout (push_constant) uniform parameter -{ - int w; - int h; - int c; - int cstep; - - int outw; - int outh; -} p; - -#define LOCAL_SIZE_Y 4 -#define UNROLL_INCH 4 - -shared uvec2 tmp_v0[UNROLL_INCH * 16*2]; -shared uvec2 tmp_v1[UNROLL_INCH * 16*2]; -shared uvec2 tmp_v2[UNROLL_INCH * 16*2]; -shared uvec2 tmp_v3[UNROLL_INCH * 16*2]; -shared uvec2 tmp_k[LOCAL_SIZE_Y * UNROLL_INCH * 8*2]; - -void main() -{ - int gx = int(gl_GlobalInvocationID.x) / 32 * 4 * 16; - int gy = int(gl_GlobalInvocationID.y); - - const int lx = int(gl_LocalInvocationID.x); - const int ly = int(gl_LocalInvocationID.y); - - const int lxd16 = lx / 16; // 0 1 - const int lxm16 = lx % 16; // 0 1 2 3 .... 15 - - const int lxd8 = lx / 8; // 0 1 2 3 - const int lxm8 = lx % 8; // 0 1 2 3 .... 
7 - - fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum0 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); - fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum1 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); - fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum2 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); - fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum3 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); - - int N = psc(c) / 2; - - int z = 0; - for (; z + (UNROLL_INCH - 1) < N; z += UNROLL_INCH) - { - { - int tmp_vi = ly*16*2 + lxm16*2+lxd16; - int v_offset = (z + ly) * 2 * psc(cstep) + gx + lxd16 * psc(cstep) + lxm16; - - tmp_v0[tmp_vi] = (gx + lxm16) < psc(outw) ? bottom_blob_data[v_offset] : uvec2(0); - tmp_v1[tmp_vi] = (gx + 16 + lxm16) < psc(outw) ? bottom_blob_data[v_offset + 16] : uvec2(0); - tmp_v2[tmp_vi] = (gx + 32 + lxm16) < psc(outw) ? bottom_blob_data[v_offset + 32] : uvec2(0); - tmp_v3[tmp_vi] = (gx + 48 + lxm16) < psc(outw) ? bottom_blob_data[v_offset + 48] : uvec2(0); - } - - if (lx < 16) - { - for (int z4 = 0; z4 < UNROLL_INCH; z4++) - { - int tmp_ki = ly*UNROLL_INCH*8*2 + z4*8*2 + lxm8*2+lxd8; - int w_offset = gy * psc(c) * 8 + (z + z4) * 16 + lxm8 * 2 + lxd8; - - tmp_k[tmp_ki] = weight_data[w_offset]; - } - } - - barrier(); - - for (int z4 = 0; z4 < UNROLL_INCH; z4++) - { - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A0; - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A1; - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A2; - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A3; - coopMatLoadNV(A0, tmp_v0, z4*16*2, 2, false); - coopMatLoadNV(A1, tmp_v1, z4*16*2, 2, false); - coopMatLoadNV(A2, tmp_v2, z4*16*2, 2, false); - coopMatLoadNV(A3, tmp_v3, z4*16*2, 2, false); - - fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B; - coopMatLoadNV(B, tmp_k, ly*UNROLL_INCH*8*2 + z4*8*2, 2, false); - - // sum += v * k - sum0 = coopMatMulAddNV(A0, B, sum0); - sum1 = coopMatMulAddNV(A1, B, sum1); - sum2 = coopMatMulAddNV(A2, B, sum2); - sum3 = coopMatMulAddNV(A3, B, sum3); - } - - barrier(); - } - - if (z < N) - { - 
const int remain = N - z; - - if (ly < remain) - { - int tmp_vi = ly*16*2 + lxm16*2+lxd16; - int v_offset = (z + ly) * 2 * psc(cstep) + gx + lxd16 * psc(cstep) + lxm16; - - tmp_v0[tmp_vi] = (gx + lxm16) < psc(outw) ? bottom_blob_data[v_offset] : uvec2(0); - tmp_v1[tmp_vi] = (gx + 16 + lxm16) < psc(outw) ? bottom_blob_data[v_offset + 16] : uvec2(0); - tmp_v2[tmp_vi] = (gx + 32 + lxm16) < psc(outw) ? bottom_blob_data[v_offset + 32] : uvec2(0); - tmp_v3[tmp_vi] = (gx + 48 + lxm16) < psc(outw) ? bottom_blob_data[v_offset + 48] : uvec2(0); - } - - if (lx < 16) - { - for (int z4 = 0; z4 < remain; z4++) - { - int tmp_ki = ly*UNROLL_INCH*8*2 + z4*8*2 + lxm8*2+lxd8; - int w_offset = gy * psc(c) * 8 + (z + z4) * 16 + lxm8 * 2 + lxd8; - - tmp_k[tmp_ki] = weight_data[w_offset]; - } - } - - barrier(); - - for (int z4 = 0; z4 < remain; z4++) - { - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A0; - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A1; - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A2; - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A3; - coopMatLoadNV(A0, tmp_v0, z4*16*2, 2, false); - coopMatLoadNV(A1, tmp_v1, z4*16*2, 2, false); - coopMatLoadNV(A2, tmp_v2, z4*16*2, 2, false); - coopMatLoadNV(A3, tmp_v3, z4*16*2, 2, false); - - fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B; - coopMatLoadNV(B, tmp_k, ly*UNROLL_INCH*8*2 + z4*8*2, 2, false); - - // sum += v * k - sum0 = coopMatMulAddNV(A0, B, sum0); - sum1 = coopMatMulAddNV(A1, B, sum1); - sum2 = coopMatMulAddNV(A2, B, sum2); - sum3 = coopMatMulAddNV(A3, B, sum3); - } - - barrier(); - } - - if (gx >= psc(outw) || gy * 2 >= psc(outh)) - return; - - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum0_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum0); - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum1_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum1); - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum2_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum2); - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum3_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum3); - - 
coopMatStoreNV(sum0_fp16, tmp_v0, ly*16*2, 2, false); - coopMatStoreNV(sum1_fp16, tmp_v1, ly*16*2, 2, false); - coopMatStoreNV(sum2_fp16, tmp_v2, ly*16*2, 2, false); - coopMatStoreNV(sum3_fp16, tmp_v3, ly*16*2, 2, false); - - barrier(); - - { - int tmp_vi = ly*16*2 + lxm16*2+lxd16; - int gi = (gy / maxk * maxk * 2 + gy % maxk) * psc(outw) + gx + lxd16 * maxk*psc(outw) + lxm16; - - if (gx + lxm16 < psc(outw)) col_blob_data[gi] = tmp_v0[tmp_vi]; - if (gx + 16 + lxm16 < psc(outw)) col_blob_data[gi + 16] = tmp_v1[tmp_vi]; - if (gx + 32 + lxm16 < psc(outw)) col_blob_data[gi + 32] = tmp_v2[tmp_vi]; - if (gx + 48 + lxm16 < psc(outw)) col_blob_data[gi + 48] = tmp_v3[tmp_vi]; - } -} diff --git a/src/layer/vulkan/shader/deconvolution_pack4_gemm_khr_cm_16_16_16.comp b/src/layer/vulkan/shader/deconvolution_pack4_gemm_khr_cm_16_16_16.comp new file mode 100644 index 00000000000..490b8f5edb5 --- /dev/null +++ b/src/layer/vulkan/shader/deconvolution_pack4_gemm_khr_cm_16_16_16.comp @@ -0,0 +1,195 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +#extension GL_KHR_memory_scope_semantics: require +#extension GL_EXT_shader_explicit_arithmetic_types: require +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#extension GL_KHR_cooperative_matrix: require + +layout (constant_id = 0) const int maxk = 1; + +#define shape_constant_id_offset 1 +layout (constant_id = shape_constant_id_offset + 0) const int w = 0; +layout (constant_id = shape_constant_id_offset + 1) const int h = 0; +layout (constant_id = shape_constant_id_offset + 2) const int c = 0; +layout (constant_id = shape_constant_id_offset + 3) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 4) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 5) const int outh = 0; + +layout (binding = 0) readonly buffer bottom_blob { uvec2 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer col_blob { uvec2 col_blob_data[]; }; +layout (binding = 2) readonly buffer weight_blob { uvec2 weight_data[]; }; + +layout (push_constant) uniform parameter +{ + int w; + int h; + int c; + int cstep; + + int outw; + int outh; +} p; + +#define UNROLL_INCH 2 + +shared uvec2 tmp_v0[UNROLL_INCH * 16*4]; +shared uvec2 tmp_v1[UNROLL_INCH * 16*4]; +shared uvec2 tmp_k0[UNROLL_INCH * 16*4]; +shared uvec2 tmp_k1[UNROLL_INCH * 16*4]; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x) / 32 * 2 * 16; + int gy = int(gl_GlobalInvocationID.y) * 2 * 4; + + const int lx = int(gl_LocalInvocationID.x); + + const int lxd16 = lx / 16; // 0 1 + const int lxm16 = lx % 16; // 0 1 2 3 .... 
15 + + coopmat sum0 = coopmat(0.f); + coopmat sum1 = coopmat(0.f); + coopmat sum2 = coopmat(0.f); + coopmat sum3 = coopmat(0.f); + + const int N = psc(c) / 4; + + int z = 0; + for (; z + (UNROLL_INCH - 1) < N; z += UNROLL_INCH) + { + { + for (int j = 0; j < 4; j++) + { + const int tmp_i = lxd16*16*4 + lxm16 * 4 + j; + + const int v_offset = ((z + lxd16) * 4 + j) * psc(cstep) + (gx + lxm16); + + tmp_v0[tmp_i] = (gx + lxm16) < psc(outw) ? bottom_blob_data[v_offset] : uvec2(0); + tmp_v1[tmp_i] = (gx + lxm16 + 16) < psc(outw) ? bottom_blob_data[v_offset + 16] : uvec2(0); + + const int w_offset = gy * psc(c) * 4 + (z + lxd16) * 4 * 16 + (lxm16 * 4 + j); + + tmp_k0[tmp_i] = weight_data[w_offset]; + tmp_k1[tmp_i] = weight_data[w_offset + psc(c) * 16]; + } + } + + barrier(); + + for (int z4 = 0; z4 < UNROLL_INCH; z4++) + { + coopmat A0; + coopmat A1; + coopMatLoad(A0, tmp_v0, z4*16*4, 4, gl_CooperativeMatrixLayoutRowMajor); + coopMatLoad(A1, tmp_v1, z4*16*4, 4, gl_CooperativeMatrixLayoutRowMajor); + + coopmat B0; + coopmat B1; + coopMatLoad(B0, tmp_k0, z4*16*4, 4, gl_CooperativeMatrixLayoutRowMajor); + coopMatLoad(B1, tmp_k1, z4*16*4, 4, gl_CooperativeMatrixLayoutRowMajor); + + // sum += v * k + sum0 = coopMatMulAdd(A0, B0, sum0); + sum1 = coopMatMulAdd(A1, B0, sum1); + sum2 = coopMatMulAdd(A0, B1, sum2); + sum3 = coopMatMulAdd(A1, B1, sum3); + } + + barrier(); + } + + if (z < N) + { + const int remain = N - z; + + if (lxd16 == 0) + { + for (int j = 0; j < 4; j++) + { + const int tmp_i = lxd16*16*4 + lxm16 * 4 + j; + + const int v_offset = ((z + lxd16) * 4 + j) * psc(cstep) + (gx + lxm16); + + tmp_v0[tmp_i] = (gx + lxm16) < psc(outw) ? bottom_blob_data[v_offset] : uvec2(0); + tmp_v1[tmp_i] = (gx + lxm16 + 16) < psc(outw) ? 
bottom_blob_data[v_offset + 16] : uvec2(0); + + const int w_offset = gy * psc(c) * 4 + (z + lxd16) * 4 * 16 + (lxm16 * 4 + j); + + tmp_k0[tmp_i] = weight_data[w_offset]; + tmp_k1[tmp_i] = weight_data[w_offset + psc(c) * 16]; + } + } + + barrier(); + + for (int z4 = 0; z4 < remain; z4++) + { + coopmat<float16_t, gl_ScopeSubgroup, 16, 16, gl_MatrixUseA> A0; + coopmat<float16_t, gl_ScopeSubgroup, 16, 16, gl_MatrixUseA> A1; + coopMatLoad(A0, tmp_v0, z4*16*4, 4, gl_CooperativeMatrixLayoutRowMajor); + coopMatLoad(A1, tmp_v1, z4*16*4, 4, gl_CooperativeMatrixLayoutRowMajor); + + coopmat<float16_t, gl_ScopeSubgroup, 16, 16, gl_MatrixUseB> B0; + coopmat<float16_t, gl_ScopeSubgroup, 16, 16, gl_MatrixUseB> B1; + coopMatLoad(B0, tmp_k0, z4*16*4, 4, gl_CooperativeMatrixLayoutRowMajor); + coopMatLoad(B1, tmp_k1, z4*16*4, 4, gl_CooperativeMatrixLayoutRowMajor); + + // sum += v * k + sum0 = coopMatMulAdd(A0, B0, sum0); + sum1 = coopMatMulAdd(A1, B0, sum1); + sum2 = coopMatMulAdd(A0, B1, sum2); + sum3 = coopMatMulAdd(A1, B1, sum3); + } + + barrier(); + } + + if (gx >= psc(outw) || gy >= psc(outh)) + return; + + coopmat<float16_t, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator> sum0_fp16 = coopmat<float16_t, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(sum0); + coopmat<float16_t, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator> sum1_fp16 = coopmat<float16_t, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(sum1); + coopmat<float16_t, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator> sum2_fp16 = coopmat<float16_t, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(sum2); + coopmat<float16_t, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator> sum3_fp16 = coopmat<float16_t, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(sum3); + + coopMatStore(sum0_fp16, tmp_v0, 0, 4, gl_CooperativeMatrixLayoutRowMajor); + coopMatStore(sum1_fp16, tmp_v1, 0, 4, gl_CooperativeMatrixLayoutRowMajor); + coopMatStore(sum2_fp16, tmp_v0, 16*4, 4, gl_CooperativeMatrixLayoutRowMajor); + coopMatStore(sum3_fp16, tmp_v1, 16*4, 4, gl_CooperativeMatrixLayoutRowMajor); + + barrier(); + + { + for (int j = 0; j < 4; j++) + { + const int tmp_vi = lxm16 * 4 + j + lxd16*16*4; + + const int gi = ((gy / 4 + lxd16) / maxk * maxk * 4 + (gy / 4 + lxd16) % maxk) * psc(outw) + j * maxk * psc(outw) + (gx + lxm16); + + if (gx + lxm16 < psc(outw)) col_blob_data[gi] = tmp_v0[tmp_vi]; + if (gx + lxm16 + 16 < psc(outw)) col_blob_data[gi + 16] = tmp_v1[tmp_vi]; + } + } +} diff --git a/src/layer/vulkan/shader/deconvolution_pack4_gemm_khr_cm_16_8_8.comp b/src/layer/vulkan/shader/deconvolution_pack4_gemm_khr_cm_16_8_8.comp new file mode 100644 index 00000000000..82a4f75104a --- /dev/null +++
b/src/layer/vulkan/shader/deconvolution_pack4_gemm_khr_cm_16_8_8.comp @@ -0,0 +1,239 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +#extension GL_KHR_memory_scope_semantics: require +#extension GL_EXT_shader_explicit_arithmetic_types: require +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#extension GL_KHR_cooperative_matrix: require + +layout (constant_id = 0) const int maxk = 1; + +#define shape_constant_id_offset 1 +layout (constant_id = shape_constant_id_offset + 0) const int w = 0; +layout (constant_id = shape_constant_id_offset + 1) const int h = 0; +layout (constant_id = shape_constant_id_offset + 2) const int c = 0; +layout (constant_id = shape_constant_id_offset + 3) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 4) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 5) const int outh = 0; + +layout (binding = 0) readonly buffer bottom_blob { uvec2 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer col_blob { uvec2 col_blob_data[]; }; +layout (binding = 2) readonly buffer weight_blob { uvec2 weight_data[]; }; + 
+layout (push_constant) uniform parameter +{ + int w; + int h; + int c; + int cstep; + + int outw; + int outh; +} p; + +#define UNROLL_INCH 4 + +shared uvec2 tmp_v0[UNROLL_INCH * 16*2]; +shared uvec2 tmp_v1[UNROLL_INCH * 16*2]; +shared uvec2 tmp_k0[UNROLL_INCH * 8*2]; +shared uvec2 tmp_k1[UNROLL_INCH * 8*2]; +shared uvec2 tmp_k2[UNROLL_INCH * 8*2]; +shared uvec2 tmp_k3[UNROLL_INCH * 8*2]; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x) / 32 * 2 * 16; + int gy = int(gl_GlobalInvocationID.y) * 2 * 4; + + const int lx = int(gl_LocalInvocationID.x); + + const int lxd8 = lx / 8; // 0 1 2 3 + const int lxm8 = lx % 8; // 0 1 2 3 .... 7 + + coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum0 = coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f); + coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum1 = coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f); + coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum2 = coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f); + coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum3 = coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f); + coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum4 = coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f); + coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum5 = coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f); + coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum6 = coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f); + coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum7 = coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f); + + const int N = psc(c) / 2; + + int z = 0; + for (; z + (UNROLL_INCH - 1) < N; z += UNROLL_INCH) + { + { + for (int j = 0; j < 2; j++) + { + const int tmp_vi = lxd8*16*2 + lxm8 * 2 + j; + + int v_offset = ((z + lxd8) * 2 + j) * psc(cstep) + (gx + lxm8); + + tmp_v0[tmp_vi] = (gx + lxm8) < psc(outw) ? bottom_blob_data[v_offset] : uvec2(0); + tmp_v0[tmp_vi + 16] = (gx + lxm8 + 8) < psc(outw) ? bottom_blob_data[v_offset + 8] : uvec2(0); + tmp_v1[tmp_vi] = (gx + lxm8 + 16) < psc(outw) ? bottom_blob_data[v_offset + 16] : uvec2(0); + tmp_v1[tmp_vi + 16] = (gx + lxm8 + 24) < psc(outw) ?
bottom_blob_data[v_offset + 24] : uvec2(0); + + const int tmp_ki = lxd8*8*2 + lxm8 * 2 + j; + + int w_offset = gy * psc(c) * 4 + (z + lxd8) * 2 * 8 + (lxm8 * 2 + j); + + tmp_k0[tmp_ki] = weight_data[w_offset]; + tmp_k1[tmp_ki] = weight_data[w_offset + psc(c) * 8]; + tmp_k2[tmp_ki] = weight_data[w_offset + psc(c) * 16]; + tmp_k3[tmp_ki] = weight_data[w_offset + psc(c) * 24]; + } + } + + barrier(); + + for (int z4 = 0; z4 < UNROLL_INCH; z4++) + { + coopmat<float16_t, gl_ScopeSubgroup, 16, 8, gl_MatrixUseA> A0; + coopmat<float16_t, gl_ScopeSubgroup, 16, 8, gl_MatrixUseA> A1; + coopMatLoad(A0, tmp_v0, z4*16*2, 2, gl_CooperativeMatrixLayoutRowMajor); + coopMatLoad(A1, tmp_v1, z4*16*2, 2, gl_CooperativeMatrixLayoutRowMajor); + + coopmat<float16_t, gl_ScopeSubgroup, 8, 8, gl_MatrixUseB> B0; + coopmat<float16_t, gl_ScopeSubgroup, 8, 8, gl_MatrixUseB> B1; + coopmat<float16_t, gl_ScopeSubgroup, 8, 8, gl_MatrixUseB> B2; + coopmat<float16_t, gl_ScopeSubgroup, 8, 8, gl_MatrixUseB> B3; + coopMatLoad(B0, tmp_k0, z4*8*2, 2, gl_CooperativeMatrixLayoutRowMajor); + coopMatLoad(B1, tmp_k1, z4*8*2, 2, gl_CooperativeMatrixLayoutRowMajor); + coopMatLoad(B2, tmp_k2, z4*8*2, 2, gl_CooperativeMatrixLayoutRowMajor); + coopMatLoad(B3, tmp_k3, z4*8*2, 2, gl_CooperativeMatrixLayoutRowMajor); + + // sum += v * k + sum0 = coopMatMulAdd(A0, B0, sum0); + sum1 = coopMatMulAdd(A1, B0, sum1); + sum2 = coopMatMulAdd(A0, B1, sum2); + sum3 = coopMatMulAdd(A1, B1, sum3); + sum4 = coopMatMulAdd(A0, B2, sum4); + sum5 = coopMatMulAdd(A1, B2, sum5); + sum6 = coopMatMulAdd(A0, B3, sum6); + sum7 = coopMatMulAdd(A1, B3, sum7); + } + + barrier(); + } + + if (z < N) + { + const int remain = N - z; + + if (lxd8 < remain) + { + for (int j = 0; j < 2; j++) + { + const int tmp_vi = lxd8*16*2 + lxm8 * 2 + j; + + int v_offset = ((z + lxd8) * 2 + j) * psc(cstep) + (gx + lxm8); + + tmp_v0[tmp_vi] = (gx + lxm8) < psc(outw) ? bottom_blob_data[v_offset] : uvec2(0); + tmp_v0[tmp_vi + 16] = (gx + lxm8 + 8) < psc(outw) ? bottom_blob_data[v_offset + 8] : uvec2(0); + tmp_v1[tmp_vi] = (gx + lxm8 + 16) < psc(outw) ? bottom_blob_data[v_offset + 16] : uvec2(0); + tmp_v1[tmp_vi + 16] = (gx + lxm8 + 24) < psc(outw) ?
bottom_blob_data[v_offset + 24] : uvec2(0); + + const int tmp_ki = lxd8*8*2 + lxm8 * 2 + j; + + int w_offset = gy * psc(c) * 4 + (z + lxd8) * 2 * 8 + (lxm8 * 2 + j); + + tmp_k0[tmp_ki] = weight_data[w_offset]; + tmp_k1[tmp_ki] = weight_data[w_offset + psc(c) * 8]; + tmp_k2[tmp_ki] = weight_data[w_offset + psc(c) * 16]; + tmp_k3[tmp_ki] = weight_data[w_offset + psc(c) * 24]; + } + } + + barrier(); + + for (int z4 = 0; z4 < remain; z4++) + { + coopmat<float16_t, gl_ScopeSubgroup, 16, 8, gl_MatrixUseA> A0; + coopmat<float16_t, gl_ScopeSubgroup, 16, 8, gl_MatrixUseA> A1; + coopMatLoad(A0, tmp_v0, z4*16*2, 2, gl_CooperativeMatrixLayoutRowMajor); + coopMatLoad(A1, tmp_v1, z4*16*2, 2, gl_CooperativeMatrixLayoutRowMajor); + + coopmat<float16_t, gl_ScopeSubgroup, 8, 8, gl_MatrixUseB> B0; + coopmat<float16_t, gl_ScopeSubgroup, 8, 8, gl_MatrixUseB> B1; + coopmat<float16_t, gl_ScopeSubgroup, 8, 8, gl_MatrixUseB> B2; + coopmat<float16_t, gl_ScopeSubgroup, 8, 8, gl_MatrixUseB> B3; + coopMatLoad(B0, tmp_k0, z4*8*2, 2, gl_CooperativeMatrixLayoutRowMajor); + coopMatLoad(B1, tmp_k1, z4*8*2, 2, gl_CooperativeMatrixLayoutRowMajor); + coopMatLoad(B2, tmp_k2, z4*8*2, 2, gl_CooperativeMatrixLayoutRowMajor); + coopMatLoad(B3, tmp_k3, z4*8*2, 2, gl_CooperativeMatrixLayoutRowMajor); + + // sum += v * k + sum0 = coopMatMulAdd(A0, B0, sum0); + sum1 = coopMatMulAdd(A1, B0, sum1); + sum2 = coopMatMulAdd(A0, B1, sum2); + sum3 = coopMatMulAdd(A1, B1, sum3); + sum4 = coopMatMulAdd(A0, B2, sum4); + sum5 = coopMatMulAdd(A1, B2, sum5); + sum6 = coopMatMulAdd(A0, B3, sum6); + sum7 = coopMatMulAdd(A1, B3, sum7); + } + + barrier(); + } + + if (gx >= psc(outw) || gy >= psc(outh)) + return; + + coopmat<float16_t, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum0_fp16 = coopmat<float16_t, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(sum0); + coopmat<float16_t, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum1_fp16 = coopmat<float16_t, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(sum1); + coopmat<float16_t, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum2_fp16 = coopmat<float16_t, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(sum2); + coopmat<float16_t, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum3_fp16 = coopmat<float16_t, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(sum3); + coopmat<float16_t, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum4_fp16 = coopmat<float16_t, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(sum4); + coopmat<float16_t, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum5_fp16 = coopmat<float16_t, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(sum5); + coopmat<float16_t, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum6_fp16 = coopmat<float16_t, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(sum6); + coopmat<float16_t, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum7_fp16 = coopmat<float16_t, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(sum7); + + coopMatStore(sum0_fp16, tmp_v0, 0, 2, gl_CooperativeMatrixLayoutRowMajor); + coopMatStore(sum1_fp16, tmp_v1, 0, 2, gl_CooperativeMatrixLayoutRowMajor); + coopMatStore(sum2_fp16, tmp_v0, 16*2, 2, gl_CooperativeMatrixLayoutRowMajor); + coopMatStore(sum3_fp16, tmp_v1, 16*2, 2, gl_CooperativeMatrixLayoutRowMajor); +
coopMatStore(sum4_fp16, tmp_v0, 16*4, 2, gl_CooperativeMatrixLayoutRowMajor); + coopMatStore(sum5_fp16, tmp_v1, 16*4, 2, gl_CooperativeMatrixLayoutRowMajor); + coopMatStore(sum6_fp16, tmp_v0, 16*6, 2, gl_CooperativeMatrixLayoutRowMajor); + coopMatStore(sum7_fp16, tmp_v1, 16*6, 2, gl_CooperativeMatrixLayoutRowMajor); + + barrier(); + + const int lxd16 = lx / 16; // 0 1 + const int lxm16 = lx % 16; // 0 1 2 3 .... 15 + + { + for (int j = 0; j < 4; j++) + { + const int tmp_vi = lxm16 * 2 + lxd16 + j*16*2; + const int gi = ((gy / 2 + j) / maxk * maxk * 2 + (gy / 2 + j) % maxk) * psc(outw) + lxd16 * maxk * psc(outw) + (gx + lxm16); + + if (gx + lxm16 < psc(outw)) col_blob_data[gi] = tmp_v0[tmp_vi]; + if (gx + lxm16 + 16 < psc(outw)) col_blob_data[gi + 16] = tmp_v1[tmp_vi]; + } + } +} diff --git a/src/layer/vulkan/shader/deconvolution_pack4_gemm_nv_cm_16_16_16.comp b/src/layer/vulkan/shader/deconvolution_pack4_gemm_nv_cm_16_16_16.comp new file mode 100644 index 00000000000..bbeca6b301d --- /dev/null +++ b/src/layer/vulkan/shader/deconvolution_pack4_gemm_nv_cm_16_16_16.comp @@ -0,0 +1,195 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +#extension GL_KHR_memory_scope_semantics: require +#extension GL_EXT_shader_explicit_arithmetic_types: require +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#extension GL_NV_cooperative_matrix: require + +layout (constant_id = 0) const int maxk = 1; + +#define shape_constant_id_offset 1 +layout (constant_id = shape_constant_id_offset + 0) const int w = 0; +layout (constant_id = shape_constant_id_offset + 1) const int h = 0; +layout (constant_id = shape_constant_id_offset + 2) const int c = 0; +layout (constant_id = shape_constant_id_offset + 3) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 4) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 5) const int outh = 0; + +layout (binding = 0) readonly buffer bottom_blob { uvec2 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer col_blob { uvec2 col_blob_data[]; }; +layout (binding = 2) readonly buffer weight_blob { uvec2 weight_data[]; }; + +layout (push_constant) uniform parameter +{ + int w; + int h; + int c; + int cstep; + + int outw; + int outh; +} p; + +#define UNROLL_INCH 2 + +shared uvec2 tmp_v0[UNROLL_INCH * 16*4]; +shared uvec2 tmp_v1[UNROLL_INCH * 16*4]; +shared uvec2 tmp_k0[UNROLL_INCH * 16*4]; +shared uvec2 tmp_k1[UNROLL_INCH * 16*4]; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x) / 32 * 2 * 16; + int gy = int(gl_GlobalInvocationID.y) * 2 * 4; + + const int lx = int(gl_LocalInvocationID.x); + + const int lxd16 = lx / 16; // 0 1 + const int lxm16 = lx % 16; // 0 1 2 3 .... 
15 + + fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum0 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f); + fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum1 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f); + fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum2 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f); + fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum3 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f); + + const int N = psc(c) / 4; + + int z = 0; + for (; z + (UNROLL_INCH - 1) < N; z += UNROLL_INCH) + { + { + for (int j = 0; j < 4; j++) + { + const int tmp_i = lxd16*16*4 + lxm16 * 4 + j; + + const int v_offset = ((z + lxd16) * 4 + j) * psc(cstep) + (gx + lxm16); + + tmp_v0[tmp_i] = (gx + lxm16) < psc(outw) ? bottom_blob_data[v_offset] : uvec2(0); + tmp_v1[tmp_i] = (gx + lxm16 + 16) < psc(outw) ? bottom_blob_data[v_offset + 16] : uvec2(0); + + const int w_offset = gy * psc(c) * 4 + (z + lxd16) * 4 * 16 + (lxm16 * 4 + j); + + tmp_k0[tmp_i] = weight_data[w_offset]; + tmp_k1[tmp_i] = weight_data[w_offset + psc(c) * 16]; + } + } + + barrier(); + + for (int z4 = 0; z4 < UNROLL_INCH; z4++) + { + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> A0; + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> A1; + coopMatLoadNV(A0, tmp_v0, z4*16*4, 4, false); + coopMatLoadNV(A1, tmp_v1, z4*16*4, 4, false); + + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> B0; + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> B1; + coopMatLoadNV(B0, tmp_k0, z4*16*4, 4, false); + coopMatLoadNV(B1, tmp_k1, z4*16*4, 4, false); + + // sum += v * k + sum0 = coopMatMulAddNV(A0, B0, sum0); + sum1 = coopMatMulAddNV(A1, B0, sum1); + sum2 = coopMatMulAddNV(A0, B1, sum2); + sum3 = coopMatMulAddNV(A1, B1, sum3); + } + + barrier(); + } + + if (z < N) + { + const int remain = N - z; + + if (lxd16 == 0) + { + for (int j = 0; j < 4; j++) + { + const int tmp_i = lxd16*16*4 + lxm16 * 4 + j; + + const int v_offset = ((z + lxd16) * 4 + j) * psc(cstep) + (gx + lxm16); + + tmp_v0[tmp_i] = (gx + lxm16) < psc(outw) ? 
bottom_blob_data[v_offset] : uvec2(0); + tmp_v1[tmp_i] = (gx + lxm16 + 16) < psc(outw) ? bottom_blob_data[v_offset + 16] : uvec2(0); + + const int w_offset = gy * psc(c) * 4 + (z + lxd16) * 4 * 16 + (lxm16 * 4 + j); + + tmp_k0[tmp_i] = weight_data[w_offset]; + tmp_k1[tmp_i] = weight_data[w_offset + psc(c) * 16]; + } + } + + barrier(); + + for (int z4 = 0; z4 < remain; z4++) + { + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> A0; + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> A1; + coopMatLoadNV(A0, tmp_v0, z4*16*4, 4, false); + coopMatLoadNV(A1, tmp_v1, z4*16*4, 4, false); + + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> B0; + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> B1; + coopMatLoadNV(B0, tmp_k0, z4*16*4, 4, false); + coopMatLoadNV(B1, tmp_k1, z4*16*4, 4, false); + + // sum += v * k + sum0 = coopMatMulAddNV(A0, B0, sum0); + sum1 = coopMatMulAddNV(A1, B0, sum1); + sum2 = coopMatMulAddNV(A0, B1, sum2); + sum3 = coopMatMulAddNV(A1, B1, sum3); + } + + barrier(); + } + + if (gx >= psc(outw) || gy >= psc(outh)) + return; + + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum0_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum0); + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum1_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum1); + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum2_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum2); + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum3_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum3); + + coopMatStoreNV(sum0_fp16, tmp_v0, 0, 4, false); + coopMatStoreNV(sum1_fp16, tmp_v1, 0, 4, false); + coopMatStoreNV(sum2_fp16, tmp_v0, 16*4, 4, false); + coopMatStoreNV(sum3_fp16, tmp_v1, 16*4, 4, false); + + barrier(); + + { + for (int j = 0; j < 4; j++) + { + const int tmp_vi = lxm16 * 4 + j + lxd16*16*4; + + const int gi = ((gy / 4 + lxd16) / maxk * maxk * 4 + (gy / 4 + lxd16) % maxk) * psc(outw) + j * maxk * psc(outw) + (gx + lxm16); + + if (gx + lxm16 < psc(outw)) col_blob_data[gi] = tmp_v0[tmp_vi]; + if (gx + lxm16 + 16 < psc(outw)) col_blob_data[gi 
+ 16] = tmp_v1[tmp_vi]; + } + } +} diff --git a/src/layer/vulkan/shader/deconvolution_pack4_gemm_nv_cm_16_8_8.comp b/src/layer/vulkan/shader/deconvolution_pack4_gemm_nv_cm_16_8_8.comp new file mode 100644 index 00000000000..4d7a03e829f --- /dev/null +++ b/src/layer/vulkan/shader/deconvolution_pack4_gemm_nv_cm_16_8_8.comp @@ -0,0 +1,239 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +#extension GL_KHR_memory_scope_semantics: require +#extension GL_EXT_shader_explicit_arithmetic_types: require +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#extension GL_NV_cooperative_matrix: require + +layout (constant_id = 0) const int maxk = 1; + +#define shape_constant_id_offset 1 +layout (constant_id = shape_constant_id_offset + 0) const int w = 0; +layout (constant_id = shape_constant_id_offset + 1) const int h = 0; +layout (constant_id = shape_constant_id_offset + 2) const int c = 0; +layout (constant_id = shape_constant_id_offset + 3) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 4) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 5) const int outh = 0; + +layout (binding = 0) readonly buffer bottom_blob { uvec2 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer col_blob { uvec2 col_blob_data[]; }; +layout (binding = 2) readonly buffer weight_blob { uvec2 weight_data[]; }; + +layout (push_constant) uniform parameter +{ + int w; + int h; + int c; + int cstep; + + int outw; + int outh; +} p; + +#define UNROLL_INCH 4 + +shared uvec2 tmp_v0[UNROLL_INCH * 16*2]; +shared uvec2 tmp_v1[UNROLL_INCH * 16*2]; +shared uvec2 tmp_k0[UNROLL_INCH * 8*2]; +shared uvec2 tmp_k1[UNROLL_INCH * 8*2]; +shared uvec2 tmp_k2[UNROLL_INCH * 8*2]; +shared uvec2 tmp_k3[UNROLL_INCH * 8*2]; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x) / 32 * 2 * 16; + int gy = int(gl_GlobalInvocationID.y) * 2 * 4; + + const int lx = int(gl_LocalInvocationID.x); + + const int lxd8 = lx / 8; // 0 1 2 3 + const int lxm8 = lx % 8; // 0 1 2 3 .... 
7 + + fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum0 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); + fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum1 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); + fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum2 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); + fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum3 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); + fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum4 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); + fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum5 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); + fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum6 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); + fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum7 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); + + const int N = psc(c) / 2; + + int z = 0; + for (; z + (UNROLL_INCH - 1) < N; z += UNROLL_INCH) + { + { + for (int j = 0; j < 2; j++) + { + const int tmp_vi = lxd8*16*2 + lxm8 * 2 + j; + + int v_offset = ((z + lxd8) * 2 + j) * psc(cstep) + (gx + lxm8); + + tmp_v0[tmp_vi] = (gx + lxm8) < psc(outw) ? bottom_blob_data[v_offset] : uvec2(0); + tmp_v0[tmp_vi + 16] = (gx + lxm8 + 8) < psc(outw) ? bottom_blob_data[v_offset + 8] : uvec2(0); + tmp_v1[tmp_vi] = (gx + lxm8 + 16) < psc(outw) ? bottom_blob_data[v_offset + 16] : uvec2(0); + tmp_v1[tmp_vi + 16] = (gx + lxm8 + 24) < psc(outw) ? 
bottom_blob_data[v_offset + 24] : uvec2(0); + + const int tmp_ki = lxd8*8*2 + lxm8 * 2 + j; + + int w_offset = gy * psc(c) * 4 + (z + lxd8) * 2 * 8 + (lxm8 * 2 + j); + + tmp_k0[tmp_ki] = weight_data[w_offset]; + tmp_k1[tmp_ki] = weight_data[w_offset + psc(c) * 8]; + tmp_k2[tmp_ki] = weight_data[w_offset + psc(c) * 16]; + tmp_k3[tmp_ki] = weight_data[w_offset + psc(c) * 24]; + } + } + + barrier(); + + for (int z4 = 0; z4 < UNROLL_INCH; z4++) + { + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A0; + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A1; + coopMatLoadNV(A0, tmp_v0, z4*16*2, 2, false); + coopMatLoadNV(A1, tmp_v1, z4*16*2, 2, false); + + fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B0; + fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B1; + fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B2; + fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B3; + coopMatLoadNV(B0, tmp_k0, z4*8*2, 2, false); + coopMatLoadNV(B1, tmp_k1, z4*8*2, 2, false); + coopMatLoadNV(B2, tmp_k2, z4*8*2, 2, false); + coopMatLoadNV(B3, tmp_k3, z4*8*2, 2, false); + + // sum += v * k + sum0 = coopMatMulAddNV(A0, B0, sum0); + sum1 = coopMatMulAddNV(A1, B0, sum1); + sum2 = coopMatMulAddNV(A0, B1, sum2); + sum3 = coopMatMulAddNV(A1, B1, sum3); + sum4 = coopMatMulAddNV(A0, B2, sum4); + sum5 = coopMatMulAddNV(A1, B2, sum5); + sum6 = coopMatMulAddNV(A0, B3, sum6); + sum7 = coopMatMulAddNV(A1, B3, sum7); + } + + barrier(); + } + + if (z < N) + { + const int remain = N - z; + + if (lxd8 < remain) + { + for (int j = 0; j < 2; j++) + { + const int tmp_vi = lxd8*16*2 + lxm8 * 2 + j; + + int v_offset = ((z + lxd8) * 2 + j) * psc(cstep) + (gx + lxm8); + + tmp_v0[tmp_vi] = (gx + lxm8) < psc(outw) ? bottom_blob_data[v_offset] : uvec2(0); + tmp_v0[tmp_vi + 16] = (gx + lxm8 + 8) < psc(outw) ? bottom_blob_data[v_offset + 8] : uvec2(0); + tmp_v1[tmp_vi] = (gx + lxm8 + 16) < psc(outw) ? bottom_blob_data[v_offset + 16] : uvec2(0); + tmp_v1[tmp_vi + 16] = (gx + lxm8 + 24) < psc(outw) ? 
bottom_blob_data[v_offset + 24] : uvec2(0); + + const int tmp_ki = lxd8*8*2 + lxm8 * 2 + j; + + int w_offset = gy * psc(c) * 4 + (z + lxd8) * 2 * 8 + (lxm8 * 2 + j); + + tmp_k0[tmp_ki] = weight_data[w_offset]; + tmp_k1[tmp_ki] = weight_data[w_offset + psc(c) * 8]; + tmp_k2[tmp_ki] = weight_data[w_offset + psc(c) * 16]; + tmp_k3[tmp_ki] = weight_data[w_offset + psc(c) * 24]; + } + } + + barrier(); + + for (int z4 = 0; z4 < remain; z4++) + { + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A0; + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A1; + coopMatLoadNV(A0, tmp_v0, z4*16*2, 2, false); + coopMatLoadNV(A1, tmp_v1, z4*16*2, 2, false); + + fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B0; + fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B1; + fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B2; + fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B3; + coopMatLoadNV(B0, tmp_k0, z4*8*2, 2, false); + coopMatLoadNV(B1, tmp_k1, z4*8*2, 2, false); + coopMatLoadNV(B2, tmp_k2, z4*8*2, 2, false); + coopMatLoadNV(B3, tmp_k3, z4*8*2, 2, false); + + // sum += v * k + sum0 = coopMatMulAddNV(A0, B0, sum0); + sum1 = coopMatMulAddNV(A1, B0, sum1); + sum2 = coopMatMulAddNV(A0, B1, sum2); + sum3 = coopMatMulAddNV(A1, B1, sum3); + sum4 = coopMatMulAddNV(A0, B2, sum4); + sum5 = coopMatMulAddNV(A1, B2, sum5); + sum6 = coopMatMulAddNV(A0, B3, sum6); + sum7 = coopMatMulAddNV(A1, B3, sum7); + } + + barrier(); + } + + if (gx >= psc(outw) || gy >= psc(outh)) + return; + + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum0_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum0); + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum1_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum1); + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum2_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum2); + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum3_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum3); + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum4_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum4); + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum5_fp16 = fcoopmatNV<16, 
gl_ScopeSubgroup, 16, 8>(sum5); + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum6_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum6); + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum7_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum7); + + coopMatStoreNV(sum0_fp16, tmp_v0, 0, 2, false); + coopMatStoreNV(sum1_fp16, tmp_v1, 0, 2, false); + coopMatStoreNV(sum2_fp16, tmp_v0, 16*2, 2, false); + coopMatStoreNV(sum3_fp16, tmp_v1, 16*2, 2, false); + coopMatStoreNV(sum4_fp16, tmp_v0, 16*4, 2, false); + coopMatStoreNV(sum5_fp16, tmp_v1, 16*4, 2, false); + coopMatStoreNV(sum6_fp16, tmp_v0, 16*6, 2, false); + coopMatStoreNV(sum7_fp16, tmp_v1, 16*6, 2, false); + + barrier(); + + const int lxd16 = lx / 16; // 0 1 + const int lxm16 = lx % 16; // 0 1 2 3 .... 15 + + { + for (int j = 0; j < 4; j++) + { + const int tmp_vi = lxm16 * 2 + lxd16 + j*16*2; + const int gi = ((gy / 2 + j) / maxk * maxk * 2 + (gy / 2 + j) % maxk) * psc(outw) + lxd16 * maxk * psc(outw) + (gx + lxm16); + + if (gx + lxm16 < psc(outw)) col_blob_data[gi] = tmp_v0[tmp_vi]; + if (gx + lxm16 + 16 < psc(outw)) col_blob_data[gi + 16] = tmp_v1[tmp_vi]; + } + } +} diff --git a/src/vulkan_header_fix.h b/src/vulkan_header_fix.h index cd1efed46bc..0a5ea9bbd0a 100644 --- a/src/vulkan_header_fix.h +++ b/src/vulkan_header_fix.h @@ -389,4 +389,61 @@ typedef enum VkInstanceCreateFlagBits } VkInstanceCreateFlagBits; #endif // VK_HEADER_VERSION < 208 +#if VK_HEADER_VERSION < 255 +#define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_MATRIX_FEATURES_KHR (VkStructureType)1000506000 +#define VK_STRUCTURE_TYPE_COOPERATIVE_MATRIX_PROPERTIES_KHR (VkStructureType)1000506001 +#define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_MATRIX_PROPERTIES_KHR (VkStructureType)1000506002 +typedef enum VkComponentTypeKHR +{ + VK_COMPONENT_TYPE_FLOAT16_KHR = 0, + VK_COMPONENT_TYPE_FLOAT32_KHR = 1, + VK_COMPONENT_TYPE_FLOAT64_KHR = 2, + VK_COMPONENT_TYPE_SINT8_KHR = 3, + VK_COMPONENT_TYPE_SINT16_KHR = 4, + VK_COMPONENT_TYPE_SINT32_KHR = 5, + 
VK_COMPONENT_TYPE_SINT64_KHR = 6, + VK_COMPONENT_TYPE_UINT8_KHR = 7, + VK_COMPONENT_TYPE_UINT16_KHR = 8, + VK_COMPONENT_TYPE_UINT32_KHR = 9, + VK_COMPONENT_TYPE_UINT64_KHR = 10, + VK_COMPONENT_TYPE_MAX_ENUM_KHR = 0x7FFFFFFF +} VkComponentTypeKHR; +typedef enum VkScopeKHR +{ + VK_SCOPE_DEVICE_KHR = 1, + VK_SCOPE_WORKGROUP_KHR = 2, + VK_SCOPE_SUBGROUP_KHR = 3, + VK_SCOPE_QUEUE_FAMILY_KHR = 5, + VK_SCOPE_MAX_ENUM_KHR = 0x7FFFFFFF +} VkScopeKHR; +typedef struct VkCooperativeMatrixPropertiesKHR +{ + VkStructureType sType; + void* pNext; + uint32_t MSize; + uint32_t NSize; + uint32_t KSize; + VkComponentTypeKHR AType; + VkComponentTypeKHR BType; + VkComponentTypeKHR CType; + VkComponentTypeKHR ResultType; + VkBool32 saturatingAccumulation; + VkScopeKHR scope; +} VkCooperativeMatrixPropertiesKHR; +typedef struct VkPhysicalDeviceCooperativeMatrixFeaturesKHR +{ + VkStructureType sType; + void* pNext; + VkBool32 cooperativeMatrix; + VkBool32 cooperativeMatrixRobustBufferAccess; +} VkPhysicalDeviceCooperativeMatrixFeaturesKHR; +typedef struct VkPhysicalDeviceCooperativeMatrixPropertiesKHR +{ + VkStructureType sType; + void* pNext; + VkShaderStageFlags cooperativeMatrixSupportedStages; +} VkPhysicalDeviceCooperativeMatrixPropertiesKHR; +typedef VkResult(VKAPI_PTR* PFN_vkGetPhysicalDeviceCooperativeMatrixPropertiesKHR)(VkPhysicalDevice physicalDevice, uint32_t* pPropertyCount, VkCooperativeMatrixPropertiesKHR* pProperties); +#endif // VK_HEADER_VERSION < 255 + #endif // NCNN_VULKAN_HEADER_FIX_H diff --git a/tests/testutil.h b/tests/testutil.h index 1a86c639bab..b879fa527fb 100644 --- a/tests/testutil.h +++ b/tests/testutil.h @@ -1508,7 +1508,8 @@ int test_layer(const char* layer_type, const ncnn::ParamDict& pd, const std::vec {1, 0, 0, 0, 0, 0, 0}, {1, 1, 0, 0, 1, 0, 0}, {1, 0, 1, 0, 0, 1, 0}, - {1, 1, 1, 1, 0, 1, 1}, + {1, 1, 1, 1, 0, 0, 0}, + {1, 1, 1, 1, 1, 1, 1}, }; const int opt_count = sizeof(options) / sizeof(options[0]); @@ -1544,7 +1545,8 @@ int test_layer(const char* 
layer_type, const ncnn::ParamDict& pd, const std::vec {1, 0, 0, 0, 0, 0, 0}, {1, 1, 0, 0, 1, 0, 0}, {1, 0, 1, 0, 0, 1, 0}, - {1, 1, 1, 1, 0, 1, 1}, + {1, 1, 1, 1, 0, 0, 0}, + {1, 1, 1, 1, 1, 1, 1}, }; const int opt_count = sizeof(options) / sizeof(options[0]);