add vector optimization for loongarch64 (#4242)
junchao-loongson authored Nov 11, 2022
1 parent a2af636 commit 279222c
Showing 137 changed files with 31,155 additions and 3 deletions.
15 changes: 13 additions & 2 deletions CMakeLists.txt
@@ -290,6 +290,19 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(mips)")
else()
message(WARNING "The compiler does not support loongson mmi extension. NCNN_MMI will be OFF.")
endif()
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(loongarch64|loongarch32)")
set(NCNN_TARGET_ARCH loongarch)

include(CheckCXXCompilerFlag)

check_cxx_compiler_flag("-mlsx" NCNN_COMPILER_SUPPORT_LOONGARCH_LSX)

if(NCNN_COMPILER_SUPPORT_LOONGARCH_LSX)
option(NCNN_LSX "optimize loongarch platform with lsx extension" ON)
else()
message(WARNING "The compiler does not support lsx extension. NCNN_LSX will be OFF.")
endif()

elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(riscv)")
set(NCNN_TARGET_ARCH riscv)

@@ -332,8 +345,6 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(riscv)")
endif()
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(powerpc|ppc)")
set(NCNN_TARGET_ARCH powerpc)
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(loongarch)")
set(NCNN_TARGET_ARCH mips)
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(xtensa)")
set(NCNN_TARGET_ARCH xtensa)
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(s390x)")
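Note: when -mlsx is accepted, GCC's LoongArch port also defines the __loongarch_sx preprocessor macro, which is the guard the new source files in this commit use around their intrinsics. A minimal standalone sketch of that guard pattern (a hypothetical file, not part of this commit; assumes g++ with -mlsx):

// lsx_guard_sketch.cpp -- hedged example mirroring the absval kernel below.
// Build: g++ -mlsx lsx_guard_sketch.cpp
#if __loongarch_sx
#include <lsxintrin.h>
#endif

int main()
{
#if __loongarch_sx
    float buf[4] = {-1.f, 2.f, -3.f, 4.f};
    __m128i _p = __lsx_vld(buf, 0);  // load four 32-bit lanes
    _p = __lsx_vbitclri_w(_p, 31);   // clear each sign bit (abs)
    __lsx_vst(_p, buf, 0);           // store the result back
#endif
    return 0;
}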
6 changes: 6 additions & 0 deletions cmake/ncnn_add_layer.cmake
@@ -270,6 +270,12 @@ macro(ncnn_add_layer class)
endif()
endif()

if(NCNN_RUNTIME_CPU AND NCNN_TARGET_ARCH STREQUAL "loongarch")
if(NCNN_LSX)
ncnn_add_arch_opt_layer(${class} lsx "-mlsx")
endif()
endif()

if(NCNN_RUNTIME_CPU AND NCNN_RVV AND NCNN_TARGET_ARCH STREQUAL "riscv")
if(NCNN_COMPILER_SUPPORT_RVV_ZFH)
ncnn_add_arch_opt_layer(${class} rvv "-march=rv64gcv_zfh")
14 changes: 14 additions & 0 deletions cmake/ncnn_generate_lsx_source.cmake
@@ -0,0 +1,14 @@

# must define SRC DST CLASS

file(READ ${SRC} source_data)

# replace
string(TOUPPER ${CLASS} CLASS_UPPER)
string(TOLOWER ${CLASS} CLASS_LOWER)

string(REGEX REPLACE "LAYER_${CLASS_UPPER}_LOONGARCH_H" "LAYER_${CLASS_UPPER}_LOONGARCH_LSX_H" source_data "${source_data}")
string(REGEX REPLACE "${CLASS}_loongarch" "${CLASS}_loongarch_lsx" source_data "${source_data}")
string(REGEX REPLACE "#include \"${CLASS_LOWER}_loongarch.h\"" "#include \"${CLASS_LOWER}_loongarch_lsx.h\"" source_data "${source_data}")

file(WRITE ${DST} "${source_data}")
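The three replacements retarget the include guard, the class name, and the header include, so the copied source compiles as a separate translation unit built with -mlsx. Applied to absval_loongarch.h (shown later in this commit), the script would emit roughly the following header; this is a sketch of the generated output, not a file in the tree:

#ifndef LAYER_ABSVAL_LOONGARCH_LSX_H
#define LAYER_ABSVAL_LOONGARCH_LSX_H

#include "absval.h"

namespace ncnn {

class AbsVal_loongarch_lsx : virtual public AbsVal
{
public:
    AbsVal_loongarch_lsx();

    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
};

} // namespace ncnn

#endif // LAYER_ABSVAL_LOONGARCH_LSX_H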
6 changes: 6 additions & 0 deletions src/CMakeLists.txt
@@ -460,6 +460,12 @@ if(NCNN_TARGET_ARCH STREQUAL "mips")
endif()
endif()

if(NCNN_TARGET_ARCH STREQUAL "loongarch")
if(NOT NCNN_RUNTIME_CPU AND NCNN_LSX)
target_compile_options(ncnn PRIVATE -mlsx)
endif()
endif()

if(NCNN_TARGET_ARCH STREQUAL "riscv" AND NOT C906)
if(NOT NCNN_RUNTIME_CPU AND NCNN_RVV)
if(NCNN_COMPILER_SUPPORT_RVV_ZFH)
34 changes: 33 additions & 1 deletion src/cpu.cpp
@@ -173,7 +173,7 @@ static unsigned int get_elf_hwcap_from_proc_self_auxv(unsigned int type)
return 0;
}

#if __aarch64__ || __mips64 || __riscv_xlen == 64
#if __aarch64__ || __mips64 || __riscv_xlen == 64 || __loongarch64
struct
{
uint64_t tag;
@@ -250,6 +250,12 @@ static unsigned int g_hwcaps2 = get_elf_hwcap(AT_HWCAP2);
#define HWCAP_LOONGSON_MMI (1 << 11)
#endif

#if __loongarch64
// from arch/loongarch/include/uapi/asm/hwcap.h
#define HWCAP_LOONGARCH_LSX (1 << 4)
#define HWCAP_LOONGARCH_LASX (1 << 5)
#endif

#if __riscv
// from arch/riscv/include/uapi/asm/hwcap.h
#define COMPAT_HWCAP_ISA_F (1 << ('F' - 'A'))
@@ -1084,6 +1090,32 @@ int cpu_support_mips_msa()
#endif
}

int cpu_support_loongarch_lsx()
{
#if defined __ANDROID__ || defined __linux__
#if __loongarch64
return g_hwcaps & HWCAP_LOONGARCH_LSX;
#else
return 0;
#endif
#else
return 0;
#endif
}

int cpu_support_loongarch_lasx()
{
#if defined __ANDROID__ || defined __linux__
#if __loongarch64
return g_hwcaps & HWCAP_LOONGARCH_LASX;
#else
return 0;
#endif
#else
return 0;
#endif
}

int cpu_support_loongson_mmi()
{
#if defined __ANDROID__ || defined __linux__
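Both helpers just test bits of g_hwcaps, which ncnn fills by parsing /proc/self/auxv at startup (get_elf_hwcap_from_proc_self_auxv above). A self-contained sketch of the same probe, assuming a Linux target where glibc's getauxval is available as a shortcut:

#include <cstdio>
#include <sys/auxv.h> // getauxval, AT_HWCAP

// bit positions from arch/loongarch/include/uapi/asm/hwcap.h
#define HWCAP_LOONGARCH_LSX  (1 << 4)
#define HWCAP_LOONGARCH_LASX (1 << 5)

int main()
{
    unsigned long hwcap = getauxval(AT_HWCAP); // kernel-reported ISA bits
    std::printf("lsx:  %s\n", (hwcap & HWCAP_LOONGARCH_LSX) ? "yes" : "no");
    std::printf("lasx: %s\n", (hwcap & HWCAP_LOONGARCH_LASX) ? "yes" : "no");
    return 0;
}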
5 changes: 5 additions & 0 deletions src/cpu.h
@@ -100,6 +100,11 @@ NCNN_EXPORT int cpu_support_x86_avx512_bf16();
// avx512_fp16 = x86 avx512 fp16
NCNN_EXPORT int cpu_support_x86_avx512_fp16();

// lsx = loongarch lsx
NCNN_EXPORT int cpu_support_loongarch_lsx();
// lasx = loongarch lasx
NCNN_EXPORT int cpu_support_loongarch_lasx();

// msa = mips msa
NCNN_EXPORT int cpu_support_mips_msa();
// mmi = loongson mmi
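These probes are safe to call on any build: on non-LoongArch targets they compile to return 0. A usage sketch, assuming ncnn's include directory is on the include path:

#include "cpu.h"
#include <cstdio>

int main()
{
    // Nonzero only when the kernel reports the corresponding hwcap bit.
    if (ncnn::cpu_support_loongarch_lsx())
        std::printf("LSX kernels available\n");
    if (ncnn::cpu_support_loongarch_lasx())
        std::printf("LASX reported by the CPU\n");
    return 0;
}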
7 changes: 7 additions & 0 deletions src/layer.cpp
@@ -253,6 +253,13 @@ Layer* create_layer(int index)
}
else
#endif // NCNN_RUNTIME_CPU && NCNN_AVX
#if NCNN_RUNTIME_CPU && NCNN_LSX
if (ncnn::cpu_support_loongarch_lsx())
{
layer_creator = layer_registry_lsx[index].creator;
}
else
#endif // NCNN_RUNTIME_CPU && NCNN_LSX
#if NCNN_RUNTIME_CPU && NCNN_MSA
if (ncnn::cpu_support_mips_msa())
{
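layer_registry_lsx is a build-time table with one creator per layer class, pointing at the _lsx objects produced via ncnn_generate_lsx_source.cmake; the cascade above picks the first table whose runtime ISA probe passes. A simplified, self-contained sketch of that dispatch shape (all names here are illustrative, not ncnn's internals):

#include <cstdio>

struct Layer { const char* name; };
typedef Layer* (*creator_func)();

static Layer* create_absval_generic() { static Layer l = {"AbsVal"};     return &l; }
static Layer* create_absval_lsx()     { static Layer l = {"AbsVal_lsx"}; return &l; }

// one creator table per ISA variant, indexed by layer type
static creator_func registry_default[] = {create_absval_generic};
static creator_func registry_lsx[]     = {create_absval_lsx};

static int cpu_has_lsx() { return 0; } // stand-in for cpu_support_loongarch_lsx()

static Layer* make_layer(int index)
{
    // prefer the LSX build when the CPU supports it, else fall back
    creator_func c = cpu_has_lsx() ? registry_lsx[index] : registry_default[index];
    return c();
}

int main()
{
    std::printf("%s\n", make_layer(0)->name); // prints "AbsVal" here
    return 0;
}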
67 changes: 67 additions & 0 deletions src/layer/loongarch/absval_loongarch.cpp
@@ -0,0 +1,67 @@
// yala is pleased to support the open source community by making ncnn available.
//
//
// Copyright (C) 2022 yala <zhaojunchao@loongson.cn>;<junchao82@qq.com>. All rights reserved.
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#include "absval_loongarch.h"

#if __loongarch_sx
#include <lsxintrin.h>
#endif // __loongarch_sx

namespace ncnn {

AbsVal_loongarch::AbsVal_loongarch()
{
#if __loongarch_sx
support_packing = true;
#endif
}

int AbsVal_loongarch::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
int w = bottom_top_blob.w;
int h = bottom_top_blob.h;
int d = bottom_top_blob.d;
int channels = bottom_top_blob.c;
int elempack = bottom_top_blob.elempack;
int size = w * h * d * elempack;

#pragma omp parallel for num_threads(opt.num_threads)
for (int q = 0; q < channels; q++)
{
float* ptr = bottom_top_blob.channel(q);

int i = 0;
#if __loongarch_sx
for (; i + 3 < size; i += 4)
{
__builtin_prefetch(ptr + 16);
__m128i _p = __lsx_vld(ptr, 0);
__m128i _outp = __lsx_vbitclri_w(_p, 31);
__lsx_vst(_outp, ptr, 0);

ptr += 4;
}
#endif // __loongarch_sx
for (; i < size; i++)
{
*ptr = *ptr > 0 ? *ptr : -*ptr;

ptr++;
}
}

return 0;
}

} // namespace ncnn
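The kernel works because __lsx_vbitclri_w(_p, 31) clears bit 31 of each 32-bit lane, and an IEEE-754 float stores its sign in exactly that bit, so the operation is fabsf on four floats at once. The scalar equivalent, as a sketch:

#include <cstdint>
#include <cstdio>
#include <cstring>

// scalar twin of __lsx_vbitclri_w(_p, 31)
static float abs_by_bitclear(float x)
{
    uint32_t u;
    std::memcpy(&u, &x, sizeof u); // type-pun without UB
    u &= 0x7fffffffu;              // clear bit 31, the IEEE-754 sign bit
    std::memcpy(&x, &u, sizeof x);
    return x;
}

int main()
{
    std::printf("%g %g\n", abs_by_bitclear(-3.5f), abs_by_bitclear(2.0f)); // 3.5 2
    return 0;
}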
32 changes: 32 additions & 0 deletions src/layer/loongarch/absval_loongarch.h
@@ -0,0 +1,32 @@
// yala is pleased to support the open source community by making ncnn available.
//
//
// Copyright (C) 2022 yala <zhaojunchao@loongson.cn>;<junchao82@qq.com>. All rights reserved.
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#ifndef LAYER_ABSVAL_LOONGARCH_H
#define LAYER_ABSVAL_LOONGARCH_H

#include "absval.h"

namespace ncnn {

class AbsVal_loongarch : virtual public AbsVal
{
public:
AbsVal_loongarch();

virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
};

} // namespace ncnn

#endif // LAYER_ABSVAL_LOONGARCH_H
145 changes: 145 additions & 0 deletions src/layer/loongarch/batchnorm_loongarch.cpp
@@ -0,0 +1,145 @@
// yala is pleased to support the open source community by making ncnn available.
//
//
// Copyright (C) 2022 yala <zhaojunchao@loongson.cn>;<junchao82@qq.com>. All rights reserved.
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#include "batchnorm_loongarch.h"

#if __loongarch_sx
#include <lsxintrin.h>
#endif // __loongarch_sx

#include "loongarch_usability.h"

namespace ncnn {

BatchNorm_loongarch::BatchNorm_loongarch()
{
#if __loongarch_sx
support_packing = true;
#endif // __loongarch_sx
}

int BatchNorm_loongarch::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
int dims = bottom_top_blob.dims;
int elempack = bottom_top_blob.elempack;

if (dims == 1)
{
int w = bottom_top_blob.w * elempack;

#if __loongarch_sx
int nn_w = w / 4;
int remain_w_start = nn_w * 4;
#else
int remain_w_start = 0;
#endif // __loongarch_sx

float* ptr = bottom_top_blob;

#if __loongarch_sx
#pragma omp parallel for num_threads(opt.num_threads)
for (int i = 0; i < nn_w; i++)
{
float* ptr0 = ptr + i * 4;

__m128 _p = (__m128)__lsx_vld(ptr0, 0);
__m128 _a = (__m128)__lsx_vld((const float*)a_data + i * 4, 0);
__m128 _b = (__m128)__lsx_vld((const float*)b_data + i * 4, 0);
_p = __lsx_vfmadd_s(_b, _p, _a);
__lsx_vst(_p, ptr0, 0);
}
#endif // __loongarch_sx

#pragma omp parallel for num_threads(opt.num_threads)
for (int i = remain_w_start; i < w; i++)
{
ptr[i] = b_data[i] * ptr[i] + a_data[i];
}
}

if (dims == 2)
{
int w = bottom_top_blob.w * elempack;
int h = bottom_top_blob.h;

#pragma omp parallel for num_threads(opt.num_threads)
for (int i = 0; i < h; i++)
{
float* ptr = bottom_top_blob.row(i);
float a = a_data[i];
float b = b_data[i];

int j = 0;
#if __loongarch_sx
__m128 _a = elempack == 4 ? (__m128)__lsx_vld((const float*)a_data + i * 4, 0) : (__m128)__lsx_vreplfr2vr_s(a);
__m128 _b = elempack == 4 ? (__m128)__lsx_vld((const float*)b_data + i * 4, 0) : (__m128)__lsx_vreplfr2vr_s(b);
for (; j + 3 < w; j += 4)
{
__builtin_prefetch(ptr + 16);
__m128 _p = (__m128)__lsx_vld(ptr, 0);
_p = __lsx_vfmadd_s(_b, _p, _a);
__lsx_vst(_p, ptr, 0);

ptr += 4;
}
#endif // __loongarch_sx
for (; j < w; j++)
{
*ptr = b * *ptr + a;
ptr++;
}
}
}

if (dims == 3 || dims == 4)
{
int w = bottom_top_blob.w;
int h = bottom_top_blob.h;
int d = bottom_top_blob.d;
int c = bottom_top_blob.c;
int size = w * h * d * elempack;

#pragma omp parallel for num_threads(opt.num_threads)
for (int q = 0; q < c; q++)
{
float* ptr = bottom_top_blob.channel(q);
float a = a_data[q];
float b = b_data[q];

int i = 0;
#if __loongarch_sx
__m128 _a = elempack == 4 ? (__m128)__lsx_vld((const float*)a_data + q * 4, 0) : (__m128)__lsx_vreplfr2vr_s(a);
__m128 _b = elempack == 4 ? (__m128)__lsx_vld((const float*)b_data + q * 4, 0) : (__m128)__lsx_vreplfr2vr_s(b);
for (; i + 3 < size; i += 4)
{
__builtin_prefetch(ptr + 16);
__m128 _p = (__m128)__lsx_vld(ptr, 0);
_p = __lsx_vfmadd_s(_b, _p, _a);
__lsx_vst(_p, ptr, 0);

ptr += 4;
}
#endif // __loongarch_sx
for (; i < size; i++)
{
*ptr = b * *ptr + a;
ptr++;
}
}
}

return 0;
}

} // namespace ncnn
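All three dimensionality paths apply the same per-channel affine map y = b*x + a, with __lsx_vfmadd_s fusing the multiply and add four lanes at a time. In ncnn the coefficients come from the base BatchNorm layer at model-load time (roughly b = slope/sqrt(var + eps) and a = bias - b*mean; that derivation lives outside this diff, so treat it as an assumption). A scalar reference for one channel of the dims == 3 path:

#include <cstdio>

// scalar twin of the vectorized channel loop above: x -> b*x + a
static void batchnorm_channel(float* ptr, int size, float a, float b)
{
    for (int i = 0; i < size; i++)
        ptr[i] = b * ptr[i] + a; // same math as the __lsx_vfmadd_s lanes
}

int main()
{
    float chan[4] = {1.f, 2.f, 3.f, 4.f};
    batchnorm_channel(chan, 4, 0.5f, 2.0f);
    std::printf("%g %g %g %g\n", chan[0], chan[1], chan[2], chan[3]); // 2.5 4.5 6.5 8.5
    return 0;
}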