add vector optimization for loongarch64 (#4242)
junchao-loongson authored Nov 11, 2022
1 parent a2af636 commit 279222c
Showing 137 changed files with 31,155 additions and 3 deletions.
15 changes: 13 additions & 2 deletions CMakeLists.txt
@@ -290,6 +290,19 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(mips)")
else()
message(WARNING "The compiler does not support loongson mmi extension. NCNN_MMI will be OFF.")
endif()
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(loongarch64|loongarch32)")
set(NCNN_TARGET_ARCH loongarch)

include(CheckCXXCompilerFlag)

check_cxx_compiler_flag("-mlsx" NCNN_COMPILER_SUPPORT_LOONGARCH_LSX)

if(NCNN_COMPILER_SUPPORT_LOONGARCH_LSX)
option(NCNN_LSX "optimize loongarch platform with lsx extension" ON)
else()
message(WARNING "The compiler does not support lsx extension. NCNN_LSX will be OFF.")
endif()

elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(riscv)")
set(NCNN_TARGET_ARCH riscv)

@@ -332,8 +345,6 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(riscv)")
endif()
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(powerpc|ppc)")
set(NCNN_TARGET_ARCH powerpc)
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(loongarch)")
set(NCNN_TARGET_ARCH mips)
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(xtensa)")
set(NCNN_TARGET_ARCH xtensa)
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(s390x)")
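Note: when -mlsx is accepted, GCC's LoongArch port also defines the __loongarch_sx preprocessor macro, which is the guard the new source files in this commit use around their intrinsics. A minimal standalone sketch of that guard pattern (a hypothetical file, not part of this commit; assumes g++ with -mlsx):

// lsx_guard_sketch.cpp -- hedged example mirroring the absval kernel below.
// Build: g++ -mlsx lsx_guard_sketch.cpp
#if __loongarch_sx
#include <lsxintrin.h>
#endif

int main()
{
#if __loongarch_sx
    float buf[4] = {-1.f, 2.f, -3.f, 4.f};
    __m128i _p = __lsx_vld(buf, 0);  // load four 32-bit lanes
    _p = __lsx_vbitclri_w(_p, 31);   // clear each sign bit (abs)
    __lsx_vst(_p, buf, 0);           // store the result back
#endif
    return 0;
}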
6 changes: 6 additions & 0 deletions cmake/ncnn_add_layer.cmake
@@ -270,6 +270,12 @@ macro(ncnn_add_layer class)
endif()
endif()

if(NCNN_RUNTIME_CPU AND NCNN_TARGET_ARCH STREQUAL "loongarch")
if(NCNN_LSX)
ncnn_add_arch_opt_layer(${class} lsx "-mlsx")
endif()
endif()

if(NCNN_RUNTIME_CPU AND NCNN_RVV AND NCNN_TARGET_ARCH STREQUAL "riscv")
if(NCNN_COMPILER_SUPPORT_RVV_ZFH)
ncnn_add_arch_opt_layer(${class} rvv "-march=rv64gcv_zfh")
14 changes: 14 additions & 0 deletions cmake/ncnn_generate_lsx_source.cmake
@@ -0,0 +1,14 @@

# must define SRC DST CLASS

file(READ ${SRC} source_data)

# replace
string(TOUPPER ${CLASS} CLASS_UPPER)
string(TOLOWER ${CLASS} CLASS_LOWER)

string(REGEX REPLACE "LAYER_${CLASS_UPPER}_LOONGARCH_H" "LAYER_${CLASS_UPPER}_LOONGARCH_LSX_H" source_data "${source_data}")
string(REGEX REPLACE "${CLASS}_loongarch" "${CLASS}_loongarch_lsx" source_data "${source_data}")
string(REGEX REPLACE "#include \"${CLASS_LOWER}_loongarch.h\"" "#include \"${CLASS_LOWER}_loongarch_lsx.h\"" source_data "${source_data}")

file(WRITE ${DST} "${source_data}")
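The three replacements retarget the include guard, the class name, and the header include, so the copied source compiles as a separate translation unit built with -mlsx. Applied to absval_loongarch.h (shown later in this commit), the script would emit roughly the following header; this is a sketch of the generated output, not a file in the tree:

#ifndef LAYER_ABSVAL_LOONGARCH_LSX_H
#define LAYER_ABSVAL_LOONGARCH_LSX_H

#include "absval.h"

namespace ncnn {

class AbsVal_loongarch_lsx : virtual public AbsVal
{
public:
    AbsVal_loongarch_lsx();

    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
};

} // namespace ncnn

#endif // LAYER_ABSVAL_LOONGARCH_LSX_H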
6 changes: 6 additions & 0 deletions src/CMakeLists.txt
@@ -460,6 +460,12 @@ if(NCNN_TARGET_ARCH STREQUAL "mips")
endif()
endif()

if(NCNN_TARGET_ARCH STREQUAL "loongarch")
if(NOT NCNN_RUNTIME_CPU AND NCNN_LSX)
target_compile_options(ncnn PRIVATE -mlsx)
endif()
endif()

if(NCNN_TARGET_ARCH STREQUAL "riscv" AND NOT C906)
if(NOT NCNN_RUNTIME_CPU AND NCNN_RVV)
if(NCNN_COMPILER_SUPPORT_RVV_ZFH)
34 changes: 33 additions & 1 deletion src/cpu.cpp
@@ -173,7 +173,7 @@ static unsigned int get_elf_hwcap_from_proc_self_auxv(unsigned int type)
return 0;
}

#if __aarch64__ || __mips64 || __riscv_xlen == 64
#if __aarch64__ || __mips64 || __riscv_xlen == 64 || __loongarch64
struct
{
uint64_t tag;
@@ -250,6 +250,12 @@ static unsigned int g_hwcaps2 = get_elf_hwcap(AT_HWCAP2);
#define HWCAP_LOONGSON_MMI (1 << 11)
#endif

#if __loongarch64
// from arch/loongarch/include/uapi/asm/hwcap.h
#define HWCAP_LOONGARCH_LSX (1 << 4)
#define HWCAP_LOONGARCH_LASX (1 << 5)
#endif

#if __riscv
// from arch/riscv/include/uapi/asm/hwcap.h
#define COMPAT_HWCAP_ISA_F (1 << ('F' - 'A'))
@@ -1084,6 +1090,32 @@ int cpu_support_mips_msa()
#endif
}

int cpu_support_loongarch_lsx()
{
#if defined __ANDROID__ || defined __linux__
#if __loongarch64
return g_hwcaps & HWCAP_LOONGARCH_LSX;
#else
return 0;
#endif
#else
return 0;
#endif
}

int cpu_support_loongarch_lasx()
{
#if defined __ANDROID__ || defined __linux__
#if __loongarch64
return g_hwcaps & HWCAP_LOONGARCH_LASX;
#else
return 0;
#endif
#else
return 0;
#endif
}

int cpu_support_loongson_mmi()
{
#if defined __ANDROID__ || defined __linux__
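Both helpers just test bits of g_hwcaps, which ncnn fills by parsing /proc/self/auxv at startup (get_elf_hwcap_from_proc_self_auxv above). A self-contained sketch of the same probe, assuming a Linux target where glibc's getauxval is available as a shortcut:

#include <cstdio>
#include <sys/auxv.h> // getauxval, AT_HWCAP

// bit positions from arch/loongarch/include/uapi/asm/hwcap.h
#define HWCAP_LOONGARCH_LSX  (1 << 4)
#define HWCAP_LOONGARCH_LASX (1 << 5)

int main()
{
    unsigned long hwcap = getauxval(AT_HWCAP); // kernel-reported ISA bits
    std::printf("lsx:  %s\n", (hwcap & HWCAP_LOONGARCH_LSX) ? "yes" : "no");
    std::printf("lasx: %s\n", (hwcap & HWCAP_LOONGARCH_LASX) ? "yes" : "no");
    return 0;
}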
5 changes: 5 additions & 0 deletions src/cpu.h
@@ -100,6 +100,11 @@ NCNN_EXPORT int cpu_support_x86_avx512_bf16();
// avx512_fp16 = x86 avx512 fp16
NCNN_EXPORT int cpu_support_x86_avx512_fp16();

// lsx = loongarch lsx
NCNN_EXPORT int cpu_support_loongarch_lsx();
// lasx = loongarch lasx
NCNN_EXPORT int cpu_support_loongarch_lasx();

// msa = mips msa
NCNN_EXPORT int cpu_support_mips_msa();
// mmi = loongson mmi
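These probes are safe to call on any build: on non-LoongArch targets they compile to return 0. A usage sketch, assuming ncnn's include directory is on the include path:

#include "cpu.h"
#include <cstdio>

int main()
{
    // Nonzero only when the kernel reports the corresponding hwcap bit.
    if (ncnn::cpu_support_loongarch_lsx())
        std::printf("LSX kernels available\n");
    if (ncnn::cpu_support_loongarch_lasx())
        std::printf("LASX reported by the CPU\n");
    return 0;
}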
7 changes: 7 additions & 0 deletions src/layer.cpp
@@ -253,6 +253,13 @@ Layer* create_layer(int index)
}
else
#endif // NCNN_RUNTIME_CPU && NCNN_AVX
#if NCNN_RUNTIME_CPU && NCNN_LSX
if (ncnn::cpu_support_loongarch_lsx())
{
layer_creator = layer_registry_lsx[index].creator;
}
else
#endif // NCNN_RUNTIME_CPU && NCNN_LSX
#if NCNN_RUNTIME_CPU && NCNN_MSA
if (ncnn::cpu_support_mips_msa())
{
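layer_registry_lsx is a build-time table with one creator per layer class, pointing at the _lsx objects produced via ncnn_generate_lsx_source.cmake; the cascade above picks the first table whose runtime ISA probe passes. A simplified, self-contained sketch of that dispatch shape (all names here are illustrative, not ncnn's internals):

#include <cstdio>

struct Layer { const char* name; };
typedef Layer* (*creator_func)();

static Layer* create_absval_generic() { static Layer l = {"AbsVal"};     return &l; }
static Layer* create_absval_lsx()     { static Layer l = {"AbsVal_lsx"}; return &l; }

// one creator table per ISA variant, indexed by layer type
static creator_func registry_default[] = {create_absval_generic};
static creator_func registry_lsx[]     = {create_absval_lsx};

static int cpu_has_lsx() { return 0; } // stand-in for cpu_support_loongarch_lsx()

static Layer* make_layer(int index)
{
    // prefer the LSX build when the CPU supports it, else fall back
    creator_func c = cpu_has_lsx() ? registry_lsx[index] : registry_default[index];
    return c();
}

int main()
{
    std::printf("%s\n", make_layer(0)->name); // prints "AbsVal" here
    return 0;
}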
67 changes: 67 additions & 0 deletions src/layer/loongarch/absval_loongarch.cpp
@@ -0,0 +1,67 @@
// yala is pleased to support the open source community by making ncnn available.
//
//
// Copyright (C) 2022 yala <zhaojunchao@loongson.cn>;<junchao82@qq.com>. All rights reserved.
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#include "absval_loongarch.h"

#if __loongarch_sx
#include <lsxintrin.h>
#endif // __loongarch_sx

namespace ncnn {

AbsVal_loongarch::AbsVal_loongarch()
{
#if __loongarch_sx
support_packing = true;
#endif
}

int AbsVal_loongarch::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
int w = bottom_top_blob.w;
int h = bottom_top_blob.h;
int d = bottom_top_blob.d;
int channels = bottom_top_blob.c;
int elempack = bottom_top_blob.elempack;
int size = w * h * d * elempack;

#pragma omp parallel for num_threads(opt.num_threads)
for (int q = 0; q < channels; q++)
{
float* ptr = bottom_top_blob.channel(q);

int i = 0;
#if __loongarch_sx
for (; i + 3 < size; i += 4)
{
__builtin_prefetch(ptr + 16);
__m128i _p = __lsx_vld(ptr, 0);
__m128i _outp = __lsx_vbitclri_w(_p, 31);
__lsx_vst(_outp, ptr, 0);

ptr += 4;
}
#endif // __loongarch_sx
for (; i < size; i++)
{
*ptr = *ptr > 0 ? *ptr : -*ptr;

ptr++;
}
}

return 0;
}

} // namespace ncnn
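The kernel works because __lsx_vbitclri_w(_p, 31) clears bit 31 of each 32-bit lane, and an IEEE-754 float stores its sign in exactly that bit, so the operation is fabsf on four floats at once. The scalar equivalent, as a sketch:

#include <cstdint>
#include <cstdio>
#include <cstring>

// scalar twin of __lsx_vbitclri_w(_p, 31)
static float abs_by_bitclear(float x)
{
    uint32_t u;
    std::memcpy(&u, &x, sizeof u); // type-pun without UB
    u &= 0x7fffffffu;              // clear bit 31, the IEEE-754 sign bit
    std::memcpy(&x, &u, sizeof x);
    return x;
}

int main()
{
    std::printf("%g %g\n", abs_by_bitclear(-3.5f), abs_by_bitclear(2.0f)); // 3.5 2
    return 0;
}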
32 changes: 32 additions & 0 deletions src/layer/loongarch/absval_loongarch.h
@@ -0,0 +1,32 @@
// yala is pleased to support the open source community by making ncnn available.
//
//
// Copyright (C) 2022 yala <zhaojunchao@loongson.cn>;<junchao82@qq.com>. All rights reserved.
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#ifndef LAYER_ABSVAL_LOONGARCH_H
#define LAYER_ABSVAL_LOONGARCH_H

#include "absval.h"

namespace ncnn {

class AbsVal_loongarch : virtual public AbsVal
{
public:
AbsVal_loongarch();

virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
};

} // namespace ncnn

#endif // LAYER_ABSVAL_LOONGARCH_H
145 changes: 145 additions & 0 deletions src/layer/loongarch/batchnorm_loongarch.cpp
@@ -0,0 +1,145 @@
// yala is pleased to support the open source community by making ncnn available.
//
//
// Copyright (C) 2022 yala <zhaojunchao@loongson.cn>;<junchao82@qq.com>. All rights reserved.
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#include "batchnorm_loongarch.h"

#if __loongarch_sx
#include <lsxintrin.h>
#endif // __loongarch_sx

#include "loongarch_usability.h"

namespace ncnn {

BatchNorm_loongarch::BatchNorm_loongarch()
{
#if __loongarch_sx
support_packing = true;
#endif // __loongarch_sx
}

int BatchNorm_loongarch::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
int dims = bottom_top_blob.dims;
int elempack = bottom_top_blob.elempack;

if (dims == 1)
{
int w = bottom_top_blob.w * elempack;

#if __loongarch_sx
int nn_w = w / 4;
int remain_w_start = nn_w * 4;
#else
int remain_w_start = 0;
#endif // __loongarch_sx

float* ptr = bottom_top_blob;

#if __loongarch_sx
#pragma omp parallel for num_threads(opt.num_threads)
for (int i = 0; i < nn_w; i++)
{
float* ptr0 = ptr + i * 4;

__m128 _p = (__m128)__lsx_vld(ptr0, 0);
__m128 _a = (__m128)__lsx_vld((const float*)a_data + i * 4, 0);
__m128 _b = (__m128)__lsx_vld((const float*)b_data + i * 4, 0);
_p = __lsx_vfmadd_s(_b, _p, _a);
__lsx_vst(_p, ptr0, 0);
}
#endif // __loongarch_sx

#pragma omp parallel for num_threads(opt.num_threads)
for (int i = remain_w_start; i < w; i++)
{
ptr[i] = b_data[i] * ptr[i] + a_data[i];
}
}

if (dims == 2)
{
int w = bottom_top_blob.w * elempack;
int h = bottom_top_blob.h;

#pragma omp parallel for num_threads(opt.num_threads)
for (int i = 0; i < h; i++)
{
float* ptr = bottom_top_blob.row(i);
float a = a_data[i];
float b = b_data[i];

int j = 0;
#if __loongarch_sx
__m128 _a = elempack == 4 ? (__m128)__lsx_vld((const float*)a_data + i * 4, 0) : (__m128)__lsx_vreplfr2vr_s(a);
__m128 _b = elempack == 4 ? (__m128)__lsx_vld((const float*)b_data + i * 4, 0) : (__m128)__lsx_vreplfr2vr_s(b);
for (; j + 3 < w; j += 4)
{
__builtin_prefetch(ptr + 16);
__m128 _p = (__m128)__lsx_vld(ptr, 0);
_p = __lsx_vfmadd_s(_b, _p, _a);
__lsx_vst(_p, ptr, 0);

ptr += 4;
}
#endif // __loongarch_sx
for (; j < w; j++)
{
*ptr = b * *ptr + a;
ptr++;
}
}
}

if (dims == 3 || dims == 4)
{
int w = bottom_top_blob.w;
int h = bottom_top_blob.h;
int d = bottom_top_blob.d;
int c = bottom_top_blob.c;
int size = w * h * d * elempack;

#pragma omp parallel for num_threads(opt.num_threads)
for (int q = 0; q < c; q++)
{
float* ptr = bottom_top_blob.channel(q);
float a = a_data[q];
float b = b_data[q];

int i = 0;
#if __loongarch_sx
__m128 _a = elempack == 4 ? (__m128)__lsx_vld((const float*)a_data + q * 4, 0) : (__m128)__lsx_vreplfr2vr_s(a);
__m128 _b = elempack == 4 ? (__m128)__lsx_vld((const float*)b_data + q * 4, 0) : (__m128)__lsx_vreplfr2vr_s(b);
for (; i + 3 < size; i += 4)
{
__builtin_prefetch(ptr + 16);
__m128 _p = (__m128)__lsx_vld(ptr, 0);
_p = __lsx_vfmadd_s(_b, _p, _a);
__lsx_vst(_p, ptr, 0);

ptr += 4;
}
#endif // __loongarch_sx
for (; i < size; i++)
{
*ptr = b * *ptr + a;
ptr++;
}
}
}

return 0;
}

} // namespace ncnn
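All three dimensionality paths apply the same per-channel affine map y = b*x + a, with __lsx_vfmadd_s fusing the multiply and add four lanes at a time. In ncnn the coefficients come from the base BatchNorm layer at model-load time (roughly b = slope/sqrt(var + eps) and a = bias - b*mean; that derivation lives outside this diff, so treat it as an assumption). A scalar reference for one channel of the dims == 3 path:

#include <cstdio>

// scalar twin of the vectorized channel loop above: x -> b*x + a
static void batchnorm_channel(float* ptr, int size, float a, float b)
{
    for (int i = 0; i < size; i++)
        ptr[i] = b * ptr[i] + a; // same math as the __lsx_vfmadd_s lanes
}

int main()
{
    float chan[4] = {1.f, 2.f, 3.f, 4.f};
    batchnorm_channel(chan, 4, 0.5f, 2.0f);
    std::printf("%g %g %g %g\n", chan[0], chan[1], chan[2], chan[3]); // 2.5 4.5 6.5 8.5
    return 0;
}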