diff --git a/CMakeLists.txt b/CMakeLists.txt
index d7165e8048f..5dc093d8943 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -416,21 +416,19 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(riscv)")
     if(CMAKE_SIZEOF_VOID_P EQUAL 8)
         set(CMAKE_REQUIRED_FLAGS "-march=rv64gcv")
-        check_cxx_source_compiles("#include <riscv_vector.h>\nint main() { vfloat32m1_t _s, _w; float _v; size_t vl; _s = __riscv_vfmacc_vf_f32m1(_s, _v, _w, vl); return 0; }" NCNN_COMPILER_SUPPORT_RISCV_V)
+        check_cxx_source_compiles("#include <riscv_vector.h>\nint main() { vfloat32m8_t _s, _w; float _v; size_t vl; _s = __riscv_vfmacc_vf_f32m8(_s, _v, _w, vl); return 0; }" NCNN_COMPILER_SUPPORT_RISCV_V)

-        set(CMAKE_REQUIRED_FLAGS "-march=rv64gcv_zvfh")
-        check_cxx_source_compiles("#include <riscv_vector.h>\nint main() { vfloat16m1_t _s, _w; __fp16 _v; size_t vl; _s = __riscv_vfmacc_vf_f16m1(_s, _v, _w, vl); return 0; }" NCNN_COMPILER_SUPPORT_RISCV_ZVFH)
+        set(CMAKE_REQUIRED_FLAGS "-march=rv64gcv_zfh_zvfh -D__fp16=_Float16")
+        check_cxx_source_compiles("#include <riscv_vector.h>\nint main() { vfloat16m8_t _s, _w; __fp16 _v; size_t vl; _s = __riscv_vfmacc_vf_f16m8(_s, _v, _w, vl); return 0; }" NCNN_COMPILER_SUPPORT_RISCV_ZVFH)

-        # if(NOT NCNN_COMPILER_SUPPORT_RVV_ZFH)
-        #     set(CMAKE_REQUIRED_FLAGS "-march=rv64gcv_zfh_zvfh0p1 -menable-experimental-extensions -D__fp16=_Float16")
-        #     check_cxx_source_compiles("#include <riscv_vector.h>\nint main() { vfloat16m1_t _s, _w; __fp16 _v; size_t vl; _s = __riscv_vfmacc_vf_f16m1(_s, _v, _w, vl); return 0; }" NCNN_COMPILER_SUPPORT_RVV_ZVFH)
-        # endif()
+        set(CMAKE_REQUIRED_FLAGS "-march=rv64gc_zfh_xtheadvector -D__fp16=_Float16")
+        check_cxx_source_compiles("#include <riscv_vector.h>\nint main() { vfloat16m8_t _s, _w; __fp16 _v; size_t vl; _s = __riscv_vfmacc_vf_f16m8(_s, _v, _w, vl); return 0; }" NCNN_COMPILER_SUPPORT_RISCV_XTHEADVECTOR)

         unset(CMAKE_REQUIRED_FLAGS)

-        if(NCNN_COMPILER_SUPPORT_RISCV_V)
+        if(NCNN_COMPILER_SUPPORT_RISCV_V OR NCNN_COMPILER_SUPPORT_RISCV_XTHEADVECTOR)
             option(NCNN_RVV "optimize risc-v platform with v extension" ON)
-            if(NCNN_COMPILER_SUPPORT_RISCV_ZVFH)
+            if(NCNN_COMPILER_SUPPORT_RISCV_ZVFH OR NCNN_COMPILER_SUPPORT_RISCV_XTHEADVECTOR)
                 if(NCNN_RVV)
                     option(NCNN_ZVFH "optimize risc-v platform with zvfh extension" ON)
                 endif()
@@ -456,8 +454,15 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(riscv)")
         #     add_definitions(-D__rvv_tuple)
         # endif()
         else()
-            message(WARNING "The compiler does not support risc-v v extension. NCNN_RVV will be OFF.")
+            message(WARNING "The compiler does not support risc-v v or xtheadvector extension. NCNN_RVV will be OFF.")
         endif()
+
+        if(NCNN_COMPILER_SUPPORT_RISCV_XTHEADVECTOR)
+            option(NCNN_XTHEADVECTOR "optimize risc-v platform with xtheadvector extension" ON)
+        else()
+            message(WARNING "The compiler does not support risc-v xtheadvector extension. NCNN_XTHEADVECTOR will be OFF.")
+        endif()
+
     endif()
 elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(powerpc|ppc)")
     set(NCNN_TARGET_ARCH powerpc)
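For reference, the escaped probe string in the new `NCNN_COMPILER_SUPPORT_RISCV_XTHEADVECTOR` check above unfolds to the translation unit below. This is only a sketch of what `check_cxx_source_compiles` hands to the compiler under `-march=rv64gc_zfh_xtheadvector -D__fp16=_Float16`; the file name is illustrative.

```cpp
// probe_xtheadvector.cpp -- fails to compile on toolchains without
// xtheadvector + zfh support, which turns the CMake option OFF.
#include <riscv_vector.h>

int main()
{
    vfloat16m8_t _s, _w;
    __fp16 _v;
    size_t vl;
    // fp16 fused multiply-accumulate at LMUL=8: exercises both the vector
    // intrinsics and half-precision scalar support in one expression
    _s = __riscv_vfmacc_vf_f16m8(_s, _v, _w, vl);
    return 0;
}
```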
diff --git a/cmake/ncnn_add_layer.cmake b/cmake/ncnn_add_layer.cmake
index 8afe15546bc..462bf45afd0 100644
--- a/cmake/ncnn_add_layer.cmake
+++ b/cmake/ncnn_add_layer.cmake
@@ -363,17 +363,14 @@ macro(ncnn_add_layer class)
     endif()

     if(NCNN_TARGET_ARCH STREQUAL "riscv" AND CMAKE_SIZEOF_VOID_P EQUAL 8)
-        # if(NCNN_RUNTIME_CPU AND NCNN_RVV)
-        #     ncnn_add_arch_opt_layer(${class} rvv "-march=rv64gcv")
-
-        #     if(NCNN_COMPILER_SUPPORT_RVV_ZVFH)
-        #         ncnn_add_arch_opt_layer(${class} zvfh "-march=rv64gcv_zvfh")
-        #     elseif(NCNN_COMPILER_SUPPORT_RVV_ZVFH)
-        #         ncnn_add_arch_opt_layer(${class} rvv "-march=rv64gcv_zfh_zvfh0p1 -menable-experimental-extensions -D__fp16=_Float16")
-        #     endif()
-        # endif()
+        if(NCNN_RUNTIME_CPU AND NCNN_RVV)
+            ncnn_add_arch_opt_layer(${class} rvv "-march=rv64gcv")
+        endif()
+        if(NCNN_RUNTIME_CPU AND NCNN_XTHEADVECTOR)
+            ncnn_add_arch_opt_layer(${class} xtheadvector "-march=rv64gc_zfh_xtheadvector -D__fp16=_Float16")
+        endif()
         if(NCNN_ZVFH)
-            ncnn_add_arch_opt_source(${class} zvfh "-march=rv64gcv_zvfh -D__fp16=_Float16")
+            ncnn_add_arch_opt_source(${class} zvfh "-march=rv64gcv_zfh_zvfh -D__fp16=_Float16")
         endif()
     endif()

diff --git a/cmake/ncnn_generate_xtheadvector_source.cmake b/cmake/ncnn_generate_xtheadvector_source.cmake
new file mode 100644
index 00000000000..e9c7b00c818
--- /dev/null
+++ b/cmake/ncnn_generate_xtheadvector_source.cmake
@@ -0,0 +1,14 @@
+
+# must define SRC DST CLASS
+
+file(READ ${SRC} source_data)
+
+# replace
+string(TOUPPER ${CLASS} CLASS_UPPER)
+string(TOLOWER ${CLASS} CLASS_LOWER)
+
+string(REGEX REPLACE "LAYER_${CLASS_UPPER}_RISCV_H" "LAYER_${CLASS_UPPER}_RISCV_XTHEADVECTOR_H" source_data "${source_data}")
+string(REGEX REPLACE "${CLASS}_riscv" "${CLASS}_riscv_xtheadvector" source_data "${source_data}")
+string(REGEX REPLACE "#include \"${CLASS_LOWER}_riscv.h\"" "#include \"${CLASS_LOWER}_riscv_xtheadvector.h\"" source_data "${source_data}")
+
+file(WRITE ${DST} "${source_data}")
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index aec5291e48b..b6f28c55b50 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -631,24 +631,16 @@ if(NCNN_TARGET_ARCH STREQUAL "loongarch")
 endif()

 if(NCNN_TARGET_ARCH STREQUAL "riscv" AND CMAKE_SIZEOF_VOID_P EQUAL 8 AND NOT C906)
-    if(NCNN_RVV)
+    if(NOT NCNN_RUNTIME_CPU AND NCNN_RVV)
         set(RISCV_MARCH_FLAG "-march=rv64gcv")
-    else()
-        set(RISCV_MARCH_FLAG "-march=rv64gc")
-    endif()
-
-    if(NOT NCNN_RUNTIME_CPU AND NCNN_ZVFH)
-        set(RISCV_MARCH_FLAG "${RISCV_MARCH_FLAG}_zvfh")
+        if(NCNN_ZVFH)
+            set(RISCV_MARCH_FLAG "${RISCV_MARCH_FLAG}_zfh_zvfh")
+            target_compile_options(ncnn PRIVATE -D__fp16=_Float16)
+        endif()
+    elseif(NOT NCNN_RUNTIME_CPU AND NCNN_ZVFH)
+        set(RISCV_MARCH_FLAG "-march=rv64gc_zfh_xtheadvector")
         target_compile_options(ncnn PRIVATE -D__fp16=_Float16)
     endif()
-
-    # if(NCNN_COMPILER_SUPPORT_RVV_ZFH)
-    #     target_compile_options(ncnn PRIVATE -march=rv64gcv_zfh)
-    # elseif(NCNN_COMPILER_SUPPORT_RVV_ZVFH)
-    #     target_compile_options(ncnn PRIVATE -march=rv64gcv_zfh_zvfh0p1 -menable-experimental-extensions -D__fp16=_Float16)
-    # elseif(NCNN_COMPILER_SUPPORT_RVV)
-    #     target_compile_options(ncnn PRIVATE -march=rv64gcv)
-    # endif()
     target_compile_options(ncnn PRIVATE ${RISCV_MARCH_FLAG})
 endif()
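To make the generator script above concrete: for `CLASS=AbsVal` the three `REGEX REPLACE` rules rewrite the include guard, the class suffix, and the header include, so the generated header would look roughly like the sketch below. This is hypothetical output for illustration; the real result is the full copied source with only these identifiers rewritten.

```cpp
// absval_riscv_xtheadvector.h (sketch of generated output): identical to
// absval_riscv.h except for the rewritten guard and class name, so the same
// layer can be compiled a second time with -march=rv64gc_zfh_xtheadvector
// and registered as a separate runtime variant.
#ifndef LAYER_ABSVAL_RISCV_XTHEADVECTOR_H
#define LAYER_ABSVAL_RISCV_XTHEADVECTOR_H

#include "absval.h"

namespace ncnn {

class AbsVal_riscv_xtheadvector : public AbsVal
{
public:
    AbsVal_riscv_xtheadvector();
    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
};

} // namespace ncnn

#endif // LAYER_ABSVAL_RISCV_XTHEADVECTOR_H
```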
diff --git a/src/cpu.cpp b/src/cpu.cpp
index e18671a21d5..3b52d95b8e9 100644
--- a/src/cpu.cpp
+++ b/src/cpu.cpp
@@ -2580,6 +2580,22 @@ int cpu_support_riscv_zvfh()
 #endif
 }

+int cpu_support_riscv_xtheadvector()
+{
+    try_initialize_global_cpu_info();
+#if defined __ANDROID__ || defined __linux__
+#if __riscv
+    // v + f does not imply zfh, but how to discover zvfh properly ?
+    // upstream issue https://github.com/riscv/riscv-isa-manual/issues/414
+    return g_hwcaps & COMPAT_HWCAP_ISA_V && g_hwcaps & COMPAT_HWCAP_ISA_F;
+#else
+    return 0;
+#endif
+#else
+    return 0;
+#endif
+}
+
 int cpu_riscv_vlenb()
 {
     try_initialize_global_cpu_info();
diff --git a/src/cpu.h b/src/cpu.h
index 9530d4f7e7e..4c8e42c5712 100644
--- a/src/cpu.h
+++ b/src/cpu.h
@@ -118,6 +118,8 @@ NCNN_EXPORT int cpu_support_riscv_v();
 NCNN_EXPORT int cpu_support_riscv_zfh();
 // zvfh = riscv vector half-precision float
 NCNN_EXPORT int cpu_support_riscv_zvfh();
+// xtheadvector = riscv xtheadvector
+NCNN_EXPORT int cpu_support_riscv_xtheadvector();
 // vlenb = riscv vector length in bytes
 NCNN_EXPORT int cpu_riscv_vlenb();

diff --git a/src/layer.cpp b/src/layer.cpp
index 1f76d546060..d82595105ce 100644
--- a/src/layer.cpp
+++ b/src/layer.cpp
@@ -546,6 +546,20 @@ Layer* create_layer_cpu(int index)
     }
     else
#endif // NCNN_RUNTIME_CPU && NCNN_MSA
+#if NCNN_RUNTIME_CPU && NCNN_RVV
+    if (ncnn::cpu_support_riscv_v())
+    {
+        layer_creator = layer_registry_rvv[index].creator;
+    }
+    else
+#endif // NCNN_RUNTIME_CPU && NCNN_RVV
+#if NCNN_RUNTIME_CPU && NCNN_XTHEADVECTOR
+    if (ncnn::cpu_support_riscv_xtheadvector())
+    {
+        layer_creator = layer_registry_xtheadvector[index].creator;
+    }
+    else
+#endif // NCNN_RUNTIME_CPU && NCNN_XTHEADVECTOR
     {
         layer_creator = layer_registry_arch[index].creator;
     }
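A note on the detection heuristic: as the comment in `cpu_support_riscv_xtheadvector()` says, the kernel's hwcaps cannot express half-precision vector support directly, so the patch keys off the single-letter V and F bits. Below is a standalone sketch of the same logic using the Linux `AT_HWCAP` encoding (one bit per base ISA letter); the function name is ours, not part of the patch.

```cpp
// Heuristic capability check equivalent to the patch's hwcaps test.
// Caveat carried over from the patch: V + F alone does not prove zvfh or
// xtheadvector support; it is a best-effort runtime signal.
#include <sys/auxv.h>

#define COMPAT_HWCAP_ISA_F (1 << ('F' - 'A'))
#define COMPAT_HWCAP_ISA_V (1 << ('V' - 'A'))

static int has_riscv_vector_and_float()
{
    unsigned long hwcap = getauxval(AT_HWCAP);
    return (hwcap & COMPAT_HWCAP_ISA_V) && (hwcap & COMPAT_HWCAP_ISA_F);
}
```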
diff --git a/src/layer/riscv/absval_fp16.h b/src/layer/riscv/absval_fp16.h
new file mode 100755
index 00000000000..50cc5d3de17
--- /dev/null
+++ b/src/layer/riscv/absval_fp16.h
@@ -0,0 +1,67 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#if NCNN_RUNTIME_CPU && NCNN_ZVFH && __riscv_vector && !__riscv_zvfh
+void absval_fp16_zvfh(Mat& bottom_top_blob, const Option& opt);
+#endif
+
+#if __riscv_zvfh
+static inline vfloat16m8_t __riscv_vfabs_v_f16m8_absval(vfloat16m8_t op1, size_t vl)
+{
+    return __riscv_vfsgnjx_vv_f16m8(op1, op1, vl);
+}
+#endif // __riscv_zvfh
+
+static void absval_fp16(Mat& bottom_top_blob, const Option& opt)
+{
+#if NCNN_RUNTIME_CPU && NCNN_ZVFH && __riscv_vector && !__riscv_xtheadvector && !__riscv_zvfh
+    if (ncnn::cpu_support_riscv_zvfh())
+    {
+        absval_fp16_zvfh(bottom_top_blob, opt);
+        return;
+    }
+#endif
+
+#if __riscv_zvfh
+    const int w = bottom_top_blob.w;
+    const int h = bottom_top_blob.h;
+    const int d = bottom_top_blob.d;
+    const int channels = bottom_top_blob.c;
+    const int elempack = bottom_top_blob.elempack;
+    const int size = w * h * d * elempack;
+
+    #pragma omp parallel for num_threads(opt.num_threads)
+    for (int q = 0; q < channels; q++)
+    {
+        __fp16* ptr = bottom_top_blob.channel(q);
+
+        int n = size;
+        while (n > 0)
+        {
+            size_t vl = __riscv_vsetvl_e16m8(n);
+
+            vfloat16m8_t _p = __riscv_vle16_v_f16m8(ptr, vl);
+            _p = __riscv_vfabs_v_f16m8_absval(_p, vl);
+            __riscv_vse16_v_f16m8(ptr, _p, vl);
+
+            ptr += vl;
+            n -= vl;
+        }
+    }
+#else
+    (void)bottom_top_blob;
+    (void)opt;
+#endif // __riscv_zvfh
+}
diff --git a/src/layer/riscv/absval_riscv.cpp b/src/layer/riscv/absval_riscv.cpp
index b341f48a5d8..8c84af7284c 100644
--- a/src/layer/riscv/absval_riscv.cpp
+++ b/src/layer/riscv/absval_riscv.cpp
@@ -13,6 +13,7 @@
 // specific language governing permissions and limitations under the License.

 #include "absval_riscv.h"
+#include "cpu.h"

 #if __riscv_vector
 #include <riscv_vector.h>
@@ -20,12 +21,14 @@

 namespace ncnn {

+#include "absval_fp16.h"
+
 AbsVal_riscv::AbsVal_riscv()
 {
 #if __riscv_vector
     support_packing = true;
-#if NCNN_ZVFH
-    support_fp16_storage = cpu_support_riscv_zvfh();
+#if NCNN_ZVFH || NCNN_XTHEADVECTOR
+    support_fp16_storage = cpu_support_riscv_zvfh() || cpu_support_riscv_xtheadvector();
 #endif
 #endif // __riscv_vector
 }
@@ -39,7 +42,7 @@ static inline vfloat32m8_t __riscv_vfabs_v_f32m8_absval(vfloat32m8_t op1, size_t vl)

 int AbsVal_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
 {
-#if NCNN_ZVFH
+#if __riscv_vector
     int elembits = bottom_top_blob.elembits();

     if (support_fp16_storage && opt.use_fp16_storage && elembits == 16)
@@ -86,4 +89,12 @@ int AbsVal_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
     return 0;
 }

+#if __riscv_vector
+int AbsVal_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const
+{
+    absval_fp16(bottom_top_blob, opt);
+    return 0;
+}
+#endif // __riscv_vector
+
 } // namespace ncnn
diff --git a/src/layer/riscv/absval_riscv.h b/src/layer/riscv/absval_riscv.h
index 0acf42ce634..b08a8a7bb31 100644
--- a/src/layer/riscv/absval_riscv.h
+++ b/src/layer/riscv/absval_riscv.h
@@ -27,7 +27,7 @@ class AbsVal_riscv : public AbsVal
     virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;

 protected:
-#if NCNN_ZVFH
+#if __riscv_vector
     int forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const;
 #endif
 };
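The shared `absval_fp16.h` kernel above is compiled into two objects: once into the plain riscv object, where `__riscv_zvfh` is off and `absval_fp16()` forwards to the zvfh-built copy after a runtime `cpu_support_riscv_zvfh()` check, and once into the `_zvfh` object, where the vector loop is actually emitted. For readers unfamiliar with RVV strip-mining, the loop computes the following scalar reference; a sketch, assuming `__fp16` is usable as a scalar type.

```cpp
// Scalar equivalent of the vsetvl/vle16/vfsgnjx/vse16 loop in absval_fp16():
// vfsgnjx(x, x) XORs x's sign bit with itself, clearing it, i.e. |x|.
static void absval_fp16_scalar(__fp16* ptr, int size)
{
    for (int i = 0; i < size; i++)
    {
        ptr[i] = ptr[i] < (__fp16)0.f ? (__fp16)(-ptr[i]) : ptr[i];
    }
}
```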
-#include "absval_riscv.h" - -#if __riscv_vector -#include -#endif +#include "cpu.h" +#include "mat.h" namespace ncnn { -#if __riscv_zvfh -static inline vfloat16m8_t __riscv_vfabs_v_f16m8_absval(vfloat16m8_t op1, size_t vl) -{ - return __riscv_vfsgnjx_vv_f16m8(op1, op1, vl); -} +#include "absval_fp16.h" -int AbsVal_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const +void absval_fp16_zvfh(Mat& bottom_top_blob, const Option& opt) { - int w = bottom_top_blob.w; - int h = bottom_top_blob.h; - int d = bottom_top_blob.d; - int channels = bottom_top_blob.c; - int elempack = bottom_top_blob.elempack; - int size = w * h * d * elempack; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - __fp16* ptr = bottom_top_blob.channel(q); - - int n = size; - while (n > 0) - { - size_t vl = __riscv_vsetvl_e16m8(n); - - vfloat16m8_t _p = __riscv_vle16_v_f16m8(ptr, vl); - _p = __riscv_vfabs_v_f16m8_absval(_p, vl); - __riscv_vse16_v_f16m8(ptr, _p, vl); - - ptr += vl; - n -= vl; - } - } - - return 0; + absval_fp16(bottom_top_blob, opt); } -#endif // __riscv_zvfh } // namespace ncnn diff --git a/src/layer/riscv/packing_riscv.cpp b/src/layer/riscv/packing_riscv.cpp index 788549bb5bb..15b206b063c 100644 --- a/src/layer/riscv/packing_riscv.cpp +++ b/src/layer/riscv/packing_riscv.cpp @@ -20,6 +20,8 @@ #include "riscv_usability.h" +#include "cpu.h" + namespace ncnn { Packing_riscv::Packing_riscv() diff --git a/src/layer/riscv/riscv_usability.h b/src/layer/riscv/riscv_usability.h index fd1806d8185..6966a7368c7 100644 --- a/src/layer/riscv/riscv_usability.h +++ b/src/layer/riscv/riscv_usability.h @@ -125,54 +125,54 @@ static inline vfloat16m8_t __riscv_vle16_v_f16m8_f16m1(const __fp16* ptr) #endif // __riscv_zvfh #endif // __riscv_vector -#if __riscv_vector && __rvv_tuple +#if 0//__riscv_vector && __rvv_tuple // f32m1, vsseg.v -static inline void __riscv_vsseg8e32_v_f32m1(float32_t* base, vfloat32m1_t v0, vfloat32m1_t v1, vfloat32m1_t v2, vfloat32m1_t v3, vfloat32m1_t v4, vfloat32m1_t v5, vfloat32m1_t v6, vfloat32m1_t v7, size_t vl) +static inline void __riscv_vsseg8e32_v_f32m1(float* base, vfloat32m1_t v0, vfloat32m1_t v1, vfloat32m1_t v2, vfloat32m1_t v3, vfloat32m1_t v4, vfloat32m1_t v5, vfloat32m1_t v6, vfloat32m1_t v7, size_t vl) { vfloat32m1x8_t _tmp = __riscv_vcreate_f32m1x8(v0, v1, v2, v3, v4, v5, v6, v7); __riscv_vsseg8e32_v_f32m1x8(base, _tmp, vl); } -static inline void __riscv_vsseg4e32_v_f32m1(float32_t* base, vfloat32m1_t v0, vfloat32m1_t v1, vfloat32m1_t v2, vfloat32m1_t v3, size_t vl) +static inline void __riscv_vsseg4e32_v_f32m1(float* base, vfloat32m1_t v0, vfloat32m1_t v1, vfloat32m1_t v2, vfloat32m1_t v3, size_t vl) { vfloat32m1x4_t _tmp = __riscv_vcreate_f32m1x4(v0, v1, v2, v3); __riscv_vsseg4e32_v_f32m1x4(base, _tmp, vl); } -static inline void __riscv_vsseg2e32_v_f32m1(float32_t* base, vfloat32m1_t v0, vfloat32m1_t v1, size_t vl) +static inline void __riscv_vsseg2e32_v_f32m1(float* base, vfloat32m1_t v0, vfloat32m1_t v1, size_t vl) { vfloat32m1x2_t _tmp = __riscv_vcreate_f32m1x2(v0, v1); __riscv_vsseg2e32_v_f32m1x2(base, _tmp, vl); } // f32m1, vssseg.v, 8/4/2 -static inline void __riscv_vssseg8e32_v_f32m1(float32_t* base, ptrdiff_t bstride, vfloat32m1_t v0, vfloat32m1_t v1, vfloat32m1_t v2, vfloat32m1_t v3, vfloat32m1_t v4, vfloat32m1_t v5, vfloat32m1_t v6, vfloat32m1_t v7, size_t vl) +static inline void __riscv_vssseg8e32_v_f32m1(float* base, ptrdiff_t bstride, vfloat32m1_t v0, vfloat32m1_t v1, vfloat32m1_t v2, vfloat32m1_t v3, 
diff --git a/src/layer/riscv/riscv_usability.h b/src/layer/riscv/riscv_usability.h
index fd1806d8185..6966a7368c7 100644
--- a/src/layer/riscv/riscv_usability.h
+++ b/src/layer/riscv/riscv_usability.h
@@ -125,54 +125,54 @@ static inline vfloat16m8_t __riscv_vle16_v_f16m8_f16m1(const __fp16* ptr)
 #endif // __riscv_zvfh
 #endif // __riscv_vector

-#if __riscv_vector && __rvv_tuple
+#if 0//__riscv_vector && __rvv_tuple
 // f32m1, vsseg.v
-static inline void __riscv_vsseg8e32_v_f32m1(float32_t* base, vfloat32m1_t v0, vfloat32m1_t v1, vfloat32m1_t v2, vfloat32m1_t v3, vfloat32m1_t v4, vfloat32m1_t v5, vfloat32m1_t v6, vfloat32m1_t v7, size_t vl)
+static inline void __riscv_vsseg8e32_v_f32m1(float* base, vfloat32m1_t v0, vfloat32m1_t v1, vfloat32m1_t v2, vfloat32m1_t v3, vfloat32m1_t v4, vfloat32m1_t v5, vfloat32m1_t v6, vfloat32m1_t v7, size_t vl)
 {
     vfloat32m1x8_t _tmp = __riscv_vcreate_f32m1x8(v0, v1, v2, v3, v4, v5, v6, v7);
     __riscv_vsseg8e32_v_f32m1x8(base, _tmp, vl);
 }

-static inline void __riscv_vsseg4e32_v_f32m1(float32_t* base, vfloat32m1_t v0, vfloat32m1_t v1, vfloat32m1_t v2, vfloat32m1_t v3, size_t vl)
+static inline void __riscv_vsseg4e32_v_f32m1(float* base, vfloat32m1_t v0, vfloat32m1_t v1, vfloat32m1_t v2, vfloat32m1_t v3, size_t vl)
 {
     vfloat32m1x4_t _tmp = __riscv_vcreate_f32m1x4(v0, v1, v2, v3);
     __riscv_vsseg4e32_v_f32m1x4(base, _tmp, vl);
 }

-static inline void __riscv_vsseg2e32_v_f32m1(float32_t* base, vfloat32m1_t v0, vfloat32m1_t v1, size_t vl)
+static inline void __riscv_vsseg2e32_v_f32m1(float* base, vfloat32m1_t v0, vfloat32m1_t v1, size_t vl)
 {
     vfloat32m1x2_t _tmp = __riscv_vcreate_f32m1x2(v0, v1);
     __riscv_vsseg2e32_v_f32m1x2(base, _tmp, vl);
 }

 // f32m1, vssseg.v, 8/4/2
-static inline void __riscv_vssseg8e32_v_f32m1(float32_t* base, ptrdiff_t bstride, vfloat32m1_t v0, vfloat32m1_t v1, vfloat32m1_t v2, vfloat32m1_t v3, vfloat32m1_t v4, vfloat32m1_t v5, vfloat32m1_t v6, vfloat32m1_t v7, size_t vl)
+static inline void __riscv_vssseg8e32_v_f32m1(float* base, ptrdiff_t bstride, vfloat32m1_t v0, vfloat32m1_t v1, vfloat32m1_t v2, vfloat32m1_t v3, vfloat32m1_t v4, vfloat32m1_t v5, vfloat32m1_t v6, vfloat32m1_t v7, size_t vl)
 {
     vfloat32m1x8_t _tmp = __riscv_vcreate_f32m1x8(v0, v1, v2, v3, v4, v5, v6, v7);
     __riscv_vssseg8e32_v_f32m1x8(base, bstride, _tmp, vl);
 }

-static inline void __riscv_vssseg4e32_v_f32m1(float32_t* base, ptrdiff_t bstride, vfloat32m1_t v0, vfloat32m1_t v1, vfloat32m1_t v2, vfloat32m1_t v3, size_t vl)
+static inline void __riscv_vssseg4e32_v_f32m1(float* base, ptrdiff_t bstride, vfloat32m1_t v0, vfloat32m1_t v1, vfloat32m1_t v2, vfloat32m1_t v3, size_t vl)
 {
     vfloat32m1x4_t _tmp = __riscv_vcreate_f32m1x4(v0, v1, v2, v3);
     __riscv_vssseg4e32_v_f32m1x4(base, bstride, _tmp, vl);
 }

-static inline void __riscv_vssseg2e32_v_f32m1(float32_t* base, ptrdiff_t bstride, vfloat32m1_t v0, vfloat32m1_t v1, size_t vl)
+static inline void __riscv_vssseg2e32_v_f32m1(float* base, ptrdiff_t bstride, vfloat32m1_t v0, vfloat32m1_t v1, size_t vl)
 {
     vfloat32m1x2_t _tmp = __riscv_vcreate_f32m1x2(v0, v1);
     __riscv_vssseg2e32_v_f32m1x2(base, bstride, _tmp, vl);
 }

 // f32m2, vsseg.v, 4/2
-static inline void __riscv_vsseg4e32_v_f32m2(float32_t* base, vfloat32m2_t v0, vfloat32m2_t v1, vfloat32m2_t v2, vfloat32m2_t v3, size_t vl)
+static inline void __riscv_vsseg4e32_v_f32m2(float* base, vfloat32m2_t v0, vfloat32m2_t v1, vfloat32m2_t v2, vfloat32m2_t v3, size_t vl)
 {
     vfloat32m2x4_t _tmp = __riscv_vcreate_f32m2x4(v0, v1, v2, v3);
     __riscv_vsseg4e32_v_f32m2x4(base, _tmp, vl);
 }

-static inline void __riscv_vsseg2e32_v_f32m2(float32_t* base, vfloat32m2_t v0, vfloat32m2_t v1, size_t vl)
+static inline void __riscv_vsseg2e32_v_f32m2(float* base, vfloat32m2_t v0, vfloat32m2_t v1, size_t vl)
 {
     vfloat32m2x2_t _tmp = __riscv_vcreate_f32m2x2(v0, v1);
     __riscv_vsseg2e32_v_f32m2x2(base, _tmp, vl);
@@ -205,7 +205,7 @@ static inline void __riscv_vsseg2e16_v_u16m2(uint16_t* base, vuint16m2_t v0, vuint16m2_t v1, size_t vl)
 }

 // f32m1, vlseg.v 8/4/2
-static inline void __riscv_vlseg8e32_v_f32m1(vfloat32m1_t* v0, vfloat32m1_t* v1, vfloat32m1_t* v2, vfloat32m1_t* v3, vfloat32m1_t* v4, vfloat32m1_t* v5, vfloat32m1_t* v6, vfloat32m1_t* v7, const float32_t* base, size_t vl)
+static inline void __riscv_vlseg8e32_v_f32m1(vfloat32m1_t* v0, vfloat32m1_t* v1, vfloat32m1_t* v2, vfloat32m1_t* v3, vfloat32m1_t* v4, vfloat32m1_t* v5, vfloat32m1_t* v6, vfloat32m1_t* v7, const float* base, size_t vl)
 {
     vfloat32m1x8_t _tmp = __riscv_vlseg8e32_v_f32m1x8(base, vl);
     *v0 = __riscv_vget_f32m1x8_f32m1(_tmp, 0);
@@ -218,7 +218,7 @@ static inline void __riscv_vlseg8e32_v_f32m1(vfloat32m1_t* v0, vfloat32m1_t* v1, vfloat32m1_t* v2, vfloat32m1_t* v3, vfloat32m1_t* v4, vfloat32m1_t* v5, vfloat32m1_t* v6, vfloat32m1_t* v7, const float* base, size_t vl)
     *v7 = __riscv_vget_f32m1x8_f32m1(_tmp, 7);
 }

-static inline void __riscv_vlseg4e32_v_f32m1(vfloat32m1_t* v0, vfloat32m1_t* v1, vfloat32m1_t* v2, vfloat32m1_t* v3, const float32_t* base, size_t vl)
+static inline void __riscv_vlseg4e32_v_f32m1(vfloat32m1_t* v0, vfloat32m1_t* v1, vfloat32m1_t* v2, vfloat32m1_t* v3, const float* base, size_t vl)
 {
     vfloat32m1x4_t _tmp = __riscv_vlseg4e32_v_f32m1x4(base, vl);
     *v0 = __riscv_vget_f32m1x4_f32m1(_tmp, 0);
@@ -227,7 +227,7 @@ static inline void __riscv_vlseg4e32_v_f32m1(vfloat32m1_t* v0, vfloat32m1_t* v1, vfloat32m1_t* v2, vfloat32m1_t* v3, const float* base, size_t vl)
     *v3 = __riscv_vget_f32m1x4_f32m1(_tmp, 3);
 }

-static inline void __riscv_vlseg2e32_v_f32m1(vfloat32m1_t* v0, vfloat32m1_t* v1, const float32_t* base, size_t vl)
+static inline void __riscv_vlseg2e32_v_f32m1(vfloat32m1_t* v0, vfloat32m1_t* v1, const float* base, size_t vl)
 {
     vfloat32m1x2_t _tmp = __riscv_vlseg2e32_v_f32m1x2(base, vl);
     *v0 = __riscv_vget_f32m1x2_f32m1(_tmp, 0);
@@ -235,7 +235,7 @@ static inline void __riscv_vlseg2e32_v_f32m1(vfloat32m1_t* v0, vfloat32m1_t* v1, const float* base, size_t vl)
 }

 // f32m2, vlseg.v, 4
-static inline void __riscv_vlseg4e32_v_f32m2(vfloat32m2_t* v0, vfloat32m2_t* v1, vfloat32m2_t* v2, vfloat32m2_t* v3, const float32_t* base, size_t vl)
+static inline void __riscv_vlseg4e32_v_f32m2(vfloat32m2_t* v0, vfloat32m2_t* v1, vfloat32m2_t* v2, vfloat32m2_t* v3, const float* base, size_t vl)
 {
     vfloat32m2x4_t _tmp = __riscv_vlseg4e32_v_f32m2x4(base, vl);
     *v0 = __riscv_vget_f32m2x4_f32m2(_tmp, 0);
@@ -245,7 +245,7 @@ static inline void __riscv_vlseg4e32_v_f32m2(vfloat32m2_t* v0, vfloat32m2_t* v1, vfloat32m2_t* v2, vfloat32m2_t* v3, const float* base, size_t vl)
 }

 // f32m4, vlseg.v, 2
-static inline void __riscv_vlseg2e32_v_f32m4(vfloat32m4_t* v0, vfloat32m4_t* v1, const float32_t* base, size_t vl)
+static inline void __riscv_vlseg2e32_v_f32m4(vfloat32m4_t* v0, vfloat32m4_t* v1, const float* base, size_t vl)
 {
     vfloat32m4x2_t _tmp = __riscv_vlseg2e32_v_f32m4x2(base, vl);
     *v0 = __riscv_vget_f32m4x2_f32m4(_tmp, 0);
@@ -253,7 +253,7 @@ static inline void __riscv_vlseg2e32_v_f32m4(vfloat32m4_t* v0, vfloat32m4_t* v1, const float* base, size_t vl)
 }

 // f32m4, vloxseg.v
-static inline void __riscv_vloxseg2ei32_v_f32m4(vfloat32m4_t* v0, vfloat32m4_t* v1, const float32_t* base, vuint32m4_t bindex, size_t vl)
+static inline void __riscv_vloxseg2ei32_v_f32m4(vfloat32m4_t* v0, vfloat32m4_t* v1, const float* base, vuint32m4_t bindex, size_t vl)
 {
     vfloat32m4x2_t _tmp = __riscv_vloxseg2ei32_v_f32m4x2(base, bindex, vl);
     *v0 = __riscv_vget_f32m4x2_f32m4(_tmp, 0);
@@ -399,7 +399,7 @@ static inline void __riscv_vlseg2e16_v_f16m4(vfloat16m4_t* v0, vfloat16m4_t* v1, const __fp16* base, size_t vl)
 #endif // __riscv_zvfh
 #endif // __riscv_vector

-#ifdef __riscv_vector
+#if __riscv_vector

 static inline void transpose8x8_ps(vfloat32m1_t& _r0l, vfloat32m1_t& _r0h,
                                    vfloat32m1_t& _r1l, vfloat32m1_t& _r1h,
@@ -411,8 +411,57 @@ static inline void transpose8x8_ps(vfloat32m1_t& _r0l, vfloat32m1_t& _r0h,
                                    vfloat32m1_t& _r7l, vfloat32m1_t& _r7h, size_t vl)
 {
     float tmp[64];
-    __riscv_vsseg8e32_v_f32m1x8(&tmp[0], __riscv_vcreate_v_f32m1x8(_r0l, _r1l, _r2l, _r3l, _r4l, _r5l, _r6l, _r7l), vl);
-    __riscv_vsseg8e32_v_f32m1x8(&tmp[32], __riscv_vcreate_v_f32m1x8(_r0h, _r1h, _r2h, _r3h, _r4h, _r5h, _r6h, _r7h), vl);
+#if __riscv_vector
+#if __riscv_v_intrinsic > 12000
+    vfloat32m1x8_t _rl = __riscv_vcreate_v_f32m1x8(_r0l, _r1l, _r2l, _r3l, _r4l, _r5l, _r6l, _r7l);
+    vfloat32m1x8_t _rh = __riscv_vcreate_v_f32m1x8(_r0h, _r1h, _r2h, _r3h, _r4h, _r5h, _r6h, _r7h);
+#else
+    vfloat32m1x8_t _rl = vfloat32m1x8_t();
+    _rl = __riscv_vset_v_f32m1_f32m1x8(_rl, 0, _r0l);
+    _rl = __riscv_vset_v_f32m1_f32m1x8(_rl, 1, _r1l);
+    _rl = __riscv_vset_v_f32m1_f32m1x8(_rl, 2, _r2l);
+    _rl = __riscv_vset_v_f32m1_f32m1x8(_rl, 3, _r3l);
+    _rl = __riscv_vset_v_f32m1_f32m1x8(_rl, 4, _r4l);
+    _rl = __riscv_vset_v_f32m1_f32m1x8(_rl, 5, _r5l);
+    _rl = __riscv_vset_v_f32m1_f32m1x8(_rl, 6, _r6l);
+    _rl = __riscv_vset_v_f32m1_f32m1x8(_rl, 7, _r7l);
+    vfloat32m1x8_t _rh = vfloat32m1x8_t();
+    _rh = __riscv_vset_v_f32m1_f32m1x8(_rh, 0, _r0h);
+    _rh = __riscv_vset_v_f32m1_f32m1x8(_rh, 1, _r1h);
+    _rh = __riscv_vset_v_f32m1_f32m1x8(_rh, 2, _r2h);
+    _rh = __riscv_vset_v_f32m1_f32m1x8(_rh, 3, _r3h);
+    _rh = __riscv_vset_v_f32m1_f32m1x8(_rh, 4, _r4h);
+    _rh = __riscv_vset_v_f32m1_f32m1x8(_rh, 5, _r5h);
+    _rh = __riscv_vset_v_f32m1_f32m1x8(_rh, 6, _r6h);
+    _rh = __riscv_vset_v_f32m1_f32m1x8(_rh, 7, _r7h);
+#endif
+    __riscv_vsseg8e32_v_f32m1x8(&tmp[0], _rl, vl);
+    __riscv_vsseg8e32_v_f32m1x8(&tmp[32], _rh, vl);
+#elif __riscv_xtheadvector
+    vfloat32m1x8_t _rl = vfloat32m1x8_t();
+    // _rl = __riscv_th_vset_v_f32m1_f32m1x8(_rl, 0, _r0l);
+    // _rl = __riscv_th_vset_v_f32m1_f32m1x8(_rl, 1, _r1l);
+    // _rl = __riscv_th_vset_v_f32m1_f32m1x8(_rl, 2, _r2l);
+    // _rl = __riscv_th_vset_v_f32m1_f32m1x8(_rl, 3, _r3l);
+    // _rl = __riscv_th_vset_v_f32m1_f32m1x8(_rl, 4, _r4l);
+    // _rl = __riscv_th_vset_v_f32m1_f32m1x8(_rl, 5, _r5l);
+    // _rl = __riscv_th_vset_v_f32m1_f32m1x8(_rl, 6, _r6l);
+    // _rl = __riscv_th_vset_v_f32m1_f32m1x8(_rl, 7, _r7l);
+    vfloat32m1x8_t _rh = vfloat32m1x8_t();
+    // _rh = __riscv_th_vset_v_f32m1_f32m1x8(_rh, 0, _r0h);
+    // _rh = __riscv_th_vset_v_f32m1_f32m1x8(_rh, 1, _r1h);
+    // _rh = __riscv_th_vset_v_f32m1_f32m1x8(_rh, 2, _r2h);
+    // _rh = __riscv_th_vset_v_f32m1_f32m1x8(_rh, 3, _r3h);
+    // _rh = __riscv_th_vset_v_f32m1_f32m1x8(_rh, 4, _r4h);
+    // _rh = __riscv_th_vset_v_f32m1_f32m1x8(_rh, 5, _r5h);
+    // _rh = __riscv_th_vset_v_f32m1_f32m1x8(_rh, 6, _r6h);
+    // _rh = __riscv_th_vset_v_f32m1_f32m1x8(_rh, 7, _r7h);
+    __riscv_th_vsseg8e32_v_f32m1x8(&tmp[0], _rl, vl);
+    __riscv_th_vsseg8e32_v_f32m1x8(&tmp[32], _rh, vl);
+#endif
+
     float* ptr = (float*)tmp;
     _r0l = __riscv_vle32_v_f32m1(ptr + 0 * 4, vl);
     _r0h = __riscv_vle32_v_f32m1(ptr + 1 * 4, vl);
@@ -435,7 +484,16 @@ static inline void transpose8x8_ps(vfloat32m1_t& _r0l, vfloat32m1_t& _r0h,
 static inline void transpose4x4_ps(vfloat32m1_t& _r0, vfloat32m1_t& _r1, vfloat32m1_t& _r2, vfloat32m1_t& _r3, size_t vl)
 {
     float tmp[16];
-    __riscv_vsseg4e32_v_f32m1x4(&tmp[0], __riscv_vcreate_v_f32m1x4(_r0, _r1, _r2, _r3), vl);
+#if __riscv_vector && __riscv_v_intrinsic > 12000
+    vfloat32m1x4_t _r = __riscv_vcreate_v_f32m1x4(_r0, _r1, _r2, _r3);
+#else
+    vfloat32m1x4_t _r = vfloat32m1x4_t();
+    _r = __riscv_vset_v_f32m1_f32m1x4(_r, 0, _r0);
+    _r = __riscv_vset_v_f32m1_f32m1x4(_r, 1, _r1);
+    _r = __riscv_vset_v_f32m1_f32m1x4(_r, 2, _r2);
+    _r = __riscv_vset_v_f32m1_f32m1x4(_r, 3, _r3);
+#endif
+    __riscv_vsseg4e32_v_f32m1x4(&tmp[0], _r, vl);
     float* ptr = (float*)tmp;
     _r0 = __riscv_vle32_v_f32m1(ptr + 0 * 4, vl);
     _r1 = __riscv_vle32_v_f32m1(ptr + 1 * 4, vl);
@@ -519,9 +577,42 @@ static inline void transpose12x8_ps(vfloat32m1_t& _r0l, vfloat32m1_t& _r0m, vfloat32m1_t& _r0h,
                                     vfloat32m1_t& _r7l, vfloat32m1_t& _r7m, vfloat32m1_t& _r7h, size_t vl)
 {
     float tmp[96];
-    __riscv_vsseg8e32_v_f32m1x8(&tmp[0], __riscv_vcreate_v_f32m1x8(_r0l, _r1l, _r2l, _r3l, _r4l, _r5l, _r6l, _r7l), vl);
-    __riscv_vsseg8e32_v_f32m1x8(&tmp[32], __riscv_vcreate_v_f32m1x8(_r0m, _r1m, _r2m, _r3m, _r4m, _r5m, _r6m, _r7m), vl);
-    __riscv_vsseg8e32_v_f32m1x8(&tmp[64], __riscv_vcreate_v_f32m1x8(_r0h, _r1h, _r2h, _r3h, _r4h, _r5h, _r6h, _r7h), vl);
+#if __riscv_vector && __riscv_v_intrinsic > 12000
+    vfloat32m1x8_t _rl = __riscv_vcreate_v_f32m1x8(_r0l, _r1l, _r2l, _r3l, _r4l, _r5l, _r6l, _r7l);
+    vfloat32m1x8_t _rm = __riscv_vcreate_v_f32m1x8(_r0m, _r1m, _r2m, _r3m, _r4m, _r5m, _r6m, _r7m);
+    vfloat32m1x8_t _rh = __riscv_vcreate_v_f32m1x8(_r0h, _r1h, _r2h, _r3h, _r4h, _r5h, _r6h, _r7h);
+#else
+    vfloat32m1x8_t _rl = vfloat32m1x8_t();
+    _rl = __riscv_vset_v_f32m1_f32m1x8(_rl, 0, _r0l);
+    _rl = __riscv_vset_v_f32m1_f32m1x8(_rl, 1, _r1l);
+    _rl = __riscv_vset_v_f32m1_f32m1x8(_rl, 2, _r2l);
+    _rl = __riscv_vset_v_f32m1_f32m1x8(_rl, 3, _r3l);
+    _rl = __riscv_vset_v_f32m1_f32m1x8(_rl, 4, _r4l);
+    _rl = __riscv_vset_v_f32m1_f32m1x8(_rl, 5, _r5l);
+    _rl = __riscv_vset_v_f32m1_f32m1x8(_rl, 6, _r6l);
+    _rl = __riscv_vset_v_f32m1_f32m1x8(_rl, 7, _r7l);
+    vfloat32m1x8_t _rm = vfloat32m1x8_t();
+    _rm = __riscv_vset_v_f32m1_f32m1x8(_rm, 0, _r0m);
+    _rm = __riscv_vset_v_f32m1_f32m1x8(_rm, 1, _r1m);
+    _rm = __riscv_vset_v_f32m1_f32m1x8(_rm, 2, _r2m);
+    _rm = __riscv_vset_v_f32m1_f32m1x8(_rm, 3, _r3m);
+    _rm = __riscv_vset_v_f32m1_f32m1x8(_rm, 4, _r4m);
+    _rm = __riscv_vset_v_f32m1_f32m1x8(_rm, 5, _r5m);
+    _rm = __riscv_vset_v_f32m1_f32m1x8(_rm, 6, _r6m);
+    _rm = __riscv_vset_v_f32m1_f32m1x8(_rm, 7, _r7m);
+    vfloat32m1x8_t _rh = vfloat32m1x8_t();
+    _rh = __riscv_vset_v_f32m1_f32m1x8(_rh, 0, _r0h);
+    _rh = __riscv_vset_v_f32m1_f32m1x8(_rh, 1, _r1h);
+    _rh = __riscv_vset_v_f32m1_f32m1x8(_rh, 2, _r2h);
+    _rh = __riscv_vset_v_f32m1_f32m1x8(_rh, 3, _r3h);
+    _rh = __riscv_vset_v_f32m1_f32m1x8(_rh, 4, _r4h);
+    _rh = __riscv_vset_v_f32m1_f32m1x8(_rh, 5, _r5h);
+    _rh = __riscv_vset_v_f32m1_f32m1x8(_rh, 6, _r6h);
+    _rh = __riscv_vset_v_f32m1_f32m1x8(_rh, 7, _r7h);
+#endif
+    __riscv_vsseg8e32_v_f32m1x8(&tmp[0], _rl, vl);
+    __riscv_vsseg8e32_v_f32m1x8(&tmp[32], _rm, vl);
+    __riscv_vsseg8e32_v_f32m1x8(&tmp[64], _rh, vl);

     float* ptr = (float*)tmp;
     _r0l = __riscv_vle32_v_f32m1(ptr + 0 * 4, vl);
@@ -553,7 +644,20 @@ static inline void transpose12x8_ps(vfloat32m1_t& _r0l, vfloat32m1_t& _r0m, vfloat32m1_t& _r0h,
 static inline void transpose4x8_ps(vfloat32m1_t& _r0, vfloat32m1_t& _r1, vfloat32m1_t& _r2, vfloat32m1_t& _r3, vfloat32m1_t& _r4, vfloat32m1_t& _r5, vfloat32m1_t& _r6, vfloat32m1_t& _r7, size_t vl)
 {
     float tmp[32];
-    __riscv_vsseg8e32_v_f32m1x8(&tmp[0], __riscv_vcreate_v_f32m1x8(_r0, _r1, _r2, _r3, _r4, _r5, _r6, _r7), vl);
+#if __riscv_vector && __riscv_v_intrinsic > 12000
+    vfloat32m1x8_t _r = __riscv_vcreate_v_f32m1x8(_r0, _r1, _r2, _r3, _r4, _r5, _r6, _r7);
+#else
+    vfloat32m1x8_t _r = vfloat32m1x8_t();
+    _r = __riscv_vset_v_f32m1_f32m1x8(_r, 0, _r0);
+    _r = __riscv_vset_v_f32m1_f32m1x8(_r, 1, _r1);
+    _r = __riscv_vset_v_f32m1_f32m1x8(_r, 2, _r2);
+    _r = __riscv_vset_v_f32m1_f32m1x8(_r, 3, _r3);
+    _r = __riscv_vset_v_f32m1_f32m1x8(_r, 4, _r4);
+    _r = __riscv_vset_v_f32m1_f32m1x8(_r, 5, _r5);
+    _r = __riscv_vset_v_f32m1_f32m1x8(_r, 6, _r6);
+    _r = __riscv_vset_v_f32m1_f32m1x8(_r, 7, _r7);
+#endif
+    __riscv_vsseg8e32_v_f32m1x8(&tmp[0], _r, vl);

     float* ptr = (float*)tmp;
     _r0 = __riscv_vle32_v_f32m1(ptr + 0 * 4, vl);
@@ -602,8 +706,23 @@ static inline void transpose8x4_ps(vfloat32m1_t& _r0l, vfloat32m1_t& _r0h,
                                    vfloat32m1_t& _r3l, vfloat32m1_t& _r3h, size_t vl)
 {
     float tmp[32];
-    __riscv_vsseg4e32_v_f32m1x4(&tmp[0], __riscv_vcreate_v_f32m1x4(_r0l, _r1l, _r2l, _r3l), vl);
-    __riscv_vsseg4e32_v_f32m1x4(&tmp[16], __riscv_vcreate_v_f32m1x4(_r0h, _r1h, _r2h, _r3h), vl);
+#if __riscv_vector && __riscv_v_intrinsic > 12000
+    vfloat32m1x4_t _rl = __riscv_vcreate_v_f32m1x4(_r0l, _r1l, _r2l, _r3l);
+    vfloat32m1x4_t _rh = __riscv_vcreate_v_f32m1x4(_r0h, _r1h, _r2h, _r3h);
+#else
+    vfloat32m1x4_t _rl = vfloat32m1x4_t();
+    _rl = __riscv_vset_v_f32m1_f32m1x4(_rl, 0, _r0l);
+    _rl = __riscv_vset_v_f32m1_f32m1x4(_rl, 1, _r1l);
+    _rl = __riscv_vset_v_f32m1_f32m1x4(_rl, 2, _r2l);
+    _rl = __riscv_vset_v_f32m1_f32m1x4(_rl, 3, _r3l);
+    vfloat32m1x4_t _rh = vfloat32m1x4_t();
+    _rh = __riscv_vset_v_f32m1_f32m1x4(_rh, 0, _r0h);
+    _rh = __riscv_vset_v_f32m1_f32m1x4(_rh, 1, _r1h);
+    _rh = __riscv_vset_v_f32m1_f32m1x4(_rh, 2, _r2h);
+    _rh = __riscv_vset_v_f32m1_f32m1x4(_rh, 3, _r3h);
+#endif
+    __riscv_vsseg4e32_v_f32m1x4(&tmp[0], _rl, vl);
+    __riscv_vsseg4e32_v_f32m1x4(&tmp[16], _rh, vl);
     float* ptr = (float*)tmp;
     _r0l = __riscv_vle32_v_f32m1(ptr + 0 * 4, vl);
     _r0h = __riscv_vle32_v_f32m1(ptr + 1 * 4, vl);
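The `__riscv_vcreate_v_*` versus `__riscv_vset_v_*` pattern repeated through the transpose helpers above exists because `vcreate` for tuple types only appeared in later revisions of the RVV intrinsics spec, hence the `__riscv_v_intrinsic > 12000` gate. If more call sites accumulate it could be factored into one helper; a sketch under the same version gate, with a helper name of our choosing:

```cpp
// Version-portable construction of a vfloat32m1x4_t tuple: use vcreate on
// newer intrinsics, fall back to a vset chain on older ones.
static inline vfloat32m1x4_t ncnn_create_f32m1x4(vfloat32m1_t v0, vfloat32m1_t v1, vfloat32m1_t v2, vfloat32m1_t v3)
{
#if __riscv_v_intrinsic > 12000
    return __riscv_vcreate_v_f32m1x4(v0, v1, v2, v3);
#else
    vfloat32m1x4_t _r = vfloat32m1x4_t();
    _r = __riscv_vset_v_f32m1_f32m1x4(_r, 0, v0);
    _r = __riscv_vset_v_f32m1_f32m1x4(_r, 1, v1);
    _r = __riscv_vset_v_f32m1_f32m1x4(_r, 2, v2);
    _r = __riscv_vset_v_f32m1_f32m1x4(_r, 3, v3);
    return _r;
#endif
}
```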
a/src/layer/riscv/softmax_riscv.cpp
+++ b/src/layer/riscv/softmax_riscv.cpp
@@ -33,7 +33,7 @@ int Softmax_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
     int elempack = bottom_top_blob.elempack;
     int positive_axis = axis < 0 ? dims + axis : axis;

-#ifdef __riscv_vector
+#if __riscv_vector
     if (dims == 1) // positive_axis == 0
     {
         int w = bottom_top_blob.w;
diff --git a/src/layer_registry.h.in b/src/layer_registry.h.in
index 97d2de37ef5..6f34f8cb0f0 100644
--- a/src/layer_registry.h.in
+++ b/src/layer_registry.h.in
@@ -46,6 +46,18 @@ static const layer_registry_entry layer_registry_lsx[] = {
 };
 #endif // NCNN_RUNTIME_CPU && NCNN_LSX

+#if NCNN_RUNTIME_CPU && NCNN_RVV
+static const layer_registry_entry layer_registry_rvv[] = {
+@layer_registry_rvv@
+};
+#endif // NCNN_RUNTIME_CPU && NCNN_RVV
+
+#if NCNN_RUNTIME_CPU && NCNN_XTHEADVECTOR
+static const layer_registry_entry layer_registry_xtheadvector[] = {
+@layer_registry_xtheadvector@
+};
+#endif // NCNN_RUNTIME_CPU && NCNN_XTHEADVECTOR
+
 #if NCNN_VULKAN
 static const layer_registry_entry layer_registry_vulkan[] = {
 @layer_registry_vulkan@
diff --git a/src/platform.h.in b/src/platform.h.in
index d28cbdc0014..57af8f2664c 100644
--- a/src/platform.h.in
+++ b/src/platform.h.in
@@ -60,6 +60,7 @@
 #cmakedefine01 NCNN_MMI
 #cmakedefine01 NCNN_RVV
 #cmakedefine01 NCNN_ZVFH
+#cmakedefine01 NCNN_XTHEADVECTOR
 #cmakedefine01 NCNN_INT8
 #cmakedefine01 NCNN_BF16
 #cmakedefine01 NCNN_FORCE_INLINE
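Taken together, the `NCNN_XTHEADVECTOR` platform macro, the per-arch registry tables, and the `cpu_support_*` queries make the xtheadvector path selectable at runtime. A quick probe program that prints which RISC-V paths a build would dispatch to; illustrative only, not part of the patch.

```cpp
// Build against ncnn and run on the target board; all five queries are
// exported from src/cpu.h as shown in the diff above.
#include <cstdio>
#include "cpu.h"

int main()
{
    printf("riscv_v:            %d\n", ncnn::cpu_support_riscv_v());
    printf("riscv_zfh:          %d\n", ncnn::cpu_support_riscv_zfh());
    printf("riscv_zvfh:         %d\n", ncnn::cpu_support_riscv_zvfh());
    printf("riscv_xtheadvector: %d\n", ncnn::cpu_support_riscv_xtheadvector());
    printf("riscv_vlenb:        %d\n", ncnn::cpu_riscv_vlenb());
    return 0;
}
```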