From cd493c51bb3cc9892480e88848e739abc05f130f Mon Sep 17 00:00:00 2001 From: nihui Date: Mon, 2 Dec 2024 10:25:44 +0000 Subject: [PATCH] cast zfh --- src/layer/riscv/cast_riscv.cpp | 61 +++------ src/layer/riscv/cast_riscv.h | 5 + src/layer/riscv/cast_riscv_zfh.cpp | 119 ++++++++++++++++++ src/layer/riscv/convolution1d_riscv.cpp | 2 +- src/layer/riscv/convolution_riscv.cpp | 2 +- .../riscv/convolutiondepthwise_riscv.cpp | 2 +- src/layer/riscv/deconvolution_riscv.cpp | 2 +- .../riscv/deconvolutiondepthwise_riscv.cpp | 2 +- src/layer/riscv/innerproduct_riscv.cpp | 2 +- src/layer/riscv/rvv_mathfun.h | 2 - src/layer/riscv/rvv_mathfun_fp16s.h | 2 - src/net.cpp | 12 +- tests/testutil.cpp | 12 +- toolchains/c906-v301.toolchain.cmake | 4 +- toolchains/c910-v301.toolchain.cmake | 4 +- 15 files changed, 166 insertions(+), 67 deletions(-) create mode 100644 src/layer/riscv/cast_riscv_zfh.cpp diff --git a/src/layer/riscv/cast_riscv.cpp b/src/layer/riscv/cast_riscv.cpp index 0cdfa3a3dc61..9b01dd5a91a2 100644 --- a/src/layer/riscv/cast_riscv.cpp +++ b/src/layer/riscv/cast_riscv.cpp @@ -18,15 +18,14 @@ #include #endif // __riscv_vector +#include "cpu.h" + namespace ncnn { Cast_riscv::Cast_riscv() { #if __riscv_vector support_packing = true; -#if __riscv_zvfh - support_fp16_storage = true; -#endif #endif // __riscv_vector } @@ -89,55 +88,33 @@ int Cast_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt int size = w * h * d * elempack; -#if __riscv_vector && __riscv_zvfh +#if NCNN_ZFH if (type_from == 1 && type_to == 2) { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) +#if __riscv_vector + if (cpu_support_riscv_zvfh()) +#else + if (cpu_support_riscv_zfh()) +#endif { - const float* ptr = bottom_blob.channel(q); - __fp16* outptr = top_blob.channel(q); - - int n = size; - while (n > 0) - { - size_t vl = __riscv_vsetvl_e32m8(n); - - vfloat32m8_t _p = __riscv_vle32_v_f32m8(ptr, vl); - vfloat16m4_t _outp = 
__riscv_vfncvt_f_f_w_f16m4(_p, vl); - __riscv_vse16_v_f16m4(outptr, _outp, vl); - - ptr += vl; - outptr += vl; - n -= vl; - } + cast_fp32_to_fp16(bottom_blob, top_blob, opt); + return 0; } } if (type_from == 2 && type_to == 1) { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) +#if __riscv_vector + if (cpu_support_riscv_zvfh()) +#else + if (cpu_support_riscv_zfh()) +#endif { - const __fp16* ptr = bottom_blob.channel(q); - float* outptr = top_blob.channel(q); - - int n = size; - while (n > 0) - { - size_t vl = __riscv_vsetvl_e16m4(n); - - vfloat16m4_t _p = __riscv_vle16_v_f16m4(ptr, vl); - vfloat32m8_t _outp = __riscv_vfwcvt_f_f_v_f32m8(_p, vl); - __riscv_vse32_v_f32m8(outptr, _outp, vl); - - ptr += vl; - outptr += vl; - n -= vl; - } + cast_fp16_to_fp32(bottom_blob, top_blob, opt); + return 0; } } -#endif // __riscv_vector && __riscv_zvfh +#endif // NCNN_ZFH if (type_from == 3 && type_to == 1) { @@ -152,6 +129,8 @@ int Cast_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt outptr[i] = (float)ptr[i]; } } + + return 0; } // TODO more cast type diff --git a/src/layer/riscv/cast_riscv.h b/src/layer/riscv/cast_riscv.h index 7c6fbb6d4cea..4d4d4d3b4240 100644 --- a/src/layer/riscv/cast_riscv.h +++ b/src/layer/riscv/cast_riscv.h @@ -25,6 +25,11 @@ class Cast_riscv : public Cast Cast_riscv(); virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; + +#if NCNN_ZFH + void cast_fp32_to_fp16(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; + void cast_fp16_to_fp32(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; +#endif }; } // namespace ncnn diff --git a/src/layer/riscv/cast_riscv_zfh.cpp b/src/layer/riscv/cast_riscv_zfh.cpp new file mode 100644 index 000000000000..114b9feb8f2a --- /dev/null +++ b/src/layer/riscv/cast_riscv_zfh.cpp @@ -0,0 +1,119 @@ +// Tencent is pleased to support the open source community by making ncnn available. 
+//
+// Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "cast_riscv.h"
+
+namespace ncnn {
+
+void Cast_riscv::cast_fp32_to_fp16(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
+{
+    // fp32 -> fp16 cast, compiled per target capability:
+    // - __riscv_zvfh: each channel is converted with RVV narrowing
+    //   intrinsics; i reaches size so the scalar tail below is skipped.
+    // - __riscv_zfh only: scalar __fp16 casts.
+    // - neither: values are packed via float32_to_float16(), keeping
+    //   this translation unit correct on any build configuration.
+    const int w = bottom_blob.w;
+    const int h = bottom_blob.h;
+    const int d = bottom_blob.d;
+    const int channels = bottom_blob.c;
+    const int elempack = bottom_blob.elempack;
+
+    const int size = w * h * d * elempack;
+
+    #pragma omp parallel for num_threads(opt.num_threads)
+    for (int q = 0; q < channels; q++)
+    {
+        const float* ptr = bottom_blob.channel(q);
+#if __riscv_zfh
+        __fp16* outptr = top_blob.channel(q);
+#else
+        unsigned short* outptr = top_blob.channel(q);
+#endif
+
+        int i = 0;
+#if __riscv_zvfh
+        // advance i alongside the pointers so the scalar tail does not rerun
+        while (i < size)
+        {
+            size_t vl = __riscv_vsetvl_e32m8(size - i);
+
+            vfloat32m8_t _p = __riscv_vle32_v_f32m8(ptr, vl);
+            vfloat16m4_t _outp = __riscv_vfncvt_f_f_w_f16m4(_p, vl);
+            __riscv_vse16_v_f16m4(outptr, _outp, vl);
+
+            ptr += vl;
+            outptr += vl;
+            i += vl;
+        }
+#endif // __riscv_zvfh
+        for (; i < size; i++)
+        {
+#if __riscv_zfh
+            *outptr++ = (__fp16)(*ptr++);
+#else
+            *outptr++ = float32_to_float16(*ptr++);
+#endif
+        }
+    }
+}
+
+void Cast_riscv::cast_fp16_to_fp32(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
+{
+    const int w = bottom_blob.w;
+
const int h = bottom_blob.h;
+    const int d = bottom_blob.d;
+    const int channels = bottom_blob.c;
+    const int elempack = bottom_blob.elempack;
+
+    const int size = w * h * d * elempack;
+
+    #pragma omp parallel for num_threads(opt.num_threads)
+    for (int q = 0; q < channels; q++)
+    {
+#if __riscv_zfh
+        const __fp16* ptr = bottom_blob.channel(q);
+#else
+        const unsigned short* ptr = bottom_blob.channel(q);
+#endif
+        float* outptr = top_blob.channel(q);
+
+        int i = 0;
+#if __riscv_zvfh
+        // advance i alongside the pointers so the scalar tail does not rerun
+        while (i < size)
+        {
+            size_t vl = __riscv_vsetvl_e16m4(size - i);
+
+            vfloat16m4_t _p = __riscv_vle16_v_f16m4(ptr, vl);
+            vfloat32m8_t _outp = __riscv_vfwcvt_f_f_v_f32m8(_p, vl);
+            __riscv_vse32_v_f32m8(outptr, _outp, vl);
+
+            ptr += vl;
+            outptr += vl;
+            i += vl;
+        }
+#endif // __riscv_zvfh
+        for (; i < size; i++)
+        {
+#if __riscv_zfh
+            *outptr++ = (float)(*ptr++);
+#else
+            *outptr++ = float16_to_float32(*ptr++);
+#endif
+        }
+    }
+}
+
+} // namespace ncnn
diff --git a/src/layer/riscv/convolution1d_riscv.cpp b/src/layer/riscv/convolution1d_riscv.cpp
index b2c1c198857c..ca01f6c4a782 100644
--- a/src/layer/riscv/convolution1d_riscv.cpp
+++ b/src/layer/riscv/convolution1d_riscv.cpp
@@ -46,7 +46,7 @@ int Convolution1D_riscv::create_pipeline(const Option& opt)
         return 0;
 
 #if NCNN_ZFH
-    if (opt.use_fp16_storage)
+    if (support_fp16_storage && opt.use_fp16_storage)
     {
         return create_pipeline_fp16s(opt);
     }
diff --git a/src/layer/riscv/convolution_riscv.cpp b/src/layer/riscv/convolution_riscv.cpp
index c252d25f2351..9e7f5afdab49 100644
--- a/src/layer/riscv/convolution_riscv.cpp
+++ b/src/layer/riscv/convolution_riscv.cpp
@@ -120,7 +120,7 @@ int Convolution_riscv::create_pipeline(const Option& opt)
 #endif
 
 #if NCNN_ZFH
-    if (opt.use_fp16_storage)
+    if (support_fp16_storage && opt.use_fp16_storage)
     {
         return create_pipeline_fp16s(opt);
     }
diff --git a/src/layer/riscv/convolutiondepthwise_riscv.cpp b/src/layer/riscv/convolutiondepthwise_riscv.cpp
index a0926077b340..6014b647193c 100644
---
a/src/layer/riscv/convolutiondepthwise_riscv.cpp +++ b/src/layer/riscv/convolutiondepthwise_riscv.cpp @@ -65,7 +65,7 @@ int ConvolutionDepthWise_riscv::create_pipeline(const Option& opt) #endif #if NCNN_ZFH - if (opt.use_fp16_storage) + if (support_fp16_storage && opt.use_fp16_storage) { return create_pipeline_fp16s(opt); } diff --git a/src/layer/riscv/deconvolution_riscv.cpp b/src/layer/riscv/deconvolution_riscv.cpp index 27afddd795cf..f76274470ab3 100644 --- a/src/layer/riscv/deconvolution_riscv.cpp +++ b/src/layer/riscv/deconvolution_riscv.cpp @@ -52,7 +52,7 @@ int Deconvolution_riscv::create_pipeline(const Option& opt) return 0; #if NCNN_ZFH - if (opt.use_fp16_storage) + if (support_fp16_storage && opt.use_fp16_storage) { return create_pipeline_fp16s(opt); } diff --git a/src/layer/riscv/deconvolutiondepthwise_riscv.cpp b/src/layer/riscv/deconvolutiondepthwise_riscv.cpp index 5fee5c24764d..ab22becae09c 100644 --- a/src/layer/riscv/deconvolutiondepthwise_riscv.cpp +++ b/src/layer/riscv/deconvolutiondepthwise_riscv.cpp @@ -46,7 +46,7 @@ int DeconvolutionDepthWise_riscv::create_pipeline(const Option& opt) return 0; #if NCNN_ZFH - if (opt.use_fp16_storage) + if (support_fp16_storage && opt.use_fp16_storage) { return create_pipeline_fp16s(opt); } diff --git a/src/layer/riscv/innerproduct_riscv.cpp b/src/layer/riscv/innerproduct_riscv.cpp index 3b8751f6a2af..8b113b3ea571 100644 --- a/src/layer/riscv/innerproduct_riscv.cpp +++ b/src/layer/riscv/innerproduct_riscv.cpp @@ -63,7 +63,7 @@ int InnerProduct_riscv::create_pipeline(const Option& opt) #endif #if NCNN_ZFH - if (opt.use_fp16_storage) + if (support_fp16_storage && opt.use_fp16_storage) { return create_pipeline_fp16s(opt); } diff --git a/src/layer/riscv/rvv_mathfun.h b/src/layer/riscv/rvv_mathfun.h index 2e152b079bf9..e7ceac3432e1 100644 --- a/src/layer/riscv/rvv_mathfun.h +++ b/src/layer/riscv/rvv_mathfun.h @@ -373,8 +373,6 @@ _RVV_FLOAT32_POW_OP(8, 4) _v = exp_ps(_v, vl); \ _v = __riscv_vfadd_vf_f32m##LMUL(_v, 
1.f, vl); \ vfloat32m##LMUL##_t _reciprocal = __riscv_vfrdiv_vf_f32m##LMUL(_v, 1.f, vl); \ - _reciprocal = __riscv_vfmul_vv_f32m##LMUL(__riscv_vfrsub_vf_f32m##LMUL(__riscv_vfmul_vv_f32m##LMUL(_v, _reciprocal, vl), 2.f, vl), _reciprocal, vl); \ - /* _reciprocal = __riscv_vfmul_vv_f32m##LMUL(__riscv_vfrsub_vf_f32m##LMUL(__riscv_vfmul_vv_f32m##LMUL(_v, _reciprocal, vl), 2.f, vl), _reciprocal, vl); */ \ return _reciprocal; \ } #else // __riscv_xtheadvector diff --git a/src/layer/riscv/rvv_mathfun_fp16s.h b/src/layer/riscv/rvv_mathfun_fp16s.h index 08174608c4bc..fa65e0308222 100644 --- a/src/layer/riscv/rvv_mathfun_fp16s.h +++ b/src/layer/riscv/rvv_mathfun_fp16s.h @@ -373,8 +373,6 @@ _RVV_FLOAT16_POW_OP(8, 2) _v = exp_ps(_v, vl); \ _v = __riscv_vfadd_vf_f16m##LMUL(_v, (__fp16)1.f, vl); \ vfloat16m##LMUL##_t _reciprocal = __riscv_vfrdiv_vf_f16m##LMUL(_v, (__fp16)1.f, vl); \ - _reciprocal = __riscv_vfmul_vv_f16m##LMUL(__riscv_vfrsub_vf_f16m##LMUL(__riscv_vfmul_vv_f16m##LMUL(_v, _reciprocal, vl), (__fp16)2.f, vl), _reciprocal, vl); \ - /* _reciprocal = __riscv_vfmul_vv_f16m##LMUL(__riscv_vfrsub_vf_f16m##LMUL(__riscv_vfmul_vv_f16m##LMUL(_v, _reciprocal, vl), (__fp16)2.f, vl), _reciprocal, vl); */ \ return _reciprocal; \ } #else // __riscv_xtheadvector diff --git a/src/net.cpp b/src/net.cpp index d5e13791b1d2..750f1252f39f 100644 --- a/src/net.cpp +++ b/src/net.cpp @@ -639,15 +639,15 @@ int NetPrivate::convert_layout(Mat& bottom_blob, const Layer* layer, const Optio } else #endif // NCNN_VFPV4 -#if NCNN_ZVFH - if (opt.use_fp16_storage && cpu_support_riscv_zvfh() && layer->support_fp16_storage) +#if NCNN_ZFH + if (opt.use_fp16_storage && (ncnn::cpu_support_riscv_zvfh() || (!ncnn::cpu_support_riscv_v() && ncnn::cpu_support_riscv_zfh())) && layer->support_fp16_storage) { Mat bottom_blob_fp16; cast_float32_to_float16(bottom_blob, bottom_blob_fp16, opt); bottom_blob = bottom_blob_fp16; } else -#endif // NCNN_ZVFH +#endif // NCNN_ZFH #if NCNN_BF16 if (opt.use_bf16_storage && 
layer->support_bf16_storage)
     {
@@ -767,15 +767,15 @@ int NetPrivate::convert_layout(Mat& bottom_blob, const Layer* layer, const Optio
         }
         else
 #endif // NCNN_VFPV4
-#if NCNN_ZVFH
-        if (opt.use_fp16_storage && cpu_support_riscv_zvfh() && !layer->support_fp16_storage)
+#if NCNN_ZFH
+        if (opt.use_fp16_storage && (ncnn::cpu_support_riscv_zvfh() || (!ncnn::cpu_support_riscv_v() && ncnn::cpu_support_riscv_zfh())) && !layer->support_fp16_storage)
         {
             Mat bottom_blob_fp32;
             cast_float16_to_float32(bottom_blob, bottom_blob_fp32, opt);
             bottom_blob = bottom_blob_fp32;
         }
         else
-#endif // NCNN_ZVFH
+#endif // NCNN_ZFH
 #if NCNN_BF16
         if (opt.use_bf16_storage && !layer->support_bf16_storage)
         {
diff --git a/tests/testutil.cpp b/tests/testutil.cpp
index 5acd56848ee3..63d0743057bb 100644
--- a/tests/testutil.cpp
+++ b/tests/testutil.cpp
@@ -342,13 +342,13 @@ static int convert_to_optimal_layout(const ncnn::Mat& a, ncnn::Mat& a4, const nc
     }
     else
 #endif // NCNN_VFPV4
-#if NCNN_ZVFH
-    if (opt.use_fp16_storage && ncnn::cpu_support_riscv_zvfh() && op->support_fp16_storage && !(flag & TEST_LAYER_DISABLE_AUTO_INPUT_CASTING))
+#if NCNN_ZFH
+    if (opt.use_fp16_storage && (ncnn::cpu_support_riscv_zvfh() || (!ncnn::cpu_support_riscv_v() && ncnn::cpu_support_riscv_zfh())) && op->support_fp16_storage && !(flag & TEST_LAYER_DISABLE_AUTO_INPUT_CASTING))
     {
         ncnn::cast_float32_to_float16(a, a4, opt);
     }
     else
-#endif // NCNN_ZVFH
+#endif // NCNN_ZFH
 #if NCNN_BF16
     if (opt.use_bf16_storage && op->support_bf16_storage && !(flag & TEST_LAYER_DISABLE_AUTO_INPUT_CASTING))
     {
@@ -470,13 +470,13 @@ static int convert_to_vanilla_layout(const ncnn::Mat& c4, ncnn::Mat& c, const nc
     }
     else
 #endif // NCNN_VFPV4
-#if NCNN_ZVFH
-    if (opt.use_fp16_storage && ncnn::cpu_support_riscv_zvfh() && op->support_fp16_storage && c4_unpacked.elembits() == 16)
+#if NCNN_ZFH
+    if (opt.use_fp16_storage && (ncnn::cpu_support_riscv_zvfh() || (!ncnn::cpu_support_riscv_v() && ncnn::cpu_support_riscv_zfh())) && op->support_fp16_storage &&
c4_unpacked.elembits() == 16) { ncnn::cast_float16_to_float32(c4_unpacked, c, opt); } else -#endif // NCNN_ZVFH +#endif // NCNN_ZFH #if NCNN_BF16 if (opt.use_bf16_storage && op->support_bf16_storage && c4_unpacked.elembits() == 16) { diff --git a/toolchains/c906-v301.toolchain.cmake b/toolchains/c906-v301.toolchain.cmake index 787b11c0a4cd..e67e0b0c14a4 100644 --- a/toolchains/c906-v301.toolchain.cmake +++ b/toolchains/c906-v301.toolchain.cmake @@ -30,8 +30,8 @@ if(NOT CMAKE_FIND_ROOT_PATH_MODE_PACKAGE) set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE ONLY) endif() -set(CMAKE_C_FLAGS "-march=rv64gc_zfh_xtheadvector_xtheadc -mabi=lp64d -mtune=c906 -DC906=1 -D__riscv_zvfh=1 -static") -set(CMAKE_CXX_FLAGS "-march=rv64gc_zfh_xtheadvector_xtheadc -mabi=lp64d -mtune=c906 -DC906=1 -D__riscv_zvfh=1 -static") +set(CMAKE_C_FLAGS "-march=rv64gc_zfh_xtheadvector_xtheadc -mabi=lp64d -mtune=c906 -DC906=1 -static") +set(CMAKE_CXX_FLAGS "-march=rv64gc_zfh_xtheadvector_xtheadc -mabi=lp64d -mtune=c906 -DC906=1 -static") # cache flags set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS}" CACHE STRING "c flags") diff --git a/toolchains/c910-v301.toolchain.cmake b/toolchains/c910-v301.toolchain.cmake index ee53d77903f0..145a4415bdeb 100644 --- a/toolchains/c910-v301.toolchain.cmake +++ b/toolchains/c910-v301.toolchain.cmake @@ -30,8 +30,8 @@ if(NOT CMAKE_FIND_ROOT_PATH_MODE_PACKAGE) set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE ONLY) endif() -set(CMAKE_C_FLAGS "-march=rv64gc_zfh_xtheadvector_xtheadc -mabi=lp64d -mtune=c910 -DC906=1 -D__riscv_zvfh=1 -static") -set(CMAKE_CXX_FLAGS "-march=rv64gc_zfh_xtheadvector_xtheadc -mabi=lp64d -mtune=c910 -DC906=1 -D__riscv_zvfh=1 -static") +set(CMAKE_C_FLAGS "-march=rv64gc_zfh_xtheadvector_xtheadc -mabi=lp64d -mtune=c910 -DC906=1 -static") +set(CMAKE_CXX_FLAGS "-march=rv64gc_zfh_xtheadvector_xtheadc -mabi=lp64d -mtune=c910 -DC906=1 -static") # cache flags set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS}" CACHE STRING "c flags")