From cd493c51bb3cc9892480e88848e739abc05f130f Mon Sep 17 00:00:00 2001 From: nihui Date: Mon, 2 Dec 2024 10:25:44 +0000 Subject: [PATCH] cast zfh --- src/layer/riscv/cast_riscv.cpp | 61 +++------ src/layer/riscv/cast_riscv.h | 5 + src/layer/riscv/cast_riscv_zfh.cpp | 119 ++++++++++++++++++ src/layer/riscv/convolution1d_riscv.cpp | 2 +- src/layer/riscv/convolution_riscv.cpp | 2 +- .../riscv/convolutiondepthwise_riscv.cpp | 2 +- src/layer/riscv/deconvolution_riscv.cpp | 2 +- .../riscv/deconvolutiondepthwise_riscv.cpp | 2 +- src/layer/riscv/innerproduct_riscv.cpp | 2 +- src/layer/riscv/rvv_mathfun.h | 2 - src/layer/riscv/rvv_mathfun_fp16s.h | 2 - src/net.cpp | 12 +- tests/testutil.cpp | 12 +- toolchains/c906-v301.toolchain.cmake | 4 +- toolchains/c910-v301.toolchain.cmake | 4 +- 15 files changed, 166 insertions(+), 67 deletions(-) create mode 100644 src/layer/riscv/cast_riscv_zfh.cpp diff --git a/src/layer/riscv/cast_riscv.cpp b/src/layer/riscv/cast_riscv.cpp index 0cdfa3a3dc61..9b01dd5a91a2 100644 --- a/src/layer/riscv/cast_riscv.cpp +++ b/src/layer/riscv/cast_riscv.cpp @@ -18,15 +18,14 @@ #include #endif // __riscv_vector +#include "cpu.h" + namespace ncnn { Cast_riscv::Cast_riscv() { #if __riscv_vector support_packing = true; -#if __riscv_zvfh - support_fp16_storage = true; -#endif #endif // __riscv_vector } @@ -89,55 +88,33 @@ int Cast_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt int size = w * h * d * elempack; -#if __riscv_vector && __riscv_zvfh +#if NCNN_ZFH if (type_from == 1 && type_to == 2) { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) +#if __riscv_vector + if (cpu_support_riscv_zvfh()) +#else + if (cpu_support_riscv_zfh()) +#endif { - const float* ptr = bottom_blob.channel(q); - __fp16* outptr = top_blob.channel(q); - - int n = size; - while (n > 0) - { - size_t vl = __riscv_vsetvl_e32m8(n); - - vfloat32m8_t _p = __riscv_vle32_v_f32m8(ptr, vl); - vfloat16m4_t _outp = 
__riscv_vfncvt_f_f_w_f16m4(_p, vl); - __riscv_vse16_v_f16m4(outptr, _outp, vl); - - ptr += vl; - outptr += vl; - n -= vl; - } + cast_fp32_to_fp16(bottom_blob, top_blob, opt); + return 0; } } if (type_from == 2 && type_to == 1) { - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) +#if __riscv_vector + if (cpu_support_riscv_zvfh()) +#else + if (cpu_support_riscv_zfh()) +#endif { - const __fp16* ptr = bottom_blob.channel(q); - float* outptr = top_blob.channel(q); - - int n = size; - while (n > 0) - { - size_t vl = __riscv_vsetvl_e16m4(n); - - vfloat16m4_t _p = __riscv_vle16_v_f16m4(ptr, vl); - vfloat32m8_t _outp = __riscv_vfwcvt_f_f_v_f32m8(_p, vl); - __riscv_vse32_v_f32m8(outptr, _outp, vl); - - ptr += vl; - outptr += vl; - n -= vl; - } + cast_fp16_to_fp32(bottom_blob, top_blob, opt); + return 0; } } -#endif // __riscv_vector && __riscv_zvfh +#endif // NCNN_ZFH if (type_from == 3 && type_to == 1) { @@ -152,6 +129,8 @@ int Cast_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt outptr[i] = (float)ptr[i]; } } + + return 0; } // TODO more cast type diff --git a/src/layer/riscv/cast_riscv.h b/src/layer/riscv/cast_riscv.h index 7c6fbb6d4cea..4d4d4d3b4240 100644 --- a/src/layer/riscv/cast_riscv.h +++ b/src/layer/riscv/cast_riscv.h @@ -25,6 +25,11 @@ class Cast_riscv : public Cast Cast_riscv(); virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; + +#if NCNN_ZFH + void cast_fp32_to_fp16(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; + void cast_fp16_to_fp32(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; +#endif }; } // namespace ncnn diff --git a/src/layer/riscv/cast_riscv_zfh.cpp b/src/layer/riscv/cast_riscv_zfh.cpp new file mode 100644 index 000000000000..114b9feb8f2a --- /dev/null +++ b/src/layer/riscv/cast_riscv_zfh.cpp @@ -0,0 +1,119 @@ +// Tencent is pleased to support the open source community by making ncnn available. 
+//
+// Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "cast_riscv.h"
+
+namespace ncnn {
+
+void Cast_riscv::cast_fp32_to_fp16(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
+{
+    // fp32 -> fp16 cast, compiled per target capability:
+    // - __riscv_zvfh: each channel is converted with RVV narrowing
+    //   intrinsics; i reaches size so the scalar tail below is skipped.
+    // - __riscv_zfh only: scalar __fp16 casts.
+    // - neither: values are packed via float32_to_float16(), keeping
+    //   this translation unit correct on any build configuration.
+    const int w = bottom_blob.w;
+    const int h = bottom_blob.h;
+    const int d = bottom_blob.d;
+    const int channels = bottom_blob.c;
+    const int elempack = bottom_blob.elempack;
+
+    const int size = w * h * d * elempack;
+
+    #pragma omp parallel for num_threads(opt.num_threads)
+    for (int q = 0; q < channels; q++)
+    {
+        const float* ptr = bottom_blob.channel(q);
+#if __riscv_zfh
+        __fp16* outptr = top_blob.channel(q);
+#else
+        unsigned short* outptr = top_blob.channel(q);
+#endif
+
+        int i = 0;
+#if __riscv_zvfh
+        // advance i alongside the pointers so the scalar tail does not rerun
+        while (i < size)
+        {
+            size_t vl = __riscv_vsetvl_e32m8(size - i);
+
+            vfloat32m8_t _p = __riscv_vle32_v_f32m8(ptr, vl);
+            vfloat16m4_t _outp = __riscv_vfncvt_f_f_w_f16m4(_p, vl);
+            __riscv_vse16_v_f16m4(outptr, _outp, vl);
+
+            ptr += vl;
+            outptr += vl;
+            i += vl;
+        }
+#endif // __riscv_zvfh
+        for (; i < size; i++)
+        {
+#if __riscv_zfh
+            *outptr++ = (__fp16)(*ptr++);
+#else
+            *outptr++ = float32_to_float16(*ptr++);
+#endif
+        }
+    }
+}
+
+void Cast_riscv::cast_fp16_to_fp32(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
+{
+    const int w = bottom_blob.w;
+
const int h = bottom_blob.h;
+    const int d = bottom_blob.d;
+    const int channels = bottom_blob.c;
+    const int elempack = bottom_blob.elempack;
+
+    const int size = w * h * d * elempack;
+
+    #pragma omp parallel for num_threads(opt.num_threads)
+    for (int q = 0; q < channels; q++)
+    {
+#if __riscv_zfh
+        const __fp16* ptr = bottom_blob.channel(q);
+#else
+        const unsigned short* ptr = bottom_blob.channel(q);
+#endif
+        float* outptr = top_blob.channel(q);
+
+        int i = 0;
+#if __riscv_zvfh
+        // advance i alongside the pointers so the scalar tail does not rerun
+        while (i < size)
+        {
+            size_t vl = __riscv_vsetvl_e16m4(size - i);
+
+            vfloat16m4_t _p = __riscv_vle16_v_f16m4(ptr, vl);
+            vfloat32m8_t _outp = __riscv_vfwcvt_f_f_v_f32m8(_p, vl);
+            __riscv_vse32_v_f32m8(outptr, _outp, vl);
+
+            ptr += vl;
+            outptr += vl;
+            i += vl;
+        }
+#endif // __riscv_zvfh
+        for (; i < size; i++)
+        {
+#if __riscv_zfh
+            *outptr++ = (float)(*ptr++);
+#else
+            *outptr++ = float16_to_float32(*ptr++);
+#endif
+        }
+    }
+}
+
+} // namespace ncnn
diff --git a/src/layer/riscv/convolution1d_riscv.cpp b/src/layer/riscv/convolution1d_riscv.cpp
index b2c1c198857c..ca01f6c4a782 100644
--- a/src/layer/riscv/convolution1d_riscv.cpp
+++ b/src/layer/riscv/convolution1d_riscv.cpp
@@ -46,7 +46,7 @@ int Convolution1D_riscv::create_pipeline(const Option& opt)
         return 0;
 
 #if NCNN_ZFH
-    if (opt.use_fp16_storage)
+    if (support_fp16_storage && opt.use_fp16_storage)
     {
         return create_pipeline_fp16s(opt);
     }
diff --git a/src/layer/riscv/convolution_riscv.cpp b/src/layer/riscv/convolution_riscv.cpp
index c252d25f2351..9e7f5afdab49 100644
--- a/src/layer/riscv/convolution_riscv.cpp
+++ b/src/layer/riscv/convolution_riscv.cpp
@@ -120,7 +120,7 @@ int Convolution_riscv::create_pipeline(const Option& opt)
 #endif
 
 #if NCNN_ZFH
-    if (opt.use_fp16_storage)
+    if (support_fp16_storage && opt.use_fp16_storage)
     {
         return create_pipeline_fp16s(opt);
     }
diff --git a/src/layer/riscv/convolutiondepthwise_riscv.cpp b/src/layer/riscv/convolutiondepthwise_riscv.cpp
index a0926077b340..6014b647193c 100644
---
a/src/layer/riscv/convolutiondepthwise_riscv.cpp +++ b/src/layer/riscv/convolutiondepthwise_riscv.cpp @@ -65,7 +65,7 @@ int ConvolutionDepthWise_riscv::create_pipeline(const Option& opt) #endif #if NCNN_ZFH - if (opt.use_fp16_storage) + if (support_fp16_storage && opt.use_fp16_storage) { return create_pipeline_fp16s(opt); } diff --git a/src/layer/riscv/deconvolution_riscv.cpp b/src/layer/riscv/deconvolution_riscv.cpp index 27afddd795cf..f76274470ab3 100644 --- a/src/layer/riscv/deconvolution_riscv.cpp +++ b/src/layer/riscv/deconvolution_riscv.cpp @@ -52,7 +52,7 @@ int Deconvolution_riscv::create_pipeline(const Option& opt) return 0; #if NCNN_ZFH - if (opt.use_fp16_storage) + if (support_fp16_storage && opt.use_fp16_storage) { return create_pipeline_fp16s(opt); } diff --git a/src/layer/riscv/deconvolutiondepthwise_riscv.cpp b/src/layer/riscv/deconvolutiondepthwise_riscv.cpp index 5fee5c24764d..ab22becae09c 100644 --- a/src/layer/riscv/deconvolutiondepthwise_riscv.cpp +++ b/src/layer/riscv/deconvolutiondepthwise_riscv.cpp @@ -46,7 +46,7 @@ int DeconvolutionDepthWise_riscv::create_pipeline(const Option& opt) return 0; #if NCNN_ZFH - if (opt.use_fp16_storage) + if (support_fp16_storage && opt.use_fp16_storage) { return create_pipeline_fp16s(opt); } diff --git a/src/layer/riscv/innerproduct_riscv.cpp b/src/layer/riscv/innerproduct_riscv.cpp index 3b8751f6a2af..8b113b3ea571 100644 --- a/src/layer/riscv/innerproduct_riscv.cpp +++ b/src/layer/riscv/innerproduct_riscv.cpp @@ -63,7 +63,7 @@ int InnerProduct_riscv::create_pipeline(const Option& opt) #endif #if NCNN_ZFH - if (opt.use_fp16_storage) + if (support_fp16_storage && opt.use_fp16_storage) { return create_pipeline_fp16s(opt); } diff --git a/src/layer/riscv/rvv_mathfun.h b/src/layer/riscv/rvv_mathfun.h index 2e152b079bf9..e7ceac3432e1 100644 --- a/src/layer/riscv/rvv_mathfun.h +++ b/src/layer/riscv/rvv_mathfun.h @@ -373,8 +373,6 @@ _RVV_FLOAT32_POW_OP(8, 4) _v = exp_ps(_v, vl); \ _v = __riscv_vfadd_vf_f32m##LMUL(_v, 
1.f, vl); \ vfloat32m##LMUL##_t _reciprocal = __riscv_vfrdiv_vf_f32m##LMUL(_v, 1.f, vl); \ - _reciprocal = __riscv_vfmul_vv_f32m##LMUL(__riscv_vfrsub_vf_f32m##LMUL(__riscv_vfmul_vv_f32m##LMUL(_v, _reciprocal, vl), 2.f, vl), _reciprocal, vl); \ - /* _reciprocal = __riscv_vfmul_vv_f32m##LMUL(__riscv_vfrsub_vf_f32m##LMUL(__riscv_vfmul_vv_f32m##LMUL(_v, _reciprocal, vl), 2.f, vl), _reciprocal, vl); */ \ return _reciprocal; \ } #else // __riscv_xtheadvector diff --git a/src/layer/riscv/rvv_mathfun_fp16s.h b/src/layer/riscv/rvv_mathfun_fp16s.h index 08174608c4bc..fa65e0308222 100644 --- a/src/layer/riscv/rvv_mathfun_fp16s.h +++ b/src/layer/riscv/rvv_mathfun_fp16s.h @@ -373,8 +373,6 @@ _RVV_FLOAT16_POW_OP(8, 2) _v = exp_ps(_v, vl); \ _v = __riscv_vfadd_vf_f16m##LMUL(_v, (__fp16)1.f, vl); \ vfloat16m##LMUL##_t _reciprocal = __riscv_vfrdiv_vf_f16m##LMUL(_v, (__fp16)1.f, vl); \ - _reciprocal = __riscv_vfmul_vv_f16m##LMUL(__riscv_vfrsub_vf_f16m##LMUL(__riscv_vfmul_vv_f16m##LMUL(_v, _reciprocal, vl), (__fp16)2.f, vl), _reciprocal, vl); \ - /* _reciprocal = __riscv_vfmul_vv_f16m##LMUL(__riscv_vfrsub_vf_f16m##LMUL(__riscv_vfmul_vv_f16m##LMUL(_v, _reciprocal, vl), (__fp16)2.f, vl), _reciprocal, vl); */ \ return _reciprocal; \ } #else // __riscv_xtheadvector diff --git a/src/net.cpp b/src/net.cpp index d5e13791b1d2..750f1252f39f 100644 --- a/src/net.cpp +++ b/src/net.cpp @@ -639,15 +639,15 @@ int NetPrivate::convert_layout(Mat& bottom_blob, const Layer* layer, const Optio } else #endif // NCNN_VFPV4 -#if NCNN_ZVFH - if (opt.use_fp16_storage && cpu_support_riscv_zvfh() && layer->support_fp16_storage) +#if NCNN_ZFH + if (opt.use_fp16_storage && (ncnn::cpu_support_riscv_zvfh() || (!ncnn::cpu_support_riscv_v() && ncnn::cpu_support_riscv_zfh())) && layer->support_fp16_storage) { Mat bottom_blob_fp16; cast_float32_to_float16(bottom_blob, bottom_blob_fp16, opt); bottom_blob = bottom_blob_fp16; } else -#endif // NCNN_ZVFH +#endif // NCNN_ZFH #if NCNN_BF16 if (opt.use_bf16_storage && 
layer->support_bf16_storage)
     {
@@ -767,15 +767,15 @@ int NetPrivate::convert_layout(Mat& bottom_blob, const Layer* layer, const Optio
         }
         else
 #endif // NCNN_VFPV4
-#if NCNN_ZVFH
-        if (opt.use_fp16_storage && cpu_support_riscv_zvfh() && !layer->support_fp16_storage)
+#if NCNN_ZFH
+        if (opt.use_fp16_storage && (ncnn::cpu_support_riscv_zvfh() || (!ncnn::cpu_support_riscv_v() && ncnn::cpu_support_riscv_zfh())) && !layer->support_fp16_storage)
         {
             Mat bottom_blob_fp32;
             cast_float16_to_float32(bottom_blob, bottom_blob_fp32, opt);
             bottom_blob = bottom_blob_fp32;
         }
         else
-#endif // NCNN_ZVFH
+#endif // NCNN_ZFH
 #if NCNN_BF16
         if (opt.use_bf16_storage && !layer->support_bf16_storage)
         {
diff --git a/tests/testutil.cpp b/tests/testutil.cpp
index 5acd56848ee3..63d0743057bb 100644
--- a/tests/testutil.cpp
+++ b/tests/testutil.cpp
@@ -342,13 +342,13 @@ static int convert_to_optimal_layout(const ncnn::Mat& a, ncnn::Mat& a4, const nc
     }
     else
 #endif // NCNN_VFPV4
-#if NCNN_ZVFH
-    if (opt.use_fp16_storage && ncnn::cpu_support_riscv_zvfh() && op->support_fp16_storage && !(flag & TEST_LAYER_DISABLE_AUTO_INPUT_CASTING))
+#if NCNN_ZFH
+    if (opt.use_fp16_storage && (ncnn::cpu_support_riscv_zvfh() || (!ncnn::cpu_support_riscv_v() && ncnn::cpu_support_riscv_zfh())) && op->support_fp16_storage && !(flag & TEST_LAYER_DISABLE_AUTO_INPUT_CASTING))
     {
         ncnn::cast_float32_to_float16(a, a4, opt);
     }
     else
-#endif // NCNN_ZVFH
+#endif // NCNN_ZFH
 #if NCNN_BF16
     if (opt.use_bf16_storage && op->support_bf16_storage && !(flag & TEST_LAYER_DISABLE_AUTO_INPUT_CASTING))
     {
@@ -470,13 +470,13 @@ static int convert_to_vanilla_layout(const ncnn::Mat& c4, ncnn::Mat& c, const nc
     }
     else
 #endif // NCNN_VFPV4
-#if NCNN_ZVFH
-    if (opt.use_fp16_storage && ncnn::cpu_support_riscv_zvfh() && op->support_fp16_storage && c4_unpacked.elembits() == 16)
+#if NCNN_ZFH
+    if (opt.use_fp16_storage && (ncnn::cpu_support_riscv_zvfh() || (!ncnn::cpu_support_riscv_v() && ncnn::cpu_support_riscv_zfh())) && op->support_fp16_storage &&
c4_unpacked.elembits() == 16) { ncnn::cast_float16_to_float32(c4_unpacked, c, opt); } else -#endif // NCNN_ZVFH +#endif // NCNN_ZFH #if NCNN_BF16 if (opt.use_bf16_storage && op->support_bf16_storage && c4_unpacked.elembits() == 16) { diff --git a/toolchains/c906-v301.toolchain.cmake b/toolchains/c906-v301.toolchain.cmake index 787b11c0a4cd..e67e0b0c14a4 100644 --- a/toolchains/c906-v301.toolchain.cmake +++ b/toolchains/c906-v301.toolchain.cmake @@ -30,8 +30,8 @@ if(NOT CMAKE_FIND_ROOT_PATH_MODE_PACKAGE) set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE ONLY) endif() -set(CMAKE_C_FLAGS "-march=rv64gc_zfh_xtheadvector_xtheadc -mabi=lp64d -mtune=c906 -DC906=1 -D__riscv_zvfh=1 -static") -set(CMAKE_CXX_FLAGS "-march=rv64gc_zfh_xtheadvector_xtheadc -mabi=lp64d -mtune=c906 -DC906=1 -D__riscv_zvfh=1 -static") +set(CMAKE_C_FLAGS "-march=rv64gc_zfh_xtheadvector_xtheadc -mabi=lp64d -mtune=c906 -DC906=1 -static") +set(CMAKE_CXX_FLAGS "-march=rv64gc_zfh_xtheadvector_xtheadc -mabi=lp64d -mtune=c906 -DC906=1 -static") # cache flags set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS}" CACHE STRING "c flags") diff --git a/toolchains/c910-v301.toolchain.cmake b/toolchains/c910-v301.toolchain.cmake index ee53d77903f0..145a4415bdeb 100644 --- a/toolchains/c910-v301.toolchain.cmake +++ b/toolchains/c910-v301.toolchain.cmake @@ -30,8 +30,8 @@ if(NOT CMAKE_FIND_ROOT_PATH_MODE_PACKAGE) set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE ONLY) endif() -set(CMAKE_C_FLAGS "-march=rv64gc_zfh_xtheadvector_xtheadc -mabi=lp64d -mtune=c910 -DC906=1 -D__riscv_zvfh=1 -static") -set(CMAKE_CXX_FLAGS "-march=rv64gc_zfh_xtheadvector_xtheadc -mabi=lp64d -mtune=c910 -DC906=1 -D__riscv_zvfh=1 -static") +set(CMAKE_C_FLAGS "-march=rv64gc_zfh_xtheadvector_xtheadc -mabi=lp64d -mtune=c910 -DC906=1 -static") +set(CMAKE_CXX_FLAGS "-march=rv64gc_zfh_xtheadvector_xtheadc -mabi=lp64d -mtune=c910 -DC906=1 -static") # cache flags set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS}" CACHE STRING "c flags")