cast zfh
nihui committed Dec 2, 2024
1 parent fefea29 commit cd493c5
Showing 15 changed files with 166 additions and 67 deletions.
61 changes: 20 additions & 41 deletions src/layer/riscv/cast_riscv.cpp
@@ -18,15 +18,14 @@
#include <riscv_vector.h>
#endif // __riscv_vector

#include "cpu.h"

namespace ncnn {

Cast_riscv::Cast_riscv()
{
#if __riscv_vector
support_packing = true;
#if __riscv_zvfh
support_fp16_storage = true;
#endif
#endif // __riscv_vector
}

@@ -89,55 +88,33 @@ int Cast_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt

int size = w * h * d * elempack;

#if __riscv_vector && __riscv_zvfh
#if NCNN_ZFH
if (type_from == 1 && type_to == 2)
{
#pragma omp parallel for num_threads(opt.num_threads)
for (int q = 0; q < channels; q++)
#if __riscv_vector
if (cpu_support_riscv_zvfh())
#else
if (cpu_support_riscv_zfh())
#endif
{
const float* ptr = bottom_blob.channel(q);
__fp16* outptr = top_blob.channel(q);

int n = size;
while (n > 0)
{
size_t vl = __riscv_vsetvl_e32m8(n);

vfloat32m8_t _p = __riscv_vle32_v_f32m8(ptr, vl);
vfloat16m4_t _outp = __riscv_vfncvt_f_f_w_f16m4(_p, vl);
__riscv_vse16_v_f16m4(outptr, _outp, vl);

ptr += vl;
outptr += vl;
n -= vl;
}
cast_fp32_to_fp16(bottom_blob, top_blob, opt);
return 0;
}
}

if (type_from == 2 && type_to == 1)
{
#pragma omp parallel for num_threads(opt.num_threads)
for (int q = 0; q < channels; q++)
#if __riscv_vector
if (cpu_support_riscv_zvfh())
#else
if (cpu_support_riscv_zfh())
#endif
{
const __fp16* ptr = bottom_blob.channel(q);
float* outptr = top_blob.channel(q);

int n = size;
while (n > 0)
{
size_t vl = __riscv_vsetvl_e16m4(n);

vfloat16m4_t _p = __riscv_vle16_v_f16m4(ptr, vl);
vfloat32m8_t _outp = __riscv_vfwcvt_f_f_v_f32m8(_p, vl);
__riscv_vse32_v_f32m8(outptr, _outp, vl);

ptr += vl;
outptr += vl;
n -= vl;
}
cast_fp16_to_fp32(bottom_blob, top_blob, opt);
return 0;
}
}
#endif // __riscv_vector && __riscv_zvfh
#endif // NCNN_ZFH

if (type_from == 3 && type_to == 1)
{
@@ -152,6 +129,8 @@ int Cast_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt
outptr[i] = (float)ptr[i];
}
}

return 0;
}

// TODO more cast type
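
For context on the change above: forward() now pairs a compile-time ISA gate with a runtime CPU probe, so a binary built with the V extension only takes the fp16 branch when the running core actually implements zvfh, while a non-vector build falls back to the scalar zfh probe. A minimal standalone sketch of that pattern (can_use_fp16_cast_kernel and its parameters are illustrative names, not part of ncnn):

// Sketch only: the compile-time gate decides which runtime capability matters.
// has_zvfh / has_zfh stand in for ncnn's cpu_support_riscv_zvfh() / cpu_support_riscv_zfh().
static bool can_use_fp16_cast_kernel(bool has_zvfh, bool has_zfh)
{
#if __riscv_vector
    (void)has_zfh;
    return has_zvfh; // vector build: the hardware must also provide vector fp16 (zvfh)
#else
    (void)has_zvfh;
    return has_zfh;  // scalar build: scalar fp16 (zfh) is enough
#endif
}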
5 changes: 5 additions & 0 deletions src/layer/riscv/cast_riscv.h
@@ -25,6 +25,11 @@ class Cast_riscv : public Cast
Cast_riscv();

virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;

#if NCNN_ZFH
void cast_fp32_to_fp16(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
void cast_fp16_to_fp32(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
#endif
};

} // namespace ncnn
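
A hedged usage sketch for the layer these declarations extend: building a standalone Cast layer and converting an fp32 blob to fp16 storage. The param ids (0 = type_from, 1 = type_to) and type codes (1 = fp32, 2 = fp16) follow ncnn's documented Cast parameters; the snippet is illustrative, not part of this commit.

#include "layer.h"
#include "mat.h"
#include "option.h"
#include "paramdict.h"

static int cast_fp32_blob_to_fp16()
{
    ncnn::Layer* cast = ncnn::create_layer("Cast");

    ncnn::ParamDict pd;
    pd.set(0, 1); // type_from: fp32
    pd.set(1, 2); // type_to:   fp16
    cast->load_param(pd);

    ncnn::Option opt;
    opt.use_fp16_storage = true;
    cast->create_pipeline(opt);

    ncnn::Mat in(16, 16, 3); // fp32 storage by default
    in.fill(1.0f);
    ncnn::Mat out;
    int ret = cast->forward(in, out, opt); // on success, out uses 16-bit storage

    cast->destroy_pipeline(opt);
    delete cast;
    return ret;
}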
119 changes: 119 additions & 0 deletions src/layer/riscv/cast_riscv_zfh.cpp
@@ -0,0 +1,119 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#include "cast_riscv.h"

namespace ncnn {

void Cast_riscv::cast_fp32_to_fp16(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
#if __riscv_zfh
NCNN_LOGE("__riscv_zfh");
#endif
#if __riscv_zvfh
NCNN_LOGE("__riscv_zvfh");
#endif
const int w = bottom_blob.w;
const int h = bottom_blob.h;
const int d = bottom_blob.d;
const int channels = bottom_blob.c;
const int elempack = bottom_blob.elempack;

const int size = w * h * d * elempack;

#pragma omp parallel for num_threads(opt.num_threads)
for (int q = 0; q < channels; q++)
{
const float* ptr = bottom_blob.channel(q);
#if __riscv_zfh
__fp16* outptr = top_blob.channel(q);
#else
unsigned short* outptr = top_blob.channel(q);
#endif

int i = 0;
#if __riscv_zvfh
int n = size;
while (n > 0)
{
size_t vl = __riscv_vsetvl_e32m8(n);

vfloat32m8_t _p = __riscv_vle32_v_f32m8(ptr, vl);
vfloat16m4_t _outp = __riscv_vfncvt_f_f_w_f16m4(_p, vl);
__riscv_vse16_v_f16m4(outptr, _outp, vl);

ptr += vl;
outptr += vl;
n -= vl;
}
i = size; // the vector loop above already converted the whole channel
#endif // __riscv_zvfh
for (; i < size; i++)
{
#if __riscv_zfh
*outptr++ = (__fp16)(*ptr++);
#else
*outptr++ = float32_to_float16(*ptr++);
#endif
}
}
}

void Cast_riscv::cast_fp16_to_fp32(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
const int w = bottom_blob.w;
const int h = bottom_blob.h;
const int d = bottom_blob.d;
const int channels = bottom_blob.c;
const int elempack = bottom_blob.elempack;

const int size = w * h * d * elempack;

#pragma omp parallel for num_threads(opt.num_threads)
for (int q = 0; q < channels; q++)
{
#if __riscv_zfh
const __fp16* ptr = bottom_blob.channel(q);
#else
const unsigned short* ptr = bottom_blob.channel(q);
#endif
float* outptr = top_blob.channel(q);

int i = 0;
#if __riscv_zvfh
int n = size;
while (n > 0)
{
size_t vl = __riscv_vsetvl_e16m4(n);

vfloat16m4_t _p = __riscv_vle16_v_f16m4(ptr, vl);
vfloat32m8_t _outp = __riscv_vfwcvt_f_f_v_f32m8(_p, vl);
__riscv_vse32_v_f32m8(outptr, _outp, vl);

ptr += vl;
outptr += vl;
n -= vl;
}
i = size; // the vector loop above already converted the whole channel
#endif // __riscv_zvfh
for (; i < size; i++)
{
#if __riscv_zfh
*outptr++ = (float)(*ptr++);
#else
*outptr++ = float16_to_float32(*ptr++);
#endif
}
}
}

} // namespace ncnn
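
When neither __riscv_zfh nor __riscv_zvfh is available, the fallback loops above lean on ncnn's float32_to_float16() / float16_to_float32() bit-twiddling helpers. As a rough illustration of what such a conversion involves (a deliberately simplified sketch, not ncnn's implementation: it truncates the mantissa and flushes subnormals to zero rather than rounding):

#include <cstring>

// Simplified fp32 -> fp16 bit conversion, for illustration only.
static unsigned short fp32_to_fp16_bits(float f)
{
    unsigned int x;
    std::memcpy(&x, &f, sizeof(x));

    unsigned short sign = (unsigned short)((x >> 16) & 0x8000);
    unsigned int exp32 = (x >> 23) & 0xff;
    unsigned int mantissa = x & 0x7fffff;
    int exp16 = (int)exp32 - 127 + 15; // rebias the exponent

    if (exp32 == 0xff)
        return sign | 0x7c00 | (mantissa ? 0x200 : 0); // inf / nan
    if (exp16 >= 31)
        return sign | 0x7c00;                          // overflow -> inf
    if (exp16 <= 0)
        return sign;                                   // underflow -> signed zero
    return sign | (unsigned short)(exp16 << 10) | (unsigned short)(mantissa >> 13);
}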
2 changes: 1 addition & 1 deletion src/layer/riscv/convolution1d_riscv.cpp
@@ -46,7 +46,7 @@ int Convolution1D_riscv::create_pipeline(const Option& opt)
return 0;

#if NCNN_ZFH
if (opt.use_fp16_storage)
if (support_fp16_storage && opt.use_fp16_storage)
{
return create_pipeline_fp16s(opt);
}
2 changes: 1 addition & 1 deletion src/layer/riscv/convolution_riscv.cpp
@@ -120,7 +120,7 @@ int Convolution_riscv::create_pipeline(const Option& opt)
#endif

#if NCNN_ZFH
if (opt.use_fp16_storage)
if (support_fp16_storage && opt.use_fp16_storage)
{
return create_pipeline_fp16s(opt);
}
2 changes: 1 addition & 1 deletion src/layer/riscv/convolutiondepthwise_riscv.cpp
@@ -65,7 +65,7 @@ int ConvolutionDepthWise_riscv::create_pipeline(const Option& opt)
#endif

#if NCNN_ZFH
if (opt.use_fp16_storage)
if (support_fp16_storage && opt.use_fp16_storage)
{
return create_pipeline_fp16s(opt);
}
2 changes: 1 addition & 1 deletion src/layer/riscv/deconvolution_riscv.cpp
@@ -52,7 +52,7 @@ int Deconvolution_riscv::create_pipeline(const Option& opt)
return 0;

#if NCNN_ZFH
if (opt.use_fp16_storage)
if (support_fp16_storage && opt.use_fp16_storage)
{
return create_pipeline_fp16s(opt);
}
2 changes: 1 addition & 1 deletion src/layer/riscv/deconvolutiondepthwise_riscv.cpp
@@ -46,7 +46,7 @@ int DeconvolutionDepthWise_riscv::create_pipeline(const Option& opt)
return 0;

#if NCNN_ZFH
if (opt.use_fp16_storage)
if (support_fp16_storage && opt.use_fp16_storage)
{
return create_pipeline_fp16s(opt);
}
2 changes: 1 addition & 1 deletion src/layer/riscv/innerproduct_riscv.cpp
@@ -63,7 +63,7 @@ int InnerProduct_riscv::create_pipeline(const Option& opt)
#endif

#if NCNN_ZFH
if (opt.use_fp16_storage)
if (support_fp16_storage && opt.use_fp16_storage)
{
return create_pipeline_fp16s(opt);
}
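
The six create_pipeline() edits in this commit share one shape: the fp16 pipeline is only built when the layer instance itself reports fp16 storage support (a flag its constructor sets from runtime CPU detection) in addition to the user option. A generic sketch of that guard (names are placeholders, not any specific ncnn layer):

#include "option.h"

// Sketch of the shared guard shape after this commit.
struct SomeLayerRiscvSketch
{
    bool support_fp16_storage; // set from runtime CPU detection in the constructor

    int create_pipeline(const ncnn::Option& opt)
    {
#if NCNN_ZFH
        if (support_fp16_storage && opt.use_fp16_storage)
            return create_pipeline_fp16s(opt); // build the fp16 weights/pipeline
#endif
        return 0; // plain fp32 path
    }

    int create_pipeline_fp16s(const ncnn::Option& /*opt*/) { return 0; } // placeholder
};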
2 changes: 0 additions & 2 deletions src/layer/riscv/rvv_mathfun.h
@@ -373,8 +373,6 @@ _RVV_FLOAT32_POW_OP(8, 4)
_v = exp_ps(_v, vl); \
_v = __riscv_vfadd_vf_f32m##LMUL(_v, 1.f, vl); \
vfloat32m##LMUL##_t _reciprocal = __riscv_vfrdiv_vf_f32m##LMUL(_v, 1.f, vl); \
_reciprocal = __riscv_vfmul_vv_f32m##LMUL(__riscv_vfrsub_vf_f32m##LMUL(__riscv_vfmul_vv_f32m##LMUL(_v, _reciprocal, vl), 2.f, vl), _reciprocal, vl); \
/* _reciprocal = __riscv_vfmul_vv_f32m##LMUL(__riscv_vfrsub_vf_f32m##LMUL(__riscv_vfmul_vv_f32m##LMUL(_v, _reciprocal, vl), 2.f, vl), _reciprocal, vl); */ \
return _reciprocal; \
}
#else // __riscv_xtheadvector
2 changes: 0 additions & 2 deletions src/layer/riscv/rvv_mathfun_fp16s.h
@@ -373,8 +373,6 @@ _RVV_FLOAT16_POW_OP(8, 2)
_v = exp_ps(_v, vl); \
_v = __riscv_vfadd_vf_f16m##LMUL(_v, (__fp16)1.f, vl); \
vfloat16m##LMUL##_t _reciprocal = __riscv_vfrdiv_vf_f16m##LMUL(_v, (__fp16)1.f, vl); \
_reciprocal = __riscv_vfmul_vv_f16m##LMUL(__riscv_vfrsub_vf_f16m##LMUL(__riscv_vfmul_vv_f16m##LMUL(_v, _reciprocal, vl), (__fp16)2.f, vl), _reciprocal, vl); \
/* _reciprocal = __riscv_vfmul_vv_f16m##LMUL(__riscv_vfrsub_vf_f16m##LMUL(__riscv_vfmul_vv_f16m##LMUL(_v, _reciprocal, vl), (__fp16)2.f, vl), _reciprocal, vl); */ \
return _reciprocal; \
}
#else // __riscv_xtheadvector
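
The two lines deleted from each of rvv_mathfun.h and rvv_mathfun_fp16s.h were a Newton-Raphson refinement of the reciprocal inside the sigmoid approximation (one active statement plus a commented-out duplicate). Presumably they are redundant because __riscv_vfrdiv_vf_* performs a true reverse division rather than an estimate such as vfrec7, so the extra step r = r * (2 - v * r) adds nothing. A scalar illustration of that reasoning (an assumption about intent, not code from the commit):

// After an exact divide, one Newton-Raphson step for 1/v is a no-op up to rounding.
static float sigmoid_denominator_reciprocal(float v)
{
    float r = 1.0f / v;     // what vfrdiv computes: an exact reverse division
    r = r * (2.0f - v * r); // only useful after an *approximate* reciprocal (e.g. vfrec7)
    return r;
}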
12 changes: 6 additions & 6 deletions src/net.cpp
@@ -639,15 +639,15 @@ int NetPrivate::convert_layout(Mat& bottom_blob, const Layer* layer, const Optio
}
else
#endif // NCNN_VFPV4
#if NCNN_ZVFH
if (opt.use_fp16_storage && cpu_support_riscv_zvfh() && layer->support_fp16_storage)
#if NCNN_ZFH
if (opt.use_fp16_storage && (ncnn::cpu_support_riscv_zvfh() || (!ncnn::cpu_support_riscv_v() && ncnn::cpu_support_riscv_zfh())) && layer->support_fp16_storage)
{
Mat bottom_blob_fp16;
cast_float32_to_float16(bottom_blob, bottom_blob_fp16, opt);
bottom_blob = bottom_blob_fp16;
}
else
#endif // NCNN_ZVFH
#endif // NCNN_ZFH
#if NCNN_BF16
if (opt.use_bf16_storage && layer->support_bf16_storage)
{
@@ -767,15 +767,15 @@ int NetPrivate::convert_layout(Mat& bottom_blob, const Layer* layer, const Optio
}
else
#endif // NCNN_VFPV4
#if NCNN_ZVFH
if (opt.use_fp16_storage && cpu_support_riscv_zvfh() && !layer->support_fp16_storage)
#if NCNN_ZFH
if (opt.use_fp16_storage && (ncnn::cpu_support_riscv_zvfh() || (!ncnn::cpu_support_riscv_v() && ncnn::cpu_support_riscv_zfh())) && !layer->support_fp16_storage)
{
Mat bottom_blob_fp32;
cast_float16_to_float32(bottom_blob, bottom_blob_fp32, opt);
bottom_blob = bottom_blob_fp32;
}
else
#endif // NCNN_ZVFH
#endif // NCNN_ZFH
#if NCNN_BF16
if (opt.use_bf16_storage && !layer->support_bf16_storage)
{
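
The capability predicate gating the fp16 storage casts here (and in tests/testutil.cpp below) is repeated verbatim at every call site. Folded into a helper purely for readability, it reads roughly as follows (a sketch; no such helper exists in the commit):

#include "cpu.h"

// Runtime check used by the fp16 storage casts: vector fp16 (zvfh), or scalar
// fp16 (zfh) on a core that lacks the V extension entirely.
static bool runtime_fp16_storage_supported()
{
    return ncnn::cpu_support_riscv_zvfh()
           || (!ncnn::cpu_support_riscv_v() && ncnn::cpu_support_riscv_zfh());
}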
12 changes: 6 additions & 6 deletions tests/testutil.cpp
@@ -342,13 +342,13 @@ static int convert_to_optimal_layout(const ncnn::Mat& a, ncnn::Mat& a4, const nc
}
else
#endif // NCNN_VFPV4
#if NCNN_ZVFH
if (opt.use_fp16_storage && ncnn::cpu_support_riscv_zvfh() && op->support_fp16_storage && !(flag & TEST_LAYER_DISABLE_AUTO_INPUT_CASTING))
#if NCNN_ZFH
if (opt.use_fp16_storage && (ncnn::cpu_support_riscv_zvfh() || (!ncnn::cpu_support_riscv_v() && ncnn::cpu_support_riscv_zfh())) && op->support_fp16_storage && !(flag & TEST_LAYER_DISABLE_AUTO_INPUT_CASTING))
{
ncnn::cast_float32_to_float16(a, a4, opt);
}
else
#endif // NCNN_ZVFH
#endif // NCNN_ZFH
#if NCNN_BF16
if (opt.use_bf16_storage && op->support_bf16_storage && !(flag & TEST_LAYER_DISABLE_AUTO_INPUT_CASTING))
{
@@ -470,13 +470,13 @@ static int convert_to_vanilla_layout(const ncnn::Mat& c4, ncnn::Mat& c, const nc
}
else
#endif // NCNN_VFPV4
#if NCNN_ZVFH
if (opt.use_fp16_storage && ncnn::cpu_support_riscv_zvfh() && op->support_fp16_storage && c4_unpacked.elembits() == 16)
#if NCNN_ZFH
if (opt.use_fp16_storage && (ncnn::cpu_support_riscv_zvfh() || (!ncnn::cpu_support_riscv_v() && ncnn::cpu_support_riscv_zfh())) && op->support_fp16_storage && c4_unpacked.elembits() == 16)
{
ncnn::cast_float16_to_float32(c4_unpacked, c, opt);
}
else
#endif // NCNN_ZVFH
#endif // NCNN_ZFH
#if NCNN_BF16
if (opt.use_bf16_storage && op->support_bf16_storage && c4_unpacked.elembits() == 16)
{
4 changes: 2 additions & 2 deletions toolchains/c906-v301.toolchain.cmake
@@ -30,8 +30,8 @@ if(NOT CMAKE_FIND_ROOT_PATH_MODE_PACKAGE)
set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE ONLY)
endif()

set(CMAKE_C_FLAGS "-march=rv64gc_zfh_xtheadvector_xtheadc -mabi=lp64d -mtune=c906 -DC906=1 -D__riscv_zvfh=1 -static")
set(CMAKE_CXX_FLAGS "-march=rv64gc_zfh_xtheadvector_xtheadc -mabi=lp64d -mtune=c906 -DC906=1 -D__riscv_zvfh=1 -static")
set(CMAKE_C_FLAGS "-march=rv64gc_zfh_xtheadvector_xtheadc -mabi=lp64d -mtune=c906 -DC906=1 -static")
set(CMAKE_CXX_FLAGS "-march=rv64gc_zfh_xtheadvector_xtheadc -mabi=lp64d -mtune=c906 -DC906=1 -static")

# cache flags
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS}" CACHE STRING "c flags")
4 changes: 2 additions & 2 deletions toolchains/c910-v301.toolchain.cmake
@@ -30,8 +30,8 @@ if(NOT CMAKE_FIND_ROOT_PATH_MODE_PACKAGE)
set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE ONLY)
endif()

set(CMAKE_C_FLAGS "-march=rv64gc_zfh_xtheadvector_xtheadc -mabi=lp64d -mtune=c910 -DC906=1 -D__riscv_zvfh=1 -static")
set(CMAKE_CXX_FLAGS "-march=rv64gc_zfh_xtheadvector_xtheadc -mabi=lp64d -mtune=c910 -DC906=1 -D__riscv_zvfh=1 -static")
set(CMAKE_C_FLAGS "-march=rv64gc_zfh_xtheadvector_xtheadc -mabi=lp64d -mtune=c910 -DC906=1 -static")
set(CMAKE_CXX_FLAGS "-march=rv64gc_zfh_xtheadvector_xtheadc -mabi=lp64d -mtune=c910 -DC906=1 -static")

# cache flags
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS}" CACHE STRING "c flags")