cast zfh
nihui committed Dec 2, 2024
1 parent fefea29 commit cd493c5
Showing 15 changed files with 166 additions and 67 deletions.
61 changes: 20 additions & 41 deletions src/layer/riscv/cast_riscv.cpp
@@ -18,15 +18,14 @@
#include <riscv_vector.h>
#endif // __riscv_vector

#include "cpu.h"

namespace ncnn {

Cast_riscv::Cast_riscv()
{
#if __riscv_vector
support_packing = true;
#if __riscv_zvfh
support_fp16_storage = true;
#endif
#endif // __riscv_vector
}

@@ -89,55 +88,33 @@ int Cast_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt

int size = w * h * d * elempack;

#if __riscv_vector && __riscv_zvfh
#if NCNN_ZFH
if (type_from == 1 && type_to == 2)
{
#pragma omp parallel for num_threads(opt.num_threads)
for (int q = 0; q < channels; q++)
#if __riscv_vector
if (cpu_support_riscv_zvfh())
#else
if (cpu_support_riscv_zfh())
#endif
{
const float* ptr = bottom_blob.channel(q);
__fp16* outptr = top_blob.channel(q);

int n = size;
while (n > 0)
{
size_t vl = __riscv_vsetvl_e32m8(n);

vfloat32m8_t _p = __riscv_vle32_v_f32m8(ptr, vl);
vfloat16m4_t _outp = __riscv_vfncvt_f_f_w_f16m4(_p, vl);
__riscv_vse16_v_f16m4(outptr, _outp, vl);

ptr += vl;
outptr += vl;
n -= vl;
}
cast_fp32_to_fp16(bottom_blob, top_blob, opt);
return 0;
}
}

if (type_from == 2 && type_to == 1)
{
#pragma omp parallel for num_threads(opt.num_threads)
for (int q = 0; q < channels; q++)
#if __riscv_vector
if (cpu_support_riscv_zvfh())
#else
if (cpu_support_riscv_zfh())
#endif
{
const __fp16* ptr = bottom_blob.channel(q);
float* outptr = top_blob.channel(q);

int n = size;
while (n > 0)
{
size_t vl = __riscv_vsetvl_e16m4(n);

vfloat16m4_t _p = __riscv_vle16_v_f16m4(ptr, vl);
vfloat32m8_t _outp = __riscv_vfwcvt_f_f_v_f32m8(_p, vl);
__riscv_vse32_v_f32m8(outptr, _outp, vl);

ptr += vl;
outptr += vl;
n -= vl;
}
cast_fp16_to_fp32(bottom_blob, top_blob, opt);
return 0;
}
}
#endif // __riscv_vector && __riscv_zvfh
#endif // NCNN_ZFH

if (type_from == 3 && type_to == 1)
{
@@ -152,6 +129,8 @@ int Cast_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt
outptr[i] = (float)ptr[i];
}
}

return 0;
}

// TODO more cast type
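
For context on the change above: forward() now pairs a compile-time ISA gate with a runtime CPU probe, so a binary built with the V extension only takes the fp16 branch when the running core actually implements zvfh, while a non-vector build falls back to the scalar zfh probe. A minimal standalone sketch of that pattern (can_use_fp16_cast_kernel and its parameters are illustrative names, not part of ncnn):

// Sketch only: the compile-time gate decides which runtime capability matters.
// has_zvfh / has_zfh stand in for ncnn's cpu_support_riscv_zvfh() / cpu_support_riscv_zfh().
static bool can_use_fp16_cast_kernel(bool has_zvfh, bool has_zfh)
{
#if __riscv_vector
    (void)has_zfh;
    return has_zvfh; // vector build: the hardware must also provide vector fp16 (zvfh)
#else
    (void)has_zvfh;
    return has_zfh;  // scalar build: scalar fp16 (zfh) is enough
#endif
}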
5 changes: 5 additions & 0 deletions src/layer/riscv/cast_riscv.h
@@ -25,6 +25,11 @@ class Cast_riscv : public Cast
Cast_riscv();

virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;

#if NCNN_ZFH
void cast_fp32_to_fp16(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
void cast_fp16_to_fp32(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
#endif
};

} // namespace ncnn
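
A hedged usage sketch for the layer these declarations extend: building a standalone Cast layer and converting an fp32 blob to fp16 storage. The param ids (0 = type_from, 1 = type_to) and type codes (1 = fp32, 2 = fp16) follow ncnn's documented Cast parameters; the snippet is illustrative, not part of this commit.

#include "layer.h"
#include "mat.h"
#include "option.h"
#include "paramdict.h"

static int cast_fp32_blob_to_fp16()
{
    ncnn::Layer* cast = ncnn::create_layer("Cast");

    ncnn::ParamDict pd;
    pd.set(0, 1); // type_from: fp32
    pd.set(1, 2); // type_to:   fp16
    cast->load_param(pd);

    ncnn::Option opt;
    opt.use_fp16_storage = true;
    cast->create_pipeline(opt);

    ncnn::Mat in(16, 16, 3); // fp32 storage by default
    in.fill(1.0f);
    ncnn::Mat out;
    int ret = cast->forward(in, out, opt); // on success, out uses 16-bit storage

    cast->destroy_pipeline(opt);
    delete cast;
    return ret;
}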
119 changes: 119 additions & 0 deletions src/layer/riscv/cast_riscv_zfh.cpp
@@ -0,0 +1,119 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#include "cast_riscv.h"

namespace ncnn {

void Cast_riscv::cast_fp32_to_fp16(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
#if __riscv_zfh
NCNN_LOGE("__riscv_zfh");
#endif
#if __riscv_zvfh
NCNN_LOGE("__riscv_zvfh");
#endif
const int w = bottom_blob.w;
const int h = bottom_blob.h;
const int d = bottom_blob.d;
const int channels = bottom_blob.c;
const int elempack = bottom_blob.elempack;

const int size = w * h * d * elempack;

#pragma omp parallel for num_threads(opt.num_threads)
for (int q = 0; q < channels; q++)
{
const float* ptr = bottom_blob.channel(q);
#if __riscv_zfh
__fp16* outptr = top_blob.channel(q);
#else
unsigned short* outptr = top_blob.channel(q);
#endif

int i = 0;
#if __riscv_zvfh
int n = size;
while (n > 0)
{
size_t vl = __riscv_vsetvl_e32m8(n);

vfloat32m8_t _p = __riscv_vle32_v_f32m8(ptr, vl);
vfloat16m4_t _outp = __riscv_vfncvt_f_f_w_f16m4(_p, vl);
__riscv_vse16_v_f16m4(outptr, _outp, vl);

ptr += vl;
outptr += vl;
n -= vl;
}
i = size; // the vector loop above already converted the whole channel
#endif // __riscv_zvfh
for (; i < size; i++)
{
#if __riscv_zfh
*outptr++ = (__fp16)(*ptr++);
#else
*outptr++ = float32_to_float16(*ptr++);
#endif
}
}
}

void Cast_riscv::cast_fp16_to_fp32(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
const int w = bottom_blob.w;
const int h = bottom_blob.h;
const int d = bottom_blob.d;
const int channels = bottom_blob.c;
const int elempack = bottom_blob.elempack;

const int size = w * h * d * elempack;

#pragma omp parallel for num_threads(opt.num_threads)
for (int q = 0; q < channels; q++)
{
#if __riscv_zfh
const __fp16* ptr = bottom_blob.channel(q);
#else
const unsigned short* ptr = bottom_blob.channel(q);
#endif
float* outptr = top_blob.channel(q);

int i = 0;
#if __riscv_zvfh
int n = size;
while (n > 0)
{
size_t vl = __riscv_vsetvl_e16m4(n);

vfloat16m4_t _p = __riscv_vle16_v_f16m4(ptr, vl);
vfloat32m8_t _outp = __riscv_vfwcvt_f_f_v_f32m8(_p, vl);
__riscv_vse32_v_f32m8(outptr, _outp, vl);

ptr += vl;
outptr += vl;
n -= vl;
}
i = size; // the vector loop above already converted the whole channel
#endif // __riscv_zvfh
for (; i < size; i++)
{
#if __riscv_zfh
*outptr++ = (float)(*ptr++);
#else
*outptr++ = float16_to_float32(*ptr++);
#endif
}
}
}

} // namespace ncnn
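
When neither __riscv_zfh nor __riscv_zvfh is available, the fallback loops above lean on ncnn's float32_to_float16() / float16_to_float32() bit-twiddling helpers. As a rough illustration of what such a conversion involves (a deliberately simplified sketch, not ncnn's implementation: it truncates the mantissa and flushes subnormals to zero rather than rounding):

#include <cstring>

// Simplified fp32 -> fp16 bit conversion, for illustration only.
static unsigned short fp32_to_fp16_bits(float f)
{
    unsigned int x;
    std::memcpy(&x, &f, sizeof(x));

    unsigned short sign = (unsigned short)((x >> 16) & 0x8000);
    unsigned int exp32 = (x >> 23) & 0xff;
    unsigned int mantissa = x & 0x7fffff;
    int exp16 = (int)exp32 - 127 + 15; // rebias the exponent

    if (exp32 == 0xff)
        return sign | 0x7c00 | (mantissa ? 0x200 : 0); // inf / nan
    if (exp16 >= 31)
        return sign | 0x7c00;                          // overflow -> inf
    if (exp16 <= 0)
        return sign;                                   // underflow -> signed zero
    return sign | (unsigned short)(exp16 << 10) | (unsigned short)(mantissa >> 13);
}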
2 changes: 1 addition & 1 deletion src/layer/riscv/convolution1d_riscv.cpp
@@ -46,7 +46,7 @@ int Convolution1D_riscv::create_pipeline(const Option& opt)
return 0;

#if NCNN_ZFH
if (opt.use_fp16_storage)
if (support_fp16_storage && opt.use_fp16_storage)
{
return create_pipeline_fp16s(opt);
}
2 changes: 1 addition & 1 deletion src/layer/riscv/convolution_riscv.cpp
@@ -120,7 +120,7 @@ int Convolution_riscv::create_pipeline(const Option& opt)
#endif

#if NCNN_ZFH
if (opt.use_fp16_storage)
if (support_fp16_storage && opt.use_fp16_storage)
{
return create_pipeline_fp16s(opt);
}
2 changes: 1 addition & 1 deletion src/layer/riscv/convolutiondepthwise_riscv.cpp
@@ -65,7 +65,7 @@ int ConvolutionDepthWise_riscv::create_pipeline(const Option& opt)
#endif

#if NCNN_ZFH
if (opt.use_fp16_storage)
if (support_fp16_storage && opt.use_fp16_storage)
{
return create_pipeline_fp16s(opt);
}
2 changes: 1 addition & 1 deletion src/layer/riscv/deconvolution_riscv.cpp
@@ -52,7 +52,7 @@ int Deconvolution_riscv::create_pipeline(const Option& opt)
return 0;

#if NCNN_ZFH
if (opt.use_fp16_storage)
if (support_fp16_storage && opt.use_fp16_storage)
{
return create_pipeline_fp16s(opt);
}
2 changes: 1 addition & 1 deletion src/layer/riscv/deconvolutiondepthwise_riscv.cpp
@@ -46,7 +46,7 @@ int DeconvolutionDepthWise_riscv::create_pipeline(const Option& opt)
return 0;

#if NCNN_ZFH
if (opt.use_fp16_storage)
if (support_fp16_storage && opt.use_fp16_storage)
{
return create_pipeline_fp16s(opt);
}
2 changes: 1 addition & 1 deletion src/layer/riscv/innerproduct_riscv.cpp
@@ -63,7 +63,7 @@ int InnerProduct_riscv::create_pipeline(const Option& opt)
#endif

#if NCNN_ZFH
if (opt.use_fp16_storage)
if (support_fp16_storage && opt.use_fp16_storage)
{
return create_pipeline_fp16s(opt);
}
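
The six create_pipeline() edits in this commit share one shape: the fp16 pipeline is only built when the layer instance itself reports fp16 storage support (a flag its constructor sets from runtime CPU detection) in addition to the user option. A generic sketch of that guard (names are placeholders, not any specific ncnn layer):

#include "option.h"

// Sketch of the shared guard shape after this commit.
struct SomeLayerRiscvSketch
{
    bool support_fp16_storage; // set from runtime CPU detection in the constructor

    int create_pipeline(const ncnn::Option& opt)
    {
#if NCNN_ZFH
        if (support_fp16_storage && opt.use_fp16_storage)
            return create_pipeline_fp16s(opt); // build the fp16 weights/pipeline
#endif
        return 0; // plain fp32 path
    }

    int create_pipeline_fp16s(const ncnn::Option& /*opt*/) { return 0; } // placeholder
};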
2 changes: 0 additions & 2 deletions src/layer/riscv/rvv_mathfun.h
@@ -373,8 +373,6 @@ _RVV_FLOAT32_POW_OP(8, 4)
_v = exp_ps(_v, vl); \
_v = __riscv_vfadd_vf_f32m##LMUL(_v, 1.f, vl); \
vfloat32m##LMUL##_t _reciprocal = __riscv_vfrdiv_vf_f32m##LMUL(_v, 1.f, vl); \
_reciprocal = __riscv_vfmul_vv_f32m##LMUL(__riscv_vfrsub_vf_f32m##LMUL(__riscv_vfmul_vv_f32m##LMUL(_v, _reciprocal, vl), 2.f, vl), _reciprocal, vl); \
/* _reciprocal = __riscv_vfmul_vv_f32m##LMUL(__riscv_vfrsub_vf_f32m##LMUL(__riscv_vfmul_vv_f32m##LMUL(_v, _reciprocal, vl), 2.f, vl), _reciprocal, vl); */ \
return _reciprocal; \
}
#else // __riscv_xtheadvector
2 changes: 0 additions & 2 deletions src/layer/riscv/rvv_mathfun_fp16s.h
@@ -373,8 +373,6 @@ _RVV_FLOAT16_POW_OP(8, 2)
_v = exp_ps(_v, vl); \
_v = __riscv_vfadd_vf_f16m##LMUL(_v, (__fp16)1.f, vl); \
vfloat16m##LMUL##_t _reciprocal = __riscv_vfrdiv_vf_f16m##LMUL(_v, (__fp16)1.f, vl); \
_reciprocal = __riscv_vfmul_vv_f16m##LMUL(__riscv_vfrsub_vf_f16m##LMUL(__riscv_vfmul_vv_f16m##LMUL(_v, _reciprocal, vl), (__fp16)2.f, vl), _reciprocal, vl); \
/* _reciprocal = __riscv_vfmul_vv_f16m##LMUL(__riscv_vfrsub_vf_f16m##LMUL(__riscv_vfmul_vv_f16m##LMUL(_v, _reciprocal, vl), (__fp16)2.f, vl), _reciprocal, vl); */ \
return _reciprocal; \
}
#else // __riscv_xtheadvector
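
The two lines deleted from each of rvv_mathfun.h and rvv_mathfun_fp16s.h were a Newton-Raphson refinement of the reciprocal inside the sigmoid approximation (one active statement plus a commented-out duplicate). Presumably they are redundant because __riscv_vfrdiv_vf_* performs a true reverse division rather than an estimate such as vfrec7, so the extra step r = r * (2 - v * r) adds nothing. A scalar illustration of that reasoning (an assumption about intent, not code from the commit):

// After an exact divide, one Newton-Raphson step for 1/v is a no-op up to rounding.
static float sigmoid_denominator_reciprocal(float v)
{
    float r = 1.0f / v;     // what vfrdiv computes: an exact reverse division
    r = r * (2.0f - v * r); // only useful after an *approximate* reciprocal (e.g. vfrec7)
    return r;
}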
12 changes: 6 additions & 6 deletions src/net.cpp
@@ -639,15 +639,15 @@ int NetPrivate::convert_layout(Mat& bottom_blob, const Layer* layer, const Optio
}
else
#endif // NCNN_VFPV4
#if NCNN_ZVFH
if (opt.use_fp16_storage && cpu_support_riscv_zvfh() && layer->support_fp16_storage)
#if NCNN_ZFH
if (opt.use_fp16_storage && (ncnn::cpu_support_riscv_zvfh() || (!ncnn::cpu_support_riscv_v() && ncnn::cpu_support_riscv_zfh())) && layer->support_fp16_storage)
{
Mat bottom_blob_fp16;
cast_float32_to_float16(bottom_blob, bottom_blob_fp16, opt);
bottom_blob = bottom_blob_fp16;
}
else
#endif // NCNN_ZVFH
#endif // NCNN_ZFH
#if NCNN_BF16
if (opt.use_bf16_storage && layer->support_bf16_storage)
{
@@ -767,15 +767,15 @@ int NetPrivate::convert_layout(Mat& bottom_blob, const Layer* layer, const Optio
}
else
#endif // NCNN_VFPV4
#if NCNN_ZVFH
if (opt.use_fp16_storage && cpu_support_riscv_zvfh() && !layer->support_fp16_storage)
#if NCNN_ZFH
if (opt.use_fp16_storage && (ncnn::cpu_support_riscv_zvfh() || (!ncnn::cpu_support_riscv_v() && ncnn::cpu_support_riscv_zfh())) && !layer->support_fp16_storage)
{
Mat bottom_blob_fp32;
cast_float16_to_float32(bottom_blob, bottom_blob_fp32, opt);
bottom_blob = bottom_blob_fp32;
}
else
#endif // NCNN_ZVFH
#endif // NCNN_ZFH
#if NCNN_BF16
if (opt.use_bf16_storage && !layer->support_bf16_storage)
{
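
The capability predicate gating the fp16 storage casts here (and in tests/testutil.cpp below) is repeated verbatim at every call site. Folded into a helper purely for readability, it reads roughly as follows (a sketch; no such helper exists in the commit):

#include "cpu.h"

// Runtime check used by the fp16 storage casts: vector fp16 (zvfh), or scalar
// fp16 (zfh) on a core that lacks the V extension entirely.
static bool runtime_fp16_storage_supported()
{
    return ncnn::cpu_support_riscv_zvfh()
           || (!ncnn::cpu_support_riscv_v() && ncnn::cpu_support_riscv_zfh());
}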
12 changes: 6 additions & 6 deletions tests/testutil.cpp
@@ -342,13 +342,13 @@ static int convert_to_optimal_layout(const ncnn::Mat& a, ncnn::Mat& a4, const nc
}
else
#endif // NCNN_VFPV4
#if NCNN_ZVFH
if (opt.use_fp16_storage && ncnn::cpu_support_riscv_zvfh() && op->support_fp16_storage && !(flag & TEST_LAYER_DISABLE_AUTO_INPUT_CASTING))
#if NCNN_ZFH
if (opt.use_fp16_storage && (ncnn::cpu_support_riscv_zvfh() || (!ncnn::cpu_support_riscv_v() && ncnn::cpu_support_riscv_zfh())) && op->support_fp16_storage && !(flag & TEST_LAYER_DISABLE_AUTO_INPUT_CASTING))
{
ncnn::cast_float32_to_float16(a, a4, opt);
}
else
#endif // NCNN_ZVFH
#endif // NCNN_ZFH
#if NCNN_BF16
if (opt.use_bf16_storage && op->support_bf16_storage && !(flag & TEST_LAYER_DISABLE_AUTO_INPUT_CASTING))
{
@@ -470,13 +470,13 @@ static int convert_to_vanilla_layout(const ncnn::Mat& c4, ncnn::Mat& c, const nc
}
else
#endif // NCNN_VFPV4
#if NCNN_ZVFH
if (opt.use_fp16_storage && ncnn::cpu_support_riscv_zvfh() && op->support_fp16_storage && c4_unpacked.elembits() == 16)
#if NCNN_ZFH
if (opt.use_fp16_storage && (ncnn::cpu_support_riscv_zvfh() || (!ncnn::cpu_support_riscv_v() && ncnn::cpu_support_riscv_zfh())) && op->support_fp16_storage && c4_unpacked.elembits() == 16)
{
ncnn::cast_float16_to_float32(c4_unpacked, c, opt);
}
else
#endif // NCNN_ZVFH
#endif // NCNN_ZFH
#if NCNN_BF16
if (opt.use_bf16_storage && op->support_bf16_storage && c4_unpacked.elembits() == 16)
{
4 changes: 2 additions & 2 deletions toolchains/c906-v301.toolchain.cmake
@@ -30,8 +30,8 @@ if(NOT CMAKE_FIND_ROOT_PATH_MODE_PACKAGE)
set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE ONLY)
endif()

set(CMAKE_C_FLAGS "-march=rv64gc_zfh_xtheadvector_xtheadc -mabi=lp64d -mtune=c906 -DC906=1 -D__riscv_zvfh=1 -static")
set(CMAKE_CXX_FLAGS "-march=rv64gc_zfh_xtheadvector_xtheadc -mabi=lp64d -mtune=c906 -DC906=1 -D__riscv_zvfh=1 -static")
set(CMAKE_C_FLAGS "-march=rv64gc_zfh_xtheadvector_xtheadc -mabi=lp64d -mtune=c906 -DC906=1 -static")
set(CMAKE_CXX_FLAGS "-march=rv64gc_zfh_xtheadvector_xtheadc -mabi=lp64d -mtune=c906 -DC906=1 -static")

# cache flags
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS}" CACHE STRING "c flags")
4 changes: 2 additions & 2 deletions toolchains/c910-v301.toolchain.cmake
@@ -30,8 +30,8 @@ if(NOT CMAKE_FIND_ROOT_PATH_MODE_PACKAGE)
set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE ONLY)
endif()

set(CMAKE_C_FLAGS "-march=rv64gc_zfh_xtheadvector_xtheadc -mabi=lp64d -mtune=c910 -DC906=1 -D__riscv_zvfh=1 -static")
set(CMAKE_CXX_FLAGS "-march=rv64gc_zfh_xtheadvector_xtheadc -mabi=lp64d -mtune=c910 -DC906=1 -D__riscv_zvfh=1 -static")
set(CMAKE_C_FLAGS "-march=rv64gc_zfh_xtheadvector_xtheadc -mabi=lp64d -mtune=c910 -DC906=1 -static")
set(CMAKE_CXX_FLAGS "-march=rv64gc_zfh_xtheadvector_xtheadc -mabi=lp64d -mtune=c910 -DC906=1 -static")

# cache flags
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS}" CACHE STRING "c flags")