From 9117c40707f29d66053bb8903e2dbaf9b414ebd3 Mon Sep 17 00:00:00 2001
From: nihuini
Date: Mon, 26 Aug 2024 15:25:17 +0800
Subject: [PATCH] riscv: split GRU fp16 kernels into gru_riscv_zvfh.cpp behind
 NCNN_ZVFH

---
 src/layer/riscv/gru_riscv.cpp          | 660 +-----------------------
 src/layer/riscv/gru_riscv.h            |   2 +-
 src/layer/riscv/gru_riscv_zvfh.cpp     | 661 +++++++++++++++++++++++++
 src/layer/riscv/instancenorm_riscv.cpp |   2 +-
 4 files changed, 672 insertions(+), 653 deletions(-)
 create mode 100644 src/layer/riscv/gru_riscv_zvfh.cpp

diff --git a/src/layer/riscv/gru_riscv.cpp b/src/layer/riscv/gru_riscv.cpp
index 6d6330c3eef..47f6dee9118 100644
--- a/src/layer/riscv/gru_riscv.cpp
+++ b/src/layer/riscv/gru_riscv.cpp
@@ -208,8 +208,8 @@ static int gru(const Mat& bottom_blob, Mat& top_blob, int reverse, const Mat& we
 
 GRU_riscv::GRU_riscv()
 {
-#if __riscv_vector && __riscv_zvfh
-    support_fp16_storage = true;
+#if NCNN_ZVFH
+    support_fp16_storage = cpu_support_riscv_zvfh();
 #endif
 }
 
@@ -223,7 +223,7 @@ int GRU_riscv::create_pipeline(const Option& opt)
     }
 #endif
 
-#if __riscv_vector && __riscv_zvfh
+#if NCNN_ZVFH
     if (opt.use_fp16_storage && opt.use_fp16_arithmetic)
         return create_pipeline_fp16sa(opt);
 #endif
@@ -240,10 +240,11 @@ int GRU_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt)
     }
 #endif
 
-    int elembits = bottom_blob.elembits();
 
 #if __riscv_vector
-#if __riscv_zvfh
+#if NCNN_ZVFH
+    int elembits = bottom_blob.elembits();
+
     if (opt.use_fp16_storage && elembits == 16)
     {
         if (opt.use_fp16_arithmetic)
@@ -322,10 +323,11 @@ int GRU_riscv::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& t
 #endif
 
     const Mat& bottom_blob = bottom_blobs[0];
-    int elembits = bottom_blob.elembits();
 
 #if __riscv_vector
-#if __riscv_zvfh
+#if NCNN_ZVFH
+    int elembits = bottom_blob.elembits();
+
     if (opt.use_fp16_storage && elembits == 16)
     {
         if (opt.use_fp16_arithmetic)
@@ -408,648 +410,4 @@ int GRU_riscv::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& t
     return GRU::forward(bottom_blobs, top_blobs, opt);
 }
 
-#if __riscv_vector && __riscv_zvfh
-static int gru_fp16s(const Mat& bottom_blob, Mat& top_blob, int reverse, const Mat& weight_xc, const Mat& bias_c, const Mat& weight_hc, Mat& hidden_state, const Option& opt)
-{
-    int size = bottom_blob.w;
-    int T = bottom_blob.h;
-
-    int num_output = top_blob.w;
-
-    // 2 x num_output
-    Mat gates(2, num_output, 4u, opt.workspace_allocator);
-    if (gates.empty())
-        return -100;
-
-    // unroll
-    for (int t = 0; t < T; t++)
-    {
-        int ti = reverse ? T - 1 - t : t;
-
-        const __fp16* x = bottom_blob.row<const __fp16>(ti);
-        #pragma omp parallel for num_threads(opt.num_threads)
-        for (int q = 0; q < num_output; q++)
-        {
-            float* gates_data = gates.row(q);
-
-            // gate reset update
-            const float* bias_c_R = bias_c.row(0);
-            const float* bias_c_U = bias_c.row(1);
-
-            const float* weight_xc_R = weight_xc.row(num_output * 0 + q);
-            const float* weight_xc_U = weight_xc.row(num_output * 1 + q);
-            const float* weight_hc_R = weight_hc.row(num_output * 0 + q);
-            const float* weight_hc_U = weight_hc.row(num_output * 1 + q);
-
-            float R = bias_c_R[q];
-            float U = bias_c_U[q];
-
-            int n = size;
-            const __fp16* ptr_x = x;
-            const float* ptr_xcr = weight_xc_R;
-            const float* ptr_xcu = weight_xc_U;
-            while (n > 0)
-            {
-                size_t vl = __riscv_vsetvl_e16m4(n);
-                vfloat32m8_t _x = __riscv_vfwcvt_f_f_v_f32m8(__riscv_vle16_v_f16m4(ptr_x, vl), vl);
-                vfloat32m8_t _xcr = __riscv_vle32_v_f32m8(ptr_xcr, vl);
-                vfloat32m8_t _xcu = __riscv_vle32_v_f32m8(ptr_xcu, vl);
-                vfloat32m1_t _scalar_r = __riscv_vfmv_s_f_f32m1(R, vl);
-                vfloat32m1_t _scalar_u = __riscv_vfmv_s_f_f32m1(U, vl);
-
-                _xcr = __riscv_vfmul_vv_f32m8(_xcr, _x, vl);
-                _xcu = __riscv_vfmul_vv_f32m8(_xcu, _x, vl);
-                _scalar_r = __riscv_vfredusum_vs_f32m8_f32m1(_xcr, _scalar_r, vl);
-                _scalar_u = __riscv_vfredusum_vs_f32m8_f32m1(_xcu, _scalar_u, vl);
-
-                R = __riscv_vfmv_f_s_f32m1_f32(_scalar_r);
-                U = __riscv_vfmv_f_s_f32m1_f32(_scalar_u);
-
-                ptr_x += vl;
-                ptr_xcr += vl;
-                ptr_xcu += vl;
-                n -= vl;
-            }
-            ptr_x = NULL;
-            ptr_xcr = NULL;
-            ptr_xcu = NULL;
-
-            int n_out = num_output;
-            const float* ptr_hc = hidden_state;
-            const float* ptr_hcr = weight_hc_R;
-            const float* ptr_hcu = weight_hc_U;
-            while (n_out > 0)
-            {
-                size_t vl = __riscv_vsetvl_e16m4(n_out);
-                vfloat32m8_t _h_cont = __riscv_vle32_v_f32m8(ptr_hc, vl);
-                vfloat32m8_t _hcr = __riscv_vle32_v_f32m8(ptr_hcr, vl);
-                vfloat32m8_t _hcu = __riscv_vle32_v_f32m8(ptr_hcu, vl);
-                vfloat32m1_t _scalar_r = __riscv_vfmv_s_f_f32m1(R, vl);
-                vfloat32m1_t _scalar_u = __riscv_vfmv_s_f_f32m1(U, vl);
-
-                _hcr = __riscv_vfmul_vv_f32m8(_hcr, _h_cont, vl);
-                _hcu = __riscv_vfmul_vv_f32m8(_hcu, _h_cont, vl);
-                _scalar_r = __riscv_vfredusum_vs_f32m8_f32m1(_hcr, _scalar_r, vl);
-                _scalar_u = __riscv_vfredusum_vs_f32m8_f32m1(_hcu, _scalar_u, vl);
-
-                R = __riscv_vfmv_f_s_f32m1_f32(_scalar_r);
-                U = __riscv_vfmv_f_s_f32m1_f32(_scalar_u);
-
-                ptr_hc += vl;
-                ptr_hcr += vl;
-                ptr_hcu += vl;
-                n_out -= vl;
-            }
-            ptr_hc = NULL;
-            ptr_hcr = NULL;
-            ptr_hcu = NULL;
-
-            // sigmoid(R)
-            // sigmoid(U)
-            R = 1.f / (1.f + exp(-R));
-            U = 1.f / (1.f + exp(-U));
-
-            // gate new
-            const float* bias_c_WN = bias_c.row(2);
-            const float* bias_c_BN = bias_c.row(3);
-
-            const float* weight_xc_N = weight_xc.row(num_output * 2 + q);
-            const float* weight_hc_N = weight_hc.row(num_output * 2 + q);
-
-            float N = bias_c_BN[q];
-
-            int n_out2 = num_output;
-            const float* ptr_hc2 = hidden_state;
-            const float* ptr_whc_n = weight_hc_N;
-            while (n_out2 > 0)
-            {
-                size_t vl = __riscv_vsetvl_e16m4(n_out2);
-
-                vfloat32m8_t _h_cont = __riscv_vle32_v_f32m8(ptr_hc2, vl);
-                vfloat32m8_t _whc_n = __riscv_vle32_v_f32m8(ptr_whc_n, vl);
-                vfloat32m1_t _scalar_n = __riscv_vfmv_s_f_f32m1(N, vl);
-
-                _h_cont = __riscv_vfmul_vv_f32m8(_whc_n, _h_cont, vl);
-                _scalar_n = __riscv_vfredusum_vs_f32m8_f32m1(_h_cont, _scalar_n, vl);
-
-                N = __riscv_vfmv_f_s_f32m1_f32(_scalar_n);
-                n_out2 -= vl;
-                ptr_hc2 += vl;
-                ptr_whc_n += vl;
-            }
-            ptr_hc2 = NULL;
-            ptr_whc_n = NULL;
-
-            N = bias_c_WN[q] + R * N;
-
-            int n2 = size;
-            const __fp16* ptr_x2 = x;
-            const float* ptr_xcn = weight_xc_N;
-            while (n2 > 0)
-            {
-                size_t vl = __riscv_vsetvl_e16m4(n2);
-
-                vfloat32m8_t _x = __riscv_vfwcvt_f_f_v_f32m8(__riscv_vle16_v_f16m4(ptr_x2, vl), vl);
-                vfloat32m8_t _xcn = __riscv_vle32_v_f32m8(ptr_xcn, vl);
-                vfloat32m1_t _scalar_n = __riscv_vfmv_s_f_f32m1(N, vl);
-
-                _xcn = __riscv_vfmul_vv_f32m8(_x, _xcn, vl);
-                _scalar_n = __riscv_vfredusum_vs_f32m8_f32m1(_xcn, _scalar_n, vl);
-                N = __riscv_vfmv_f_s_f32m1_f32(_scalar_n);
-
-                n2 -= vl;
-                ptr_x2 += vl;
-                ptr_xcn += vl;
-            }
-            ptr_x2 = NULL;
-            ptr_xcn = NULL;
-
-            // tanh(N)
-            N = tanh(N);
-
-            gates_data[0] = U;
-            gates_data[1] = N;
-        }
-
-        // h_t := (1 - update) .* new + update .* h_{t-1}
-        __fp16* output_data = top_blob.row<__fp16>(ti);
-        #pragma omp parallel for num_threads(opt.num_threads)
-        for (int q = 0; q < num_output; q++)
-        {
-            const float* gates_data = gates.row(q);
-
-            float U = gates_data[0];
-            float N = gates_data[1];
-
-            float H = (1 - U) * N + U * hidden_state[q];
-
-            hidden_state[q] = H;
-            output_data[q] = (__fp16)H;
-        }
-    }
-
-    return 0;
-}
-
-int GRU_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
-{
-    int T = bottom_blob.h;
-
-    int num_directions = direction == 2 ? 2 : 1;
-    // initial hidden state
-    Mat hidden(num_output, 4u, opt.workspace_allocator);
-    if (hidden.empty())
-        return -100;
-    hidden.fill(0.f);
-
-    top_blob.create(num_output * num_directions, T, 2u, opt.blob_allocator);
-    if (top_blob.empty())
-        return -100;
-
-    // Uni directional
-    if (direction == 0 || direction == 1)
-    {
-        int ret = gru_fp16s(bottom_blob, top_blob, direction, weight_xc_data.channel(0), bias_c_data.channel(0), weight_hc_data.channel(0), hidden, opt);
-        if (ret != 0)
-            return ret;
-    }
-
-    if (direction == 2)
-    {
-        Mat top_blob_forward(num_output, T, 2u, opt.workspace_allocator);
-        if (top_blob_forward.empty())
-            return -100;
-
-        Mat top_blob_reverse(num_output, T, 2u, opt.workspace_allocator);
-        if (top_blob_reverse.empty())
-            return -100;
-
-        int ret0 = gru_fp16s(bottom_blob, top_blob_forward, 0, weight_xc_data.channel(0), bias_c_data.channel(0), weight_hc_data.channel(0), hidden, opt);
-        if (ret0 != 0)
-            return ret0;
-
-        hidden.fill(0.0f);
-
-        int ret1 = gru_fp16s(bottom_blob, top_blob_reverse, 1, weight_xc_data.channel(1), bias_c_data.channel(1), weight_hc_data.channel(1), hidden, opt);
-        if (ret1 != 0)
-            return ret1;
-
-        // concat w
-        for (int i = 0; i < T; i++)
-        {
-            const __fp16* pf = top_blob_forward.row<const __fp16>(i);
-            const __fp16* pr = top_blob_reverse.row<const __fp16>(i);
-            __fp16* ptr = top_blob.row<__fp16>(i);
-
-            memcpy(ptr, pf, num_output * sizeof(__fp16));
-            memcpy(ptr + num_output, pr, num_output * sizeof(__fp16));
-        }
-    }
-
-    return 0;
-}
-
-int GRU_riscv::forward_fp16s(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
-{
-    const Mat& bottom_blob = bottom_blobs[0];
-    int T = bottom_blob.h;
-    int num_directions = direction == 2 ? 2 : 1;
-
-    Mat hidden;
-    Allocator* hidden_allocator = top_blobs.size() == 2 ? opt.blob_allocator : opt.workspace_allocator;
-    if (bottom_blobs.size() == 2)
-    {
-        Option opt_cast = opt;
-        opt_cast.blob_allocator = hidden_allocator;
-        cast_float16_to_float32(bottom_blobs[1], hidden, opt_cast);
-    }
-    else
-    {
-        hidden.create(num_output, num_directions, 4u, hidden_allocator);
-        if (hidden.empty())
-            return -100;
-        hidden.fill(0.f);
-    }
-
-    Mat& top_blob = top_blobs[0];
-    top_blob.create(num_output * num_directions, T, 2u, opt.blob_allocator);
-    if (top_blob.empty())
-        return -100;
-
-    // Uni directional
-    if (direction == 0 || direction == 1)
-    {
-        Mat hidden0 = hidden.row_range(0, 1);
-        int ret = gru_fp16s(bottom_blob, top_blob, direction, weight_xc_data.channel(0), bias_c_data.channel(0), weight_hc_data.channel(0), hidden0, opt);
-        if (ret != 0)
-            return ret;
-    }
-
-    if (direction == 2)
-    {
-        Mat top_blob_forward(num_output, T, 2u, opt.workspace_allocator);
-        if (top_blob_forward.empty())
-            return -100;
-
-        Mat top_blob_reverse(num_output, T, 2u, opt.workspace_allocator);
-        if (top_blob_reverse.empty())
-            return -100;
-
-        Mat hidden0 = hidden.row_range(0, 1);
-        int ret0 = gru_fp16s(bottom_blob, top_blob_forward, 0, weight_xc_data.channel(0), bias_c_data.channel(0), weight_hc_data.channel(0), hidden0, opt);
-        if (ret0 != 0)
-            return ret0;
-
-        Mat hidden1 = hidden.row_range(1, 1);
-        int ret1 = gru_fp16s(bottom_blob, top_blob_reverse, 1, weight_xc_data.channel(1), bias_c_data.channel(1), weight_hc_data.channel(1), hidden1, opt);
-        if (ret1 != 0)
-            return ret1;
-
-        // concat w
-        for (int i = 0; i < T; i++)
-        {
-            const __fp16* pf = top_blob_forward.row<const __fp16>(i);
-            const __fp16* pr = top_blob_reverse.row<const __fp16>(i);
-            __fp16* ptr = top_blob.row<__fp16>(i);
-
-            memcpy(ptr, pf, num_output * sizeof(__fp16));
-            memcpy(ptr + num_output, pr, num_output * sizeof(__fp16));
-        }
-    }
-
-    if (top_blobs.size() == 2)
-    {
-        cast_float32_to_float16(hidden, top_blobs[1], opt);
-    }
-
-    return 0;
-}
-
-#endif
-
-//fp16sa start at here
-#if __riscv_vector && __riscv_zvfh
-
-int GRU_riscv::create_pipeline_fp16sa(const Option& opt)
-{
-    cast_float32_to_float16(weight_xc_data, weight_xc_data_fp16sa, opt);
-    cast_float32_to_float16(weight_hc_data, weight_hc_data_fp16sa, opt);
-    cast_float32_to_float16(bias_c_data, bias_c_data_fp16sa, opt);
-
-    if (opt.lightmode)
-    {
-        weight_xc_data.release();
-        bias_c_data.release();
-        weight_hc_data.release();
-    }
-
-    return 0;
-}
-
-static int gru_fp16sa(const Mat& bottom_blob, Mat& top_blob, int reverse, const Mat& weight_xc, const Mat& bias_c, const Mat& weight_hc, Mat& hidden_state, const Option& opt)
-{
-    int size = bottom_blob.w;
-    int T = bottom_blob.h;
-
-    int num_output = top_blob.w;
-
-    // 2 x num_output
-    Mat gates(2, num_output, 4u, opt.workspace_allocator);
-    if (gates.empty())
-        return -100;
-
-    // unroll
-    for (int t = 0; t < T; t++)
-    {
-        int ti = reverse ? T - 1 - t : t;
-
-        const __fp16* x = bottom_blob.row<const __fp16>(ti);
-        #pragma omp parallel for num_threads(opt.num_threads)
-        for (int q = 0; q < num_output; q++)
-        {
-            float* gates_data = gates.row(q);
-
-            // gate reset update
-            const __fp16* bias_c_R = bias_c.row<const __fp16>(0);
-            const __fp16* bias_c_U = bias_c.row<const __fp16>(1);
-
-            const __fp16* weight_xc_R = weight_xc.row<const __fp16>(num_output * 0 + q);
-            const __fp16* weight_xc_U = weight_xc.row<const __fp16>(num_output * 1 + q);
-            const __fp16* weight_hc_R = weight_hc.row<const __fp16>(num_output * 0 + q);
-            const __fp16* weight_hc_U = weight_hc.row<const __fp16>(num_output * 1 + q);
-
-            __fp16 R = bias_c_R[q];
-            __fp16 U = bias_c_U[q];
-
-            int n = size;
-            const __fp16* ptr_x = x;
-            const __fp16* ptr_xcr = weight_xc_R;
-            const __fp16* ptr_xcu = weight_xc_U;
-            while (n > 0)
-            {
-                size_t vl = __riscv_vsetvl_e16m8(n);
-                vfloat16m8_t _x = __riscv_vle16_v_f16m8(ptr_x, vl);
-                vfloat16m8_t _xcr = __riscv_vle16_v_f16m8(ptr_xcr, vl);
-                vfloat16m8_t _xcu = __riscv_vle16_v_f16m8(ptr_xcu, vl);
-                vfloat16m1_t _scalar_r = __riscv_vfmv_s_f_f16m1(R, vl);
-                vfloat16m1_t _scalar_u = __riscv_vfmv_s_f_f16m1(U, vl);
-
-                _xcr = __riscv_vfmul_vv_f16m8(_xcr, _x, vl);
-                _xcu = __riscv_vfmul_vv_f16m8(_xcu, _x, vl);
-                _scalar_r = __riscv_vfredusum_vs_f16m8_f16m1(_xcr, _scalar_r, vl);
-                _scalar_u = __riscv_vfredusum_vs_f16m8_f16m1(_xcu, _scalar_u, vl);
-
-                R = __riscv_vfmv_f_s_f16m1_f16(_scalar_r);
-                U = __riscv_vfmv_f_s_f16m1_f16(_scalar_u);
-
-                ptr_x += vl;
-                ptr_xcr += vl;
-                ptr_xcu += vl;
-                n -= vl;
-            }
-
-            int n_out = num_output;
-            const float* ptr_hc = hidden_state;
-            const __fp16* ptr_hcr = weight_hc_R;
-            const __fp16* ptr_hcu = weight_hc_U;
-            while (n_out > 0)
-            {
-                size_t vl = __riscv_vsetvl_e16m4(n_out);
-                vfloat16m4_t _h_cont = __riscv_vfncvt_f_f_w_f16m4(__riscv_vle32_v_f32m8(ptr_hc, vl), vl);
-                vfloat16m4_t _hcr = __riscv_vle16_v_f16m4(ptr_hcr, vl);
-                vfloat16m4_t _hcu = __riscv_vle16_v_f16m4(ptr_hcu, vl);
-                vfloat16m1_t _scalar_r = __riscv_vfmv_s_f_f16m1(R, vl);
-                vfloat16m1_t _scalar_u = __riscv_vfmv_s_f_f16m1(U, vl);
-
-                _hcr = __riscv_vfmul_vv_f16m4(_hcr, _h_cont, vl);
-                _hcu = __riscv_vfmul_vv_f16m4(_hcu, _h_cont, vl);
-                _scalar_r = __riscv_vfredusum_vs_f16m4_f16m1(_hcr, _scalar_r, vl);
-                _scalar_u = __riscv_vfredusum_vs_f16m4_f16m1(_hcu, _scalar_u, vl);
-
-                R = __riscv_vfmv_f_s_f16m1_f16(_scalar_r);
-                U = __riscv_vfmv_f_s_f16m1_f16(_scalar_u);
-
-                ptr_hc += vl;
-                ptr_hcr += vl;
-                ptr_hcu += vl;
-                n_out -= vl;
-            }
-
-            // sigmoid(R)
-            // sigmoid(U)
-            R = 1.f / (1.f + (__fp16)exp((float)(-R)));
-            U = 1.f / (1.f + (__fp16)exp((float)(-U)));
-
-            // gate new
-            const __fp16* bias_c_WN = bias_c.row<const __fp16>(2);
-            const __fp16* bias_c_BN = bias_c.row<const __fp16>(3);
-
-            const __fp16* weight_xc_N = weight_xc.row<const __fp16>(num_output * 2 + q);
-            const __fp16* weight_hc_N = weight_hc.row<const __fp16>(num_output * 2 + q);
-
-            __fp16 N = bias_c_BN[q];
-
-            int n_out2 = num_output;
-            const float* ptr_hc2 = hidden_state;
-            const __fp16* ptr_whc_n = weight_hc_N;
-            while (n_out2 > 0)
-            {
-                size_t vl = __riscv_vsetvl_e16m4(n_out2);
-
-                vfloat16m4_t _h_cont = __riscv_vfncvt_f_f_w_f16m4(__riscv_vle32_v_f32m8(ptr_hc2, vl), vl);
-                vfloat16m4_t _whc_n = __riscv_vle16_v_f16m4(ptr_whc_n, vl);
-                vfloat16m1_t _scalar_n = __riscv_vfmv_s_f_f16m1(N, vl);
-
-                _h_cont = __riscv_vfmul_vv_f16m4(_whc_n, _h_cont, vl);
-                _scalar_n = __riscv_vfredusum_vs_f16m4_f16m1(_h_cont, _scalar_n, vl);
-
-                N = __riscv_vfmv_f_s_f16m1_f16(_scalar_n);
-                n_out2 -= vl;
-                ptr_hc2 += vl;
-                ptr_whc_n += vl;
-            }
-            N = bias_c_WN[q] + R * N;
-
-            int n2 = size;
-            const __fp16* ptr_x2 = x;
-            const __fp16* ptr_xcn = weight_xc_N;
-            while (n2 > 0)
-            {
-                size_t vl = __riscv_vsetvl_e16m8(n2);
-
-                vfloat16m8_t _x = __riscv_vle16_v_f16m8(ptr_x2, vl);
-                vfloat16m8_t _xcn = __riscv_vle16_v_f16m8(ptr_xcn, vl);
-                vfloat16m1_t _scalar_n = __riscv_vfmv_s_f_f16m1(N, vl);
-
-                _xcn = __riscv_vfmul_vv_f16m8(_x, _xcn, vl);
-                _scalar_n = __riscv_vfredusum_vs_f16m8_f16m1(_xcn, _scalar_n, vl);
-                N = __riscv_vfmv_f_s_f16m1_f16(_scalar_n);
-
-                n2 -= vl;
-                ptr_x2 += vl;
-                ptr_xcn += vl;
-            }
-
-            // tanh(N)
-            N = (__fp16)tanh((float)N);
-
-            gates_data[0] = U;
-            gates_data[1] = N;
-        }
-
-        // h_t := (1 - update) .* new + update .* h_{t-1}
-        __fp16* output_data = top_blob.row<__fp16>(ti);
-        #pragma omp parallel for num_threads(opt.num_threads)
-        for (int q = 0; q < num_output; q++)
-        {
-            const float* gates_data = gates.row(q);
-
-            float U = gates_data[0];
-            float N = gates_data[1];
-
-            float H = (1 - U) * N + U * hidden_state[q];
-
-            hidden_state[q] = H;
-            output_data[q] = H;
-        }
-    }
-
-    return 0;
-}
-
-int GRU_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
-{
-    int T = bottom_blob.h;
-
-    int num_directions = direction == 2 ? 2 : 1;
-    // initial hidden state
-    Mat hidden(num_output, 4u, opt.workspace_allocator);
-    if (hidden.empty())
-        return -100;
-    hidden.fill(0.f);
-
-    top_blob.create(num_output * num_directions, T, 2u, opt.blob_allocator);
-    if (top_blob.empty())
-        return -100;
-
-    // Uni directional
-    if (direction == 0 || direction == 1)
-    {
-        int ret = gru_fp16sa(bottom_blob, top_blob, direction, weight_xc_data_fp16sa.channel(0), bias_c_data_fp16sa.channel(0), weight_hc_data_fp16sa.channel(0), hidden, opt);
-        if (ret != 0)
-            return ret;
-    }
-
-    if (direction == 2)
-    {
-        Mat top_blob_forward(num_output, T, 2u, opt.workspace_allocator);
-        if (top_blob_forward.empty())
-            return -100;
-
-        Mat top_blob_reverse(num_output, T, 2u, opt.workspace_allocator);
-        if (top_blob_reverse.empty())
-            return -100;
-
-        int ret0 = gru_fp16sa(bottom_blob, top_blob_forward, 0, weight_xc_data_fp16sa.channel(0), bias_c_data_fp16sa.channel(0), weight_hc_data_fp16sa.channel(0), hidden, opt);
-        if (ret0 != 0)
-            return ret0;
-
-        hidden.fill(0.0f);
-
-        int ret1 = gru_fp16sa(bottom_blob, top_blob_reverse, 1, weight_xc_data_fp16sa.channel(1), bias_c_data_fp16sa.channel(1), weight_hc_data_fp16sa.channel(1), hidden, opt);
-        if (ret1 != 0)
-            return ret1;
-
-        // concat w
-        for (int i = 0; i < T; i++)
-        {
-            const __fp16* pf = top_blob_forward.row<const __fp16>(i);
-            const __fp16* pr = top_blob_reverse.row<const __fp16>(i);
-            __fp16* ptr = top_blob.row<__fp16>(i);
-
-            memcpy(ptr, pf, num_output * sizeof(__fp16));
-            memcpy(ptr + num_output, pr, num_output * sizeof(__fp16));
-        }
-    }
-
-    return 0;
-}
-
-int GRU_riscv::forward_fp16sa(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
-{
-    const Mat& bottom_blob = bottom_blobs[0];
-    int T = bottom_blob.h;
-    int num_directions = direction == 2 ? 2 : 1;
-
-    Mat hidden;
-    Allocator* hidden_allocator = top_blobs.size() == 2 ? opt.blob_allocator : opt.workspace_allocator;
-    if (bottom_blobs.size() == 2)
-    {
-        Option opt_cast = opt;
-        opt_cast.blob_allocator = hidden_allocator;
-        cast_float16_to_float32(bottom_blobs[1], hidden, opt_cast);
-    }
-    else
-    {
-        hidden.create(num_output, num_directions, 4u, hidden_allocator);
-        if (hidden.empty())
-            return -100;
-        hidden.fill(0.f);
-    }
-
-    Mat& top_blob = top_blobs[0];
-    top_blob.create(num_output * num_directions, T, 2u, opt.blob_allocator);
-    if (top_blob.empty())
-        return -100;
-
-    // Uni directional
-    if (direction == 0 || direction == 1)
-    {
-        int ret = gru_fp16sa(bottom_blob, top_blob, direction, weight_xc_data_fp16sa.channel(0), bias_c_data_fp16sa.channel(0), weight_hc_data_fp16sa.channel(0), hidden, opt);
-        if (ret != 0)
-            return ret;
-    }
-
-    if (direction == 2)
-    {
-        Mat top_blob_forward(num_output, T, 2u, opt.workspace_allocator);
-        if (top_blob_forward.empty())
-            return -100;
-
-        Mat top_blob_reverse(num_output, T, 2u, opt.workspace_allocator);
-        if (top_blob_reverse.empty())
-            return -100;
-
-        Mat hidden0 = hidden.row_range(0, 1);
-        int ret0 = gru_fp16sa(bottom_blob, top_blob_forward, 0, weight_xc_data_fp16sa.channel(0), bias_c_data_fp16sa.channel(0), weight_hc_data_fp16sa.channel(0), hidden0, opt);
-        if (ret0 != 0)
-            return ret0;
-
-        Mat hidden1 = hidden.row_range(1, 1);
-        int ret1 = gru_fp16sa(bottom_blob, top_blob_reverse, 1, weight_xc_data_fp16sa.channel(1), bias_c_data_fp16sa.channel(1), weight_hc_data_fp16sa.channel(1), hidden1, opt);
-        if (ret1 != 0)
-            return ret1;
-
-        // concat w
-        for (int i = 0; i < T; i++)
-        {
-            const __fp16* pf = top_blob_forward.row<const __fp16>(i);
-            const __fp16* pr = top_blob_reverse.row<const __fp16>(i);
-            __fp16* ptr = top_blob.row<__fp16>(i);
-
-            memcpy(ptr, pf, num_output * sizeof(__fp16));
-            memcpy(ptr + num_output, pr, num_output * sizeof(__fp16));
-        }
-    }
-
-    if (top_blobs.size() == 2)
-    {
-        cast_float32_to_float16(hidden, top_blobs[1], opt);
-    }
-
-    return 0;
-}
-
-#endif
-
 } // namespace ncnn
diff --git a/src/layer/riscv/gru_riscv.h b/src/layer/riscv/gru_riscv.h
index 3d5532e198d..32d75d83d58 100644
--- a/src/layer/riscv/gru_riscv.h
+++ b/src/layer/riscv/gru_riscv.h
@@ -29,7 +29,7 @@ class GRU_riscv : public GRU
     virtual int create_pipeline(const Option& opt);
 
 protected:
-#if __riscv_vector && __riscv_zvfh
+#if NCNN_ZVFH
     int forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
     int forward_fp16s(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
     int forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
diff --git a/src/layer/riscv/gru_riscv_zvfh.cpp b/src/layer/riscv/gru_riscv_zvfh.cpp
new file mode 100644
index 00000000000..d714d690e88
--- /dev/null
+++ b/src/layer/riscv/gru_riscv_zvfh.cpp
@@ -0,0 +1,661 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "gru_riscv.h"
+
+#if __riscv_vector
+#include <riscv_vector.h>
+#endif // __riscv_vector
+
+namespace ncnn {
+
+#if __riscv_zvfh
+static int gru_fp16s(const Mat& bottom_blob, Mat& top_blob, int reverse, const Mat& weight_xc, const Mat& bias_c, const Mat& weight_hc, Mat& hidden_state, const Option& opt)
+{
+    int size = bottom_blob.w;
+    int T = bottom_blob.h;
+
+    int num_output = top_blob.w;
+
+    // 2 x num_output
+    Mat gates(2, num_output, 4u, opt.workspace_allocator);
+    if (gates.empty())
+        return -100;
+
+    // unroll
+    for (int t = 0; t < T; t++)
+    {
+        int ti = reverse ? T - 1 - t : t;
+
+        const __fp16* x = bottom_blob.row<const __fp16>(ti);
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int q = 0; q < num_output; q++)
+        {
+            float* gates_data = gates.row(q);
+
+            // gate reset update
+            const float* bias_c_R = bias_c.row(0);
+            const float* bias_c_U = bias_c.row(1);
+
+            const float* weight_xc_R = weight_xc.row(num_output * 0 + q);
+            const float* weight_xc_U = weight_xc.row(num_output * 1 + q);
+            const float* weight_hc_R = weight_hc.row(num_output * 0 + q);
+            const float* weight_hc_U = weight_hc.row(num_output * 1 + q);
+
+            float R = bias_c_R[q];
+            float U = bias_c_U[q];
+
+            int n = size;
+            const __fp16* ptr_x = x;
+            const float* ptr_xcr = weight_xc_R;
+            const float* ptr_xcu = weight_xc_U;
+            while (n > 0)
+            {
+                size_t vl = __riscv_vsetvl_e16m4(n);
+                vfloat32m8_t _x = __riscv_vfwcvt_f_f_v_f32m8(__riscv_vle16_v_f16m4(ptr_x, vl), vl);
+                vfloat32m8_t _xcr = __riscv_vle32_v_f32m8(ptr_xcr, vl);
+                vfloat32m8_t _xcu = __riscv_vle32_v_f32m8(ptr_xcu, vl);
+                vfloat32m1_t _scalar_r = __riscv_vfmv_s_f_f32m1(R, vl);
+                vfloat32m1_t _scalar_u = __riscv_vfmv_s_f_f32m1(U, vl);
+
+                _xcr = __riscv_vfmul_vv_f32m8(_xcr, _x, vl);
+                _xcu = __riscv_vfmul_vv_f32m8(_xcu, _x, vl);
+                _scalar_r = __riscv_vfredusum_vs_f32m8_f32m1(_xcr, _scalar_r, vl);
+                _scalar_u = __riscv_vfredusum_vs_f32m8_f32m1(_xcu, _scalar_u, vl);
+
+                R = __riscv_vfmv_f_s_f32m1_f32(_scalar_r);
+                U = __riscv_vfmv_f_s_f32m1_f32(_scalar_u);
+
+                ptr_x += vl;
+                ptr_xcr += vl;
+                ptr_xcu += vl;
+                n -= vl;
+            }
+            ptr_x = NULL;
+            ptr_xcr = NULL;
+            ptr_xcu = NULL;
+
+            int n_out = num_output;
+            const float* ptr_hc = hidden_state;
+            const float* ptr_hcr = weight_hc_R;
+            const float* ptr_hcu = weight_hc_U;
+            while (n_out > 0)
+            {
+                size_t vl = __riscv_vsetvl_e16m4(n_out);
+                vfloat32m8_t _h_cont = __riscv_vle32_v_f32m8(ptr_hc, vl);
+                vfloat32m8_t _hcr = __riscv_vle32_v_f32m8(ptr_hcr, vl);
+                vfloat32m8_t _hcu = __riscv_vle32_v_f32m8(ptr_hcu, vl);
+                vfloat32m1_t _scalar_r = __riscv_vfmv_s_f_f32m1(R, vl);
+                vfloat32m1_t _scalar_u = __riscv_vfmv_s_f_f32m1(U, vl);
+
+                _hcr = __riscv_vfmul_vv_f32m8(_hcr, _h_cont, vl);
+                _hcu = __riscv_vfmul_vv_f32m8(_hcu, _h_cont, vl);
+                _scalar_r = __riscv_vfredusum_vs_f32m8_f32m1(_hcr, _scalar_r, vl);
+                _scalar_u = __riscv_vfredusum_vs_f32m8_f32m1(_hcu, _scalar_u, vl);
+
+                R = __riscv_vfmv_f_s_f32m1_f32(_scalar_r);
+                U = __riscv_vfmv_f_s_f32m1_f32(_scalar_u);
+
+                ptr_hc += vl;
+                ptr_hcr += vl;
+                ptr_hcu += vl;
+                n_out -= vl;
+            }
+            ptr_hc = NULL;
+            ptr_hcr = NULL;
+            ptr_hcu = NULL;
+
+            // sigmoid(R)
+            // sigmoid(U)
+            R = 1.f / (1.f + exp(-R));
+            U = 1.f / (1.f + exp(-U));
+
+            // gate new
+            const float* bias_c_WN = bias_c.row(2);
+            const float* bias_c_BN = bias_c.row(3);
+
+            const float* weight_xc_N = weight_xc.row(num_output * 2 + q);
+            const float* weight_hc_N = weight_hc.row(num_output * 2 + q);
+
+            float N = bias_c_BN[q];
+
+            int n_out2 = num_output;
+            const float* ptr_hc2 = hidden_state;
+            const float* ptr_whc_n = weight_hc_N;
+            while (n_out2 > 0)
+            {
+                size_t vl = __riscv_vsetvl_e16m4(n_out2);
+
+                vfloat32m8_t _h_cont = __riscv_vle32_v_f32m8(ptr_hc2, vl);
+                vfloat32m8_t _whc_n = __riscv_vle32_v_f32m8(ptr_whc_n, vl);
+                vfloat32m1_t _scalar_n = __riscv_vfmv_s_f_f32m1(N, vl);
+
+                _h_cont = __riscv_vfmul_vv_f32m8(_whc_n, _h_cont, vl);
+                _scalar_n = __riscv_vfredusum_vs_f32m8_f32m1(_h_cont, _scalar_n, vl);
+
+                N = __riscv_vfmv_f_s_f32m1_f32(_scalar_n);
+                n_out2 -= vl;
+                ptr_hc2 += vl;
+                ptr_whc_n += vl;
+            }
+            ptr_hc2 = NULL;
+            ptr_whc_n = NULL;
+
+            N = bias_c_WN[q] + R * N;
+
+            int n2 = size;
+            const __fp16* ptr_x2 = x;
+            const float* ptr_xcn = weight_xc_N;
+            while (n2 > 0)
+            {
+                size_t vl = __riscv_vsetvl_e16m4(n2);
+
+                vfloat32m8_t _x = __riscv_vfwcvt_f_f_v_f32m8(__riscv_vle16_v_f16m4(ptr_x2, vl), vl);
+                vfloat32m8_t _xcn = __riscv_vle32_v_f32m8(ptr_xcn, vl);
+                vfloat32m1_t _scalar_n = __riscv_vfmv_s_f_f32m1(N, vl);
+
+                _xcn = __riscv_vfmul_vv_f32m8(_x, _xcn, vl);
+                _scalar_n = __riscv_vfredusum_vs_f32m8_f32m1(_xcn, _scalar_n, vl);
+                N = __riscv_vfmv_f_s_f32m1_f32(_scalar_n);
+
+                n2 -= vl;
+                ptr_x2 += vl;
+                ptr_xcn += vl;
+            }
+            ptr_x2 = NULL;
+            ptr_xcn = NULL;
+
+            // tanh(N)
+            N = tanh(N);
+
+            gates_data[0] = U;
+            gates_data[1] = N;
+        }
+
+        // h_t := (1 - update) .* new + update .* h_{t-1}
+        __fp16* output_data = top_blob.row<__fp16>(ti);
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int q = 0; q < num_output; q++)
+        {
+            const float* gates_data = gates.row(q);
+
+            float U = gates_data[0];
+            float N = gates_data[1];
+
+            float H = (1 - U) * N + U * hidden_state[q];
+
+            hidden_state[q] = H;
+            output_data[q] = (__fp16)H;
+        }
+    }
+
+    return 0;
+}
+
+int GRU_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
+{
+    int T = bottom_blob.h;
+
+    int num_directions = direction == 2 ? 2 : 1;
+    // initial hidden state
+    Mat hidden(num_output, 4u, opt.workspace_allocator);
+    if (hidden.empty())
+        return -100;
+    hidden.fill(0.f);
+
+    top_blob.create(num_output * num_directions, T, 2u, opt.blob_allocator);
+    if (top_blob.empty())
+        return -100;
+
+    // Uni directional
+    if (direction == 0 || direction == 1)
+    {
+        int ret = gru_fp16s(bottom_blob, top_blob, direction, weight_xc_data.channel(0), bias_c_data.channel(0), weight_hc_data.channel(0), hidden, opt);
+        if (ret != 0)
+            return ret;
+    }
+
+    if (direction == 2)
+    {
+        Mat top_blob_forward(num_output, T, 2u, opt.workspace_allocator);
+        if (top_blob_forward.empty())
+            return -100;
+
+        Mat top_blob_reverse(num_output, T, 2u, opt.workspace_allocator);
+        if (top_blob_reverse.empty())
+            return -100;
+
+        int ret0 = gru_fp16s(bottom_blob, top_blob_forward, 0, weight_xc_data.channel(0), bias_c_data.channel(0), weight_hc_data.channel(0), hidden, opt);
+        if (ret0 != 0)
+            return ret0;
+
+        hidden.fill(0.0f);
+
+        int ret1 = gru_fp16s(bottom_blob, top_blob_reverse, 1, weight_xc_data.channel(1), bias_c_data.channel(1), weight_hc_data.channel(1), hidden, opt);
+        if (ret1 != 0)
+            return ret1;
+
+        // concat w
+        for (int i = 0; i < T; i++)
+        {
+            const __fp16* pf = top_blob_forward.row<const __fp16>(i);
+            const __fp16* pr = top_blob_reverse.row<const __fp16>(i);
+            __fp16* ptr = top_blob.row<__fp16>(i);
+
+            memcpy(ptr, pf, num_output * sizeof(__fp16));
+            memcpy(ptr + num_output, pr, num_output * sizeof(__fp16));
+        }
+    }
+
+    return 0;
+}
+
+int GRU_riscv::forward_fp16s(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
+{
+    const Mat& bottom_blob = bottom_blobs[0];
+    int T = bottom_blob.h;
+    int num_directions = direction == 2 ? 2 : 1;
+
+    Mat hidden;
+    Allocator* hidden_allocator = top_blobs.size() == 2 ? opt.blob_allocator : opt.workspace_allocator;
+    if (bottom_blobs.size() == 2)
+    {
+        Option opt_cast = opt;
+        opt_cast.blob_allocator = hidden_allocator;
+        cast_float16_to_float32(bottom_blobs[1], hidden, opt_cast);
+    }
+    else
+    {
+        hidden.create(num_output, num_directions, 4u, hidden_allocator);
+        if (hidden.empty())
+            return -100;
+        hidden.fill(0.f);
+    }
+
+    Mat& top_blob = top_blobs[0];
+    top_blob.create(num_output * num_directions, T, 2u, opt.blob_allocator);
+    if (top_blob.empty())
+        return -100;
+
+    // Uni directional
+    if (direction == 0 || direction == 1)
+    {
+        Mat hidden0 = hidden.row_range(0, 1);
+        int ret = gru_fp16s(bottom_blob, top_blob, direction, weight_xc_data.channel(0), bias_c_data.channel(0), weight_hc_data.channel(0), hidden0, opt);
+        if (ret != 0)
+            return ret;
+    }
+
+    if (direction == 2)
+    {
+        Mat top_blob_forward(num_output, T, 2u, opt.workspace_allocator);
+        if (top_blob_forward.empty())
+            return -100;
+
+        Mat top_blob_reverse(num_output, T, 2u, opt.workspace_allocator);
+        if (top_blob_reverse.empty())
+            return -100;
+
+        Mat hidden0 = hidden.row_range(0, 1);
+        int ret0 = gru_fp16s(bottom_blob, top_blob_forward, 0, weight_xc_data.channel(0), bias_c_data.channel(0), weight_hc_data.channel(0), hidden0, opt);
+        if (ret0 != 0)
+            return ret0;
+
+        Mat hidden1 = hidden.row_range(1, 1);
+        int ret1 = gru_fp16s(bottom_blob, top_blob_reverse, 1, weight_xc_data.channel(1), bias_c_data.channel(1), weight_hc_data.channel(1), hidden1, opt);
+        if (ret1 != 0)
+            return ret1;
+
+        // concat w
+        for (int i = 0; i < T; i++)
+        {
+            const __fp16* pf = top_blob_forward.row<const __fp16>(i);
+            const __fp16* pr = top_blob_reverse.row<const __fp16>(i);
+            __fp16* ptr = top_blob.row<__fp16>(i);
+
+            memcpy(ptr, pf, num_output * sizeof(__fp16));
+            memcpy(ptr + num_output, pr, num_output * sizeof(__fp16));
+        }
+    }
+
+    if (top_blobs.size() == 2)
+    {
+        cast_float32_to_float16(hidden, top_blobs[1], opt);
+    }
+
+    return 0;
+}
+
+int GRU_riscv::create_pipeline_fp16sa(const Option& opt)
+{
+    cast_float32_to_float16(weight_xc_data, weight_xc_data_fp16sa, opt);
+    cast_float32_to_float16(weight_hc_data, weight_hc_data_fp16sa, opt);
+    cast_float32_to_float16(bias_c_data, bias_c_data_fp16sa, opt);
+
+    if (opt.lightmode)
+    {
+        weight_xc_data.release();
+        bias_c_data.release();
+        weight_hc_data.release();
+    }
+
+    return 0;
+}
+
+static int gru_fp16sa(const Mat& bottom_blob, Mat& top_blob, int reverse, const Mat& weight_xc, const Mat& bias_c, const Mat& weight_hc, Mat& hidden_state, const Option& opt)
+{
+    int size = bottom_blob.w;
+    int T = bottom_blob.h;
+
+    int num_output = top_blob.w;
+
+    // 2 x num_output
+    Mat gates(2, num_output, 4u, opt.workspace_allocator);
+    if (gates.empty())
+        return -100;
+
+    // unroll
+    for (int t = 0; t < T; t++)
+    {
+        int ti = reverse ? T - 1 - t : t;
+
+        const __fp16* x = bottom_blob.row<const __fp16>(ti);
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int q = 0; q < num_output; q++)
+        {
+            float* gates_data = gates.row(q);
+
+            // gate reset update
+            const __fp16* bias_c_R = bias_c.row<const __fp16>(0);
+            const __fp16* bias_c_U = bias_c.row<const __fp16>(1);
+
+            const __fp16* weight_xc_R = weight_xc.row<const __fp16>(num_output * 0 + q);
+            const __fp16* weight_xc_U = weight_xc.row<const __fp16>(num_output * 1 + q);
+            const __fp16* weight_hc_R = weight_hc.row<const __fp16>(num_output * 0 + q);
+            const __fp16* weight_hc_U = weight_hc.row<const __fp16>(num_output * 1 + q);
+
+            __fp16 R = bias_c_R[q];
+            __fp16 U = bias_c_U[q];
+
+            int n = size;
+            const __fp16* ptr_x = x;
+            const __fp16* ptr_xcr = weight_xc_R;
+            const __fp16* ptr_xcu = weight_xc_U;
+            while (n > 0)
+            {
+                size_t vl = __riscv_vsetvl_e16m8(n);
+                vfloat16m8_t _x = __riscv_vle16_v_f16m8(ptr_x, vl);
+                vfloat16m8_t _xcr = __riscv_vle16_v_f16m8(ptr_xcr, vl);
+                vfloat16m8_t _xcu = __riscv_vle16_v_f16m8(ptr_xcu, vl);
+                vfloat16m1_t _scalar_r = __riscv_vfmv_s_f_f16m1(R, vl);
+                vfloat16m1_t _scalar_u = __riscv_vfmv_s_f_f16m1(U, vl);
+
+                _xcr = __riscv_vfmul_vv_f16m8(_xcr, _x, vl);
+                _xcu = __riscv_vfmul_vv_f16m8(_xcu, _x, vl);
+                _scalar_r = __riscv_vfredusum_vs_f16m8_f16m1(_xcr, _scalar_r, vl);
+                _scalar_u = __riscv_vfredusum_vs_f16m8_f16m1(_xcu, _scalar_u, vl);
+
+                R = __riscv_vfmv_f_s_f16m1_f16(_scalar_r);
+                U = __riscv_vfmv_f_s_f16m1_f16(_scalar_u);
+
+                ptr_x += vl;
+                ptr_xcr += vl;
+                ptr_xcu += vl;
+                n -= vl;
+            }
+
+            int n_out = num_output;
+            const float* ptr_hc = hidden_state;
+            const __fp16* ptr_hcr = weight_hc_R;
+            const __fp16* ptr_hcu = weight_hc_U;
+            while (n_out > 0)
+            {
+                size_t vl = __riscv_vsetvl_e16m4(n_out);
+                vfloat16m4_t _h_cont = __riscv_vfncvt_f_f_w_f16m4(__riscv_vle32_v_f32m8(ptr_hc, vl), vl);
+                vfloat16m4_t _hcr = __riscv_vle16_v_f16m4(ptr_hcr, vl);
+                vfloat16m4_t _hcu = __riscv_vle16_v_f16m4(ptr_hcu, vl);
+                vfloat16m1_t _scalar_r = __riscv_vfmv_s_f_f16m1(R, vl);
+                vfloat16m1_t _scalar_u = __riscv_vfmv_s_f_f16m1(U, vl);
+
+                _hcr = __riscv_vfmul_vv_f16m4(_hcr, _h_cont, vl);
+                _hcu = __riscv_vfmul_vv_f16m4(_hcu, _h_cont, vl);
+                _scalar_r = __riscv_vfredusum_vs_f16m4_f16m1(_hcr, _scalar_r, vl);
+                _scalar_u = __riscv_vfredusum_vs_f16m4_f16m1(_hcu, _scalar_u, vl);
+
+                R = __riscv_vfmv_f_s_f16m1_f16(_scalar_r);
+                U = __riscv_vfmv_f_s_f16m1_f16(_scalar_u);
+
+                ptr_hc += vl;
+                ptr_hcr += vl;
+                ptr_hcu += vl;
+                n_out -= vl;
+            }
+
+            // sigmoid(R)
+            // sigmoid(U)
+            R = 1.f / (1.f + (__fp16)exp((float)(-R)));
+            U = 1.f / (1.f + (__fp16)exp((float)(-U)));
+
+            // gate new
+            const __fp16* bias_c_WN = bias_c.row<const __fp16>(2);
+            const __fp16* bias_c_BN = bias_c.row<const __fp16>(3);
+
+            const __fp16* weight_xc_N = weight_xc.row<const __fp16>(num_output * 2 + q);
+            const __fp16* weight_hc_N = weight_hc.row<const __fp16>(num_output * 2 + q);
+
+            __fp16 N = bias_c_BN[q];
+
+            int n_out2 = num_output;
+            const float* ptr_hc2 = hidden_state;
+            const __fp16* ptr_whc_n = weight_hc_N;
+            while (n_out2 > 0)
+            {
+                size_t vl = __riscv_vsetvl_e16m4(n_out2);
+
+                vfloat16m4_t _h_cont = __riscv_vfncvt_f_f_w_f16m4(__riscv_vle32_v_f32m8(ptr_hc2, vl), vl);
+                vfloat16m4_t _whc_n = __riscv_vle16_v_f16m4(ptr_whc_n, vl);
+                vfloat16m1_t _scalar_n = __riscv_vfmv_s_f_f16m1(N, vl);
+
+                _h_cont = __riscv_vfmul_vv_f16m4(_whc_n, _h_cont, vl);
+                _scalar_n = __riscv_vfredusum_vs_f16m4_f16m1(_h_cont, _scalar_n, vl);
+
+                N = __riscv_vfmv_f_s_f16m1_f16(_scalar_n);
+                n_out2 -= vl;
+                ptr_hc2 += vl;
+                ptr_whc_n += vl;
+            }
+            N = bias_c_WN[q] + R * N;
+
+            int n2 = size;
+            const __fp16* ptr_x2 = x;
+            const __fp16* ptr_xcn = weight_xc_N;
+            while (n2 > 0)
+            {
+                size_t vl = __riscv_vsetvl_e16m8(n2);
+
+                vfloat16m8_t _x = __riscv_vle16_v_f16m8(ptr_x2, vl);
+                vfloat16m8_t _xcn = __riscv_vle16_v_f16m8(ptr_xcn, vl);
+                vfloat16m1_t _scalar_n = __riscv_vfmv_s_f_f16m1(N, vl);
+
+                _xcn = __riscv_vfmul_vv_f16m8(_x, _xcn, vl);
+                _scalar_n = __riscv_vfredusum_vs_f16m8_f16m1(_xcn, _scalar_n, vl);
+                N = __riscv_vfmv_f_s_f16m1_f16(_scalar_n);
+
+                n2 -= vl;
+                ptr_x2 += vl;
+                ptr_xcn += vl;
+            }
+
+            // tanh(N)
+            N = (__fp16)tanh((float)N);
+
+            gates_data[0] = U;
+            gates_data[1] = N;
+        }
+
+        // h_t := (1 - update) .* new + update .* h_{t-1}
+        __fp16* output_data = top_blob.row<__fp16>(ti);
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int q = 0; q < num_output; q++)
+        {
+            const float* gates_data = gates.row(q);
+
+            float U = gates_data[0];
+            float N = gates_data[1];
+
+            float H = (1 - U) * N + U * hidden_state[q];
+
+            hidden_state[q] = H;
+            output_data[q] = H;
+        }
+    }
+
+    return 0;
+}
+
+int GRU_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
+{
+    int T = bottom_blob.h;
+
+    int num_directions = direction == 2 ? 2 : 1;
+    // initial hidden state
+    Mat hidden(num_output, 4u, opt.workspace_allocator);
+    if (hidden.empty())
+        return -100;
+    hidden.fill(0.f);
+
+    top_blob.create(num_output * num_directions, T, 2u, opt.blob_allocator);
+    if (top_blob.empty())
+        return -100;
+
+    // Uni directional
+    if (direction == 0 || direction == 1)
+    {
+        int ret = gru_fp16sa(bottom_blob, top_blob, direction, weight_xc_data_fp16sa.channel(0), bias_c_data_fp16sa.channel(0), weight_hc_data_fp16sa.channel(0), hidden, opt);
+        if (ret != 0)
+            return ret;
+    }
+
+    if (direction == 2)
+    {
+        Mat top_blob_forward(num_output, T, 2u, opt.workspace_allocator);
+        if (top_blob_forward.empty())
+            return -100;
+
+        Mat top_blob_reverse(num_output, T, 2u, opt.workspace_allocator);
+        if (top_blob_reverse.empty())
+            return -100;
+
+        int ret0 = gru_fp16sa(bottom_blob, top_blob_forward, 0, weight_xc_data_fp16sa.channel(0), bias_c_data_fp16sa.channel(0), weight_hc_data_fp16sa.channel(0), hidden, opt);
+        if (ret0 != 0)
+            return ret0;
+
+        hidden.fill(0.0f);
+
+        int ret1 = gru_fp16sa(bottom_blob, top_blob_reverse, 1, weight_xc_data_fp16sa.channel(1), bias_c_data_fp16sa.channel(1), weight_hc_data_fp16sa.channel(1), hidden, opt);
+        if (ret1 != 0)
+            return ret1;
+
+        // concat w
+        for (int i = 0; i < T; i++)
+        {
+            const __fp16* pf = top_blob_forward.row<const __fp16>(i);
+            const __fp16* pr = top_blob_reverse.row<const __fp16>(i);
+            __fp16* ptr = top_blob.row<__fp16>(i);
+
+            memcpy(ptr, pf, num_output * sizeof(__fp16));
+            memcpy(ptr + num_output, pr, num_output * sizeof(__fp16));
+        }
+    }
+
+    return 0;
+}
+
+int GRU_riscv::forward_fp16sa(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
+{
+    const Mat& bottom_blob = bottom_blobs[0];
+    int T = bottom_blob.h;
+    int num_directions = direction == 2 ? 2 : 1;
+
+    Mat hidden;
+    Allocator* hidden_allocator = top_blobs.size() == 2 ? opt.blob_allocator : opt.workspace_allocator;
+    if (bottom_blobs.size() == 2)
+    {
+        Option opt_cast = opt;
+        opt_cast.blob_allocator = hidden_allocator;
+        cast_float16_to_float32(bottom_blobs[1], hidden, opt_cast);
+    }
+    else
+    {
+        hidden.create(num_output, num_directions, 4u, hidden_allocator);
+        if (hidden.empty())
+            return -100;
+        hidden.fill(0.f);
+    }
+
+    Mat& top_blob = top_blobs[0];
+    top_blob.create(num_output * num_directions, T, 2u, opt.blob_allocator);
+    if (top_blob.empty())
+        return -100;
+
+    // Uni directional
+    if (direction == 0 || direction == 1)
+    {
+        int ret = gru_fp16sa(bottom_blob, top_blob, direction, weight_xc_data_fp16sa.channel(0), bias_c_data_fp16sa.channel(0), weight_hc_data_fp16sa.channel(0), hidden, opt);
+        if (ret != 0)
+            return ret;
+    }
+
+    if (direction == 2)
+    {
+        Mat top_blob_forward(num_output, T, 2u, opt.workspace_allocator);
+        if (top_blob_forward.empty())
+            return -100;
+
+        Mat top_blob_reverse(num_output, T, 2u, opt.workspace_allocator);
+        if (top_blob_reverse.empty())
+            return -100;
+
+        Mat hidden0 = hidden.row_range(0, 1);
+        int ret0 = gru_fp16sa(bottom_blob, top_blob_forward, 0, weight_xc_data_fp16sa.channel(0), bias_c_data_fp16sa.channel(0), weight_hc_data_fp16sa.channel(0), hidden0, opt);
+        if (ret0 != 0)
+            return ret0;
+
+        Mat hidden1 = hidden.row_range(1, 1);
+        int ret1 = gru_fp16sa(bottom_blob, top_blob_reverse, 1, weight_xc_data_fp16sa.channel(1), bias_c_data_fp16sa.channel(1), weight_hc_data_fp16sa.channel(1), hidden1, opt);
+        if (ret1 != 0)
+            return ret1;
+
+        // concat w
+        for (int i = 0; i < T; i++)
+        {
+            const __fp16* pf = top_blob_forward.row<const __fp16>(i);
+            const __fp16* pr = top_blob_reverse.row<const __fp16>(i);
+            __fp16* ptr = top_blob.row<__fp16>(i);
+
+            memcpy(ptr, pf, num_output * sizeof(__fp16));
+            memcpy(ptr + num_output, pr, num_output * sizeof(__fp16));
+        }
+    }
+
+    if (top_blobs.size() == 2)
+    {
+        cast_float32_to_float16(hidden, top_blobs[1], opt);
+    }
+
+    return 0;
+}
+#endif // __riscv_zvfh
+
+} // namespace ncnn
diff --git a/src/layer/riscv/instancenorm_riscv.cpp b/src/layer/riscv/instancenorm_riscv.cpp
index f7ee8ec9ed4..95a39a20ba5 100644
--- a/src/layer/riscv/instancenorm_riscv.cpp
+++ b/src/layer/riscv/instancenorm_riscv.cpp
@@ -94,7 +94,7 @@ int InstanceNorm_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt)
         }
 #endif // __riscv_vector
         float mean = sum / size;
-#if __riscv_vecotr
+#if __riscv_vector
         {
             int n = size;
             float* ptr_sqsum = ptr;
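
---
Note on the dispatch pattern this patch adopts: the extension-specific translation unit (gru_riscv_zvfh.cpp) is compiled with zvfh codegen enabled, while the generic file only tests the NCNN_ZVFH build flag plus the runtime probe cpu_support_riscv_zvfh(). The standalone sketch below illustrates that shape; NCNN_ZVFH and cpu_support_riscv_zvfh come from the patch, but the skeleton itself is an assumption for illustration, not ncnn source.

```cpp
// Minimal sketch of compile-time gate + runtime probe dispatch (assumption,
// not ncnn code). In ncnn the fp16 kernel definition lives in a separate TU
// built with the Zvfh extension enabled; the caller stays extension-free.
#include <cstdio>

#ifndef NCNN_ZVFH
#define NCNN_ZVFH 1 // normally set by the build system when zvfh codegen is available
#endif

// Stand-in for ncnn's runtime capability probe; the real one inspects the CPU.
static bool cpu_support_riscv_zvfh()
{
    return false; // conservative default for this sketch
}

static int forward_fp32()
{
    printf("fp32 path\n");
    return 0;
}

#if NCNN_ZVFH
// In ncnn this definition would sit in the *_zvfh.cpp translation unit.
static int forward_fp16s()
{
    printf("fp16 path\n");
    return 0;
}
#endif

int forward(bool use_fp16_storage, int elembits)
{
#if NCNN_ZVFH
    // Both conditions must hold: the binary contains the kernel AND the
    // running CPU actually implements zvfh.
    if (cpu_support_riscv_zvfh() && use_fp16_storage && elembits == 16)
        return forward_fp16s();
#endif
    return forward_fp32();
}

int main()
{
    return forward(true, 16);
}
```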