From 9117c40707f29d66053bb8903e2dbaf9b414ebd3 Mon Sep 17 00:00:00 2001
From: nihuini
Date: Mon, 26 Aug 2024 15:25:17 +0800
Subject: [PATCH] riscv: split GRU fp16 kernels into gru_riscv_zvfh.cpp behind
 NCNN_ZVFH

---
 src/layer/riscv/gru_riscv.cpp          | 660 +-----------------------
 src/layer/riscv/gru_riscv.h            |   2 +-
 src/layer/riscv/gru_riscv_zvfh.cpp     | 661 +++++++++++++++++++++++++
 src/layer/riscv/instancenorm_riscv.cpp |   2 +-
 4 files changed, 672 insertions(+), 653 deletions(-)
 create mode 100644 src/layer/riscv/gru_riscv_zvfh.cpp

diff --git a/src/layer/riscv/gru_riscv.cpp b/src/layer/riscv/gru_riscv.cpp
index 6d6330c3eef..47f6dee9118 100644
--- a/src/layer/riscv/gru_riscv.cpp
+++ b/src/layer/riscv/gru_riscv.cpp
@@ -208,8 +208,8 @@ static int gru(const Mat& bottom_blob, Mat& top_blob, int reverse, const Mat& we
 
 GRU_riscv::GRU_riscv()
 {
-#if __riscv_vector && __riscv_zvfh
-    support_fp16_storage = true;
+#if NCNN_ZVFH
+    support_fp16_storage = cpu_support_riscv_zvfh();
 #endif
 }
 
@@ -223,7 +223,7 @@ int GRU_riscv::create_pipeline(const Option& opt)
     }
 #endif
 
-#if __riscv_vector && __riscv_zvfh
+#if NCNN_ZVFH
     if (opt.use_fp16_storage && opt.use_fp16_arithmetic)
         return create_pipeline_fp16sa(opt);
 #endif
@@ -240,10 +240,11 @@ int GRU_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt)
     }
 #endif
 
-    int elembits = bottom_blob.elembits();
 
 #if __riscv_vector
-#if __riscv_zvfh
+#if NCNN_ZVFH
+    int elembits = bottom_blob.elembits();
+
     if (opt.use_fp16_storage && elembits == 16)
     {
         if (opt.use_fp16_arithmetic)
@@ -322,10 +323,11 @@ int GRU_riscv::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& t
 #endif
 
     const Mat& bottom_blob = bottom_blobs[0];
-    int elembits = bottom_blob.elembits();
 
 #if __riscv_vector
-#if __riscv_zvfh
+#if NCNN_ZVFH
+    int elembits = bottom_blob.elembits();
+
     if (opt.use_fp16_storage && elembits == 16)
     {
         if (opt.use_fp16_arithmetic)
@@ -408,648 +410,4 @@ int GRU_riscv::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& t
     return GRU::forward(bottom_blobs, top_blobs, opt);
 }
 
-#if __riscv_vector && __riscv_zvfh
-static int gru_fp16s(const Mat& bottom_blob, Mat& top_blob, int reverse, const Mat& weight_xc, const Mat& bias_c, const Mat& weight_hc, Mat& hidden_state, const Option& opt)
-{
-    int size = bottom_blob.w;
-    int T = bottom_blob.h;
-
-    int num_output = top_blob.w;
-
-    // 2 x num_output
-    Mat gates(2, num_output, 4u, opt.workspace_allocator);
-    if (gates.empty())
-        return -100;
-
-    // unroll
-    for (int t = 0; t < T; t++)
-    {
-        int ti = reverse ? T - 1 - t : t;
-
-        const __fp16* x = bottom_blob.row<const __fp16>(ti);
-        #pragma omp parallel for num_threads(opt.num_threads)
-        for (int q = 0; q < num_output; q++)
-        {
-            float* gates_data = gates.row(q);
-
-            // gate reset update
-            const float* bias_c_R = bias_c.row(0);
-            const float* bias_c_U = bias_c.row(1);
-
-            const float* weight_xc_R = weight_xc.row(num_output * 0 + q);
-            const float* weight_xc_U = weight_xc.row(num_output * 1 + q);
-            const float* weight_hc_R = weight_hc.row(num_output * 0 + q);
-            const float* weight_hc_U = weight_hc.row(num_output * 1 + q);
-
-            float R = bias_c_R[q];
-            float U = bias_c_U[q];
-
-            int n = size;
-            const __fp16* ptr_x = x;
-            const float* ptr_xcr = weight_xc_R;
-            const float* ptr_xcu = weight_xc_U;
-            while (n > 0)
-            {
-                size_t vl = __riscv_vsetvl_e16m4(n);
-                vfloat32m8_t _x = __riscv_vfwcvt_f_f_v_f32m8(__riscv_vle16_v_f16m4(ptr_x, vl), vl);
-                vfloat32m8_t _xcr = __riscv_vle32_v_f32m8(ptr_xcr, vl);
-                vfloat32m8_t _xcu = __riscv_vle32_v_f32m8(ptr_xcu, vl);
-                vfloat32m1_t _scalar_r = __riscv_vfmv_s_f_f32m1(R, vl);
-                vfloat32m1_t _scalar_u = __riscv_vfmv_s_f_f32m1(U, vl);
-
-                _xcr = __riscv_vfmul_vv_f32m8(_xcr, _x, vl);
-                _xcu = __riscv_vfmul_vv_f32m8(_xcu, _x, vl);
-                _scalar_r = __riscv_vfredusum_vs_f32m8_f32m1(_xcr, _scalar_r, vl);
-                _scalar_u = __riscv_vfredusum_vs_f32m8_f32m1(_xcu, _scalar_u, vl);
-
-                R = __riscv_vfmv_f_s_f32m1_f32(_scalar_r);
-                U = __riscv_vfmv_f_s_f32m1_f32(_scalar_u);
-
-                ptr_x += vl;
-                ptr_xcr += vl;
-                ptr_xcu += vl;
-                n -= vl;
-            }
-            ptr_x = NULL;
-            ptr_xcr = NULL;
-            ptr_xcu = NULL;
-
-            int n_out = num_output;
-            const float* ptr_hc = hidden_state;
-            const float* ptr_hcr = weight_hc_R;
-            const float* ptr_hcu = weight_hc_U;
-            while (n_out > 0)
-            {
-                size_t vl = __riscv_vsetvl_e16m4(n_out);
-                vfloat32m8_t _h_cont = __riscv_vle32_v_f32m8(ptr_hc, vl);
-                vfloat32m8_t _hcr = __riscv_vle32_v_f32m8(ptr_hcr, vl);
-                vfloat32m8_t _hcu = __riscv_vle32_v_f32m8(ptr_hcu, vl);
-                vfloat32m1_t _scalar_r = __riscv_vfmv_s_f_f32m1(R, vl);
-                vfloat32m1_t _scalar_u = __riscv_vfmv_s_f_f32m1(U, vl);
-
-                _hcr = __riscv_vfmul_vv_f32m8(_hcr, _h_cont, vl);
-                _hcu = __riscv_vfmul_vv_f32m8(_hcu, _h_cont, vl);
-                _scalar_r = __riscv_vfredusum_vs_f32m8_f32m1(_hcr, _scalar_r, vl);
-                _scalar_u = __riscv_vfredusum_vs_f32m8_f32m1(_hcu, _scalar_u, vl);
-
-                R = __riscv_vfmv_f_s_f32m1_f32(_scalar_r);
-                U = __riscv_vfmv_f_s_f32m1_f32(_scalar_u);
-
-                ptr_hc += vl;
-                ptr_hcr += vl;
-                ptr_hcu += vl;
-                n_out -= vl;
-            }
-            ptr_hc = NULL;
-            ptr_hcr = NULL;
-            ptr_hcu = NULL;
-
-            // sigmoid(R)
-            // sigmoid(U)
-            R = 1.f / (1.f + exp(-R));
-            U = 1.f / (1.f + exp(-U));
-
-            // gate new
-            const float* bias_c_WN = bias_c.row(2);
-            const float* bias_c_BN = bias_c.row(3);
-
-            const float* weight_xc_N = weight_xc.row(num_output * 2 + q);
-            const float* weight_hc_N = weight_hc.row(num_output * 2 + q);
-
-            float N = bias_c_BN[q];
-
-            int n_out2 = num_output;
-            const float* ptr_hc2 = hidden_state;
-            const float* ptr_whc_n = weight_hc_N;
-            while (n_out2 > 0)
-            {
-                size_t vl = __riscv_vsetvl_e16m4(n_out2);
-
-                vfloat32m8_t _h_cont = __riscv_vle32_v_f32m8(ptr_hc2, vl);
-                vfloat32m8_t _whc_n = __riscv_vle32_v_f32m8(ptr_whc_n, vl);
-                vfloat32m1_t _scalar_n = __riscv_vfmv_s_f_f32m1(N, vl);
-
-                _h_cont = __riscv_vfmul_vv_f32m8(_whc_n, _h_cont, vl);
-                _scalar_n = __riscv_vfredusum_vs_f32m8_f32m1(_h_cont, _scalar_n, vl);
-
-                N = __riscv_vfmv_f_s_f32m1_f32(_scalar_n);
-                n_out2 -= vl;
-                ptr_hc2 += vl;
-                ptr_whc_n += vl;
-            }
-            ptr_hc2 = NULL;
-            ptr_whc_n = NULL;
-
-            N = bias_c_WN[q] + R * N;
-
-            int n2 = size;
-            const __fp16* ptr_x2 = x;
-            const float* ptr_xcn = weight_xc_N;
-            while (n2 > 0)
-            {
-                size_t vl = __riscv_vsetvl_e16m4(n2);
-
-                vfloat32m8_t _x = __riscv_vfwcvt_f_f_v_f32m8(__riscv_vle16_v_f16m4(ptr_x2, vl), vl);
-                vfloat32m8_t _xcn = __riscv_vle32_v_f32m8(ptr_xcn, vl);
-                vfloat32m1_t _scalar_n = __riscv_vfmv_s_f_f32m1(N, vl);
-
-                _xcn = __riscv_vfmul_vv_f32m8(_x, _xcn, vl);
-                _scalar_n = __riscv_vfredusum_vs_f32m8_f32m1(_xcn, _scalar_n, vl);
-                N = __riscv_vfmv_f_s_f32m1_f32(_scalar_n);
-
-                n2 -= vl;
-                ptr_x2 += vl;
-                ptr_xcn += vl;
-            }
-            ptr_x2 = NULL;
-            ptr_xcn = NULL;
-
-            // tanh(N)
-            N = tanh(N);
-
-            gates_data[0] = U;
-            gates_data[1] = N;
-        }
-
-        // h_t := (1 - update) .* new + update .* h_{t-1}
-        __fp16* output_data = top_blob.row<__fp16>(ti);
-        #pragma omp parallel for num_threads(opt.num_threads)
-        for (int q = 0; q < num_output; q++)
-        {
-            const float* gates_data = gates.row(q);
-
-            float U = gates_data[0];
-            float N = gates_data[1];
-
-            float H = (1 - U) * N + U * hidden_state[q];
-
-            hidden_state[q] = H;
-            output_data[q] = (__fp16)H;
-        }
-    }
-
-    return 0;
-}
-
-int GRU_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
-{
-    int T = bottom_blob.h;
-
-    int num_directions = direction == 2 ? 2 : 1;
-    // initial hidden state
-    Mat hidden(num_output, 4u, opt.workspace_allocator);
-    if (hidden.empty())
-        return -100;
-    hidden.fill(0.f);
-
-    top_blob.create(num_output * num_directions, T, 2u, opt.blob_allocator);
-    if (top_blob.empty())
-        return -100;
-
-    // Uni directional
-    if (direction == 0 || direction == 1)
-    {
-        int ret = gru_fp16s(bottom_blob, top_blob, direction, weight_xc_data.channel(0), bias_c_data.channel(0), weight_hc_data.channel(0), hidden, opt);
-        if (ret != 0)
-            return ret;
-    }
-
-    if (direction == 2)
-    {
-        Mat top_blob_forward(num_output, T, 2u, opt.workspace_allocator);
-        if (top_blob_forward.empty())
-            return -100;
-
-        Mat top_blob_reverse(num_output, T, 2u, opt.workspace_allocator);
-        if (top_blob_reverse.empty())
-            return -100;
-
-        int ret0 = gru_fp16s(bottom_blob, top_blob_forward, 0, weight_xc_data.channel(0), bias_c_data.channel(0), weight_hc_data.channel(0), hidden, opt);
-        if (ret0 != 0)
-            return ret0;
-
-        hidden.fill(0.0f);
-
-        int ret1 = gru_fp16s(bottom_blob, top_blob_reverse, 1, weight_xc_data.channel(1), bias_c_data.channel(1), weight_hc_data.channel(1), hidden, opt);
-        if (ret1 != 0)
-            return ret1;
-
-        // concat w
-        for (int i = 0; i < T; i++)
-        {
-            const __fp16* pf = top_blob_forward.row<const __fp16>(i);
-            const __fp16* pr = top_blob_reverse.row<const __fp16>(i);
-            __fp16* ptr = top_blob.row<__fp16>(i);
-
-            memcpy(ptr, pf, num_output * sizeof(__fp16));
-            memcpy(ptr + num_output, pr, num_output * sizeof(__fp16));
-        }
-    }
-
-    return 0;
-}
-
-int GRU_riscv::forward_fp16s(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
-{
-    const Mat& bottom_blob = bottom_blobs[0];
-    int T = bottom_blob.h;
-    int num_directions = direction == 2 ? 2 : 1;
-
-    Mat hidden;
-    Allocator* hidden_allocator = top_blobs.size() == 2 ? opt.blob_allocator : opt.workspace_allocator;
-    if (bottom_blobs.size() == 2)
-    {
-        Option opt_cast = opt;
-        opt_cast.blob_allocator = hidden_allocator;
-        cast_float16_to_float32(bottom_blobs[1], hidden, opt_cast);
-    }
-    else
-    {
-        hidden.create(num_output, num_directions, 4u, hidden_allocator);
-        if (hidden.empty())
-            return -100;
-        hidden.fill(0.f);
-    }
-
-    Mat& top_blob = top_blobs[0];
-    top_blob.create(num_output * num_directions, T, 2u, opt.blob_allocator);
-    if (top_blob.empty())
-        return -100;
-
-    // Uni directional
-    if (direction == 0 || direction == 1)
-    {
-        Mat hidden0 = hidden.row_range(0, 1);
-        int ret = gru_fp16s(bottom_blob, top_blob, direction, weight_xc_data.channel(0), bias_c_data.channel(0), weight_hc_data.channel(0), hidden0, opt);
-        if (ret != 0)
-            return ret;
-    }
-
-    if (direction == 2)
-    {
-        Mat top_blob_forward(num_output, T, 2u, opt.workspace_allocator);
-        if (top_blob_forward.empty())
-            return -100;
-
-        Mat top_blob_reverse(num_output, T, 2u, opt.workspace_allocator);
-        if (top_blob_reverse.empty())
-            return -100;
-
-        Mat hidden0 = hidden.row_range(0, 1);
-        int ret0 = gru_fp16s(bottom_blob, top_blob_forward, 0, weight_xc_data.channel(0), bias_c_data.channel(0), weight_hc_data.channel(0), hidden0, opt);
-        if (ret0 != 0)
-            return ret0;
-
-        Mat hidden1 = hidden.row_range(1, 1);
-        int ret1 = gru_fp16s(bottom_blob, top_blob_reverse, 1, weight_xc_data.channel(1), bias_c_data.channel(1), weight_hc_data.channel(1), hidden1, opt);
-        if (ret1 != 0)
-            return ret1;
-
-        // concat w
-        for (int i = 0; i < T; i++)
-        {
-            const __fp16* pf = top_blob_forward.row<const __fp16>(i);
-            const __fp16* pr = top_blob_reverse.row<const __fp16>(i);
-            __fp16* ptr = top_blob.row<__fp16>(i);
-
-            memcpy(ptr, pf, num_output * sizeof(__fp16));
-            memcpy(ptr + num_output, pr, num_output * sizeof(__fp16));
-        }
-    }
-
-    if (top_blobs.size() == 2)
-    {
-        cast_float32_to_float16(hidden, top_blobs[1], opt);
-    }
-
-    return 0;
-}
-
-#endif
-
-//fp16sa start at here
-#if __riscv_vector && __riscv_zvfh
-
-int GRU_riscv::create_pipeline_fp16sa(const Option& opt)
-{
-    cast_float32_to_float16(weight_xc_data, weight_xc_data_fp16sa, opt);
-    cast_float32_to_float16(weight_hc_data, weight_hc_data_fp16sa, opt);
-    cast_float32_to_float16(bias_c_data, bias_c_data_fp16sa, opt);
-
-    if (opt.lightmode)
-    {
-        weight_xc_data.release();
-        bias_c_data.release();
-        weight_hc_data.release();
-    }
-
-    return 0;
-}
-
-static int gru_fp16sa(const Mat& bottom_blob, Mat& top_blob, int reverse, const Mat& weight_xc, const Mat& bias_c, const Mat& weight_hc, Mat& hidden_state, const Option& opt)
-{
-    int size = bottom_blob.w;
-    int T = bottom_blob.h;
-
-    int num_output = top_blob.w;
-
-    // 2 x num_output
-    Mat gates(2, num_output, 4u, opt.workspace_allocator);
-    if (gates.empty())
-        return -100;
-
-    // unroll
-    for (int t = 0; t < T; t++)
-    {
-        int ti = reverse ? T - 1 - t : t;
-
-        const __fp16* x = bottom_blob.row<const __fp16>(ti);
-        #pragma omp parallel for num_threads(opt.num_threads)
-        for (int q = 0; q < num_output; q++)
-        {
-            float* gates_data = gates.row(q);
-
-            // gate reset update
-            const __fp16* bias_c_R = bias_c.row<const __fp16>(0);
-            const __fp16* bias_c_U = bias_c.row<const __fp16>(1);
-
-            const __fp16* weight_xc_R = weight_xc.row<const __fp16>(num_output * 0 + q);
-            const __fp16* weight_xc_U = weight_xc.row<const __fp16>(num_output * 1 + q);
-            const __fp16* weight_hc_R = weight_hc.row<const __fp16>(num_output * 0 + q);
-            const __fp16* weight_hc_U = weight_hc.row<const __fp16>(num_output * 1 + q);
-
-            __fp16 R = bias_c_R[q];
-            __fp16 U = bias_c_U[q];
-
-            int n = size;
-            const __fp16* ptr_x = x;
-            const __fp16* ptr_xcr = weight_xc_R;
-            const __fp16* ptr_xcu = weight_xc_U;
-            while (n > 0)
-            {
-                size_t vl = __riscv_vsetvl_e16m8(n);
-                vfloat16m8_t _x = __riscv_vle16_v_f16m8(ptr_x, vl);
-                vfloat16m8_t _xcr = __riscv_vle16_v_f16m8(ptr_xcr, vl);
-                vfloat16m8_t _xcu = __riscv_vle16_v_f16m8(ptr_xcu, vl);
-                vfloat16m1_t _scalar_r = __riscv_vfmv_s_f_f16m1(R, vl);
-                vfloat16m1_t _scalar_u = __riscv_vfmv_s_f_f16m1(U, vl);
-
-                _xcr = __riscv_vfmul_vv_f16m8(_xcr, _x, vl);
-                _xcu = __riscv_vfmul_vv_f16m8(_xcu, _x, vl);
-                _scalar_r = __riscv_vfredusum_vs_f16m8_f16m1(_xcr, _scalar_r, vl);
-                _scalar_u = __riscv_vfredusum_vs_f16m8_f16m1(_xcu, _scalar_u, vl);
-
-                R = __riscv_vfmv_f_s_f16m1_f16(_scalar_r);
-                U = __riscv_vfmv_f_s_f16m1_f16(_scalar_u);
-
-                ptr_x += vl;
-                ptr_xcr += vl;
-                ptr_xcu += vl;
-                n -= vl;
-            }
-
-            int n_out = num_output;
-            const float* ptr_hc = hidden_state;
-            const __fp16* ptr_hcr = weight_hc_R;
-            const __fp16* ptr_hcu = weight_hc_U;
-            while (n_out > 0)
-            {
-                size_t vl = __riscv_vsetvl_e16m4(n_out);
-                vfloat16m4_t _h_cont = __riscv_vfncvt_f_f_w_f16m4(__riscv_vle32_v_f32m8(ptr_hc, vl), vl);
-                vfloat16m4_t _hcr = __riscv_vle16_v_f16m4(ptr_hcr, vl);
-                vfloat16m4_t _hcu = __riscv_vle16_v_f16m4(ptr_hcu, vl);
-                vfloat16m1_t _scalar_r = __riscv_vfmv_s_f_f16m1(R, vl);
-                vfloat16m1_t _scalar_u = __riscv_vfmv_s_f_f16m1(U, vl);
-
-                _hcr = __riscv_vfmul_vv_f16m4(_hcr, _h_cont, vl);
-                _hcu = __riscv_vfmul_vv_f16m4(_hcu, _h_cont, vl);
-                _scalar_r = __riscv_vfredusum_vs_f16m4_f16m1(_hcr, _scalar_r, vl);
-                _scalar_u = __riscv_vfredusum_vs_f16m4_f16m1(_hcu, _scalar_u, vl);
-
-                R = __riscv_vfmv_f_s_f16m1_f16(_scalar_r);
-                U = __riscv_vfmv_f_s_f16m1_f16(_scalar_u);
-
-                ptr_hc += vl;
-                ptr_hcr += vl;
-                ptr_hcu += vl;
-                n_out -= vl;
-            }
-
-            // sigmoid(R)
-            // sigmoid(U)
-            R = 1.f / (1.f + (__fp16)exp((float)(-R)));
-            U = 1.f / (1.f + (__fp16)exp((float)(-U)));
-
-            // gate new
-            const __fp16* bias_c_WN = bias_c.row<const __fp16>(2);
-            const __fp16* bias_c_BN = bias_c.row<const __fp16>(3);
-
-            const __fp16* weight_xc_N = weight_xc.row<const __fp16>(num_output * 2 + q);
-            const __fp16* weight_hc_N = weight_hc.row<const __fp16>(num_output * 2 + q);
-
-            __fp16 N = bias_c_BN[q];
-
-            int n_out2 = num_output;
-            const float* ptr_hc2 = hidden_state;
-            const __fp16* ptr_whc_n = weight_hc_N;
-            while (n_out2 > 0)
-            {
-                size_t vl = __riscv_vsetvl_e16m4(n_out2);
-
-                vfloat16m4_t _h_cont = __riscv_vfncvt_f_f_w_f16m4(__riscv_vle32_v_f32m8(ptr_hc2, vl), vl);
-                vfloat16m4_t _whc_n = __riscv_vle16_v_f16m4(ptr_whc_n, vl);
-                vfloat16m1_t _scalar_n = __riscv_vfmv_s_f_f16m1(N, vl);
-
-                _h_cont = __riscv_vfmul_vv_f16m4(_whc_n, _h_cont, vl);
-                _scalar_n = __riscv_vfredusum_vs_f16m4_f16m1(_h_cont, _scalar_n, vl);
-
-                N = __riscv_vfmv_f_s_f16m1_f16(_scalar_n);
-                n_out2 -= vl;
-                ptr_hc2 += vl;
-                ptr_whc_n += vl;
-            }
-            N = bias_c_WN[q] + R * N;
-
-            int n2 = size;
-            const __fp16* ptr_x2 = x;
-            const __fp16* ptr_xcn = weight_xc_N;
-            while (n2 > 0)
-            {
-                size_t vl = __riscv_vsetvl_e16m8(n2);
-
-                vfloat16m8_t _x = __riscv_vle16_v_f16m8(ptr_x2, vl);
-                vfloat16m8_t _xcn = __riscv_vle16_v_f16m8(ptr_xcn, vl);
-                vfloat16m1_t _scalar_n = __riscv_vfmv_s_f_f16m1(N, vl);
-
-                _xcn = __riscv_vfmul_vv_f16m8(_x, _xcn, vl);
-                _scalar_n = __riscv_vfredusum_vs_f16m8_f16m1(_xcn, _scalar_n, vl);
-                N = __riscv_vfmv_f_s_f16m1_f16(_scalar_n);
-
-                n2 -= vl;
-                ptr_x2 += vl;
-                ptr_xcn += vl;
-            }
-
-            // tanh(N)
-            N = (__fp16)tanh((float)N);
-
-            gates_data[0] = U;
-            gates_data[1] = N;
-        }
-
-        // h_t := (1 - update) .* new + update .* h_{t-1}
-        __fp16* output_data = top_blob.row<__fp16>(ti);
-        #pragma omp parallel for num_threads(opt.num_threads)
-        for (int q = 0; q < num_output; q++)
-        {
-            const float* gates_data = gates.row(q);
-
-            float U = gates_data[0];
-            float N = gates_data[1];
-
-            float H = (1 - U) * N + U * hidden_state[q];
-
-            hidden_state[q] = H;
-            output_data[q] = H;
-        }
-    }
-
-    return 0;
-}
-
-int GRU_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
-{
-    int T = bottom_blob.h;
-
-    int num_directions = direction == 2 ? 2 : 1;
-    // initial hidden state
-    Mat hidden(num_output, 4u, opt.workspace_allocator);
-    if (hidden.empty())
-        return -100;
-    hidden.fill(0.f);
-
-    top_blob.create(num_output * num_directions, T, 2u, opt.blob_allocator);
-    if (top_blob.empty())
-        return -100;
-
-    // Uni directional
-    if (direction == 0 || direction == 1)
-    {
-        int ret = gru_fp16sa(bottom_blob, top_blob, direction, weight_xc_data_fp16sa.channel(0), bias_c_data_fp16sa.channel(0), weight_hc_data_fp16sa.channel(0), hidden, opt);
-        if (ret != 0)
-            return ret;
-    }
-
-    if (direction == 2)
-    {
-        Mat top_blob_forward(num_output, T, 2u, opt.workspace_allocator);
-        if (top_blob_forward.empty())
-            return -100;
-
-        Mat top_blob_reverse(num_output, T, 2u, opt.workspace_allocator);
-        if (top_blob_reverse.empty())
-            return -100;
-
-        int ret0 = gru_fp16sa(bottom_blob, top_blob_forward, 0, weight_xc_data_fp16sa.channel(0), bias_c_data_fp16sa.channel(0), weight_hc_data_fp16sa.channel(0), hidden, opt);
-        if (ret0 != 0)
-            return ret0;
-
-        hidden.fill(0.0f);
-
-        int ret1 = gru_fp16sa(bottom_blob, top_blob_reverse, 1, weight_xc_data_fp16sa.channel(1), bias_c_data_fp16sa.channel(1), weight_hc_data_fp16sa.channel(1), hidden, opt);
-        if (ret1 != 0)
-            return ret1;
-
-        // concat w
-        for (int i = 0; i < T; i++)
-        {
-            const __fp16* pf = top_blob_forward.row<const __fp16>(i);
-            const __fp16* pr = top_blob_reverse.row<const __fp16>(i);
-            __fp16* ptr = top_blob.row<__fp16>(i);
-
-            memcpy(ptr, pf, num_output * sizeof(__fp16));
-            memcpy(ptr + num_output, pr, num_output * sizeof(__fp16));
-        }
-    }
-
-    return 0;
-}
-
-int GRU_riscv::forward_fp16sa(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
-{
-    const Mat& bottom_blob = bottom_blobs[0];
-    int T = bottom_blob.h;
-    int num_directions = direction == 2 ? 2 : 1;
-
-    Mat hidden;
-    Allocator* hidden_allocator = top_blobs.size() == 2 ? opt.blob_allocator : opt.workspace_allocator;
-    if (bottom_blobs.size() == 2)
-    {
-        Option opt_cast = opt;
-        opt_cast.blob_allocator = hidden_allocator;
-        cast_float16_to_float32(bottom_blobs[1], hidden, opt_cast);
-    }
-    else
-    {
-        hidden.create(num_output, num_directions, 4u, hidden_allocator);
-        if (hidden.empty())
-            return -100;
-        hidden.fill(0.f);
-    }
-
-    Mat& top_blob = top_blobs[0];
-    top_blob.create(num_output * num_directions, T, 2u, opt.blob_allocator);
-    if (top_blob.empty())
-        return -100;
-
-    // Uni directional
-    if (direction == 0 || direction == 1)
-    {
-        int ret = gru_fp16sa(bottom_blob, top_blob, direction, weight_xc_data_fp16sa.channel(0), bias_c_data_fp16sa.channel(0), weight_hc_data_fp16sa.channel(0), hidden, opt);
-        if (ret != 0)
-            return ret;
-    }
-
-    if (direction == 2)
-    {
-        Mat top_blob_forward(num_output, T, 2u, opt.workspace_allocator);
-        if (top_blob_forward.empty())
-            return -100;
-
-        Mat top_blob_reverse(num_output, T, 2u, opt.workspace_allocator);
-        if (top_blob_reverse.empty())
-            return -100;
-
-        Mat hidden0 = hidden.row_range(0, 1);
-        int ret0 = gru_fp16sa(bottom_blob, top_blob_forward, 0, weight_xc_data_fp16sa.channel(0), bias_c_data_fp16sa.channel(0), weight_hc_data_fp16sa.channel(0), hidden0, opt);
-        if (ret0 != 0)
-            return ret0;
-
-        Mat hidden1 = hidden.row_range(1, 1);
-        int ret1 = gru_fp16sa(bottom_blob, top_blob_reverse, 1, weight_xc_data_fp16sa.channel(1), bias_c_data_fp16sa.channel(1), weight_hc_data_fp16sa.channel(1), hidden1, opt);
-        if (ret1 != 0)
-            return ret1;
-
-        // concat w
-        for (int i = 0; i < T; i++)
-        {
-            const __fp16* pf = top_blob_forward.row<const __fp16>(i);
-            const __fp16* pr = top_blob_reverse.row<const __fp16>(i);
-            __fp16* ptr = top_blob.row<__fp16>(i);
-
-            memcpy(ptr, pf, num_output * sizeof(__fp16));
-            memcpy(ptr + num_output, pr, num_output * sizeof(__fp16));
-        }
-    }
-
-    if (top_blobs.size() == 2)
-    {
-        cast_float32_to_float16(hidden, top_blobs[1], opt);
-    }
-
-    return 0;
-}
-
-#endif
-
 } // namespace ncnn
diff --git a/src/layer/riscv/gru_riscv.h b/src/layer/riscv/gru_riscv.h
index 3d5532e198d..32d75d83d58 100644
--- a/src/layer/riscv/gru_riscv.h
+++ b/src/layer/riscv/gru_riscv.h
@@ -29,7 +29,7 @@ class GRU_riscv : public GRU
     virtual int create_pipeline(const Option& opt);
 
 protected:
-#if __riscv_vector && __riscv_zvfh
+#if NCNN_ZVFH
     int forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
     int forward_fp16s(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
     int forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
diff --git a/src/layer/riscv/gru_riscv_zvfh.cpp b/src/layer/riscv/gru_riscv_zvfh.cpp
new file mode 100644
index 00000000000..d714d690e88
--- /dev/null
+++ b/src/layer/riscv/gru_riscv_zvfh.cpp
@@ -0,0 +1,661 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "gru_riscv.h"
+
+#if __riscv_vector
+#include <riscv_vector.h>
+#endif // __riscv_vector
+
+namespace ncnn {
+
+#if __riscv_zvfh
+static int gru_fp16s(const Mat& bottom_blob, Mat& top_blob, int reverse, const Mat& weight_xc, const Mat& bias_c, const Mat& weight_hc, Mat& hidden_state, const Option& opt)
+{
+    int size = bottom_blob.w;
+    int T = bottom_blob.h;
+
+    int num_output = top_blob.w;
+
+    // 2 x num_output
+    Mat gates(2, num_output, 4u, opt.workspace_allocator);
+    if (gates.empty())
+        return -100;
+
+    // unroll
+    for (int t = 0; t < T; t++)
+    {
+        int ti = reverse ? T - 1 - t : t;
+
+        const __fp16* x = bottom_blob.row<const __fp16>(ti);
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int q = 0; q < num_output; q++)
+        {
+            float* gates_data = gates.row(q);
+
+            // gate reset update
+            const float* bias_c_R = bias_c.row(0);
+            const float* bias_c_U = bias_c.row(1);
+
+            const float* weight_xc_R = weight_xc.row(num_output * 0 + q);
+            const float* weight_xc_U = weight_xc.row(num_output * 1 + q);
+            const float* weight_hc_R = weight_hc.row(num_output * 0 + q);
+            const float* weight_hc_U = weight_hc.row(num_output * 1 + q);
+
+            float R = bias_c_R[q];
+            float U = bias_c_U[q];
+
+            int n = size;
+            const __fp16* ptr_x = x;
+            const float* ptr_xcr = weight_xc_R;
+            const float* ptr_xcu = weight_xc_U;
+            while (n > 0)
+            {
+                size_t vl = __riscv_vsetvl_e16m4(n);
+                vfloat32m8_t _x = __riscv_vfwcvt_f_f_v_f32m8(__riscv_vle16_v_f16m4(ptr_x, vl), vl);
+                vfloat32m8_t _xcr = __riscv_vle32_v_f32m8(ptr_xcr, vl);
+                vfloat32m8_t _xcu = __riscv_vle32_v_f32m8(ptr_xcu, vl);
+                vfloat32m1_t _scalar_r = __riscv_vfmv_s_f_f32m1(R, vl);
+                vfloat32m1_t _scalar_u = __riscv_vfmv_s_f_f32m1(U, vl);
+
+                _xcr = __riscv_vfmul_vv_f32m8(_xcr, _x, vl);
+                _xcu = __riscv_vfmul_vv_f32m8(_xcu, _x, vl);
+                _scalar_r = __riscv_vfredusum_vs_f32m8_f32m1(_xcr, _scalar_r, vl);
+                _scalar_u = __riscv_vfredusum_vs_f32m8_f32m1(_xcu, _scalar_u, vl);
+
+                R = __riscv_vfmv_f_s_f32m1_f32(_scalar_r);
+                U = __riscv_vfmv_f_s_f32m1_f32(_scalar_u);
+
+                ptr_x += vl;
+                ptr_xcr += vl;
+                ptr_xcu += vl;
+                n -= vl;
+            }
+            ptr_x = NULL;
+            ptr_xcr = NULL;
+            ptr_xcu = NULL;
+
+            int n_out = num_output;
+            const float* ptr_hc = hidden_state;
+            const float* ptr_hcr = weight_hc_R;
+            const float* ptr_hcu = weight_hc_U;
+            while (n_out > 0)
+            {
+                size_t vl = __riscv_vsetvl_e16m4(n_out);
+                vfloat32m8_t _h_cont = __riscv_vle32_v_f32m8(ptr_hc, vl);
+                vfloat32m8_t _hcr = __riscv_vle32_v_f32m8(ptr_hcr, vl);
+                vfloat32m8_t _hcu = __riscv_vle32_v_f32m8(ptr_hcu, vl);
+                vfloat32m1_t _scalar_r = __riscv_vfmv_s_f_f32m1(R, vl);
+                vfloat32m1_t _scalar_u = __riscv_vfmv_s_f_f32m1(U, vl);
+
+                _hcr = __riscv_vfmul_vv_f32m8(_hcr, _h_cont, vl);
+                _hcu = __riscv_vfmul_vv_f32m8(_hcu, _h_cont, vl);
+                _scalar_r = __riscv_vfredusum_vs_f32m8_f32m1(_hcr, _scalar_r, vl);
+                _scalar_u = __riscv_vfredusum_vs_f32m8_f32m1(_hcu, _scalar_u, vl);
+
+                R = __riscv_vfmv_f_s_f32m1_f32(_scalar_r);
+                U = __riscv_vfmv_f_s_f32m1_f32(_scalar_u);
+
+                ptr_hc += vl;
+                ptr_hcr += vl;
+                ptr_hcu += vl;
+                n_out -= vl;
+            }
+            ptr_hc = NULL;
+            ptr_hcr = NULL;
+            ptr_hcu = NULL;
+
+            // sigmoid(R)
+            // sigmoid(U)
+            R = 1.f / (1.f + exp(-R));
+            U = 1.f / (1.f + exp(-U));
+
+            // gate new
+            const float* bias_c_WN = bias_c.row(2);
+            const float* bias_c_BN = bias_c.row(3);
+
+            const float* weight_xc_N = weight_xc.row(num_output * 2 + q);
+            const float* weight_hc_N = weight_hc.row(num_output * 2 + q);
+
+            float N = bias_c_BN[q];
+
+            int n_out2 = num_output;
+            const float* ptr_hc2 = hidden_state;
+            const float* ptr_whc_n = weight_hc_N;
+            while (n_out2 > 0)
+            {
+                size_t vl = __riscv_vsetvl_e16m4(n_out2);
+
+                vfloat32m8_t _h_cont = __riscv_vle32_v_f32m8(ptr_hc2, vl);
+                vfloat32m8_t _whc_n = __riscv_vle32_v_f32m8(ptr_whc_n, vl);
+                vfloat32m1_t _scalar_n = __riscv_vfmv_s_f_f32m1(N, vl);
+
+                _h_cont = __riscv_vfmul_vv_f32m8(_whc_n, _h_cont, vl);
+                _scalar_n = __riscv_vfredusum_vs_f32m8_f32m1(_h_cont, _scalar_n, vl);
+
+                N = __riscv_vfmv_f_s_f32m1_f32(_scalar_n);
+                n_out2 -= vl;
+                ptr_hc2 += vl;
+                ptr_whc_n += vl;
+            }
+            ptr_hc2 = NULL;
+            ptr_whc_n = NULL;
+
+            N = bias_c_WN[q] + R * N;
+
+            int n2 = size;
+            const __fp16* ptr_x2 = x;
+            const float* ptr_xcn = weight_xc_N;
+            while (n2 > 0)
+            {
+                size_t vl = __riscv_vsetvl_e16m4(n2);
+
+                vfloat32m8_t _x = __riscv_vfwcvt_f_f_v_f32m8(__riscv_vle16_v_f16m4(ptr_x2, vl), vl);
+                vfloat32m8_t _xcn = __riscv_vle32_v_f32m8(ptr_xcn, vl);
+                vfloat32m1_t _scalar_n = __riscv_vfmv_s_f_f32m1(N, vl);
+
+                _xcn = __riscv_vfmul_vv_f32m8(_x, _xcn, vl);
+                _scalar_n = __riscv_vfredusum_vs_f32m8_f32m1(_xcn, _scalar_n, vl);
+                N = __riscv_vfmv_f_s_f32m1_f32(_scalar_n);
+
+                n2 -= vl;
+                ptr_x2 += vl;
+                ptr_xcn += vl;
+            }
+            ptr_x2 = NULL;
+            ptr_xcn = NULL;
+
+            // tanh(N)
+            N = tanh(N);
+
+            gates_data[0] = U;
+            gates_data[1] = N;
+        }
+
+        // h_t := (1 - update) .* new + update .* h_{t-1}
+        __fp16* output_data = top_blob.row<__fp16>(ti);
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int q = 0; q < num_output; q++)
+        {
+            const float* gates_data = gates.row(q);
+
+            float U = gates_data[0];
+            float N = gates_data[1];
+
+            float H = (1 - U) * N + U * hidden_state[q];
+
+            hidden_state[q] = H;
+            output_data[q] = (__fp16)H;
+        }
+    }
+
+    return 0;
+}
+
+int GRU_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
+{
+    int T = bottom_blob.h;
+
+    int num_directions = direction == 2 ? 2 : 1;
+    // initial hidden state
+    Mat hidden(num_output, 4u, opt.workspace_allocator);
+    if (hidden.empty())
+        return -100;
+    hidden.fill(0.f);
+
+    top_blob.create(num_output * num_directions, T, 2u, opt.blob_allocator);
+    if (top_blob.empty())
+        return -100;
+
+    // Uni directional
+    if (direction == 0 || direction == 1)
+    {
+        int ret = gru_fp16s(bottom_blob, top_blob, direction, weight_xc_data.channel(0), bias_c_data.channel(0), weight_hc_data.channel(0), hidden, opt);
+        if (ret != 0)
+            return ret;
+    }
+
+    if (direction == 2)
+    {
+        Mat top_blob_forward(num_output, T, 2u, opt.workspace_allocator);
+        if (top_blob_forward.empty())
+            return -100;
+
+        Mat top_blob_reverse(num_output, T, 2u, opt.workspace_allocator);
+        if (top_blob_reverse.empty())
+            return -100;
+
+        int ret0 = gru_fp16s(bottom_blob, top_blob_forward, 0, weight_xc_data.channel(0), bias_c_data.channel(0), weight_hc_data.channel(0), hidden, opt);
+        if (ret0 != 0)
+            return ret0;
+
+        hidden.fill(0.0f);
+
+        int ret1 = gru_fp16s(bottom_blob, top_blob_reverse, 1, weight_xc_data.channel(1), bias_c_data.channel(1), weight_hc_data.channel(1), hidden, opt);
+        if (ret1 != 0)
+            return ret1;
+
+        // concat w
+        for (int i = 0; i < T; i++)
+        {
+            const __fp16* pf = top_blob_forward.row<const __fp16>(i);
+            const __fp16* pr = top_blob_reverse.row<const __fp16>(i);
+            __fp16* ptr = top_blob.row<__fp16>(i);
+
+            memcpy(ptr, pf, num_output * sizeof(__fp16));
+            memcpy(ptr + num_output, pr, num_output * sizeof(__fp16));
+        }
+    }
+
+    return 0;
+}
+
+int GRU_riscv::forward_fp16s(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
+{
+    const Mat& bottom_blob = bottom_blobs[0];
+    int T = bottom_blob.h;
+    int num_directions = direction == 2 ? 2 : 1;
+
+    Mat hidden;
+    Allocator* hidden_allocator = top_blobs.size() == 2 ? opt.blob_allocator : opt.workspace_allocator;
+    if (bottom_blobs.size() == 2)
+    {
+        Option opt_cast = opt;
+        opt_cast.blob_allocator = hidden_allocator;
+        cast_float16_to_float32(bottom_blobs[1], hidden, opt_cast);
+    }
+    else
+    {
+        hidden.create(num_output, num_directions, 4u, hidden_allocator);
+        if (hidden.empty())
+            return -100;
+        hidden.fill(0.f);
+    }
+
+    Mat& top_blob = top_blobs[0];
+    top_blob.create(num_output * num_directions, T, 2u, opt.blob_allocator);
+    if (top_blob.empty())
+        return -100;
+
+    // Uni directional
+    if (direction == 0 || direction == 1)
+    {
+        Mat hidden0 = hidden.row_range(0, 1);
+        int ret = gru_fp16s(bottom_blob, top_blob, direction, weight_xc_data.channel(0), bias_c_data.channel(0), weight_hc_data.channel(0), hidden0, opt);
+        if (ret != 0)
+            return ret;
+    }
+
+    if (direction == 2)
+    {
+        Mat top_blob_forward(num_output, T, 2u, opt.workspace_allocator);
+        if (top_blob_forward.empty())
+            return -100;
+
+        Mat top_blob_reverse(num_output, T, 2u, opt.workspace_allocator);
+        if (top_blob_reverse.empty())
+            return -100;
+
+        Mat hidden0 = hidden.row_range(0, 1);
+        int ret0 = gru_fp16s(bottom_blob, top_blob_forward, 0, weight_xc_data.channel(0), bias_c_data.channel(0), weight_hc_data.channel(0), hidden0, opt);
+        if (ret0 != 0)
+            return ret0;
+
+        Mat hidden1 = hidden.row_range(1, 1);
+        int ret1 = gru_fp16s(bottom_blob, top_blob_reverse, 1, weight_xc_data.channel(1), bias_c_data.channel(1), weight_hc_data.channel(1), hidden1, opt);
+        if (ret1 != 0)
+            return ret1;
+
+        // concat w
+        for (int i = 0; i < T; i++)
+        {
+            const __fp16* pf = top_blob_forward.row<const __fp16>(i);
+            const __fp16* pr = top_blob_reverse.row<const __fp16>(i);
+            __fp16* ptr = top_blob.row<__fp16>(i);
+
+            memcpy(ptr, pf, num_output * sizeof(__fp16));
+            memcpy(ptr + num_output, pr, num_output * sizeof(__fp16));
+        }
+    }
+
+    if (top_blobs.size() == 2)
+    {
+        cast_float32_to_float16(hidden, top_blobs[1], opt);
+    }
+
+    return 0;
+}
+
+int GRU_riscv::create_pipeline_fp16sa(const Option& opt)
+{
+    cast_float32_to_float16(weight_xc_data, weight_xc_data_fp16sa, opt);
+    cast_float32_to_float16(weight_hc_data, weight_hc_data_fp16sa, opt);
+    cast_float32_to_float16(bias_c_data, bias_c_data_fp16sa, opt);
+
+    if (opt.lightmode)
+    {
+        weight_xc_data.release();
+        bias_c_data.release();
+        weight_hc_data.release();
+    }
+
+    return 0;
+}
+
+static int gru_fp16sa(const Mat& bottom_blob, Mat& top_blob, int reverse, const Mat& weight_xc, const Mat& bias_c, const Mat& weight_hc, Mat& hidden_state, const Option& opt)
+{
+    int size = bottom_blob.w;
+    int T = bottom_blob.h;
+
+    int num_output = top_blob.w;
+
+    // 2 x num_output
+    Mat gates(2, num_output, 4u, opt.workspace_allocator);
+    if (gates.empty())
+        return -100;
+
+    // unroll
+    for (int t = 0; t < T; t++)
+    {
+        int ti = reverse ? T - 1 - t : t;
+
+        const __fp16* x = bottom_blob.row<const __fp16>(ti);
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int q = 0; q < num_output; q++)
+        {
+            float* gates_data = gates.row(q);
+
+            // gate reset update
+            const __fp16* bias_c_R = bias_c.row<const __fp16>(0);
+            const __fp16* bias_c_U = bias_c.row<const __fp16>(1);
+
+            const __fp16* weight_xc_R = weight_xc.row<const __fp16>(num_output * 0 + q);
+            const __fp16* weight_xc_U = weight_xc.row<const __fp16>(num_output * 1 + q);
+            const __fp16* weight_hc_R = weight_hc.row<const __fp16>(num_output * 0 + q);
+            const __fp16* weight_hc_U = weight_hc.row<const __fp16>(num_output * 1 + q);
+
+            __fp16 R = bias_c_R[q];
+            __fp16 U = bias_c_U[q];
+
+            int n = size;
+            const __fp16* ptr_x = x;
+            const __fp16* ptr_xcr = weight_xc_R;
+            const __fp16* ptr_xcu = weight_xc_U;
+            while (n > 0)
+            {
+                size_t vl = __riscv_vsetvl_e16m8(n);
+                vfloat16m8_t _x = __riscv_vle16_v_f16m8(ptr_x, vl);
+                vfloat16m8_t _xcr = __riscv_vle16_v_f16m8(ptr_xcr, vl);
+                vfloat16m8_t _xcu = __riscv_vle16_v_f16m8(ptr_xcu, vl);
+                vfloat16m1_t _scalar_r = __riscv_vfmv_s_f_f16m1(R, vl);
+                vfloat16m1_t _scalar_u = __riscv_vfmv_s_f_f16m1(U, vl);
+
+                _xcr = __riscv_vfmul_vv_f16m8(_xcr, _x, vl);
+                _xcu = __riscv_vfmul_vv_f16m8(_xcu, _x, vl);
+                _scalar_r = __riscv_vfredusum_vs_f16m8_f16m1(_xcr, _scalar_r, vl);
+                _scalar_u = __riscv_vfredusum_vs_f16m8_f16m1(_xcu, _scalar_u, vl);
+
+                R = __riscv_vfmv_f_s_f16m1_f16(_scalar_r);
+                U = __riscv_vfmv_f_s_f16m1_f16(_scalar_u);
+
+                ptr_x += vl;
+                ptr_xcr += vl;
+                ptr_xcu += vl;
+                n -= vl;
+            }
+
+            int n_out = num_output;
+            const float* ptr_hc = hidden_state;
+            const __fp16* ptr_hcr = weight_hc_R;
+            const __fp16* ptr_hcu = weight_hc_U;
+            while (n_out > 0)
+            {
+                size_t vl = __riscv_vsetvl_e16m4(n_out);
+                vfloat16m4_t _h_cont = __riscv_vfncvt_f_f_w_f16m4(__riscv_vle32_v_f32m8(ptr_hc, vl), vl);
+                vfloat16m4_t _hcr = __riscv_vle16_v_f16m4(ptr_hcr, vl);
+                vfloat16m4_t _hcu = __riscv_vle16_v_f16m4(ptr_hcu, vl);
+                vfloat16m1_t _scalar_r = __riscv_vfmv_s_f_f16m1(R, vl);
+                vfloat16m1_t _scalar_u = __riscv_vfmv_s_f_f16m1(U, vl);
+
+                _hcr = __riscv_vfmul_vv_f16m4(_hcr, _h_cont, vl);
+                _hcu = __riscv_vfmul_vv_f16m4(_hcu, _h_cont, vl);
+                _scalar_r = __riscv_vfredusum_vs_f16m4_f16m1(_hcr, _scalar_r, vl);
+                _scalar_u = __riscv_vfredusum_vs_f16m4_f16m1(_hcu, _scalar_u, vl);
+
+                R = __riscv_vfmv_f_s_f16m1_f16(_scalar_r);
+                U = __riscv_vfmv_f_s_f16m1_f16(_scalar_u);
+
+                ptr_hc += vl;
+                ptr_hcr += vl;
+                ptr_hcu += vl;
+                n_out -= vl;
+            }
+
+            // sigmoid(R)
+            // sigmoid(U)
+            R = 1.f / (1.f + (__fp16)exp((float)(-R)));
+            U = 1.f / (1.f + (__fp16)exp((float)(-U)));
+
+            // gate new
+            const __fp16* bias_c_WN = bias_c.row<const __fp16>(2);
+            const __fp16* bias_c_BN = bias_c.row<const __fp16>(3);
+
+            const __fp16* weight_xc_N = weight_xc.row<const __fp16>(num_output * 2 + q);
+            const __fp16* weight_hc_N = weight_hc.row<const __fp16>(num_output * 2 + q);
+
+            __fp16 N = bias_c_BN[q];
+
+            int n_out2 = num_output;
+            const float* ptr_hc2 = hidden_state;
+            const __fp16* ptr_whc_n = weight_hc_N;
+            while (n_out2 > 0)
+            {
+                size_t vl = __riscv_vsetvl_e16m4(n_out2);
+
+                vfloat16m4_t _h_cont = __riscv_vfncvt_f_f_w_f16m4(__riscv_vle32_v_f32m8(ptr_hc2, vl), vl);
+                vfloat16m4_t _whc_n = __riscv_vle16_v_f16m4(ptr_whc_n, vl);
+                vfloat16m1_t _scalar_n = __riscv_vfmv_s_f_f16m1(N, vl);
+
+                _h_cont = __riscv_vfmul_vv_f16m4(_whc_n, _h_cont, vl);
+                _scalar_n = __riscv_vfredusum_vs_f16m4_f16m1(_h_cont, _scalar_n, vl);
+
+                N = __riscv_vfmv_f_s_f16m1_f16(_scalar_n);
+                n_out2 -= vl;
+                ptr_hc2 += vl;
+                ptr_whc_n += vl;
+            }
+            N = bias_c_WN[q] + R * N;
+
+            int n2 = size;
+            const __fp16* ptr_x2 = x;
+            const __fp16* ptr_xcn = weight_xc_N;
+            while (n2 > 0)
+            {
+                size_t vl = __riscv_vsetvl_e16m8(n2);
+
+                vfloat16m8_t _x = __riscv_vle16_v_f16m8(ptr_x2, vl);
+                vfloat16m8_t _xcn = __riscv_vle16_v_f16m8(ptr_xcn, vl);
+                vfloat16m1_t _scalar_n = __riscv_vfmv_s_f_f16m1(N, vl);
+
+                _xcn = __riscv_vfmul_vv_f16m8(_x, _xcn, vl);
+                _scalar_n = __riscv_vfredusum_vs_f16m8_f16m1(_xcn, _scalar_n, vl);
+                N = __riscv_vfmv_f_s_f16m1_f16(_scalar_n);
+
+                n2 -= vl;
+                ptr_x2 += vl;
+                ptr_xcn += vl;
+            }
+
+            // tanh(N)
+            N = (__fp16)tanh((float)N);
+
+            gates_data[0] = U;
+            gates_data[1] = N;
+        }
+
+        // h_t := (1 - update) .* new + update .* h_{t-1}
+        __fp16* output_data = top_blob.row<__fp16>(ti);
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int q = 0; q < num_output; q++)
+        {
+            const float* gates_data = gates.row(q);
+
+            float U = gates_data[0];
+            float N = gates_data[1];
+
+            float H = (1 - U) * N + U * hidden_state[q];
+
+            hidden_state[q] = H;
+            output_data[q] = H;
+        }
+    }
+
+    return 0;
+}
+
+int GRU_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
+{
+    int T = bottom_blob.h;
+
+    int num_directions = direction == 2 ? 2 : 1;
+    // initial hidden state
+    Mat hidden(num_output, 4u, opt.workspace_allocator);
+    if (hidden.empty())
+        return -100;
+    hidden.fill(0.f);
+
+    top_blob.create(num_output * num_directions, T, 2u, opt.blob_allocator);
+    if (top_blob.empty())
+        return -100;
+
+    // Uni directional
+    if (direction == 0 || direction == 1)
+    {
+        int ret = gru_fp16sa(bottom_blob, top_blob, direction, weight_xc_data_fp16sa.channel(0), bias_c_data_fp16sa.channel(0), weight_hc_data_fp16sa.channel(0), hidden, opt);
+        if (ret != 0)
+            return ret;
+    }
+
+    if (direction == 2)
+    {
+        Mat top_blob_forward(num_output, T, 2u, opt.workspace_allocator);
+        if (top_blob_forward.empty())
+            return -100;
+
+        Mat top_blob_reverse(num_output, T, 2u, opt.workspace_allocator);
+        if (top_blob_reverse.empty())
+            return -100;
+
+        int ret0 = gru_fp16sa(bottom_blob, top_blob_forward, 0, weight_xc_data_fp16sa.channel(0), bias_c_data_fp16sa.channel(0), weight_hc_data_fp16sa.channel(0), hidden, opt);
+        if (ret0 != 0)
+            return ret0;
+
+        hidden.fill(0.0f);
+
+        int ret1 = gru_fp16sa(bottom_blob, top_blob_reverse, 1, weight_xc_data_fp16sa.channel(1), bias_c_data_fp16sa.channel(1), weight_hc_data_fp16sa.channel(1), hidden, opt);
+        if (ret1 != 0)
+            return ret1;
+
+        // concat w
+        for (int i = 0; i < T; i++)
+        {
+            const __fp16* pf = top_blob_forward.row<const __fp16>(i);
+            const __fp16* pr = top_blob_reverse.row<const __fp16>(i);
+            __fp16* ptr = top_blob.row<__fp16>(i);
+
+            memcpy(ptr, pf, num_output * sizeof(__fp16));
+            memcpy(ptr + num_output, pr, num_output * sizeof(__fp16));
+        }
+    }
+
+    return 0;
+}
+
+int GRU_riscv::forward_fp16sa(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
+{
+    const Mat& bottom_blob = bottom_blobs[0];
+    int T = bottom_blob.h;
+    int num_directions = direction == 2 ? 2 : 1;
+
+    Mat hidden;
+    Allocator* hidden_allocator = top_blobs.size() == 2 ? opt.blob_allocator : opt.workspace_allocator;
+    if (bottom_blobs.size() == 2)
+    {
+        Option opt_cast = opt;
+        opt_cast.blob_allocator = hidden_allocator;
+        cast_float16_to_float32(bottom_blobs[1], hidden, opt_cast);
+    }
+    else
+    {
+        hidden.create(num_output, num_directions, 4u, hidden_allocator);
+        if (hidden.empty())
+            return -100;
+        hidden.fill(0.f);
+    }
+
+    Mat& top_blob = top_blobs[0];
+    top_blob.create(num_output * num_directions, T, 2u, opt.blob_allocator);
+    if (top_blob.empty())
+        return -100;
+
+    // Uni directional
+    if (direction == 0 || direction == 1)
+    {
+        int ret = gru_fp16sa(bottom_blob, top_blob, direction, weight_xc_data_fp16sa.channel(0), bias_c_data_fp16sa.channel(0), weight_hc_data_fp16sa.channel(0), hidden, opt);
+        if (ret != 0)
+            return ret;
+    }
+
+    if (direction == 2)
+    {
+        Mat top_blob_forward(num_output, T, 2u, opt.workspace_allocator);
+        if (top_blob_forward.empty())
+            return -100;
+
+        Mat top_blob_reverse(num_output, T, 2u, opt.workspace_allocator);
+        if (top_blob_reverse.empty())
+            return -100;
+
+        Mat hidden0 = hidden.row_range(0, 1);
+        int ret0 = gru_fp16sa(bottom_blob, top_blob_forward, 0, weight_xc_data_fp16sa.channel(0), bias_c_data_fp16sa.channel(0), weight_hc_data_fp16sa.channel(0), hidden0, opt);
+        if (ret0 != 0)
+            return ret0;
+
+        Mat hidden1 = hidden.row_range(1, 1);
+        int ret1 = gru_fp16sa(bottom_blob, top_blob_reverse, 1, weight_xc_data_fp16sa.channel(1), bias_c_data_fp16sa.channel(1), weight_hc_data_fp16sa.channel(1), hidden1, opt);
+        if (ret1 != 0)
+            return ret1;
+
+        // concat w
+        for (int i = 0; i < T; i++)
+        {
+            const __fp16* pf = top_blob_forward.row<const __fp16>(i);
+            const __fp16* pr = top_blob_reverse.row<const __fp16>(i);
+            __fp16* ptr = top_blob.row<__fp16>(i);
+
+            memcpy(ptr, pf, num_output * sizeof(__fp16));
+            memcpy(ptr + num_output, pr, num_output * sizeof(__fp16));
+        }
+    }
+
+    if (top_blobs.size() == 2)
+    {
+        cast_float32_to_float16(hidden, top_blobs[1], opt);
+    }
+
+    return 0;
+}
+#endif // __riscv_zvfh
+
+} // namespace ncnn
diff --git a/src/layer/riscv/instancenorm_riscv.cpp b/src/layer/riscv/instancenorm_riscv.cpp
index f7ee8ec9ed4..95a39a20ba5 100644
--- a/src/layer/riscv/instancenorm_riscv.cpp
+++ b/src/layer/riscv/instancenorm_riscv.cpp
@@ -94,7 +94,7 @@ int InstanceNorm_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt)
         }
 #endif // __riscv_vector
         float mean = sum / size;
-#if __riscv_vecotr
+#if __riscv_vector
         {
             int n = size;
             float* ptr_sqsum = ptr;
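
---
Note on the dispatch pattern this patch adopts: the extension-specific translation unit (gru_riscv_zvfh.cpp) is compiled with zvfh codegen enabled, while the generic file only tests the NCNN_ZVFH build flag plus the runtime probe cpu_support_riscv_zvfh(). The standalone sketch below illustrates that shape; NCNN_ZVFH and cpu_support_riscv_zvfh come from the patch, but the skeleton itself is an assumption for illustration, not ncnn source.

```cpp
// Minimal sketch of compile-time gate + runtime probe dispatch (assumption,
// not ncnn code). In ncnn the fp16 kernel definition lives in a separate TU
// built with the Zvfh extension enabled; the caller stays extension-free.
#include <cstdio>

#ifndef NCNN_ZVFH
#define NCNN_ZVFH 1 // normally set by the build system when zvfh codegen is available
#endif

// Stand-in for ncnn's runtime capability probe; the real one inspects the CPU.
static bool cpu_support_riscv_zvfh()
{
    return false; // conservative default for this sketch
}

static int forward_fp32()
{
    printf("fp32 path\n");
    return 0;
}

#if NCNN_ZVFH
// In ncnn this definition would sit in the *_zvfh.cpp translation unit.
static int forward_fp16s()
{
    printf("fp16 path\n");
    return 0;
}
#endif

int forward(bool use_fp16_storage, int elembits)
{
#if NCNN_ZVFH
    // Both conditions must hold: the binary contains the kernel AND the
    // running CPU actually implements zvfh.
    if (cpu_support_riscv_zvfh() && use_fp16_storage && elembits == 16)
        return forward_fp16s();
#endif
    return forward_fp32();
}

int main()
{
    return forward(true, 16);
}
```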