Riscv64 c906 d1 #3177

Open
wants to merge 14 commits into master
219 changes: 163 additions & 56 deletions src/layer/riscv/convolution_sgemm_packn_fp16s.h
@@ -55,20 +55,35 @@ static void im2col_sgemm_packn_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_blo
for (int k = 0; k < maxk; k++)
{
#if RVV_SPEC_0_7
for (int l = 0; l < packn; l++)
{
tmpptr[0] = img0[l];
tmpptr[1] = img0[l + packn];
tmpptr[2] = img0[l + packn * 2];
tmpptr[3] = img0[l + packn * 3];
tmpptr[4] = img0[l + packn * 4];
tmpptr[5] = img0[l + packn * 5];
tmpptr[6] = img0[l + packn * 6];
tmpptr[7] = img0[l + packn * 7];
tmpptr += 8;
}
asm volatile(
"mv t3, %[LEN] \n\t"
"mv t1, %[SRC] \n\t"
"mv t2, %[TMP] \n\t"
"slli t3, t3, 1 \n\t"
"vle.v v0, (t1) \n\t"
"add t1, t1, t3 \n\t"
"vle.v v1, (t1) \n\t"
"add t1, t1, t3 \n\t"
"vle.v v2, (t1) \n\t"
"add t1, t1, t3 \n\t"
"vle.v v3, (t1) \n\t"
"add t1, t1, t3 \n\t"
"vle.v v4, (t1) \n\t"
"add t1, t1, t3 \n\t"
"vle.v v5, (t1) \n\t"
"add t1, t1, t3 \n\t"
"vle.v v6, (t1) \n\t"
"add t1, t1, t3 \n\t"
"vle.v v7, (t1) \n\t"
"add t1, t1, t3 \n\t"
"vsseg8e.v v0, (t2) \n\t"
:
: [LEN] "r"(packn), [SRC] "r"(img0), [TMP] "r"(tmpptr)
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "t1", "t2", "t3");

img0 += size * packn;
tmpptr += packn * 8;

#else
vfloat16m1_t _val0 = vle16_v_f16m1(img0, vl);
vfloat16m1_t _val1 = vle16_v_f16m1(img0 + packn, vl);
Expand Down Expand Up @@ -104,16 +119,25 @@ static void im2col_sgemm_packn_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_blo
for (int k = 0; k < maxk; k++)
{
#if RVV_SPEC_0_7
for (int l = 0; l < packn; l++)
{
tmpptr[0] = img0[l];
tmpptr[1] = img0[l + packn];
tmpptr[2] = img0[l + packn * 2];
tmpptr[3] = img0[l + packn * 3];
tmpptr += 4;
}
asm volatile(
"mv t3, %[LEN] \n\t"
"mv t1, %[SRC] \n\t"
"mv t2, %[TMP] \n\t"
"slli t3, t3, 1 \n\t"
"vle.v v0, (t1) \n\t"
"add t1, t1, t3 \n\t"
"vle.v v1, (t1) \n\t"
"add t1, t1, t3 \n\t"
"vle.v v2, (t1) \n\t"
"add t1, t1, t3 \n\t"
"vle.v v3, (t1) \n\t"
"vsseg4e.v v0, (t2) \n\t"
:
: [LEN] "r"(packn), [SRC] "r"(img0), [TMP] "r"(tmpptr)
: "cc", "memory", "v0", "v1", "v2", "v3", "t1", "t2", "t3");

img0 += size * packn;
tmpptr += packn * 4;
#else
vfloat16m1_t _val0 = vle16_v_f16m1(img0, vl);
vfloat16m1_t _val1 = vle16_v_f16m1(img0 + packn, vl);
Expand Down Expand Up @@ -146,14 +170,21 @@ static void im2col_sgemm_packn_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_blo
for (int k = 0; k < maxk; k++)
{
#if RVV_SPEC_0_7
for (int l = 0; l < packn; l++)
{
tmpptr[0] = img0[l];
tmpptr[1] = img0[l + packn];
tmpptr += 2;
}
asm volatile(
"mv t3, %[LEN] \n\t"
"mv t1, %[SRC] \n\t"
"mv t2, %[TMP] \n\t"
"slli t3, t3, 1 \n\t"
"vle.v v0, (t1) \n\t"
"add t1, t1, t3 \n\t"
"vle.v v1, (t1) \n\t"
"add t1, t1, t3 \n\t"
:
: [LEN] "r"(packn), [SRC] "r"(img0), [TMP] "r"(tmpptr)
: "cc", "memory", "v0", "v1", "t1", "t2", "t3");

img0 += size * packn;
tmpptr += packn * 2;
#else
vfloat16m1_t _val0 = vle16_v_f16m1(img0, vl);
vfloat16m1_t _val1 = vle16_v_f16m1(img0 + packn, vl);
Expand Down Expand Up @@ -225,25 +256,53 @@ static void im2col_sgemm_packn_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_blo

for (int j = 0; j < nn; j++)
{
__fp16 val0 = *tmpptr++;
__fp16 val1 = *tmpptr++;
__fp16 val2 = *tmpptr++;
__fp16 val3 = *tmpptr++;
__fp16 val4 = *tmpptr++;
__fp16 val5 = *tmpptr++;
__fp16 val6 = *tmpptr++;
__fp16 val7 = *tmpptr++;
#if RVV_SPEC_0_7
vfloat16m1_t _v0 = vle16_v_f16m1(tmpptr, vl);
vfloat16m1_t _val0 = vrgathervx_float16xm1(_v0, 0, vl);
vfloat16m1_t _val1 = vrgathervx_float16xm1(_v0, 1, vl);
vfloat16m1_t _val2 = vrgathervx_float16xm1(_v0, 2, vl);
vfloat16m1_t _val3 = vrgathervx_float16xm1(_v0, 3, vl);
vfloat16m1_t _val4 = vrgathervx_float16xm1(_v0, 4, vl);
vfloat16m1_t _val5 = vrgathervx_float16xm1(_v0, 5, vl);
vfloat16m1_t _val6 = vrgathervx_float16xm1(_v0, 6, vl);
vfloat16m1_t _val7 = vrgathervx_float16xm1(_v0, 7, vl);
tmpptr += 8;

vfloat16m1_t _w0 = vle16_v_f16m1(kptr0, vl);
_sum0 = vfmacc_vv_f16m1(_sum0, _val0, _w0, vl);
_sum1 = vfmacc_vv_f16m1(_sum1, _val1, _w0, vl);
_sum2 = vfmacc_vv_f16m1(_sum2, _val2, _w0, vl);
_sum3 = vfmacc_vv_f16m1(_sum3, _val3, _w0, vl);
_sum4 = vfmacc_vv_f16m1(_sum4, _val4, _w0, vl);
_sum5 = vfmacc_vv_f16m1(_sum5, _val5, _w0, vl);
_sum6 = vfmacc_vv_f16m1(_sum6, _val6, _w0, vl);
_sum7 = vfmacc_vv_f16m1(_sum7, _val7, _w0, vl);

kptr0 += packn;
#else
vfloat16m1_t _v0 = vle16_v_f16m1(tmpptr, vl);
vfloat16m1_t _val0 = vrgather_vx_f16m1(_v0, 0, vl);
vfloat16m1_t _val1 = vrgather_vx_f16m1(_v0, 1, vl);
vfloat16m1_t _val2 = vrgather_vx_f16m1(_v0, 2, vl);
vfloat16m1_t _val3 = vrgather_vx_f16m1(_v0, 3, vl);
vfloat16m1_t _val4 = vrgather_vx_f16m1(_v0, 4, vl);
vfloat16m1_t _val5 = vrgather_vx_f16m1(_v0, 5, vl);
vfloat16m1_t _val6 = vrgather_vx_f16m1(_v0, 6, vl);
vfloat16m1_t _val7 = vrgather_vx_f16m1(_v0, 7, vl);
tmpptr += 8;

vfloat16m1_t _w0 = vle16_v_f16m1(kptr0, vl);
_sum0 = vfmacc_vf_f16m1(_sum0, val0, _w0, vl);
_sum1 = vfmacc_vf_f16m1(_sum1, val1, _w0, vl);
_sum2 = vfmacc_vf_f16m1(_sum2, val2, _w0, vl);
_sum3 = vfmacc_vf_f16m1(_sum3, val3, _w0, vl);
_sum4 = vfmacc_vf_f16m1(_sum4, val4, _w0, vl);
_sum5 = vfmacc_vf_f16m1(_sum5, val5, _w0, vl);
_sum6 = vfmacc_vf_f16m1(_sum6, val6, _w0, vl);
_sum7 = vfmacc_vf_f16m1(_sum7, val7, _w0, vl);
_sum0 = vfmacc_vv_f16m1(_sum0, _val0, _w0, vl);
_sum1 = vfmacc_vv_f16m1(_sum1, _val1, _w0, vl);
_sum2 = vfmacc_vv_f16m1(_sum2, _val2, _w0, vl);
_sum3 = vfmacc_vv_f16m1(_sum3, _val3, _w0, vl);
_sum4 = vfmacc_vv_f16m1(_sum4, _val4, _w0, vl);
_sum5 = vfmacc_vv_f16m1(_sum5, _val5, _w0, vl);
_sum6 = vfmacc_vv_f16m1(_sum6, _val6, _w0, vl);
_sum7 = vfmacc_vv_f16m1(_sum7, _val7, _w0, vl);

kptr0 += packn;
#endif
Member commented on lines +259 to +305:

Stick to the rvv-1.0 spec for intrinsic code, and define compatibility aliases for rvv-0.7 in riscv_v_071_fix.h.
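For illustration, a minimal sketch of the kind of compatibility alias being suggested here, assuming riscv_v_071_fix.h simply remaps intrinsic names; the header's actual contents are not part of this PR:

// riscv_v_071_fix.h (hypothetical sketch): kernel code keeps the rvv-1.0
// intrinsic names everywhere; on an rvv-0.7 toolchain they are remapped to
// the 0.7 builtins.
#if RVV_SPEC_0_7
#define vrgather_vx_f16m1(vs, idx, vl) vrgathervx_float16xm1(vs, idx, vl)
#endif

With such an alias in place, the duplicated #if RVV_SPEC_0_7 / #else vrgather blocks above would collapse into a single rvv-1.0 code path.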

}

vse16_v_f16m1(outptr0, _sum0, vl);
Expand Down Expand Up @@ -279,17 +338,37 @@ static void im2col_sgemm_packn_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_blo

for (int j = 0; j < nn; j++)
{
__fp16 val0 = *tmpptr++;
__fp16 val1 = *tmpptr++;
__fp16 val2 = *tmpptr++;
__fp16 val3 = *tmpptr++;
#if RVV_SPEC_0_7
vfloat16m1_t _v0 = vle16_v_f16m1(tmpptr, vl / 2);
vfloat16m1_t _val0 = vrgathervx_float16xm1(_v0, 0, vl);
vfloat16m1_t _val1 = vrgathervx_float16xm1(_v0, 1, vl);
vfloat16m1_t _val2 = vrgathervx_float16xm1(_v0, 2, vl);
vfloat16m1_t _val3 = vrgathervx_float16xm1(_v0, 3, vl);
tmpptr += 4;

vfloat16m1_t _w0 = vle16_v_f16m1(kptr0, vl);
_sum0 = vfmacc_vv_f16m1(_sum0, _val0, _w0, vl);
_sum1 = vfmacc_vv_f16m1(_sum1, _val1, _w0, vl);
_sum2 = vfmacc_vv_f16m1(_sum2, _val2, _w0, vl);
_sum3 = vfmacc_vv_f16m1(_sum3, _val3, _w0, vl);

kptr0 += packn;
#else
vfloat16m1_t _v0 = vle16_v_f16m1(tmpptr, vl / 2);
vfloat16m1_t _val0 = vrgather_vx_f16m1(_v0, 0, vl);
vfloat16m1_t _val1 = vrgather_vx_f16m1(_v0, 1, vl);
vfloat16m1_t _val2 = vrgather_vx_f16m1(_v0, 2, vl);
vfloat16m1_t _val3 = vrgather_vx_f16m1(_v0, 3, vl);
tmpptr += 4;

vfloat16m1_t _w0 = vle16_v_f16m1(kptr0, vl);
_sum0 = vfmacc_vf_f16m1(_sum0, val0, _w0, vl);
_sum1 = vfmacc_vf_f16m1(_sum1, val1, _w0, vl);
_sum2 = vfmacc_vf_f16m1(_sum2, val2, _w0, vl);
_sum3 = vfmacc_vf_f16m1(_sum3, val3, _w0, vl);
_sum0 = vfmacc_vv_f16m1(_sum0, _val0, _w0, vl);
_sum1 = vfmacc_vv_f16m1(_sum1, _val1, _w0, vl);
_sum2 = vfmacc_vv_f16m1(_sum2, _val2, _w0, vl);
_sum3 = vfmacc_vv_f16m1(_sum3, _val3, _w0, vl);

kptr0 += packn;
#endif
}

vse16_v_f16m1(outptr0, _sum0, vl);
Expand Down Expand Up @@ -317,13 +396,29 @@ static void im2col_sgemm_packn_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_blo

for (int j = 0; j < nn; j++)
{
__fp16 val0 = *tmpptr++;
__fp16 val1 = *tmpptr++;
#if RVV_SPEC_0_7
vfloat16m1_t _v0 = vle16_v_f16m1(tmpptr, vl / 4);
vfloat16m1_t _val0 = vrgathervx_float16xm1(_v0, 0, vl);
vfloat16m1_t _val1 = vrgathervx_float16xm1(_v0, 1, vl);
tmpptr += 2;

vfloat16m1_t _w0 = vle16_v_f16m1(kptr0, vl);
_sum0 = vfmacc_vf_f16m1(_sum0, val0, _w0, vl);
_sum1 = vfmacc_vf_f16m1(_sum1, val1, _w0, vl);
_sum0 = vfmacc_vv_f16m1(_sum0, _val0, _w0, vl);
_sum1 = vfmacc_vv_f16m1(_sum1, _val1, _w0, vl);

kptr0 += packn;
#else
vfloat16m1_t _v0 = vle16_v_f16m1(tmpptr, vl / 4);
vfloat16m1_t _val0 = vrgather_vx_f16m1(_v0, 0, vl);
vfloat16m1_t _val1 = vrgather_vx_f16m1(_v0, 1, vl);
tmpptr += 2;

vfloat16m1_t _w0 = vle16_v_f16m1(kptr0, vl);
_sum0 = vfmacc_vv_f16m1(_sum0, _val0, _w0, vl);
_sum1 = vfmacc_vv_f16m1(_sum1, _val1, _w0, vl);

kptr0 += packn;
#endif
}

vse16_v_f16m1(outptr0, _sum0, vl);
@@ -347,11 +442,23 @@ static void im2col_sgemm_packn_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_blo

for (int j = 0; j < nn; j++)
{
__fp16 val = *tmpptr++;
#if RVV_SPEC_0_7
vfloat16m1_t _v0 = vle16_v_f16m1(tmpptr++, vl / 8);
vfloat16m1_t _val0 = vrgathervx_float16xm1(_v0, 0, vl);

vfloat16m1_t _w0 = vle16_v_f16m1(kptr0, vl);
_sum = vfmacc_vv_f16m1(_sum, _val0, _w0, vl);

kptr0 += packn;
#else
vfloat16m1_t _v0 = vle16_v_f16m1(tmpptr++, vl / 8);
vfloat16m1_t _val0 = vrgather_vx_f16m1(_v0, 0, vl);

vfloat16m1_t _w0 = vle16_v_f16m1(kptr0, vl);
_sum = vfmacc_vf_f16m1(_sum, val, _w0, vl);
_sum = vfmacc_vv_f16m1(_sum, _val0, _w0, vl);

kptr0 += packn;
#endif
}

vse16_v_f16m1(outptr0, _sum, vl);
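Across all four tile widths, the accumulation in this diff follows one idiom: load a group of packed input lanes, splat each lane across a vector with vrgather, and multiply-accumulate against a weight row with the vector-vector FMA. A condensed sketch of one width-2 step, using the rvv-1.0 intrinsic names from this diff (vl, packn, tmpptr, kptr0 and the _sum accumulators as set up by the surrounding kernel):

vfloat16m1_t _v = vle16_v_f16m1(tmpptr, vl / 4);   // packed lanes; only lanes 0..1 are consumed
vfloat16m1_t _w = vle16_v_f16m1(kptr0, vl);        // one row of weights
vfloat16m1_t _b0 = vrgather_vx_f16m1(_v, 0, vl);   // splat lane 0 across the vector
vfloat16m1_t _b1 = vrgather_vx_f16m1(_v, 1, vl);   // splat lane 1
_sum0 = vfmacc_vv_f16m1(_sum0, _b0, _w, vl);       // _sum0 += lane0 * weight row
_sum1 = vfmacc_vv_f16m1(_sum1, _b1, _w, vl);       // _sum1 += lane1 * weight row

The splat-plus-vfmacc_vv form replaces the earlier vfmacc_vf_f16m1 calls, which took a scalar __fp16 operand; keeping the broadcast in a vector register lets the same multiply-accumulate sequence serve both the rvv-0.7 and rvv-1.0 branches.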