From f6472a7e1233672d2d441d972137665abdd4db31 Mon Sep 17 00:00:00 2001 From: chenjiaoAngel Date: Fri, 20 May 2022 18:23:28 +0800 Subject: [PATCH 1/3] add v7 fp16 scale implement --- .pre-commit-config.yaml | 2 +- lite/backends/arm/math/scale.cc | 157 ++++++++++++++++++++++++ tools/codestyle/cpplint_pre_commit.hook | 4 +- 3 files changed, 160 insertions(+), 3 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 5ebedf10fa3..cca3e6daa6f 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -51,7 +51,7 @@ repos: hooks: - id: copyright_checker name: copyright_checker - entry: python ./tools/codestyle/copyright.hook + entry: python3 ./tools/codestyle/copyright.hook language: system files: \.(c|cc|cxx|cpp|cu|cl|h|hpp|hxx|proto|py|mm|m|metal)$ exclude: (?!.*third_party)^.*$|(?!.*book)^.*$ diff --git a/lite/backends/arm/math/scale.cc b/lite/backends/arm/math/scale.cc index 3687d50d773..b9b6c0cc4ae 100644 --- a/lite/backends/arm/math/scale.cc +++ b/lite/backends/arm/math/scale.cc @@ -863,6 +863,40 @@ inline void scale_compute_fp16(const flaot16_t* din, [remain_cnt] "+r"(remain_cnt) : [vscale] "w"(vscale), [vbias] "w"(vbias) : "cc", "memory", "v4", "v5", "v8", "v9"); +#else + asm volatile( + "cmp %[cnt], #1 \n" + "blt 0f \n" + "1: \n" + "vld1.16 {d8-d9}, [%[din]]! \n" + "vmov q8, %q[vbias] \n" + "vld1.16 {d10-d11}, [%[din]]! \n" + "vmov q9, %q[vbias] \n" + + "vmla.f16 q8, q4, %q[vscale] \n" + "vmla.f16 q9, q5, %q[vscale] \n" + + "subs %[cnt], %[cnt], #1 \n" + "vst1.16 {d16-d19}, [%[dout]]! \n" + "bne 1b \n" + "0: \n" + "cmp %[remain_cnt], #1 \n" + "blt 2f \n" + "3: \n" + "vld1.16 {d8}, [%[din]]! \n" + "vmov d16, %e[vbias] \n" + "vmla.f16 d16, d8, %e[vscale] \n" + "subs %[remain_cnt], %[remain_cnt], #1 \n" + "vst1.16 {d16}, [%[dout]]! 
\n" + "bne 3b \n" + "2: \n" + : [dout] "+r"(dout), + [din] "+r"(din), + [cnt] "+r"(cnt), + [remain_cnt] "+r"(remain_cnt) + : [vscale] "w"(vscale), [vbias] "w"(vbias) + : "cc", "memory", "q4", "q5", "q8", "q9"); + #endif for (int j = 0; j < remain_rem; j++) { *dout = *din * vscale[0] + vbias[0]; @@ -1013,6 +1047,42 @@ void scale_relu(const float16_t* din, : [vscale] "w"(vscale), [vbias] "w"(vbias), [vzero] "w"(vzero) : "cc", "memory", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11"); #else + asm volatile( + "cmp %[cnt], #1 \n" + "blt 0f \n" + "1: \n" + "vld1.16 {d8-d9}, [%[din]]! \n" + "vmov q8, %q[vbias] \n" + "vld1.16 {d10-d11}, [%[din]]! \n" + "vmov q9, %q[vbias] \n" + + "vmla.f16 q8, q4, %q[vscale] \n" + "vmla.f16 q9, q5, %q[vscale] \n" + "vmax.f16 q8, q8, %q[vzero] \n" + "vmax.f16 q9, q9, %q[vzero] \n" + + "subs %[cnt], %[cnt], #1 \n" + "vst1.16 {d16-d19}, [%[dout]]! \n" + "bne 1b \n" + "0: \n" + "cmp %[remain_cnt], #1 \n" + "blt 2f \n" + "3: \n" + "vld1.16 {d8}, [%[din]]! \n" + "vmov d16, %e[vbias] \n" + "vmla.f16 d16, d8, %e[vscale] \n" + "vmax.f16 d16, d16, %e[vzero] \n" + "subs %[remain_cnt], %[remain_cnt], #1 \n" + "vst1.16 {d16}, [%[dout]]! \n" + "bne 3b \n" + "2: \n" + : [dout] "+r"(dout), + [din] "+r"(din), + [cnt] "+r"(cnt), + [remain_cnt] "+r"(remain_num) + : [vscale] "w"(vscale), [vbias] "w"(vbias), [vzero] "w"(vzero) + : "cc", "memory", "q4", "q5", "q8", "q9"); + #endif for (int i = 0; i < remain_rem; i++) { *dout = *din * scale + bias; @@ -1082,6 +1152,48 @@ void scale_relu6(const float16_t* din, [valpha] "w"(valpha) : "cc", "memory", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11"); #else + asm volatile( + "cmp %[cnt], #1 \n" + "blt 0f \n" + "1: \n" + "vld1.16 {d8-d9}, [%[din]]! \n" + "vmov q8, %q[vbias] \n" + "vld1.16 {d10-d11}, [%[din]]! 
\n" + "vmov q9, %q[vbias] \n" + + "vmla.f16 q8, q4, %q[vscale] \n" + "vmla.f16 q9, q5, %q[vscale] \n" + "vmax.f16 q8, q8, %q[vzero] \n" + "vmax.f16 q9, q9, %q[vzero] \n" + "vmin.f16 q8, q8, %q[valpha] \n" + "vmin.f16 q9, q9, %q[valpha] \n" + + "subs %[cnt], %[cnt], #1 \n" + "vst1.16 {d16-d19}, [%[dout]]! \n" + "bne 1b \n" + "0: \n" + "cmp %[remain_cnt], #1 \n" + "blt 2f \n" + "3: \n" + "vld1.16 {d8}, [%[din]]! \n" + "vmov d16, %e[vbias] \n" + "vmla.f16 d16, d8, %e[vscale] \n" + "vmax.f16 d16, d16, %e[vzero] \n" + "vmin.f16 d16, d16, %e[valpha] \n" + "subs %[remain_cnt], %[remain_cnt], #1 \n" + "vst1.16 {d16}, [%[dout]]! \n" + "bne 3b \n" + "2: \n" + : [dout] "+r"(dout), + [din] "+r"(din), + [cnt] "+r"(cnt), + [remain_cnt] "+r"(remain_num) + : [vscale] "w"(vscale), + [vbias] "w"(vbias), + [vzero] "w"(vzero), + [valpha] "w"(valpha) + : "cc", "memory", "q4", "q5", "q8", "q9"); + #endif for (int i = 0; i < remain_rem; i++) { *dout = *din * scale + bias; @@ -1153,6 +1265,51 @@ void scale_leaky_relu(const float16_t* din, [valpha] "w"(valpha) : "cc", "memory", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11"); #else + asm volatile( + "cmp %[cnt], #1 \n" + "blt 0f \n" + "1: \n" + "vld1.16 {d8-d9}, [%[din]]! \n" + "vmov q8, %q[vbias] \n" + "vld1.16 {d10-d11}, [%[din]]! \n" + "vmov q9, %q[vbias] \n" + + "vmla.f16 q8, q4, %q[vscale] \n" + "vmla.f16 q9, q5, %q[vscale] \n" + "vcge.f16 q10, q8, %q[vzero] \n" + "vmul.f16 q11, q8, %q[valpha]\n" + "vcge.f16 q12, q9, %q[vzero] \n" + "vmul.f16 q13, q9, %q[valpha]\n" + + "subs %[cnt], %[cnt], #1 \n" + "vbif q8, q11, q10 \n" + "vbif q9, q13, q12 \n" + "vst1.16 {d16-d19}, [%[dout]]! \n" + "bne 1b \n" + "0: \n" + "cmp %[remain_cnt], #1 \n" + "blt 2f \n" + "3: \n" + "vld1.16 {d8}, [%[din]]! \n" + "vmov d16, %e[vbias] \n" + "vmla.f16 d16, d8, %e[vscale] \n" + "vcge.f16 d20, d16, %e[vzero] \n" + "vmul.f16 d22, d16, %e[valpha]\n" + "subs %[remain_cnt], %[remain_cnt], #1 \n" + "vbif d16, d22, d20 \n" + "vst1.16 {d16}, [%[dout]]! 
\n" + "bne 3b \n" + "2: \n" + : [dout] "+r"(dout), + [din] "+r"(din), + [cnt] "+r"(cnt), + [remain_cnt] "+r"(remain_num) + : [vscale] "w"(vscale), + [vbias] "w"(vbias), + [vzero] "w"(vzero), + [valpha] "w"(valpha) + : "cc", "memory", "q4", "q5", "q8", "q9", "q10", "q11", "q12", "q13"); + #endif for (int i = 0; i < remain_rem; i++) { *dout = *din * scale + bias; diff --git a/tools/codestyle/cpplint_pre_commit.hook b/tools/codestyle/cpplint_pre_commit.hook index fa3dd78516b..44709ea3eb4 100755 --- a/tools/codestyle/cpplint_pre_commit.hook +++ b/tools/codestyle/cpplint_pre_commit.hook @@ -4,8 +4,8 @@ TOTAL_ERRORS=0 if [[ ! $TRAVIS_BRANCH ]]; then # install cpplint on local machine. if [[ ! $(which cpplint) ]]; then - pip install pytest-runner - pip install cpplint==1.5.4 + pip3 install pytest-runner + pip3 install cpplint==1.5.4 fi # diff files on local machine. files=$(git diff --cached --name-status | awk '$1 != "D" {print $2}') From ecd624898b4ada63050d658d787e8ed705b45fd8 Mon Sep 17 00:00:00 2001 From: HappyAngel Date: Fri, 20 May 2022 18:25:16 +0800 Subject: [PATCH 2/3] Update .pre-commit-config.yaml --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index cca3e6daa6f..5ebedf10fa3 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -51,7 +51,7 @@ repos: hooks: - id: copyright_checker name: copyright_checker - entry: python3 ./tools/codestyle/copyright.hook + entry: python ./tools/codestyle/copyright.hook language: system files: \.(c|cc|cxx|cpp|cu|cl|h|hpp|hxx|proto|py|mm|m|metal)$ exclude: (?!.*third_party)^.*$|(?!.*book)^.*$ From f96362ddc6ad91505239d576b05140e31956a6be Mon Sep 17 00:00:00 2001 From: HappyAngel Date: Fri, 20 May 2022 18:25:33 +0800 Subject: [PATCH 3/3] Update cpplint_pre_commit.hook --- tools/codestyle/cpplint_pre_commit.hook | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/codestyle/cpplint_pre_commit.hook 
b/tools/codestyle/cpplint_pre_commit.hook index 44709ea3eb4..fa3dd78516b 100755 --- a/tools/codestyle/cpplint_pre_commit.hook +++ b/tools/codestyle/cpplint_pre_commit.hook @@ -4,8 +4,8 @@ TOTAL_ERRORS=0 if [[ ! $TRAVIS_BRANCH ]]; then # install cpplint on local machine. if [[ ! $(which cpplint) ]]; then - pip3 install pytest-runner - pip3 install cpplint==1.5.4 + pip install pytest-runner + pip install cpplint==1.5.4 fi # diff files on local machine. files=$(git diff --cached --name-status | awk '$1 != "D" {print $2}')