From f6472a7e1233672d2d441d972137665abdd4db31 Mon Sep 17 00:00:00 2001
From: chenjiaoAngel <chenjiaobuaa@126.com>
Date: Fri, 20 May 2022 18:23:28 +0800
Subject: [PATCH 1/3] Add ARMv7 fp16 scale implementation

---
 .pre-commit-config.yaml                 |   2 +-
 lite/backends/arm/math/scale.cc         | 157 ++++++++++++++++++++++++
 tools/codestyle/cpplint_pre_commit.hook |   4 +-
 3 files changed, 160 insertions(+), 3 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 5ebedf10fa3..cca3e6daa6f 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -51,7 +51,7 @@ repos:
     hooks:
     -   id: copyright_checker
         name: copyright_checker
-        entry: python ./tools/codestyle/copyright.hook
+        entry: python3 ./tools/codestyle/copyright.hook
         language: system
         files: \.(c|cc|cxx|cpp|cu|cl|h|hpp|hxx|proto|py|mm|m|metal)$
         exclude: (?!.*third_party)^.*$|(?!.*book)^.*$
diff --git a/lite/backends/arm/math/scale.cc b/lite/backends/arm/math/scale.cc
index 3687d50d773..b9b6c0cc4ae 100644
--- a/lite/backends/arm/math/scale.cc
+++ b/lite/backends/arm/math/scale.cc
@@ -863,6 +863,40 @@ inline void scale_compute_fp16(const flaot16_t* din,
         [remain_cnt] "+r"(remain_cnt)
       : [vscale] "w"(vscale), [vbias] "w"(vbias)
       : "cc", "memory", "v4", "v5", "v8", "v9");
+#else
+  asm volatile(  // 32-bit ARM NEON fp16 path: dout = din * vscale + vbias
+      "cmp  %[cnt], #1                          \n"
+      "blt 0f                                   \n"  // cnt < 1: skip main loop
+      "1:                                       \n"  // main loop: 16 x fp16
+      "vld1.16  {d8-d9}, [%[din]]!              \n"  // q4 <- 8 x fp16
+      "vmov     q8, %q[vbias] \n"  // q8 = vbias (accumulator)
+      "vld1.16  {d10-d11}, [%[din]]!              \n"  // q5 <- next 8 x fp16
+      "vmov     q9, %q[vbias] \n"  // q9 = vbias (accumulator)

+      "vmla.f16 q8, q4, %q[vscale]          \n"  // q8 += q4 * vscale
+      "vmla.f16 q9, q5, %q[vscale]         \n"  // q9 += q5 * vscale

+      "subs %[cnt], %[cnt],  #1               \n"  // flags consumed by bne
+      "vst1.16 {d16-d19}, [%[dout]]!             \n"  // store q8, q9
+      "bne    1b                                \n"
+      "0:                                       \n"
+      "cmp  %[remain_cnt], #1                   \n"
+      "blt 2f                                   \n"  // no tail iterations
+      "3:                                       \n"  // tail loop: 4 x fp16
+      "vld1.16  {d8}, [%[din]]!               \n"  // d8 <- 4 x fp16
+      "vmov     d16, %e[vbias] \n"  // d16 = low half of vbias
+      "vmla.f16 d16, d8, %e[vscale]          \n"  // d16 += d8 * vscale
+      "subs %[remain_cnt], %[remain_cnt],  #1 \n"  // flags consumed by bne
+      "vst1.16 {d16}, [%[dout]]!              \n"  // store 4 x fp16
+      "bne    3b                                \n"
+      "2:                                       \n"
+      : [dout] "+r"(dout),
+        [din] "+r"(din),
+        [cnt] "+r"(cnt),
+        [remain_cnt] "+r"(remain_cnt)
+      : [vscale] "w"(vscale), [vbias] "w"(vbias)
+      : "cc", "memory", "q4", "q5", "q8", "q9");  // every q reg written above
+
 #endif
   for (int j = 0; j < remain_rem; j++) {
     *dout = *din * vscale[0] + vbias[0];
@@ -1013,6 +1047,42 @@ void scale_relu<float16_t>(const float16_t* din,
       : [vscale] "w"(vscale), [vbias] "w"(vbias), [vzero] "w"(vzero)
       : "cc", "memory", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11");
 #else
+  asm volatile(  // 32-bit ARM NEON fp16: max(din * vscale + vbias, vzero)
+      "cmp  %[cnt], #1                          \n"
+      "blt 0f                                   \n"  // cnt < 1: skip main loop
+      "1:                                       \n"  // main loop: 16 x fp16
+      "vld1.16  {d8-d9}, [%[din]]!              \n"  // q4 <- 8 x fp16
+      "vmov     q8, %q[vbias] \n"  // q8 = vbias (accumulator)
+      "vld1.16  {d10-d11}, [%[din]]!              \n"  // q5 <- next 8 x fp16
+      "vmov     q9, %q[vbias] \n"  // q9 = vbias (accumulator)

+      "vmla.f16 q8, q4, %q[vscale]          \n"  // q8 += q4 * vscale
+      "vmla.f16 q9, q5, %q[vscale]         \n"  // q9 += q5 * vscale
+      "vmax.f16 q8, q8, %q[vzero]         \n"  // q8 = max(q8, vzero)
+      "vmax.f16 q9, q9, %q[vzero]          \n"  // q9 = max(q9, vzero)

+      "subs %[cnt], %[cnt],  #1               \n"  // flags consumed by bne
+      "vst1.16 {d16-d19}, [%[dout]]!             \n"  // store q8, q9
+      "bne    1b                                \n"
+      "0:                                       \n"
+      "cmp  %[remain_cnt], #1                   \n"
+      "blt 2f                                   \n"  // no tail iterations
+      "3:                                       \n"  // tail loop: 4 x fp16
+      "vld1.16  {d8}, [%[din]]!               \n"  // d8 <- 4 x fp16
+      "vmov     d16, %e[vbias] \n"  // d16 = low half of vbias
+      "vmla.f16 d16, d8, %e[vscale]          \n"  // d16 += d8 * vscale
+      "vmax.f16 d16, d16, %e[vzero]         \n"  // d16 = max(d16, vzero)
+      "subs %[remain_cnt], %[remain_cnt],  #1 \n"  // flags consumed by bne
+      "vst1.16 {d16}, [%[dout]]!              \n"  // store 4 x fp16
+      "bne    3b                                \n"
+      "2:                                       \n"
+      : [dout] "+r"(dout),
+        [din] "+r"(din),
+        [cnt] "+r"(cnt),
+        [remain_cnt] "+r"(remain_num)
+      : [vscale] "w"(vscale), [vbias] "w"(vbias), [vzero] "w"(vzero)
+      : "cc", "memory", "q4", "q5", "q8", "q9");  // every q reg written above
+
 #endif
   for (int i = 0; i < remain_rem; i++) {
     *dout = *din * scale + bias;
@@ -1082,6 +1152,48 @@ void scale_relu6<float16_t>(const float16_t* din,
         [valpha] "w"(valpha)
       : "cc", "memory", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11");
 #else
+  asm volatile(  // min(max(din * vscale + vbias, vzero), valpha), fp16 NEON
+      "cmp  %[cnt], #1                          \n"
+      "blt 0f                                   \n"  // cnt < 1: skip main loop
+      "1:                                       \n"  // main loop: 16 x fp16
+      "vld1.16  {d8-d9}, [%[din]]!              \n"  // q4 <- 8 x fp16
+      "vmov     q8, %q[vbias] \n"  // q8 = vbias (accumulator)
+      "vld1.16  {d10-d11}, [%[din]]!              \n"  // q5 <- next 8 x fp16
+      "vmov     q9, %q[vbias] \n"  // q9 = vbias (accumulator)

+      "vmla.f16 q8, q4, %q[vscale]          \n"  // q8 += q4 * vscale
+      "vmla.f16 q9, q5, %q[vscale]         \n"  // q9 += q5 * vscale
+      "vmax.f16 q8, q8, %q[vzero]         \n"  // q8 = max(q8, vzero)
+      "vmax.f16 q9, q9, %q[vzero]          \n"  // q9 = max(q9, vzero)
+      "vmin.f16 q8, q8, %q[valpha]         \n"  // q8 = min(q8, valpha)
+      "vmin.f16 q9, q9, %q[valpha]          \n"  // q9 = min(q9, valpha)

+      "subs %[cnt], %[cnt],  #1               \n"  // flags consumed by bne
+      "vst1.16 {d16-d19}, [%[dout]]!             \n"  // store q8, q9
+      "bne    1b                                \n"
+      "0:                                       \n"
+      "cmp  %[remain_cnt], #1                   \n"
+      "blt 2f                                   \n"  // no tail iterations
+      "3:                                       \n"  // tail loop: 4 x fp16
+      "vld1.16  {d8}, [%[din]]!               \n"  // d8 <- 4 x fp16
+      "vmov     d16, %e[vbias] \n"  // d16 = low half of vbias
+      "vmla.f16 d16, d8, %e[vscale]          \n"  // d16 += d8 * vscale
+      "vmax.f16 d16, d16, %e[vzero]         \n"  // d16 = max(d16, vzero)
+      "vmin.f16 d16, d16, %e[valpha]         \n"  // d16 = min(d16, valpha)
+      "subs %[remain_cnt], %[remain_cnt],  #1 \n"  // flags consumed by bne
+      "vst1.16 {d16}, [%[dout]]!              \n"  // store 4 x fp16
+      "bne    3b                                \n"
+      "2:                                       \n"
+      : [dout] "+r"(dout),
+        [din] "+r"(din),
+        [cnt] "+r"(cnt),
+        [remain_cnt] "+r"(remain_num)
+      : [vscale] "w"(vscale),
+        [vbias] "w"(vbias),
+        [vzero] "w"(vzero),
+        [valpha] "w"(valpha)
+      : "cc", "memory", "q4", "q5", "q8", "q9");  // every q reg written above
+
 #endif
   for (int i = 0; i < remain_rem; i++) {
     *dout = *din * scale + bias;
@@ -1153,6 +1265,51 @@ void scale_leaky_relu<float16_t>(const float16_t* din,
         [valpha] "w"(valpha)
       : "cc", "memory", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11");
 #else
+  asm volatile(  // x = din * vscale + vbias; out = x >= vzero ? x : x * valpha
+      "cmp  %[cnt], #1                          \n"
+      "blt 0f                                   \n"  // cnt < 1: skip main loop
+      "1:                                       \n"  // main loop: 16 x fp16
+      "vld1.16  {d8-d9}, [%[din]]!              \n"  // q4 <- 8 x fp16
+      "vmov     q8, %q[vbias] \n"  // q8 = vbias (accumulator)
+      "vld1.16  {d10-d11}, [%[din]]!              \n"  // q5 <- next 8 x fp16
+      "vmov     q9, %q[vbias] \n"  // q9 = vbias (accumulator)

+      "vmla.f16 q8, q4, %q[vscale]          \n"  // q8 += q4 * vscale
+      "vmla.f16 q9, q5, %q[vscale]         \n"  // q9 += q5 * vscale
+      "vcge.f16  q10,  q8, %q[vzero] \n"  // q10 = (q8 >= vzero) lane mask
+      "vmul.f16  q11,  q8, %q[valpha]\n"  // q11 = q8 * valpha
+      "vcge.f16  q12,  q9, %q[vzero] \n"  // q12 = (q9 >= vzero) lane mask
+      "vmul.f16  q13,  q9, %q[valpha]\n"  // q13 = q9 * valpha

+      "subs %[cnt], %[cnt],  #1       \n"  // flags consumed by bne
+      "vbif      q8,  q11,  q10        \n"  // q8 = q10 ? q8 : q11
+      "vbif      q9,  q13, q12       \n"  // q9 = q12 ? q9 : q13
+      "vst1.16 {d16-d19}, [%[dout]]!             \n"  // store q8, q9
+      "bne    1b                                \n"
+      "0:                                       \n"
+      "cmp  %[remain_cnt], #1                   \n"
+      "blt 2f                                   \n"  // no tail iterations
+      "3:                                       \n"  // tail loop: 4 x fp16
+      "vld1.16  {d8}, [%[din]]!               \n"  // d8 <- 4 x fp16
+      "vmov     d16, %e[vbias] \n"  // d16 = low half of vbias
+      "vmla.f16 d16, d8, %e[vscale]          \n"  // d16 += d8 * vscale
+      "vcge.f16  d20,  d16, %e[vzero] \n"  // d20 = (d16 >= vzero) lane mask
+      "vmul.f16  d22,  d16, %e[valpha]\n"  // d22 = d16 * valpha
+      "subs %[remain_cnt], %[remain_cnt],  #1 \n"  // flags consumed by bne
+      "vbif      d16,  d22,  d20        \n"  // d16 = d20 ? d16 : d22
+      "vst1.16 {d16}, [%[dout]]!              \n"  // store 4 x fp16
+      "bne    3b                                \n"
+      "2:                                       \n"
+      : [dout] "+r"(dout),
+        [din] "+r"(din),
+        [cnt] "+r"(cnt),
+        [remain_cnt] "+r"(remain_num)
+      : [vscale] "w"(vscale),
+        [vbias] "w"(vbias),
+        [vzero] "w"(vzero),
+        [valpha] "w"(valpha)  // q12/q13 are written above: keep them clobbered
+      : "cc", "memory", "q4", "q5", "q8", "q9", "q10", "q11", "q12", "q13");
+
 #endif
   for (int i = 0; i < remain_rem; i++) {
     *dout = *din * scale + bias;
diff --git a/tools/codestyle/cpplint_pre_commit.hook b/tools/codestyle/cpplint_pre_commit.hook
index fa3dd78516b..44709ea3eb4 100755
--- a/tools/codestyle/cpplint_pre_commit.hook
+++ b/tools/codestyle/cpplint_pre_commit.hook
@@ -4,8 +4,8 @@ TOTAL_ERRORS=0
 if [[ ! $TRAVIS_BRANCH ]]; then
   # install cpplint on local machine.
   if [[ ! $(which cpplint) ]]; then
-    pip install pytest-runner  
-    pip install cpplint==1.5.4
+    pip3 install pytest-runner  
+    pip3 install cpplint==1.5.4
   fi
   # diff files on local machine. 
   files=$(git diff --cached --name-status | awk '$1 != "D" {print $2}')

From ecd624898b4ada63050d658d787e8ed705b45fd8 Mon Sep 17 00:00:00 2001
From: HappyAngel <chenjiaobuaa@126.com>
Date: Fri, 20 May 2022 18:25:16 +0800
Subject: [PATCH 2/3] Update .pre-commit-config.yaml

---
 .pre-commit-config.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index cca3e6daa6f..5ebedf10fa3 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -51,7 +51,7 @@ repos:
     hooks:
     -   id: copyright_checker
         name: copyright_checker
-        entry: python3 ./tools/codestyle/copyright.hook
+        entry: python ./tools/codestyle/copyright.hook
         language: system
         files: \.(c|cc|cxx|cpp|cu|cl|h|hpp|hxx|proto|py|mm|m|metal)$
         exclude: (?!.*third_party)^.*$|(?!.*book)^.*$

From f96362ddc6ad91505239d576b05140e31956a6be Mon Sep 17 00:00:00 2001
From: HappyAngel <chenjiaobuaa@126.com>
Date: Fri, 20 May 2022 18:25:33 +0800
Subject: [PATCH 3/3] Update cpplint_pre_commit.hook

---
 tools/codestyle/cpplint_pre_commit.hook | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tools/codestyle/cpplint_pre_commit.hook b/tools/codestyle/cpplint_pre_commit.hook
index 44709ea3eb4..fa3dd78516b 100755
--- a/tools/codestyle/cpplint_pre_commit.hook
+++ b/tools/codestyle/cpplint_pre_commit.hook
@@ -4,8 +4,8 @@ TOTAL_ERRORS=0
 if [[ ! $TRAVIS_BRANCH ]]; then
   # install cpplint on local machine.
   if [[ ! $(which cpplint) ]]; then
-    pip3 install pytest-runner  
-    pip3 install cpplint==1.5.4
+    pip install pytest-runner  
+    pip install cpplint==1.5.4
   fi
   # diff files on local machine. 
   files=$(git diff --cached --name-status | awk '$1 != "D" {print $2}')