From 6aa28af6a91ff49f0d0b66c7bb1c31377c3da759 Mon Sep 17 00:00:00 2001
From: nihuini
Date: Thu, 11 Jul 2024 16:58:50 +0800
Subject: [PATCH 1/2] fix potential fp16s bf16s conflicts on armv7 vfpv4

---
 src/net.cpp        | 4 ++--
 tests/testutil.cpp | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/net.cpp b/src/net.cpp
index 996337ba36a..c84107de23c 100644
--- a/src/net.cpp
+++ b/src/net.cpp
@@ -622,7 +622,7 @@ int NetPrivate::convert_layout(Mat& bottom_blob, const Layer* layer, const Optio
     // clang-format off
     // *INDENT-OFF*
 #if NCNN_VFPV4
-    if (opt.use_fp16_storage && cpu_support_arm_vfpv4() && layer->support_fp16_storage)
+    if (opt.use_fp16_storage && !opt.use_bf16_storage && cpu_support_arm_vfpv4() && layer->support_fp16_storage)
     {
         Mat bottom_blob_fp16;
         cast_float32_to_float16(bottom_blob, bottom_blob_fp16, opt);
@@ -741,7 +741,7 @@ int NetPrivate::convert_layout(Mat& bottom_blob, const Layer* layer, const Optio
     // clang-format off
     // *INDENT-OFF*
 #if NCNN_VFPV4
-    if (opt.use_fp16_storage && cpu_support_arm_vfpv4() && !layer->support_fp16_storage)
+    if (opt.use_fp16_storage && !opt.use_bf16_storage && cpu_support_arm_vfpv4() && !layer->support_fp16_storage)
     {
         Mat bottom_blob_fp32;
         cast_float16_to_float32(bottom_blob, bottom_blob_fp32, opt);
diff --git a/tests/testutil.cpp b/tests/testutil.cpp
index 07d95547d44..893b85418e2 100644
--- a/tests/testutil.cpp
+++ b/tests/testutil.cpp
@@ -329,7 +329,7 @@ static int convert_to_optimal_layout(const ncnn::Mat& a, ncnn::Mat& a4, const nc
     // clang-format off
     // *INDENT-OFF*
 #if NCNN_VFPV4
-    if (opt.use_fp16_storage && ncnn::cpu_support_arm_vfpv4() && op->support_fp16_storage && !(flag & TEST_LAYER_DISABLE_AUTO_INPUT_CASTING))
+    if (opt.use_fp16_storage && !opt.use_bf16_storage && ncnn::cpu_support_arm_vfpv4() && op->support_fp16_storage && !(flag & TEST_LAYER_DISABLE_AUTO_INPUT_CASTING))
     {
         ncnn::cast_float32_to_float16(a, a4, opt);
     }
@@ -450,7 +450,7 @@ static int convert_to_vanilla_layout(const ncnn::Mat& c4, ncnn::Mat& c, const nc
     // clang-format off
     // *INDENT-OFF*
 #if NCNN_VFPV4
-    if (opt.use_fp16_storage && ncnn::cpu_support_arm_vfpv4() && op->support_fp16_storage && c4_unpacked.elembits() == 16)
+    if (opt.use_fp16_storage && !opt.use_bf16_storage && ncnn::cpu_support_arm_vfpv4() && op->support_fp16_storage && c4_unpacked.elembits() == 16)
     {
         ncnn::cast_float16_to_float32(c4_unpacked, c, opt);
     }

From 6bc83d623b83eba13e54ee1b48a20527c7aeab6f Mon Sep 17 00:00:00 2001
From: nihuini
Date: Thu, 11 Jul 2024 17:29:47 +0800
Subject: [PATCH 2/2] but prefer fp16 on armv8.2

---
 src/net.cpp        | 32 +++++++++++++++++++++++++++++++-
 tests/testutil.cpp | 14 ++++++++++++++
 2 files changed, 45 insertions(+), 1 deletion(-)

diff --git a/src/net.cpp b/src/net.cpp
index c84107de23c..3574944e726 100644
--- a/src/net.cpp
+++ b/src/net.cpp
@@ -621,6 +621,15 @@ int NetPrivate::convert_layout(Mat& bottom_blob, const Layer* layer, const Optio

     // clang-format off
     // *INDENT-OFF*
+#if NCNN_ARM82
+    if (opt.use_fp16_storage && cpu_support_arm_asimdhp() && layer->support_fp16_storage)
+    {
+        Mat bottom_blob_fp16;
+        cast_float32_to_float16(bottom_blob, bottom_blob_fp16, opt);
+        bottom_blob = bottom_blob_fp16;
+    }
+    else
+#endif // NCNN_ARM82
 #if NCNN_VFPV4
     if (opt.use_fp16_storage && !opt.use_bf16_storage && cpu_support_arm_vfpv4() && layer->support_fp16_storage)
     {
@@ -740,6 +749,15 @@ int NetPrivate::convert_layout(Mat& bottom_blob, const Layer* layer, const Optio

     // clang-format off
     // *INDENT-OFF*
+#if NCNN_ARM82
+    if (opt.use_fp16_storage && cpu_support_arm_asimdhp() && !layer->support_fp16_storage)
+    {
+        Mat bottom_blob_fp32;
+        cast_float16_to_float32(bottom_blob, bottom_blob_fp32, opt);
+        bottom_blob = bottom_blob_fp32;
+    }
+    else
+#endif // NCNN_ARM82
 #if NCNN_VFPV4
     if (opt.use_fp16_storage && !opt.use_bf16_storage && cpu_support_arm_vfpv4() && !layer->support_fp16_storage)
     {
@@ -2719,8 +2737,20 @@ int Extractor::extract(int blob_index, Mat& feat, int type)

     // clang-format off
     // *INDENT-OFF*
+#if NCNN_ARM82
+    if (d->opt.use_fp16_storage && cpu_support_arm_asimdhp() && (type == 0))
+    {
+        if (feat.elembits() == 16)
+        {
+            Mat feat_fp32;
+            cast_float16_to_float32(feat, feat_fp32, d->opt);
+            feat = feat_fp32;
+        }
+    }
+    else
+#endif // NCNN_ARM82
 #if NCNN_VFPV4
-    if (d->opt.use_fp16_storage && cpu_support_arm_vfpv4() && (type == 0))
+    if (d->opt.use_fp16_storage && !d->opt.use_bf16_storage && cpu_support_arm_vfpv4() && (type == 0))
     {
         if (feat.elembits() == 16)
         {
diff --git a/tests/testutil.cpp b/tests/testutil.cpp
index 893b85418e2..837043cb754 100644
--- a/tests/testutil.cpp
+++ b/tests/testutil.cpp
@@ -328,6 +328,13 @@ static int convert_to_optimal_layout(const ncnn::Mat& a, ncnn::Mat& a4, const nc
 {
     // clang-format off
     // *INDENT-OFF*
+#if NCNN_ARM82
+    if (opt.use_fp16_storage && ncnn::cpu_support_arm_asimdhp() && op->support_fp16_storage && !(flag & TEST_LAYER_DISABLE_AUTO_INPUT_CASTING))
+    {
+        ncnn::cast_float32_to_float16(a, a4, opt);
+    }
+    else
+#endif // NCNN_ARM82
 #if NCNN_VFPV4
     if (opt.use_fp16_storage && !opt.use_bf16_storage && ncnn::cpu_support_arm_vfpv4() && op->support_fp16_storage && !(flag & TEST_LAYER_DISABLE_AUTO_INPUT_CASTING))
     {
@@ -449,6 +456,13 @@ static int convert_to_vanilla_layout(const ncnn::Mat& c4, ncnn::Mat& c, const nc

     // clang-format off
     // *INDENT-OFF*
+#if NCNN_ARM82
+    if (opt.use_fp16_storage && ncnn::cpu_support_arm_asimdhp() && op->support_fp16_storage && c4_unpacked.elembits() == 16)
+    {
+        ncnn::cast_float16_to_float32(c4_unpacked, c, opt);
+    }
+    else
+#endif // NCNN_ARM82
 #if NCNN_VFPV4
     if (opt.use_fp16_storage && !opt.use_bf16_storage && ncnn::cpu_support_arm_vfpv4() && op->support_fp16_storage && c4_unpacked.elembits() == 16)
     {
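
Below is a minimal usage sketch (not part of the diffs above) of the option combination that patch 1 guards against. It assumes the public ncnn Net/Option/Extractor API; the model files, blob names, and input shape are placeholders. With both storage flags enabled on an armv7 vfpv4-only device, the old condition could cast blobs to fp16 even though bf16 storage was also requested; patch 1 skips the fp16 cast in that case, and patch 2 keeps fp16 preferred on armv8.2 (asimdhp) CPUs.

#include "net.h"

int main()
{
    ncnn::Net net;
    net.opt.use_fp16_storage = true; // request fp16 storage
    net.opt.use_bf16_storage = true; // request bf16 storage too -> the potential conflict on armv7 vfpv4

    // placeholder model files for illustration only
    if (net.load_param("model.param") || net.load_model("model.bin"))
        return -1;

    ncnn::Mat in(224, 224, 3); // placeholder input
    in.fill(0.5f);

    ncnn::Extractor ex = net.create_extractor();
    ex.input("data", in);

    ncnn::Mat out;
    // with patch 1, fp16 input casting is skipped on vfpv4-only CPUs when
    // use_bf16_storage is set; armv8.2 devices still take the fp16 path (patch 2)
    ex.extract("output", out);
    return 0;
}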