Enable FP16 Clip and Handle Bias in FP16 Depthwise Conv (microsoft#21493)

- Improved accuracy for face-detection, image-classification, and
object-detection in the GeekBench ML benchmark on ARM64.
- Fixed issue microsoft#18992
yihonglyu committed Jul 30, 2024
1 parent 82036b0 commit 530a2d7
Showing 9 changed files with 531 additions and 20 deletions.
4 changes: 2 additions & 2 deletions docs/OperatorKernels.md
@@ -58,8 +58,8 @@ Do not modify directly.*
|Ceil|*in* X:**T**<br> *out* Y:**T**|13+|**T** = tensor(double), tensor(float)|
|||[6, 12]|**T** = tensor(double), tensor(float)|
|Celu|*in* X:**T**<br> *out* Y:**T**|12+|**T** = tensor(float)|
|Clip|*in* input:**T**<br> *in* min:**T**<br> *in* max:**T**<br> *out* output:**T**<br><br>or<br><br>*in* input:**T**<br> *out* output:**T**|13+|**T** = tensor(double), tensor(float), tensor(int32), tensor(int64), tensor(int8), tensor(uint32), tensor(uint64), tensor(uint8)|
|||12|**T** = tensor(double), tensor(float), tensor(int32), tensor(int64), tensor(int8), tensor(uint32), tensor(uint64), tensor(uint8)|
|Clip|*in* input:**T**<br> *in* min:**T**<br> *in* max:**T**<br> *out* output:**T**<br><br>or<br><br>*in* input:**T**<br> *out* output:**T**|13+|**T** = tensor(double), tensor(float), tensor(float16), tensor(int32), tensor(int64), tensor(int8), tensor(uint32), tensor(uint64), tensor(uint8)|
|||12|**T** = tensor(double), tensor(float), tensor(float16), tensor(int32), tensor(int64), tensor(int8), tensor(uint32), tensor(uint64), tensor(uint8)|
|||11|**T** = tensor(float)|
|||[6, 10]|**T** = tensor(float)|
|Col2Im|*in* input:**T**<br> *in* image_shape:**tensor(int64)**<br> *in* block_shape:**tensor(int64)**<br> *out* output:**T**|18+|**T** = tensor(float)|
2 changes: 2 additions & 0 deletions onnxruntime/core/mlas/inc/mlas.h
@@ -1751,6 +1751,7 @@ MlasSBGemmConvertPackB(size_t N, size_t K, const float* B, size_t ldb, void* Pac
* @brief Indirect Depthwise convolution for fp16
* @param Input Supplies the indirect buffer for NHWC input
* @param Filter Supplies the address for filter tensor
* @param Bias Supplies the address for 1D bias tensor B, has size of M
* @param Output Supplies the address for the result tensor
* @param Channels # of input channels
* @param OutputCount # of output pixels
@@ -1762,6 +1763,7 @@ MLASCALL
MlasConvDepthwise(
const MLAS_FP16* const* Input,
const MLAS_FP16* Filter,
const MLAS_FP16* Bias,
MLAS_FP16* Output,
size_t Channels,
size_t OutputCount,
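Only the Bias parameter is new in this declaration; per the file summary above, mlas.h sees 2 additions and 0 deletions. In effect, each output value of MlasConvDepthwise now starts from its channel's bias instead of zero. A rough scalar reference of that computation — a sketch only, with plain float standing in for MLAS_FP16 and the filter assumed to be laid out kernel-major as [KernelSize][Channels] — looks like:

```cpp
#include <cstddef>

// Scalar reference (illustrative, not the MLAS kernel): Input is an
// indirection buffer holding KernelSize pointers per output pixel, each
// pointing at Channels NHWC values; Bias, when non-null, holds one value
// per channel and seeds every accumulator.
void DepthwiseConvReference(const float* const* Input, const float* Filter,
                            const float* Bias, float* Output,
                            size_t Channels, size_t OutputCount, size_t KernelSize) {
    for (size_t p = 0; p < OutputCount; ++p) {
        for (size_t c = 0; c < Channels; ++c) {
            float acc = (Bias != nullptr) ? Bias[c] : 0.0f;
            for (size_t k = 0; k < KernelSize; ++k) {
                acc += Input[k][c] * Filter[k * Channels + c];
            }
            *Output++ = acc;
        }
        Input += KernelSize;  // advance to the next pixel's group of kernel pointers
    }
}
```

Passing Bias == nullptr keeps the previous zero-initialized behaviour.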
32 changes: 17 additions & 15 deletions onnxruntime/core/mlas/lib/dwconv.cpp
@@ -14,7 +14,6 @@ Module Name:
--*/


#include "fp16_common.h"

#ifdef MLAS_F16VEC_INTRINSICS_SUPPORTED
@@ -24,19 +23,20 @@ void
MlasConvDepthwiseKernel(
const _mlas_fp16_* const* Input,
const _mlas_fp16_* Filter,
const _mlas_fp16_* Bias,
_mlas_fp16_* Output,
size_t Channels,
size_t OutputCount,
size_t KernelSize,
MLAS_HALF_GEMM_POSTPROCESSOR* PostProc
)
)
{
while (OutputCount > 0) {
size_t ChannelOffset = 0;
size_t c = Channels;

while (c >= 8) {
MLAS_FLOAT16X8 Accumulator = MlasZeroFloat16x8();
MLAS_FLOAT16X8 Accumulator = Bias == nullptr ? MlasZeroFloat16x8() : MlasLoadFloat16x8(&Bias[ChannelOffset]);
size_t ChannelKernelOffset = ChannelOffset;

for (size_t k = 0; k < KernelSize; k++) {
@@ -54,7 +54,7 @@ MlasConvDepthwiseKernel(
}

if (c >= 4) {
MLAS_FLOAT16X4 Accumulator = MlasZeroFloat16x4();
MLAS_FLOAT16X4 Accumulator = Bias == nullptr ? MlasZeroFloat16x4() : MlasLoadFloat16x4(&Bias[ChannelOffset]);
size_t ChannelKernelOffset = ChannelOffset;

for (size_t k = 0; k < KernelSize; k++) {
@@ -72,7 +72,8 @@ MlasConvDepthwiseKernel(
}

if (c > 0) {
MLAS_FLOAT16X4 Accumulator = MlasZeroFloat16x4();
MLAS_FLOAT16X4 Accumulator =
Bias == nullptr ? MlasZeroFloat16x4() : MlasLoadPartialFloat16x4(&Bias[ChannelOffset], c);
size_t ChannelKernelOffset = ChannelOffset;

for (size_t k = 0; k < KernelSize; k++) {
@@ -86,8 +87,7 @@ MlasConvDepthwiseKernel(
Output += c;
}
if (PostProc) {
PostProc->Process(reinterpret_cast<MLAS_FP16*>(Output - Channels), 0, 0, 1, Channels,
Channels);
PostProc->Process(reinterpret_cast<MLAS_FP16*>(Output - Channels), 0, 0, 1, Channels, Channels);
}
Input += KernelSize;
OutputCount -= 1;
@@ -101,16 +101,17 @@ void
MlasConvDepthwiseKernel(
const _mlas_fp16_* const* Input,
const _mlas_fp16_* Filter,
const _mlas_fp16_* Bias,
_mlas_fp16_* Output,
size_t Channels,
size_t OutputCount,
size_t KernelSize,
MLAS_HALF_GEMM_POSTPROCESSOR* PostProc
)
)
{
while (OutputCount > 0) {
for (size_t ChannelOffset = 0; ChannelOffset < Channels; ChannelOffset++) {
float Accumulator = 0.0f;
float Accumulator = Bias == nullptr ? 0.0f : MLAS_Half2Float(Bias[ChannelOffset]);
size_t ChannelKernelOffset = ChannelOffset;

for (size_t k = 0; k < KernelSize; k++) {
@@ -120,35 +121,36 @@ MlasConvDepthwiseKernel(
*Output++ = MLAS_Float2Half(Accumulator);
}
if (PostProc) {
PostProc->Process(reinterpret_cast<MLAS_FP16*>(Output - Channels), 0, 0, 1, Channels,
Channels);
PostProc->Process(reinterpret_cast<MLAS_FP16*>(Output - Channels), 0, 0, 1, Channels, Channels);
}
Input += KernelSize;
OutputCount -= 1;
}
}

#endif // MLAS_F16VEC_INTRINSICS_SUPPORTED

#endif // MLAS_F16VEC_INTRINSICS_SUPPORTED

void
MLASCALL
MlasConvDepthwise(
const MLAS_FP16* const* Input,
const MLAS_FP16* Filter,
const MLAS_FP16* Bias,
MLAS_FP16* Output,
size_t Channels,
size_t OutputCount,
size_t KernelSize,
MLAS_HALF_GEMM_POSTPROCESSOR* PostProc
)
)
{
MlasConvDepthwiseKernel(
reinterpret_cast<const _mlas_fp16_* const*>(Input),
reinterpret_cast<const _mlas_fp16_*>(Filter),
reinterpret_cast<const _mlas_fp16_*>(Bias),
reinterpret_cast<_mlas_fp16_*>(Output),
Channels,
OutputCount,
KernelSize,
PostProc);
PostProc
);
}
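Both kernels keep their structure and only change how the accumulator is seeded. In the vectorized kernel the channel loop runs 8 wide, then 4 wide, then over a 1-3 channel remainder, and each tier loads its slice of the bias when one is present. A plain-C++ stand-in for that tiling (illustrative only, no NEON; the real kernel accumulates filter products into these seeds):

```cpp
#include <cstddef>

// Shape of the vectorized kernel's channel loop: after the 8-wide loop at
// most 7 channels remain, so one 4-wide step plus one partial step of 1-3
// channels covers the tail, each block seeded from Bias when supplied.
void SeedAccumulators(const float* Bias, float* Acc, size_t Channels) {
    size_t offset = 0;
    size_t c = Channels;
    auto seed = [&](size_t width) {
        for (size_t i = 0; i < width; ++i) {
            Acc[offset + i] = (Bias != nullptr) ? Bias[offset + i] : 0.0f;
        }
        offset += width;
        c -= width;
    };
    while (c >= 8) seed(8);  // MlasLoadFloat16x8(&Bias[ChannelOffset]) in the real kernel
    if (c >= 4) seed(4);     // MlasLoadFloat16x4
    if (c > 0) seed(c);      // MlasLoadPartialFloat16x4 handles the last 1-3 channels
}
```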
17 changes: 17 additions & 0 deletions onnxruntime/core/mlas/lib/fp16_common.h
@@ -64,6 +64,23 @@ MLAS_FORCEINLINE
MLAS_FLOAT16X4
MlasLoadFloat16x4(const _mlas_fp16_* Buffer) { return vreinterpret_f16_u16(vld1_u16(Buffer)); }

MLAS_FORCEINLINE
MLAS_FLOAT16X4
MlasLoadPartialFloat16x4(const _mlas_fp16_* Buffer, size_t len)
{
MLAS_FLOAT16X4 Vector = MlasZeroFloat16x4();
if ((len & 1) != 0) {
Vector = vreinterpret_f16_u16(vld1_lane_u16(Buffer + (len - 1), vreinterpret_u16_f16(Vector), 0));
}
if ((len & 2) != 0) {
Vector = vreinterpret_f16_f32(vdup_lane_f32(vreinterpret_f32_f16(Vector), 0));
Vector = vreinterpret_f16_f32(
vld1_lane_f32(reinterpret_cast<const float*>(Buffer), vreinterpret_f32_f16(Vector), 0)
);
}
return Vector;
}

MLAS_FORCEINLINE
void
MlasStoreFloat16x8(_mlas_fp16_* Buffer, MLAS_FLOAT16X8 Vector)
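The new MlasLoadPartialFloat16x4 helper exists so the 1-3 channel tail of the depthwise kernel can load its bias values without reading past the end of the bias tensor. Functionally it is a zero-padded partial load; the NEON lane shuffling only avoids a scalar loop: an odd trailing element is loaded into lane 0 first, then (when len >= 2) that 32-bit lane pair is duplicated upward before the first two elements are loaded as a single 32-bit lane. A scalar equivalent, for reference (a sketch operating on raw fp16 bit patterns):

```cpp
#include <cstddef>
#include <cstdint>
#include <cstring>

// What MlasLoadPartialFloat16x4 produces, written without NEON: the first
// len (1-3) fp16 values from Buffer land in the low lanes and the remaining
// lanes are zero.
void LoadPartialFloat16x4Reference(const uint16_t* Buffer, size_t len, uint16_t Out[4]) {
    std::memset(Out, 0, 4 * sizeof(uint16_t));
    for (size_t i = 0; i < len && i < 4; ++i) {
        Out[i] = Buffer[i];  // raw fp16 bit patterns, no conversion needed
    }
}
```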
4 changes: 3 additions & 1 deletion onnxruntime/core/providers/cpu/fp16/fp16_conv.cc
@@ -139,8 +139,9 @@ Status FusedConvFp16::PrePack(const Tensor& tensor, int input_idx, AllocatorPtr

bool share_prepacked_weights = (prepacked_weights != nullptr);

const bool is_depthwise_conv = (group_input_channels == 1 && group_output_channels == 1);
// Don't pack the filter buffer if the MlasConvDepthwise path is used.
if (!(group_input_channels == 1 && group_output_channels == 1)) {
if (!is_depthwise_conv) {
packed_W_size_ = MlasHalfGemmPackBSize(group_output_channels, kernel_dim, false);
if (packed_W_size_ != 0) {
size_t packed_W_data_size = SafeInt<size_t>(group_count) * packed_W_size_;
@@ -472,6 +473,7 @@ Status FusedConvFp16::Compute(OpKernelContext* context) const {
MlasConvDepthwise(
worker_indirection_buffer,
reordered_W,
Bdata,
worker_output,
static_cast<size_t>(M),
static_cast<size_t>(output_count),
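The provider change is twofold: PrePack now names the depthwise case explicitly and skips half-GEMM filter packing for it, and Compute forwards the bias pointer (Bdata) straight into MlasConvDepthwise. A minimal sketch of the packing decision — names other than is_depthwise_conv are illustrative, not the actual FusedConvFp16 members:

```cpp
#include <cstddef>

// Depthwise convolutions (1 input and 1 output channel per group) go through
// MlasConvDepthwise, which reads the reordered filter directly, so the
// half-GEMM packing step is only worth doing on the non-depthwise path.
bool ShouldPackFilter(size_t group_input_channels, size_t group_output_channels) {
    const bool is_depthwise_conv = (group_input_channels == 1 && group_output_channels == 1);
    return !is_depthwise_conv;
}
```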
2 changes: 1 addition & 1 deletion onnxruntime/core/providers/cpu/math/clip.cc
@@ -23,7 +23,7 @@ ORT_SPECIFY_OP_KERNEL_ARG_DEFAULT_TYPES(
float);
ORT_SPECIFY_OP_KERNEL_ARG_DEFAULT_TYPES(
kCpuExecutionProvider, kOnnxDomain, Clip, 12, Input, 0,
float, double, int8_t, uint8_t, int32_t, uint32_t, int64_t, uint64_t);
float, MLFloat16, double, int8_t, uint8_t, int32_t, uint32_t, int64_t, uint64_t);
} // namespace op_kernel_type_control

using EnabledClip11Types = ORT_OP_KERNEL_ARG_ENABLED_TYPE_LIST(
18 changes: 18 additions & 0 deletions onnxruntime/test/providers/cpu/math/clip_test.cc
@@ -119,6 +119,24 @@ TEST(MathOpTest, Clip_Default_uint64) {
test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});
}

TEST(MathOpTest, Clip_MLFloat16) {
OpTester test("Clip", 12);

std::vector<int64_t> dims{3, 3};
test.AddInput<MLFloat16>("X", dims,
{MLFloat16(-1.0f), MLFloat16(-2.0f), MLFloat16(-3.0f),
MLFloat16(-4.0f), MLFloat16(0.0f), MLFloat16(2.0f),
MLFloat16(4.0f), MLFloat16(6.0f), MLFloat16(8.0f)});
test.AddInput<MLFloat16>("min", {}, {MLFloat16(0.0f)});
test.AddInput<MLFloat16>("max", {}, {MLFloat16(6.0f)});
test.AddOutput<MLFloat16>("Y", dims,
{MLFloat16(0.0f), MLFloat16(0.0f), MLFloat16(0.0f),
MLFloat16(0.0f), MLFloat16(0.0f), MLFloat16(2.0f),
MLFloat16(4.0f), MLFloat16(6.0f), MLFloat16(6.0f)});

test.Run();
}

TEST(MathOpTest, Clip_int32) {
OpTester test("Clip", 12);
