PaddlePaddle · xiaoxiaohehe001 · Feb 16, 2022 · Jan 27, 2022 · Jan 27, 2022
@@ -121,3 +121,100 @@ kernel void reduce_sum_c(texture2d_array<ftype, access::read> inTexture[[texture
     }
     outTexture.write(ftype4(osum, 0.0, 0.0, 0.0), gid.xy, 0);
 }
+
+// Mean over the whole width x height plane of one array slice.
+//
+// Every thread that passes the bounds check reads all texels of slice gid.z of
+// inTexture, averages them component-wise, and stores the result at texel
+// (0, 0) of slice gid.z of outTexture.
+// NOTE(review): the write target is always (0, 0), so the output is presumably
+// 1x1 per slice - confirm against the host-side dispatch.
+kernel void reduce_mean_hw(texture2d_array<ftype, access::read> inTexture[[texture(0)]],
+    texture2d_array<ftype, access::write> outTexture[[texture(1)]],
+    uint3 gid[[thread_position_in_grid]]) {
+    if (gid.x >= outTexture.get_width() || gid.y >= outTexture.get_height() ||
+        gid.z >= outTexture.get_array_size())
+        return;
+
+    // Accumulate in float regardless of ftype: a half4 running sum overflows
+    // once it exceeds 65504 and drops low-order bits well before that.
+    float4 omean = float4(0.0f);
+    uint iH = inTexture.get_height();
+    uint iW = inTexture.get_width();
+    for (uint i = 0; i < iW; ++i) {
+        for (uint j = 0; j < iH; ++j) {
+            omean += float4(inTexture.read(uint2(i, j), gid.z));
+        }
+    }
+    // Two successive divisions keep the full-precision build bit-identical to
+    // the previous formulation.
+    omean = omean / float(iW) / float(iH);
+    // Narrow to the texture's element type only at the very end.
+    outTexture.write(ftype4(omean), uint2(0, 0), gid.z);
+}
+
+// Component-wise maximum over the whole width x height plane of one array slice.
+//
+// Every thread that passes the bounds check reads all texels of slice gid.z of
+// inTexture and stores their maximum at texel (0, 0) of slice gid.z of
+// outTexture.
+// NOTE(review): the write target is always (0, 0), so the output is presumably
+// 1x1 per slice - confirm against the host-side dispatch.
+kernel void reduce_max_hw(texture2d_array<ftype, access::read> inTexture[[texture(0)]],
+    texture2d_array<ftype, access::write> outTexture[[texture(1)]],
+    uint3 gid[[thread_position_in_grid]]) {
+    if (gid.x >= outTexture.get_width() || gid.y >= outTexture.get_height() ||
+        gid.z >= outTexture.get_array_size())
+        return;
+
+    // Seed with the most negative finite value. FLT_MIN / HALF_MIN are the
+    // smallest *positive* normalized values, so seeding with them would return
+    // a tiny positive number for any slice whose values all lie below it.
+#if LITE_WITH_METAL_FULL
+    float4 omax = -FLT_MAX;
+#else
+    half4 omax = -HALF_MAX;
+#endif
+    uint iH = inTexture.get_height();
+    uint iW = inTexture.get_width();
+    for (uint i = 0; i < iW; ++i) {
+        for (uint j = 0; j < iH; ++j) {
+            ftype4 in = inTexture.read(uint2(i, j), gid.z);
+            omax = max(omax, in);
+        }
+    }
+    outTexture.write(omax, uint2(0, 0), gid.z);
+}
+
+// Component-wise minimum over the whole width x height plane of one array slice.
+//
+// Every thread that passes the bounds check reads all texels of slice gid.z of
+// inTexture and stores their minimum at texel (0, 0) of slice gid.z of
+// outTexture.
+kernel void reduce_min_hw(texture2d_array<ftype, access::read> inTexture[[texture(0)]],
+    texture2d_array<ftype, access::write> outTexture[[texture(1)]],
+    uint3 gid[[thread_position_in_grid]]) {
+    // Only threads that fall inside the output extents do any work.
+    const bool inside = gid.x < outTexture.get_width() && gid.y < outTexture.get_height() &&
+                        gid.z < outTexture.get_array_size();
+    if (!inside) {
+        return;
+    }
+
+    // Start from the largest finite value so that any texel can lower it.
+#if LITE_WITH_METAL_FULL
+    float4 lowest = FLT_MAX;
+#else
+    half4 lowest = HALF_MAX;
+#endif
+    const uint slice = gid.z;
+    const uint width = inTexture.get_width();
+    const uint height = inTexture.get_height();
+    // x is the outer index and y the inner one, matching the read order of the
+    // sibling *_hw kernels.
+    for (uint x = 0; x < width; x++) {
+        uint y = 0;
+        while (y < height) {
+            lowest = min(lowest, inTexture.read(uint2(x, y), slice));
+            ++y;
+        }
+    }
+    outTexture.write(lowest, uint2(0, 0), slice);
+}
+
+// Component-wise sum over the whole width x height plane of one array slice.
+//
+// Every thread that passes the bounds check reads all texels of slice gid.z of
+// inTexture and stores their sum at texel (0, 0) of slice gid.z of outTexture.
+// NOTE(review): the write target is always (0, 0), so the output is presumably
+// 1x1 per slice - confirm against the host-side dispatch.
+kernel void reduce_sum_hw(texture2d_array<ftype, access::read> inTexture[[texture(0)]],
+    texture2d_array<ftype, access::write> outTexture[[texture(1)]],
+    uint3 gid[[thread_position_in_grid]]) {
+    if (gid.x >= outTexture.get_width() || gid.y >= outTexture.get_height() ||
+        gid.z >= outTexture.get_array_size())
+        return;
+
+    // Accumulate in float regardless of ftype: a half4 running sum overflows
+    // once it exceeds 65504 and drops low-order bits well before that, even
+    // when the final total would still be representable.
+    float4 osum = float4(0.0f);
+    uint iH = inTexture.get_height();
+    uint iW = inTexture.get_width();
+    for (uint i = 0; i < iW; ++i) {
+        for (uint j = 0; j < iH; ++j) {
+            osum += float4(inTexture.read(uint2(i, j), gid.z));
+        }
+    }
+    // Narrow to the texture's element type only at the very end.
+    outTexture.write(ftype4(osum), uint2(0, 0), gid.z);
+}
@@ -46,11 +46,9 @@
 
     if (@available(iOS 11.3, macOS 10.13.4, macCatalyst 13.0, *)) {
         if (metal_context_->use_mps()) {
-            should_use_mps = true;
+            if (input_buffer_->tensor_dim_[1] >= 4 && param.dim.size() == 1) should_use_mps = true;
         }
     }
-    if (input_buffer_->tensor_dim_[1] < 4) should_use_mps = false;
-
     use_mps_ = should_use_mps;
     if (use_mps_) {
         if (reduce_type_ == ("reduce_max")) {
@@ -107,19 +105,13 @@
 
     // only support reduce_max by channel
     if (param.dim.size() == 1 && param.dim[0] == 1 && param.keep_dim == true && irank == 4) {
+        function_name_ = reduce_type_ + "_c";
+    } else if (param.dim.size() == 2 && param.dim[0] == 2 && param.dim[1] == 3) {
+        function_name_ = reduce_type_ + "_hw";
     } else {
         LOG(FATAL) << "reduce: only support max by channel";
     }
 
-    if (reduce_type_ == ("reduce_max")) {
-        function_name_ = "reduce_max_c";
-    } else if (reduce_type_ == ("reduce_min")) {
-        function_name_ = "reduce_min_c";
-    } else if (reduce_type_ == ("reduce_mean")) {
-        function_name_ = "reduce_mean_c";
-    } else if (reduce_type_ == ("reduce_sum")) {
-        function_name_ = "reduce_sum_c";
-    }
     // pipline
     auto backend = (__bridge MetalContextImp*)metal_context_->backend();
     pipline_ = [backend pipline:function_name_];