From 3696cbba16f9c45d3571dd3eaa9c146081094c3d Mon Sep 17 00:00:00 2001
From: Konstantin Gindemit <konstantin.gindemit@gmail.com>
Date: Mon, 10 Jul 2023 22:30:25 +0200
Subject: [PATCH] ~ Replaced the pixman_composite_over_n_8888_asm_neon function

---
 dependency/pixman/pixman-arm-intrisics.cpp | 65 +++++++++++++++++------
 1 file changed, 48 insertions(+), 17 deletions(-)

diff --git a/dependency/pixman/pixman-arm-intrisics.cpp b/dependency/pixman/pixman-arm-intrisics.cpp
index ad67776..3d4666a 100644
--- a/dependency/pixman/pixman-arm-intrisics.cpp
+++ b/dependency/pixman/pixman-arm-intrisics.cpp
@@ -27,27 +27,58 @@ extern "C" void pixman_composite_src_n_8888_asm_neon(int32_t w, int32_t h,
+// Solid-color OVER compositing onto a 32-bit-per-pixel destination.
+//
+// Every byte lane of every destination pixel becomes
+//     saturate(color_lane + dst_lane * (255 - alpha) / 255)
+// where alpha is the top byte of color and the division by 255 is the exact
+// rounding form (t + 128 + ((t + 128) >> 8)) >> 8.
+//
+//   w, h        size of the rectangle in pixels; nothing is written if either
+//               is <= 0
+//   dst         first pixel of the top row
+//   dst_stride  distance between row starts in uint32_t units
+//   color       source pixel; presumably premultiplied a8r8g8b8 -- confirm
+//               against the caller
 extern "C" void pixman_composite_over_n_8888_asm_neon(int32_t w, int32_t h,
                                                       uint32_t *dst,
                                                       int32_t   dst_stride,
-                                                      uint32_t  src)
+                                                      uint32_t  color)
 {
-    // Extract the source alpha and replicate it to all 16 lanes of a NEON vector
-    uint8x16_t v_src_alpha = vdupq_n_u8(src >> 24);
-    // Extract the source color and replicate it to all 16 lanes of a NEON vector
-    uint8x16_t v_src_color = vdupq_n_u8(src);
+    // (255 - source alpha) in every byte lane: the weight applied to dst.
+    const uint8x8_t inv_alpha = vdup_n_u8((uint8_t)(255 - (color >> 24)));
+    // Source pixel repeated four times, viewed as bytes so that each channel
+    // occupies its own lane.
+    const uint8x16_t color_x4 = vreinterpretq_u8_u32(vdupq_n_u32(color));
+    const uint8x8_t color_x2 = vget_low_u8(color_x4);
 
     for (int32_t y = 0; y < h; y++)
     {
-        for (int32_t x = 0; x < w; x += 16)  // Changed to 16 to match uint8x16_t
-        {
-            // Load 16 destination pixels
-            uint8x16_t v_dst_color = vld1q_u8((uint8_t *)(dst + x));
-
-            // Calculate the result color = source color * source alpha + destination color * (1 - source alpha)
-            // Note that we need to shift right by 8 because the alpha blending operation can result in values greater than 255
-            uint8x16_t v_res_color = vshrq_n_u8(vmlaq_u8(vmlsq_u8(v_dst_color, v_dst_color, v_src_alpha), v_src_color, v_src_alpha), 8);
-
-            // Store the result to memory
-            vst1q_u8((uint8_t *)(dst + x), v_res_color);
-        }
+        int32_t x = 0;
+
+        // Four pixels (16 byte lanes) per iteration; the bound guarantees no
+        // access past the last pixel of the row.
+        for (; w - x >= 4; x += 4)
+        {
+            const uint8x16_t d = vld1q_u8((const uint8_t *)(dst + x));
+            // Widen to 16 bits so that lane * inv_alpha cannot overflow.
+            const uint16x8_t lo = vmull_u8(vget_low_u8(d), inv_alpha);
+            const uint16x8_t hi = vmull_u8(vget_high_u8(d), inv_alpha);
+            // Exact rounding division by 255, narrowed back to 8 bits.
+            const uint8x16_t scaled =
+                vcombine_u8(vraddhn_u16(lo, vrshrq_n_u16(lo, 8)),
+                            vraddhn_u16(hi, vrshrq_n_u16(hi, 8)));
+            // Saturating add keeps a channel from wrapping past 255.
+            vst1q_u8((uint8_t *)(dst + x), vqaddq_u8(color_x4, scaled));
+        }
+
+        // Remaining 0-3 pixels, one at a time, using the same arithmetic.
+        for (; x < w; x++)
+        {
+            const uint8x8_t d = vreinterpret_u8_u32(vdup_n_u32(dst[x]));
+            const uint16x8_t t = vmull_u8(d, inv_alpha);
+            const uint8x8_t scaled = vraddhn_u16(t, vrshrq_n_u16(t, 8));
+            dst[x] = vget_lane_u32(
+                vreinterpret_u32_u8(vqadd_u8(color_x2, scaled)), 0);
+        }
+
         dst += dst_stride;
     }
 }
\ No newline at end of file