From 3696cbba16f9c45d3571dd3eaa9c146081094c3d Mon Sep 17 00:00:00 2001
From: Konstantin Gindemit
Date: Mon, 10 Jul 2023 22:30:25 +0200
Subject: [PATCH] ~ Replaced the pixman_composite_over_n_8888_asm_neon function

---
Review notes (not part of the commit message):
 - rows are walked with dst_stride again instead of treating dst as one
   contiguous w * h run
 - the vector loop only takes whole groups of 4 pixels, so it no longer runs
   past the end of the buffer and the tail loop no longer blends pixels twice
 - blending is done per 8-bit channel with a rounded divide by 255 and a
   saturating add, instead of multiplying the packed 32-bit pixel

 dependency/pixman/pixman-arm-intrisics.cpp | 60 ++++++++++++++++------
 1 file changed, 43 insertions(+), 17 deletions(-)

diff --git a/dependency/pixman/pixman-arm-intrisics.cpp b/dependency/pixman/pixman-arm-intrisics.cpp
index ad67776..3d4666a 100644
--- a/dependency/pixman/pixman-arm-intrisics.cpp
+++ b/dependency/pixman/pixman-arm-intrisics.cpp
@@ -27,27 +27,53 @@ extern "C" void pixman_composite_src_n_8888_asm_neon(int32_t w, int32_t h,
 
 extern "C" void pixman_composite_over_n_8888_asm_neon(int32_t w, int32_t h,
                                                       uint32_t *dst, int32_t dst_stride,
-                                                      uint32_t src)
+                                                      uint32_t color)
 {
-    // Extract the source alpha and replicate it to all 16 lanes of a NEON vector
-    uint8x16_t v_src_alpha = vdupq_n_u8(src >> 24);
-    // Extract the source color and replicate it to all 16 lanes of a NEON vector
-    uint8x16_t v_src_color = vdupq_n_u8(src);
+    // Solid-colour OVER: every 8-bit channel of every pixel becomes
+    //     dst = color + dst * (255 - alpha) / 255
+    // where alpha is the top byte of `color`. `color` is added unscaled, so
+    // it is assumed to be premultiplied by its alpha - TODO confirm with callers.
+    const uint32_t ialpha = 255u - (color >> 24);
+    const uint8x8_t v_ialpha = vdup_n_u8((uint8_t)ialpha);
+    const uint8x16_t v_color = vreinterpretq_u8_u32(vdupq_n_u32(color));
 
+    // Rows are dst_stride pixels apart, which may be more than w.
     for (int32_t y = 0; y < h; y++)
     {
-        for (int32_t x = 0; x < w; x += 16) // Changed to 16 to match uint8x16_t
-        {
-            // Load 16 destination pixels
-            uint8x16_t v_dst_color = vld1q_u8((uint8_t *)(dst + x));
-
-            // Calculate the result color = source color * source alpha + destination color * (1 - source alpha)
-            // Note that we need to shift right by 8 because the alpha blending operation can result in values greater than 255
-            uint8x16_t v_res_color = vshrq_n_u8(vmlaq_u8(vmlsq_u8(v_dst_color, v_dst_color, v_src_alpha), v_src_color, v_src_alpha), 8);
-
-            // Store the result to memory
-            vst1q_u8((uint8_t *)(dst + x), v_res_color);
-        }
+        int32_t x = 0;
+
+        // Four pixels (16 channels) at a time. Only whole groups are taken,
+        // so no load or store reaches past the end of the row.
+        for (; x <= w - 4; x += 4)
+        {
+            uint8x16_t v_dst = vreinterpretq_u8_u32(vld1q_u32(dst + x));
+
+            // Widen to 16 bits so channel * ialpha cannot overflow or leak
+            // into the neighbouring channel.
+            uint16x8_t lo = vmull_u8(vget_low_u8(v_dst), v_ialpha);
+            uint16x8_t hi = vmull_u8(vget_high_u8(v_dst), v_ialpha);
+
+            // Rounded division by 255: (t + ((t + 128) >> 8) + 128) >> 8.
+            uint8x16_t v_scaled = vcombine_u8(vraddhn_u16(lo, vrshrq_n_u16(lo, 8)),
+                                              vraddhn_u16(hi, vrshrq_n_u16(hi, 8)));
+
+            // Saturating add, so a channel sum above 255 clamps instead of wrapping.
+            vst1q_u32(dst + x, vreinterpretq_u32_u8(vqaddq_u8(v_color, v_scaled)));
+        }
+
+        // Remaining 0-3 pixels of the row: same arithmetic, one channel at a time.
+        for (; x < w; x++)
+        {
+            uint32_t out = 0;
+            for (int shift = 0; shift < 32; shift += 8)
+            {
+                uint32_t t = ((dst[x] >> shift) & 0xFFu) * ialpha + 128u;
+                uint32_t c = ((color >> shift) & 0xFFu) + ((t + (t >> 8)) >> 8);
+                out |= (c > 255u ? 255u : c) << shift;
+            }
+            dst[x] = out;
+        }
+
         dst += dst_stride;
     }
 }
\ No newline at end of file