Skip to content

Commit

Permalink
~ Replaced the pixman_composite_over_n_8888_asm_neon function
Browse files Browse the repository at this point in the history
  • Loading branch information
gindemit committed Jul 10, 2023
1 parent 972ab85 commit 3696cbb
Showing 1 changed file with 29 additions and 19 deletions.
48 changes: 29 additions & 19 deletions dependency/pixman/pixman-arm-intrisics.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,27 +27,37 @@ extern "C" void pixman_composite_src_n_8888_asm_neon(int32_t w, int32_t h,
/*
 * Blend one premultiplied ARGB colour over a single 8888 pixel.
 *
 * Each of the four 8-bit channels is handled independently:
 *     out = color + round(dst * ialpha / 255), saturated to 255.
 * The rounding uses the exact identity  t = x * a + 128;
 * (t + (t >> 8)) >> 8, so no channel can carry into its neighbour.
 */
static inline uint32_t over_n_8888_pixel(uint32_t color, uint32_t ialpha,
                                         uint32_t d)
{
    uint32_t out = 0;

    for (int shift = 0; shift < 32; shift += 8)
    {
        uint32_t t = ((d >> shift) & 0xFFu) * ialpha + 128u;
        uint32_t c = ((t + (t >> 8)) >> 8) + ((color >> shift) & 0xFFu);

        if (c > 255u)
            c = 255u; // saturate rather than wrap on non-premultiplied input
        out |= c << shift;
    }
    return out;
}

/*
 * Composite a solid colour over a w x h rectangle of 32-bit pixels (OVER).
 *
 * w, h        rectangle size in pixels; nothing is done if either is <= 0.
 * dst         pointer to the first pixel of the first row.
 * dst_stride  distance between the starts of consecutive rows, in pixels
 *             (uint32_t units). Pixels between w and dst_stride are never
 *             read or written.
 * color       source colour, alpha in the top byte. The formula assumes the
 *             colour channels are premultiplied by alpha.
 *
 * Four pixels are processed per iteration when NEON is available; leftover
 * pixels of each row (and every pixel on non-NEON builds) take the scalar
 * path, which yields bit-identical results.
 */
#ifdef __cplusplus
extern "C"
#endif
void pixman_composite_over_n_8888_asm_neon(int32_t w, int32_t h,
                                           uint32_t *dst,
                                           int32_t dst_stride,
                                           uint32_t color)
{
    if (w <= 0 || h <= 0)
        return;

    // Weight applied to the destination: 255 - source alpha.
    const uint32_t ialpha = 255u - (color >> 24);

#if defined(__ARM_NEON) || defined(__ARM_NEON__)
    // Byte views of the colour (repeated for 4 pixels) and of the weight.
    const uint8x16_t v_color = vreinterpretq_u8_u32(vdupq_n_u32(color));
    const uint8x8_t v_ialpha = vdup_n_u8((uint8_t)ialpha);
#endif

    for (int32_t y = 0; y < h; y++)
    {
        uint32_t *row = dst + (int64_t)y * dst_stride;
        int32_t x = 0;

#if defined(__ARM_NEON) || defined(__ARM_NEON__)
        // Only full groups of 4 pixels, so the row end is never overrun.
        for (; x <= w - 4; x += 4)
        {
            uint8x16_t d = vreinterpretq_u8_u32(vld1q_u32(row + x));

            // Widen to 16 bits so channel * ialpha cannot overflow.
            uint16x8_t lo = vmull_u8(vget_low_u8(d), v_ialpha);
            uint16x8_t hi = vmull_u8(vget_high_u8(d), v_ialpha);

            // Exact rounded division by 255: (t + ((t + 128) >> 8) + 128) >> 8.
            uint8x8_t rlo = vraddhn_u16(lo, vrshrq_n_u16(lo, 8));
            uint8x8_t rhi = vraddhn_u16(hi, vrshrq_n_u16(hi, 8));

            // Saturating add of the source colour.
            uint8x16_t res = vqaddq_u8(vcombine_u8(rlo, rhi), v_color);
            vst1q_u32(row + x, vreinterpretq_u32_u8(res));
        }
#endif

        // Remaining 0..3 pixels of the row (or the whole row without NEON).
        for (; x < w; x++)
            row[x] = over_n_8888_pixel(color, ialpha, row[x]);
    }
}

0 comments on commit 3696cbb

Please sign in to comment.