Skip to content

Commit

Permalink
Comments pass for alpha_opaque_dst blitter
Browse files Browse the repository at this point in the history
  • Loading branch information
Starbuck5 committed Jan 4, 2024
1 parent 8ee4f9b commit 1f1ebf0
Showing 1 changed file with 31 additions and 43 deletions.
74 changes: 31 additions & 43 deletions src_c/simd_blitters_sse2.c
Original file line number Diff line number Diff line change
Expand Up @@ -503,8 +503,8 @@ alphablit_alpha_sse2_argb_no_surf_alpha_opaque_dst(SDL_BlitInfo *info)
Uint32 *srcp32 = (Uint32 *)info->s_pixels;
Uint32 *dstp32 = (Uint32 *)info->d_pixels;

int pre_4_width = width % 4;
int post_4_width = width / 4;
int pxl_excess = width % 4;
int n_iters_4 = width / 4;

__m128i src1, dst1, sub_dst, mm_src_alpha;
__m128i unpacked_alpha, pixels_src, pixels_dst, batch_a_dst;
Expand All @@ -518,42 +518,30 @@ alphablit_alpha_sse2_argb_no_surf_alpha_opaque_dst(SDL_BlitInfo *info)

LOOP_UNROLLED4(
{
/*
* 4 pixel preparations
*/
/* ==== load 4 pixels into SSE registers ==== */

/* src(ARGB) -> pixels_src (ARGBARGBARGBARGB) */
/*[AR][GB][AR][GB][AR][GB][AR][GB] -> pixels_src*/
pixels_src = _mm_loadu_si128(srcp128);

/* isolate alpha channels
* A1000A2000A3000A4000 -> mm_src_alpha */
* [A10][00 ][A20][00 ][A30][00 ][A40][00 ] -> mm_src_alpha*/
mm_src_alpha = _mm_andnot_si128(mm_rgb_mask, pixels_src);

/* shift right to position alpha channels for manipulation
* 0A1000A2000A3000A400 -> mm_src_alpha*/
* [0A1][00 ][0A2][00 ][0A3][00 ][0A4][00 ] -> mm_src_alpha*/
mm_src_alpha = _mm_srli_si128(mm_src_alpha, 1);

/* dst(ARGB) -> pixels_dst (ARGBARGBARGBARGB) */
/*[AR][GB][AR][GB][AR][GB][AR][GB] -> pixels_dst*/
pixels_dst = _mm_loadu_si128(dstp128);

/*
* BATCH A (the 2 low pixels)
*/
/* ==== BATCH A (the 2 low pixels) ==== */

/* shuffle alpha channels to duplicate 16 bit pairs
* shuffle (3, 3, 1, 1) (backed 2 bit numbers)
* [00 ][00 ][00 ][00 ][0A3][0A3][0A4][0A4] -> mm_src_alpha
* [ 7 ][ 6 ][ 5 ][ 4 ][ 3 ][ 2 ][ 1 ][ 0 ]
* Therefore the previous contents of 16 bit lane 1
* Goes into 16 bit lanes 0 and 1, and the previous
* content of 16 bit lane 3 goes into lanes 2 and 3*/
* [00 ][00 ][00 ][00 ][0A3][0A3][0A4][0A4] -> mm_src_alpha*/
unpacked_alpha = _mm_shufflelo_epi16(mm_src_alpha, 0b11110101);

/* finally move into final config
* spread out so they can be multiplied in 16 bit math
* against all RGBA of both pixels being blit
* [0A3][0A3][0A3][0A3][0A4][0A4][0A4][0A4] -> unpacked_alpha
*/
/* spread alpha into final config for 16 bit math
* [0A3][0A3][0A3][0A3][0A4][0A4][0A4][0A4] -> unpacked_alpha*/
unpacked_alpha =
_mm_unpacklo_epi16(unpacked_alpha, unpacked_alpha);

Expand All @@ -567,58 +555,58 @@ alphablit_alpha_sse2_argb_no_surf_alpha_opaque_dst(SDL_BlitInfo *info)

batch_a_dst = sub_dst;

/*
* BATCH B (the 2 high pixels)
*/
/* ==== BATCH B (the 2 high pixels) ==== */

/*[00 ][00 ][00 ][00 ][0A1][0A1][0A2][0A2] -> unpacked_alpha*/
unpacked_alpha = _mm_shufflehi_epi16(mm_src_alpha, 0b11110101);

/*[0A1][0A1][0A1][0A1][0A2][0A2][0A2][0A2] -> unpacked_alpha*/
unpacked_alpha =
_mm_unpackhi_epi16(unpacked_alpha, unpacked_alpha);

/* 0A0R0G0B0A0R0G0B -> src1 */
/*[0A][0R][0G][0B][0A][0R][0G][0B] -> src1*/
src1 = _mm_unpackhi_epi8(pixels_src, mm_zero);

/* 0A0R0G0B0A0R0G0B -> dst1 */
/*[0A][0R][0G][0B][0A][0R][0G][0B] -> dst1*/
dst1 = _mm_unpackhi_epi8(pixels_dst, mm_zero);

ARGB_NO_SURF_ALPHA_OPAQUE_DST_PROCEDURE

/*
* Combine the batches and store
* pack everything back into a pixel with zeroed out alpha
*/
/* ==== combine batches and store ==== */

sub_dst = _mm_packus_epi16(batch_a_dst, sub_dst);
/* zero out alpha */
sub_dst = _mm_and_si128(sub_dst, mm_rgb_mask);
_mm_storeu_si128(dstp128, sub_dst);

srcp128++;
dstp128++;
},
n, post_4_width);
n, n_iters_4);

srcp32 = (Uint32 *)srcp128;
dstp32 = (Uint32 *)dstp128;

for (int i = 0; i < pre_4_width; i++) {
/* Do the actual blend */
/* src(ARGB) -> src1 (000000000000ARGB) */
for (int i = 0; i < pxl_excess; i++) {
/*[00][00][00][00][00][00][AR][GB] -> src1*/
src1 = _mm_cvtsi32_si128(*srcp32);
/* src1 >> ashift -> mm_src_alpha(000000000000000A) */

/*[00][00][00][00][00][00][00][0A] -> mm_src_alpha*/
mm_src_alpha = _mm_srli_si128(src1, 3);

/* Then Calc RGB */
/* 0000000000000A0A -> rgb_src_alpha */
/*[00][00][00][00][00][00][0A][0A] -> mm_src_alpha*/
mm_src_alpha = _mm_unpacklo_epi16(mm_src_alpha, mm_src_alpha);
/* 000000000A0A0A0A -> rgb_src_alpha */

/*[00][00][00][00][0A][0A][0A][0A] -> mm_src_alpha*/
unpacked_alpha = _mm_unpacklo_epi32(mm_src_alpha, mm_src_alpha);

/* 000000000A0R0G0B -> src1 */
/*[00][00][00][00][0A][0R][0G][0B] -> src1*/
src1 = _mm_unpacklo_epi8(src1, mm_zero);

/* dst(ARGB) -> dst1 (000000000000ARGB) */
/*[00][00][00][00][00][00][AR][GB] -> dst1*/
dst1 = _mm_cvtsi32_si128(*dstp32);
/* 000000000A0R0G0B -> dst1 */

/*[00][00][00][00][0A][0R][0G][0B] -> dst1*/
dst1 = _mm_unpacklo_epi8(dst1, mm_zero);

ARGB_NO_SURF_ALPHA_OPAQUE_DST_PROCEDURE
Expand Down

0 comments on commit 1f1ebf0

Please sign in to comment.