Comments pass for alpha_opaque_dst blitter

pygame-community · Jan 4, 2024 · 1f1ebf0 · 1f1ebf0
1 parent 8ee4f9b
commit 1f1ebf0
Showing 1 changed file with 31 additions and 43 deletions.
diff --git a/src_c/simd_blitters_sse2.c b/src_c/simd_blitters_sse2.c
@@ -503,8 +503,8 @@ alphablit_alpha_sse2_argb_no_surf_alpha_opaque_dst(SDL_BlitInfo *info)
     Uint32 *srcp32 = (Uint32 *)info->s_pixels;
     Uint32 *dstp32 = (Uint32 *)info->d_pixels;
 
-    int pre_4_width = width % 4;
-    int post_4_width = width / 4;
+    int pxl_excess = width % 4;
+    int n_iters_4 = width / 4;
 
     __m128i src1, dst1, sub_dst, mm_src_alpha;
     __m128i unpacked_alpha, pixels_src, pixels_dst, batch_a_dst;
@@ -518,42 +518,30 @@ alphablit_alpha_sse2_argb_no_surf_alpha_opaque_dst(SDL_BlitInfo *info)
 
         LOOP_UNROLLED4(
             {
-                /*
-                 * 4 pixel preparations
-                 */
+                /* ==== load 4 pixels into SSE registers ==== */
 
-                /* src(ARGB) -> pixels_src (ARGBARGBARGBARGB) */
+                /*[AR][GB][AR][GB][AR][GB][AR][GB] -> pixels_src*/
                 pixels_src = _mm_loadu_si128(srcp128);
 
                 /* isolate alpha channels
-                 * A1000A2000A3000A4000 -> mm_src_alpha */
+                 * [A10][00 ][A20][00 ][A30][00 ][A40][00 ] -> mm_src_alpha*/
                 mm_src_alpha = _mm_andnot_si128(mm_rgb_mask, pixels_src);
 
                 /* shift right to position alpha channels for manipulation
-                 * 0A1000A2000A3000A400 -> mm_src_alpha*/
+                 * [0A1][00 ][0A2][00 ][0A3][00 ][0A4][00 ] -> mm_src_alpha*/
                 mm_src_alpha = _mm_srli_si128(mm_src_alpha, 1);
 
-                /* dst(ARGB) -> pixels_dst (ARGBARGBARGBARGB) */
+                /*[AR][GB][AR][GB][AR][GB][AR][GB] -> pixels_dst*/
                 pixels_dst = _mm_loadu_si128(dstp128);
 
-                /*
-                 * BATCH A (the 2 low pixels)
-                 */
+                /* ==== BATCH A (the 2 low pixels) ==== */
 
                 /* shuffle alpha channels to duplicate 16 bit pairs
-                 * shuffle (3, 3, 1, 1) (backed 2 bit numbers)
-                 * [00 ][00 ][00 ][00 ][0A3][0A3][0A4][0A4] -> mm_src_alpha
-                 * [ 7 ][ 6 ][ 5 ][ 4 ][ 3 ][ 2 ][ 1 ][ 0 ]
-                 * Therefore the previous contents of 16 bit lane 1
-                 * Goes into 16 bit lanes 0 and 1, and the previous
-                 * content of 16 bit lane 3 goes into lanes 2 and 3*/
+                 * [00 ][00 ][00 ][00 ][0A3][0A3][0A4][0A4] -> mm_src_alpha*/
                 unpacked_alpha = _mm_shufflelo_epi16(mm_src_alpha, 0b11110101);
 
-                /* finally move into final config
-                 * spread out so they can be multiplied in 16 bit math
-                 * against all RGBA of both pixels being blit
-                 * [0A3][0A3][0A3][0A3][0A4][0A4][0A4][0A4] -> unpacked_alpha
-                 */
+                /* spread alpha into final config for 16 bit math
+                 * [0A3][0A3][0A3][0A3][0A4][0A4][0A4][0A4] -> unpacked_alpha*/
                 unpacked_alpha =
                     _mm_unpacklo_epi16(unpacked_alpha, unpacked_alpha);
 
@@ -567,58 +555,58 @@ alphablit_alpha_sse2_argb_no_surf_alpha_opaque_dst(SDL_BlitInfo *info)
 
                 batch_a_dst = sub_dst;
 
-                /*
-                 * BATCH B (the 2 high pixels)
-                 */
+                /* ==== BATCH B (the 2 high pixels) ==== */
 
+                /*[00 ][00 ][00 ][00 ][0A1][0A1][0A2][0A2] -> unpacked_alpha*/
                 unpacked_alpha = _mm_shufflehi_epi16(mm_src_alpha, 0b11110101);
 
+                /*[0A1][0A1][0A1][0A1][0A2][0A2][0A2][0A2] -> unpacked_alpha*/
                 unpacked_alpha =
                     _mm_unpackhi_epi16(unpacked_alpha, unpacked_alpha);
 
-                /* 0A0R0G0B0A0R0G0B -> src1 */
+                /*[0A][0R][0G][0B][0A][0R][0G][0B] -> src1*/
                 src1 = _mm_unpackhi_epi8(pixels_src, mm_zero);
 
-                /* 0A0R0G0B0A0R0G0B -> dst1 */
+                /*[0A][0R][0G][0B][0A][0R][0G][0B] -> dst1*/
                 dst1 = _mm_unpackhi_epi8(pixels_dst, mm_zero);
 
                 ARGB_NO_SURF_ALPHA_OPAQUE_DST_PROCEDURE
 
-                /*
-                 * Combine the batches and store
-                 * pack everything back into a pixel with zeroed out alpha
-                 */
+                /* ==== combine batches and store ==== */
+
                 sub_dst = _mm_packus_epi16(batch_a_dst, sub_dst);
+                /* zero out alpha */
                 sub_dst = _mm_and_si128(sub_dst, mm_rgb_mask);
                 _mm_storeu_si128(dstp128, sub_dst);
 
                 srcp128++;
                 dstp128++;
             },
-            n, post_4_width);
+            n, n_iters_4);
 
         srcp32 = (Uint32 *)srcp128;
         dstp32 = (Uint32 *)dstp128;
 
-        for (int i = 0; i < pre_4_width; i++) {
-            /* Do the actual blend */
-            /* src(ARGB) -> src1 (000000000000ARGB) */
+        for (int i = 0; i < pxl_excess; i++) {
+            /*[00][00][00][00][00][00][AR][GB] -> src1*/
             src1 = _mm_cvtsi32_si128(*srcp32);
-            /* src1 >> ashift -> mm_src_alpha(000000000000000A) */
+
+            /*[00][00][00][00][00][00][00][0A] -> mm_src_alpha*/
             mm_src_alpha = _mm_srli_si128(src1, 3);
 
-            /* Then Calc RGB */
-            /* 0000000000000A0A -> rgb_src_alpha */
+            /*[00][00][00][00][00][00][0A][0A] -> mm_src_alpha*/
             mm_src_alpha = _mm_unpacklo_epi16(mm_src_alpha, mm_src_alpha);
-            /* 000000000A0A0A0A -> rgb_src_alpha */
+
+            /*[00][00][00][00][0A][0A][0A][0A] -> mm_src_alpha*/
             unpacked_alpha = _mm_unpacklo_epi32(mm_src_alpha, mm_src_alpha);
 
-            /* 000000000A0R0G0B -> src1 */
+            /*[00][00][00][00][0A][0R][0G][0B] -> src1*/
             src1 = _mm_unpacklo_epi8(src1, mm_zero);
 
-            /* dst(ARGB) -> dst1 (000000000000ARGB) */
+            /*[00][00][00][00][00][00][AR][GB] -> dst1*/
             dst1 = _mm_cvtsi32_si128(*dstp32);
-            /* 000000000A0R0G0B -> dst1 */
+
+            /*[00][00][00][00][0A][0R][0G][0B] -> dst1*/
             dst1 = _mm_unpacklo_epi8(dst1, mm_zero);
 
             ARGB_NO_SURF_ALPHA_OPAQUE_DST_PROCEDURE