From 6c9fb785680a91c32c06d9e6337c95d24fa39816 Mon Sep 17 00:00:00 2001
From: Dan Lawrence <danintheshed@gmail.com>
Date: Fri, 1 Sep 2023 14:51:16 +0100
Subject: [PATCH 01/15] Add SIMD versions of greyscale transform

---
 src_c/simd_transform.h      |   4 +
 src_c/simd_transform_avx2.c | 169 ++++++++++++++++++++++++++++++++++++
 src_c/simd_transform_sse2.c |  77 ++++++++++++++++
 src_c/transform.c           |  62 +++++++++----
 4 files changed, 295 insertions(+), 17 deletions(-)

diff --git a/src_c/simd_transform.h b/src_c/simd_transform.h
index d97a0d5b22..ef740e8b94 100644
--- a/src_c/simd_transform.h
+++ b/src_c/simd_transform.h
@@ -9,6 +9,10 @@
 
 // SSE2 functions
 #if defined(__SSE2__) || defined(PG_ENABLE_ARM_NEON)
+void
+grayscale_sse2(SDL_Surface *src, SDL_Surface *newsurf);
 #endif /* (defined(__SSE2__) || defined(PG_ENABLE_ARM_NEON)) */
 
 // AVX2 functions
+void
+grayscale_avx2(SDL_Surface *src, SDL_Surface *newsurf);
diff --git a/src_c/simd_transform_avx2.c b/src_c/simd_transform_avx2.c
index f1889f1bb7..f98c369d01 100644
--- a/src_c/simd_transform_avx2.c
+++ b/src_c/simd_transform_avx2.c
@@ -42,3 +42,172 @@ pg_avx2_at_runtime_but_uncompiled()
     }
     return 0;
 }
+
+#if defined(__AVX2__) && defined(HAVE_IMMINTRIN_H) && \
+    !defined(SDL_DISABLE_IMMINTRIN_H)
+void
+grayscale_avx2(SDL_Surface *src, SDL_Surface *newsurf)
+{
+    // Current AVX2 process
+    // ------------------
+    // - pre loop: Load weights into register x8
+    // - in loop:
+    //     1. Load 8 pixels into register
+    //     2. remove the alpha channel for every pixel and save it.
+    //     3. multiply weights by pixels using standard shuffle to 2x 16bit
+    //        register, mul + 255 then left shift. See multiply blitter mode
+    //        for this operation in isolation.
+    //     4. pack pixels back together from A & B while adding with a
+    //        horizontal add (e.g. adds A+R and G+B in a ARGB layout)
+    //     5. shift and add to make final grey pixel colour in 0th
+    //        8Bit channel in each 'pixel'
+    //     6. shuffle again to push the grey from the 0th channel into every
+    //        channel of every pixel.
+    //     7. add the alpha channel back in.
+
+    // Things to fix:
+    //     1. Would be nice to only use AVX2 stuff for the single pixel stuff.
+    //     2. Get inspiration from Starbuck's AVX2 macros
+
+    int s_row_skip = (src->pitch - src->w * src->format->BytesPerPixel) >> 2;
+
+    // generate number of batches of pixels we need to loop through
+    int pixel_batch_length = src->w * src->h;
+    int num_batches = 1;
+    if (s_row_skip > 0) {
+        pixel_batch_length = src->w;
+        num_batches = src->h;
+    }
+
+    int remaining_pixels = pixel_batch_length % 8;
+    int perfect_8_pixels = (pixel_batch_length - remaining_pixels) / 8;
+
+    int perfect_8_pixels_batch_counter = perfect_8_pixels;
+    int remaining_pixels_batch_counter = remaining_pixels;
+
+    Uint32 *srcp = (Uint32 *)src->pixels;
+    Uint32 *dstp = (Uint32 *)newsurf->pixels;
+
+    Uint32 rgbmask =
+        (src->format->Rmask | src->format->Gmask | src->format->Bmask);
+    Uint32 amask = ~rgbmask;
+
+    int rgb_weights =
+        ((0x4C << src->format->Rshift) | (0x96 << src->format->Gshift) |
+         (0x1D << src->format->Bshift));
+
+    __m256i *srcp256 = (__m256i *)src->pixels;
+    __m256i *dstp256 = (__m256i *)newsurf->pixels;
+
+    __m128i mm_src, mm_dst, mm_alpha, mm_zero, mm_two_five_fives,
+        mm_rgb_weights, mm_alpha_mask, mm_rgb_mask;
+    __m256i mm256_src, mm256_srcA, mm256_srcB, mm256_dst, mm256_dstA,
+        mm256_dstB, mm256_shuff_mask_A, mm256_shuff_mask_B,
+        mm256_two_five_fives, mm256_rgb_weights, mm256_shuff_mask_gray,
+        mm256_alpha, mm256_rgb_mask, mm256_alpha_mask;
+
+    mm256_shuff_mask_A =
+        _mm256_set_epi8(0x80, 23, 0x80, 22, 0x80, 21, 0x80, 20, 0x80, 19, 0x80,
+                        18, 0x80, 17, 0x80, 16, 0x80, 7, 0x80, 6, 0x80, 5,
+                        0x80, 4, 0x80, 3, 0x80, 2, 0x80, 1, 0x80, 0);
+    mm256_shuff_mask_B =
+        _mm256_set_epi8(0x80, 31, 0x80, 30, 0x80, 29, 0x80, 28, 0x80, 27, 0x80,
+                        26, 0x80, 25, 0x80, 24, 0x80, 15, 0x80, 14, 0x80, 13,
+                        0x80, 12, 0x80, 11, 0x80, 10, 0x80, 9, 0x80, 8);
+
+    mm256_shuff_mask_gray = _mm256_set_epi8(
+        28, 28, 28, 28, 24, 24, 24, 24, 20, 20, 20, 20, 16, 16, 16, 16, 12, 12,
+        12, 12, 8, 8, 8, 8, 4, 4, 4, 4, 0, 0, 0, 0);
+
+    mm_zero = _mm_setzero_si128();
+    mm_alpha_mask = _mm_cvtsi32_si128(amask);
+    mm_rgb_mask = _mm_cvtsi32_si128(rgbmask);
+    mm_two_five_fives = _mm_set_epi64x(0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF);
+    mm_rgb_weights =
+        _mm_unpacklo_epi8(_mm_cvtsi32_si128(rgb_weights), mm_zero);
+
+    mm256_two_five_fives = _mm256_set1_epi16(0x00FF);
+    mm256_rgb_weights = _mm256_set1_epi32(rgb_weights);
+    mm256_rgb_mask = _mm256_set1_epi32(rgbmask);
+    mm256_alpha_mask = _mm256_set1_epi32(amask);
+
+    while (num_batches--) {
+        perfect_8_pixels_batch_counter = perfect_8_pixels;
+        remaining_pixels_batch_counter = remaining_pixels;
+        while (perfect_8_pixels_batch_counter--) {
+            mm256_src = _mm256_loadu_si256(srcp256);
+            mm256_alpha = _mm256_subs_epu8(mm256_src, mm256_rgb_mask);
+
+            mm256_srcA = _mm256_shuffle_epi8(mm256_src, mm256_shuff_mask_A);
+            mm256_srcB = _mm256_shuffle_epi8(mm256_src, mm256_shuff_mask_B);
+
+            mm256_dstA =
+                _mm256_shuffle_epi8(mm256_rgb_weights, mm256_shuff_mask_A);
+            mm256_dstB =
+                _mm256_shuffle_epi8(mm256_rgb_weights, mm256_shuff_mask_B);
+
+            mm256_dstA = _mm256_mullo_epi16(mm256_srcA, mm256_dstA);
+            mm256_dstA = _mm256_add_epi16(mm256_dstA, mm256_two_five_fives);
+            mm256_dstA = _mm256_srli_epi16(mm256_dstA, 8);
+
+            mm256_dstB = _mm256_mullo_epi16(mm256_srcB, mm256_dstB);
+            mm256_dstB = _mm256_add_epi16(mm256_dstB, mm256_two_five_fives);
+            mm256_dstB = _mm256_srli_epi16(mm256_dstB, 8);
+
+            mm256_dst = _mm256_hadd_epi16(mm256_dstA, mm256_dstB);
+            mm256_dst =
+                _mm256_add_epi16(mm256_dst, _mm256_srli_epi32(mm256_dst, 16));
+            mm256_dst = _mm256_shuffle_epi8(mm256_dst, mm256_shuff_mask_gray);
+
+            mm256_dst = _mm256_subs_epu8(mm256_dst, mm256_alpha_mask);
+            mm256_dst = _mm256_adds_epu8(mm256_dst, mm256_alpha);
+
+            _mm256_storeu_si256(dstp256, mm256_dst);
+
+            srcp256++;
+            dstp256++;
+        }
+        srcp = (Uint32 *)srcp256;
+        dstp = (Uint32 *)dstp256;
+        while (remaining_pixels_batch_counter--) {
+            mm_src = _mm_cvtsi32_si128(*srcp);
+            /*mm_src = 0x000000000000000000000000AARRGGBB*/
+            mm_alpha = _mm_subs_epu8(mm_src, mm_rgb_mask);
+            /*mm_src = 0x00000000000000000000000000RRGGBB*/
+            mm_src = _mm_unpacklo_epi8(mm_src, mm_zero);
+            /*mm_src = 0x0000000000000000000000RR00GG00BB*/
+
+            mm_dst = _mm_mullo_epi16(mm_src, mm_rgb_weights);
+            /*mm_dst = 0x00000000000000000000RRRRGGGGBBBB*/
+            mm_dst = _mm_add_epi16(mm_dst, mm_two_five_fives);
+            /*mm_dst = 0x00000000000000000000RRRRGGGGBBBB*/
+            mm_dst = _mm_srli_epi16(mm_dst, 8);
+            /*mm_dst = 0x0000000000000000000000RR00GG00BB*/
+
+            mm_dst = _mm_hadd_epi16(mm_dst, mm_dst);  // This requires SSE3
+            mm_dst = _mm_shufflelo_epi16(_mm_hadd_epi16(mm_dst, mm_dst),
+                                         _MM_SHUFFLE(0, 0, 0, 0));
+            /*mm_dst = 0x000000000000000000GrGr00GrGr00GrGr00GrGr*/
+
+            mm_dst = _mm_packus_epi16(mm_dst, mm_dst);
+            /*mm_dst = 0x000000000000000000000000GrGrGrGrGrGrGrGr*/
+            mm_dst = _mm_subs_epu8(mm_dst, mm_alpha_mask);
+            mm_dst = _mm_add_epi16(mm_dst, mm_alpha);
+            /*mm_dst = 0x000000000000000000000000AAGrGrGrGrGrGr*/
+            *dstp = _mm_cvtsi128_si32(mm_dst);
+            /*dstp = 0xAARRGGBB*/
+            srcp++;
+            dstp++;
+        }
+        srcp += s_row_skip;
+        srcp256 = (__m256i *)srcp;
+    }
+}
+#else
+void
+grayscale_avx2(SDL_Surface *src, SDL_Surface *newsurf)
+{
+    BAD_AVX2_FUNCTION_CALL;
+}
+#endif /* defined(__AVX2__) && defined(HAVE_IMMINTRIN_H) && \
+          !defined(SDL_DISABLE_IMMINTRIN_H) */
diff --git a/src_c/simd_transform_sse2.c b/src_c/simd_transform_sse2.c
index 8f503b964f..fadd0f615c 100644
--- a/src_c/simd_transform_sse2.c
+++ b/src_c/simd_transform_sse2.c
@@ -34,3 +34,80 @@ pg_neon_at_runtime_but_uncompiled()
     }
     return 0;
 }
+
+#if (defined(__SSE2__) || defined(PG_ENABLE_ARM_NEON))
+void
+grayscale_sse2(SDL_Surface *src, SDL_Surface *newsurf)
+{
+    int s_row_skip = (src->pitch - src->w * src->format->BytesPerPixel) >> 2;
+
+    // generate number of batches of pixels we need to loop through
+    int pixel_batch_length = src->w * src->h;
+    int num_batches = 1;
+    if (s_row_skip > 0) {
+        pixel_batch_length = src->w;
+        num_batches = src->h;
+    }
+    int pixel_batch_counter = pixel_batch_length;
+
+    Uint32 *srcp = (Uint32 *)src->pixels;
+    Uint32 *dstp = (Uint32 *)newsurf->pixels;
+
+    Uint32 rgbmask =
+        (src->format->Rmask | src->format->Gmask | src->format->Bmask);
+    Uint32 amask = ~rgbmask;
+
+    int rgb_weights =
+        ((0x4C << src->format->Rshift) | (0x96 << src->format->Gshift) |
+         (0x1D << src->format->Bshift));
+
+    __m128i mm_src, mm_dst, mm_alpha, mm_zero, mm_two_five_fives,
+        mm_rgb_weights, mm_alpha_mask, mm_rgb_mask;
+
+    mm_zero = _mm_setzero_si128();
+    mm_alpha_mask = _mm_cvtsi32_si128(amask);
+    mm_rgb_mask = _mm_cvtsi32_si128(rgbmask);
+    mm_two_five_fives = _mm_set_epi64x(0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF);
+    mm_rgb_weights =
+        _mm_unpacklo_epi8(_mm_cvtsi32_si128(rgb_weights), mm_zero);
+
+    while (num_batches--) {
+        pixel_batch_counter = pixel_batch_length;
+        while (pixel_batch_counter--) {
+            mm_src = _mm_cvtsi32_si128(*srcp);
+            /*mm_src = 0x000000000000000000000000AARRGGBB*/
+            mm_alpha = _mm_subs_epu8(mm_src, mm_rgb_mask);
+            /*mm_src = 0x00000000000000000000000000RRGGBB*/
+            mm_src = _mm_unpacklo_epi8(mm_src, mm_zero);
+            /*mm_src = 0x0000000000000000000000RR00GG00BB*/
+
+            mm_dst = _mm_mullo_epi16(mm_src, mm_rgb_weights);
+            /*mm_dst = 0x00000000000000000000RRRRGGGGBBBB*/
+            mm_dst = _mm_add_epi16(mm_dst, mm_two_five_fives);
+            /*mm_dst = 0x00000000000000000000RRRRGGGGBBBB*/
+            mm_dst = _mm_srli_epi16(mm_dst, 8);
+            /*mm_dst = 0x0000000000000000000000RR00GG00BB*/
+
+            mm_dst = _mm_adds_epu8(
+                _mm_adds_epu8(
+                    _mm_shufflelo_epi16(mm_dst, _MM_SHUFFLE(0, 0, 0, 0)),
+                    _mm_shufflelo_epi16(mm_dst, _MM_SHUFFLE(1, 1, 1, 1))),
+                _mm_adds_epu8(
+                    _mm_shufflelo_epi16(mm_dst, _MM_SHUFFLE(2, 2, 2, 2)),
+                    _mm_shufflelo_epi16(mm_dst, _MM_SHUFFLE(3, 3, 3, 3))));
+            /*mm_dst = 0x000000000000000000GrGr00GrGr00GrGr00GrGr*/
+
+            mm_dst = _mm_packus_epi16(mm_dst, mm_dst);
+            /*mm_dst = 0x000000000000000000000000GrGrGrGrGrGrGrGr*/
+            mm_dst = _mm_subs_epu8(mm_dst, mm_alpha_mask);
+            mm_dst = _mm_adds_epu8(mm_dst, mm_alpha);
+            /*mm_dst = 0x000000000000000000000000AAGrGrGrGrGrGr*/
+            *dstp = _mm_cvtsi128_si32(mm_dst);
+            /*dstp = 0xAARRGGBB*/
+            srcp++;
+            dstp++;
+        }
+        srcp += s_row_skip;
+    }
+}
+#endif /* __SSE2__ || PG_ENABLE_ARM_NEON*/
diff --git a/src_c/transform.c b/src_c/transform.c
index 95f791d3bb..915c7d4dd0 100644
--- a/src_c/transform.c
+++ b/src_c/transform.c
@@ -2032,6 +2032,32 @@ clamp_4
 
 #endif
 
+void
+grayscale_non_simd(SDL_Surface *src, SDL_Surface *newsurf)
+{
+    int x, y;
+    for (y = 0; y < src->h; y++) {
+        for (x = 0; x < src->w; x++) {
+            Uint32 pixel;
+            Uint8 *pix;
+            SURF_GET_AT(pixel, src, x, y, (Uint8 *)src->pixels, src->format,
+                        pix);
+            Uint8 r, g, b, a;
+            SDL_GetRGBA(pixel, src->format, &r, &g, &b, &a);
+
+            // RGBA to GRAY formula used by OpenCV
+            Uint8 grayscale_pixel =
+                (Uint8)((((76 * r) + 255) >> 8) + (((150 * g) + 255) >> 8) +
+                        (((29 * b) + 255) >> 8));
+            Uint32 new_pixel =
+                SDL_MapRGBA(newsurf->format, grayscale_pixel, grayscale_pixel,
+                            grayscale_pixel, a);
+            SURF_SET_AT(new_pixel, newsurf, x, y, (Uint8 *)newsurf->pixels,
+                        newsurf->format, pix);
+        }
+    }
+}
+
 SDL_Surface *
 grayscale(pgSurfaceObject *srcobj, pgSurfaceObject *dstobj)
 {
@@ -2059,24 +2085,26 @@ grayscale(pgSurfaceObject *srcobj, pgSurfaceObject *dstobj)
             "Source and destination surfaces need the same format."));
     }
 
-    int x, y;
-    for (y = 0; y < src->h; y++) {
-        for (x = 0; x < src->w; x++) {
-            Uint32 pixel;
-            Uint8 *pix;
-            SURF_GET_AT(pixel, src, x, y, (Uint8 *)src->pixels, src->format,
-                        pix);
-            Uint8 r, g, b, a;
-            SDL_GetRGBA(pixel, src->format, &r, &g, &b, &a);
-
-            // RGBA to GRAY formula used by OpenCV
-            Uint8 grayscale_pixel = (Uint8)(0.299 * r + 0.587 * g + 0.114 * b);
-            Uint32 new_pixel =
-                SDL_MapRGBA(newsurf->format, grayscale_pixel, grayscale_pixel,
-                            grayscale_pixel, a);
-            SURF_SET_AT(new_pixel, newsurf, x, y, (Uint8 *)newsurf->pixels,
-                        newsurf->format, pix);
+    if (src->format->BytesPerPixel == 4 &&
+        src->format->Rmask == newsurf->format->Rmask &&
+        src->format->Gmask == newsurf->format->Gmask &&
+        src->format->Bmask == newsurf->format->Bmask &&
+        (src->pitch % src->format->BytesPerPixel == 0) &&
+        (newsurf->pitch == (newsurf->w * newsurf->format->BytesPerPixel))) {
+        if (pg_has_avx2()) {
+            grayscale_avx2(src, newsurf);
         }
+#if defined(__SSE2__) || defined(PG_ENABLE_ARM_NEON)
+        if (pg_HasSSE_NEON()) {
+            grayscale_sse2(src, newsurf);
+        }
+#endif  // defined(__SSE2__) || defined(PG_ENABLE_ARM_NEON)
+        else {
+            grayscale_non_simd(src, newsurf);
+        }
+    }
+    else {
+        grayscale_non_simd(src, newsurf);
     }
 
     SDL_UnlockSurface(newsurf);

From dda7b5e63b2f2a67ef515f351286231c9f3e79a4 Mon Sep 17 00:00:00 2001
From: Dan Lawrence <danintheshed@gmail.com>
Date: Fri, 1 Sep 2023 15:08:01 +0100
Subject: [PATCH 02/15] Update transform test to new formula

---
 test/transform_test.py | 44 ++++++++++++++++++++++++++++++++----------
 1 file changed, 34 insertions(+), 10 deletions(-)

diff --git a/test/transform_test.py b/test/transform_test.py
index af1c6109d1..087b1e5044 100644
--- a/test/transform_test.py
+++ b/test/transform_test.py
@@ -174,10 +174,24 @@ def test_grayscale(self):
         s = pygame.Surface((32, 32))
         s.fill((255, 0, 0))
 
-        s2 = pygame.transform.grayscale(s)
-        self.assertEqual(pygame.transform.average_color(s2)[0], 76)
-        self.assertEqual(pygame.transform.average_color(s2)[1], 76)
-        self.assertEqual(pygame.transform.average_color(s2)[2], 76)
+        gray_red = pygame.transform.grayscale(s)
+        self.assertEqual(pygame.transform.average_color(gray_red)[0], 76)
+        self.assertEqual(pygame.transform.average_color(gray_red)[1], 76)
+        self.assertEqual(pygame.transform.average_color(gray_red)[2], 76)
+
+        green_surf = pygame.Surface((32, 32))
+        green_surf.fill((0, 255, 0))
+        gray_green = pygame.transform.grayscale(green_surf)
+        self.assertEqual(pygame.transform.average_color(gray_green)[0], 150)
+        self.assertEqual(pygame.transform.average_color(gray_green)[1], 150)
+        self.assertEqual(pygame.transform.average_color(gray_green)[2], 150)
+
+        blue_surf = pygame.Surface((32, 32))
+        blue_surf.fill((0, 0, 255))
+        blue_green = pygame.transform.grayscale(blue_surf)
+        self.assertEqual(pygame.transform.average_color(blue_green)[0], 29)
+        self.assertEqual(pygame.transform.average_color(blue_green)[1], 29)
+        self.assertEqual(pygame.transform.average_color(blue_green)[2], 29)
 
         dest = pygame.Surface((32, 32), depth=32)
         pygame.transform.grayscale(s, dest)
@@ -188,16 +202,16 @@ def test_grayscale(self):
         dest = pygame.Surface((32, 32), depth=32)
         s.fill((34, 12, 65))
         pygame.transform.grayscale(s, dest)
-        self.assertEqual(pygame.transform.average_color(dest)[0], 24)
-        self.assertEqual(pygame.transform.average_color(dest)[1], 24)
-        self.assertEqual(pygame.transform.average_color(dest)[2], 24)
+        self.assertEqual(pygame.transform.average_color(dest)[0], 27)
+        self.assertEqual(pygame.transform.average_color(dest)[1], 27)
+        self.assertEqual(pygame.transform.average_color(dest)[2], 27)
 
         dest = pygame.Surface((32, 32), depth=32)
         s.fill((123, 123, 123))
         pygame.transform.grayscale(s, dest)
-        self.assertIn(pygame.transform.average_color(dest)[0], [123, 122])
-        self.assertIn(pygame.transform.average_color(dest)[1], [123, 122])
-        self.assertIn(pygame.transform.average_color(dest)[2], [123, 122])
+        self.assertIn(pygame.transform.average_color(dest)[0], [124, 122])
+        self.assertIn(pygame.transform.average_color(dest)[1], [124, 122])
+        self.assertIn(pygame.transform.average_color(dest)[2], [124, 122])
 
         s = pygame.Surface((32, 32), depth=24)
         s.fill((255, 0, 0))
@@ -215,6 +229,16 @@ def test_grayscale(self):
         self.assertEqual(pygame.transform.average_color(dest)[1], 76)
         self.assertEqual(pygame.transform.average_color(dest)[2], 72)
 
+        super_surf = pygame.Surface((64, 64), depth=32)
+        super_surf.fill((255, 255, 255))
+        super_surf.fill((255, 0, 0), pygame.Rect(0, 0, 32, 32))
+        sub_surf = super_surf.subsurface(pygame.Rect(0, 0, 32, 32))
+
+        grey_sub_surf = pygame.transform.grayscale(sub_surf)
+        self.assertEqual(pygame.transform.average_color(grey_sub_surf)[0], 76)
+        self.assertEqual(pygame.transform.average_color(grey_sub_surf)[0], 76)
+        self.assertEqual(pygame.transform.average_color(grey_sub_surf)[0], 76)
+
     def test_threshold__honors_third_surface(self):
         # __doc__ for threshold as of Tue 07/15/2008
 

From 25f653c1b61bbe750998553d460c2a1ca098b569 Mon Sep 17 00:00:00 2001
From: Dan Lawrence <danintheshed@gmail.com>
Date: Sun, 10 Sep 2023 16:22:31 +0100
Subject: [PATCH 03/15] Switch to using AVX only in AVX version, add comments

---
 src_c/simd_transform.h      |  15 +++++
 src_c/simd_transform_avx2.c | 111 +++++++++++++++++++-----------------
 src_c/simd_transform_sse2.c |  50 +++++++++++++---
 src_c/transform.c           |   6 +-
 4 files changed, 123 insertions(+), 59 deletions(-)

diff --git a/src_c/simd_transform.h b/src_c/simd_transform.h
index ef740e8b94..8940e86429 100644
--- a/src_c/simd_transform.h
+++ b/src_c/simd_transform.h
@@ -1,6 +1,21 @@
 #define NO_PYGAME_C_API
 #include "_surface.h"
 
+/**
+ * MACRO borrowed from SSE2NEON - useful for making the shuffling family of
+ * intrinsics easier to understand by indicating clearly what will go where.
+ *
+ * SSE2Neon description follows...
+ * MACRO for shuffle parameter for _mm_shuffle_ps().
+ * Argument fp3 is a digit[0123] that represents the fp from argument "b"
+ * of mm_shuffle_ps that will be placed in fp3 of result. fp2 is the same
+ * for fp2 in result. fp1 is a digit[0123] that represents the fp from
+ * argument "a" of mm_shuffle_ps that will be places in fp1 of result.
+ * fp0 is the same for fp0 of result.
+ */
+#define _PG_SIMD_SHUFFLE(fp3, fp2, fp1, fp0) \
+    (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0)))
+
 #if !defined(PG_ENABLE_ARM_NEON) && defined(__aarch64__)
 // arm64 has neon optimisations enabled by default, even when fpu=neon is not
 // passed
diff --git a/src_c/simd_transform_avx2.c b/src_c/simd_transform_avx2.c
index f98c369d01..b60b4f9fe7 100644
--- a/src_c/simd_transform_avx2.c
+++ b/src_c/simd_transform_avx2.c
@@ -48,27 +48,24 @@ pg_avx2_at_runtime_but_uncompiled()
 void
 grayscale_avx2(SDL_Surface *src, SDL_Surface *newsurf)
 {
-    // Current AVX2 process
-    // ------------------
-    // - pre loop: Load weights into register x8
-    // - in loop:
-    //     1. Load 8 pixels into register
-    //     2. remove the alpha channel for every pixel and save it.
-    //     3. multiply weights by pixels using standard shuffle to 2x 16bit
-    //        register, mul + 255 then left shift. See multiply blitter mode
-    //        for this operation in isolation.
-    //     4. pack pixels back together from A & B while adding with a
-    //        horizontal add (e.g. adds A+R and G+B in a ARGB layout)
-    //     5. shift and add to make final grey pixel colour in 0th
-    //        8Bit channel in each 'pixel'
-    //     6. shuffle again to push the grey from the 0th channel into every
-    //        channel of every pixel.
-    //     7. add the alpha channel back in.
-
-    // Things to fix:
-    //     1. Would be nice to only use AVX2 stuff for the single pixel stuff.
-    //     2. Get inspiration from Starbuck's AVX2 macros
-
+    /* See the SSE2 code for a simpler overview of this algorithm
+     * Current AVX2 process
+     * ------------------
+     * - pre loop: Load weights into register x8
+     * - in loop:
+     *     1. Load 8 pixels into register
+     *     2. remove the alpha channel for every pixel and save it.
+     *     3. multiply weights by pixels using standard shuffle to 2x 16bit
+     *        register, mul + 255 then left shift. See multiply blitter mode
+     *        for this operation in isolation.
+     *     4. pack pixels back together from A & B while adding with a
+     *        horizontal add (e.g. adds A+R and G+B in a ARGB layout)
+     *     5. shift and add to make final grey pixel colour in 0th
+     *        8Bit channel in each 'pixel'
+     *     6. shuffle again to push the grey from the 0th channel into every
+     *        channel of every pixel.
+     *     7. add the alpha channel back in.
+     */
     int s_row_skip = (src->pitch - src->w * src->format->BytesPerPixel) >> 2;
 
     // generate number of batches of pixels we need to loop through
@@ -122,7 +119,7 @@ grayscale_avx2(SDL_Surface *src, SDL_Surface *newsurf)
     mm_zero = _mm_setzero_si128();
     mm_alpha_mask = _mm_cvtsi32_si128(amask);
     mm_rgb_mask = _mm_cvtsi32_si128(rgbmask);
-    mm_two_five_fives = _mm_set_epi64x(0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF);
+    mm_two_five_fives = _mm_set1_epi64x(0x00FF00FF00FF00FF);
     mm_rgb_weights =
         _mm_unpacklo_epi8(_mm_cvtsi32_si128(rgb_weights), mm_zero);
 
@@ -131,6 +128,15 @@ grayscale_avx2(SDL_Surface *src, SDL_Surface *newsurf)
     mm256_rgb_mask = _mm256_set1_epi32(rgbmask);
     mm256_alpha_mask = _mm256_set1_epi32(amask);
 
+    __m256i _partial8_mask =
+        _mm256_set_epi32(0x00, (remaining_pixels > 6) ? 0x80000000 : 0x00,
+                         (remaining_pixels > 5) ? 0x80000000 : 0x00,
+                         (remaining_pixels > 4) ? 0x80000000 : 0x00,
+                         (remaining_pixels > 3) ? 0x80000000 : 0x00,
+                         (remaining_pixels > 2) ? 0x80000000 : 0x00,
+                         (remaining_pixels > 1) ? 0x80000000 : 0x00,
+                         (remaining_pixels > 0) ? 0x80000000 : 0x00);
+
     while (num_batches--) {
         perfect_8_pixels_batch_counter = perfect_8_pixels;
         remaining_pixels_batch_counter = remaining_pixels;
@@ -169,35 +175,38 @@ grayscale_avx2(SDL_Surface *src, SDL_Surface *newsurf)
         }
         srcp = (Uint32 *)srcp256;
         dstp = (Uint32 *)dstp256;
-        while (remaining_pixels_batch_counter--) {
-            mm_src = _mm_cvtsi32_si128(*srcp);
-            /*mm_src = 0x000000000000000000000000AARRGGBB*/
-            mm_alpha = _mm_subs_epu8(mm_src, mm_rgb_mask);
-            /*mm_src = 0x00000000000000000000000000RRGGBB*/
-            mm_src = _mm_unpacklo_epi8(mm_src, mm_zero);
-            /*mm_src = 0x0000000000000000000000RR00GG00BB*/
-
-            mm_dst = _mm_mullo_epi16(mm_src, mm_rgb_weights);
-            /*mm_dst = 0x00000000000000000000RRRRGGGGBBBB*/
-            mm_dst = _mm_add_epi16(mm_dst, mm_two_five_fives);
-            /*mm_dst = 0x00000000000000000000RRRRGGGGBBBB*/
-            mm_dst = _mm_srli_epi16(mm_dst, 8);
-            /*mm_dst = 0x0000000000000000000000RR00GG00BB*/
-
-            mm_dst = _mm_hadd_epi16(mm_dst, mm_dst);  // This requires SSE3
-            mm_dst = _mm_shufflelo_epi16(_mm_hadd_epi16(mm_dst, mm_dst),
-                                         _MM_SHUFFLE(0, 0, 0, 0));
-            /*mm_dst = 0x000000000000000000GrGr00GrGr00GrGr00GrGr*/
-
-            mm_dst = _mm_packus_epi16(mm_dst, mm_dst);
-            /*mm_dst = 0x000000000000000000000000GrGrGrGrGrGrGrGr*/
-            mm_dst = _mm_subs_epu8(mm_dst, mm_alpha_mask);
-            mm_dst = _mm_add_epi16(mm_dst, mm_alpha);
-            /*mm_dst = 0x000000000000000000000000AAGrGrGrGrGrGr*/
-            *dstp = _mm_cvtsi128_si32(mm_dst);
-            /*dstp = 0xAARRGGBB*/
-            srcp++;
-            dstp++;
+        if (remaining_pixels_batch_counter > 0) {
+            mm256_src = _mm256_maskload_epi32((int *)srcp, _partial8_mask);
+            mm256_alpha = _mm256_subs_epu8(mm256_src, mm256_rgb_mask);
+
+            mm256_srcA = _mm256_shuffle_epi8(mm256_src, mm256_shuff_mask_A);
+            mm256_srcB = _mm256_shuffle_epi8(mm256_src, mm256_shuff_mask_B);
+
+            mm256_dstA =
+                _mm256_shuffle_epi8(mm256_rgb_weights, mm256_shuff_mask_A);
+            mm256_dstB =
+                _mm256_shuffle_epi8(mm256_rgb_weights, mm256_shuff_mask_B);
+
+            mm256_dstA = _mm256_mullo_epi16(mm256_srcA, mm256_dstA);
+            mm256_dstA = _mm256_add_epi16(mm256_dstA, mm256_two_five_fives);
+            mm256_dstA = _mm256_srli_epi16(mm256_dstA, 8);
+
+            mm256_dstB = _mm256_mullo_epi16(mm256_srcB, mm256_dstB);
+            mm256_dstB = _mm256_add_epi16(mm256_dstB, mm256_two_five_fives);
+            mm256_dstB = _mm256_srli_epi16(mm256_dstB, 8);
+
+            mm256_dst = _mm256_hadd_epi16(mm256_dstA, mm256_dstB);
+            mm256_dst =
+                _mm256_add_epi16(mm256_dst, _mm256_srli_epi32(mm256_dst, 16));
+            mm256_dst = _mm256_shuffle_epi8(mm256_dst, mm256_shuff_mask_gray);
+
+            mm256_dst = _mm256_subs_epu8(mm256_dst, mm256_alpha_mask);
+            mm256_dst = _mm256_adds_epu8(mm256_dst, mm256_alpha);
+
+            _mm256_maskstore_epi32((int *)dstp, _partial8_mask, mm256_dst);
+
+            srcp += remaining_pixels_batch_counter;
+            dstp += remaining_pixels_batch_counter;
         }
         srcp += s_row_skip;
         srcp256 = (__m256i *)srcp;
diff --git a/src_c/simd_transform_sse2.c b/src_c/simd_transform_sse2.c
index fadd0f615c..383dad2444 100644
--- a/src_c/simd_transform_sse2.c
+++ b/src_c/simd_transform_sse2.c
@@ -39,6 +39,22 @@ pg_neon_at_runtime_but_uncompiled()
 void
 grayscale_sse2(SDL_Surface *src, SDL_Surface *newsurf)
 {
+    /* For the SSE2 SIMD version of grayscale we do one pixel at a time
+     * Thus we can calculate the number of loops (and pixels) by multiplying
+     * the width of the surface to be grayscaled, by the height of that
+     * surface.
+     *
+     * We also need to calculate a 'skip value' in case our surface's rows are
+     * not contiguous in memory. For surfaces, a single row's worth of pixel
+     * data is always contiguous (i.e. each pixel is next to each other).
+     * However, a surface's rows may be seperated from one another in memory,
+     * most commonly this happens with sub surfaces.
+     * The vast majority of surfaces used in applications will probably also
+     * have contiguous rows as that is what happens when you create a standard
+     * 32bit surface with pygame.Surface. SIMD Transform algorithms,
+     * should treat this 'most normal' case as the critical path to maximise
+     * performance.
+     */
     int s_row_skip = (src->pitch - src->w * src->format->BytesPerPixel) >> 2;
 
     // generate number of batches of pixels we need to loop through
@@ -67,7 +83,7 @@ grayscale_sse2(SDL_Surface *src, SDL_Surface *newsurf)
     mm_zero = _mm_setzero_si128();
     mm_alpha_mask = _mm_cvtsi32_si128(amask);
     mm_rgb_mask = _mm_cvtsi32_si128(rgbmask);
-    mm_two_five_fives = _mm_set_epi64x(0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF);
+    mm_two_five_fives = _mm_set1_epi64x(0x00FF00FF00FF00FF);
     mm_rgb_weights =
         _mm_unpacklo_epi8(_mm_cvtsi32_si128(rgb_weights), mm_zero);
 
@@ -76,11 +92,18 @@ grayscale_sse2(SDL_Surface *src, SDL_Surface *newsurf)
         while (pixel_batch_counter--) {
             mm_src = _mm_cvtsi32_si128(*srcp);
             /*mm_src = 0x000000000000000000000000AARRGGBB*/
+            /* First we strip out the alpha so we have one of our 4 channels
+               empty for the rest of the calculation */
             mm_alpha = _mm_subs_epu8(mm_src, mm_rgb_mask);
             /*mm_src = 0x00000000000000000000000000RRGGBB*/
+
+            /* This is where we do the efficient 8bit 'floating point multiply'
+               operation of each channel by the weights - using a 16bit integer
+               multiply, an add and a bitshift. We use this trick repeatedly
+               for multiplication by a 0 to 1 value in SIMD code.
+            */
             mm_src = _mm_unpacklo_epi8(mm_src, mm_zero);
             /*mm_src = 0x0000000000000000000000RR00GG00BB*/
-
             mm_dst = _mm_mullo_epi16(mm_src, mm_rgb_weights);
             /*mm_dst = 0x00000000000000000000RRRRGGGGBBBB*/
             mm_dst = _mm_add_epi16(mm_dst, mm_two_five_fives);
@@ -88,15 +111,28 @@ grayscale_sse2(SDL_Surface *src, SDL_Surface *newsurf)
             mm_dst = _mm_srli_epi16(mm_dst, 8);
             /*mm_dst = 0x0000000000000000000000RR00GG00BB*/
 
+            /* now we have the multiplied channels we 'shuffle them out' one
+             * at a time so there are four copies of red, four copies of green,
+             * four copies of blue etc. Then we add all these together
+             * so each of channels contains R+G+B.
+             */
             mm_dst = _mm_adds_epu8(
                 _mm_adds_epu8(
-                    _mm_shufflelo_epi16(mm_dst, _MM_SHUFFLE(0, 0, 0, 0)),
-                    _mm_shufflelo_epi16(mm_dst, _MM_SHUFFLE(1, 1, 1, 1))),
+                    _mm_shufflelo_epi16(mm_dst, _PG_SIMD_SHUFFLE(0, 0, 0, 0)),
+                    _mm_shufflelo_epi16(mm_dst, _PG_SIMD_SHUFFLE(1, 1, 1, 1))),
                 _mm_adds_epu8(
-                    _mm_shufflelo_epi16(mm_dst, _MM_SHUFFLE(2, 2, 2, 2)),
-                    _mm_shufflelo_epi16(mm_dst, _MM_SHUFFLE(3, 3, 3, 3))));
-            /*mm_dst = 0x000000000000000000GrGr00GrGr00GrGr00GrGr*/
+                    _mm_shufflelo_epi16(mm_dst, _PG_SIMD_SHUFFLE(2, 2, 2, 2)),
+                    _mm_shufflelo_epi16(mm_dst,
+                                        _PG_SIMD_SHUFFLE(3, 3, 3, 3))));
+            /* Gr here stands for 'Gray' as we've now added all the channels
+             * back together after multiplying them above.
+             * mm_dst = 0x000000000000000000GrGr00GrGr00GrGr00GrGr
+             */
 
+            /* The rest is just packing the grayscale back to the original
+             * 8bit pixel layout and adding the alpha we removed earlier back
+             * in again
+             */
             mm_dst = _mm_packus_epi16(mm_dst, mm_dst);
             /*mm_dst = 0x000000000000000000000000GrGrGrGrGrGrGrGr*/
             mm_dst = _mm_subs_epu8(mm_dst, mm_alpha_mask);
diff --git a/src_c/transform.c b/src_c/transform.c
index 915c7d4dd0..de3b862a0f 100644
--- a/src_c/transform.c
+++ b/src_c/transform.c
@@ -2045,7 +2045,11 @@ grayscale_non_simd(SDL_Surface *src, SDL_Surface *newsurf)
             Uint8 r, g, b, a;
             SDL_GetRGBA(pixel, src->format, &r, &g, &b, &a);
 
-            // RGBA to GRAY formula used by OpenCV
+            /* RGBA to GRAY formula used by OpenCV
+             * We are using a bitshift and integer addition to align the
+             * calculation with what is fastest for SIMD operations.
+             * Results are almost identical to floating point multiplication.
+             */
             Uint8 grayscale_pixel =
                 (Uint8)((((76 * r) + 255) >> 8) + (((150 * g) + 255) >> 8) +
                         (((29 * b) + 255) >> 8));

From d3f0651a0b10313a8806dbf03fb9cd553b307a33 Mon Sep 17 00:00:00 2001
From: Dan Lawrence <danintheshed@gmail.com>
Date: Sun, 24 Sep 2023 11:18:23 +0100
Subject: [PATCH 04/15] Clean up unused SSE2 variables from AVX2 code

---
 src_c/simd_transform_avx2.c | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/src_c/simd_transform_avx2.c b/src_c/simd_transform_avx2.c
index b60b4f9fe7..4c809728ac 100644
--- a/src_c/simd_transform_avx2.c
+++ b/src_c/simd_transform_avx2.c
@@ -96,8 +96,6 @@ grayscale_avx2(SDL_Surface *src, SDL_Surface *newsurf)
     __m256i *srcp256 = (__m256i *)src->pixels;
     __m256i *dstp256 = (__m256i *)newsurf->pixels;
 
-    __m128i mm_src, mm_dst, mm_alpha, mm_zero, mm_two_five_fives,
-        mm_rgb_weights, mm_alpha_mask, mm_rgb_mask;
     __m256i mm256_src, mm256_srcA, mm256_srcB, mm256_dst, mm256_dstA,
         mm256_dstB, mm256_shuff_mask_A, mm256_shuff_mask_B,
         mm256_two_five_fives, mm256_rgb_weights, mm256_shuff_mask_gray,
@@ -116,13 +114,6 @@ grayscale_avx2(SDL_Surface *src, SDL_Surface *newsurf)
         28, 28, 28, 28, 24, 24, 24, 24, 20, 20, 20, 20, 16, 16, 16, 16, 12, 12,
         12, 12, 8, 8, 8, 8, 4, 4, 4, 4, 0, 0, 0, 0);
 
-    mm_zero = _mm_setzero_si128();
-    mm_alpha_mask = _mm_cvtsi32_si128(amask);
-    mm_rgb_mask = _mm_cvtsi32_si128(rgbmask);
-    mm_two_five_fives = _mm_set1_epi64x(0x00FF00FF00FF00FF);
-    mm_rgb_weights =
-        _mm_unpacklo_epi8(_mm_cvtsi32_si128(rgb_weights), mm_zero);
-
     mm256_two_five_fives = _mm256_set1_epi16(0x00FF);
     mm256_rgb_weights = _mm256_set1_epi32(rgb_weights);
     mm256_rgb_mask = _mm256_set1_epi32(rgbmask);

From 33db90e0ee1a4340de9cc9a1c9bd7888a7bbf557 Mon Sep 17 00:00:00 2001
From: Dan Lawrence <danintheshed@gmail.com>
Date: Sun, 1 Oct 2023 18:51:53 +0100
Subject: [PATCH 05/15] Add SIMD versions of greyscale transform

A few changes from review.
---
 src_c/simd_transform_avx2.c | 4 ++--
 src_c/transform.c           | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src_c/simd_transform_avx2.c b/src_c/simd_transform_avx2.c
index 4c809728ac..cc4effa907 100644
--- a/src_c/simd_transform_avx2.c
+++ b/src_c/simd_transform_avx2.c
@@ -66,7 +66,7 @@ grayscale_avx2(SDL_Surface *src, SDL_Surface *newsurf)
      *        channel of every pixel.
      *     7. add the alpha channel back in.
      */
-    int s_row_skip = (src->pitch - src->w * src->format->BytesPerPixel) >> 2;
+    int s_row_skip = (src->pitch - src->w * 4) / 4;
 
     // generate number of batches of pixels we need to loop through
     int pixel_batch_length = src->w * src->h;
@@ -77,7 +77,7 @@ grayscale_avx2(SDL_Surface *src, SDL_Surface *newsurf)
     }
 
     int remaining_pixels = pixel_batch_length % 8;
-    int perfect_8_pixels = (pixel_batch_length - remaining_pixels) / 8;
+    int perfect_8_pixels = pixel_batch_length / 8;
 
     int perfect_8_pixels_batch_counter = perfect_8_pixels;
     int remaining_pixels_batch_counter = remaining_pixels;
diff --git a/src_c/transform.c b/src_c/transform.c
index de3b862a0f..8455af6c66 100644
--- a/src_c/transform.c
+++ b/src_c/transform.c
@@ -2094,12 +2094,12 @@ grayscale(pgSurfaceObject *srcobj, pgSurfaceObject *dstobj)
         src->format->Gmask == newsurf->format->Gmask &&
         src->format->Bmask == newsurf->format->Bmask &&
         (src->pitch % src->format->BytesPerPixel == 0) &&
-        (newsurf->pitch == (newsurf->w * newsurf->format->BytesPerPixel))) {
+        (newsurf->pitch == (newsurf->w * 4))) {
         if (pg_has_avx2()) {
             grayscale_avx2(src, newsurf);
         }
 #if defined(__SSE2__) || defined(PG_ENABLE_ARM_NEON)
-        if (pg_HasSSE_NEON()) {
+        else if (pg_HasSSE_NEON()) {
             grayscale_sse2(src, newsurf);
         }
 #endif  // defined(__SSE2__) || defined(PG_ENABLE_ARM_NEON)

From bac4ac020e44f70c04eb745a78f5294c7f80a3e4 Mon Sep 17 00:00:00 2001
From: Dan Lawrence <danintheshed@gmail.com>
Date: Mon, 2 Oct 2023 20:11:17 +0100
Subject: [PATCH 06/15] Upgrade SSE2 to 2 pixels at a time

---
 src_c/simd_transform_sse2.c | 112 ++++++++++++++++++++++++++++++++----
 1 file changed, 102 insertions(+), 10 deletions(-)

diff --git a/src_c/simd_transform_sse2.c b/src_c/simd_transform_sse2.c
index 383dad2444..5c1da408dc 100644
--- a/src_c/simd_transform_sse2.c
+++ b/src_c/simd_transform_sse2.c
@@ -37,7 +37,17 @@ pg_neon_at_runtime_but_uncompiled()
 
 #if (defined(__SSE2__) || defined(PG_ENABLE_ARM_NEON))
 void
-grayscale_sse2(SDL_Surface *src, SDL_Surface *newsurf)
+#if defined(ENV64BIT)
+#define LOAD_64_INTO_M128(num, reg) *reg = _mm_cvtsi64_si128(*num)
+#define STORE_M128_INTO_64(reg, num) *num = _mm_cvtsi128_si64(*reg)
+#else
+#define LOAD_64_INTO_M128(num, reg) \
+    *reg = _mm_loadl_epi64((const __m128i *)num)
+#define STORE_M128_INTO_64(reg, num) _mm_storel_epi64((__m128i *)num, *reg)
+#endif
+
+    void
+    grayscale_sse2(SDL_Surface *src, SDL_Surface *newsurf)
 {
     /* For the SSE2 SIMD version of grayscale we do one pixel at a time
      * Thus we can calculate the number of loops (and pixels) by multiplying
@@ -55,7 +65,7 @@ grayscale_sse2(SDL_Surface *src, SDL_Surface *newsurf)
      * should treat this 'most normal' case as the critical path to maximise
      * performance.
      */
-    int s_row_skip = (src->pitch - src->w * src->format->BytesPerPixel) >> 2;
+    int s_row_skip = (src->pitch - src->w * 4) / 4;
 
     // generate number of batches of pixels we need to loop through
     int pixel_batch_length = src->w * src->h;
@@ -64,7 +74,11 @@ grayscale_sse2(SDL_Surface *src, SDL_Surface *newsurf)
         pixel_batch_length = src->w;
         num_batches = src->h;
     }
-    int pixel_batch_counter = pixel_batch_length;
+    int remaining_pixels = pixel_batch_length % 2;
+    int perfect_2_pixels = pixel_batch_length / 2;
+
+    int perfect_2_pixels_batch_counter = perfect_2_pixels;
+    int remaining_pixels_batch_counter = remaining_pixels;
 
     Uint32 *srcp = (Uint32 *)src->pixels;
     Uint32 *dstp = (Uint32 *)newsurf->pixels;
@@ -73,23 +87,100 @@ grayscale_sse2(SDL_Surface *src, SDL_Surface *newsurf)
         (src->format->Rmask | src->format->Gmask | src->format->Bmask);
     Uint32 amask = ~rgbmask;
 
-    int rgb_weights =
+    Uint64 rgbmask64 = ((Uint64)rgbmask << 32) | rgbmask;
+    Uint64 amask64 = ~rgbmask64;
+
+    Uint64 rgb_weights =
+        ((Uint64)((0x4C << src->format->Rshift) |
+                  (0x96 << src->format->Gshift) |
+                  (0x1D << src->format->Bshift))
+         << 32) |
         ((0x4C << src->format->Rshift) | (0x96 << src->format->Gshift) |
          (0x1D << src->format->Bshift));
 
+    Uint64 *srcp64 = (Uint64 *)src->pixels;
+    Uint64 *dstp64 = (Uint64 *)newsurf->pixels;
+
     __m128i mm_src, mm_dst, mm_alpha, mm_zero, mm_two_five_fives,
         mm_rgb_weights, mm_alpha_mask, mm_rgb_mask;
 
     mm_zero = _mm_setzero_si128();
-    mm_alpha_mask = _mm_cvtsi32_si128(amask);
-    mm_rgb_mask = _mm_cvtsi32_si128(rgbmask);
+    LOAD_64_INTO_M128(&amask64, &mm_alpha_mask);
+    LOAD_64_INTO_M128(&rgbmask64, &mm_rgb_mask);
     mm_two_five_fives = _mm_set1_epi64x(0x00FF00FF00FF00FF);
-    mm_rgb_weights =
-        _mm_unpacklo_epi8(_mm_cvtsi32_si128(rgb_weights), mm_zero);
+
+    LOAD_64_INTO_M128(&rgb_weights, &mm_rgb_weights);
+    mm_rgb_weights = _mm_unpacklo_epi8(mm_rgb_weights, mm_zero);
 
     while (num_batches--) {
-        pixel_batch_counter = pixel_batch_length;
-        while (pixel_batch_counter--) {
+        perfect_2_pixels_batch_counter = perfect_2_pixels;
+        remaining_pixels_batch_counter = remaining_pixels;
+        while (perfect_2_pixels_batch_counter--) {
+            LOAD_64_INTO_M128(srcp64, &mm_src);
+            /*mm_src = 0x0000000000000000AARRGGBBAARRGGBB*/
+            /* First we strip out the alpha so we have one of our 4 channels
+               empty for the rest of the calculation */
+            mm_alpha = _mm_subs_epu8(mm_src, mm_rgb_mask);
+            /*mm_src = 0x000000000000000000RRGGBB00RRGGBB*/
+
+            /* This is where we do the efficient 8bit 'floating point multiply'
+               operation of each channel by the weights - using a 16bit integer
+               multiply, an add and a bitshift. We use this trick repeatedly
+               for multiplication by a 0 to 1 value in SIMD code.
+            */
+            mm_src = _mm_unpacklo_epi8(mm_src, mm_zero);
+            /*mm_src = 0x000000RR00GG00BB000000RR00GG00BB*/
+            mm_dst = _mm_mullo_epi16(mm_src, mm_rgb_weights);
+            /*mm_dst = 0x0000RRRRGGGGBBBB0000RRRRGGGGBBBB*/
+            mm_dst = _mm_add_epi16(mm_dst, mm_two_five_fives);
+            /*mm_dst = 0x0000RRRRGGGGBBBB0000RRRRGGGGBBBB*/
+            mm_dst = _mm_srli_epi16(mm_dst, 8);
+            /*mm_dst = 0x000000RR00GG00BB000000RR00GG00BB*/
+
+            /* now we have the multiplied channels we 'shuffle them out' one
+             * at a time so there are four copies of red, four copies of green,
+             * four copies of blue etc. Then we add all these together
+             * so each of channels contains R+G+B.
+             */
+            mm_dst = _mm_adds_epu8(
+                _mm_adds_epu8(_mm_shufflehi_epi16(
+                                  _mm_shufflelo_epi16(
+                                      mm_dst, _PG_SIMD_SHUFFLE(0, 0, 0, 0)),
+                                  _PG_SIMD_SHUFFLE(0, 0, 0, 0)),
+                              _mm_shufflehi_epi16(
+                                  _mm_shufflelo_epi16(
+                                      mm_dst, _PG_SIMD_SHUFFLE(1, 1, 1, 1)),
+                                  _PG_SIMD_SHUFFLE(1, 1, 1, 1))),
+                _mm_adds_epu8(_mm_shufflehi_epi16(
+                                  _mm_shufflelo_epi16(
+                                      mm_dst, _PG_SIMD_SHUFFLE(2, 2, 2, 2)),
+                                  _PG_SIMD_SHUFFLE(2, 2, 2, 2)),
+                              _mm_shufflehi_epi16(
+                                  _mm_shufflelo_epi16(
+                                      mm_dst, _PG_SIMD_SHUFFLE(3, 3, 3, 3)),
+                                  _PG_SIMD_SHUFFLE(3, 3, 3, 3))));
+            /* Gr here stands for 'Gray' as we've now added all the channels
+             * back together after multiplying them above.
+             * mm_dst = 0x0000GrGr00GrGr00GrGr00GrGr0000GrGr00GrGr00GrGr00GrGr
+             */
+
+            /* The rest is just packing the grayscale back to the original
+             * 8bit pixel layout and adding the alpha we removed earlier back
+             * in again
+             */
+            mm_dst = _mm_packus_epi16(mm_dst, mm_dst);
+            /*mm_dst = 0x000000000000000000GrGrGrGrGrGr00GrGrGrGrGrGr*/
+            mm_dst = _mm_subs_epu8(mm_dst, mm_alpha_mask);
+            mm_dst = _mm_adds_epu8(mm_dst, mm_alpha);
+            /*mm_dst = 0x0000000000000000AAGrGrGrGrGrGrAAGrGrGrGrGrGr*/
+            STORE_M128_INTO_64(&mm_dst, dstp64);
+            /*dstp = 0xAARRGGBB*/
+            srcp64++;
+            dstp64++;
+        }
+        srcp = (Uint32 *)srcp64;
+        dstp = (Uint32 *)dstp64;
+        if (remaining_pixels_batch_counter > 0) {
             mm_src = _mm_cvtsi32_si128(*srcp);
             /*mm_src = 0x000000000000000000000000AARRGGBB*/
             /* First we strip out the alpha so we have one of our 4 channels
@@ -144,6 +235,7 @@ grayscale_sse2(SDL_Surface *src, SDL_Surface *newsurf)
             dstp++;
         }
         srcp += s_row_skip;
+        srcp64 = (Uint64 *)srcp;
     }
 }
 #endif /* __SSE2__ || PG_ENABLE_ARM_NEON*/

From 7d8254bae813ca856c436299d39f8eb18d6ed333 Mon Sep 17 00:00:00 2001
From: Dan Lawrence <danintheshed@gmail.com>
Date: Mon, 2 Oct 2023 20:57:54 +0100
Subject: [PATCH 07/15] Upgrade SSE2 to 2 pixels at a time

---
 src_c/simd_transform_sse2.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/src_c/simd_transform_sse2.c b/src_c/simd_transform_sse2.c
index 5c1da408dc..ec869cab7d 100644
--- a/src_c/simd_transform_sse2.c
+++ b/src_c/simd_transform_sse2.c
@@ -36,7 +36,7 @@ pg_neon_at_runtime_but_uncompiled()
 }
 
 #if (defined(__SSE2__) || defined(PG_ENABLE_ARM_NEON))
-void
+
 #if defined(ENV64BIT)
 #define LOAD_64_INTO_M128(num, reg) *reg = _mm_cvtsi64_si128(*num)
 #define STORE_M128_INTO_64(reg, num) *num = _mm_cvtsi128_si64(*reg)
@@ -46,8 +46,8 @@ void
 #define STORE_M128_INTO_64(reg, num) _mm_storel_epi64((__m128i *)num, *reg)
 #endif
 
-    void
-    grayscale_sse2(SDL_Surface *src, SDL_Surface *newsurf)
+void
+grayscale_sse2(SDL_Surface *src, SDL_Surface *newsurf)
 {
     /* For the SSE2 SIMD version of grayscale we do one pixel at a time
      * Thus we can calculate the number of loops (and pixels) by multiplying
@@ -85,8 +85,6 @@ void
 
     Uint32 rgbmask =
         (src->format->Rmask | src->format->Gmask | src->format->Bmask);
-    Uint32 amask = ~rgbmask;
-
     Uint64 rgbmask64 = ((Uint64)rgbmask << 32) | rgbmask;
     Uint64 amask64 = ~rgbmask64;
 

From 14e0cdabecab0b1135bfb447908ec3e0b46c3c5e Mon Sep 17 00:00:00 2001
From: Dan Lawrence <danintheshed@gmail.com>
Date: Sun, 15 Oct 2023 08:44:42 +0100
Subject: [PATCH 08/15] change src->format->BytesPerPixel to 4

---
 src_c/transform.c | 140 ++++++++++++++++++++++------------------------
 1 file changed, 66 insertions(+), 74 deletions(-)

diff --git a/src_c/transform.c b/src_c/transform.c
index bb04065deb..5d9852d112 100644
--- a/src_c/transform.c
+++ b/src_c/transform.c
@@ -1952,86 +1952,79 @@ clamp_4
 
 */
 
-#define SURF_GET_AT(p_color, p_surf, p_x, p_y, p_pixels, p_format, p_pix)    \
-    switch (p_format->BytesPerPixel) {                                       \
-        case 1:                                                              \
-            p_color = (Uint32) *                                             \
-                      ((Uint8 *)(p_pixels) + (p_y) * p_surf->pitch + (p_x)); \
-            break;                                                           \
-        case 2:                                                              \
-            p_color =                                                        \
-                (Uint32) *                                                   \
-                ((Uint16 *)((p_pixels) + (p_y) * p_surf->pitch) + (p_x));    \
-            break;                                                           \
-        case 3:                                                              \
-            p_pix =                                                          \
-                ((Uint8 *)(p_pixels + (p_y) * p_surf->pitch) + (p_x) * 3);   \
-            p_color = (SDL_BYTEORDER == SDL_LIL_ENDIAN)                      \
-                          ? (p_pix[0]) + (p_pix[1] << 8) + (p_pix[2] << 16)  \
-                          : (p_pix[2]) + (p_pix[1] << 8) + (p_pix[0] << 16); \
-            break;                                                           \
-        default: /* case 4: */                                               \
-            p_color =                                                        \
-                *((Uint32 *)(p_pixels + (p_y) * p_surf->pitch) + (p_x));     \
-            break;                                                           \
+#define SURF_GET_AT(p_color, p_surf, p_x, p_y, p_pixels, p_format, p_pix)     \
+    switch (p_format->BytesPerPixel) {                                        \
+        case 1:                                                               \
+            p_color = (Uint32) *                                              \
+                      ((Uint8 *)(p_pixels) + (p_y)*p_surf->pitch + (p_x));    \
+            break;                                                            \
+        case 2:                                                               \
+            p_color = (Uint32) *                                              \
+                      ((Uint16 *)((p_pixels) + (p_y)*p_surf->pitch) + (p_x)); \
+            break;                                                            \
+        case 3:                                                               \
+            p_pix = ((Uint8 *)(p_pixels + (p_y)*p_surf->pitch) + (p_x)*3);    \
+            p_color = (SDL_BYTEORDER == SDL_LIL_ENDIAN)                       \
+                          ? (p_pix[0]) + (p_pix[1] << 8) + (p_pix[2] << 16)   \
+                          : (p_pix[2]) + (p_pix[1] << 8) + (p_pix[0] << 16);  \
+            break;                                                            \
+        default: /* case 4: */                                                \
+            p_color = *((Uint32 *)(p_pixels + (p_y)*p_surf->pitch) + (p_x));  \
+            break;                                                            \
     }
 
 #if (SDL_BYTEORDER == SDL_LIL_ENDIAN)
 
-#define SURF_SET_AT(p_color, p_surf, p_x, p_y, p_pixels, p_format,       \
-                    p_byte_buf)                                          \
-    switch (p_format->BytesPerPixel) {                                   \
-        case 1:                                                          \
-            *((Uint8 *)p_pixels + (p_y) * p_surf->pitch + (p_x)) =       \
-                (Uint8)p_color;                                          \
-            break;                                                       \
-        case 2:                                                          \
-            *((Uint16 *)(p_pixels + (p_y) * p_surf->pitch) + (p_x)) =    \
-                (Uint16)p_color;                                         \
-            break;                                                       \
-        case 3:                                                          \
-            p_byte_buf =                                                 \
-                (Uint8 *)(p_pixels + (p_y) * p_surf->pitch) + (p_x) * 3; \
-            *(p_byte_buf + (p_format->Rshift >> 3)) =                    \
-                (Uint8)(p_color >> p_format->Rshift);                    \
-            *(p_byte_buf + (p_format->Gshift >> 3)) =                    \
-                (Uint8)(p_color >> p_format->Gshift);                    \
-            *(p_byte_buf + (p_format->Bshift >> 3)) =                    \
-                (Uint8)(p_color >> p_format->Bshift);                    \
-            break;                                                       \
-        default:                                                         \
-            *((Uint32 *)(p_pixels + (p_y) * p_surf->pitch) + (p_x)) =    \
-                p_color;                                                 \
-            break;                                                       \
+#define SURF_SET_AT(p_color, p_surf, p_x, p_y, p_pixels, p_format,            \
+                    p_byte_buf)                                               \
+    switch (p_format->BytesPerPixel) {                                        \
+        case 1:                                                               \
+            *((Uint8 *)p_pixels + (p_y)*p_surf->pitch + (p_x)) =              \
+                (Uint8)p_color;                                               \
+            break;                                                            \
+        case 2:                                                               \
+            *((Uint16 *)(p_pixels + (p_y)*p_surf->pitch) + (p_x)) =           \
+                (Uint16)p_color;                                              \
+            break;                                                            \
+        case 3:                                                               \
+            p_byte_buf = (Uint8 *)(p_pixels + (p_y)*p_surf->pitch) + (p_x)*3; \
+            *(p_byte_buf + (p_format->Rshift >> 3)) =                         \
+                (Uint8)(p_color >> p_format->Rshift);                         \
+            *(p_byte_buf + (p_format->Gshift >> 3)) =                         \
+                (Uint8)(p_color >> p_format->Gshift);                         \
+            *(p_byte_buf + (p_format->Bshift >> 3)) =                         \
+                (Uint8)(p_color >> p_format->Bshift);                         \
+            break;                                                            \
+        default:                                                              \
+            *((Uint32 *)(p_pixels + (p_y)*p_surf->pitch) + (p_x)) = p_color;  \
+            break;                                                            \
     }
 
 #else
 
-#define SURF_SET_AT(p_color, p_surf, p_x, p_y, p_pixels, p_format,       \
-                    p_byte_buf)                                          \
-    switch (p_format->BytesPerPixel) {                                   \
-        case 1:                                                          \
-            *((Uint8 *)p_pixels + (p_y) * p_surf->pitch + (p_x)) =       \
-                (Uint8)p_color;                                          \
-            break;                                                       \
-        case 2:                                                          \
-            *((Uint16 *)(p_pixels + (p_y) * p_surf->pitch) + (p_x)) =    \
-                (Uint16)p_color;                                         \
-            break;                                                       \
-        case 3:                                                          \
-            p_byte_buf =                                                 \
-                (Uint8 *)(p_pixels + (p_y) * p_surf->pitch) + (p_x) * 3; \
-            *(p_byte_buf + 2 - (p_format->Rshift >> 3)) =                \
-                (Uint8)(p_color >> p_format->Rshift);                    \
-            *(p_byte_buf + 2 - (p_format->Gshift >> 3)) =                \
-                (Uint8)(p_color >> p_format->Gshift);                    \
-            *(p_byte_buf + 2 - (p_format->Bshift >> 3)) =                \
-                (Uint8)(p_color >> p_format->Bshift);                    \
-            break;                                                       \
-        default:                                                         \
-            *((Uint32 *)(p_pixels + (p_y) * p_surf->pitch) + (p_x)) =    \
-                p_color;                                                 \
-            break;                                                       \
+#define SURF_SET_AT(p_color, p_surf, p_x, p_y, p_pixels, p_format,            \
+                    p_byte_buf)                                               \
+    switch (p_format->BytesPerPixel) {                                        \
+        case 1:                                                               \
+            *((Uint8 *)p_pixels + (p_y)*p_surf->pitch + (p_x)) =              \
+                (Uint8)p_color;                                               \
+            break;                                                            \
+        case 2:                                                               \
+            *((Uint16 *)(p_pixels + (p_y)*p_surf->pitch) + (p_x)) =           \
+                (Uint16)p_color;                                              \
+            break;                                                            \
+        case 3:                                                               \
+            p_byte_buf = (Uint8 *)(p_pixels + (p_y)*p_surf->pitch) + (p_x)*3; \
+            *(p_byte_buf + 2 - (p_format->Rshift >> 3)) =                     \
+                (Uint8)(p_color >> p_format->Rshift);                         \
+            *(p_byte_buf + 2 - (p_format->Gshift >> 3)) =                     \
+                (Uint8)(p_color >> p_format->Gshift);                         \
+            *(p_byte_buf + 2 - (p_format->Bshift >> 3)) =                     \
+                (Uint8)(p_color >> p_format->Bshift);                         \
+            break;                                                            \
+        default:                                                              \
+            *((Uint32 *)(p_pixels + (p_y)*p_surf->pitch) + (p_x)) = p_color;  \
+            break;                                                            \
     }
 
 #endif
@@ -2097,8 +2090,7 @@ grayscale(pgSurfaceObject *srcobj, pgSurfaceObject *dstobj)
         src->format->Rmask == newsurf->format->Rmask &&
         src->format->Gmask == newsurf->format->Gmask &&
         src->format->Bmask == newsurf->format->Bmask &&
-        (src->pitch % src->format->BytesPerPixel == 0) &&
-        (newsurf->pitch == (newsurf->w * 4))) {
+        (src->pitch % 4 == 0) && (newsurf->pitch == (newsurf->w * 4))) {
         if (pg_has_avx2()) {
             grayscale_avx2(src, newsurf);
         }

From 25a611bf549f1e047fb275bbe4bf309ecd07b756 Mon Sep 17 00:00:00 2001
From: Dan Lawrence <danintheshed@gmail.com>
Date: Sun, 15 Oct 2023 10:48:15 +0100
Subject: [PATCH 09/15] update formatting to latest version of clang

---
 src_c/transform.c | 137 ++++++++++++++++++++++++----------------------
 1 file changed, 72 insertions(+), 65 deletions(-)

diff --git a/src_c/transform.c b/src_c/transform.c
index 5d9852d112..845556b5e6 100644
--- a/src_c/transform.c
+++ b/src_c/transform.c
@@ -1952,79 +1952,86 @@ clamp_4
 
 */
 
-#define SURF_GET_AT(p_color, p_surf, p_x, p_y, p_pixels, p_format, p_pix)     \
-    switch (p_format->BytesPerPixel) {                                        \
-        case 1:                                                               \
-            p_color = (Uint32) *                                              \
-                      ((Uint8 *)(p_pixels) + (p_y)*p_surf->pitch + (p_x));    \
-            break;                                                            \
-        case 2:                                                               \
-            p_color = (Uint32) *                                              \
-                      ((Uint16 *)((p_pixels) + (p_y)*p_surf->pitch) + (p_x)); \
-            break;                                                            \
-        case 3:                                                               \
-            p_pix = ((Uint8 *)(p_pixels + (p_y)*p_surf->pitch) + (p_x)*3);    \
-            p_color = (SDL_BYTEORDER == SDL_LIL_ENDIAN)                       \
-                          ? (p_pix[0]) + (p_pix[1] << 8) + (p_pix[2] << 16)   \
-                          : (p_pix[2]) + (p_pix[1] << 8) + (p_pix[0] << 16);  \
-            break;                                                            \
-        default: /* case 4: */                                                \
-            p_color = *((Uint32 *)(p_pixels + (p_y)*p_surf->pitch) + (p_x));  \
-            break;                                                            \
+#define SURF_GET_AT(p_color, p_surf, p_x, p_y, p_pixels, p_format, p_pix)    \
+    switch (p_format->BytesPerPixel) {                                       \
+        case 1:                                                              \
+            p_color = (Uint32) *                                             \
+                      ((Uint8 *)(p_pixels) + (p_y) * p_surf->pitch + (p_x)); \
+            break;                                                           \
+        case 2:                                                              \
+            p_color =                                                        \
+                (Uint32) *                                                   \
+                ((Uint16 *)((p_pixels) + (p_y) * p_surf->pitch) + (p_x));    \
+            break;                                                           \
+        case 3:                                                              \
+            p_pix =                                                          \
+                ((Uint8 *)(p_pixels + (p_y) * p_surf->pitch) + (p_x) * 3);   \
+            p_color = (SDL_BYTEORDER == SDL_LIL_ENDIAN)                      \
+                          ? (p_pix[0]) + (p_pix[1] << 8) + (p_pix[2] << 16)  \
+                          : (p_pix[2]) + (p_pix[1] << 8) + (p_pix[0] << 16); \
+            break;                                                           \
+        default: /* case 4: */                                               \
+            p_color =                                                        \
+                *((Uint32 *)(p_pixels + (p_y) * p_surf->pitch) + (p_x));     \
+            break;                                                           \
     }
 
 #if (SDL_BYTEORDER == SDL_LIL_ENDIAN)
 
-#define SURF_SET_AT(p_color, p_surf, p_x, p_y, p_pixels, p_format,            \
-                    p_byte_buf)                                               \
-    switch (p_format->BytesPerPixel) {                                        \
-        case 1:                                                               \
-            *((Uint8 *)p_pixels + (p_y)*p_surf->pitch + (p_x)) =              \
-                (Uint8)p_color;                                               \
-            break;                                                            \
-        case 2:                                                               \
-            *((Uint16 *)(p_pixels + (p_y)*p_surf->pitch) + (p_x)) =           \
-                (Uint16)p_color;                                              \
-            break;                                                            \
-        case 3:                                                               \
-            p_byte_buf = (Uint8 *)(p_pixels + (p_y)*p_surf->pitch) + (p_x)*3; \
-            *(p_byte_buf + (p_format->Rshift >> 3)) =                         \
-                (Uint8)(p_color >> p_format->Rshift);                         \
-            *(p_byte_buf + (p_format->Gshift >> 3)) =                         \
-                (Uint8)(p_color >> p_format->Gshift);                         \
-            *(p_byte_buf + (p_format->Bshift >> 3)) =                         \
-                (Uint8)(p_color >> p_format->Bshift);                         \
-            break;                                                            \
-        default:                                                              \
-            *((Uint32 *)(p_pixels + (p_y)*p_surf->pitch) + (p_x)) = p_color;  \
-            break;                                                            \
+#define SURF_SET_AT(p_color, p_surf, p_x, p_y, p_pixels, p_format,       \
+                    p_byte_buf)                                          \
+    switch (p_format->BytesPerPixel) {                                   \
+        case 1:                                                          \
+            *((Uint8 *)p_pixels + (p_y) * p_surf->pitch + (p_x)) =       \
+                (Uint8)p_color;                                          \
+            break;                                                       \
+        case 2:                                                          \
+            *((Uint16 *)(p_pixels + (p_y) * p_surf->pitch) + (p_x)) =    \
+                (Uint16)p_color;                                         \
+            break;                                                       \
+        case 3:                                                          \
+            p_byte_buf =                                                 \
+                (Uint8 *)(p_pixels + (p_y) * p_surf->pitch) + (p_x) * 3; \
+            *(p_byte_buf + (p_format->Rshift >> 3)) =                    \
+                (Uint8)(p_color >> p_format->Rshift);                    \
+            *(p_byte_buf + (p_format->Gshift >> 3)) =                    \
+                (Uint8)(p_color >> p_format->Gshift);                    \
+            *(p_byte_buf + (p_format->Bshift >> 3)) =                    \
+                (Uint8)(p_color >> p_format->Bshift);                    \
+            break;                                                       \
+        default:                                                         \
+            *((Uint32 *)(p_pixels + (p_y) * p_surf->pitch) + (p_x)) =    \
+                p_color;                                                 \
+            break;                                                       \
     }
 
 #else
 
-#define SURF_SET_AT(p_color, p_surf, p_x, p_y, p_pixels, p_format,            \
-                    p_byte_buf)                                               \
-    switch (p_format->BytesPerPixel) {                                        \
-        case 1:                                                               \
-            *((Uint8 *)p_pixels + (p_y)*p_surf->pitch + (p_x)) =              \
-                (Uint8)p_color;                                               \
-            break;                                                            \
-        case 2:                                                               \
-            *((Uint16 *)(p_pixels + (p_y)*p_surf->pitch) + (p_x)) =           \
-                (Uint16)p_color;                                              \
-            break;                                                            \
-        case 3:                                                               \
-            p_byte_buf = (Uint8 *)(p_pixels + (p_y)*p_surf->pitch) + (p_x)*3; \
-            *(p_byte_buf + 2 - (p_format->Rshift >> 3)) =                     \
-                (Uint8)(p_color >> p_format->Rshift);                         \
-            *(p_byte_buf + 2 - (p_format->Gshift >> 3)) =                     \
-                (Uint8)(p_color >> p_format->Gshift);                         \
-            *(p_byte_buf + 2 - (p_format->Bshift >> 3)) =                     \
-                (Uint8)(p_color >> p_format->Bshift);                         \
-            break;                                                            \
-        default:                                                              \
-            *((Uint32 *)(p_pixels + (p_y)*p_surf->pitch) + (p_x)) = p_color;  \
-            break;                                                            \
+#define SURF_SET_AT(p_color, p_surf, p_x, p_y, p_pixels, p_format,       \
+                    p_byte_buf)                                          \
+    switch (p_format->BytesPerPixel) {                                   \
+        case 1:                                                          \
+            *((Uint8 *)p_pixels + (p_y) * p_surf->pitch + (p_x)) =       \
+                (Uint8)p_color;                                          \
+            break;                                                       \
+        case 2:                                                          \
+            *((Uint16 *)(p_pixels + (p_y) * p_surf->pitch) + (p_x)) =    \
+                (Uint16)p_color;                                         \
+            break;                                                       \
+        case 3:                                                          \
+            p_byte_buf =                                                 \
+                (Uint8 *)(p_pixels + (p_y) * p_surf->pitch) + (p_x) * 3; \
+            *(p_byte_buf + 2 - (p_format->Rshift >> 3)) =                \
+                (Uint8)(p_color >> p_format->Rshift);                    \
+            *(p_byte_buf + 2 - (p_format->Gshift >> 3)) =                \
+                (Uint8)(p_color >> p_format->Gshift);                    \
+            *(p_byte_buf + 2 - (p_format->Bshift >> 3)) =                \
+                (Uint8)(p_color >> p_format->Bshift);                    \
+            break;                                                       \
+        default:                                                         \
+            *((Uint32 *)(p_pixels + (p_y) * p_surf->pitch) + (p_x)) =    \
+                p_color;                                                 \
+            break;                                                       \
     }
 
 #endif

From ce96a26f343b54d8e837c341275aafa8ac256691 Mon Sep 17 00:00:00 2001
From: Starbuck5 <46412508+Starbuck5@users.noreply.github.com>
Date: Wed, 25 Oct 2023 23:05:44 -0700
Subject: [PATCH 10/15] Add test for assumptions

---
 test/transform_test.py | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/test/transform_test.py b/test/transform_test.py
index 087b1e5044..f7585f2d4f 100644
--- a/test/transform_test.py
+++ b/test/transform_test.py
@@ -239,6 +239,16 @@ def test_grayscale(self):
         self.assertEqual(pygame.transform.average_color(grey_sub_surf)[0], 76)
         self.assertEqual(pygame.transform.average_color(grey_sub_surf)[0], 76)
 
+    def test_grayscale_simd_assumptions(self):
+        # The grayscale SIMD algorithm relies on the destination surface pitch
+        # being exactly width * 4 (4 bytes per pixel), for maximum speed.
+        # This test is here to make sure that assumption is always true.
+        widths = [1, 5, 6, 23, 54, 233]
+        for width in widths:
+            self.assertEqual(
+                pygame.Surface((width, 1), depth=32).get_pitch(), width * 4
+            )
+
     def test_threshold__honors_third_surface(self):
         # __doc__ for threshold as of Tue 07/15/2008
 

From 8b8b4b64974efd8e09f3864079df709aabd76a45 Mon Sep 17 00:00:00 2001
From: Dan Lawrence <danintheshed@gmail.com>
Date: Thu, 26 Oct 2023 07:16:54 +0100
Subject: [PATCH 11/15] Add versionchanged documentation

---
 docs/reST/ref/transform.rst | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/docs/reST/ref/transform.rst b/docs/reST/ref/transform.rst
index 9823f42bf1..81002f8c90 100644
--- a/docs/reST/ref/transform.rst
+++ b/docs/reST/ref/transform.rst
@@ -334,6 +334,9 @@ Instead, always begin with the original image and scale to the desired size.)
    
    .. versionadded:: 2.1.4
 
+   .. versionchanged:: 2.4.0 Adjusted formula slightly to support performance optimisation. It may return very slightly
+                       different pixels than before, but should run seven to eleven times faster on most systems.
+
    .. ## pygame.transform.grayscale ##
 
 .. function:: threshold

From 4e42929b6ab91892a6701ccbcb17ed339f47d765 Mon Sep 17 00:00:00 2001
From: Dan Lawrence <danintheshed@gmail.com>
Date: Fri, 3 Nov 2023 19:18:40 +0000
Subject: [PATCH 12/15] Change to using bitwise logic for alpha switcheroo

---
 src_c/simd_transform_avx2.c | 12 ++++++------
 src_c/simd_transform_sse2.c | 12 ++++++------
 2 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/src_c/simd_transform_avx2.c b/src_c/simd_transform_avx2.c
index cc4effa907..1df78957a4 100644
--- a/src_c/simd_transform_avx2.c
+++ b/src_c/simd_transform_avx2.c
@@ -133,7 +133,7 @@ grayscale_avx2(SDL_Surface *src, SDL_Surface *newsurf)
         remaining_pixels_batch_counter = remaining_pixels;
         while (perfect_8_pixels_batch_counter--) {
             mm256_src = _mm256_loadu_si256(srcp256);
-            mm256_alpha = _mm256_subs_epu8(mm256_src, mm256_rgb_mask);
+            mm256_alpha = _mm256_and_si256(mm256_src, mm256_alpha_mask);
 
             mm256_srcA = _mm256_shuffle_epi8(mm256_src, mm256_shuff_mask_A);
             mm256_srcB = _mm256_shuffle_epi8(mm256_src, mm256_shuff_mask_B);
@@ -156,8 +156,8 @@ grayscale_avx2(SDL_Surface *src, SDL_Surface *newsurf)
                 _mm256_add_epi16(mm256_dst, _mm256_srli_epi32(mm256_dst, 16));
             mm256_dst = _mm256_shuffle_epi8(mm256_dst, mm256_shuff_mask_gray);
 
-            mm256_dst = _mm256_subs_epu8(mm256_dst, mm256_alpha_mask);
-            mm256_dst = _mm256_adds_epu8(mm256_dst, mm256_alpha);
+            mm256_dst = _mm256_and_si256(mm256_dst, mm256_rgb_mask);
+            mm256_dst = _mm256_or_si256(mm256_dst, mm256_alpha);
 
             _mm256_storeu_si256(dstp256, mm256_dst);
 
@@ -168,7 +168,7 @@ grayscale_avx2(SDL_Surface *src, SDL_Surface *newsurf)
         dstp = (Uint32 *)dstp256;
         if (remaining_pixels_batch_counter > 0) {
             mm256_src = _mm256_maskload_epi32((int *)srcp, _partial8_mask);
-            mm256_alpha = _mm256_subs_epu8(mm256_src, mm256_rgb_mask);
+            mm256_alpha = _mm256_and_si256(mm256_src, mm256_alpha_mask);
 
             mm256_srcA = _mm256_shuffle_epi8(mm256_src, mm256_shuff_mask_A);
             mm256_srcB = _mm256_shuffle_epi8(mm256_src, mm256_shuff_mask_B);
@@ -191,8 +191,8 @@ grayscale_avx2(SDL_Surface *src, SDL_Surface *newsurf)
                 _mm256_add_epi16(mm256_dst, _mm256_srli_epi32(mm256_dst, 16));
             mm256_dst = _mm256_shuffle_epi8(mm256_dst, mm256_shuff_mask_gray);
 
-            mm256_dst = _mm256_subs_epu8(mm256_dst, mm256_alpha_mask);
-            mm256_dst = _mm256_adds_epu8(mm256_dst, mm256_alpha);
+            mm256_dst = _mm256_and_si256(mm256_dst, mm256_rgb_mask);
+            mm256_dst = _mm256_or_si256(mm256_dst, mm256_alpha);
 
             _mm256_maskstore_epi32((int *)dstp, _partial8_mask, mm256_dst);
 
diff --git a/src_c/simd_transform_sse2.c b/src_c/simd_transform_sse2.c
index 9efbb2ba4c..8cf1ffd6c2 100644
--- a/src_c/simd_transform_sse2.c
+++ b/src_c/simd_transform_sse2.c
@@ -494,7 +494,7 @@ grayscale_sse2(SDL_Surface *src, SDL_Surface *newsurf)
             /*mm_src = 0x0000000000000000AARRGGBBAARRGGBB*/
             /* First we strip out the alpha so we have one of our 4 channels
                empty for the rest of the calculation */
-            mm_alpha = _mm_subs_epu8(mm_src, mm_rgb_mask);
+            mm_alpha = _mm_and_si128(mm_src, mm_alpha_mask);
             /*mm_src = 0x000000000000000000RRGGBB00RRGGBB*/
 
             /* This is where we do the efficient 8bit 'floating point multiply'
@@ -544,8 +544,8 @@ grayscale_sse2(SDL_Surface *src, SDL_Surface *newsurf)
              */
             mm_dst = _mm_packus_epi16(mm_dst, mm_dst);
             /*mm_dst = 0x000000000000000000GrGrGrGrGrGr00GrGrGrGrGrGr*/
-            mm_dst = _mm_subs_epu8(mm_dst, mm_alpha_mask);
-            mm_dst = _mm_adds_epu8(mm_dst, mm_alpha);
+            mm_dst = _mm_and_si128(mm_dst, mm_rgb_mask);
+            mm_dst = _mm_or_si128(mm_dst, mm_alpha);
             /*mm_dst = 0x0000000000000000AAGrGrGrGrGrGrAAGrGrGrGrGrGr*/
             STORE_M128_INTO_64(&mm_dst, dstp64);
             /*dstp = 0xAARRGGBB*/
@@ -559,7 +559,7 @@ grayscale_sse2(SDL_Surface *src, SDL_Surface *newsurf)
             /*mm_src = 0x000000000000000000000000AARRGGBB*/
             /* First we strip out the alpha so we have one of our 4 channels
                empty for the rest of the calculation */
-            mm_alpha = _mm_subs_epu8(mm_src, mm_rgb_mask);
+            mm_alpha = _mm_and_si128(mm_src, mm_alpha_mask);
             /*mm_src = 0x00000000000000000000000000RRGGBB*/
 
             /* This is where we do the efficient 8bit 'floating point multiply'
@@ -600,8 +600,8 @@ grayscale_sse2(SDL_Surface *src, SDL_Surface *newsurf)
              */
             mm_dst = _mm_packus_epi16(mm_dst, mm_dst);
             /*mm_dst = 0x000000000000000000000000GrGrGrGrGrGrGrGr*/
-            mm_dst = _mm_subs_epu8(mm_dst, mm_alpha_mask);
-            mm_dst = _mm_adds_epu8(mm_dst, mm_alpha);
+            mm_dst = _mm_and_si128(mm_dst, mm_rgb_mask);
+            mm_dst = _mm_or_si128(mm_dst, mm_alpha);
             /*mm_dst = 0x000000000000000000000000AAGrGrGrGrGrGr*/
             *dstp = _mm_cvtsi128_si32(mm_dst);
             /*dstp = 0xAARRGGBB*/

From 56442bfcc37cbe47ce85b5df909a8f38e6c64f6c Mon Sep 17 00:00:00 2001
From: Dan Lawrence <danintheshed@gmail.com>
Date: Mon, 6 Nov 2023 21:38:20 +0000
Subject: [PATCH 13/15] Disable SIMD on Emscripten

---
 src_c/transform.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src_c/transform.c b/src_c/transform.c
index c7d1300645..9217bc188b 100644
--- a/src_c/transform.c
+++ b/src_c/transform.c
@@ -2115,7 +2115,9 @@ grayscale(pgSurfaceObject *srcobj, pgSurfaceObject *dstobj)
             PyExc_ValueError,
             "Source and destination surfaces need the same format."));
     }
-
+#if defined(__EMSCRIPTEN__)
+    invert_non_simd(src, newsurf);
+#else  // !defined(__EMSCRIPTEN__)
     if (src->format->BytesPerPixel == 4 &&
         src->format->Rmask == newsurf->format->Rmask &&
         src->format->Gmask == newsurf->format->Gmask &&
@@ -2136,6 +2138,7 @@ grayscale(pgSurfaceObject *srcobj, pgSurfaceObject *dstobj)
     else {
         grayscale_non_simd(src, newsurf);
     }
+#endif  // !defined(__EMSCRIPTEN__)
 
     SDL_UnlockSurface(newsurf);
 

From e8841d43183d3e511edc4de43e54c94c9beac65a Mon Sep 17 00:00:00 2001
From: Dan Lawrence <danintheshed@gmail.com>
Date: Wed, 8 Nov 2023 18:33:36 +0000
Subject: [PATCH 14/15] Call correct non_simd_ function

---
 src_c/transform.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src_c/transform.c b/src_c/transform.c
index 9217bc188b..f4fcf51405 100644
--- a/src_c/transform.c
+++ b/src_c/transform.c
@@ -2116,7 +2116,7 @@ grayscale(pgSurfaceObject *srcobj, pgSurfaceObject *dstobj)
             "Source and destination surfaces need the same format."));
     }
 #if defined(__EMSCRIPTEN__)
-    invert_non_simd(src, newsurf);
+    grayscale_non_simd(src, newsurf);
 #else  // !defined(__EMSCRIPTEN__)
     if (src->format->BytesPerPixel == 4 &&
         src->format->Rmask == newsurf->format->Rmask &&

From b5a1ca015feaf80d0f45a6674757ce33164a69b0 Mon Sep 17 00:00:00 2001
From: Dan Lawrence <danintheshed@gmail.com>
Date: Sun, 12 Nov 2023 12:04:19 +0000
Subject: [PATCH 15/15] Add some changes and documentation from code review.

---
 src_c/simd_transform_avx2.c | 59 ++++++++++++++++++++++---------------
 src_c/simd_transform_sse2.c | 12 ++++----
 2 files changed, 40 insertions(+), 31 deletions(-)

diff --git a/src_c/simd_transform_avx2.c b/src_c/simd_transform_avx2.c
index 1df78957a4..5be6f9863f 100644
--- a/src_c/simd_transform_avx2.c
+++ b/src_c/simd_transform_avx2.c
@@ -85,9 +85,8 @@ grayscale_avx2(SDL_Surface *src, SDL_Surface *newsurf)
     Uint32 *srcp = (Uint32 *)src->pixels;
     Uint32 *dstp = (Uint32 *)newsurf->pixels;
 
-    Uint32 rgbmask =
-        (src->format->Rmask | src->format->Gmask | src->format->Bmask);
-    Uint32 amask = ~rgbmask;
+    Uint32 amask = src->format->Amask;
+    Uint32 rgbmask = ~amask;
 
     int rgb_weights =
         ((0x4C << src->format->Rshift) | (0x96 << src->format->Gshift) |
@@ -99,7 +98,8 @@ grayscale_avx2(SDL_Surface *src, SDL_Surface *newsurf)
     __m256i mm256_src, mm256_srcA, mm256_srcB, mm256_dst, mm256_dstA,
         mm256_dstB, mm256_shuff_mask_A, mm256_shuff_mask_B,
         mm256_two_five_fives, mm256_rgb_weights, mm256_shuff_mask_gray,
-        mm256_alpha, mm256_rgb_mask, mm256_alpha_mask;
+        mm256_alpha, mm256_rgb_mask, mm256_alpha_mask,
+        mm256_shuffled_weights_A, mm256_shuffled_weights_B;
 
     mm256_shuff_mask_A =
         _mm256_set_epi8(0x80, 23, 0x80, 22, 0x80, 21, 0x80, 20, 0x80, 19, 0x80,
@@ -119,43 +119,57 @@ grayscale_avx2(SDL_Surface *src, SDL_Surface *newsurf)
     mm256_rgb_mask = _mm256_set1_epi32(rgbmask);
     mm256_alpha_mask = _mm256_set1_epi32(amask);
 
-    __m256i _partial8_mask =
-        _mm256_set_epi32(0x00, (remaining_pixels > 6) ? 0x80000000 : 0x00,
-                         (remaining_pixels > 5) ? 0x80000000 : 0x00,
-                         (remaining_pixels > 4) ? 0x80000000 : 0x00,
-                         (remaining_pixels > 3) ? 0x80000000 : 0x00,
-                         (remaining_pixels > 2) ? 0x80000000 : 0x00,
-                         (remaining_pixels > 1) ? 0x80000000 : 0x00,
-                         (remaining_pixels > 0) ? 0x80000000 : 0x00);
+    mm256_shuffled_weights_A =
+        _mm256_shuffle_epi8(mm256_rgb_weights, mm256_shuff_mask_A);
+    mm256_shuffled_weights_B =
+        _mm256_shuffle_epi8(mm256_rgb_weights, mm256_shuff_mask_B);
+
+    __m256i _partial8_mask = _mm256_set_epi32(
+        0, (remaining_pixels > 6) ? -1 : 0, (remaining_pixels > 5) ? -1 : 0,
+        (remaining_pixels > 4) ? -1 : 0, (remaining_pixels > 3) ? -1 : 0,
+        (remaining_pixels > 2) ? -1 : 0, (remaining_pixels > 1) ? -1 : 0,
+        (remaining_pixels > 0) ? -1 : 0);
 
     while (num_batches--) {
         perfect_8_pixels_batch_counter = perfect_8_pixels;
         remaining_pixels_batch_counter = remaining_pixels;
         while (perfect_8_pixels_batch_counter--) {
             mm256_src = _mm256_loadu_si256(srcp256);
+            // strip out the the alpha and store it
             mm256_alpha = _mm256_and_si256(mm256_src, mm256_alpha_mask);
 
+            // shuffle out the 8 pixels into two spaced out registers
+            // there are four pixels in each register with 16bits of room
+            // per channel. This gives us bit space for multiplication.
             mm256_srcA = _mm256_shuffle_epi8(mm256_src, mm256_shuff_mask_A);
             mm256_srcB = _mm256_shuffle_epi8(mm256_src, mm256_shuff_mask_B);
 
+            // Do the 'percentage multiplications' with the weights
+            // with accuracy correction so values like 255 * '255'
+            // (here effectively 1.0) = 255 and not 254.
+            // For our greyscale this should mean 255 white stays 255 white
+            // after greyscaling.
             mm256_dstA =
-                _mm256_shuffle_epi8(mm256_rgb_weights, mm256_shuff_mask_A);
-            mm256_dstB =
-                _mm256_shuffle_epi8(mm256_rgb_weights, mm256_shuff_mask_B);
-
-            mm256_dstA = _mm256_mullo_epi16(mm256_srcA, mm256_dstA);
+                _mm256_mullo_epi16(mm256_srcA, mm256_shuffled_weights_A);
             mm256_dstA = _mm256_add_epi16(mm256_dstA, mm256_two_five_fives);
             mm256_dstA = _mm256_srli_epi16(mm256_dstA, 8);
 
-            mm256_dstB = _mm256_mullo_epi16(mm256_srcB, mm256_dstB);
+            mm256_dstB =
+                _mm256_mullo_epi16(mm256_srcB, mm256_shuffled_weights_B);
             mm256_dstB = _mm256_add_epi16(mm256_dstB, mm256_two_five_fives);
             mm256_dstB = _mm256_srli_epi16(mm256_dstB, 8);
 
+            // Add up weighted R+G+B into the first channel of each of the 8
+            // pixels. This is the grey value we want in all our colour
+            // channels.
             mm256_dst = _mm256_hadd_epi16(mm256_dstA, mm256_dstB);
             mm256_dst =
                 _mm256_add_epi16(mm256_dst, _mm256_srli_epi32(mm256_dst, 16));
+            // Shuffle the grey value from ther first channel of each pixel
+            // into every channel of each pixel
             mm256_dst = _mm256_shuffle_epi8(mm256_dst, mm256_shuff_mask_gray);
 
+            // Add the alpha back
             mm256_dst = _mm256_and_si256(mm256_dst, mm256_rgb_mask);
             mm256_dst = _mm256_or_si256(mm256_dst, mm256_alpha);
 
@@ -174,15 +188,12 @@ grayscale_avx2(SDL_Surface *src, SDL_Surface *newsurf)
             mm256_srcB = _mm256_shuffle_epi8(mm256_src, mm256_shuff_mask_B);
 
             mm256_dstA =
-                _mm256_shuffle_epi8(mm256_rgb_weights, mm256_shuff_mask_A);
-            mm256_dstB =
-                _mm256_shuffle_epi8(mm256_rgb_weights, mm256_shuff_mask_B);
-
-            mm256_dstA = _mm256_mullo_epi16(mm256_srcA, mm256_dstA);
+                _mm256_mullo_epi16(mm256_srcA, mm256_shuffled_weights_A);
             mm256_dstA = _mm256_add_epi16(mm256_dstA, mm256_two_five_fives);
             mm256_dstA = _mm256_srli_epi16(mm256_dstA, 8);
 
-            mm256_dstB = _mm256_mullo_epi16(mm256_srcB, mm256_dstB);
+            mm256_dstB =
+                _mm256_mullo_epi16(mm256_srcB, mm256_shuffled_weights_B);
             mm256_dstB = _mm256_add_epi16(mm256_dstB, mm256_two_five_fives);
             mm256_dstB = _mm256_srli_epi16(mm256_dstB, 8);
 
diff --git a/src_c/simd_transform_sse2.c b/src_c/simd_transform_sse2.c
index 8cf1ffd6c2..35689ac72d 100644
--- a/src_c/simd_transform_sse2.c
+++ b/src_c/simd_transform_sse2.c
@@ -45,11 +45,11 @@ pg_neon_at_runtime_but_uncompiled()
 
 #if defined(ENV64BIT)
 #define LOAD_64_INTO_M128(num, reg) *reg = _mm_cvtsi64_si128(*num)
-#define STORE_M128_INTO_64(reg, num) *num = _mm_cvtsi128_si64(*reg)
+#define STORE_M128_INTO_64(reg, num) *num = _mm_cvtsi128_si64(reg)
 #else
 #define LOAD_64_INTO_M128(num, reg) \
     *reg = _mm_loadl_epi64((const __m128i *)num)
-#define STORE_M128_INTO_64(reg, num) _mm_storel_epi64((__m128i *)num, *reg)
+#define STORE_M128_INTO_64(reg, num) _mm_storel_epi64((__m128i *)num, reg)
 #endif
 
 void
@@ -459,10 +459,8 @@ grayscale_sse2(SDL_Surface *src, SDL_Surface *newsurf)
     Uint32 *srcp = (Uint32 *)src->pixels;
     Uint32 *dstp = (Uint32 *)newsurf->pixels;
 
-    Uint32 rgbmask =
-        (src->format->Rmask | src->format->Gmask | src->format->Bmask);
-    Uint64 rgbmask64 = ((Uint64)rgbmask << 32) | rgbmask;
-    Uint64 amask64 = ~rgbmask64;
+    Uint64 amask64 = ((Uint64)src->format->Amask) | src->format->Amask;
+    Uint64 rgbmask64 = ~amask64;
 
     Uint64 rgb_weights =
         ((Uint64)((0x4C << src->format->Rshift) |
@@ -547,7 +545,7 @@ grayscale_sse2(SDL_Surface *src, SDL_Surface *newsurf)
             mm_dst = _mm_and_si128(mm_dst, mm_rgb_mask);
             mm_dst = _mm_or_si128(mm_dst, mm_alpha);
             /*mm_dst = 0x0000000000000000AAGrGrGrGrGrGrAAGrGrGrGrGrGr*/
-            STORE_M128_INTO_64(&mm_dst, dstp64);
+            STORE_M128_INTO_64(mm_dst, dstp64);
             /*dstp = 0xAARRGGBB*/
             srcp64++;
             dstp64++;