From 6c9fb785680a91c32c06d9e6337c95d24fa39816 Mon Sep 17 00:00:00 2001 From: Dan Lawrence Date: Fri, 1 Sep 2023 14:51:16 +0100 Subject: [PATCH 01/15] Add SIMD versions of greyscale transform --- src_c/simd_transform.h | 4 + src_c/simd_transform_avx2.c | 169 ++++++++++++++++++++++++++++++++++++ src_c/simd_transform_sse2.c | 77 ++++++++++++++++ src_c/transform.c | 62 +++++++++---- 4 files changed, 295 insertions(+), 17 deletions(-) diff --git a/src_c/simd_transform.h b/src_c/simd_transform.h index d97a0d5b22..ef740e8b94 100644 --- a/src_c/simd_transform.h +++ b/src_c/simd_transform.h @@ -9,6 +9,10 @@ // SSE2 functions #if defined(__SSE2__) || defined(PG_ENABLE_ARM_NEON) +void +grayscale_sse2(SDL_Surface *src, SDL_Surface *newsurf); #endif /* (defined(__SSE2__) || defined(PG_ENABLE_ARM_NEON)) */ // AVX2 functions +void +grayscale_avx2(SDL_Surface *src, SDL_Surface *newsurf); diff --git a/src_c/simd_transform_avx2.c b/src_c/simd_transform_avx2.c index f1889f1bb7..f98c369d01 100644 --- a/src_c/simd_transform_avx2.c +++ b/src_c/simd_transform_avx2.c @@ -42,3 +42,172 @@ pg_avx2_at_runtime_but_uncompiled() } return 0; } + +#if defined(__AVX2__) && defined(HAVE_IMMINTRIN_H) && \ + !defined(SDL_DISABLE_IMMINTRIN_H) +void +grayscale_avx2(SDL_Surface *src, SDL_Surface *newsurf) +{ + // Current AVX2 process + // ------------------ + // - pre loop: Load weights into register x8 + // - in loop: + // 1. Load 8 pixels into register + // 2. remove the alpha channel for every pixel and save it. + // 3. multiply weights by pixels using standard shuffle to 2x 16bit + // register, mul + 255 then left shift. See multiply blitter mode + // for this operation in isolation. + // 4. pack pixels back together from A & B while adding with a + // horizontal add (e.g. adds A+R and G+B in a ARGB layout) + // 5. shift and add to make final grey pixel colour in 0th + // 8Bit channel in each 'pixel' + // 6. shuffle again to push the grey from the 0th channel into every + // channel of every pixel. + // 7. add the alpha channel back in. + + // Things to fix: + // 1. Would be nice to only use AVX2 stuff for the single pixel stuff. + // 2. Get inspiration from Starbuck's AVX2 macros + + int s_row_skip = (src->pitch - src->w * src->format->BytesPerPixel) >> 2; + + // generate number of batches of pixels we need to loop through + int pixel_batch_length = src->w * src->h; + int num_batches = 1; + if (s_row_skip > 0) { + pixel_batch_length = src->w; + num_batches = src->h; + } + + int remaining_pixels = pixel_batch_length % 8; + int perfect_8_pixels = (pixel_batch_length - remaining_pixels) / 8; + + int perfect_8_pixels_batch_counter = perfect_8_pixels; + int remaining_pixels_batch_counter = remaining_pixels; + + Uint32 *srcp = (Uint32 *)src->pixels; + Uint32 *dstp = (Uint32 *)newsurf->pixels; + + Uint32 rgbmask = + (src->format->Rmask | src->format->Gmask | src->format->Bmask); + Uint32 amask = ~rgbmask; + + int rgb_weights = + ((0x4C << src->format->Rshift) | (0x96 << src->format->Gshift) | + (0x1D << src->format->Bshift)); + + __m256i *srcp256 = (__m256i *)src->pixels; + __m256i *dstp256 = (__m256i *)newsurf->pixels; + + __m128i mm_src, mm_dst, mm_alpha, mm_zero, mm_two_five_fives, + mm_rgb_weights, mm_alpha_mask, mm_rgb_mask; + __m256i mm256_src, mm256_srcA, mm256_srcB, mm256_dst, mm256_dstA, + mm256_dstB, mm256_shuff_mask_A, mm256_shuff_mask_B, + mm256_two_five_fives, mm256_rgb_weights, mm256_shuff_mask_gray, + mm256_alpha, mm256_rgb_mask, mm256_alpha_mask; + + mm256_shuff_mask_A = + _mm256_set_epi8(0x80, 23, 0x80, 22, 0x80, 21, 0x80, 20, 0x80, 19, 0x80, + 18, 0x80, 17, 0x80, 16, 0x80, 7, 0x80, 6, 0x80, 5, + 0x80, 4, 0x80, 3, 0x80, 2, 0x80, 1, 0x80, 0); + mm256_shuff_mask_B = + _mm256_set_epi8(0x80, 31, 0x80, 30, 0x80, 29, 0x80, 28, 0x80, 27, 0x80, + 26, 0x80, 25, 0x80, 24, 0x80, 15, 0x80, 14, 0x80, 13, + 0x80, 12, 0x80, 11, 0x80, 10, 0x80, 9, 0x80, 8); + + mm256_shuff_mask_gray = _mm256_set_epi8( + 28, 28, 28, 28, 24, 24, 24, 24, 20, 20, 20, 20, 16, 16, 16, 16, 12, 12, + 12, 12, 8, 8, 8, 8, 4, 4, 4, 4, 0, 0, 0, 0); + + mm_zero = _mm_setzero_si128(); + mm_alpha_mask = _mm_cvtsi32_si128(amask); + mm_rgb_mask = _mm_cvtsi32_si128(rgbmask); + mm_two_five_fives = _mm_set_epi64x(0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF); + mm_rgb_weights = + _mm_unpacklo_epi8(_mm_cvtsi32_si128(rgb_weights), mm_zero); + + mm256_two_five_fives = _mm256_set1_epi16(0x00FF); + mm256_rgb_weights = _mm256_set1_epi32(rgb_weights); + mm256_rgb_mask = _mm256_set1_epi32(rgbmask); + mm256_alpha_mask = _mm256_set1_epi32(amask); + + while (num_batches--) { + perfect_8_pixels_batch_counter = perfect_8_pixels; + remaining_pixels_batch_counter = remaining_pixels; + while (perfect_8_pixels_batch_counter--) { + mm256_src = _mm256_loadu_si256(srcp256); + mm256_alpha = _mm256_subs_epu8(mm256_src, mm256_rgb_mask); + + mm256_srcA = _mm256_shuffle_epi8(mm256_src, mm256_shuff_mask_A); + mm256_srcB = _mm256_shuffle_epi8(mm256_src, mm256_shuff_mask_B); + + mm256_dstA = + _mm256_shuffle_epi8(mm256_rgb_weights, mm256_shuff_mask_A); + mm256_dstB = + _mm256_shuffle_epi8(mm256_rgb_weights, mm256_shuff_mask_B); + + mm256_dstA = _mm256_mullo_epi16(mm256_srcA, mm256_dstA); + mm256_dstA = _mm256_add_epi16(mm256_dstA, mm256_two_five_fives); + mm256_dstA = _mm256_srli_epi16(mm256_dstA, 8); + + mm256_dstB = _mm256_mullo_epi16(mm256_srcB, mm256_dstB); + mm256_dstB = _mm256_add_epi16(mm256_dstB, mm256_two_five_fives); + mm256_dstB = _mm256_srli_epi16(mm256_dstB, 8); + + mm256_dst = _mm256_hadd_epi16(mm256_dstA, mm256_dstB); + mm256_dst = + _mm256_add_epi16(mm256_dst, _mm256_srli_epi32(mm256_dst, 16)); + mm256_dst = _mm256_shuffle_epi8(mm256_dst, mm256_shuff_mask_gray); + + mm256_dst = _mm256_subs_epu8(mm256_dst, mm256_alpha_mask); + mm256_dst = _mm256_adds_epu8(mm256_dst, mm256_alpha); + + _mm256_storeu_si256(dstp256, mm256_dst); + + srcp256++; + dstp256++; + } + srcp = (Uint32 *)srcp256; + dstp = (Uint32 *)dstp256; + while (remaining_pixels_batch_counter--) { + mm_src = _mm_cvtsi32_si128(*srcp); + /*mm_src = 0x000000000000000000000000AARRGGBB*/ + mm_alpha = _mm_subs_epu8(mm_src, mm_rgb_mask); + /*mm_src = 0x00000000000000000000000000RRGGBB*/ + mm_src = _mm_unpacklo_epi8(mm_src, mm_zero); + /*mm_src = 0x0000000000000000000000RR00GG00BB*/ + + mm_dst = _mm_mullo_epi16(mm_src, mm_rgb_weights); + /*mm_dst = 0x00000000000000000000RRRRGGGGBBBB*/ + mm_dst = _mm_add_epi16(mm_dst, mm_two_five_fives); + /*mm_dst = 0x00000000000000000000RRRRGGGGBBBB*/ + mm_dst = _mm_srli_epi16(mm_dst, 8); + /*mm_dst = 0x0000000000000000000000RR00GG00BB*/ + + mm_dst = _mm_hadd_epi16(mm_dst, mm_dst); // This requires SSE3 + mm_dst = _mm_shufflelo_epi16(_mm_hadd_epi16(mm_dst, mm_dst), + _MM_SHUFFLE(0, 0, 0, 0)); + /*mm_dst = 0x000000000000000000GrGr00GrGr00GrGr00GrGr*/ + + mm_dst = _mm_packus_epi16(mm_dst, mm_dst); + /*mm_dst = 0x000000000000000000000000GrGrGrGrGrGrGrGr*/ + mm_dst = _mm_subs_epu8(mm_dst, mm_alpha_mask); + mm_dst = _mm_add_epi16(mm_dst, mm_alpha); + /*mm_dst = 0x000000000000000000000000AAGrGrGrGrGrGr*/ + *dstp = _mm_cvtsi128_si32(mm_dst); + /*dstp = 0xAARRGGBB*/ + srcp++; + dstp++; + } + srcp += s_row_skip; + srcp256 = (__m256i *)srcp; + } +} +#else +void +grayscale_avx2(SDL_Surface *src, SDL_Surface *newsurf) +{ + BAD_AVX2_FUNCTION_CALL; +} +#endif /* defined(__AVX2__) && defined(HAVE_IMMINTRIN_H) && \ + !defined(SDL_DISABLE_IMMINTRIN_H) */ diff --git a/src_c/simd_transform_sse2.c b/src_c/simd_transform_sse2.c index 8f503b964f..fadd0f615c 100644 --- a/src_c/simd_transform_sse2.c +++ b/src_c/simd_transform_sse2.c @@ -34,3 +34,80 @@ pg_neon_at_runtime_but_uncompiled() } return 0; } + +#if (defined(__SSE2__) || defined(PG_ENABLE_ARM_NEON)) +void +grayscale_sse2(SDL_Surface *src, SDL_Surface *newsurf) +{ + int s_row_skip = (src->pitch - src->w * src->format->BytesPerPixel) >> 2; + + // generate number of batches of pixels we need to loop through + int pixel_batch_length = src->w * src->h; + int num_batches = 1; + if (s_row_skip > 0) { + pixel_batch_length = src->w; + num_batches = src->h; + } + int pixel_batch_counter = pixel_batch_length; + + Uint32 *srcp = (Uint32 *)src->pixels; + Uint32 *dstp = (Uint32 *)newsurf->pixels; + + Uint32 rgbmask = + (src->format->Rmask | src->format->Gmask | src->format->Bmask); + Uint32 amask = ~rgbmask; + + int rgb_weights = + ((0x4C << src->format->Rshift) | (0x96 << src->format->Gshift) | + (0x1D << src->format->Bshift)); + + __m128i mm_src, mm_dst, mm_alpha, mm_zero, mm_two_five_fives, + mm_rgb_weights, mm_alpha_mask, mm_rgb_mask; + + mm_zero = _mm_setzero_si128(); + mm_alpha_mask = _mm_cvtsi32_si128(amask); + mm_rgb_mask = _mm_cvtsi32_si128(rgbmask); + mm_two_five_fives = _mm_set_epi64x(0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF); + mm_rgb_weights = + _mm_unpacklo_epi8(_mm_cvtsi32_si128(rgb_weights), mm_zero); + + while (num_batches--) { + pixel_batch_counter = pixel_batch_length; + while (pixel_batch_counter--) { + mm_src = _mm_cvtsi32_si128(*srcp); + /*mm_src = 0x000000000000000000000000AARRGGBB*/ + mm_alpha = _mm_subs_epu8(mm_src, mm_rgb_mask); + /*mm_src = 0x00000000000000000000000000RRGGBB*/ + mm_src = _mm_unpacklo_epi8(mm_src, mm_zero); + /*mm_src = 0x0000000000000000000000RR00GG00BB*/ + + mm_dst = _mm_mullo_epi16(mm_src, mm_rgb_weights); + /*mm_dst = 0x00000000000000000000RRRRGGGGBBBB*/ + mm_dst = _mm_add_epi16(mm_dst, mm_two_five_fives); + /*mm_dst = 0x00000000000000000000RRRRGGGGBBBB*/ + mm_dst = _mm_srli_epi16(mm_dst, 8); + /*mm_dst = 0x0000000000000000000000RR00GG00BB*/ + + mm_dst = _mm_adds_epu8( + _mm_adds_epu8( + _mm_shufflelo_epi16(mm_dst, _MM_SHUFFLE(0, 0, 0, 0)), + _mm_shufflelo_epi16(mm_dst, _MM_SHUFFLE(1, 1, 1, 1))), + _mm_adds_epu8( + _mm_shufflelo_epi16(mm_dst, _MM_SHUFFLE(2, 2, 2, 2)), + _mm_shufflelo_epi16(mm_dst, _MM_SHUFFLE(3, 3, 3, 3)))); + /*mm_dst = 0x000000000000000000GrGr00GrGr00GrGr00GrGr*/ + + mm_dst = _mm_packus_epi16(mm_dst, mm_dst); + /*mm_dst = 0x000000000000000000000000GrGrGrGrGrGrGrGr*/ + mm_dst = _mm_subs_epu8(mm_dst, mm_alpha_mask); + mm_dst = _mm_adds_epu8(mm_dst, mm_alpha); + /*mm_dst = 0x000000000000000000000000AAGrGrGrGrGrGr*/ + *dstp = _mm_cvtsi128_si32(mm_dst); + /*dstp = 0xAARRGGBB*/ + srcp++; + dstp++; + } + srcp += s_row_skip; + } +} +#endif /* __SSE2__ || PG_ENABLE_ARM_NEON*/ diff --git a/src_c/transform.c b/src_c/transform.c index 95f791d3bb..915c7d4dd0 100644 --- a/src_c/transform.c +++ b/src_c/transform.c @@ -2032,6 +2032,32 @@ clamp_4 #endif +void +grayscale_non_simd(SDL_Surface *src, SDL_Surface *newsurf) +{ + int x, y; + for (y = 0; y < src->h; y++) { + for (x = 0; x < src->w; x++) { + Uint32 pixel; + Uint8 *pix; + SURF_GET_AT(pixel, src, x, y, (Uint8 *)src->pixels, src->format, + pix); + Uint8 r, g, b, a; + SDL_GetRGBA(pixel, src->format, &r, &g, &b, &a); + + // RGBA to GRAY formula used by OpenCV + Uint8 grayscale_pixel = + (Uint8)((((76 * r) + 255) >> 8) + (((150 * g) + 255) >> 8) + + (((29 * b) + 255) >> 8)); + Uint32 new_pixel = + SDL_MapRGBA(newsurf->format, grayscale_pixel, grayscale_pixel, + grayscale_pixel, a); + SURF_SET_AT(new_pixel, newsurf, x, y, (Uint8 *)newsurf->pixels, + newsurf->format, pix); + } + } +} + SDL_Surface * grayscale(pgSurfaceObject *srcobj, pgSurfaceObject *dstobj) { @@ -2059,24 +2085,26 @@ grayscale(pgSurfaceObject *srcobj, pgSurfaceObject *dstobj) "Source and destination surfaces need the same format.")); } - int x, y; - for (y = 0; y < src->h; y++) { - for (x = 0; x < src->w; x++) { - Uint32 pixel; - Uint8 *pix; - SURF_GET_AT(pixel, src, x, y, (Uint8 *)src->pixels, src->format, - pix); - Uint8 r, g, b, a; - SDL_GetRGBA(pixel, src->format, &r, &g, &b, &a); - - // RGBA to GRAY formula used by OpenCV - Uint8 grayscale_pixel = (Uint8)(0.299 * r + 0.587 * g + 0.114 * b); - Uint32 new_pixel = - SDL_MapRGBA(newsurf->format, grayscale_pixel, grayscale_pixel, - grayscale_pixel, a); - SURF_SET_AT(new_pixel, newsurf, x, y, (Uint8 *)newsurf->pixels, - newsurf->format, pix); + if (src->format->BytesPerPixel == 4 && + src->format->Rmask == newsurf->format->Rmask && + src->format->Gmask == newsurf->format->Gmask && + src->format->Bmask == newsurf->format->Bmask && + (src->pitch % src->format->BytesPerPixel == 0) && + (newsurf->pitch == (newsurf->w * newsurf->format->BytesPerPixel))) { + if (pg_has_avx2()) { + grayscale_avx2(src, newsurf); } +#if defined(__SSE2__) || defined(PG_ENABLE_ARM_NEON) + if (pg_HasSSE_NEON()) { + grayscale_sse2(src, newsurf); + } +#endif // defined(__SSE2__) || defined(PG_ENABLE_ARM_NEON) + else { + grayscale_non_simd(src, newsurf); + } + } + else { + grayscale_non_simd(src, newsurf); } SDL_UnlockSurface(newsurf); From dda7b5e63b2f2a67ef515f351286231c9f3e79a4 Mon Sep 17 00:00:00 2001 From: Dan Lawrence Date: Fri, 1 Sep 2023 15:08:01 +0100 Subject: [PATCH 02/15] Update transform test to new formula --- test/transform_test.py | 44 ++++++++++++++++++++++++++++++++---------- 1 file changed, 34 insertions(+), 10 deletions(-) diff --git a/test/transform_test.py b/test/transform_test.py index af1c6109d1..087b1e5044 100644 --- a/test/transform_test.py +++ b/test/transform_test.py @@ -174,10 +174,24 @@ def test_grayscale(self): s = pygame.Surface((32, 32)) s.fill((255, 0, 0)) - s2 = pygame.transform.grayscale(s) - self.assertEqual(pygame.transform.average_color(s2)[0], 76) - self.assertEqual(pygame.transform.average_color(s2)[1], 76) - self.assertEqual(pygame.transform.average_color(s2)[2], 76) + gray_red = pygame.transform.grayscale(s) + self.assertEqual(pygame.transform.average_color(gray_red)[0], 76) + self.assertEqual(pygame.transform.average_color(gray_red)[1], 76) + self.assertEqual(pygame.transform.average_color(gray_red)[2], 76) + + green_surf = pygame.Surface((32, 32)) + green_surf.fill((0, 255, 0)) + gray_green = pygame.transform.grayscale(green_surf) + self.assertEqual(pygame.transform.average_color(gray_green)[0], 150) + self.assertEqual(pygame.transform.average_color(gray_green)[1], 150) + self.assertEqual(pygame.transform.average_color(gray_green)[2], 150) + + blue_surf = pygame.Surface((32, 32)) + blue_surf.fill((0, 0, 255)) + blue_green = pygame.transform.grayscale(blue_surf) + self.assertEqual(pygame.transform.average_color(blue_green)[0], 29) + self.assertEqual(pygame.transform.average_color(blue_green)[1], 29) + self.assertEqual(pygame.transform.average_color(blue_green)[2], 29) dest = pygame.Surface((32, 32), depth=32) pygame.transform.grayscale(s, dest) @@ -188,16 +202,16 @@ def test_grayscale(self): dest = pygame.Surface((32, 32), depth=32) s.fill((34, 12, 65)) pygame.transform.grayscale(s, dest) - self.assertEqual(pygame.transform.average_color(dest)[0], 24) - self.assertEqual(pygame.transform.average_color(dest)[1], 24) - self.assertEqual(pygame.transform.average_color(dest)[2], 24) + self.assertEqual(pygame.transform.average_color(dest)[0], 27) + self.assertEqual(pygame.transform.average_color(dest)[1], 27) + self.assertEqual(pygame.transform.average_color(dest)[2], 27) dest = pygame.Surface((32, 32), depth=32) s.fill((123, 123, 123)) pygame.transform.grayscale(s, dest) - self.assertIn(pygame.transform.average_color(dest)[0], [123, 122]) - self.assertIn(pygame.transform.average_color(dest)[1], [123, 122]) - self.assertIn(pygame.transform.average_color(dest)[2], [123, 122]) + self.assertIn(pygame.transform.average_color(dest)[0], [124, 122]) + self.assertIn(pygame.transform.average_color(dest)[1], [124, 122]) + self.assertIn(pygame.transform.average_color(dest)[2], [124, 122]) s = pygame.Surface((32, 32), depth=24) s.fill((255, 0, 0)) @@ -215,6 +229,16 @@ def test_grayscale(self): self.assertEqual(pygame.transform.average_color(dest)[1], 76) self.assertEqual(pygame.transform.average_color(dest)[2], 72) + super_surf = pygame.Surface((64, 64), depth=32) + super_surf.fill((255, 255, 255)) + super_surf.fill((255, 0, 0), pygame.Rect(0, 0, 32, 32)) + sub_surf = super_surf.subsurface(pygame.Rect(0, 0, 32, 32)) + + grey_sub_surf = pygame.transform.grayscale(sub_surf) + self.assertEqual(pygame.transform.average_color(grey_sub_surf)[0], 76) + self.assertEqual(pygame.transform.average_color(grey_sub_surf)[0], 76) + self.assertEqual(pygame.transform.average_color(grey_sub_surf)[0], 76) + def test_threshold__honors_third_surface(self): # __doc__ for threshold as of Tue 07/15/2008 From 25f653c1b61bbe750998553d460c2a1ca098b569 Mon Sep 17 00:00:00 2001 From: Dan Lawrence Date: Sun, 10 Sep 2023 16:22:31 +0100 Subject: [PATCH 03/15] Switch to using AVX only in AVX version, add comments --- src_c/simd_transform.h | 15 +++++ src_c/simd_transform_avx2.c | 111 +++++++++++++++++++----------------- src_c/simd_transform_sse2.c | 50 +++++++++++++--- src_c/transform.c | 6 +- 4 files changed, 123 insertions(+), 59 deletions(-) diff --git a/src_c/simd_transform.h b/src_c/simd_transform.h index ef740e8b94..8940e86429 100644 --- a/src_c/simd_transform.h +++ b/src_c/simd_transform.h @@ -1,6 +1,21 @@ #define NO_PYGAME_C_API #include "_surface.h" +/** + * MACRO borrowed from SSE2NEON - useful for making the shuffling family of + * intrinsics easier to understand by indicating clearly what will go where. + * + * SSE2Neon description follows... + * MACRO for shuffle parameter for _mm_shuffle_ps(). + * Argument fp3 is a digit[0123] that represents the fp from argument "b" + * of mm_shuffle_ps that will be placed in fp3 of result. fp2 is the same + * for fp2 in result. fp1 is a digit[0123] that represents the fp from + * argument "a" of mm_shuffle_ps that will be places in fp1 of result. + * fp0 is the same for fp0 of result. + */ +#define _PG_SIMD_SHUFFLE(fp3, fp2, fp1, fp0) \ + (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0))) + #if !defined(PG_ENABLE_ARM_NEON) && defined(__aarch64__) // arm64 has neon optimisations enabled by default, even when fpu=neon is not // passed diff --git a/src_c/simd_transform_avx2.c b/src_c/simd_transform_avx2.c index f98c369d01..b60b4f9fe7 100644 --- a/src_c/simd_transform_avx2.c +++ b/src_c/simd_transform_avx2.c @@ -48,27 +48,24 @@ pg_avx2_at_runtime_but_uncompiled() void grayscale_avx2(SDL_Surface *src, SDL_Surface *newsurf) { - // Current AVX2 process - // ------------------ - // - pre loop: Load weights into register x8 - // - in loop: - // 1. Load 8 pixels into register - // 2. remove the alpha channel for every pixel and save it. - // 3. multiply weights by pixels using standard shuffle to 2x 16bit - // register, mul + 255 then left shift. See multiply blitter mode - // for this operation in isolation. - // 4. pack pixels back together from A & B while adding with a - // horizontal add (e.g. adds A+R and G+B in a ARGB layout) - // 5. shift and add to make final grey pixel colour in 0th - // 8Bit channel in each 'pixel' - // 6. shuffle again to push the grey from the 0th channel into every - // channel of every pixel. - // 7. add the alpha channel back in. - - // Things to fix: - // 1. Would be nice to only use AVX2 stuff for the single pixel stuff. - // 2. Get inspiration from Starbuck's AVX2 macros - + /* See the SSE2 code for a simpler overview of this algorithm + * Current AVX2 process + * ------------------ + * - pre loop: Load weights into register x8 + * - in loop: + * 1. Load 8 pixels into register + * 2. remove the alpha channel for every pixel and save it. + * 3. multiply weights by pixels using standard shuffle to 2x 16bit + * register, mul + 255 then left shift. See multiply blitter mode + * for this operation in isolation. + * 4. pack pixels back together from A & B while adding with a + * horizontal add (e.g. adds A+R and G+B in a ARGB layout) + * 5. shift and add to make final grey pixel colour in 0th + * 8Bit channel in each 'pixel' + * 6. shuffle again to push the grey from the 0th channel into every + * channel of every pixel. + * 7. add the alpha channel back in. + */ int s_row_skip = (src->pitch - src->w * src->format->BytesPerPixel) >> 2; // generate number of batches of pixels we need to loop through @@ -122,7 +119,7 @@ grayscale_avx2(SDL_Surface *src, SDL_Surface *newsurf) mm_zero = _mm_setzero_si128(); mm_alpha_mask = _mm_cvtsi32_si128(amask); mm_rgb_mask = _mm_cvtsi32_si128(rgbmask); - mm_two_five_fives = _mm_set_epi64x(0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF); + mm_two_five_fives = _mm_set1_epi64x(0x00FF00FF00FF00FF); mm_rgb_weights = _mm_unpacklo_epi8(_mm_cvtsi32_si128(rgb_weights), mm_zero); @@ -131,6 +128,15 @@ grayscale_avx2(SDL_Surface *src, SDL_Surface *newsurf) mm256_rgb_mask = _mm256_set1_epi32(rgbmask); mm256_alpha_mask = _mm256_set1_epi32(amask); + __m256i _partial8_mask = + _mm256_set_epi32(0x00, (remaining_pixels > 6) ? 0x80000000 : 0x00, + (remaining_pixels > 5) ? 0x80000000 : 0x00, + (remaining_pixels > 4) ? 0x80000000 : 0x00, + (remaining_pixels > 3) ? 0x80000000 : 0x00, + (remaining_pixels > 2) ? 0x80000000 : 0x00, + (remaining_pixels > 1) ? 0x80000000 : 0x00, + (remaining_pixels > 0) ? 0x80000000 : 0x00); + while (num_batches--) { perfect_8_pixels_batch_counter = perfect_8_pixels; remaining_pixels_batch_counter = remaining_pixels; @@ -169,35 +175,38 @@ grayscale_avx2(SDL_Surface *src, SDL_Surface *newsurf) } srcp = (Uint32 *)srcp256; dstp = (Uint32 *)dstp256; - while (remaining_pixels_batch_counter--) { - mm_src = _mm_cvtsi32_si128(*srcp); - /*mm_src = 0x000000000000000000000000AARRGGBB*/ - mm_alpha = _mm_subs_epu8(mm_src, mm_rgb_mask); - /*mm_src = 0x00000000000000000000000000RRGGBB*/ - mm_src = _mm_unpacklo_epi8(mm_src, mm_zero); - /*mm_src = 0x0000000000000000000000RR00GG00BB*/ - - mm_dst = _mm_mullo_epi16(mm_src, mm_rgb_weights); - /*mm_dst = 0x00000000000000000000RRRRGGGGBBBB*/ - mm_dst = _mm_add_epi16(mm_dst, mm_two_five_fives); - /*mm_dst = 0x00000000000000000000RRRRGGGGBBBB*/ - mm_dst = _mm_srli_epi16(mm_dst, 8); - /*mm_dst = 0x0000000000000000000000RR00GG00BB*/ - - mm_dst = _mm_hadd_epi16(mm_dst, mm_dst); // This requires SSE3 - mm_dst = _mm_shufflelo_epi16(_mm_hadd_epi16(mm_dst, mm_dst), - _MM_SHUFFLE(0, 0, 0, 0)); - /*mm_dst = 0x000000000000000000GrGr00GrGr00GrGr00GrGr*/ - - mm_dst = _mm_packus_epi16(mm_dst, mm_dst); - /*mm_dst = 0x000000000000000000000000GrGrGrGrGrGrGrGr*/ - mm_dst = _mm_subs_epu8(mm_dst, mm_alpha_mask); - mm_dst = _mm_add_epi16(mm_dst, mm_alpha); - /*mm_dst = 0x000000000000000000000000AAGrGrGrGrGrGr*/ - *dstp = _mm_cvtsi128_si32(mm_dst); - /*dstp = 0xAARRGGBB*/ - srcp++; - dstp++; + if (remaining_pixels_batch_counter > 0) { + mm256_src = _mm256_maskload_epi32((int *)srcp, _partial8_mask); + mm256_alpha = _mm256_subs_epu8(mm256_src, mm256_rgb_mask); + + mm256_srcA = _mm256_shuffle_epi8(mm256_src, mm256_shuff_mask_A); + mm256_srcB = _mm256_shuffle_epi8(mm256_src, mm256_shuff_mask_B); + + mm256_dstA = + _mm256_shuffle_epi8(mm256_rgb_weights, mm256_shuff_mask_A); + mm256_dstB = + _mm256_shuffle_epi8(mm256_rgb_weights, mm256_shuff_mask_B); + + mm256_dstA = _mm256_mullo_epi16(mm256_srcA, mm256_dstA); + mm256_dstA = _mm256_add_epi16(mm256_dstA, mm256_two_five_fives); + mm256_dstA = _mm256_srli_epi16(mm256_dstA, 8); + + mm256_dstB = _mm256_mullo_epi16(mm256_srcB, mm256_dstB); + mm256_dstB = _mm256_add_epi16(mm256_dstB, mm256_two_five_fives); + mm256_dstB = _mm256_srli_epi16(mm256_dstB, 8); + + mm256_dst = _mm256_hadd_epi16(mm256_dstA, mm256_dstB); + mm256_dst = + _mm256_add_epi16(mm256_dst, _mm256_srli_epi32(mm256_dst, 16)); + mm256_dst = _mm256_shuffle_epi8(mm256_dst, mm256_shuff_mask_gray); + + mm256_dst = _mm256_subs_epu8(mm256_dst, mm256_alpha_mask); + mm256_dst = _mm256_adds_epu8(mm256_dst, mm256_alpha); + + _mm256_maskstore_epi32((int *)dstp, _partial8_mask, mm256_dst); + + srcp += remaining_pixels_batch_counter; + dstp += remaining_pixels_batch_counter; } srcp += s_row_skip; srcp256 = (__m256i *)srcp; diff --git a/src_c/simd_transform_sse2.c b/src_c/simd_transform_sse2.c index fadd0f615c..383dad2444 100644 --- a/src_c/simd_transform_sse2.c +++ b/src_c/simd_transform_sse2.c @@ -39,6 +39,22 @@ pg_neon_at_runtime_but_uncompiled() void grayscale_sse2(SDL_Surface *src, SDL_Surface *newsurf) { + /* For the SSE2 SIMD version of grayscale we do one pixel at a time + * Thus we can calculate the number of loops (and pixels) by multiplying + * the width of the surface to be grayscaled, by the height of that + * surface. + * + * We also need to calculate a 'skip value' in case our surface's rows are + * not contiguous in memory. For surfaces, a single row's worth of pixel + * data is always contiguous (i.e. each pixel is next to each other). + * However, a surface's rows may be seperated from one another in memory, + * most commonly this happens with sub surfaces. + * The vast majority of surfaces used in applications will probably also + * have contiguous rows as that is what happens when you create a standard + * 32bit surface with pygame.Surface. SIMD Transform algorithms, + * should treat this 'most normal' case as the critical path to maximise + * performance. + */ int s_row_skip = (src->pitch - src->w * src->format->BytesPerPixel) >> 2; // generate number of batches of pixels we need to loop through @@ -67,7 +83,7 @@ grayscale_sse2(SDL_Surface *src, SDL_Surface *newsurf) mm_zero = _mm_setzero_si128(); mm_alpha_mask = _mm_cvtsi32_si128(amask); mm_rgb_mask = _mm_cvtsi32_si128(rgbmask); - mm_two_five_fives = _mm_set_epi64x(0x00FF00FF00FF00FF, 0x00FF00FF00FF00FF); + mm_two_five_fives = _mm_set1_epi64x(0x00FF00FF00FF00FF); mm_rgb_weights = _mm_unpacklo_epi8(_mm_cvtsi32_si128(rgb_weights), mm_zero); @@ -76,11 +92,18 @@ grayscale_sse2(SDL_Surface *src, SDL_Surface *newsurf) while (pixel_batch_counter--) { mm_src = _mm_cvtsi32_si128(*srcp); /*mm_src = 0x000000000000000000000000AARRGGBB*/ + /* First we strip out the alpha so we have one of our 4 channels + empty for the rest of the calculation */ mm_alpha = _mm_subs_epu8(mm_src, mm_rgb_mask); /*mm_src = 0x00000000000000000000000000RRGGBB*/ + + /* This is where we do the efficient 8bit 'floating point multiply' + operation of each channel by the weights - using a 16bit integer + multiply, an add and a bitshift. We use this trick repeatedly + for multiplication by a 0 to 1 value in SIMD code. + */ mm_src = _mm_unpacklo_epi8(mm_src, mm_zero); /*mm_src = 0x0000000000000000000000RR00GG00BB*/ - mm_dst = _mm_mullo_epi16(mm_src, mm_rgb_weights); /*mm_dst = 0x00000000000000000000RRRRGGGGBBBB*/ mm_dst = _mm_add_epi16(mm_dst, mm_two_five_fives); @@ -88,15 +111,28 @@ grayscale_sse2(SDL_Surface *src, SDL_Surface *newsurf) mm_dst = _mm_srli_epi16(mm_dst, 8); /*mm_dst = 0x0000000000000000000000RR00GG00BB*/ + /* now we have the multiplied channels we 'shuffle them out' one + * at a time so there are four copies of red, four copies of green, + * four copies of blue etc. Then we add all these together + * so each of channels contains R+G+B. + */ mm_dst = _mm_adds_epu8( _mm_adds_epu8( - _mm_shufflelo_epi16(mm_dst, _MM_SHUFFLE(0, 0, 0, 0)), - _mm_shufflelo_epi16(mm_dst, _MM_SHUFFLE(1, 1, 1, 1))), + _mm_shufflelo_epi16(mm_dst, _PG_SIMD_SHUFFLE(0, 0, 0, 0)), + _mm_shufflelo_epi16(mm_dst, _PG_SIMD_SHUFFLE(1, 1, 1, 1))), _mm_adds_epu8( - _mm_shufflelo_epi16(mm_dst, _MM_SHUFFLE(2, 2, 2, 2)), - _mm_shufflelo_epi16(mm_dst, _MM_SHUFFLE(3, 3, 3, 3)))); - /*mm_dst = 0x000000000000000000GrGr00GrGr00GrGr00GrGr*/ + _mm_shufflelo_epi16(mm_dst, _PG_SIMD_SHUFFLE(2, 2, 2, 2)), + _mm_shufflelo_epi16(mm_dst, + _PG_SIMD_SHUFFLE(3, 3, 3, 3)))); + /* Gr here stands for 'Gray' as we've now added all the channels + * back together after multiplying them above. + * mm_dst = 0x000000000000000000GrGr00GrGr00GrGr00GrGr + */ + /* The rest is just packing the grayscale back to the original + * 8bit pixel layout and adding the alpha we removed earlier back + * in again + */ mm_dst = _mm_packus_epi16(mm_dst, mm_dst); /*mm_dst = 0x000000000000000000000000GrGrGrGrGrGrGrGr*/ mm_dst = _mm_subs_epu8(mm_dst, mm_alpha_mask); diff --git a/src_c/transform.c b/src_c/transform.c index 915c7d4dd0..de3b862a0f 100644 --- a/src_c/transform.c +++ b/src_c/transform.c @@ -2045,7 +2045,11 @@ grayscale_non_simd(SDL_Surface *src, SDL_Surface *newsurf) Uint8 r, g, b, a; SDL_GetRGBA(pixel, src->format, &r, &g, &b, &a); - // RGBA to GRAY formula used by OpenCV + /* RGBA to GRAY formula used by OpenCV + * We are using a bitshift and integer addition to align the + * calculation with what is fastest for SIMD operations. + * Results are almost identical to floating point multiplication. + */ Uint8 grayscale_pixel = (Uint8)((((76 * r) + 255) >> 8) + (((150 * g) + 255) >> 8) + (((29 * b) + 255) >> 8)); From d3f0651a0b10313a8806dbf03fb9cd553b307a33 Mon Sep 17 00:00:00 2001 From: Dan Lawrence Date: Sun, 24 Sep 2023 11:18:23 +0100 Subject: [PATCH 04/15] Clean up unused SSE2 variables from AVX2 code --- src_c/simd_transform_avx2.c | 9 --------- 1 file changed, 9 deletions(-) diff --git a/src_c/simd_transform_avx2.c b/src_c/simd_transform_avx2.c index b60b4f9fe7..4c809728ac 100644 --- a/src_c/simd_transform_avx2.c +++ b/src_c/simd_transform_avx2.c @@ -96,8 +96,6 @@ grayscale_avx2(SDL_Surface *src, SDL_Surface *newsurf) __m256i *srcp256 = (__m256i *)src->pixels; __m256i *dstp256 = (__m256i *)newsurf->pixels; - __m128i mm_src, mm_dst, mm_alpha, mm_zero, mm_two_five_fives, - mm_rgb_weights, mm_alpha_mask, mm_rgb_mask; __m256i mm256_src, mm256_srcA, mm256_srcB, mm256_dst, mm256_dstA, mm256_dstB, mm256_shuff_mask_A, mm256_shuff_mask_B, mm256_two_five_fives, mm256_rgb_weights, mm256_shuff_mask_gray, @@ -116,13 +114,6 @@ grayscale_avx2(SDL_Surface *src, SDL_Surface *newsurf) 28, 28, 28, 28, 24, 24, 24, 24, 20, 20, 20, 20, 16, 16, 16, 16, 12, 12, 12, 12, 8, 8, 8, 8, 4, 4, 4, 4, 0, 0, 0, 0); - mm_zero = _mm_setzero_si128(); - mm_alpha_mask = _mm_cvtsi32_si128(amask); - mm_rgb_mask = _mm_cvtsi32_si128(rgbmask); - mm_two_five_fives = _mm_set1_epi64x(0x00FF00FF00FF00FF); - mm_rgb_weights = - _mm_unpacklo_epi8(_mm_cvtsi32_si128(rgb_weights), mm_zero); - mm256_two_five_fives = _mm256_set1_epi16(0x00FF); mm256_rgb_weights = _mm256_set1_epi32(rgb_weights); mm256_rgb_mask = _mm256_set1_epi32(rgbmask); From 33db90e0ee1a4340de9cc9a1c9bd7888a7bbf557 Mon Sep 17 00:00:00 2001 From: Dan Lawrence Date: Sun, 1 Oct 2023 18:51:53 +0100 Subject: [PATCH 05/15] Add SIMD versions of greyscale transform A few changes from review. --- src_c/simd_transform_avx2.c | 4 ++-- src_c/transform.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src_c/simd_transform_avx2.c b/src_c/simd_transform_avx2.c index 4c809728ac..cc4effa907 100644 --- a/src_c/simd_transform_avx2.c +++ b/src_c/simd_transform_avx2.c @@ -66,7 +66,7 @@ grayscale_avx2(SDL_Surface *src, SDL_Surface *newsurf) * channel of every pixel. * 7. add the alpha channel back in. */ - int s_row_skip = (src->pitch - src->w * src->format->BytesPerPixel) >> 2; + int s_row_skip = (src->pitch - src->w * 4) / 4; // generate number of batches of pixels we need to loop through int pixel_batch_length = src->w * src->h; @@ -77,7 +77,7 @@ grayscale_avx2(SDL_Surface *src, SDL_Surface *newsurf) } int remaining_pixels = pixel_batch_length % 8; - int perfect_8_pixels = (pixel_batch_length - remaining_pixels) / 8; + int perfect_8_pixels = pixel_batch_length / 8; int perfect_8_pixels_batch_counter = perfect_8_pixels; int remaining_pixels_batch_counter = remaining_pixels; diff --git a/src_c/transform.c b/src_c/transform.c index de3b862a0f..8455af6c66 100644 --- a/src_c/transform.c +++ b/src_c/transform.c @@ -2094,12 +2094,12 @@ grayscale(pgSurfaceObject *srcobj, pgSurfaceObject *dstobj) src->format->Gmask == newsurf->format->Gmask && src->format->Bmask == newsurf->format->Bmask && (src->pitch % src->format->BytesPerPixel == 0) && - (newsurf->pitch == (newsurf->w * newsurf->format->BytesPerPixel))) { + (newsurf->pitch == (newsurf->w * 4))) { if (pg_has_avx2()) { grayscale_avx2(src, newsurf); } #if defined(__SSE2__) || defined(PG_ENABLE_ARM_NEON) - if (pg_HasSSE_NEON()) { + else if (pg_HasSSE_NEON()) { grayscale_sse2(src, newsurf); } #endif // defined(__SSE2__) || defined(PG_ENABLE_ARM_NEON) From bac4ac020e44f70c04eb745a78f5294c7f80a3e4 Mon Sep 17 00:00:00 2001 From: Dan Lawrence Date: Mon, 2 Oct 2023 20:11:17 +0100 Subject: [PATCH 06/15] Upgrade SSE2 to 2 pixels at a time --- src_c/simd_transform_sse2.c | 112 ++++++++++++++++++++++++++++++++---- 1 file changed, 102 insertions(+), 10 deletions(-) diff --git a/src_c/simd_transform_sse2.c b/src_c/simd_transform_sse2.c index 383dad2444..5c1da408dc 100644 --- a/src_c/simd_transform_sse2.c +++ b/src_c/simd_transform_sse2.c @@ -37,7 +37,17 @@ pg_neon_at_runtime_but_uncompiled() #if (defined(__SSE2__) || defined(PG_ENABLE_ARM_NEON)) void -grayscale_sse2(SDL_Surface *src, SDL_Surface *newsurf) +#if defined(ENV64BIT) +#define LOAD_64_INTO_M128(num, reg) *reg = _mm_cvtsi64_si128(*num) +#define STORE_M128_INTO_64(reg, num) *num = _mm_cvtsi128_si64(*reg) +#else +#define LOAD_64_INTO_M128(num, reg) \ + *reg = _mm_loadl_epi64((const __m128i *)num) +#define STORE_M128_INTO_64(reg, num) _mm_storel_epi64((__m128i *)num, *reg) +#endif + + void + grayscale_sse2(SDL_Surface *src, SDL_Surface *newsurf) { /* For the SSE2 SIMD version of grayscale we do one pixel at a time * Thus we can calculate the number of loops (and pixels) by multiplying @@ -55,7 +65,7 @@ grayscale_sse2(SDL_Surface *src, SDL_Surface *newsurf) * should treat this 'most normal' case as the critical path to maximise * performance. */ - int s_row_skip = (src->pitch - src->w * src->format->BytesPerPixel) >> 2; + int s_row_skip = (src->pitch - src->w * 4) / 4; // generate number of batches of pixels we need to loop through int pixel_batch_length = src->w * src->h; @@ -64,7 +74,11 @@ grayscale_sse2(SDL_Surface *src, SDL_Surface *newsurf) pixel_batch_length = src->w; num_batches = src->h; } - int pixel_batch_counter = pixel_batch_length; + int remaining_pixels = pixel_batch_length % 2; + int perfect_2_pixels = pixel_batch_length / 2; + + int perfect_2_pixels_batch_counter = perfect_2_pixels; + int remaining_pixels_batch_counter = remaining_pixels; Uint32 *srcp = (Uint32 *)src->pixels; Uint32 *dstp = (Uint32 *)newsurf->pixels; @@ -73,23 +87,100 @@ grayscale_sse2(SDL_Surface *src, SDL_Surface *newsurf) (src->format->Rmask | src->format->Gmask | src->format->Bmask); Uint32 amask = ~rgbmask; - int rgb_weights = + Uint64 rgbmask64 = ((Uint64)rgbmask << 32) | rgbmask; + Uint64 amask64 = ~rgbmask64; + + Uint64 rgb_weights = + ((Uint64)((0x4C << src->format->Rshift) | + (0x96 << src->format->Gshift) | + (0x1D << src->format->Bshift)) + << 32) | ((0x4C << src->format->Rshift) | (0x96 << src->format->Gshift) | (0x1D << src->format->Bshift)); + Uint64 *srcp64 = (Uint64 *)src->pixels; + Uint64 *dstp64 = (Uint64 *)newsurf->pixels; + __m128i mm_src, mm_dst, mm_alpha, mm_zero, mm_two_five_fives, mm_rgb_weights, mm_alpha_mask, mm_rgb_mask; mm_zero = _mm_setzero_si128(); - mm_alpha_mask = _mm_cvtsi32_si128(amask); - mm_rgb_mask = _mm_cvtsi32_si128(rgbmask); + LOAD_64_INTO_M128(&amask64, &mm_alpha_mask); + LOAD_64_INTO_M128(&rgbmask64, &mm_rgb_mask); mm_two_five_fives = _mm_set1_epi64x(0x00FF00FF00FF00FF); - mm_rgb_weights = - _mm_unpacklo_epi8(_mm_cvtsi32_si128(rgb_weights), mm_zero); + + LOAD_64_INTO_M128(&rgb_weights, &mm_rgb_weights); + mm_rgb_weights = _mm_unpacklo_epi8(mm_rgb_weights, mm_zero); while (num_batches--) { - pixel_batch_counter = pixel_batch_length; - while (pixel_batch_counter--) { + perfect_2_pixels_batch_counter = perfect_2_pixels; + remaining_pixels_batch_counter = remaining_pixels; + while (perfect_2_pixels_batch_counter--) { + LOAD_64_INTO_M128(srcp64, &mm_src); + /*mm_src = 0x0000000000000000AARRGGBBAARRGGBB*/ + /* First we strip out the alpha so we have one of our 4 channels + empty for the rest of the calculation */ + mm_alpha = _mm_subs_epu8(mm_src, mm_rgb_mask); + /*mm_src = 0x000000000000000000RRGGBB00RRGGBB*/ + + /* This is where we do the efficient 8bit 'floating point multiply' + operation of each channel by the weights - using a 16bit integer + multiply, an add and a bitshift. We use this trick repeatedly + for multiplication by a 0 to 1 value in SIMD code. + */ + mm_src = _mm_unpacklo_epi8(mm_src, mm_zero); + /*mm_src = 0x000000RR00GG00BB000000RR00GG00BB*/ + mm_dst = _mm_mullo_epi16(mm_src, mm_rgb_weights); + /*mm_dst = 0x0000RRRRGGGGBBBB0000RRRRGGGGBBBB*/ + mm_dst = _mm_add_epi16(mm_dst, mm_two_five_fives); + /*mm_dst = 0x0000RRRRGGGGBBBB0000RRRRGGGGBBBB*/ + mm_dst = _mm_srli_epi16(mm_dst, 8); + /*mm_dst = 0x000000RR00GG00BB000000RR00GG00BB*/ + + /* now we have the multiplied channels we 'shuffle them out' one + * at a time so there are four copies of red, four copies of green, + * four copies of blue etc. Then we add all these together + * so each of channels contains R+G+B. + */ + mm_dst = _mm_adds_epu8( + _mm_adds_epu8(_mm_shufflehi_epi16( + _mm_shufflelo_epi16( + mm_dst, _PG_SIMD_SHUFFLE(0, 0, 0, 0)), + _PG_SIMD_SHUFFLE(0, 0, 0, 0)), + _mm_shufflehi_epi16( + _mm_shufflelo_epi16( + mm_dst, _PG_SIMD_SHUFFLE(1, 1, 1, 1)), + _PG_SIMD_SHUFFLE(1, 1, 1, 1))), + _mm_adds_epu8(_mm_shufflehi_epi16( + _mm_shufflelo_epi16( + mm_dst, _PG_SIMD_SHUFFLE(2, 2, 2, 2)), + _PG_SIMD_SHUFFLE(2, 2, 2, 2)), + _mm_shufflehi_epi16( + _mm_shufflelo_epi16( + mm_dst, _PG_SIMD_SHUFFLE(3, 3, 3, 3)), + _PG_SIMD_SHUFFLE(3, 3, 3, 3)))); + /* Gr here stands for 'Gray' as we've now added all the channels + * back together after multiplying them above. + * mm_dst = 0x0000GrGr00GrGr00GrGr00GrGr0000GrGr00GrGr00GrGr00GrGr + */ + + /* The rest is just packing the grayscale back to the original + * 8bit pixel layout and adding the alpha we removed earlier back + * in again + */ + mm_dst = _mm_packus_epi16(mm_dst, mm_dst); + /*mm_dst = 0x000000000000000000GrGrGrGrGrGr00GrGrGrGrGrGr*/ + mm_dst = _mm_subs_epu8(mm_dst, mm_alpha_mask); + mm_dst = _mm_adds_epu8(mm_dst, mm_alpha); + /*mm_dst = 0x0000000000000000AAGrGrGrGrGrGrAAGrGrGrGrGrGr*/ + STORE_M128_INTO_64(&mm_dst, dstp64); + /*dstp = 0xAARRGGBB*/ + srcp64++; + dstp64++; + } + srcp = (Uint32 *)srcp64; + dstp = (Uint32 *)dstp64; + if (remaining_pixels_batch_counter > 0) { mm_src = _mm_cvtsi32_si128(*srcp); /*mm_src = 0x000000000000000000000000AARRGGBB*/ /* First we strip out the alpha so we have one of our 4 channels @@ -144,6 +235,7 @@ grayscale_sse2(SDL_Surface *src, SDL_Surface *newsurf) dstp++; } srcp += s_row_skip; + srcp64 = (Uint64 *)srcp; } } #endif /* __SSE2__ || PG_ENABLE_ARM_NEON*/ From 7d8254bae813ca856c436299d39f8eb18d6ed333 Mon Sep 17 00:00:00 2001 From: Dan Lawrence Date: Mon, 2 Oct 2023 20:57:54 +0100 Subject: [PATCH 07/15] Upgrade SSE2 to 2 pixels at a time --- src_c/simd_transform_sse2.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/src_c/simd_transform_sse2.c b/src_c/simd_transform_sse2.c index 5c1da408dc..ec869cab7d 100644 --- a/src_c/simd_transform_sse2.c +++ b/src_c/simd_transform_sse2.c @@ -36,7 +36,7 @@ pg_neon_at_runtime_but_uncompiled() } #if (defined(__SSE2__) || defined(PG_ENABLE_ARM_NEON)) -void + #if defined(ENV64BIT) #define LOAD_64_INTO_M128(num, reg) *reg = _mm_cvtsi64_si128(*num) #define STORE_M128_INTO_64(reg, num) *num = _mm_cvtsi128_si64(*reg) @@ -46,8 +46,8 @@ void #define STORE_M128_INTO_64(reg, num) _mm_storel_epi64((__m128i *)num, *reg) #endif - void - grayscale_sse2(SDL_Surface *src, SDL_Surface *newsurf) +void +grayscale_sse2(SDL_Surface *src, SDL_Surface *newsurf) { /* For the SSE2 SIMD version of grayscale we do one pixel at a time * Thus we can calculate the number of loops (and pixels) by multiplying @@ -85,8 +85,6 @@ void Uint32 rgbmask = (src->format->Rmask | src->format->Gmask | src->format->Bmask); - Uint32 amask = ~rgbmask; - Uint64 rgbmask64 = ((Uint64)rgbmask << 32) | rgbmask; Uint64 amask64 = ~rgbmask64; From 14e0cdabecab0b1135bfb447908ec3e0b46c3c5e Mon Sep 17 00:00:00 2001 From: Dan Lawrence Date: Sun, 15 Oct 2023 08:44:42 +0100 Subject: [PATCH 08/15] change src->format->BytesPerPixel to 4 --- src_c/transform.c | 140 ++++++++++++++++++++++------------------------ 1 file changed, 66 insertions(+), 74 deletions(-) diff --git a/src_c/transform.c b/src_c/transform.c index bb04065deb..5d9852d112 100644 --- a/src_c/transform.c +++ b/src_c/transform.c @@ -1952,86 +1952,79 @@ clamp_4 */ -#define SURF_GET_AT(p_color, p_surf, p_x, p_y, p_pixels, p_format, p_pix) \ - switch (p_format->BytesPerPixel) { \ - case 1: \ - p_color = (Uint32) * \ - ((Uint8 *)(p_pixels) + (p_y) * p_surf->pitch + (p_x)); \ - break; \ - case 2: \ - p_color = \ - (Uint32) * \ - ((Uint16 *)((p_pixels) + (p_y) * p_surf->pitch) + (p_x)); \ - break; \ - case 3: \ - p_pix = \ - ((Uint8 *)(p_pixels + (p_y) * p_surf->pitch) + (p_x) * 3); \ - p_color = (SDL_BYTEORDER == SDL_LIL_ENDIAN) \ - ? (p_pix[0]) + (p_pix[1] << 8) + (p_pix[2] << 16) \ - : (p_pix[2]) + (p_pix[1] << 8) + (p_pix[0] << 16); \ - break; \ - default: /* case 4: */ \ - p_color = \ - *((Uint32 *)(p_pixels + (p_y) * p_surf->pitch) + (p_x)); \ - break; \ +#define SURF_GET_AT(p_color, p_surf, p_x, p_y, p_pixels, p_format, p_pix) \ + switch (p_format->BytesPerPixel) { \ + case 1: \ + p_color = (Uint32) * \ + ((Uint8 *)(p_pixels) + (p_y)*p_surf->pitch + (p_x)); \ + break; \ + case 2: \ + p_color = (Uint32) * \ + ((Uint16 *)((p_pixels) + (p_y)*p_surf->pitch) + (p_x)); \ + break; \ + case 3: \ + p_pix = ((Uint8 *)(p_pixels + (p_y)*p_surf->pitch) + (p_x)*3); \ + p_color = (SDL_BYTEORDER == SDL_LIL_ENDIAN) \ + ? (p_pix[0]) + (p_pix[1] << 8) + (p_pix[2] << 16) \ + : (p_pix[2]) + (p_pix[1] << 8) + (p_pix[0] << 16); \ + break; \ + default: /* case 4: */ \ + p_color = *((Uint32 *)(p_pixels + (p_y)*p_surf->pitch) + (p_x)); \ + break; \ } #if (SDL_BYTEORDER == SDL_LIL_ENDIAN) -#define SURF_SET_AT(p_color, p_surf, p_x, p_y, p_pixels, p_format, \ - p_byte_buf) \ - switch (p_format->BytesPerPixel) { \ - case 1: \ - *((Uint8 *)p_pixels + (p_y) * p_surf->pitch + (p_x)) = \ - (Uint8)p_color; \ - break; \ - case 2: \ - *((Uint16 *)(p_pixels + (p_y) * p_surf->pitch) + (p_x)) = \ - (Uint16)p_color; \ - break; \ - case 3: \ - p_byte_buf = \ - (Uint8 *)(p_pixels + (p_y) * p_surf->pitch) + (p_x) * 3; \ - *(p_byte_buf + (p_format->Rshift >> 3)) = \ - (Uint8)(p_color >> p_format->Rshift); \ - *(p_byte_buf + (p_format->Gshift >> 3)) = \ - (Uint8)(p_color >> p_format->Gshift); \ - *(p_byte_buf + (p_format->Bshift >> 3)) = \ - (Uint8)(p_color >> p_format->Bshift); \ - break; \ - default: \ - *((Uint32 *)(p_pixels + (p_y) * p_surf->pitch) + (p_x)) = \ - p_color; \ - break; \ +#define SURF_SET_AT(p_color, p_surf, p_x, p_y, p_pixels, p_format, \ + p_byte_buf) \ + switch (p_format->BytesPerPixel) { \ + case 1: \ + *((Uint8 *)p_pixels + (p_y)*p_surf->pitch + (p_x)) = \ + (Uint8)p_color; \ + break; \ + case 2: \ + *((Uint16 *)(p_pixels + (p_y)*p_surf->pitch) + (p_x)) = \ + (Uint16)p_color; \ + break; \ + case 3: \ + p_byte_buf = (Uint8 *)(p_pixels + (p_y)*p_surf->pitch) + (p_x)*3; \ + *(p_byte_buf + (p_format->Rshift >> 3)) = \ + (Uint8)(p_color >> p_format->Rshift); \ + *(p_byte_buf + (p_format->Gshift >> 3)) = \ + (Uint8)(p_color >> p_format->Gshift); \ + *(p_byte_buf + (p_format->Bshift >> 3)) = \ + (Uint8)(p_color >> p_format->Bshift); \ + break; \ + default: \ + *((Uint32 *)(p_pixels + (p_y)*p_surf->pitch) + (p_x)) = p_color; \ + break; \ } #else -#define SURF_SET_AT(p_color, p_surf, p_x, p_y, p_pixels, p_format, \ - p_byte_buf) \ - switch (p_format->BytesPerPixel) { \ - case 1: \ - *((Uint8 *)p_pixels + (p_y) * p_surf->pitch + (p_x)) = \ - (Uint8)p_color; \ - break; \ - case 2: \ - *((Uint16 *)(p_pixels + (p_y) * p_surf->pitch) + (p_x)) = \ - (Uint16)p_color; \ - break; \ - case 3: \ - p_byte_buf = \ - (Uint8 *)(p_pixels + (p_y) * p_surf->pitch) + (p_x) * 3; \ - *(p_byte_buf + 2 - (p_format->Rshift >> 3)) = \ - (Uint8)(p_color >> p_format->Rshift); \ - *(p_byte_buf + 2 - (p_format->Gshift >> 3)) = \ - (Uint8)(p_color >> p_format->Gshift); \ - *(p_byte_buf + 2 - (p_format->Bshift >> 3)) = \ - (Uint8)(p_color >> p_format->Bshift); \ - break; \ - default: \ - *((Uint32 *)(p_pixels + (p_y) * p_surf->pitch) + (p_x)) = \ - p_color; \ - break; \ +#define SURF_SET_AT(p_color, p_surf, p_x, p_y, p_pixels, p_format, \ + p_byte_buf) \ + switch (p_format->BytesPerPixel) { \ + case 1: \ + *((Uint8 *)p_pixels + (p_y)*p_surf->pitch + (p_x)) = \ + (Uint8)p_color; \ + break; \ + case 2: \ + *((Uint16 *)(p_pixels + (p_y)*p_surf->pitch) + (p_x)) = \ + (Uint16)p_color; \ + break; \ + case 3: \ + p_byte_buf = (Uint8 *)(p_pixels + (p_y)*p_surf->pitch) + (p_x)*3; \ + *(p_byte_buf + 2 - (p_format->Rshift >> 3)) = \ + (Uint8)(p_color >> p_format->Rshift); \ + *(p_byte_buf + 2 - (p_format->Gshift >> 3)) = \ + (Uint8)(p_color >> p_format->Gshift); \ + *(p_byte_buf + 2 - (p_format->Bshift >> 3)) = \ + (Uint8)(p_color >> p_format->Bshift); \ + break; \ + default: \ + *((Uint32 *)(p_pixels + (p_y)*p_surf->pitch) + (p_x)) = p_color; \ + break; \ } #endif @@ -2097,8 +2090,7 @@ grayscale(pgSurfaceObject *srcobj, pgSurfaceObject *dstobj) src->format->Rmask == newsurf->format->Rmask && src->format->Gmask == newsurf->format->Gmask && src->format->Bmask == newsurf->format->Bmask && - (src->pitch % src->format->BytesPerPixel == 0) && - (newsurf->pitch == (newsurf->w * 4))) { + (src->pitch % 4 == 0) && (newsurf->pitch == (newsurf->w * 4))) { if (pg_has_avx2()) { grayscale_avx2(src, newsurf); } From 25a611bf549f1e047fb275bbe4bf309ecd07b756 Mon Sep 17 00:00:00 2001 From: Dan Lawrence Date: Sun, 15 Oct 2023 10:48:15 +0100 Subject: [PATCH 09/15] update formatting to latest version of clang --- src_c/transform.c | 137 ++++++++++++++++++++++++---------------------- 1 file changed, 72 insertions(+), 65 deletions(-) diff --git a/src_c/transform.c b/src_c/transform.c index 5d9852d112..845556b5e6 100644 --- a/src_c/transform.c +++ b/src_c/transform.c @@ -1952,79 +1952,86 @@ clamp_4 */ -#define SURF_GET_AT(p_color, p_surf, p_x, p_y, p_pixels, p_format, p_pix) \ - switch (p_format->BytesPerPixel) { \ - case 1: \ - p_color = (Uint32) * \ - ((Uint8 *)(p_pixels) + (p_y)*p_surf->pitch + (p_x)); \ - break; \ - case 2: \ - p_color = (Uint32) * \ - ((Uint16 *)((p_pixels) + (p_y)*p_surf->pitch) + (p_x)); \ - break; \ - case 3: \ - p_pix = ((Uint8 *)(p_pixels + (p_y)*p_surf->pitch) + (p_x)*3); \ - p_color = (SDL_BYTEORDER == SDL_LIL_ENDIAN) \ - ? (p_pix[0]) + (p_pix[1] << 8) + (p_pix[2] << 16) \ - : (p_pix[2]) + (p_pix[1] << 8) + (p_pix[0] << 16); \ - break; \ - default: /* case 4: */ \ - p_color = *((Uint32 *)(p_pixels + (p_y)*p_surf->pitch) + (p_x)); \ - break; \ +#define SURF_GET_AT(p_color, p_surf, p_x, p_y, p_pixels, p_format, p_pix) \ + switch (p_format->BytesPerPixel) { \ + case 1: \ + p_color = (Uint32) * \ + ((Uint8 *)(p_pixels) + (p_y) * p_surf->pitch + (p_x)); \ + break; \ + case 2: \ + p_color = \ + (Uint32) * \ + ((Uint16 *)((p_pixels) + (p_y) * p_surf->pitch) + (p_x)); \ + break; \ + case 3: \ + p_pix = \ + ((Uint8 *)(p_pixels + (p_y) * p_surf->pitch) + (p_x) * 3); \ + p_color = (SDL_BYTEORDER == SDL_LIL_ENDIAN) \ + ? (p_pix[0]) + (p_pix[1] << 8) + (p_pix[2] << 16) \ + : (p_pix[2]) + (p_pix[1] << 8) + (p_pix[0] << 16); \ + break; \ + default: /* case 4: */ \ + p_color = \ + *((Uint32 *)(p_pixels + (p_y) * p_surf->pitch) + (p_x)); \ + break; \ } #if (SDL_BYTEORDER == SDL_LIL_ENDIAN) -#define SURF_SET_AT(p_color, p_surf, p_x, p_y, p_pixels, p_format, \ - p_byte_buf) \ - switch (p_format->BytesPerPixel) { \ - case 1: \ - *((Uint8 *)p_pixels + (p_y)*p_surf->pitch + (p_x)) = \ - (Uint8)p_color; \ - break; \ - case 2: \ - *((Uint16 *)(p_pixels + (p_y)*p_surf->pitch) + (p_x)) = \ - (Uint16)p_color; \ - break; \ - case 3: \ - p_byte_buf = (Uint8 *)(p_pixels + (p_y)*p_surf->pitch) + (p_x)*3; \ - *(p_byte_buf + (p_format->Rshift >> 3)) = \ - (Uint8)(p_color >> p_format->Rshift); \ - *(p_byte_buf + (p_format->Gshift >> 3)) = \ - (Uint8)(p_color >> p_format->Gshift); \ - *(p_byte_buf + (p_format->Bshift >> 3)) = \ - (Uint8)(p_color >> p_format->Bshift); \ - break; \ - default: \ - *((Uint32 *)(p_pixels + (p_y)*p_surf->pitch) + (p_x)) = p_color; \ - break; \ +#define SURF_SET_AT(p_color, p_surf, p_x, p_y, p_pixels, p_format, \ + p_byte_buf) \ + switch (p_format->BytesPerPixel) { \ + case 1: \ + *((Uint8 *)p_pixels + (p_y) * p_surf->pitch + (p_x)) = \ + (Uint8)p_color; \ + break; \ + case 2: \ + *((Uint16 *)(p_pixels + (p_y) * p_surf->pitch) + (p_x)) = \ + (Uint16)p_color; \ + break; \ + case 3: \ + p_byte_buf = \ + (Uint8 *)(p_pixels + (p_y) * p_surf->pitch) + (p_x) * 3; \ + *(p_byte_buf + (p_format->Rshift >> 3)) = \ + (Uint8)(p_color >> p_format->Rshift); \ + *(p_byte_buf + (p_format->Gshift >> 3)) = \ + (Uint8)(p_color >> p_format->Gshift); \ + *(p_byte_buf + (p_format->Bshift >> 3)) = \ + (Uint8)(p_color >> p_format->Bshift); \ + break; \ + default: \ + *((Uint32 *)(p_pixels + (p_y) * p_surf->pitch) + (p_x)) = \ + p_color; \ + break; \ } #else -#define SURF_SET_AT(p_color, p_surf, p_x, p_y, p_pixels, p_format, \ - p_byte_buf) \ - switch (p_format->BytesPerPixel) { \ - case 1: \ - *((Uint8 *)p_pixels + (p_y)*p_surf->pitch + (p_x)) = \ - (Uint8)p_color; \ - break; \ - case 2: \ - *((Uint16 *)(p_pixels + (p_y)*p_surf->pitch) + (p_x)) = \ - (Uint16)p_color; \ - break; \ - case 3: \ - p_byte_buf = (Uint8 *)(p_pixels + (p_y)*p_surf->pitch) + (p_x)*3; \ - *(p_byte_buf + 2 - (p_format->Rshift >> 3)) = \ - (Uint8)(p_color >> p_format->Rshift); \ - *(p_byte_buf + 2 - (p_format->Gshift >> 3)) = \ - (Uint8)(p_color >> p_format->Gshift); \ - *(p_byte_buf + 2 - (p_format->Bshift >> 3)) = \ - (Uint8)(p_color >> p_format->Bshift); \ - break; \ - default: \ - *((Uint32 *)(p_pixels + (p_y)*p_surf->pitch) + (p_x)) = p_color; \ - break; \ +#define SURF_SET_AT(p_color, p_surf, p_x, p_y, p_pixels, p_format, \ + p_byte_buf) \ + switch (p_format->BytesPerPixel) { \ + case 1: \ + *((Uint8 *)p_pixels + (p_y) * p_surf->pitch + (p_x)) = \ + (Uint8)p_color; \ + break; \ + case 2: \ + *((Uint16 *)(p_pixels + (p_y) * p_surf->pitch) + (p_x)) = \ + (Uint16)p_color; \ + break; \ + case 3: \ + p_byte_buf = \ + (Uint8 *)(p_pixels + (p_y) * p_surf->pitch) + (p_x) * 3; \ + *(p_byte_buf + 2 - (p_format->Rshift >> 3)) = \ + (Uint8)(p_color >> p_format->Rshift); \ + *(p_byte_buf + 2 - (p_format->Gshift >> 3)) = \ + (Uint8)(p_color >> p_format->Gshift); \ + *(p_byte_buf + 2 - (p_format->Bshift >> 3)) = \ + (Uint8)(p_color >> p_format->Bshift); \ + break; \ + default: \ + *((Uint32 *)(p_pixels + (p_y) * p_surf->pitch) + (p_x)) = \ + p_color; \ + break; \ } #endif From ce96a26f343b54d8e837c341275aafa8ac256691 Mon Sep 17 00:00:00 2001 From: Starbuck5 <46412508+Starbuck5@users.noreply.github.com> Date: Wed, 25 Oct 2023 23:05:44 -0700 Subject: [PATCH 10/15] Add test for assumptions --- test/transform_test.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/test/transform_test.py b/test/transform_test.py index 087b1e5044..f7585f2d4f 100644 --- a/test/transform_test.py +++ b/test/transform_test.py @@ -239,6 +239,16 @@ def test_grayscale(self): self.assertEqual(pygame.transform.average_color(grey_sub_surf)[0], 76) self.assertEqual(pygame.transform.average_color(grey_sub_surf)[0], 76) + def test_grayscale_simd_assumptions(self): + # The grayscale SIMD algorithm relies on the destination surface pitch + # being exactly width * 4 (4 bytes per pixel), for maximum speed. + # This test is here to make sure that assumption is always true. + widths = [1, 5, 6, 23, 54, 233] + for width in widths: + self.assertEqual( + pygame.Surface((width, 1), depth=32).get_pitch(), width * 4 + ) + def test_threshold__honors_third_surface(self): # __doc__ for threshold as of Tue 07/15/2008 From 8b8b4b64974efd8e09f3864079df709aabd76a45 Mon Sep 17 00:00:00 2001 From: Dan Lawrence Date: Thu, 26 Oct 2023 07:16:54 +0100 Subject: [PATCH 11/15] Add versionchanged documentation --- docs/reST/ref/transform.rst | 3 +++ 1 file changed, 3 insertions(+) diff --git a/docs/reST/ref/transform.rst b/docs/reST/ref/transform.rst index 9823f42bf1..81002f8c90 100644 --- a/docs/reST/ref/transform.rst +++ b/docs/reST/ref/transform.rst @@ -334,6 +334,9 @@ Instead, always begin with the original image and scale to the desired size.) .. versionadded:: 2.1.4 + .. versionchanged:: 2.4.0 Adjusted formula slightly to support performance optimisation. It may return very slightly + different pixels than before, but should run seven to eleven times faster on most systems. + .. ## pygame.transform.grayscale ## .. function:: threshold From 4e42929b6ab91892a6701ccbcb17ed339f47d765 Mon Sep 17 00:00:00 2001 From: Dan Lawrence Date: Fri, 3 Nov 2023 19:18:40 +0000 Subject: [PATCH 12/15] Change to using bitwise logic for alpha switcheroo --- src_c/simd_transform_avx2.c | 12 ++++++------ src_c/simd_transform_sse2.c | 12 ++++++------ 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/src_c/simd_transform_avx2.c b/src_c/simd_transform_avx2.c index cc4effa907..1df78957a4 100644 --- a/src_c/simd_transform_avx2.c +++ b/src_c/simd_transform_avx2.c @@ -133,7 +133,7 @@ grayscale_avx2(SDL_Surface *src, SDL_Surface *newsurf) remaining_pixels_batch_counter = remaining_pixels; while (perfect_8_pixels_batch_counter--) { mm256_src = _mm256_loadu_si256(srcp256); - mm256_alpha = _mm256_subs_epu8(mm256_src, mm256_rgb_mask); + mm256_alpha = _mm256_and_si256(mm256_src, mm256_alpha_mask); mm256_srcA = _mm256_shuffle_epi8(mm256_src, mm256_shuff_mask_A); mm256_srcB = _mm256_shuffle_epi8(mm256_src, mm256_shuff_mask_B); @@ -156,8 +156,8 @@ grayscale_avx2(SDL_Surface *src, SDL_Surface *newsurf) _mm256_add_epi16(mm256_dst, _mm256_srli_epi32(mm256_dst, 16)); mm256_dst = _mm256_shuffle_epi8(mm256_dst, mm256_shuff_mask_gray); - mm256_dst = _mm256_subs_epu8(mm256_dst, mm256_alpha_mask); - mm256_dst = _mm256_adds_epu8(mm256_dst, mm256_alpha); + mm256_dst = _mm256_and_si256(mm256_dst, mm256_rgb_mask); + mm256_dst = _mm256_or_si256(mm256_dst, mm256_alpha); _mm256_storeu_si256(dstp256, mm256_dst); @@ -168,7 +168,7 @@ grayscale_avx2(SDL_Surface *src, SDL_Surface *newsurf) dstp = (Uint32 *)dstp256; if (remaining_pixels_batch_counter > 0) { mm256_src = _mm256_maskload_epi32((int *)srcp, _partial8_mask); - mm256_alpha = _mm256_subs_epu8(mm256_src, mm256_rgb_mask); + mm256_alpha = _mm256_and_si256(mm256_src, mm256_alpha_mask); mm256_srcA = _mm256_shuffle_epi8(mm256_src, mm256_shuff_mask_A); mm256_srcB = _mm256_shuffle_epi8(mm256_src, mm256_shuff_mask_B); @@ -191,8 +191,8 @@ grayscale_avx2(SDL_Surface *src, SDL_Surface *newsurf) _mm256_add_epi16(mm256_dst, _mm256_srli_epi32(mm256_dst, 16)); mm256_dst = _mm256_shuffle_epi8(mm256_dst, mm256_shuff_mask_gray); - mm256_dst = _mm256_subs_epu8(mm256_dst, mm256_alpha_mask); - mm256_dst = _mm256_adds_epu8(mm256_dst, mm256_alpha); + mm256_dst = _mm256_and_si256(mm256_dst, mm256_rgb_mask); + mm256_dst = _mm256_or_si256(mm256_dst, mm256_alpha); _mm256_maskstore_epi32((int *)dstp, _partial8_mask, mm256_dst); diff --git a/src_c/simd_transform_sse2.c b/src_c/simd_transform_sse2.c index 9efbb2ba4c..8cf1ffd6c2 100644 --- a/src_c/simd_transform_sse2.c +++ b/src_c/simd_transform_sse2.c @@ -494,7 +494,7 @@ grayscale_sse2(SDL_Surface *src, SDL_Surface *newsurf) /*mm_src = 0x0000000000000000AARRGGBBAARRGGBB*/ /* First we strip out the alpha so we have one of our 4 channels empty for the rest of the calculation */ - mm_alpha = _mm_subs_epu8(mm_src, mm_rgb_mask); + mm_alpha = _mm_and_si128(mm_src, mm_alpha_mask); /*mm_src = 0x000000000000000000RRGGBB00RRGGBB*/ /* This is where we do the efficient 8bit 'floating point multiply' @@ -544,8 +544,8 @@ grayscale_sse2(SDL_Surface *src, SDL_Surface *newsurf) */ mm_dst = _mm_packus_epi16(mm_dst, mm_dst); /*mm_dst = 0x000000000000000000GrGrGrGrGrGr00GrGrGrGrGrGr*/ - mm_dst = _mm_subs_epu8(mm_dst, mm_alpha_mask); - mm_dst = _mm_adds_epu8(mm_dst, mm_alpha); + mm_dst = _mm_and_si128(mm_dst, mm_rgb_mask); + mm_dst = _mm_or_si128(mm_dst, mm_alpha); /*mm_dst = 0x0000000000000000AAGrGrGrGrGrGrAAGrGrGrGrGrGr*/ STORE_M128_INTO_64(&mm_dst, dstp64); /*dstp = 0xAARRGGBB*/ @@ -559,7 +559,7 @@ grayscale_sse2(SDL_Surface *src, SDL_Surface *newsurf) /*mm_src = 0x000000000000000000000000AARRGGBB*/ /* First we strip out the alpha so we have one of our 4 channels empty for the rest of the calculation */ - mm_alpha = _mm_subs_epu8(mm_src, mm_rgb_mask); + mm_alpha = _mm_and_si128(mm_src, mm_alpha_mask); /*mm_src = 0x00000000000000000000000000RRGGBB*/ /* This is where we do the efficient 8bit 'floating point multiply' @@ -600,8 +600,8 @@ grayscale_sse2(SDL_Surface *src, SDL_Surface *newsurf) */ mm_dst = _mm_packus_epi16(mm_dst, mm_dst); /*mm_dst = 0x000000000000000000000000GrGrGrGrGrGrGrGr*/ - mm_dst = _mm_subs_epu8(mm_dst, mm_alpha_mask); - mm_dst = _mm_adds_epu8(mm_dst, mm_alpha); + mm_dst = _mm_and_si128(mm_dst, mm_rgb_mask); + mm_dst = _mm_or_si128(mm_dst, mm_alpha); /*mm_dst = 0x000000000000000000000000AAGrGrGrGrGrGr*/ *dstp = _mm_cvtsi128_si32(mm_dst); /*dstp = 0xAARRGGBB*/ From 56442bfcc37cbe47ce85b5df909a8f38e6c64f6c Mon Sep 17 00:00:00 2001 From: Dan Lawrence Date: Mon, 6 Nov 2023 21:38:20 +0000 Subject: [PATCH 13/15] Disable SIMD on Emscripten --- src_c/transform.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src_c/transform.c b/src_c/transform.c index c7d1300645..9217bc188b 100644 --- a/src_c/transform.c +++ b/src_c/transform.c @@ -2115,7 +2115,9 @@ grayscale(pgSurfaceObject *srcobj, pgSurfaceObject *dstobj) PyExc_ValueError, "Source and destination surfaces need the same format.")); } - +#if defined(__EMSCRIPTEN__) + invert_non_simd(src, newsurf); +#else // !defined(__EMSCRIPTEN__) if (src->format->BytesPerPixel == 4 && src->format->Rmask == newsurf->format->Rmask && src->format->Gmask == newsurf->format->Gmask && @@ -2136,6 +2138,7 @@ grayscale(pgSurfaceObject *srcobj, pgSurfaceObject *dstobj) else { grayscale_non_simd(src, newsurf); } +#endif // !defined(__EMSCRIPTEN__) SDL_UnlockSurface(newsurf); From e8841d43183d3e511edc4de43e54c94c9beac65a Mon Sep 17 00:00:00 2001 From: Dan Lawrence Date: Wed, 8 Nov 2023 18:33:36 +0000 Subject: [PATCH 14/15] Call correct non_simd_ function --- src_c/transform.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src_c/transform.c b/src_c/transform.c index 9217bc188b..f4fcf51405 100644 --- a/src_c/transform.c +++ b/src_c/transform.c @@ -2116,7 +2116,7 @@ grayscale(pgSurfaceObject *srcobj, pgSurfaceObject *dstobj) "Source and destination surfaces need the same format.")); } #if defined(__EMSCRIPTEN__) - invert_non_simd(src, newsurf); + grayscale_non_simd(src, newsurf); #else // !defined(__EMSCRIPTEN__) if (src->format->BytesPerPixel == 4 && src->format->Rmask == newsurf->format->Rmask && From b5a1ca015feaf80d0f45a6674757ce33164a69b0 Mon Sep 17 00:00:00 2001 From: Dan Lawrence Date: Sun, 12 Nov 2023 12:04:19 +0000 Subject: [PATCH 15/15] Add some changes and documentation from code review. --- src_c/simd_transform_avx2.c | 59 ++++++++++++++++++++++--------------- src_c/simd_transform_sse2.c | 12 ++++---- 2 files changed, 40 insertions(+), 31 deletions(-) diff --git a/src_c/simd_transform_avx2.c b/src_c/simd_transform_avx2.c index 1df78957a4..5be6f9863f 100644 --- a/src_c/simd_transform_avx2.c +++ b/src_c/simd_transform_avx2.c @@ -85,9 +85,8 @@ grayscale_avx2(SDL_Surface *src, SDL_Surface *newsurf) Uint32 *srcp = (Uint32 *)src->pixels; Uint32 *dstp = (Uint32 *)newsurf->pixels; - Uint32 rgbmask = - (src->format->Rmask | src->format->Gmask | src->format->Bmask); - Uint32 amask = ~rgbmask; + Uint32 amask = src->format->Amask; + Uint32 rgbmask = ~amask; int rgb_weights = ((0x4C << src->format->Rshift) | (0x96 << src->format->Gshift) | @@ -99,7 +98,8 @@ grayscale_avx2(SDL_Surface *src, SDL_Surface *newsurf) __m256i mm256_src, mm256_srcA, mm256_srcB, mm256_dst, mm256_dstA, mm256_dstB, mm256_shuff_mask_A, mm256_shuff_mask_B, mm256_two_five_fives, mm256_rgb_weights, mm256_shuff_mask_gray, - mm256_alpha, mm256_rgb_mask, mm256_alpha_mask; + mm256_alpha, mm256_rgb_mask, mm256_alpha_mask, + mm256_shuffled_weights_A, mm256_shuffled_weights_B; mm256_shuff_mask_A = _mm256_set_epi8(0x80, 23, 0x80, 22, 0x80, 21, 0x80, 20, 0x80, 19, 0x80, @@ -119,43 +119,57 @@ grayscale_avx2(SDL_Surface *src, SDL_Surface *newsurf) mm256_rgb_mask = _mm256_set1_epi32(rgbmask); mm256_alpha_mask = _mm256_set1_epi32(amask); - __m256i _partial8_mask = - _mm256_set_epi32(0x00, (remaining_pixels > 6) ? 0x80000000 : 0x00, - (remaining_pixels > 5) ? 0x80000000 : 0x00, - (remaining_pixels > 4) ? 0x80000000 : 0x00, - (remaining_pixels > 3) ? 0x80000000 : 0x00, - (remaining_pixels > 2) ? 0x80000000 : 0x00, - (remaining_pixels > 1) ? 0x80000000 : 0x00, - (remaining_pixels > 0) ? 0x80000000 : 0x00); + mm256_shuffled_weights_A = + _mm256_shuffle_epi8(mm256_rgb_weights, mm256_shuff_mask_A); + mm256_shuffled_weights_B = + _mm256_shuffle_epi8(mm256_rgb_weights, mm256_shuff_mask_B); + + __m256i _partial8_mask = _mm256_set_epi32( + 0, (remaining_pixels > 6) ? -1 : 0, (remaining_pixels > 5) ? -1 : 0, + (remaining_pixels > 4) ? -1 : 0, (remaining_pixels > 3) ? -1 : 0, + (remaining_pixels > 2) ? -1 : 0, (remaining_pixels > 1) ? -1 : 0, + (remaining_pixels > 0) ? -1 : 0); while (num_batches--) { perfect_8_pixels_batch_counter = perfect_8_pixels; remaining_pixels_batch_counter = remaining_pixels; while (perfect_8_pixels_batch_counter--) { mm256_src = _mm256_loadu_si256(srcp256); + // strip out the the alpha and store it mm256_alpha = _mm256_and_si256(mm256_src, mm256_alpha_mask); + // shuffle out the 8 pixels into two spaced out registers + // there are four pixels in each register with 16bits of room + // per channel. This gives us bit space for multiplication. mm256_srcA = _mm256_shuffle_epi8(mm256_src, mm256_shuff_mask_A); mm256_srcB = _mm256_shuffle_epi8(mm256_src, mm256_shuff_mask_B); + // Do the 'percentage multiplications' with the weights + // with accuracy correction so values like 255 * '255' + // (here effectively 1.0) = 255 and not 254. + // For our greyscale this should mean 255 white stays 255 white + // after greyscaling. mm256_dstA = - _mm256_shuffle_epi8(mm256_rgb_weights, mm256_shuff_mask_A); - mm256_dstB = - _mm256_shuffle_epi8(mm256_rgb_weights, mm256_shuff_mask_B); - - mm256_dstA = _mm256_mullo_epi16(mm256_srcA, mm256_dstA); + _mm256_mullo_epi16(mm256_srcA, mm256_shuffled_weights_A); mm256_dstA = _mm256_add_epi16(mm256_dstA, mm256_two_five_fives); mm256_dstA = _mm256_srli_epi16(mm256_dstA, 8); - mm256_dstB = _mm256_mullo_epi16(mm256_srcB, mm256_dstB); + mm256_dstB = + _mm256_mullo_epi16(mm256_srcB, mm256_shuffled_weights_B); mm256_dstB = _mm256_add_epi16(mm256_dstB, mm256_two_five_fives); mm256_dstB = _mm256_srli_epi16(mm256_dstB, 8); + // Add up weighted R+G+B into the first channel of each of the 8 + // pixels. This is the grey value we want in all our colour + // channels. mm256_dst = _mm256_hadd_epi16(mm256_dstA, mm256_dstB); mm256_dst = _mm256_add_epi16(mm256_dst, _mm256_srli_epi32(mm256_dst, 16)); + // Shuffle the grey value from ther first channel of each pixel + // into every channel of each pixel mm256_dst = _mm256_shuffle_epi8(mm256_dst, mm256_shuff_mask_gray); + // Add the alpha back mm256_dst = _mm256_and_si256(mm256_dst, mm256_rgb_mask); mm256_dst = _mm256_or_si256(mm256_dst, mm256_alpha); @@ -174,15 +188,12 @@ grayscale_avx2(SDL_Surface *src, SDL_Surface *newsurf) mm256_srcB = _mm256_shuffle_epi8(mm256_src, mm256_shuff_mask_B); mm256_dstA = - _mm256_shuffle_epi8(mm256_rgb_weights, mm256_shuff_mask_A); - mm256_dstB = - _mm256_shuffle_epi8(mm256_rgb_weights, mm256_shuff_mask_B); - - mm256_dstA = _mm256_mullo_epi16(mm256_srcA, mm256_dstA); + _mm256_mullo_epi16(mm256_srcA, mm256_shuffled_weights_A); mm256_dstA = _mm256_add_epi16(mm256_dstA, mm256_two_five_fives); mm256_dstA = _mm256_srli_epi16(mm256_dstA, 8); - mm256_dstB = _mm256_mullo_epi16(mm256_srcB, mm256_dstB); + mm256_dstB = + _mm256_mullo_epi16(mm256_srcB, mm256_shuffled_weights_B); mm256_dstB = _mm256_add_epi16(mm256_dstB, mm256_two_five_fives); mm256_dstB = _mm256_srli_epi16(mm256_dstB, 8); diff --git a/src_c/simd_transform_sse2.c b/src_c/simd_transform_sse2.c index 8cf1ffd6c2..35689ac72d 100644 --- a/src_c/simd_transform_sse2.c +++ b/src_c/simd_transform_sse2.c @@ -45,11 +45,11 @@ pg_neon_at_runtime_but_uncompiled() #if defined(ENV64BIT) #define LOAD_64_INTO_M128(num, reg) *reg = _mm_cvtsi64_si128(*num) -#define STORE_M128_INTO_64(reg, num) *num = _mm_cvtsi128_si64(*reg) +#define STORE_M128_INTO_64(reg, num) *num = _mm_cvtsi128_si64(reg) #else #define LOAD_64_INTO_M128(num, reg) \ *reg = _mm_loadl_epi64((const __m128i *)num) -#define STORE_M128_INTO_64(reg, num) _mm_storel_epi64((__m128i *)num, *reg) +#define STORE_M128_INTO_64(reg, num) _mm_storel_epi64((__m128i *)num, reg) #endif void @@ -459,10 +459,8 @@ grayscale_sse2(SDL_Surface *src, SDL_Surface *newsurf) Uint32 *srcp = (Uint32 *)src->pixels; Uint32 *dstp = (Uint32 *)newsurf->pixels; - Uint32 rgbmask = - (src->format->Rmask | src->format->Gmask | src->format->Bmask); - Uint64 rgbmask64 = ((Uint64)rgbmask << 32) | rgbmask; - Uint64 amask64 = ~rgbmask64; + Uint64 amask64 = ((Uint64)src->format->Amask) | src->format->Amask; + Uint64 rgbmask64 = ~amask64; Uint64 rgb_weights = ((Uint64)((0x4C << src->format->Rshift) | @@ -547,7 +545,7 @@ grayscale_sse2(SDL_Surface *src, SDL_Surface *newsurf) mm_dst = _mm_and_si128(mm_dst, mm_rgb_mask); mm_dst = _mm_or_si128(mm_dst, mm_alpha); /*mm_dst = 0x0000000000000000AAGrGrGrGrGrGrAAGrGrGrGrGrGr*/ - STORE_M128_INTO_64(&mm_dst, dstp64); + STORE_M128_INTO_64(mm_dst, dstp64); /*dstp = 0xAARRGGBB*/ srcp64++; dstp64++;