diff --git a/buildconfig/Setup.Android.SDL2.in b/buildconfig/Setup.Android.SDL2.in index 2103c45763..6be0ddd5f2 100644 --- a/buildconfig/Setup.Android.SDL2.in +++ b/buildconfig/Setup.Android.SDL2.in @@ -51,7 +51,7 @@ key src_c/key.c $(SDL) $(DEBUG) mouse src_c/mouse.c $(SDL) $(DEBUG) rect src_c/rect.c src_c/pgcompat_rect.c $(SDL) $(DEBUG) rwobject src_c/rwobject.c $(SDL) $(DEBUG) -surface src_c/simd_blitters_sse2.c src_c/simd_blitters_avx2.c src_c/surface.c src_c/alphablit.c src_c/surface_fill.c $(SDL) $(DEBUG) +surface src_c/simd_blitters_sse2.c src_c/simd_blitters_avx2.c src_c/surface.c src_c/alphablit.c src_c/surface_fill.c src_c/simd_surface_fill_avx2.c $(SDL) $(DEBUG) surflock src_c/surflock.c $(SDL) $(DEBUG) time src_c/time.c $(SDL) $(DEBUG) joystick src_c/joystick.c $(SDL) $(DEBUG) diff --git a/buildconfig/Setup.SDL2.in b/buildconfig/Setup.SDL2.in index 36e4b9ac03..c5e75c5369 100644 --- a/buildconfig/Setup.SDL2.in +++ b/buildconfig/Setup.SDL2.in @@ -61,7 +61,7 @@ key src_c/key.c $(SDL) $(DEBUG) mouse src_c/mouse.c $(SDL) $(DEBUG) rect src_c/rect.c src_c/pgcompat_rect.c $(SDL) $(DEBUG) rwobject src_c/rwobject.c $(SDL) $(DEBUG) -surface src_c/simd_blitters_sse2.c src_c/simd_blitters_avx2.c src_c/surface.c src_c/alphablit.c src_c/surface_fill.c $(SDL) $(DEBUG) +surface src_c/simd_blitters_sse2.c src_c/simd_blitters_avx2.c src_c/surface.c src_c/alphablit.c src_c/surface_fill.c src_c/simd_surface_fill_avx2.c $(SDL) $(DEBUG) surflock src_c/surflock.c $(SDL) $(DEBUG) time src_c/time.c $(SDL) $(DEBUG) joystick src_c/joystick.c $(SDL) $(DEBUG) diff --git a/setup.py b/setup.py index ad0cdec4f6..6df657a575 100644 --- a/setup.py +++ b/setup.py @@ -74,7 +74,7 @@ import distutils.ccompiler -avx2_filenames = ['simd_blitters_avx2'] +avx2_filenames = ['simd_blitters_avx2', 'simd_surface_fill_avx2'] compiler_options = { 'unix': ('-mavx2',), diff --git a/src_c/simd_fill.h b/src_c/simd_fill.h new file mode 100644 index 0000000000..5f31774f42 --- /dev/null +++ b/src_c/simd_fill.h @@ -0,0 +1,17 @@ +#define NO_PYGAME_C_API + +#ifndef SIMD_FILL_H +#define SIMD_FILL_H +#include "_surface.h" + +// AVX2 functions +int +_pg_has_avx2(); + +int +surface_fill_blend_add_avx2(SDL_Surface *surface, SDL_Rect *rect, + Uint32 color); +int +surface_fill_blend_rgba_add_avx2(SDL_Surface *surface, SDL_Rect *rect, + Uint32 color); +#endif // SIMD_FILL_H diff --git a/src_c/simd_surface_fill_avx2.c b/src_c/simd_surface_fill_avx2.c new file mode 100644 index 0000000000..f6bc162188 --- /dev/null +++ b/src_c/simd_surface_fill_avx2.c @@ -0,0 +1,142 @@ +#include "_surface.h" + +#if defined(HAVE_IMMINTRIN_H) && !defined(SDL_DISABLE_IMMINTRIN_H) +#include +#endif /* defined(HAVE_IMMINTRIN_H) && !defined(SDL_DISABLE_IMMINTRIN_H) */ + +#define BAD_AVX2_FUNCTION_CALL \ + printf( \ + "Fatal Error: Attempted calling an AVX2 function when both compile " \ + "time and runtime support is missing. If you are seeing this " \ + "message, you have stumbled across a pygame bug, please report it " \ + "to the devs!"); \ + PG_EXIT(1) + +/* helper function that does a runtime check for AVX2. It has the added + * functionality of also returning 0 if compile time support is missing */ +int +_pg_has_avx2() +{ +#if defined(__AVX2__) && defined(HAVE_IMMINTRIN_H) && \ + !defined(SDL_DISABLE_IMMINTRIN_H) + return SDL_HasAVX2(); +#else + return 0; +#endif /* defined(__AVX2__) && defined(HAVE_IMMINTRIN_H) && \ + !defined(SDL_DISABLE_IMMINTRIN_H) */ +} + +#define SETUP_AVX2_FILLER \ + /* initialize surface data */ \ + int width = rect->w, height = rect->h; \ + int skip = \ + (surface->pitch - width * surface->format->BytesPerPixel) >> 2; \ + int pxl_skip = surface->format->BytesPerPixel >> 2; \ + int pre_8_width = width % 8; \ + int post_8_width = (width - pre_8_width) / 8; \ + \ + /* load pixel data */ \ + Uint32 *pixels = (Uint32 *)surface->pixels + \ + (Uint64)rect->y * (surface->pitch >> 2) + \ + (Uint64)rect->x * pxl_skip; \ + \ + __m256i *mm256_pixels = (__m256i *)pixels; \ + \ + /* load color data */ \ + __m256i mm256_color = _mm256_set1_epi32(color); \ + __m128i mm_color = _mm_cvtsi32_si128(color); \ + \ + __m128i mm_src; \ + __m256i mm256_src; \ + int n; + +#define RUN_AVX2_FILLER(CODE_1, CODE_8) \ + while (height--) { \ + if (pre_8_width > 0) { \ + LOOP_UNROLLED4( \ + { \ + /* load 1 pixel */ \ + mm_src = _mm_cvtsi32_si128(*pixels); \ + \ + CODE_1 \ + \ + /* store 1 pixel */ \ + *pixels = _mm_cvtsi128_si32(mm_src); \ + \ + pixels += pxl_skip; \ + }, \ + n, pre_8_width); \ + } \ + mm256_pixels = (__m256i *)pixels; \ + if (post_8_width > 0) { \ + LOOP_UNROLLED4( \ + { \ + /* load 8 pixels */ \ + mm256_src = _mm256_loadu_si256(mm256_pixels); \ + \ + CODE_8 \ + \ + /* store 8 pixels */ \ + _mm256_storeu_si256(mm256_pixels, mm256_src); \ + \ + mm256_pixels++; \ + }, \ + n, post_8_width); \ + } \ + \ + pixels = (Uint32 *)mm256_pixels + skip; \ + } + +/* BLEND_ADD */ +#if defined(__AVX2__) && defined(HAVE_IMMINTRIN_H) && \ + !defined(SDL_DISABLE_IMMINTRIN_H) +int +surface_fill_blend_add_avx2(SDL_Surface *surface, SDL_Rect *rect, Uint32 color) +{ + SETUP_AVX2_FILLER + Uint32 amask = surface->format->Amask; + + if (amask) { + /* if Amask is set, subtract the alpha value from the color*/ + mm256_color = _mm256_subs_epu8(mm256_color, _mm256_set1_epi32(amask)); + mm_color = _mm_subs_epu8(mm_color, _mm_cvtsi32_si128(amask)); + } + + RUN_AVX2_FILLER({ mm_src = _mm_adds_epu8(mm_src, mm_color); }, + { mm256_src = _mm256_adds_epu8(mm256_src, mm256_color); }); + + return 0; +} +#else +int +surface_fill_blend_add_avx2(SDL_Surface *surface, SDL_Rect *rect, Uint32 color) +{ + BAD_AVX2_FUNCTION_CALL; + return -1; +} +#endif /* defined(__AVX2__) && defined(HAVE_IMMINTRIN_H) && \ + !defined(SDL_DISABLE_IMMINTRIN_H) */ + +#if defined(__AVX2__) && defined(HAVE_IMMINTRIN_H) && \ + !defined(SDL_DISABLE_IMMINTRIN_H) +int +surface_fill_blend_rgba_add_avx2(SDL_Surface *surface, SDL_Rect *rect, + Uint32 color) +{ + SETUP_AVX2_FILLER + + RUN_AVX2_FILLER({ mm_src = _mm_adds_epu8(mm_src, mm_color); }, + { mm256_src = _mm256_adds_epu8(mm256_src, mm256_color); }); + + return 0; +} +#else +int +surface_fill_blend_rgba_add_avx2(SDL_Surface *surface, SDL_Rect *rect, + Uint32 color) +{ + BAD_AVX2_FUNCTION_CALL; + return -1; +} +#endif /* defined(__AVX2__) && defined(HAVE_IMMINTRIN_H) && \ + !defined(SDL_DISABLE_IMMINTRIN_H) */ diff --git a/src_c/surface_fill.c b/src_c/surface_fill.c index 7ee14c82ad..e48d031893 100644 --- a/src_c/surface_fill.c +++ b/src_c/surface_fill.c @@ -18,7 +18,8 @@ */ #define NO_PYGAME_C_API -#include "_surface.h" + +#include "simd_fill.h" /* * Changes SDL_Rect to respect any clipping rect defined on the surface. @@ -866,6 +867,16 @@ surface_fill_blend(SDL_Surface *surface, SDL_Rect *rect, Uint32 color, switch (blendargs) { case PYGAME_BLEND_ADD: { +#if !defined(__EMSCRIPTEN__) +#if SDL_BYTEORDER == SDL_LIL_ENDIAN + if (surface->format->BytesPerPixel == 4 && + (surface->pitch % surface->format->BytesPerPixel == 0) && + _pg_has_avx2()) { + result = surface_fill_blend_add_avx2(surface, rect, color); + break; + } +#endif /* SDL_BYTEORDER == SDL_LIL_ENDIAN */ +#endif /* __EMSCRIPTEN__ */ result = surface_fill_blend_add(surface, rect, color); break; } @@ -887,6 +898,17 @@ surface_fill_blend(SDL_Surface *surface, SDL_Rect *rect, Uint32 color, } case PYGAME_BLEND_RGBA_ADD: { +#if !defined(__EMSCRIPTEN__) +#if SDL_BYTEORDER == SDL_LIL_ENDIAN + if (surface->format->BytesPerPixel == 4 && + (surface->pitch % surface->format->BytesPerPixel == 0) && + _pg_has_avx2()) { + result = + surface_fill_blend_rgba_add_avx2(surface, rect, color); + break; + } +#endif /* SDL_BYTEORDER == SDL_LIL_ENDIAN */ +#endif /* __EMSCRIPTEN__ */ result = surface_fill_blend_rgba_add(surface, rect, color); break; }