Skip to content

Commit

Permalink
optimizes the BLEND_ADD flag when used in surface.fill
Browse files Browse the repository at this point in the history
  • Loading branch information
itzpr3d4t0r committed Aug 6, 2023
1 parent 4472ac6 commit 5d7f47c
Show file tree
Hide file tree
Showing 6 changed files with 185 additions and 4 deletions.
2 changes: 1 addition & 1 deletion buildconfig/Setup.Android.SDL2.in
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ key src_c/key.c $(SDL) $(DEBUG)
mouse src_c/mouse.c $(SDL) $(DEBUG)
rect src_c/rect.c src_c/pgcompat_rect.c $(SDL) $(DEBUG)
rwobject src_c/rwobject.c $(SDL) $(DEBUG)
surface src_c/simd_blitters_sse2.c src_c/simd_blitters_avx2.c src_c/surface.c src_c/alphablit.c src_c/surface_fill.c $(SDL) $(DEBUG)
surface src_c/simd_blitters_sse2.c src_c/simd_blitters_avx2.c src_c/surface.c src_c/alphablit.c src_c/surface_fill.c src_c/simd_surface_fill_avx2.c $(SDL) $(DEBUG)
surflock src_c/surflock.c $(SDL) $(DEBUG)
time src_c/time.c $(SDL) $(DEBUG)
joystick src_c/joystick.c $(SDL) $(DEBUG)
Expand Down
2 changes: 1 addition & 1 deletion buildconfig/Setup.SDL2.in
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ key src_c/key.c $(SDL) $(DEBUG)
mouse src_c/mouse.c $(SDL) $(DEBUG)
rect src_c/rect.c src_c/pgcompat_rect.c $(SDL) $(DEBUG)
rwobject src_c/rwobject.c $(SDL) $(DEBUG)
surface src_c/simd_blitters_sse2.c src_c/simd_blitters_avx2.c src_c/surface.c src_c/alphablit.c src_c/surface_fill.c $(SDL) $(DEBUG)
surface src_c/simd_blitters_sse2.c src_c/simd_blitters_avx2.c src_c/surface.c src_c/alphablit.c src_c/surface_fill.c src_c/simd_surface_fill_avx2.c $(SDL) $(DEBUG)
surflock src_c/surflock.c $(SDL) $(DEBUG)
time src_c/time.c $(SDL) $(DEBUG)
joystick src_c/joystick.c $(SDL) $(DEBUG)
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@

import distutils.ccompiler

avx2_filenames = ['simd_blitters_avx2']
avx2_filenames = ['simd_blitters_avx2', 'simd_surface_fill_avx2']

compiler_options = {
'unix': ('-mavx2',),
Expand Down
17 changes: 17 additions & 0 deletions src_c/simd_fill.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
#define NO_PYGAME_C_API

#ifndef SIMD_FILL_H
#define SIMD_FILL_H
#include "_surface.h"

// AVX2 functions
int
_pg_has_avx2();

int
surface_fill_blend_add_avx2(SDL_Surface *surface, SDL_Rect *rect,
Uint32 color);
int
surface_fill_blend_rgba_add_avx2(SDL_Surface *surface, SDL_Rect *rect,
Uint32 color);
#endif // SIMD_FILL_H
142 changes: 142 additions & 0 deletions src_c/simd_surface_fill_avx2.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
#include "_surface.h"

#if defined(HAVE_IMMINTRIN_H) && !defined(SDL_DISABLE_IMMINTRIN_H)
#include <immintrin.h>
#endif /* defined(HAVE_IMMINTRIN_H) && !defined(SDL_DISABLE_IMMINTRIN_H) */

#define BAD_AVX2_FUNCTION_CALL \
printf( \
"Fatal Error: Attempted calling an AVX2 function when both compile " \
"time and runtime support is missing. If you are seeing this " \
"message, you have stumbled across a pygame bug, please report it " \
"to the devs!"); \
PG_EXIT(1)

/* helper function that does a runtime check for AVX2. It has the added
* functionality of also returning 0 if compile time support is missing */
int
_pg_has_avx2()
{
#if defined(__AVX2__) && defined(HAVE_IMMINTRIN_H) && \
!defined(SDL_DISABLE_IMMINTRIN_H)
return SDL_HasAVX2();
#else
return 0;
#endif /* defined(__AVX2__) && defined(HAVE_IMMINTRIN_H) && \
!defined(SDL_DISABLE_IMMINTRIN_H) */
}

#define SETUP_AVX2_FILLER \
/* initialize surface data */ \
int width = rect->w, height = rect->h; \
int skip = \
(surface->pitch - width * surface->format->BytesPerPixel) >> 2; \
int pxl_skip = surface->format->BytesPerPixel >> 2; \
int pre_8_width = width % 8; \
int post_8_width = (width - pre_8_width) / 8; \
\
/* load pixel data */ \
Uint32 *pixels = (Uint32 *)surface->pixels + \
(Uint64)rect->y * (surface->pitch >> 2) + \
(Uint64)rect->x * pxl_skip; \
\
__m256i *mm256_pixels = (__m256i *)pixels; \
\
/* load color data */ \
__m256i mm256_color = _mm256_set1_epi32(color); \
__m128i mm_color = _mm_cvtsi32_si128(color); \
\
__m128i mm_src; \
__m256i mm256_src; \
int n;

#define RUN_AVX2_FILLER(CODE_1, CODE_8) \
while (height--) { \
if (pre_8_width > 0) { \
LOOP_UNROLLED4( \
{ \
/* load 1 pixel */ \
mm_src = _mm_cvtsi32_si128(*pixels); \
\
CODE_1 \
\
/* store 1 pixel */ \
*pixels = _mm_cvtsi128_si32(mm_src); \
\
pixels += pxl_skip; \
}, \
n, pre_8_width); \
} \
mm256_pixels = (__m256i *)pixels; \
if (post_8_width > 0) { \
LOOP_UNROLLED4( \
{ \
/* load 8 pixels */ \
mm256_src = _mm256_loadu_si256(mm256_pixels); \
\
CODE_8 \
\
/* store 8 pixels */ \
_mm256_storeu_si256(mm256_pixels, mm256_src); \
\
mm256_pixels++; \
}, \
n, post_8_width); \
} \
\
pixels = (Uint32 *)mm256_pixels + skip; \
}

/* BLEND_ADD */
#if defined(__AVX2__) && defined(HAVE_IMMINTRIN_H) && \
!defined(SDL_DISABLE_IMMINTRIN_H)
int
surface_fill_blend_add_avx2(SDL_Surface *surface, SDL_Rect *rect, Uint32 color)
{
SETUP_AVX2_FILLER
Uint32 amask = surface->format->Amask;

if (amask) {
/* if Amask is set, subtract the alpha value from the color*/
mm256_color = _mm256_subs_epu8(mm256_color, _mm256_set1_epi32(amask));
mm_color = _mm_subs_epu8(mm_color, _mm_cvtsi32_si128(amask));
}

RUN_AVX2_FILLER({ mm_src = _mm_adds_epu8(mm_src, mm_color); },
{ mm256_src = _mm256_adds_epu8(mm256_src, mm256_color); });

return 0;
}
#else
int
surface_fill_blend_add_avx2(SDL_Surface *surface, SDL_Rect *rect, Uint32 color)
{
BAD_AVX2_FUNCTION_CALL;
return -1;
}
#endif /* defined(__AVX2__) && defined(HAVE_IMMINTRIN_H) && \
!defined(SDL_DISABLE_IMMINTRIN_H) */

#if defined(__AVX2__) && defined(HAVE_IMMINTRIN_H) && \
!defined(SDL_DISABLE_IMMINTRIN_H)
int
surface_fill_blend_rgba_add_avx2(SDL_Surface *surface, SDL_Rect *rect,
Uint32 color)
{
SETUP_AVX2_FILLER

RUN_AVX2_FILLER({ mm_src = _mm_adds_epu8(mm_src, mm_color); },
{ mm256_src = _mm256_adds_epu8(mm256_src, mm256_color); });

return 0;
}
#else
int
surface_fill_blend_rgba_add_avx2(SDL_Surface *surface, SDL_Rect *rect,
Uint32 color)
{
BAD_AVX2_FUNCTION_CALL;
return -1;
}
#endif /* defined(__AVX2__) && defined(HAVE_IMMINTRIN_H) && \
!defined(SDL_DISABLE_IMMINTRIN_H) */
24 changes: 23 additions & 1 deletion src_c/surface_fill.c
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,8 @@
*/

#define NO_PYGAME_C_API
#include "_surface.h"

#include "simd_fill.h"

/*
* Changes SDL_Rect to respect any clipping rect defined on the surface.
Expand Down Expand Up @@ -866,6 +867,16 @@ surface_fill_blend(SDL_Surface *surface, SDL_Rect *rect, Uint32 color,

switch (blendargs) {
case PYGAME_BLEND_ADD: {
#if !defined(__EMSCRIPTEN__)
#if SDL_BYTEORDER == SDL_LIL_ENDIAN
if (surface->format->BytesPerPixel == 4 &&
(surface->pitch % surface->format->BytesPerPixel == 0) &&
_pg_has_avx2()) {
result = surface_fill_blend_add_avx2(surface, rect, color);
break;
}
#endif /* SDL_BYTEORDER == SDL_LIL_ENDIAN */
#endif /* __EMSCRIPTEN__ */
result = surface_fill_blend_add(surface, rect, color);
break;
}
Expand All @@ -887,6 +898,17 @@ surface_fill_blend(SDL_Surface *surface, SDL_Rect *rect, Uint32 color,
}

case PYGAME_BLEND_RGBA_ADD: {
#if !defined(__EMSCRIPTEN__)
#if SDL_BYTEORDER == SDL_LIL_ENDIAN
if (surface->format->BytesPerPixel == 4 &&
(surface->pitch % surface->format->BytesPerPixel == 0) &&
_pg_has_avx2()) {
result =
surface_fill_blend_rgba_add_avx2(surface, rect, color);
break;
}
#endif /* SDL_BYTEORDER == SDL_LIL_ENDIAN */
#endif /* __EMSCRIPTEN__ */
result = surface_fill_blend_rgba_add(surface, rect, color);
break;
}
Expand Down

0 comments on commit 5d7f47c

Please sign in to comment.