Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Optimized transform.scale2x() #2859

Merged
merged 4 commits into from
Jun 30, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
252 changes: 143 additions & 109 deletions src_c/scale2x.c
Original file line number Diff line number Diff line change
Expand Up @@ -34,13 +34,19 @@
#define MAX(a, b) (((a) > (b)) ? (a) : (b))
#define MIN(a, b) (((a) < (b)) ? (a) : (b))

#define READINT24(x) ((x)[0] << 16 | (x)[1] << 8 | (x)[2])
#define WRITEINT24(x, i) \
{ \
(x)[0] = i >> 16; \
(x)[1] = (i >> 8) & 0xff; \
x[2] = i & 0xff; \
}
/*
 * Assemble one 24-bit pixel value from three consecutive bytes.
 *
 * x[0] supplies bits 16..23, x[1] bits 8..15 and x[2] bits 0..7. Each byte
 * is promoted to int before shifting, so the result is always in the range
 * 0 .. 0xFFFFFF.
 */
static inline int
read_int24(const Uint8 *x)
{
    const int high = x[0] << 16;
    const int mid = x[1] << 8;
    const int low = x[2];

    return high | mid | low;
}

/*
 * Write the low 24 bits of i into three consecutive bytes.
 *
 * The layout mirrors read_int24(): x[0] receives bits 16..23, x[1] bits
 * 8..15 and x[2] bits 0..7. Any bits of i above bit 23 are discarded by the
 * narrowing conversion to Uint8.
 */
static inline void
store_int24(Uint8 *x, int i)
{
    x[2] = (Uint8)(i & 0xff);
    x[1] = (Uint8)((i >> 8) & 0xff);
    /* No mask needed: converting to Uint8 keeps only the low 8 bits. */
    x[0] = (Uint8)(i >> 16);
}

/*
this requires a destination surface already setup to be twice as
Expand All @@ -62,137 +68,165 @@ scale2x(SDL_Surface *src, SDL_Surface *dst)
const int height = src->h;

#if SDL_VERSION_ATLEAST(3, 0, 0)
switch (src->format->bytes_per_pixel) {
const Uint8 Bpp = src->format->bytes_per_pixel;
#else
switch (src->format->BytesPerPixel) {
const Uint8 Bpp = src->format->BytesPerPixel;
#endif

switch (Bpp) {
case 1: {
Uint8 E0, E1, E2, E3, B, D, E, F, H;
for (looph = 0; looph < height; ++looph) {
Uint8 *src_row = srcpix + looph * srcpitch;
Uint8 *dst_row0 = dstpix + looph * 2 * dstpitch;
Uint8 *dst_row1 = dstpix + (looph * 2 + 1) * dstpitch;

Uint8 *src_row_prev = srcpix + MAX(0, looph - 1) * srcpitch;
Uint8 *src_row_next =
srcpix + MIN(height - 1, looph + 1) * srcpitch;

for (loopw = 0; loopw < width; ++loopw) {
B = *(Uint8 *)(srcpix + (MAX(0, looph - 1) * srcpitch) +
(1 * loopw));
D = *(Uint8 *)(srcpix + (looph * srcpitch) +
(1 * MAX(0, loopw - 1)));
E = *(Uint8 *)(srcpix + (looph * srcpitch) + (1 * loopw));
F = *(Uint8 *)(srcpix + (looph * srcpitch) +
(1 * MIN(width - 1, loopw + 1)));
H = *(Uint8 *)(srcpix +
(MIN(height - 1, looph + 1) * srcpitch) +
(1 * loopw));

E0 = D == B && B != F && D != H ? D : E;
E1 = B == F && B != D && F != H ? F : E;
E2 = D == H && D != B && H != F ? D : E;
E3 = H == F && D != H && B != F ? F : E;

*(Uint8 *)(dstpix + looph * 2 * dstpitch + loopw * 2 * 1) =
E0;
*(Uint8 *)(dstpix + looph * 2 * dstpitch +
(loopw * 2 + 1) * 1) = E1;
*(Uint8 *)(dstpix + (looph * 2 + 1) * dstpitch +
loopw * 2 * 1) = E2;
*(Uint8 *)(dstpix + (looph * 2 + 1) * dstpitch +
(loopw * 2 + 1) * 1) = E3;
B = *(Uint8 *)(src_row_prev + loopw);
D = *(Uint8 *)(src_row + MAX(0, loopw - 1));
E = *(Uint8 *)(src_row + loopw);
F = *(Uint8 *)(src_row + MIN(width - 1, loopw + 1));
H = *(Uint8 *)(src_row_next + loopw);

if (B != H && D != F) {
E0 = (D == B) ? D : E;
E1 = (B == F) ? F : E;
E2 = (D == H) ? D : E;
E3 = (H == F) ? F : E;
}
else {
E0 = E;
E1 = E;
E2 = E;
E3 = E;
}

*(Uint8 *)(dst_row0 + loopw * 2) = E0;
*(Uint8 *)(dst_row0 + loopw * 2 + 1) = E1;
*(Uint8 *)(dst_row1 + loopw * 2) = E2;
*(Uint8 *)(dst_row1 + loopw * 2 + 1) = E3;
}
}
break;
}
case 2: {
Uint16 E0, E1, E2, E3, B, D, E, F, H;
for (looph = 0; looph < height; ++looph) {
Uint8 *src_row = srcpix + looph * srcpitch;
Uint8 *dst_row0 = dstpix + looph * 2 * dstpitch;
Uint8 *dst_row1 = dstpix + (looph * 2 + 1) * dstpitch;

Uint8 *src_row_prev = srcpix + MAX(0, looph - 1) * srcpitch;
Uint8 *src_row_next =
srcpix + MIN(height - 1, looph + 1) * srcpitch;
Comment on lines +119 to +125
Copy link
Member

@Starbuck5 Starbuck5 May 28, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So in your PR description you said you had optimized using pointers and using functions instead of macros, but reading this code, it seems like the main optimization is actually some manual loop-invariant code motion.

Very clever! This reminds me of your Rect union PR, also a great way of looking at existing code and making it faster in a straightforward way.

It’s a shame the compiler doesn’t do that automatically in these cases.


for (loopw = 0; loopw < width; ++loopw) {
B = *(Uint16 *)(srcpix + (MAX(0, looph - 1) * srcpitch) +
(2 * loopw));
D = *(Uint16 *)(srcpix + (looph * srcpitch) +
(2 * MAX(0, loopw - 1)));
E = *(Uint16 *)(srcpix + (looph * srcpitch) + (2 * loopw));
F = *(Uint16 *)(srcpix + (looph * srcpitch) +
(2 * MIN(width - 1, loopw + 1)));
H = *(Uint16 *)(srcpix +
(MIN(height - 1, looph + 1) * srcpitch) +
(2 * loopw));

E0 = D == B && B != F && D != H ? D : E;
E1 = B == F && B != D && F != H ? F : E;
E2 = D == H && D != B && H != F ? D : E;
E3 = H == F && D != H && B != F ? F : E;

*(Uint16 *)(dstpix + looph * 2 * dstpitch +
loopw * 2 * 2) = E0;
*(Uint16 *)(dstpix + looph * 2 * dstpitch +
(loopw * 2 + 1) * 2) = E1;
*(Uint16 *)(dstpix + (looph * 2 + 1) * dstpitch +
loopw * 2 * 2) = E2;
*(Uint16 *)(dstpix + (looph * 2 + 1) * dstpitch +
(loopw * 2 + 1) * 2) = E3;
B = *(Uint16 *)(src_row_prev + 2 * loopw);
D = *(Uint16 *)(src_row + 2 * MAX(0, loopw - 1));
E = *(Uint16 *)(src_row + 2 * loopw);
F = *(Uint16 *)(src_row + 2 * MIN(width - 1, loopw + 1));
H = *(Uint16 *)(src_row_next + 2 * loopw);

if (B != H && D != F) {
E0 = (D == B) ? D : E;
E1 = (B == F) ? F : E;
E2 = (D == H) ? D : E;
E3 = (H == F) ? F : E;
}
else {
E0 = E;
E1 = E;
E2 = E;
E3 = E;
}

*(Uint16 *)(dst_row0 + loopw * 2 * 2) = E0;
*(Uint16 *)(dst_row0 + (loopw * 2 + 1) * 2) = E1;
*(Uint16 *)(dst_row1 + loopw * 2 * 2) = E2;
*(Uint16 *)(dst_row1 + (loopw * 2 + 1) * 2) = E3;
}
}
break;
}
case 3: {
int E0, E1, E2, E3, B, D, E, F, H;
for (looph = 0; looph < height; ++looph) {
Uint8 *src_row = srcpix + looph * srcpitch;
Uint8 *dst_row0 = dstpix + looph * 2 * dstpitch;
Uint8 *dst_row1 = dstpix + (looph * 2 + 1) * dstpitch;

Uint8 *src_row_prev = srcpix + MAX(0, looph - 1) * srcpitch;
Uint8 *src_row_next =
srcpix + MIN(height - 1, looph + 1) * srcpitch;

for (loopw = 0; loopw < width; ++loopw) {
B = READINT24(srcpix + (MAX(0, looph - 1) * srcpitch) +
(3 * loopw));
D = READINT24(srcpix + (looph * srcpitch) +
(3 * MAX(0, loopw - 1)));
E = READINT24(srcpix + (looph * srcpitch) + (3 * loopw));
F = READINT24(srcpix + (looph * srcpitch) +
(3 * MIN(width - 1, loopw + 1)));
H = READINT24(srcpix +
(MIN(height - 1, looph + 1) * srcpitch) +
(3 * loopw));

E0 = D == B && B != F && D != H ? D : E;
E1 = B == F && B != D && F != H ? F : E;
E2 = D == H && D != B && H != F ? D : E;
E3 = H == F && D != H && B != F ? F : E;

WRITEINT24((dstpix + looph * 2 * dstpitch + loopw * 2 * 3),
E0);
WRITEINT24(
(dstpix + looph * 2 * dstpitch + (loopw * 2 + 1) * 3),
E1);
WRITEINT24(
(dstpix + (looph * 2 + 1) * dstpitch + loopw * 2 * 3),
E2);
WRITEINT24((dstpix + (looph * 2 + 1) * dstpitch +
(loopw * 2 + 1) * 3),
E3);
B = read_int24(src_row_prev + (3 * loopw));
D = read_int24(src_row + (3 * MAX(0, loopw - 1)));
E = read_int24(src_row + (3 * loopw));
F = read_int24(src_row + (3 * MIN(width - 1, loopw + 1)));
H = read_int24(src_row_next + (3 * loopw));

if (B != H && D != F) {
E0 = (D == B) ? D : E;
E1 = (B == F) ? F : E;
E2 = (D == H) ? D : E;
E3 = (H == F) ? F : E;
}
else {
E0 = E;
E1 = E;
E2 = E;
E3 = E;
}

store_int24(dst_row0 + loopw * 2 * 3, E0);
store_int24(dst_row0 + (loopw * 2 + 1) * 3, E1);
store_int24(dst_row1 + loopw * 2 * 3, E2);
store_int24(dst_row1 + (loopw * 2 + 1) * 3, E3);
}
}
break;
}
default: { /*case 4:*/
default: {
Uint32 E0, E1, E2, E3, B, D, E, F, H;

for (looph = 0; looph < height; ++looph) {
Uint8 *src_row = srcpix + looph * srcpitch;
Uint8 *dst_row0 = dstpix + looph * 2 * dstpitch;
Uint8 *dst_row1 = dstpix + (looph * 2 + 1) * dstpitch;

Uint8 *src_row_prev = srcpix + MAX(0, looph - 1) * srcpitch;
Uint8 *src_row_next =
srcpix + MIN(height - 1, looph + 1) * srcpitch;

for (loopw = 0; loopw < width; ++loopw) {
B = *(Uint32 *)(srcpix + (MAX(0, looph - 1) * srcpitch) +
(4 * loopw));
D = *(Uint32 *)(srcpix + (looph * srcpitch) +
(4 * MAX(0, loopw - 1)));
E = *(Uint32 *)(srcpix + (looph * srcpitch) + (4 * loopw));
F = *(Uint32 *)(srcpix + (looph * srcpitch) +
(4 * MIN(width - 1, loopw + 1)));
H = *(Uint32 *)(srcpix +
(MIN(height - 1, looph + 1) * srcpitch) +
(4 * loopw));

E0 = D == B && B != F && D != H ? D : E;
E1 = B == F && B != D && F != H ? F : E;
E2 = D == H && D != B && H != F ? D : E;
E3 = H == F && D != H && B != F ? F : E;

*(Uint32 *)(dstpix + looph * 2 * dstpitch +
loopw * 2 * 4) = E0;
*(Uint32 *)(dstpix + looph * 2 * dstpitch +
(loopw * 2 + 1) * 4) = E1;
*(Uint32 *)(dstpix + (looph * 2 + 1) * dstpitch +
loopw * 2 * 4) = E2;
*(Uint32 *)(dstpix + (looph * 2 + 1) * dstpitch +
(loopw * 2 + 1) * 4) = E3;
B = *(Uint32 *)(src_row_prev + 4 * loopw);
D = *(Uint32 *)(src_row + 4 * MAX(0, loopw - 1));
E = *(Uint32 *)(src_row + 4 * loopw);
F = *(Uint32 *)(src_row + 4 * MIN(width - 1, loopw + 1));
H = *(Uint32 *)(src_row_next + 4 * loopw);

if (B != H && D != F) {
E0 = (D == B) ? D : E;
E1 = (B == F) ? F : E;
E2 = (D == H) ? D : E;
E3 = (H == F) ? F : E;
}
else {
E0 = E;
E1 = E;
E2 = E;
E3 = E;
}

*(Uint32 *)(dst_row0 + loopw * 2 * 4) = E0;
*(Uint32 *)(dst_row0 + (loopw * 2 + 1) * 4) = E1;
*(Uint32 *)(dst_row1 + loopw * 2 * 4) = E2;
*(Uint32 *)(dst_row1 + (loopw * 2 + 1) * 4) = E3;
}
}
break;
Expand Down
Loading