Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Optimize sprite.c #3175

Merged
merged 3 commits into from
Aug 3, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
244 changes: 95 additions & 149 deletions gflib/sprite.c
Original file line number Diff line number Diff line change
Expand Up @@ -48,11 +48,7 @@ struct OamDimensions
s8 height;
};

static void UpdateOamCoords(void);
static void BuildSpritePriorities(void);
static void SortSprites(void);
static void CopyMatricesToOamBuffer(void);
static void AddSpritesToOamBuffer(void);
static void SortSprites(u32 *spritePriorities, s32 n);
static u8 CreateSpriteAt(u8 index, const struct SpriteTemplate *template, s16 x, s16 y, u8 subpriority);
static void ResetOamMatrices(void);
static void ResetSprite(struct Sprite *sprite);
Expand Down Expand Up @@ -280,12 +276,12 @@ u32 gOamMatrixAllocBitmap;
u8 gReservedSpritePaletteCount;

EWRAM_DATA struct Sprite gSprites[MAX_SPRITES + 1] = {0};
EWRAM_DATA static u16 sSpritePriorities[MAX_SPRITES] = {0};
EWRAM_DATA static u8 sSpriteOrder[MAX_SPRITES] = {0};
EWRAM_DATA static bool8 sShouldProcessSpriteCopyRequests = 0;
EWRAM_DATA static u8 sSpriteCopyRequestCount = 0;
EWRAM_DATA static struct SpriteCopyRequest sSpriteCopyRequests[MAX_SPRITES] = {0};
EWRAM_DATA u8 gOamLimit = 0;
static EWRAM_DATA u8 gOamDummyIndex = 0;
EWRAM_DATA u16 gReservedSpriteTileCount = 0;
EWRAM_DATA static u8 sSpriteTileAllocBitmap[128] = {0};
EWRAM_DATA s16 gSpriteCoordOffsetX = 0;
Expand All @@ -296,6 +292,7 @@ EWRAM_DATA bool8 gAffineAnimsDisabled = FALSE;
void ResetSpriteData(void)
{
ResetOamRange(0, 128);
gOamDummyIndex = 0;
ResetAllSprites();
ClearSpriteCopyRequests();
ResetAffineAnimData();
Expand Down Expand Up @@ -326,179 +323,128 @@ void AnimateSprites(void)

void BuildOamBuffer(void)
{
u8 temp;
UpdateOamCoords();
BuildSpritePriorities();
SortSprites();
temp = gMain.oamLoadDisabled;
gMain.oamLoadDisabled = TRUE;
AddSpritesToOamBuffer();
CopyMatricesToOamBuffer();
gMain.oamLoadDisabled = temp;
sShouldProcessSpriteCopyRequests = TRUE;
}
bool32 oamLoadDisabled;
u32 i, stride;
u8 oamIndex;

// All attributes which affect sorting packed into a single u32:
// { priority:2, subpriority:8, y:9, :5, index:8 }.
// Index has its own byte even though it only needs 6 bits so that
// we can load it with a ldrb instead of having to mask out the
// bottom 6 bits.
u32 spritePriorities[MAX_SPRITES];
s32 toSort = 0;
u8 skippedSprites[MAX_SPRITES];
u32 skippedSpritesN = 0;
u32 matrices = 0;
mrgriffin marked this conversation as resolved.
Show resolved Hide resolved

void UpdateOamCoords(void)
{
u8 i;
for (i = 0; i < MAX_SPRITES; i++)
{
struct Sprite *sprite = &gSprites[i];
if (sprite->inUse && !sprite->invisible)
// Reuse existing sSpriteOrder because we expect the order to be
// relatively stable between frames.
u32 index = sSpriteOrder[i];
struct Sprite *sprite = &gSprites[index];
s32 y;
if (!sprite->inUse || sprite->invisible)
{
if (sprite->coordOffsetEnabled)
{
sprite->oam.x = sprite->x + sprite->x2 + sprite->centerToCornerVecX + gSpriteCoordOffsetX;
sprite->oam.y = sprite->y + sprite->y2 + sprite->centerToCornerVecY + gSpriteCoordOffsetY;
}
else
{
sprite->oam.x = sprite->x + sprite->x2 + sprite->centerToCornerVecX;
sprite->oam.y = sprite->y + sprite->y2 + sprite->centerToCornerVecY;
}
skippedSprites[skippedSpritesN++] = index;
continue;
}
}
}

void BuildSpritePriorities(void)
{
u16 i;
for (i = 0; i < MAX_SPRITES; i++)
{
struct Sprite *sprite = &gSprites[i];
u16 priority = sprite->subpriority | (sprite->oam.priority << 8);
sSpritePriorities[i] = priority;
}
}
if (sprite->oam.affineMode & ST_OAM_AFFINE_ON_MASK)
matrices |= 1 << sprite->oam.matrixNum;

void SortSprites(void)
{
u8 i;
for (i = 1; i < MAX_SPRITES; i++)
{
u8 j = i;
struct Sprite *sprite1 = &gSprites[sSpriteOrder[i - 1]];
struct Sprite *sprite2 = &gSprites[sSpriteOrder[i]];
u16 sprite1Priority = sSpritePriorities[sSpriteOrder[i - 1]];
u16 sprite2Priority = sSpritePriorities[sSpriteOrder[i]];
s16 sprite1Y = sprite1->oam.y;
s16 sprite2Y = sprite2->oam.y;

if (sprite1Y >= DISPLAY_HEIGHT)
sprite1Y = sprite1Y - 256;

if (sprite2Y >= DISPLAY_HEIGHT)
sprite2Y = sprite2Y - 256;

if (sprite1->oam.affineMode == ST_OAM_AFFINE_DOUBLE
&& sprite1->oam.size == ST_OAM_SIZE_3)
if (sprite->coordOffsetEnabled)
{
u32 shape = sprite1->oam.shape;
if (shape == ST_OAM_SQUARE || shape == ST_OAM_V_RECTANGLE)
{
if (sprite1Y > 128)
sprite1Y = sprite1Y - 256;
}
sprite->oam.x = sprite->x + sprite->x2 + sprite->centerToCornerVecX + gSpriteCoordOffsetX;
sprite->oam.y = sprite->y + sprite->y2 + sprite->centerToCornerVecY + gSpriteCoordOffsetY;
}
else
{
sprite->oam.x = sprite->x + sprite->x2 + sprite->centerToCornerVecX;
sprite->oam.y = sprite->y + sprite->y2 + sprite->centerToCornerVecY;
}

if (sprite2->oam.affineMode == ST_OAM_AFFINE_DOUBLE
&& sprite2->oam.size == ST_OAM_SIZE_3)
y = sprite->oam.y;
if (y >= DISPLAY_HEIGHT)
{
y -= 256;
}
else if (sprite->oam.affineMode == ST_OAM_AFFINE_DOUBLE
&& sprite->oam.size == ST_OAM_SIZE_3)
{
u32 shape = sprite2->oam.shape;
u32 shape = sprite->oam.shape;
if (shape == ST_OAM_SQUARE || shape == ST_OAM_V_RECTANGLE)
{
if (sprite2Y > 128)
sprite2Y = sprite2Y - 256;
if (y > 128)
y -= 256;
}
}

while (j > 0
&& ((sprite1Priority > sprite2Priority)
|| (sprite1Priority == sprite2Priority && sprite1Y < sprite2Y)))
{
u8 temp = sSpriteOrder[j];
sSpriteOrder[j] = sSpriteOrder[j - 1];
sSpriteOrder[j - 1] = temp;

// UB: If j equals 1, then j-- makes j equal 0.
// Then, sSpriteOrder[-1] gets accessed below.
// Although this doesn't result in a bug in the ROM,
// the behavior is undefined.
j--;
#ifdef UBFIX
if (j == 0)
break;
#endif

sprite1 = &gSprites[sSpriteOrder[j - 1]];
sprite2 = &gSprites[sSpriteOrder[j]];
sprite1Priority = sSpritePriorities[sSpriteOrder[j - 1]];
sprite2Priority = sSpritePriorities[sSpriteOrder[j]];
sprite1Y = sprite1->oam.y;
sprite2Y = sprite2->oam.y;
// y in [-128...159], so (159 - y) in [0..287].
spritePriorities[toSort++]
= (sprite->oam.priority << 30)
| (sprite->subpriority << 22)
| (((159 - y) & 0x1FF) << 13)
| (index << 0);
}

if (sprite1Y >= DISPLAY_HEIGHT)
sprite1Y = sprite1Y - 256;
SortSprites(spritePriorities, toSort);

if (sprite2Y >= DISPLAY_HEIGHT)
sprite2Y = sprite2Y - 256;
for (i = 0; i < toSort; i++)
sSpriteOrder[i] = spritePriorities[i] & 0xFF;
for (i = 0; i < skippedSpritesN; i++)
sSpriteOrder[toSort + i] = skippedSprites[i];

if (sprite1->oam.affineMode == ST_OAM_AFFINE_DOUBLE
&& sprite1->oam.size == ST_OAM_SIZE_3)
{
u32 shape = sprite1->oam.shape;
if (shape == ST_OAM_SQUARE || shape == ST_OAM_V_RECTANGLE)
{
if (sprite1Y > 128)
sprite1Y = sprite1Y - 256;
}
}
oamLoadDisabled = gMain.oamLoadDisabled;
gMain.oamLoadDisabled = TRUE;

if (sprite2->oam.affineMode == ST_OAM_AFFINE_DOUBLE
&& sprite2->oam.size == ST_OAM_SIZE_3)
{
u32 shape = sprite2->oam.shape;
if (shape == ST_OAM_SQUARE || shape == ST_OAM_V_RECTANGLE)
{
if (sprite2Y > 128)
sprite2Y = sprite2Y - 256;
}
}
}
for (i = 0, oamIndex = 0; i < toSort; i++)
{
if (AddSpriteToOamBuffer(&gSprites[spritePriorities[i] & 0xFF], &oamIndex))
break;
}
}

void CopyMatricesToOamBuffer(void)
{
u8 i;
for (i = 0; i < OAM_MATRIX_COUNT; i++)
for (i = oamIndex; i < gOamDummyIndex; i++)
gMain.oamBuffer[i] = gDummyOamData;
gOamDummyIndex = oamIndex;

for (i = 0; matrices != 0; i++, matrices >>= 1)
{
u32 base = 4 * i;
gMain.oamBuffer[base + 0].affineParam = gOamMatrices[i].a;
gMain.oamBuffer[base + 1].affineParam = gOamMatrices[i].b;
gMain.oamBuffer[base + 2].affineParam = gOamMatrices[i].c;
gMain.oamBuffer[base + 3].affineParam = gOamMatrices[i].d;
if (matrices & 1)
{
u32 base = 4 * i;
gMain.oamBuffer[base + 0].affineParam = gOamMatrices[i].a;
gMain.oamBuffer[base + 1].affineParam = gOamMatrices[i].b;
gMain.oamBuffer[base + 2].affineParam = gOamMatrices[i].c;
gMain.oamBuffer[base + 3].affineParam = gOamMatrices[i].d;
}
}

gMain.oamLoadDisabled = oamLoadDisabled;
sShouldProcessSpriteCopyRequests = TRUE;
}

void AddSpritesToOamBuffer(void)
static inline void InsertionSort(u32 *spritePriorities, s32 n)
{
u8 i = 0;
u8 oamIndex = 0;

while (i < MAX_SPRITES)
s32 i = 1;
while (i < n)
{
struct Sprite *sprite = &gSprites[sSpriteOrder[i]];
if (sprite->inUse && !sprite->invisible && AddSpriteToOamBuffer(sprite, &oamIndex))
return;
u32 x = spritePriorities[i];
s32 j = i - 1;
while (j >= 0 && spritePriorities[j] > x)
{
spritePriorities[j + 1] = spritePriorities[j];
j--;
}
spritePriorities[j + 1] = x;
i++;
}
}

while (oamIndex < gOamLimit)
{
gMain.oamBuffer[oamIndex] = gDummyOamData;
oamIndex++;
}
static void SortSprites(u32 *spritePriorities, s32 n)
{
InsertionSort(spritePriorities, n);
}

u8 CreateSprite(const struct SpriteTemplate *template, s16 x, s16 y, u8 subpriority)
Expand Down Expand Up @@ -849,7 +795,7 @@ void CopyToSprites(u8 *src)

void ResetAllSprites(void)
{
u8 i;
u32 i;

for (i = 0; i < MAX_SPRITES; i++)
{
Expand Down
1 change: 1 addition & 0 deletions include/gba/io_reg.h
Original file line number Diff line number Diff line change
Expand Up @@ -664,6 +664,7 @@
#define TIMER_64CLK 0x01
#define TIMER_256CLK 0x02
#define TIMER_1024CLK 0x03
#define TIMER_COUNTUP 0x04
#define TIMER_INTR_ENABLE 0x40
#define TIMER_ENABLE 0x80

Expand Down
57 changes: 57 additions & 0 deletions test/random.c
Original file line number Diff line number Diff line change
Expand Up @@ -151,3 +151,60 @@ TEST("RandomElement generates a uniform distribution")

EXPECT_LT(error, UQ_4_12(0.025));
}

TEST("RandomUniform mul-based faster than mod-based (compile-time)")
{
u32 i;
struct Benchmark mulBenchmark, modBenchmark;
u32 mulSum = 0, modSum = 0;

BENCHMARK(&mulBenchmark)
{
mulSum += RandomUniformDefault(RNG_NONE, 0, 1);
mulSum += RandomUniformDefault(RNG_NONE, 0, 2);
mulSum += RandomUniformDefault(RNG_NONE, 0, 3);
mulSum += RandomUniformDefault(RNG_NONE, 0, 4);
}

BENCHMARK(&modBenchmark)
{
modSum += Random() % 2;
modSum += Random() % 3;
modSum += Random() % 4;
modSum += Random() % 5;
}

EXPECT_FASTER(mulBenchmark, modBenchmark);

// Reference mulSum/modSum to prevent optimization.
// These numbers are different because multiplication and modulus
// have subtly different biases (so subtle that it's irrelevant for
// our purposes).
EXPECT_EQ(mulSum, 3);
EXPECT_EQ(modSum, 4);
}

TEST("RandomUniform mul-based faster than mod-based (run-time)")
{
u32 i;
struct Benchmark mulBenchmark, modBenchmark;
u32 mulSum = 0, modSum = 0;

BENCHMARK(&mulBenchmark)
{
for (i = 0; i < 32; i++)
mulSum += RandomUniformDefault(RNG_NONE, 0, i);
}

BENCHMARK(&modBenchmark)
{
for (i = 0; i < 32; i++)
modSum += Random() % (i + 1);
}

EXPECT_FASTER(mulBenchmark, modBenchmark);

// Reference mulSum/modSum to prevent optimization.
EXPECT_EQ(mulSum, 232);
EXPECT_EQ(modSum, 249);
}
Loading