From b08b6a37cb150d49c900c7c04aca4cc5c17ddafe Mon Sep 17 00:00:00 2001 From: Martin Griffin Date: Tue, 25 Jul 2023 08:03:08 +0100 Subject: [PATCH] Optimize BuildOamBuffer --- gflib/sprite.c | 244 ++++++++++++++++----------------------- test/sprite.c | 303 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 398 insertions(+), 149 deletions(-) create mode 100644 test/sprite.c diff --git a/gflib/sprite.c b/gflib/sprite.c index 80fba81fe309..d75f6f7271d6 100644 --- a/gflib/sprite.c +++ b/gflib/sprite.c @@ -48,11 +48,7 @@ struct OamDimensions s8 height; }; -static void UpdateOamCoords(void); -static void BuildSpritePriorities(void); -static void SortSprites(void); -static void CopyMatricesToOamBuffer(void); -static void AddSpritesToOamBuffer(void); +static void SortSprites(u32 *spritePriorities, s32 n); static u8 CreateSpriteAt(u8 index, const struct SpriteTemplate *template, s16 x, s16 y, u8 subpriority); static void ResetOamMatrices(void); static void ResetSprite(struct Sprite *sprite); @@ -280,12 +276,12 @@ u32 gOamMatrixAllocBitmap; u8 gReservedSpritePaletteCount; EWRAM_DATA struct Sprite gSprites[MAX_SPRITES + 1] = {0}; -EWRAM_DATA static u16 sSpritePriorities[MAX_SPRITES] = {0}; EWRAM_DATA static u8 sSpriteOrder[MAX_SPRITES] = {0}; EWRAM_DATA static bool8 sShouldProcessSpriteCopyRequests = 0; EWRAM_DATA static u8 sSpriteCopyRequestCount = 0; EWRAM_DATA static struct SpriteCopyRequest sSpriteCopyRequests[MAX_SPRITES] = {0}; EWRAM_DATA u8 gOamLimit = 0; +static EWRAM_DATA u8 gOamDummyIndex = 0; EWRAM_DATA u16 gReservedSpriteTileCount = 0; EWRAM_DATA static u8 sSpriteTileAllocBitmap[128] = {0}; EWRAM_DATA s16 gSpriteCoordOffsetX = 0; @@ -296,6 +292,7 @@ EWRAM_DATA bool8 gAffineAnimsDisabled = FALSE; void ResetSpriteData(void) { ResetOamRange(0, 128); + gOamDummyIndex = 0; ResetAllSprites(); ClearSpriteCopyRequests(); ResetAffineAnimData(); @@ -326,179 +323,128 @@ void AnimateSprites(void) void BuildOamBuffer(void) { - u8 temp; - UpdateOamCoords(); - BuildSpritePriorities(); - SortSprites(); - temp = gMain.oamLoadDisabled; - gMain.oamLoadDisabled = TRUE; - AddSpritesToOamBuffer(); - CopyMatricesToOamBuffer(); - gMain.oamLoadDisabled = temp; - sShouldProcessSpriteCopyRequests = TRUE; -} + bool32 oamLoadDisabled; + u32 i, stride; + u8 oamIndex; + + // All attributes which affect sorting packed into a single u32: + // { priority:2, subpriority:8, y:9, :5, index:8 }. + // Index has its own byte even though it only needs 6 bits so that + // we can load it with a ldrb instead of having to mask out the + // bottom 6 bits. + u32 spritePriorities[MAX_SPRITES]; + s32 toSort = 0; + u8 skippedSprites[MAX_SPRITES]; + u32 skippedSpritesN = 0; + u32 matrices = 0; -void UpdateOamCoords(void) -{ - u8 i; for (i = 0; i < MAX_SPRITES; i++) { - struct Sprite *sprite = &gSprites[i]; - if (sprite->inUse && !sprite->invisible) + // Reuse existing sSpriteOrder because we expect the order to be + // relatively stable between frames. + u32 index = sSpriteOrder[i]; + struct Sprite *sprite = &gSprites[index]; + s32 y; + if (!sprite->inUse || sprite->invisible) { - if (sprite->coordOffsetEnabled) - { - sprite->oam.x = sprite->x + sprite->x2 + sprite->centerToCornerVecX + gSpriteCoordOffsetX; - sprite->oam.y = sprite->y + sprite->y2 + sprite->centerToCornerVecY + gSpriteCoordOffsetY; - } - else - { - sprite->oam.x = sprite->x + sprite->x2 + sprite->centerToCornerVecX; - sprite->oam.y = sprite->y + sprite->y2 + sprite->centerToCornerVecY; - } + skippedSprites[skippedSpritesN++] = index; + continue; } - } -} -void BuildSpritePriorities(void) -{ - u16 i; - for (i = 0; i < MAX_SPRITES; i++) - { - struct Sprite *sprite = &gSprites[i]; - u16 priority = sprite->subpriority | (sprite->oam.priority << 8); - sSpritePriorities[i] = priority; - } -} + if (sprite->oam.affineMode & ST_OAM_AFFINE_ON_MASK) + matrices |= 1 << sprite->oam.matrixNum; -void SortSprites(void) -{ - u8 i; - for (i = 1; i < MAX_SPRITES; i++) - { - u8 j = i; - struct Sprite *sprite1 = &gSprites[sSpriteOrder[i - 1]]; - struct Sprite *sprite2 = &gSprites[sSpriteOrder[i]]; - u16 sprite1Priority = sSpritePriorities[sSpriteOrder[i - 1]]; - u16 sprite2Priority = sSpritePriorities[sSpriteOrder[i]]; - s16 sprite1Y = sprite1->oam.y; - s16 sprite2Y = sprite2->oam.y; - - if (sprite1Y >= DISPLAY_HEIGHT) - sprite1Y = sprite1Y - 256; - - if (sprite2Y >= DISPLAY_HEIGHT) - sprite2Y = sprite2Y - 256; - - if (sprite1->oam.affineMode == ST_OAM_AFFINE_DOUBLE - && sprite1->oam.size == ST_OAM_SIZE_3) + if (sprite->coordOffsetEnabled) { - u32 shape = sprite1->oam.shape; - if (shape == ST_OAM_SQUARE || shape == ST_OAM_V_RECTANGLE) - { - if (sprite1Y > 128) - sprite1Y = sprite1Y - 256; - } + sprite->oam.x = sprite->x + sprite->x2 + sprite->centerToCornerVecX + gSpriteCoordOffsetX; + sprite->oam.y = sprite->y + sprite->y2 + sprite->centerToCornerVecY + gSpriteCoordOffsetY; + } + else + { + sprite->oam.x = sprite->x + sprite->x2 + sprite->centerToCornerVecX; + sprite->oam.y = sprite->y + sprite->y2 + sprite->centerToCornerVecY; } - if (sprite2->oam.affineMode == ST_OAM_AFFINE_DOUBLE - && sprite2->oam.size == ST_OAM_SIZE_3) + y = sprite->oam.y; + if (y >= DISPLAY_HEIGHT) + { + y -= 256; + } + else if (sprite->oam.affineMode == ST_OAM_AFFINE_DOUBLE + && sprite->oam.size == ST_OAM_SIZE_3) { - u32 shape = sprite2->oam.shape; + u32 shape = sprite->oam.shape; if (shape == ST_OAM_SQUARE || shape == ST_OAM_V_RECTANGLE) { - if (sprite2Y > 128) - sprite2Y = sprite2Y - 256; + if (y > 128) + y -= 256; } } - while (j > 0 - && ((sprite1Priority > sprite2Priority) - || (sprite1Priority == sprite2Priority && sprite1Y < sprite2Y))) - { - u8 temp = sSpriteOrder[j]; - sSpriteOrder[j] = sSpriteOrder[j - 1]; - sSpriteOrder[j - 1] = temp; - - // UB: If j equals 1, then j-- makes j equal 0. - // Then, sSpriteOrder[-1] gets accessed below. - // Although this doesn't result in a bug in the ROM, - // the behavior is undefined. - j--; -#ifdef UBFIX - if (j == 0) - break; -#endif - - sprite1 = &gSprites[sSpriteOrder[j - 1]]; - sprite2 = &gSprites[sSpriteOrder[j]]; - sprite1Priority = sSpritePriorities[sSpriteOrder[j - 1]]; - sprite2Priority = sSpritePriorities[sSpriteOrder[j]]; - sprite1Y = sprite1->oam.y; - sprite2Y = sprite2->oam.y; + // y in [-128...159], so (159 - y) in [0..287]. + spritePriorities[toSort++] + = (sprite->oam.priority << 30) + | (sprite->subpriority << 22) + | (((159 - y) & 0x1FF) << 13) + | (index << 0); + } - if (sprite1Y >= DISPLAY_HEIGHT) - sprite1Y = sprite1Y - 256; + SortSprites(spritePriorities, toSort); - if (sprite2Y >= DISPLAY_HEIGHT) - sprite2Y = sprite2Y - 256; + for (i = 0; i < toSort; i++) + sSpriteOrder[i] = spritePriorities[i] & 0xFF; + for (i = 0; i < skippedSpritesN; i++) + sSpriteOrder[toSort + i] = skippedSprites[i]; - if (sprite1->oam.affineMode == ST_OAM_AFFINE_DOUBLE - && sprite1->oam.size == ST_OAM_SIZE_3) - { - u32 shape = sprite1->oam.shape; - if (shape == ST_OAM_SQUARE || shape == ST_OAM_V_RECTANGLE) - { - if (sprite1Y > 128) - sprite1Y = sprite1Y - 256; - } - } + oamLoadDisabled = gMain.oamLoadDisabled; + gMain.oamLoadDisabled = TRUE; - if (sprite2->oam.affineMode == ST_OAM_AFFINE_DOUBLE - && sprite2->oam.size == ST_OAM_SIZE_3) - { - u32 shape = sprite2->oam.shape; - if (shape == ST_OAM_SQUARE || shape == ST_OAM_V_RECTANGLE) - { - if (sprite2Y > 128) - sprite2Y = sprite2Y - 256; - } - } - } + for (i = 0, oamIndex = 0; i < toSort; i++) + { + if (AddSpriteToOamBuffer(&gSprites[spritePriorities[i] & 0xFF], &oamIndex)) + break; } -} -void CopyMatricesToOamBuffer(void) -{ - u8 i; - for (i = 0; i < OAM_MATRIX_COUNT; i++) + for (i = oamIndex; i < gOamDummyIndex; i++) + gMain.oamBuffer[i] = gDummyOamData; + gOamDummyIndex = oamIndex; + + for (i = 0; matrices != 0; i++, matrices >>= 1) { - u32 base = 4 * i; - gMain.oamBuffer[base + 0].affineParam = gOamMatrices[i].a; - gMain.oamBuffer[base + 1].affineParam = gOamMatrices[i].b; - gMain.oamBuffer[base + 2].affineParam = gOamMatrices[i].c; - gMain.oamBuffer[base + 3].affineParam = gOamMatrices[i].d; + if (matrices & 1) + { + u32 base = 4 * i; + gMain.oamBuffer[base + 0].affineParam = gOamMatrices[i].a; + gMain.oamBuffer[base + 1].affineParam = gOamMatrices[i].b; + gMain.oamBuffer[base + 2].affineParam = gOamMatrices[i].c; + gMain.oamBuffer[base + 3].affineParam = gOamMatrices[i].d; + } } + + gMain.oamLoadDisabled = oamLoadDisabled; + sShouldProcessSpriteCopyRequests = TRUE; } -void AddSpritesToOamBuffer(void) +static inline void InsertionSort(u32 *spritePriorities, s32 n) { - u8 i = 0; - u8 oamIndex = 0; - - while (i < MAX_SPRITES) + s32 i = 1; + while (i < n) { - struct Sprite *sprite = &gSprites[sSpriteOrder[i]]; - if (sprite->inUse && !sprite->invisible && AddSpriteToOamBuffer(sprite, &oamIndex)) - return; + u32 x = spritePriorities[i]; + s32 j = i - 1; + while (j >= 0 && spritePriorities[j] > x) + { + spritePriorities[j + 1] = spritePriorities[j]; + j--; + } + spritePriorities[j + 1] = x; i++; } +} - while (oamIndex < gOamLimit) - { - gMain.oamBuffer[oamIndex] = gDummyOamData; - oamIndex++; - } +static void SortSprites(u32 *spritePriorities, s32 n) +{ + InsertionSort(spritePriorities, n); } u8 CreateSprite(const struct SpriteTemplate *template, s16 x, s16 y, u8 subpriority) @@ -849,7 +795,7 @@ void CopyToSprites(u8 *src) void ResetAllSprites(void) { - u8 i; + u32 i; for (i = 0; i < MAX_SPRITES; i++) { diff --git a/test/sprite.c b/test/sprite.c new file mode 100644 index 000000000000..f488929d5179 --- /dev/null +++ b/test/sprite.c @@ -0,0 +1,303 @@ +#include "global.h" +#include "test.h" +#include "main.h" +#include "malloc.h" +#include "random.h" +#include "sprite.h" + +#define OAM_MATRIX_COUNT 32 + +EWRAM_DATA static u16 sSpritePriorities[MAX_SPRITES] = {0}; +EWRAM_DATA static u8 sSpriteOrder[MAX_SPRITES] = {0}; + +static void Old_BuildOamBuffer(void); + +static void ExpectEqOamBuffers(const struct OamData *oldOamBuffer, const struct OamData *newOamBuffer) +{ + u32 i; + u32 matrices = 0; + + // Compare the non-matrix data. + for (i = 0; i < gOamLimit; i++) + { + EXPECT(memcmp(&oldOamBuffer[i], &newOamBuffer[i], 6) == 0); + if (newOamBuffer[i].affineMode & ST_OAM_AFFINE_ON_MASK) + matrices |= 1 << newOamBuffer[i].matrixNum; + } + + // Compare the matrix data. + for (i = 0; i < OAM_MATRIX_COUNT; i++) + { + if (matrices & (1 << i)) + { + u32 base = 4 * i; + EXPECT_EQ(oldOamBuffer[base + 0].affineParam, newOamBuffer[base + 0].affineParam); + EXPECT_EQ(oldOamBuffer[base + 1].affineParam, newOamBuffer[base + 1].affineParam); + EXPECT_EQ(oldOamBuffer[base + 2].affineParam, newOamBuffer[base + 2].affineParam); + EXPECT_EQ(oldOamBuffer[base + 3].affineParam, newOamBuffer[base + 3].affineParam); + } + } +} + +static void ResetSpriteData_(void) +{ + u32 i; + ResetSpriteData(); + for (i = 0; i < MAX_SPRITES; i++) + sSpriteOrder[i] = i; +} + +static void BenchmarkBuildOamBuffer(bool32 preSort) +{ + struct Benchmark oldBuildOamBuffer, newBuildOamBuffer; + struct OamData *oldOamBuffer = Alloc(sizeof(gMain.oamBuffer)); + + if (preSort) + Old_BuildOamBuffer(); + BENCHMARK(&oldBuildOamBuffer) + { + Old_BuildOamBuffer(); + } + memcpy(oldOamBuffer, gMain.oamBuffer, sizeof(gMain.oamBuffer)); + + if (preSort) + BuildOamBuffer(); + BENCHMARK(&newBuildOamBuffer) + { + BuildOamBuffer(); + } + + ExpectEqOamBuffers(oldOamBuffer, gMain.oamBuffer); + EXPECT_FASTER(newBuildOamBuffer, oldBuildOamBuffer); + Free(oldOamBuffer); +} + +TEST("BuildOamBuffer faster with no sprites") +{ + ResetSpriteData_(); + BenchmarkBuildOamBuffer(FALSE); +} + +TEST("BuildOamBuffer faster with max sprites (equal y/subpriority)") +{ + u32 i; + + ResetSpriteData_(); + for (i = 0; i < MAX_SPRITES; i++) + CreateSprite(&gDummySpriteTemplate, 0, 0, 0); + BenchmarkBuildOamBuffer(FALSE); +} + +TEST("BuildOamBuffer faster with max sprites (random y/subpriority)") +{ + u32 i; + ResetSpriteData_(); + SeedRng(0); + for (i = 0; i < MAX_SPRITES; i++) + CreateSprite(&gDummySpriteTemplate, 0, Random() % 256, Random() % 256); + BenchmarkBuildOamBuffer(FALSE); +} + +TEST("BuildOamBuffer faster on already-sorted max sprites") +{ + u32 i; + ResetSpriteData_(); + SeedRng(0); + for (i = 0; i < MAX_SPRITES; i++) + CreateSprite(&gDummySpriteTemplate, 0, Random() % 256, Random() % 256); + BenchmarkBuildOamBuffer(TRUE); +} + +TEST("BuildOamBuffer faster with mix of sprites") +{ + u32 i; + ResetSpriteData_(); + SeedRng(0); + for (i = 0; i < MAX_SPRITES / 2; i++) + { + u32 spriteId = CreateSprite(&gDummySpriteTemplate, 0, Random() % 256, Random() % 256); + gSprites[spriteId].invisible = Random() % 4 == 0; + } + BenchmarkBuildOamBuffer(FALSE); +} + +// Old implementation. + +#define UBFIX + +static void UpdateOamCoords(void) +{ + u8 i; + for (i = 0; i < MAX_SPRITES; i++) + { + struct Sprite *sprite = &gSprites[i]; + if (sprite->inUse && !sprite->invisible) + { + if (sprite->coordOffsetEnabled) + { + sprite->oam.x = sprite->x + sprite->x2 + sprite->centerToCornerVecX + gSpriteCoordOffsetX; + sprite->oam.y = sprite->y + sprite->y2 + sprite->centerToCornerVecY + gSpriteCoordOffsetY; + } + else + { + sprite->oam.x = sprite->x + sprite->x2 + sprite->centerToCornerVecX; + sprite->oam.y = sprite->y + sprite->y2 + sprite->centerToCornerVecY; + } + } + } +} + +static void BuildSpritePriorities(void) +{ + u16 i; + for (i = 0; i < MAX_SPRITES; i++) + { + struct Sprite *sprite = &gSprites[i]; + u16 priority = sprite->subpriority | (sprite->oam.priority << 8); + sSpritePriorities[i] = priority; + } +} + +static void SortSprites(void) +{ + u8 i; + for (i = 1; i < MAX_SPRITES; i++) + { + u8 j = i; + struct Sprite *sprite1 = &gSprites[sSpriteOrder[i - 1]]; + struct Sprite *sprite2 = &gSprites[sSpriteOrder[i]]; + u16 sprite1Priority = sSpritePriorities[sSpriteOrder[i - 1]]; + u16 sprite2Priority = sSpritePriorities[sSpriteOrder[i]]; + s16 sprite1Y = sprite1->oam.y; + s16 sprite2Y = sprite2->oam.y; + + if (sprite1Y >= DISPLAY_HEIGHT) + sprite1Y = sprite1Y - 256; + + if (sprite2Y >= DISPLAY_HEIGHT) + sprite2Y = sprite2Y - 256; + + if (sprite1->oam.affineMode == ST_OAM_AFFINE_DOUBLE + && sprite1->oam.size == ST_OAM_SIZE_3) + { + u32 shape = sprite1->oam.shape; + if (shape == ST_OAM_SQUARE || shape == ST_OAM_V_RECTANGLE) + { + if (sprite1Y > 128) + sprite1Y = sprite1Y - 256; + } + } + + if (sprite2->oam.affineMode == ST_OAM_AFFINE_DOUBLE + && sprite2->oam.size == ST_OAM_SIZE_3) + { + u32 shape = sprite2->oam.shape; + if (shape == ST_OAM_SQUARE || shape == ST_OAM_V_RECTANGLE) + { + if (sprite2Y > 128) + sprite2Y = sprite2Y - 256; + } + } + + while (j > 0 + && ((sprite1Priority > sprite2Priority) + || (sprite1Priority == sprite2Priority && sprite1Y < sprite2Y))) + { + u8 temp = sSpriteOrder[j]; + sSpriteOrder[j] = sSpriteOrder[j - 1]; + sSpriteOrder[j - 1] = temp; + + // UB: If j equals 1, then j-- makes j equal 0. + // Then, sSpriteOrder[-1] gets accessed below. + // Although this doesn't result in a bug in the ROM, + // the behavior is undefined. + j--; +#ifdef UBFIX + if (j == 0) + break; +#endif + + sprite1 = &gSprites[sSpriteOrder[j - 1]]; + sprite2 = &gSprites[sSpriteOrder[j]]; + sprite1Priority = sSpritePriorities[sSpriteOrder[j - 1]]; + sprite2Priority = sSpritePriorities[sSpriteOrder[j]]; + sprite1Y = sprite1->oam.y; + sprite2Y = sprite2->oam.y; + + if (sprite1Y >= DISPLAY_HEIGHT) + sprite1Y = sprite1Y - 256; + + if (sprite2Y >= DISPLAY_HEIGHT) + sprite2Y = sprite2Y - 256; + + if (sprite1->oam.affineMode == ST_OAM_AFFINE_DOUBLE + && sprite1->oam.size == ST_OAM_SIZE_3) + { + u32 shape = sprite1->oam.shape; + if (shape == ST_OAM_SQUARE || shape == ST_OAM_V_RECTANGLE) + { + if (sprite1Y > 128) + sprite1Y = sprite1Y - 256; + } + } + + if (sprite2->oam.affineMode == ST_OAM_AFFINE_DOUBLE + && sprite2->oam.size == ST_OAM_SIZE_3) + { + u32 shape = sprite2->oam.shape; + if (shape == ST_OAM_SQUARE || shape == ST_OAM_V_RECTANGLE) + { + if (sprite2Y > 128) + sprite2Y = sprite2Y - 256; + } + } + } + } +} + +static void CopyMatricesToOamBuffer(void) +{ + u8 i; + for (i = 0; i < OAM_MATRIX_COUNT; i++) + { + u32 base = 4 * i; + gMain.oamBuffer[base + 0].affineParam = gOamMatrices[i].a; + gMain.oamBuffer[base + 1].affineParam = gOamMatrices[i].b; + gMain.oamBuffer[base + 2].affineParam = gOamMatrices[i].c; + gMain.oamBuffer[base + 3].affineParam = gOamMatrices[i].d; + } +} + +static void AddSpritesToOamBuffer(void) +{ + u8 i = 0; + u8 oamIndex = 0; + + while (i < MAX_SPRITES) + { + struct Sprite *sprite = &gSprites[sSpriteOrder[i]]; + if (sprite->inUse && !sprite->invisible && AddSpriteToOamBuffer(sprite, &oamIndex)) + return; + i++; + } + + while (oamIndex < gOamLimit) + { + gMain.oamBuffer[oamIndex] = gDummyOamData; + oamIndex++; + } +} + +static void Old_BuildOamBuffer(void) +{ + u8 temp; + UpdateOamCoords(); + BuildSpritePriorities(); + SortSprites(); + temp = gMain.oamLoadDisabled; + gMain.oamLoadDisabled = TRUE; + AddSpritesToOamBuffer(); + CopyMatricesToOamBuffer(); + gMain.oamLoadDisabled = temp; + //sShouldProcessSpriteCopyRequests = TRUE; +}