From 3f5daa8cf138c1849b2b91bba3425250113fa7a7 Mon Sep 17 00:00:00 2001 From: Martin Griffin Date: Mon, 24 Jul 2023 12:51:43 +0100 Subject: [PATCH 1/3] Add BENCHMARK to test runner Counts the time spent in the block in arbitrary time units (64-cycles currently). If the block takes more than ~4 million cycles (~0.25s) the benchmark will wrap around. Note that the time can be affected by the timeout IRQs, and should only be taken as a loose indication of relative performance. --- include/gba/io_reg.h | 1 + test/random.c | 57 ++++++++++++++++++++++++++++++++++++++++++++ test/test.h | 42 ++++++++++++++++++++++++++++++++ 3 files changed, 100 insertions(+) diff --git a/include/gba/io_reg.h b/include/gba/io_reg.h index 148ce31db432..d05ec71201ba 100644 --- a/include/gba/io_reg.h +++ b/include/gba/io_reg.h @@ -664,6 +664,7 @@ #define TIMER_64CLK 0x01 #define TIMER_256CLK 0x02 #define TIMER_1024CLK 0x03 +#define TIMER_COUNTUP 0x04 #define TIMER_INTR_ENABLE 0x40 #define TIMER_ENABLE 0x80 diff --git a/test/random.c b/test/random.c index ef364de276f5..091b9323aebf 100644 --- a/test/random.c +++ b/test/random.c @@ -151,3 +151,60 @@ TEST("RandomElement generates a uniform distribution") EXPECT_LT(error, UQ_4_12(0.025)); } + +TEST("RandomUniform mul-based faster than mod-based (compile-time)") +{ + u32 i; + struct Benchmark mulBenchmark, modBenchmark; + u32 mulSum = 0, modSum = 0; + + BENCHMARK(&mulBenchmark) + { + mulSum += RandomUniformDefault(RNG_NONE, 0, 1); + mulSum += RandomUniformDefault(RNG_NONE, 0, 2); + mulSum += RandomUniformDefault(RNG_NONE, 0, 3); + mulSum += RandomUniformDefault(RNG_NONE, 0, 4); + } + + BENCHMARK(&modBenchmark) + { + modSum += Random() % 2; + modSum += Random() % 3; + modSum += Random() % 4; + modSum += Random() % 5; + } + + EXPECT_FASTER(mulBenchmark, modBenchmark); + + // Reference mulSum/modSum to prevent optimization. + // These numbers are different because multiplication and modulus + // have subtly different biases (so subtle that it's irrelevant for + // our purposes). + EXPECT_EQ(mulSum, 3); + EXPECT_EQ(modSum, 4); +} + +TEST("RandomUniform mul-based faster than mod-based (run-time)") +{ + u32 i; + struct Benchmark mulBenchmark, modBenchmark; + u32 mulSum = 0, modSum = 0; + + BENCHMARK(&mulBenchmark) + { + for (i = 0; i < 32; i++) + mulSum += RandomUniformDefault(RNG_NONE, 0, i); + } + + BENCHMARK(&modBenchmark) + { + for (i = 0; i < 32; i++) + modSum += Random() % (i + 1); + } + + EXPECT_FASTER(mulBenchmark, modBenchmark); + + // Reference mulSum/modSum to prevent optimization. + EXPECT_EQ(mulSum, 232); + EXPECT_EQ(modSum, 249); +} diff --git a/test/test.h b/test/test.h index 24abca60dc95..228f195650d6 100644 --- a/test/test.h +++ b/test/test.h @@ -46,6 +46,7 @@ struct TestRunnerState u8 result; u8 expectedResult; bool8 expectLeaks:1; + bool8 inBenchmark:1; u32 timeoutSeconds; }; @@ -158,6 +159,47 @@ s32 MgbaPrintf_(const char *fmt, ...); Test_ExitWithResult(TEST_RESULT_FAIL, "%s:%d: EXPECT_GE(%d, %d) failed", gTestRunnerState.test->filename, __LINE__, _a, _b); \ } while (0) +struct Benchmark { u32 ticks; }; + +static inline void BenchmarkStart(void) +{ + gTestRunnerState.inBenchmark = TRUE; + REG_TM3CNT = (TIMER_ENABLE | TIMER_64CLK) << 16; +} + +static inline struct Benchmark BenchmarkStop(void) +{ + REG_TM3CNT_H = 0; + gTestRunnerState.inBenchmark = FALSE; + return (struct Benchmark) { REG_TM3CNT_L }; +} + +#define BENCHMARK(id) \ + for (BenchmarkStart(); gTestRunnerState.inBenchmark; *(id) = BenchmarkStop()) + +// An approximation of how much overhead benchmarks introduce. +#define BENCHMARK_ABS 2 + +// An approximation for what percentage faster a benchmark has to be for +// us to be confident that it's faster than another. +#define BENCHMARK_REL 95 + +#define EXPECT_FASTER(a, b) \ + do \ + { \ + u32 a_ = (a).ticks; u32 b_ = (b).ticks; \ + if (((a_ - BENCHMARK_ABS) * BENCHMARK_REL) >= (b_ * 100)) \ + Test_ExitWithResult(TEST_RESULT_FAIL, "%s:%d: EXPECT_FASTER(" #a ", " #b ") failed", gTestRunnerState.test->filename, __LINE__); \ + } while (0) + +#define EXPECT_SLOWER(a, b) \ + do \ + { \ + u32 a_ = (a).ticks; u32 b_ = (b).ticks; \ + if ((a_ * 100) <= ((b_ - BENCHMARK_ABS) * BENCHMARK_REL)) \ + Test_ExitWithResult(TEST_RESULT_FAIL, "%s:%d: EXPECT_SLOWER(" #a ", " #b ") failed", gTestRunnerState.test->filename, __LINE__); \ + } while (0) + #define KNOWN_FAILING \ Test_ExpectedResult(TEST_RESULT_FAIL) From b499f41a5cc232369b518206619d1d96a5a0fd62 Mon Sep 17 00:00:00 2001 From: Martin Griffin Date: Tue, 25 Jul 2023 07:40:51 +0100 Subject: [PATCH 2/3] Print ticks --- test/test.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/test/test.h b/test/test.h index 228f195650d6..5b3ab8af0c3d 100644 --- a/test/test.h +++ b/test/test.h @@ -159,7 +159,7 @@ s32 MgbaPrintf_(const char *fmt, ...); Test_ExitWithResult(TEST_RESULT_FAIL, "%s:%d: EXPECT_GE(%d, %d) failed", gTestRunnerState.test->filename, __LINE__, _a, _b); \ } while (0) -struct Benchmark { u32 ticks; }; +struct Benchmark { s32 ticks; }; static inline void BenchmarkStart(void) { @@ -188,6 +188,7 @@ static inline struct Benchmark BenchmarkStop(void) do \ { \ u32 a_ = (a).ticks; u32 b_ = (b).ticks; \ + MgbaPrintf_(#a ": %d ticks, " #b ": %d ticks", a_, b_); \ if (((a_ - BENCHMARK_ABS) * BENCHMARK_REL) >= (b_ * 100)) \ Test_ExitWithResult(TEST_RESULT_FAIL, "%s:%d: EXPECT_FASTER(" #a ", " #b ") failed", gTestRunnerState.test->filename, __LINE__); \ } while (0) @@ -196,6 +197,7 @@ static inline struct Benchmark BenchmarkStop(void) do \ { \ u32 a_ = (a).ticks; u32 b_ = (b).ticks; \ + MgbaPrintf_(#a ": %d ticks, " #b ": %d ticks", a_, b_); \ if ((a_ * 100) <= ((b_ - BENCHMARK_ABS) * BENCHMARK_REL)) \ Test_ExitWithResult(TEST_RESULT_FAIL, "%s:%d: EXPECT_SLOWER(" #a ", " #b ") failed", gTestRunnerState.test->filename, __LINE__); \ } while (0) From b08b6a37cb150d49c900c7c04aca4cc5c17ddafe Mon Sep 17 00:00:00 2001 From: Martin Griffin Date: Tue, 25 Jul 2023 08:03:08 +0100 Subject: [PATCH 3/3] Optimize BuildOamBuffer --- gflib/sprite.c | 244 ++++++++++++++++----------------------- test/sprite.c | 303 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 398 insertions(+), 149 deletions(-) create mode 100644 test/sprite.c diff --git a/gflib/sprite.c b/gflib/sprite.c index 80fba81fe309..d75f6f7271d6 100644 --- a/gflib/sprite.c +++ b/gflib/sprite.c @@ -48,11 +48,7 @@ struct OamDimensions s8 height; }; -static void UpdateOamCoords(void); -static void BuildSpritePriorities(void); -static void SortSprites(void); -static void CopyMatricesToOamBuffer(void); -static void AddSpritesToOamBuffer(void); +static void SortSprites(u32 *spritePriorities, s32 n); static u8 CreateSpriteAt(u8 index, const struct SpriteTemplate *template, s16 x, s16 y, u8 subpriority); static void ResetOamMatrices(void); static void ResetSprite(struct Sprite *sprite); @@ -280,12 +276,12 @@ u32 gOamMatrixAllocBitmap; u8 gReservedSpritePaletteCount; EWRAM_DATA struct Sprite gSprites[MAX_SPRITES + 1] = {0}; -EWRAM_DATA static u16 sSpritePriorities[MAX_SPRITES] = {0}; EWRAM_DATA static u8 sSpriteOrder[MAX_SPRITES] = {0}; EWRAM_DATA static bool8 sShouldProcessSpriteCopyRequests = 0; EWRAM_DATA static u8 sSpriteCopyRequestCount = 0; EWRAM_DATA static struct SpriteCopyRequest sSpriteCopyRequests[MAX_SPRITES] = {0}; EWRAM_DATA u8 gOamLimit = 0; +static EWRAM_DATA u8 gOamDummyIndex = 0; EWRAM_DATA u16 gReservedSpriteTileCount = 0; EWRAM_DATA static u8 sSpriteTileAllocBitmap[128] = {0}; EWRAM_DATA s16 gSpriteCoordOffsetX = 0; @@ -296,6 +292,7 @@ EWRAM_DATA bool8 gAffineAnimsDisabled = FALSE; void ResetSpriteData(void) { ResetOamRange(0, 128); + gOamDummyIndex = 0; ResetAllSprites(); ClearSpriteCopyRequests(); ResetAffineAnimData(); @@ -326,179 +323,128 @@ void AnimateSprites(void) void BuildOamBuffer(void) { - u8 temp; - UpdateOamCoords(); - BuildSpritePriorities(); - SortSprites(); - temp = gMain.oamLoadDisabled; - gMain.oamLoadDisabled = TRUE; - AddSpritesToOamBuffer(); - CopyMatricesToOamBuffer(); - gMain.oamLoadDisabled = temp; - sShouldProcessSpriteCopyRequests = TRUE; -} + bool32 oamLoadDisabled; + u32 i, stride; + u8 oamIndex; + + // All attributes which affect sorting packed into a single u32: + // { priority:2, subpriority:8, y:9, :5, index:8 }. + // Index has its own byte even though it only needs 6 bits so that + // we can load it with a ldrb instead of having to mask out the + // bottom 6 bits. + u32 spritePriorities[MAX_SPRITES]; + s32 toSort = 0; + u8 skippedSprites[MAX_SPRITES]; + u32 skippedSpritesN = 0; + u32 matrices = 0; -void UpdateOamCoords(void) -{ - u8 i; for (i = 0; i < MAX_SPRITES; i++) { - struct Sprite *sprite = &gSprites[i]; - if (sprite->inUse && !sprite->invisible) + // Reuse existing sSpriteOrder because we expect the order to be + // relatively stable between frames. + u32 index = sSpriteOrder[i]; + struct Sprite *sprite = &gSprites[index]; + s32 y; + if (!sprite->inUse || sprite->invisible) { - if (sprite->coordOffsetEnabled) - { - sprite->oam.x = sprite->x + sprite->x2 + sprite->centerToCornerVecX + gSpriteCoordOffsetX; - sprite->oam.y = sprite->y + sprite->y2 + sprite->centerToCornerVecY + gSpriteCoordOffsetY; - } - else - { - sprite->oam.x = sprite->x + sprite->x2 + sprite->centerToCornerVecX; - sprite->oam.y = sprite->y + sprite->y2 + sprite->centerToCornerVecY; - } + skippedSprites[skippedSpritesN++] = index; + continue; } - } -} -void BuildSpritePriorities(void) -{ - u16 i; - for (i = 0; i < MAX_SPRITES; i++) - { - struct Sprite *sprite = &gSprites[i]; - u16 priority = sprite->subpriority | (sprite->oam.priority << 8); - sSpritePriorities[i] = priority; - } -} + if (sprite->oam.affineMode & ST_OAM_AFFINE_ON_MASK) + matrices |= 1 << sprite->oam.matrixNum; -void SortSprites(void) -{ - u8 i; - for (i = 1; i < MAX_SPRITES; i++) - { - u8 j = i; - struct Sprite *sprite1 = &gSprites[sSpriteOrder[i - 1]]; - struct Sprite *sprite2 = &gSprites[sSpriteOrder[i]]; - u16 sprite1Priority = sSpritePriorities[sSpriteOrder[i - 1]]; - u16 sprite2Priority = sSpritePriorities[sSpriteOrder[i]]; - s16 sprite1Y = sprite1->oam.y; - s16 sprite2Y = sprite2->oam.y; - - if (sprite1Y >= DISPLAY_HEIGHT) - sprite1Y = sprite1Y - 256; - - if (sprite2Y >= DISPLAY_HEIGHT) - sprite2Y = sprite2Y - 256; - - if (sprite1->oam.affineMode == ST_OAM_AFFINE_DOUBLE - && sprite1->oam.size == ST_OAM_SIZE_3) + if (sprite->coordOffsetEnabled) { - u32 shape = sprite1->oam.shape; - if (shape == ST_OAM_SQUARE || shape == ST_OAM_V_RECTANGLE) - { - if (sprite1Y > 128) - sprite1Y = sprite1Y - 256; - } + sprite->oam.x = sprite->x + sprite->x2 + sprite->centerToCornerVecX + gSpriteCoordOffsetX; + sprite->oam.y = sprite->y + sprite->y2 + sprite->centerToCornerVecY + gSpriteCoordOffsetY; + } + else + { + sprite->oam.x = sprite->x + sprite->x2 + sprite->centerToCornerVecX; + sprite->oam.y = sprite->y + sprite->y2 + sprite->centerToCornerVecY; } - if (sprite2->oam.affineMode == ST_OAM_AFFINE_DOUBLE - && sprite2->oam.size == ST_OAM_SIZE_3) + y = sprite->oam.y; + if (y >= DISPLAY_HEIGHT) + { + y -= 256; + } + else if (sprite->oam.affineMode == ST_OAM_AFFINE_DOUBLE + && sprite->oam.size == ST_OAM_SIZE_3) { - u32 shape = sprite2->oam.shape; + u32 shape = sprite->oam.shape; if (shape == ST_OAM_SQUARE || shape == ST_OAM_V_RECTANGLE) { - if (sprite2Y > 128) - sprite2Y = sprite2Y - 256; + if (y > 128) + y -= 256; } } - while (j > 0 - && ((sprite1Priority > sprite2Priority) - || (sprite1Priority == sprite2Priority && sprite1Y < sprite2Y))) - { - u8 temp = sSpriteOrder[j]; - sSpriteOrder[j] = sSpriteOrder[j - 1]; - sSpriteOrder[j - 1] = temp; - - // UB: If j equals 1, then j-- makes j equal 0. - // Then, sSpriteOrder[-1] gets accessed below. - // Although this doesn't result in a bug in the ROM, - // the behavior is undefined. - j--; -#ifdef UBFIX - if (j == 0) - break; -#endif - - sprite1 = &gSprites[sSpriteOrder[j - 1]]; - sprite2 = &gSprites[sSpriteOrder[j]]; - sprite1Priority = sSpritePriorities[sSpriteOrder[j - 1]]; - sprite2Priority = sSpritePriorities[sSpriteOrder[j]]; - sprite1Y = sprite1->oam.y; - sprite2Y = sprite2->oam.y; + // y in [-128...159], so (159 - y) in [0..287]. + spritePriorities[toSort++] + = (sprite->oam.priority << 30) + | (sprite->subpriority << 22) + | (((159 - y) & 0x1FF) << 13) + | (index << 0); + } - if (sprite1Y >= DISPLAY_HEIGHT) - sprite1Y = sprite1Y - 256; + SortSprites(spritePriorities, toSort); - if (sprite2Y >= DISPLAY_HEIGHT) - sprite2Y = sprite2Y - 256; + for (i = 0; i < toSort; i++) + sSpriteOrder[i] = spritePriorities[i] & 0xFF; + for (i = 0; i < skippedSpritesN; i++) + sSpriteOrder[toSort + i] = skippedSprites[i]; - if (sprite1->oam.affineMode == ST_OAM_AFFINE_DOUBLE - && sprite1->oam.size == ST_OAM_SIZE_3) - { - u32 shape = sprite1->oam.shape; - if (shape == ST_OAM_SQUARE || shape == ST_OAM_V_RECTANGLE) - { - if (sprite1Y > 128) - sprite1Y = sprite1Y - 256; - } - } + oamLoadDisabled = gMain.oamLoadDisabled; + gMain.oamLoadDisabled = TRUE; - if (sprite2->oam.affineMode == ST_OAM_AFFINE_DOUBLE - && sprite2->oam.size == ST_OAM_SIZE_3) - { - u32 shape = sprite2->oam.shape; - if (shape == ST_OAM_SQUARE || shape == ST_OAM_V_RECTANGLE) - { - if (sprite2Y > 128) - sprite2Y = sprite2Y - 256; - } - } - } + for (i = 0, oamIndex = 0; i < toSort; i++) + { + if (AddSpriteToOamBuffer(&gSprites[spritePriorities[i] & 0xFF], &oamIndex)) + break; } -} -void CopyMatricesToOamBuffer(void) -{ - u8 i; - for (i = 0; i < OAM_MATRIX_COUNT; i++) + for (i = oamIndex; i < gOamDummyIndex; i++) + gMain.oamBuffer[i] = gDummyOamData; + gOamDummyIndex = oamIndex; + + for (i = 0; matrices != 0; i++, matrices >>= 1) { - u32 base = 4 * i; - gMain.oamBuffer[base + 0].affineParam = gOamMatrices[i].a; - gMain.oamBuffer[base + 1].affineParam = gOamMatrices[i].b; - gMain.oamBuffer[base + 2].affineParam = gOamMatrices[i].c; - gMain.oamBuffer[base + 3].affineParam = gOamMatrices[i].d; + if (matrices & 1) + { + u32 base = 4 * i; + gMain.oamBuffer[base + 0].affineParam = gOamMatrices[i].a; + gMain.oamBuffer[base + 1].affineParam = gOamMatrices[i].b; + gMain.oamBuffer[base + 2].affineParam = gOamMatrices[i].c; + gMain.oamBuffer[base + 3].affineParam = gOamMatrices[i].d; + } } + + gMain.oamLoadDisabled = oamLoadDisabled; + sShouldProcessSpriteCopyRequests = TRUE; } -void AddSpritesToOamBuffer(void) +static inline void InsertionSort(u32 *spritePriorities, s32 n) { - u8 i = 0; - u8 oamIndex = 0; - - while (i < MAX_SPRITES) + s32 i = 1; + while (i < n) { - struct Sprite *sprite = &gSprites[sSpriteOrder[i]]; - if (sprite->inUse && !sprite->invisible && AddSpriteToOamBuffer(sprite, &oamIndex)) - return; + u32 x = spritePriorities[i]; + s32 j = i - 1; + while (j >= 0 && spritePriorities[j] > x) + { + spritePriorities[j + 1] = spritePriorities[j]; + j--; + } + spritePriorities[j + 1] = x; i++; } +} - while (oamIndex < gOamLimit) - { - gMain.oamBuffer[oamIndex] = gDummyOamData; - oamIndex++; - } +static void SortSprites(u32 *spritePriorities, s32 n) +{ + InsertionSort(spritePriorities, n); } u8 CreateSprite(const struct SpriteTemplate *template, s16 x, s16 y, u8 subpriority) @@ -849,7 +795,7 @@ void CopyToSprites(u8 *src) void ResetAllSprites(void) { - u8 i; + u32 i; for (i = 0; i < MAX_SPRITES; i++) { diff --git a/test/sprite.c b/test/sprite.c new file mode 100644 index 000000000000..f488929d5179 --- /dev/null +++ b/test/sprite.c @@ -0,0 +1,303 @@ +#include "global.h" +#include "test.h" +#include "main.h" +#include "malloc.h" +#include "random.h" +#include "sprite.h" + +#define OAM_MATRIX_COUNT 32 + +EWRAM_DATA static u16 sSpritePriorities[MAX_SPRITES] = {0}; +EWRAM_DATA static u8 sSpriteOrder[MAX_SPRITES] = {0}; + +static void Old_BuildOamBuffer(void); + +static void ExpectEqOamBuffers(const struct OamData *oldOamBuffer, const struct OamData *newOamBuffer) +{ + u32 i; + u32 matrices = 0; + + // Compare the non-matrix data. + for (i = 0; i < gOamLimit; i++) + { + EXPECT(memcmp(&oldOamBuffer[i], &newOamBuffer[i], 6) == 0); + if (newOamBuffer[i].affineMode & ST_OAM_AFFINE_ON_MASK) + matrices |= 1 << newOamBuffer[i].matrixNum; + } + + // Compare the matrix data. + for (i = 0; i < OAM_MATRIX_COUNT; i++) + { + if (matrices & (1 << i)) + { + u32 base = 4 * i; + EXPECT_EQ(oldOamBuffer[base + 0].affineParam, newOamBuffer[base + 0].affineParam); + EXPECT_EQ(oldOamBuffer[base + 1].affineParam, newOamBuffer[base + 1].affineParam); + EXPECT_EQ(oldOamBuffer[base + 2].affineParam, newOamBuffer[base + 2].affineParam); + EXPECT_EQ(oldOamBuffer[base + 3].affineParam, newOamBuffer[base + 3].affineParam); + } + } +} + +static void ResetSpriteData_(void) +{ + u32 i; + ResetSpriteData(); + for (i = 0; i < MAX_SPRITES; i++) + sSpriteOrder[i] = i; +} + +static void BenchmarkBuildOamBuffer(bool32 preSort) +{ + struct Benchmark oldBuildOamBuffer, newBuildOamBuffer; + struct OamData *oldOamBuffer = Alloc(sizeof(gMain.oamBuffer)); + + if (preSort) + Old_BuildOamBuffer(); + BENCHMARK(&oldBuildOamBuffer) + { + Old_BuildOamBuffer(); + } + memcpy(oldOamBuffer, gMain.oamBuffer, sizeof(gMain.oamBuffer)); + + if (preSort) + BuildOamBuffer(); + BENCHMARK(&newBuildOamBuffer) + { + BuildOamBuffer(); + } + + ExpectEqOamBuffers(oldOamBuffer, gMain.oamBuffer); + EXPECT_FASTER(newBuildOamBuffer, oldBuildOamBuffer); + Free(oldOamBuffer); +} + +TEST("BuildOamBuffer faster with no sprites") +{ + ResetSpriteData_(); + BenchmarkBuildOamBuffer(FALSE); +} + +TEST("BuildOamBuffer faster with max sprites (equal y/subpriority)") +{ + u32 i; + + ResetSpriteData_(); + for (i = 0; i < MAX_SPRITES; i++) + CreateSprite(&gDummySpriteTemplate, 0, 0, 0); + BenchmarkBuildOamBuffer(FALSE); +} + +TEST("BuildOamBuffer faster with max sprites (random y/subpriority)") +{ + u32 i; + ResetSpriteData_(); + SeedRng(0); + for (i = 0; i < MAX_SPRITES; i++) + CreateSprite(&gDummySpriteTemplate, 0, Random() % 256, Random() % 256); + BenchmarkBuildOamBuffer(FALSE); +} + +TEST("BuildOamBuffer faster on already-sorted max sprites") +{ + u32 i; + ResetSpriteData_(); + SeedRng(0); + for (i = 0; i < MAX_SPRITES; i++) + CreateSprite(&gDummySpriteTemplate, 0, Random() % 256, Random() % 256); + BenchmarkBuildOamBuffer(TRUE); +} + +TEST("BuildOamBuffer faster with mix of sprites") +{ + u32 i; + ResetSpriteData_(); + SeedRng(0); + for (i = 0; i < MAX_SPRITES / 2; i++) + { + u32 spriteId = CreateSprite(&gDummySpriteTemplate, 0, Random() % 256, Random() % 256); + gSprites[spriteId].invisible = Random() % 4 == 0; + } + BenchmarkBuildOamBuffer(FALSE); +} + +// Old implementation. + +#define UBFIX + +static void UpdateOamCoords(void) +{ + u8 i; + for (i = 0; i < MAX_SPRITES; i++) + { + struct Sprite *sprite = &gSprites[i]; + if (sprite->inUse && !sprite->invisible) + { + if (sprite->coordOffsetEnabled) + { + sprite->oam.x = sprite->x + sprite->x2 + sprite->centerToCornerVecX + gSpriteCoordOffsetX; + sprite->oam.y = sprite->y + sprite->y2 + sprite->centerToCornerVecY + gSpriteCoordOffsetY; + } + else + { + sprite->oam.x = sprite->x + sprite->x2 + sprite->centerToCornerVecX; + sprite->oam.y = sprite->y + sprite->y2 + sprite->centerToCornerVecY; + } + } + } +} + +static void BuildSpritePriorities(void) +{ + u16 i; + for (i = 0; i < MAX_SPRITES; i++) + { + struct Sprite *sprite = &gSprites[i]; + u16 priority = sprite->subpriority | (sprite->oam.priority << 8); + sSpritePriorities[i] = priority; + } +} + +static void SortSprites(void) +{ + u8 i; + for (i = 1; i < MAX_SPRITES; i++) + { + u8 j = i; + struct Sprite *sprite1 = &gSprites[sSpriteOrder[i - 1]]; + struct Sprite *sprite2 = &gSprites[sSpriteOrder[i]]; + u16 sprite1Priority = sSpritePriorities[sSpriteOrder[i - 1]]; + u16 sprite2Priority = sSpritePriorities[sSpriteOrder[i]]; + s16 sprite1Y = sprite1->oam.y; + s16 sprite2Y = sprite2->oam.y; + + if (sprite1Y >= DISPLAY_HEIGHT) + sprite1Y = sprite1Y - 256; + + if (sprite2Y >= DISPLAY_HEIGHT) + sprite2Y = sprite2Y - 256; + + if (sprite1->oam.affineMode == ST_OAM_AFFINE_DOUBLE + && sprite1->oam.size == ST_OAM_SIZE_3) + { + u32 shape = sprite1->oam.shape; + if (shape == ST_OAM_SQUARE || shape == ST_OAM_V_RECTANGLE) + { + if (sprite1Y > 128) + sprite1Y = sprite1Y - 256; + } + } + + if (sprite2->oam.affineMode == ST_OAM_AFFINE_DOUBLE + && sprite2->oam.size == ST_OAM_SIZE_3) + { + u32 shape = sprite2->oam.shape; + if (shape == ST_OAM_SQUARE || shape == ST_OAM_V_RECTANGLE) + { + if (sprite2Y > 128) + sprite2Y = sprite2Y - 256; + } + } + + while (j > 0 + && ((sprite1Priority > sprite2Priority) + || (sprite1Priority == sprite2Priority && sprite1Y < sprite2Y))) + { + u8 temp = sSpriteOrder[j]; + sSpriteOrder[j] = sSpriteOrder[j - 1]; + sSpriteOrder[j - 1] = temp; + + // UB: If j equals 1, then j-- makes j equal 0. + // Then, sSpriteOrder[-1] gets accessed below. + // Although this doesn't result in a bug in the ROM, + // the behavior is undefined. + j--; +#ifdef UBFIX + if (j == 0) + break; +#endif + + sprite1 = &gSprites[sSpriteOrder[j - 1]]; + sprite2 = &gSprites[sSpriteOrder[j]]; + sprite1Priority = sSpritePriorities[sSpriteOrder[j - 1]]; + sprite2Priority = sSpritePriorities[sSpriteOrder[j]]; + sprite1Y = sprite1->oam.y; + sprite2Y = sprite2->oam.y; + + if (sprite1Y >= DISPLAY_HEIGHT) + sprite1Y = sprite1Y - 256; + + if (sprite2Y >= DISPLAY_HEIGHT) + sprite2Y = sprite2Y - 256; + + if (sprite1->oam.affineMode == ST_OAM_AFFINE_DOUBLE + && sprite1->oam.size == ST_OAM_SIZE_3) + { + u32 shape = sprite1->oam.shape; + if (shape == ST_OAM_SQUARE || shape == ST_OAM_V_RECTANGLE) + { + if (sprite1Y > 128) + sprite1Y = sprite1Y - 256; + } + } + + if (sprite2->oam.affineMode == ST_OAM_AFFINE_DOUBLE + && sprite2->oam.size == ST_OAM_SIZE_3) + { + u32 shape = sprite2->oam.shape; + if (shape == ST_OAM_SQUARE || shape == ST_OAM_V_RECTANGLE) + { + if (sprite2Y > 128) + sprite2Y = sprite2Y - 256; + } + } + } + } +} + +static void CopyMatricesToOamBuffer(void) +{ + u8 i; + for (i = 0; i < OAM_MATRIX_COUNT; i++) + { + u32 base = 4 * i; + gMain.oamBuffer[base + 0].affineParam = gOamMatrices[i].a; + gMain.oamBuffer[base + 1].affineParam = gOamMatrices[i].b; + gMain.oamBuffer[base + 2].affineParam = gOamMatrices[i].c; + gMain.oamBuffer[base + 3].affineParam = gOamMatrices[i].d; + } +} + +static void AddSpritesToOamBuffer(void) +{ + u8 i = 0; + u8 oamIndex = 0; + + while (i < MAX_SPRITES) + { + struct Sprite *sprite = &gSprites[sSpriteOrder[i]]; + if (sprite->inUse && !sprite->invisible && AddSpriteToOamBuffer(sprite, &oamIndex)) + return; + i++; + } + + while (oamIndex < gOamLimit) + { + gMain.oamBuffer[oamIndex] = gDummyOamData; + oamIndex++; + } +} + +static void Old_BuildOamBuffer(void) +{ + u8 temp; + UpdateOamCoords(); + BuildSpritePriorities(); + SortSprites(); + temp = gMain.oamLoadDisabled; + gMain.oamLoadDisabled = TRUE; + AddSpritesToOamBuffer(); + CopyMatricesToOamBuffer(); + gMain.oamLoadDisabled = temp; + //sShouldProcessSpriteCopyRequests = TRUE; +}