From 53c6a3933d6f83f535024805700647d390695fcc Mon Sep 17 00:00:00 2001 From: "Unknown W. Brackets" Date: Sun, 21 Nov 2021 17:52:51 -0800 Subject: [PATCH 1/8] softgpu: Use ALWAYS for alpha/depth test in clear. --- GPU/Software/FuncId.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/GPU/Software/FuncId.cpp b/GPU/Software/FuncId.cpp index d40a79695ef0..b0913c8daf54 100644 --- a/GPU/Software/FuncId.cpp +++ b/GPU/Software/FuncId.cpp @@ -38,6 +38,8 @@ void ComputePixelFuncID(PixelFuncID *id) { id->colorTest = gstate.isClearModeColorMask(); id->stencilTest = gstate.isClearModeAlphaMask(); id->depthWrite = gstate.isClearModeDepthMask(); + id->depthTestFunc = GE_COMP_ALWAYS; + id->alphaTestFunc = GE_COMP_ALWAYS; } else { id->colorTest = gstate.isColorTestEnabled() && gstate.getColorTestFunction() != GE_COMP_ALWAYS; if (gstate.isStencilTestEnabled() && gstate.getStencilTestFunction() == GE_COMP_ALWAYS) { From 73de8db99677b5799df0e24f2ebd4cbc1770e498 Mon Sep 17 00:00:00 2001 From: "Unknown W. Brackets" Date: Mon, 22 Nov 2021 05:57:54 -0800 Subject: [PATCH 2/8] softgpu: Fix stencil DECR on 5551. 
--- GPU/Software/DrawPixel.cpp | 8 +++++--- GPU/Software/FuncId.cpp | 2 +- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/GPU/Software/DrawPixel.cpp b/GPU/Software/DrawPixel.cpp index 0dc5174414a0..1f80d1c33ab6 100644 --- a/GPU/Software/DrawPixel.cpp +++ b/GPU/Software/DrawPixel.cpp @@ -179,7 +179,7 @@ static inline bool StencilTestPassed(const PixelFuncID &pixelID, u8 stencil) { if (pixelID.hasStencilTestMask) stencil &= gstate.getStencilTestMask(); u8 ref = pixelID.stencilTestRef; - switch (GEComparison(pixelID.stencilTestFunc)) { + switch (pixelID.StencilTestFunc()) { case GE_COMP_NEVER: return false; @@ -246,6 +246,8 @@ static inline u8 ApplyStencilOp(GEBufferFormat fmt, GEStencilOp op, u8 old_stenc if (old_stencil >= 0x10) return old_stencil - 0x10; break; + case GE_FORMAT_5551: + return 0; default: if (old_stencil != 0) return old_stencil - 1; @@ -460,7 +462,7 @@ inline void DrawSinglePixel(int x, int y, int z, int fog, const Vec4 &color SingleFunc GetSingleFunc(const PixelFuncID &id) { if (id.clearMode) { - switch (id.FBFormat()) { + switch (id.fbFormat) { case GE_FORMAT_565: return &DrawSinglePixel; case GE_FORMAT_5551: @@ -471,7 +473,7 @@ SingleFunc GetSingleFunc(const PixelFuncID &id) { return &DrawSinglePixel; } } - switch (id.FBFormat()) { + switch (id.fbFormat) { case GE_FORMAT_565: return &DrawSinglePixel; case GE_FORMAT_5551: diff --git a/GPU/Software/FuncId.cpp b/GPU/Software/FuncId.cpp index b0913c8daf54..7ba9a09bbba8 100644 --- a/GPU/Software/FuncId.cpp +++ b/GPU/Software/FuncId.cpp @@ -110,7 +110,7 @@ std::string DescribePixelFuncID(const PixelFuncID &id) { if (id.applyColorWriteMask) desc += "Msk:"; - switch (id.FBFormat()) { + switch (id.fbFormat) { case GE_FORMAT_565: desc += "5650:"; break; case GE_FORMAT_5551: desc += "5551:"; break; case GE_FORMAT_4444: desc += "4444:"; break; From 2ef7dd6b032fff58774c264ddaefcc40a02239f6 Mon Sep 17 00:00:00 2001 From: "Unknown W. 
Brackets" Date: Mon, 22 Nov 2021 17:14:29 -0800 Subject: [PATCH 3/8] softgpu: Correct tagging of vertexjit. --- GPU/Software/SoftGpu.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/GPU/Software/SoftGpu.cpp b/GPU/Software/SoftGpu.cpp index c4e5a19511b0..bf67e06e11c5 100644 --- a/GPU/Software/SoftGpu.cpp +++ b/GPU/Software/SoftGpu.cpp @@ -1015,5 +1015,5 @@ bool SoftGPU::DescribeCodePtr(const u8 *ptr, std::string &name) { name = "RasterizerJit:" + subname; return true; } - return false; + return GPUCommon::DescribeCodePtr(ptr, name); } From 2acf7f4edfefa003e31406ba2344a8239a758099 Mon Sep 17 00:00:00 2001 From: "Unknown W. Brackets" Date: Thu, 25 Nov 2021 18:45:23 -0800 Subject: [PATCH 4/8] softgpu: Use 0 alpha for 565 alpha blending. We were previously blending as 0xFF. --- GPU/Software/DrawPixel.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/GPU/Software/DrawPixel.cpp b/GPU/Software/DrawPixel.cpp index 1f80d1c33ab6..00beb3e7f3e2 100644 --- a/GPU/Software/DrawPixel.cpp +++ b/GPU/Software/DrawPixel.cpp @@ -83,7 +83,8 @@ static inline void SetPixelDepth(int x, int y, u16 value) { static inline u32 GetPixelColor(GEBufferFormat fmt, int x, int y) { switch (fmt) { case GE_FORMAT_565: - return RGB565ToRGBA8888(fb.Get16(x, y, gstate.FrameBufStride())); + // A should be zero for the purposes of alpha blending. + return RGB565ToRGBA8888(fb.Get16(x, y, gstate.FrameBufStride())) & 0x00FFFFFF; case GE_FORMAT_5551: return RGBA5551ToRGBA8888(fb.Get16(x, y, gstate.FrameBufStride())); From 35444b30518a174ff37428182eb1dc73e37afa6a Mon Sep 17 00:00:00 2001 From: "Unknown W. Brackets" Date: Thu, 25 Nov 2021 18:46:15 -0800 Subject: [PATCH 5/8] softgpu: Accurately alpha blend. 
--- GPU/Software/Rasterizer.cpp | 58 +++++++++++++++++++++++++++++-------- 1 file changed, 46 insertions(+), 12 deletions(-) diff --git a/GPU/Software/Rasterizer.cpp b/GPU/Software/Rasterizer.cpp index 458483d73a3f..c500a432f0e3 100644 --- a/GPU/Software/Rasterizer.cpp +++ b/GPU/Software/Rasterizer.cpp @@ -474,33 +474,67 @@ Vec3 AlphaBlendingResult(const PixelFuncID &pixelID, const Vec4 &sourc case GE_BLENDMODE_MUL_AND_ADD: { #if defined(_M_SSE) - const __m128 s = _mm_mul_ps(_mm_cvtepi32_ps(source.ivec), _mm_cvtepi32_ps(srcfactor.ivec)); - const __m128 d = _mm_mul_ps(_mm_cvtepi32_ps(dst.ivec), _mm_cvtepi32_ps(dstfactor.ivec)); - return Vec3(_mm_cvtps_epi32(_mm_mul_ps(_mm_add_ps(s, d), _mm_set_ps1(1.0f / 255.0f)))); + // We switch to 16 bit to use mulhi, and we use 4 bits of decimal to make the 16 bit shift free. + const __m128i half = _mm_set1_epi16(1 << 3); + + const __m128i srgb = _mm_add_epi16(_mm_slli_epi16(_mm_packs_epi32(source.ivec, source.ivec), 4), half); + const __m128i sf = _mm_add_epi16(_mm_slli_epi16(_mm_packs_epi32(srcfactor.ivec, srcfactor.ivec), 4), half); + const __m128i s = _mm_mulhi_epi16(srgb, sf); + + const __m128i drgb = _mm_add_epi16(_mm_slli_epi16(_mm_packs_epi32(dst.ivec, dst.ivec), 4), half); + const __m128i df = _mm_add_epi16(_mm_slli_epi16(_mm_packs_epi32(dstfactor.ivec, dstfactor.ivec), 4), half); + const __m128i d = _mm_mulhi_epi16(drgb, df); + + return Vec3(_mm_unpacklo_epi16(_mm_adds_epi16(s, d), _mm_setzero_si128())); #else - return (source.rgb() * srcfactor + dst.rgb() * dstfactor) / 255; + Vec3 half = Vec3::AssignToAll(1); + Vec3 lhs = ((source.rgb() * 2 + half) * (srcfactor * 2 + half)) / 1024; + Vec3 rhs = ((dst.rgb() * 2 + half) * (dstfactor * 2 + half)) / 1024; + return lhs + rhs; #endif } case GE_BLENDMODE_MUL_AND_SUBTRACT: { #if defined(_M_SSE) - const __m128 s = _mm_mul_ps(_mm_cvtepi32_ps(source.ivec), _mm_cvtepi32_ps(srcfactor.ivec)); - const __m128 d = _mm_mul_ps(_mm_cvtepi32_ps(dst.ivec), 
_mm_cvtepi32_ps(dstfactor.ivec)); - return Vec3(_mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(s, d), _mm_set_ps1(1.0f / 255.0f)))); + const __m128i half = _mm_set1_epi16(1 << 3); + + const __m128i srgb = _mm_add_epi16(_mm_slli_epi16(_mm_packs_epi32(source.ivec, source.ivec), 4), half); + const __m128i sf = _mm_add_epi16(_mm_slli_epi16(_mm_packs_epi32(srcfactor.ivec, srcfactor.ivec), 4), half); + const __m128i s = _mm_mulhi_epi16(srgb, sf); + + const __m128i drgb = _mm_add_epi16(_mm_slli_epi16(_mm_packs_epi32(dst.ivec, dst.ivec), 4), half); + const __m128i df = _mm_add_epi16(_mm_slli_epi16(_mm_packs_epi32(dstfactor.ivec, dstfactor.ivec), 4), half); + const __m128i d = _mm_mulhi_epi16(drgb, df); + + return Vec3(_mm_unpacklo_epi16(_mm_subs_epi16(s, d), _mm_setzero_si128())); #else - return (source.rgb() * srcfactor - dst.rgb() * dstfactor) / 255; + Vec3 half = Vec3::AssignToAll(1); + Vec3 lhs = ((source.rgb() * 2 + half) * (srcfactor * 2 + half)) / 1024; + Vec3 rhs = ((dst.rgb() * 2 + half) * (dstfactor * 2 + half)) / 1024; + return lhs - rhs; #endif } case GE_BLENDMODE_MUL_AND_SUBTRACT_REVERSE: { #if defined(_M_SSE) - const __m128 s = _mm_mul_ps(_mm_cvtepi32_ps(source.ivec), _mm_cvtepi32_ps(srcfactor.ivec)); - const __m128 d = _mm_mul_ps(_mm_cvtepi32_ps(dst.ivec), _mm_cvtepi32_ps(dstfactor.ivec)); - return Vec3(_mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(d, s), _mm_set_ps1(1.0f / 255.0f)))); + const __m128i half = _mm_set1_epi16(1 << 3); + + const __m128i srgb = _mm_add_epi16(_mm_slli_epi16(_mm_packs_epi32(source.ivec, source.ivec), 4), half); + const __m128i sf = _mm_add_epi16(_mm_slli_epi16(_mm_packs_epi32(srcfactor.ivec, srcfactor.ivec), 4), half); + const __m128i s = _mm_mulhi_epi16(srgb, sf); + + const __m128i drgb = _mm_add_epi16(_mm_slli_epi16(_mm_packs_epi32(dst.ivec, dst.ivec), 4), half); + const __m128i df = _mm_add_epi16(_mm_slli_epi16(_mm_packs_epi32(dstfactor.ivec, dstfactor.ivec), 4), half); + const __m128i d = _mm_mulhi_epi16(drgb, df); + + return 
Vec3(_mm_unpacklo_epi16(_mm_subs_epi16(d, s), _mm_setzero_si128())); #else - return (dst.rgb() * dstfactor - source.rgb() * srcfactor) / 255; + Vec3 half = Vec3::AssignToAll(1); + Vec3 lhs = ((source.rgb() * 2 + half) * (srcfactor * 2 + half)) / 1024; + Vec3 rhs = ((dst.rgb() * 2 + half) * (dstfactor * 2 + half)) / 1024; + return rhs - lhs; #endif } From d4bf7ea3922edf6fbe620504823cc8b24aa823bf Mon Sep 17 00:00:00 2001 From: "Unknown W. Brackets" Date: Thu, 25 Nov 2021 18:59:32 -0800 Subject: [PATCH 6/8] softgpu: Disable alpha blend for invalid equations. --- GPU/Software/FuncId.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/GPU/Software/FuncId.cpp b/GPU/Software/FuncId.cpp index 7ba9a09bbba8..d18e87243c3f 100644 --- a/GPU/Software/FuncId.cpp +++ b/GPU/Software/FuncId.cpp @@ -70,7 +70,8 @@ void ComputePixelFuncID(PixelFuncID *id) { id->hasAlphaTestMask = gstate.getAlphaTestMask() != 0xFF; } - id->alphaBlend = gstate.isAlphaBlendEnabled(); + // If invalid (6 or 7), doesn't do any blending, so force off. + id->alphaBlend = gstate.isAlphaBlendEnabled() && gstate.getBlendEq() <= 5; // Force it off if the factors are constant and don't blend. Some games use this... if (id->alphaBlend && gstate.getBlendEq() == GE_BLENDMODE_MUL_AND_ADD) { bool srcFixedOne = gstate.getBlendFuncA() == GE_SRCBLEND_FIXA && gstate.getFixA() == 0x00FFFFFF; From dad85b97f15a55900acc76e5e43c9c3b7ab9befe Mon Sep 17 00:00:00 2001 From: "Unknown W. Brackets" Date: Thu, 25 Nov 2021 21:00:34 -0800 Subject: [PATCH 7/8] softgpu: Use KEEP for any invalid stencil ops. This just keeps the ID more consistent. 
--- GPU/Software/FuncId.cpp | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/GPU/Software/FuncId.cpp b/GPU/Software/FuncId.cpp index d18e87243c3f..c92414764861 100644 --- a/GPU/Software/FuncId.cpp +++ b/GPU/Software/FuncId.cpp @@ -57,10 +57,15 @@ void ComputePixelFuncID(PixelFuncID *id) { if (id->stencilTest) { id->stencilTestFunc = gstate.getStencilTestFunction(); id->stencilTestRef = gstate.getStencilTestRef() & gstate.getStencilTestMask(); - id->hasStencilTestMask = gstate.getStencilTestMask() != 0xFF; - id->sFail = gstate.getStencilOpSFail(); - id->zFail = gstate.isDepthTestEnabled() ? gstate.getStencilOpZFail() : GE_STENCILOP_KEEP; - id->zPass = gstate.getStencilOpZPass(); + id->hasStencilTestMask = gstate.getStencilTestMask() != 0xFF && gstate.FrameBufFormat() != GE_FORMAT_565; + + // Stencil can't be written on 565, and any invalid op acts like KEEP, which is 0. + if (gstate.FrameBufFormat() != GE_FORMAT_565 && gstate.getStencilOpSFail() <= GE_STENCILOP_DECR) + id->sFail = gstate.getStencilOpSFail(); + if (gstate.FrameBufFormat() != GE_FORMAT_565 && gstate.getStencilOpZFail() <= GE_STENCILOP_DECR) + id->zFail = gstate.isDepthTestEnabled() ? gstate.getStencilOpZFail() : GE_STENCILOP_KEEP; + if (gstate.FrameBufFormat() != GE_FORMAT_565 && gstate.getStencilOpZPass() <= GE_STENCILOP_DECR) + id->zPass = gstate.getStencilOpZPass(); } id->depthTestFunc = gstate.isDepthTestEnabled() ? gstate.getDepthTestFunction() : GE_COMP_ALWAYS; From ce5ae95854d5560e238ec7dbec68042e9b5c7ad8 Mon Sep 17 00:00:00 2001 From: "Unknown W. Brackets" Date: Thu, 25 Nov 2021 22:06:48 -0800 Subject: [PATCH 8/8] softgpu: Correct alpha blend subtract on negative. Oops, we need to subtract signed, but then clamp to unsigned. 
--- GPU/Software/Rasterizer.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/GPU/Software/Rasterizer.cpp b/GPU/Software/Rasterizer.cpp index c500a432f0e3..2a4c32b0a67f 100644 --- a/GPU/Software/Rasterizer.cpp +++ b/GPU/Software/Rasterizer.cpp @@ -507,7 +507,7 @@ Vec3 AlphaBlendingResult(const PixelFuncID &pixelID, const Vec4 &sourc const __m128i df = _mm_add_epi16(_mm_slli_epi16(_mm_packs_epi32(dstfactor.ivec, dstfactor.ivec), 4), half); const __m128i d = _mm_mulhi_epi16(drgb, df); - return Vec3(_mm_unpacklo_epi16(_mm_subs_epi16(s, d), _mm_setzero_si128())); + return Vec3(_mm_unpacklo_epi16(_mm_max_epi16(_mm_subs_epi16(s, d), _mm_setzero_si128()), _mm_setzero_si128())); #else Vec3 half = Vec3::AssignToAll(1); Vec3 lhs = ((source.rgb() * 2 + half) * (srcfactor * 2 + half)) / 1024; @@ -529,7 +529,7 @@ Vec3 AlphaBlendingResult(const PixelFuncID &pixelID, const Vec4 &sourc const __m128i df = _mm_add_epi16(_mm_slli_epi16(_mm_packs_epi32(dstfactor.ivec, dstfactor.ivec), 4), half); const __m128i d = _mm_mulhi_epi16(drgb, df); - return Vec3(_mm_unpacklo_epi16(_mm_subs_epi16(d, s), _mm_setzero_si128())); + return Vec3(_mm_unpacklo_epi16(_mm_max_epi16(_mm_subs_epi16(d, s), _mm_setzero_si128()), _mm_setzero_si128())); #else Vec3 half = Vec3::AssignToAll(1); Vec3 lhs = ((source.rgb() * 2 + half) * (srcfactor * 2 + half)) / 1024;