Skip to content

Commit

Permalink
Merge pull request #15171 from unknownbrackets/softgpu-cleanup
Browse files (browse the repository at this point in the history)
Correct some alpha/stencil/blend issues in softgpu
  • Loading branch information
hrydgard authored Nov 26, 2021
2 parents 65d3f96 + ce5ae95 commit daae09b
Show file tree
Hide file tree
Showing 4 changed files with 68 additions and 23 deletions.
11 changes: 7 additions & 4 deletions GPU/Software/DrawPixel.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,8 @@ static inline void SetPixelDepth(int x, int y, u16 value) {
static inline u32 GetPixelColor(GEBufferFormat fmt, int x, int y) {
switch (fmt) {
case GE_FORMAT_565:
return RGB565ToRGBA8888(fb.Get16(x, y, gstate.FrameBufStride()));
// 565 stores no alpha, so the A channel should read as zero for the purposes of alpha blending.
return RGB565ToRGBA8888(fb.Get16(x, y, gstate.FrameBufStride())) & 0x00FFFFFF;

case GE_FORMAT_5551:
return RGBA5551ToRGBA8888(fb.Get16(x, y, gstate.FrameBufStride()));
Expand Down Expand Up @@ -179,7 +180,7 @@ static inline bool StencilTestPassed(const PixelFuncID &pixelID, u8 stencil) {
if (pixelID.hasStencilTestMask)
stencil &= gstate.getStencilTestMask();
u8 ref = pixelID.stencilTestRef;
switch (GEComparison(pixelID.stencilTestFunc)) {
switch (pixelID.StencilTestFunc()) {
case GE_COMP_NEVER:
return false;

Expand Down Expand Up @@ -246,6 +247,8 @@ static inline u8 ApplyStencilOp(GEBufferFormat fmt, GEStencilOp op, u8 old_stenc
if (old_stencil >= 0x10)
return old_stencil - 0x10;
break;
case GE_FORMAT_5551:
return 0;
default:
if (old_stencil != 0)
return old_stencil - 1;
Expand Down Expand Up @@ -460,7 +463,7 @@ inline void DrawSinglePixel(int x, int y, int z, int fog, const Vec4<int> &color

SingleFunc GetSingleFunc(const PixelFuncID &id) {
if (id.clearMode) {
switch (id.FBFormat()) {
switch (id.fbFormat) {
case GE_FORMAT_565:
return &DrawSinglePixel<true, GE_FORMAT_565>;
case GE_FORMAT_5551:
Expand All @@ -471,7 +474,7 @@ SingleFunc GetSingleFunc(const PixelFuncID &id) {
return &DrawSinglePixel<true, GE_FORMAT_8888>;
}
}
switch (id.FBFormat()) {
switch (id.fbFormat) {
case GE_FORMAT_565:
return &DrawSinglePixel<false, GE_FORMAT_565>;
case GE_FORMAT_5551:
Expand Down
20 changes: 14 additions & 6 deletions GPU/Software/FuncId.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@ void ComputePixelFuncID(PixelFuncID *id) {
id->colorTest = gstate.isClearModeColorMask();
id->stencilTest = gstate.isClearModeAlphaMask();
id->depthWrite = gstate.isClearModeDepthMask();
id->depthTestFunc = GE_COMP_ALWAYS;
id->alphaTestFunc = GE_COMP_ALWAYS;
} else {
id->colorTest = gstate.isColorTestEnabled() && gstate.getColorTestFunction() != GE_COMP_ALWAYS;
if (gstate.isStencilTestEnabled() && gstate.getStencilTestFunction() == GE_COMP_ALWAYS) {
Expand All @@ -55,10 +57,15 @@ void ComputePixelFuncID(PixelFuncID *id) {
if (id->stencilTest) {
id->stencilTestFunc = gstate.getStencilTestFunction();
id->stencilTestRef = gstate.getStencilTestRef() & gstate.getStencilTestMask();
id->hasStencilTestMask = gstate.getStencilTestMask() != 0xFF;
id->sFail = gstate.getStencilOpSFail();
id->zFail = gstate.isDepthTestEnabled() ? gstate.getStencilOpZFail() : GE_STENCILOP_KEEP;
id->zPass = gstate.getStencilOpZPass();
id->hasStencilTestMask = gstate.getStencilTestMask() != 0xFF && gstate.FrameBufFormat() != GE_FORMAT_565;

// Stencil can't be written on 565, and any invalid op acts like KEEP, which is 0.
if (gstate.FrameBufFormat() != GE_FORMAT_565 && gstate.getStencilOpSFail() <= GE_STENCILOP_DECR)
id->sFail = gstate.getStencilOpSFail();
if (gstate.FrameBufFormat() != GE_FORMAT_565 && gstate.getStencilOpZFail() <= GE_STENCILOP_DECR)
id->zFail = gstate.isDepthTestEnabled() ? gstate.getStencilOpZFail() : GE_STENCILOP_KEEP;
if (gstate.FrameBufFormat() != GE_FORMAT_565 && gstate.getStencilOpZPass() <= GE_STENCILOP_DECR)
id->zPass = gstate.getStencilOpZPass();
}

id->depthTestFunc = gstate.isDepthTestEnabled() ? gstate.getDepthTestFunction() : GE_COMP_ALWAYS;
Expand All @@ -68,7 +75,8 @@ void ComputePixelFuncID(PixelFuncID *id) {
id->hasAlphaTestMask = gstate.getAlphaTestMask() != 0xFF;
}

id->alphaBlend = gstate.isAlphaBlendEnabled();
// If the blend equation is invalid (6 or 7), no blending is done, so force it off.
id->alphaBlend = gstate.isAlphaBlendEnabled() && gstate.getBlendEq() <= 5;
// Force it off if the factors are constant and don't blend. Some games use this...
if (id->alphaBlend && gstate.getBlendEq() == GE_BLENDMODE_MUL_AND_ADD) {
bool srcFixedOne = gstate.getBlendFuncA() == GE_SRCBLEND_FIXA && gstate.getFixA() == 0x00FFFFFF;
Expand Down Expand Up @@ -108,7 +116,7 @@ std::string DescribePixelFuncID(const PixelFuncID &id) {
if (id.applyColorWriteMask)
desc += "Msk:";

switch (id.FBFormat()) {
switch (id.fbFormat) {
case GE_FORMAT_565: desc += "5650:"; break;
case GE_FORMAT_5551: desc += "5551:"; break;
case GE_FORMAT_4444: desc += "4444:"; break;
Expand Down
58 changes: 46 additions & 12 deletions GPU/Software/Rasterizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -474,33 +474,67 @@ Vec3<int> AlphaBlendingResult(const PixelFuncID &pixelID, const Vec4<int> &sourc
case GE_BLENDMODE_MUL_AND_ADD:
{
#if defined(_M_SSE)
const __m128 s = _mm_mul_ps(_mm_cvtepi32_ps(source.ivec), _mm_cvtepi32_ps(srcfactor.ivec));
const __m128 d = _mm_mul_ps(_mm_cvtepi32_ps(dst.ivec), _mm_cvtepi32_ps(dstfactor.ivec));
return Vec3<int>(_mm_cvtps_epi32(_mm_mul_ps(_mm_add_ps(s, d), _mm_set_ps1(1.0f / 255.0f))));
// We switch to 16 bit to use mulhi, and give each operand 4 fractional bits so that mulhi's implicit 16 bit shift is free (leaving the product divided by 256).
const __m128i half = _mm_set1_epi16(1 << 3);

const __m128i srgb = _mm_add_epi16(_mm_slli_epi16(_mm_packs_epi32(source.ivec, source.ivec), 4), half);
const __m128i sf = _mm_add_epi16(_mm_slli_epi16(_mm_packs_epi32(srcfactor.ivec, srcfactor.ivec), 4), half);
const __m128i s = _mm_mulhi_epi16(srgb, sf);

const __m128i drgb = _mm_add_epi16(_mm_slli_epi16(_mm_packs_epi32(dst.ivec, dst.ivec), 4), half);
const __m128i df = _mm_add_epi16(_mm_slli_epi16(_mm_packs_epi32(dstfactor.ivec, dstfactor.ivec), 4), half);
const __m128i d = _mm_mulhi_epi16(drgb, df);

return Vec3<int>(_mm_unpacklo_epi16(_mm_adds_epi16(s, d), _mm_setzero_si128()));
#else
return (source.rgb() * srcfactor + dst.rgb() * dstfactor) / 255;
Vec3<int> half = Vec3<int>::AssignToAll(1);
Vec3<int> lhs = ((source.rgb() * 2 + half) * (srcfactor * 2 + half)) / 1024;
Vec3<int> rhs = ((dst.rgb() * 2 + half) * (dstfactor * 2 + half)) / 1024;
return lhs + rhs;
#endif
}

case GE_BLENDMODE_MUL_AND_SUBTRACT:
{
#if defined(_M_SSE)
const __m128 s = _mm_mul_ps(_mm_cvtepi32_ps(source.ivec), _mm_cvtepi32_ps(srcfactor.ivec));
const __m128 d = _mm_mul_ps(_mm_cvtepi32_ps(dst.ivec), _mm_cvtepi32_ps(dstfactor.ivec));
return Vec3<int>(_mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(s, d), _mm_set_ps1(1.0f / 255.0f))));
const __m128i half = _mm_set1_epi16(1 << 3);

const __m128i srgb = _mm_add_epi16(_mm_slli_epi16(_mm_packs_epi32(source.ivec, source.ivec), 4), half);
const __m128i sf = _mm_add_epi16(_mm_slli_epi16(_mm_packs_epi32(srcfactor.ivec, srcfactor.ivec), 4), half);
const __m128i s = _mm_mulhi_epi16(srgb, sf);

const __m128i drgb = _mm_add_epi16(_mm_slli_epi16(_mm_packs_epi32(dst.ivec, dst.ivec), 4), half);
const __m128i df = _mm_add_epi16(_mm_slli_epi16(_mm_packs_epi32(dstfactor.ivec, dstfactor.ivec), 4), half);
const __m128i d = _mm_mulhi_epi16(drgb, df);

return Vec3<int>(_mm_unpacklo_epi16(_mm_max_epi16(_mm_subs_epi16(s, d), _mm_setzero_si128()), _mm_setzero_si128()));
#else
return (source.rgb() * srcfactor - dst.rgb() * dstfactor) / 255;
Vec3<int> half = Vec3<int>::AssignToAll(1);
Vec3<int> lhs = ((source.rgb() * 2 + half) * (srcfactor * 2 + half)) / 1024;
Vec3<int> rhs = ((dst.rgb() * 2 + half) * (dstfactor * 2 + half)) / 1024;
return lhs - rhs;
#endif
}

case GE_BLENDMODE_MUL_AND_SUBTRACT_REVERSE:
{
#if defined(_M_SSE)
const __m128 s = _mm_mul_ps(_mm_cvtepi32_ps(source.ivec), _mm_cvtepi32_ps(srcfactor.ivec));
const __m128 d = _mm_mul_ps(_mm_cvtepi32_ps(dst.ivec), _mm_cvtepi32_ps(dstfactor.ivec));
return Vec3<int>(_mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(d, s), _mm_set_ps1(1.0f / 255.0f))));
const __m128i half = _mm_set1_epi16(1 << 3);

const __m128i srgb = _mm_add_epi16(_mm_slli_epi16(_mm_packs_epi32(source.ivec, source.ivec), 4), half);
const __m128i sf = _mm_add_epi16(_mm_slli_epi16(_mm_packs_epi32(srcfactor.ivec, srcfactor.ivec), 4), half);
const __m128i s = _mm_mulhi_epi16(srgb, sf);

const __m128i drgb = _mm_add_epi16(_mm_slli_epi16(_mm_packs_epi32(dst.ivec, dst.ivec), 4), half);
const __m128i df = _mm_add_epi16(_mm_slli_epi16(_mm_packs_epi32(dstfactor.ivec, dstfactor.ivec), 4), half);
const __m128i d = _mm_mulhi_epi16(drgb, df);

return Vec3<int>(_mm_unpacklo_epi16(_mm_max_epi16(_mm_subs_epi16(d, s), _mm_setzero_si128()), _mm_setzero_si128()));
#else
return (dst.rgb() * dstfactor - source.rgb() * srcfactor) / 255;
Vec3<int> half = Vec3<int>::AssignToAll(1);
Vec3<int> lhs = ((source.rgb() * 2 + half) * (srcfactor * 2 + half)) / 1024;
Vec3<int> rhs = ((dst.rgb() * 2 + half) * (dstfactor * 2 + half)) / 1024;
return rhs - lhs;
#endif
}

Expand Down
2 changes: 1 addition & 1 deletion GPU/Software/SoftGpu.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1015,5 +1015,5 @@ bool SoftGPU::DescribeCodePtr(const u8 *ptr, std::string &name) {
name = "RasterizerJit:" + subname;
return true;
}
return false;
return GPUCommon::DescribeCodePtr(ptr, name);
}

0 comments on commit daae09b

Please sign in to comment.