Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Correct some alpha/stencil/blend issues in softgpu #15171

Merged
merged 8 commits into from
Nov 26, 2021
11 changes: 7 additions & 4 deletions GPU/Software/DrawPixel.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,8 @@ static inline void SetPixelDepth(int x, int y, u16 value) {
static inline u32 GetPixelColor(GEBufferFormat fmt, int x, int y) {
switch (fmt) {
case GE_FORMAT_565:
return RGB565ToRGBA8888(fb.Get16(x, y, gstate.FrameBufStride()));
// A should be zero for the purposes of alpha blending.
return RGB565ToRGBA8888(fb.Get16(x, y, gstate.FrameBufStride())) & 0x00FFFFFF;

case GE_FORMAT_5551:
return RGBA5551ToRGBA8888(fb.Get16(x, y, gstate.FrameBufStride()));
Expand Down Expand Up @@ -179,7 +180,7 @@ static inline bool StencilTestPassed(const PixelFuncID &pixelID, u8 stencil) {
if (pixelID.hasStencilTestMask)
stencil &= gstate.getStencilTestMask();
u8 ref = pixelID.stencilTestRef;
switch (GEComparison(pixelID.stencilTestFunc)) {
switch (pixelID.StencilTestFunc()) {
case GE_COMP_NEVER:
return false;

Expand Down Expand Up @@ -246,6 +247,8 @@ static inline u8 ApplyStencilOp(GEBufferFormat fmt, GEStencilOp op, u8 old_stenc
if (old_stencil >= 0x10)
return old_stencil - 0x10;
break;
case GE_FORMAT_5551:
return 0;
default:
if (old_stencil != 0)
return old_stencil - 1;
Expand Down Expand Up @@ -460,7 +463,7 @@ inline void DrawSinglePixel(int x, int y, int z, int fog, const Vec4<int> &color

SingleFunc GetSingleFunc(const PixelFuncID &id) {
if (id.clearMode) {
switch (id.FBFormat()) {
switch (id.fbFormat) {
case GE_FORMAT_565:
return &DrawSinglePixel<true, GE_FORMAT_565>;
case GE_FORMAT_5551:
Expand All @@ -471,7 +474,7 @@ SingleFunc GetSingleFunc(const PixelFuncID &id) {
return &DrawSinglePixel<true, GE_FORMAT_8888>;
}
}
switch (id.FBFormat()) {
switch (id.fbFormat) {
case GE_FORMAT_565:
return &DrawSinglePixel<false, GE_FORMAT_565>;
case GE_FORMAT_5551:
Expand Down
20 changes: 14 additions & 6 deletions GPU/Software/FuncId.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@ void ComputePixelFuncID(PixelFuncID *id) {
id->colorTest = gstate.isClearModeColorMask();
id->stencilTest = gstate.isClearModeAlphaMask();
id->depthWrite = gstate.isClearModeDepthMask();
id->depthTestFunc = GE_COMP_ALWAYS;
id->alphaTestFunc = GE_COMP_ALWAYS;
} else {
id->colorTest = gstate.isColorTestEnabled() && gstate.getColorTestFunction() != GE_COMP_ALWAYS;
if (gstate.isStencilTestEnabled() && gstate.getStencilTestFunction() == GE_COMP_ALWAYS) {
Expand All @@ -55,10 +57,15 @@ void ComputePixelFuncID(PixelFuncID *id) {
if (id->stencilTest) {
id->stencilTestFunc = gstate.getStencilTestFunction();
id->stencilTestRef = gstate.getStencilTestRef() & gstate.getStencilTestMask();
id->hasStencilTestMask = gstate.getStencilTestMask() != 0xFF;
id->sFail = gstate.getStencilOpSFail();
id->zFail = gstate.isDepthTestEnabled() ? gstate.getStencilOpZFail() : GE_STENCILOP_KEEP;
id->zPass = gstate.getStencilOpZPass();
id->hasStencilTestMask = gstate.getStencilTestMask() != 0xFF && gstate.FrameBufFormat() != GE_FORMAT_565;

// Stencil can't be written on 565, and any invalid op acts like KEEP, which is 0.
if (gstate.FrameBufFormat() != GE_FORMAT_565 && gstate.getStencilOpSFail() <= GE_STENCILOP_DECR)
id->sFail = gstate.getStencilOpSFail();
if (gstate.FrameBufFormat() != GE_FORMAT_565 && gstate.getStencilOpZFail() <= GE_STENCILOP_DECR)
id->zFail = gstate.isDepthTestEnabled() ? gstate.getStencilOpZFail() : GE_STENCILOP_KEEP;
if (gstate.FrameBufFormat() != GE_FORMAT_565 && gstate.getStencilOpZPass() <= GE_STENCILOP_DECR)
id->zPass = gstate.getStencilOpZPass();
}

id->depthTestFunc = gstate.isDepthTestEnabled() ? gstate.getDepthTestFunction() : GE_COMP_ALWAYS;
Expand All @@ -68,7 +75,8 @@ void ComputePixelFuncID(PixelFuncID *id) {
id->hasAlphaTestMask = gstate.getAlphaTestMask() != 0xFF;
}

id->alphaBlend = gstate.isAlphaBlendEnabled();
// If invalid (6 or 7), doesn't do any blending, so force off.
id->alphaBlend = gstate.isAlphaBlendEnabled() && gstate.getBlendEq() <= 5;
// Force it off if the factors are constant and don't blend. Some games use this...
if (id->alphaBlend && gstate.getBlendEq() == GE_BLENDMODE_MUL_AND_ADD) {
bool srcFixedOne = gstate.getBlendFuncA() == GE_SRCBLEND_FIXA && gstate.getFixA() == 0x00FFFFFF;
Expand Down Expand Up @@ -108,7 +116,7 @@ std::string DescribePixelFuncID(const PixelFuncID &id) {
if (id.applyColorWriteMask)
desc += "Msk:";

switch (id.FBFormat()) {
switch (id.fbFormat) {
case GE_FORMAT_565: desc += "5650:"; break;
case GE_FORMAT_5551: desc += "5551:"; break;
case GE_FORMAT_4444: desc += "4444:"; break;
Expand Down
58 changes: 46 additions & 12 deletions GPU/Software/Rasterizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -474,33 +474,67 @@ Vec3<int> AlphaBlendingResult(const PixelFuncID &pixelID, const Vec4<int> &sourc
case GE_BLENDMODE_MUL_AND_ADD:
{
#if defined(_M_SSE)
const __m128 s = _mm_mul_ps(_mm_cvtepi32_ps(source.ivec), _mm_cvtepi32_ps(srcfactor.ivec));
const __m128 d = _mm_mul_ps(_mm_cvtepi32_ps(dst.ivec), _mm_cvtepi32_ps(dstfactor.ivec));
return Vec3<int>(_mm_cvtps_epi32(_mm_mul_ps(_mm_add_ps(s, d), _mm_set_ps1(1.0f / 255.0f))));
// We switch to 16-bit lanes to use mulhi, and keep 4 fractional bits per operand so the 16-bit shift that mulhi performs comes for free.
const __m128i half = _mm_set1_epi16(1 << 3);

const __m128i srgb = _mm_add_epi16(_mm_slli_epi16(_mm_packs_epi32(source.ivec, source.ivec), 4), half);
const __m128i sf = _mm_add_epi16(_mm_slli_epi16(_mm_packs_epi32(srcfactor.ivec, srcfactor.ivec), 4), half);
const __m128i s = _mm_mulhi_epi16(srgb, sf);

const __m128i drgb = _mm_add_epi16(_mm_slli_epi16(_mm_packs_epi32(dst.ivec, dst.ivec), 4), half);
const __m128i df = _mm_add_epi16(_mm_slli_epi16(_mm_packs_epi32(dstfactor.ivec, dstfactor.ivec), 4), half);
const __m128i d = _mm_mulhi_epi16(drgb, df);

return Vec3<int>(_mm_unpacklo_epi16(_mm_adds_epi16(s, d), _mm_setzero_si128()));
#else
return (source.rgb() * srcfactor + dst.rgb() * dstfactor) / 255;
Vec3<int> half = Vec3<int>::AssignToAll(1);
Vec3<int> lhs = ((source.rgb() * 2 + half) * (srcfactor * 2 + half)) / 1024;
Vec3<int> rhs = ((dst.rgb() * 2 + half) * (dstfactor * 2 + half)) / 1024;
return lhs + rhs;
#endif
}

case GE_BLENDMODE_MUL_AND_SUBTRACT:
{
#if defined(_M_SSE)
const __m128 s = _mm_mul_ps(_mm_cvtepi32_ps(source.ivec), _mm_cvtepi32_ps(srcfactor.ivec));
const __m128 d = _mm_mul_ps(_mm_cvtepi32_ps(dst.ivec), _mm_cvtepi32_ps(dstfactor.ivec));
return Vec3<int>(_mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(s, d), _mm_set_ps1(1.0f / 255.0f))));
const __m128i half = _mm_set1_epi16(1 << 3);

const __m128i srgb = _mm_add_epi16(_mm_slli_epi16(_mm_packs_epi32(source.ivec, source.ivec), 4), half);
const __m128i sf = _mm_add_epi16(_mm_slli_epi16(_mm_packs_epi32(srcfactor.ivec, srcfactor.ivec), 4), half);
const __m128i s = _mm_mulhi_epi16(srgb, sf);

const __m128i drgb = _mm_add_epi16(_mm_slli_epi16(_mm_packs_epi32(dst.ivec, dst.ivec), 4), half);
const __m128i df = _mm_add_epi16(_mm_slli_epi16(_mm_packs_epi32(dstfactor.ivec, dstfactor.ivec), 4), half);
const __m128i d = _mm_mulhi_epi16(drgb, df);

return Vec3<int>(_mm_unpacklo_epi16(_mm_max_epi16(_mm_subs_epi16(s, d), _mm_setzero_si128()), _mm_setzero_si128()));
#else
return (source.rgb() * srcfactor - dst.rgb() * dstfactor) / 255;
Vec3<int> half = Vec3<int>::AssignToAll(1);
Vec3<int> lhs = ((source.rgb() * 2 + half) * (srcfactor * 2 + half)) / 1024;
Vec3<int> rhs = ((dst.rgb() * 2 + half) * (dstfactor * 2 + half)) / 1024;
return lhs - rhs;
#endif
}

case GE_BLENDMODE_MUL_AND_SUBTRACT_REVERSE:
{
#if defined(_M_SSE)
const __m128 s = _mm_mul_ps(_mm_cvtepi32_ps(source.ivec), _mm_cvtepi32_ps(srcfactor.ivec));
const __m128 d = _mm_mul_ps(_mm_cvtepi32_ps(dst.ivec), _mm_cvtepi32_ps(dstfactor.ivec));
return Vec3<int>(_mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(d, s), _mm_set_ps1(1.0f / 255.0f))));
const __m128i half = _mm_set1_epi16(1 << 3);

const __m128i srgb = _mm_add_epi16(_mm_slli_epi16(_mm_packs_epi32(source.ivec, source.ivec), 4), half);
const __m128i sf = _mm_add_epi16(_mm_slli_epi16(_mm_packs_epi32(srcfactor.ivec, srcfactor.ivec), 4), half);
const __m128i s = _mm_mulhi_epi16(srgb, sf);

const __m128i drgb = _mm_add_epi16(_mm_slli_epi16(_mm_packs_epi32(dst.ivec, dst.ivec), 4), half);
const __m128i df = _mm_add_epi16(_mm_slli_epi16(_mm_packs_epi32(dstfactor.ivec, dstfactor.ivec), 4), half);
const __m128i d = _mm_mulhi_epi16(drgb, df);

return Vec3<int>(_mm_unpacklo_epi16(_mm_max_epi16(_mm_subs_epi16(d, s), _mm_setzero_si128()), _mm_setzero_si128()));
#else
return (dst.rgb() * dstfactor - source.rgb() * srcfactor) / 255;
Vec3<int> half = Vec3<int>::AssignToAll(1);
Vec3<int> lhs = ((source.rgb() * 2 + half) * (srcfactor * 2 + half)) / 1024;
Vec3<int> rhs = ((dst.rgb() * 2 + half) * (dstfactor * 2 + half)) / 1024;
return rhs - lhs;
#endif
}

Expand Down
2 changes: 1 addition & 1 deletion GPU/Software/SoftGpu.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1015,5 +1015,5 @@ bool SoftGPU::DescribeCodePtr(const u8 *ptr, std::string &name) {
name = "RasterizerJit:" + subname;
return true;
}
return false;
return GPUCommon::DescribeCodePtr(ptr, name);
}