hrydgard · hrydgard · Nov 26, 2021 · Nov 22, 2021 · Nov 22, 2021 · Nov 23, 2021
diff --git a/GPU/Software/DrawPixel.cpp b/GPU/Software/DrawPixel.cpp
@@ -83,7 +83,8 @@ static inline void SetPixelDepth(int x, int y, u16 value) {
 static inline u32 GetPixelColor(GEBufferFormat fmt, int x, int y) {
 	switch (fmt) {
 	case GE_FORMAT_565:
-		return RGB565ToRGBA8888(fb.Get16(x, y, gstate.FrameBufStride()));
+		// Mask off the top byte: A should read as zero for the purposes of alpha blending.
+		return RGB565ToRGBA8888(fb.Get16(x, y, gstate.FrameBufStride())) & 0x00FFFFFF;
 
 	case GE_FORMAT_5551:
 		return RGBA5551ToRGBA8888(fb.Get16(x, y, gstate.FrameBufStride()));
@@ -179,7 +180,7 @@ static inline bool StencilTestPassed(const PixelFuncID &pixelID, u8 stencil) {
 	if (pixelID.hasStencilTestMask)
 		stencil &= gstate.getStencilTestMask();
 	u8 ref = pixelID.stencilTestRef;
-	switch (GEComparison(pixelID.stencilTestFunc)) {
+	switch (pixelID.StencilTestFunc()) {
 	case GE_COMP_NEVER:
 		return false;
 
@@ -246,6 +247,8 @@ static inline u8 ApplyStencilOp(GEBufferFormat fmt, GEStencilOp op, u8 old_stenc
 			if (old_stencil >= 0x10)
 				return old_stencil - 0x10;
 			break;
+		case GE_FORMAT_5551:
+			return 0;
 		default:
 			if (old_stencil != 0)
 				return old_stencil - 1;
@@ -460,7 +463,7 @@ inline void DrawSinglePixel(int x, int y, int z, int fog, const Vec4<int> &color
 
 SingleFunc GetSingleFunc(const PixelFuncID &id) {
 	if (id.clearMode) {
-		switch (id.FBFormat()) {
+		switch (id.fbFormat) {
 		case GE_FORMAT_565:
 			return &DrawSinglePixel<true, GE_FORMAT_565>;
 		case GE_FORMAT_5551:
@@ -471,7 +474,7 @@ SingleFunc GetSingleFunc(const PixelFuncID &id) {
 			return &DrawSinglePixel<true, GE_FORMAT_8888>;
 		}
 	}
-	switch (id.FBFormat()) {
+	switch (id.fbFormat) {
 	case GE_FORMAT_565:
 		return &DrawSinglePixel<false, GE_FORMAT_565>;
 	case GE_FORMAT_5551:

diff --git a/GPU/Software/FuncId.cpp b/GPU/Software/FuncId.cpp
@@ -38,6 +38,8 @@ void ComputePixelFuncID(PixelFuncID *id) {
 		id->colorTest = gstate.isClearModeColorMask();
 		id->stencilTest = gstate.isClearModeAlphaMask();
 		id->depthWrite = gstate.isClearModeDepthMask();
+		id->depthTestFunc = GE_COMP_ALWAYS;
+		id->alphaTestFunc = GE_COMP_ALWAYS;
 	} else {
 		id->colorTest = gstate.isColorTestEnabled() && gstate.getColorTestFunction() != GE_COMP_ALWAYS;
 		if (gstate.isStencilTestEnabled() && gstate.getStencilTestFunction() == GE_COMP_ALWAYS) {
@@ -55,10 +57,15 @@ void ComputePixelFuncID(PixelFuncID *id) {
 		if (id->stencilTest) {
 			id->stencilTestFunc = gstate.getStencilTestFunction();
 			id->stencilTestRef = gstate.getStencilTestRef() & gstate.getStencilTestMask();
-			id->hasStencilTestMask = gstate.getStencilTestMask() != 0xFF;
-			id->sFail = gstate.getStencilOpSFail();
-			id->zFail = gstate.isDepthTestEnabled() ? gstate.getStencilOpZFail() : GE_STENCILOP_KEEP;
-			id->zPass = gstate.getStencilOpZPass();
+			id->hasStencilTestMask = gstate.getStencilTestMask() != 0xFF && gstate.FrameBufFormat() != GE_FORMAT_565;
+
+			// Stencil can't be written on 565, and any invalid op acts like KEEP.  KEEP is 0, so those cases just leave the field unset (assumes the id starts zeroed - TODO confirm).
+			if (gstate.FrameBufFormat() != GE_FORMAT_565 && gstate.getStencilOpSFail() <= GE_STENCILOP_DECR)
+				id->sFail = gstate.getStencilOpSFail();
+			if (gstate.FrameBufFormat() != GE_FORMAT_565 && gstate.getStencilOpZFail() <= GE_STENCILOP_DECR)
+				id->zFail = gstate.isDepthTestEnabled() ? gstate.getStencilOpZFail() : GE_STENCILOP_KEEP;
+			if (gstate.FrameBufFormat() != GE_FORMAT_565 && gstate.getStencilOpZPass() <= GE_STENCILOP_DECR)
+				id->zPass = gstate.getStencilOpZPass();
 		}
 
 		id->depthTestFunc = gstate.isDepthTestEnabled() ? gstate.getDepthTestFunction() : GE_COMP_ALWAYS;
@@ -68,7 +75,8 @@ void ComputePixelFuncID(PixelFuncID *id) {
 			id->hasAlphaTestMask = gstate.getAlphaTestMask() != 0xFF;
 		}
 
-		id->alphaBlend = gstate.isAlphaBlendEnabled();
+		// An invalid blend equation (6 or 7) doesn't do any blending, so force it off.
+		id->alphaBlend = gstate.isAlphaBlendEnabled() && gstate.getBlendEq() <= 5;
 		// Force it off if the factors are constant and don't blend.  Some games use this...
 		if (id->alphaBlend && gstate.getBlendEq() == GE_BLENDMODE_MUL_AND_ADD) {
 			bool srcFixedOne = gstate.getBlendFuncA() == GE_SRCBLEND_FIXA && gstate.getFixA() == 0x00FFFFFF;
@@ -108,7 +116,7 @@ std::string DescribePixelFuncID(const PixelFuncID &id) {
 	if (id.applyColorWriteMask)
 		desc += "Msk:";
 
-	switch (id.FBFormat()) {
+	switch (id.fbFormat) {
 	case GE_FORMAT_565: desc += "5650:"; break;
 	case GE_FORMAT_5551: desc += "5551:"; break;
 	case GE_FORMAT_4444: desc += "4444:"; break;

diff --git a/GPU/Software/Rasterizer.cpp b/GPU/Software/Rasterizer.cpp
@@ -474,33 +474,67 @@ Vec3<int> AlphaBlendingResult(const PixelFuncID &pixelID, const Vec4<int> &sourc
 	case GE_BLENDMODE_MUL_AND_ADD:
 	{
 #if defined(_M_SSE)
-		const __m128 s = _mm_mul_ps(_mm_cvtepi32_ps(source.ivec), _mm_cvtepi32_ps(srcfactor.ivec));
-		const __m128 d = _mm_mul_ps(_mm_cvtepi32_ps(dst.ivec), _mm_cvtepi32_ps(dstfactor.ivec));
-		return Vec3<int>(_mm_cvtps_epi32(_mm_mul_ps(_mm_add_ps(s, d), _mm_set_ps1(1.0f / 255.0f))));
+		// We switch to 16 bit to use mulhi, and we use 4 fractional bits so that mulhi's 16 bit right shift comes for free.
+		const __m128i half = _mm_set1_epi16(1 << 3);
+
+		const __m128i srgb = _mm_add_epi16(_mm_slli_epi16(_mm_packs_epi32(source.ivec, source.ivec), 4), half);
+		const __m128i sf = _mm_add_epi16(_mm_slli_epi16(_mm_packs_epi32(srcfactor.ivec, srcfactor.ivec), 4), half);
+		const __m128i s = _mm_mulhi_epi16(srgb, sf);
+
+		const __m128i drgb = _mm_add_epi16(_mm_slli_epi16(_mm_packs_epi32(dst.ivec, dst.ivec), 4), half);
+		const __m128i df = _mm_add_epi16(_mm_slli_epi16(_mm_packs_epi32(dstfactor.ivec, dstfactor.ivec), 4), half);
+		const __m128i d = _mm_mulhi_epi16(drgb, df);
+
+		return Vec3<int>(_mm_unpacklo_epi16(_mm_adds_epi16(s, d), _mm_setzero_si128()));
 #else
-		return (source.rgb() * srcfactor + dst.rgb() * dstfactor) / 255;
+		Vec3<int> half = Vec3<int>::AssignToAll(1);
+		Vec3<int> lhs = ((source.rgb() * 2 + half) * (srcfactor * 2 + half)) / 1024;
+		Vec3<int> rhs = ((dst.rgb() * 2 + half) * (dstfactor * 2 + half)) / 1024;
+		return lhs + rhs;
 #endif
 	}
 
 	case GE_BLENDMODE_MUL_AND_SUBTRACT:
 	{
 #if defined(_M_SSE)
-		const __m128 s = _mm_mul_ps(_mm_cvtepi32_ps(source.ivec), _mm_cvtepi32_ps(srcfactor.ivec));
-		const __m128 d = _mm_mul_ps(_mm_cvtepi32_ps(dst.ivec), _mm_cvtepi32_ps(dstfactor.ivec));
-		return Vec3<int>(_mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(s, d), _mm_set_ps1(1.0f / 255.0f))));
+		const __m128i half = _mm_set1_epi16(1 << 3);
+
+		const __m128i srgb = _mm_add_epi16(_mm_slli_epi16(_mm_packs_epi32(source.ivec, source.ivec), 4), half);
+		const __m128i sf = _mm_add_epi16(_mm_slli_epi16(_mm_packs_epi32(srcfactor.ivec, srcfactor.ivec), 4), half);
+		const __m128i s = _mm_mulhi_epi16(srgb, sf);
+
+		const __m128i drgb = _mm_add_epi16(_mm_slli_epi16(_mm_packs_epi32(dst.ivec, dst.ivec), 4), half);
+		const __m128i df = _mm_add_epi16(_mm_slli_epi16(_mm_packs_epi32(dstfactor.ivec, dstfactor.ivec), 4), half);
+		const __m128i d = _mm_mulhi_epi16(drgb, df);
+
+		return Vec3<int>(_mm_unpacklo_epi16(_mm_max_epi16(_mm_subs_epi16(s, d), _mm_setzero_si128()), _mm_setzero_si128()));
 #else
-		return (source.rgb() * srcfactor - dst.rgb() * dstfactor) / 255;
+		Vec3<int> half = Vec3<int>::AssignToAll(1);
+		Vec3<int> lhs = ((source.rgb() * 2 + half) * (srcfactor * 2 + half)) / 1024;
+		Vec3<int> rhs = ((dst.rgb() * 2 + half) * (dstfactor * 2 + half)) / 1024;
+		return lhs - rhs;
 #endif
 	}
 
 	case GE_BLENDMODE_MUL_AND_SUBTRACT_REVERSE:
 	{
 #if defined(_M_SSE)
-		const __m128 s = _mm_mul_ps(_mm_cvtepi32_ps(source.ivec), _mm_cvtepi32_ps(srcfactor.ivec));
-		const __m128 d = _mm_mul_ps(_mm_cvtepi32_ps(dst.ivec), _mm_cvtepi32_ps(dstfactor.ivec));
-		return Vec3<int>(_mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(d, s), _mm_set_ps1(1.0f / 255.0f))));
+		const __m128i half = _mm_set1_epi16(1 << 3);
+
+		const __m128i srgb = _mm_add_epi16(_mm_slli_epi16(_mm_packs_epi32(source.ivec, source.ivec), 4), half);
+		const __m128i sf = _mm_add_epi16(_mm_slli_epi16(_mm_packs_epi32(srcfactor.ivec, srcfactor.ivec), 4), half);
+		const __m128i s = _mm_mulhi_epi16(srgb, sf);
+
+		const __m128i drgb = _mm_add_epi16(_mm_slli_epi16(_mm_packs_epi32(dst.ivec, dst.ivec), 4), half);
+		const __m128i df = _mm_add_epi16(_mm_slli_epi16(_mm_packs_epi32(dstfactor.ivec, dstfactor.ivec), 4), half);
+		const __m128i d = _mm_mulhi_epi16(drgb, df);
+
+		return Vec3<int>(_mm_unpacklo_epi16(_mm_max_epi16(_mm_subs_epi16(d, s), _mm_setzero_si128()), _mm_setzero_si128()));
 #else
-		return (dst.rgb() * dstfactor - source.rgb() * srcfactor) / 255;
+		Vec3<int> half = Vec3<int>::AssignToAll(1);
+		Vec3<int> lhs = ((source.rgb() * 2 + half) * (srcfactor * 2 + half)) / 1024;
+		Vec3<int> rhs = ((dst.rgb() * 2 + half) * (dstfactor * 2 + half)) / 1024;
+		return rhs - lhs;
 #endif
 	}
 

diff --git a/GPU/Software/SoftGpu.cpp b/GPU/Software/SoftGpu.cpp
@@ -1015,5 +1015,5 @@ bool SoftGPU::DescribeCodePtr(const u8 *ptr, std::string &name) {
 		name = "RasterizerJit:" + subname;
 		return true;
 	}
-	return false;
+	return GPUCommon::DescribeCodePtr(ptr, name);
 }