From 795de9b164296b6f112ac9be25575b3c0843e7d7 Mon Sep 17 00:00:00 2001
From: "Unknown W. Brackets"
Date: Sat, 24 Jun 2023 12:36:07 -0700
Subject: [PATCH 1/3] softgpu: Use SIMD for more Vec4 casts.

A number of these were falling back to some pretty terrible code.
Thanks to fp64 for noticing.
---
 GPU/Math3D.h | 21 +++++++++++++++++++--
 1 file changed, 19 insertions(+), 2 deletions(-)

diff --git a/GPU/Math3D.h b/GPU/Math3D.h
index 6309801c4e49..96e3ae61eba6 100644
--- a/GPU/Math3D.h
+++ b/GPU/Math3D.h
@@ -580,8 +580,25 @@ class Vec4
 #endif
 
 	template <typename T2>
-	Vec4<T2> Cast() const
-	{
+	Vec4<T2> Cast() const {
+		if constexpr (std::is_same<T, float>::value && std::is_same<T2, int>::value) {
+#if defined(_M_SSE) && !PPSSPP_ARCH(X86)
+			return _mm_cvtps_epi32(vec);
+#elif defined(_M_SSE)
+			return _mm_cvtps_epi32(_mm_loadu_ps((float *)&vec));
+#elif PPSSPP_ARCH(ARM64_NEON)
+			return vcvtq_s32_f32(vec);
+#endif
+		}
+		if constexpr (std::is_same<T, int>::value && std::is_same<T2, float>::value) {
+#if defined(_M_SSE) && !PPSSPP_ARCH(X86)
+			return _mm_cvtepi32_ps(ivec);
+#elif defined(_M_SSE)
+			return _mm_cvtepi32_ps(_mm_loadu_si128(&ivec));
+#elif PPSSPP_ARCH(ARM64_NEON)
+			return vcvtq_f32_s32(ivec);
+#endif
+		}
 		return Vec4<T2>((T2)x, (T2)y, (T2)z, (T2)w);
 	}

From ae9d34370e35e40bc27bd5a3df31687291f81b93 Mon Sep 17 00:00:00 2001
From: "Unknown W. Brackets"
Date: Sat, 24 Jun 2023 12:37:08 -0700
Subject: [PATCH 2/3] softgpu: Move wsum_recip out of the triangle loop.

Seems like a small benefit, but not seeing any issues from this.
Noticed by fp64.
---
 GPU/Software/Rasterizer.cpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/GPU/Software/Rasterizer.cpp b/GPU/Software/Rasterizer.cpp
index dbb53dea7648..4e375d16a74c 100644
--- a/GPU/Software/Rasterizer.cpp
+++ b/GPU/Software/Rasterizer.cpp
@@ -901,6 +901,9 @@ void DrawTriangleSlice(
 	Vec4<int> w1_base = e1.Start(v2.screenpos, v0.screenpos, pprime);
 	Vec4<int> w2_base = e2.Start(v0.screenpos, v1.screenpos, pprime);
 
+	// The sum of weights should remain constant as we move toward/away from the edges.
+	const Vec4<float> wsum_recip = EdgeRecip(w0_base, w1_base, w2_base);
+
 	// All the z values are the same, no interpolation required.
 	// This is common, and when we interpolate, we lose accuracy.
 	const bool flatZ = v0.screenpos.z == v1.screenpos.z && v0.screenpos.z == v2.screenpos.z;
@@ -963,8 +966,6 @@ void DrawTriangleSlice(
 			// If p is on or inside all edges, render pixel
 			Vec4<int> mask = MakeMask(w0, w1, w2, bias0, bias1, bias2, scissor_mask);
 			if (AnyMask<useSSE4>(mask)) {
-				Vec4<float> wsum_recip = EdgeRecip(w0, w1, w2);
-
 				Vec4<float> z;
 				if (flatZ) {
 					z = Vec4<float>::AssignToAll(v2.screenpos.z);
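The wsum_recip hoist in PATCH 2/3 works because, per the patch's own comment, the three edge weights vary across the triangle but their sum stays constant, so EdgeRecip over the base weights equals EdgeRecip over the per-pixel weights.

One note on the Cast() branches in PATCH 1/3: the three paths do not round identically. _mm_cvtps_epi32 converts using the current MXCSR rounding mode (round-to-nearest-even by default), while vcvtq_s32_f32 and the scalar (T2)x fallback truncate toward zero, so a value like 3.5f can convert differently on SSE than on NEON or the plain C++ path. The following standalone sketch (illustrative only, not part of the patch) makes the difference visible; _mm_cvttps_epi32 is the truncating SSE variant that matches the scalar cast:

#include <emmintrin.h>
#include <cstdio>

int main() {
	// Lanes are (1.25, 2.5, 3.5, -2.5) from low to high.
	__m128 v = _mm_set_ps(-2.5f, 3.5f, 2.5f, 1.25f);
	__m128i rounded = _mm_cvtps_epi32(v);     // nearest-even: 1, 2, 4, -2
	__m128i truncated = _mm_cvttps_epi32(v);  // toward zero:  1, 2, 3, -2

	int r[4], t[4];
	_mm_storeu_si128((__m128i *)r, rounded);
	_mm_storeu_si128((__m128i *)t, truncated);
	for (int i = 0; i < 4; ++i)
		printf("round=%d trunc=%d\n", r[i], t[i]);
	return 0;
}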
From 15b66ba6c0aae3278cc5560b89ecf55e1456b256 Mon Sep 17 00:00:00 2001
From: "Unknown W. Brackets"
Date: Sat, 24 Jun 2023 14:49:23 -0700
Subject: [PATCH 3/3] softgpu: Make SIMD on x86_32 a bit safer.

---
 GPU/Math3D.h | 112 ++++++++++++++++++++++++++++++++++++---------------
 1 file changed, 79 insertions(+), 33 deletions(-)

diff --git a/GPU/Math3D.h b/GPU/Math3D.h
index 96e3ae61eba6..ecb0aa27440f 100644
--- a/GPU/Math3D.h
+++ b/GPU/Math3D.h
@@ -585,7 +585,7 @@ class Vec4
 #if defined(_M_SSE) && !PPSSPP_ARCH(X86)
 			return _mm_cvtps_epi32(vec);
 #elif defined(_M_SSE)
-			return _mm_cvtps_epi32(_mm_loadu_ps((float *)&vec));
+			return _mm_cvtps_epi32(_mm_loadu_ps(&x));
 #elif PPSSPP_ARCH(ARM64_NEON)
 			return vcvtq_s32_f32(vec);
 #endif
@@ -928,10 +928,15 @@ inline void Vec3ByMatrix43(float vecOut[3], const float v[3], const float m[12])
 }
 
 inline Vec3f MATH3D_CALL Vec3ByMatrix43(const Vec3f v, const float m[12]) {
-#if defined(_M_SSE) && PPSSPP_ARCH(64BIT)
-	__m128 x = _mm_shuffle_ps(v.vec, v.vec, _MM_SHUFFLE(0, 0, 0, 0));
-	__m128 y = _mm_shuffle_ps(v.vec, v.vec, _MM_SHUFFLE(1, 1, 1, 1));
-	__m128 z = _mm_shuffle_ps(v.vec, v.vec, _MM_SHUFFLE(2, 2, 2, 2));
+#if defined(_M_SSE)
+#if PPSSPP_ARCH(X86)
+	const __m128 vv = _mm_loadu_ps(&v.x);
+#else
+	const __m128 vv = v.vec;
+#endif
+	__m128 x = _mm_shuffle_ps(vv, vv, _MM_SHUFFLE(0, 0, 0, 0));
+	__m128 y = _mm_shuffle_ps(vv, vv, _MM_SHUFFLE(1, 1, 1, 1));
+	__m128 z = _mm_shuffle_ps(vv, vv, _MM_SHUFFLE(2, 2, 2, 2));
 	return Vec3ByMatrix43Internal(x, y, z, m);
 #elif PPSSPP_ARCH(ARM64_NEON)
 	return Vec3ByMatrix43Internal(v.vec, m);
@@ -987,10 +992,15 @@ inline void Vec3ByMatrix44(float vecOut[4], const float v[3], const float m[16])
 }
 
 inline Vec4f MATH3D_CALL Vec3ByMatrix44(const Vec3f v, const float m[16]) {
-#if defined(_M_SSE) && PPSSPP_ARCH(64BIT)
-	__m128 x = _mm_shuffle_ps(v.vec, v.vec, _MM_SHUFFLE(0, 0, 0, 0));
-	__m128 y = _mm_shuffle_ps(v.vec, v.vec, _MM_SHUFFLE(1, 1, 1, 1));
-	__m128 z = _mm_shuffle_ps(v.vec, v.vec, _MM_SHUFFLE(2, 2, 2, 2));
+#if defined(_M_SSE)
+#if PPSSPP_ARCH(X86)
+	const __m128 vv = _mm_loadu_ps(&v.x);
+#else
+	const __m128 vv = v.vec;
+#endif
+	__m128 x = _mm_shuffle_ps(vv, vv, _MM_SHUFFLE(0, 0, 0, 0));
+	__m128 y = _mm_shuffle_ps(vv, vv, _MM_SHUFFLE(1, 1, 1, 1));
+	__m128 z = _mm_shuffle_ps(vv, vv, _MM_SHUFFLE(2, 2, 2, 2));
 	return Vec3ByMatrix44Internal(x, y, z, m);
 #elif PPSSPP_ARCH(ARM64_NEON)
 	return Vec3ByMatrix44Internal(v.vec, m);
@@ -1046,10 +1056,15 @@ inline void Norm3ByMatrix43(float vecOut[3], const float v[3], const float m[12])
 }
 
 inline Vec3f MATH3D_CALL Norm3ByMatrix43(const Vec3f v, const float m[12]) {
-#if defined(_M_SSE) && PPSSPP_ARCH(64BIT)
-	__m128 x = _mm_shuffle_ps(v.vec, v.vec, _MM_SHUFFLE(0, 0, 0, 0));
-	__m128 y = _mm_shuffle_ps(v.vec, v.vec, _MM_SHUFFLE(1, 1, 1, 1));
-	__m128 z = _mm_shuffle_ps(v.vec, v.vec, _MM_SHUFFLE(2, 2, 2, 2));
+#if defined(_M_SSE)
+#if PPSSPP_ARCH(X86)
+	const __m128 vv = _mm_loadu_ps(&v.x);
+#else
+	const __m128 vv = v.vec;
+#endif
+	__m128 x = _mm_shuffle_ps(vv, vv, _MM_SHUFFLE(0, 0, 0, 0));
+	__m128 y = _mm_shuffle_ps(vv, vv, _MM_SHUFFLE(1, 1, 1, 1));
+	__m128 z = _mm_shuffle_ps(vv, vv, _MM_SHUFFLE(2, 2, 2, 2));
 	return Norm3ByMatrix43Internal(x, y, z, m);
 #elif PPSSPP_ARCH(ARM64_NEON)
 	return Norm3ByMatrix43Internal(v.vec, m);
@@ -1213,7 +1228,7 @@ __forceinline unsigned int Vec3<float>::ToRGB() const
 #if PPSSPP_ARCH(64BIT)
 	__m128i c = _mm_cvtps_epi32(_mm_mul_ps(vec, _mm_set_ps1(255.0f)));
 #else
-	__m128i c = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps((float *)&vec), _mm_set_ps1(255.0f)));
+	__m128i c = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(&x), _mm_set_ps1(255.0f)));
 #endif
 	__m128i c16 = _mm_packs_epi32(c, c);
 	return _mm_cvtsi128_si32(_mm_packus_epi16(c16, c16)) & 0x00FFFFFF;
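A note on the X86 guards in this patch: the commit message only promises "a bit safer", but the apparent rationale is that 32-bit builds cannot assume these vector objects sit at 16-byte-aligned addresses. Reading the __m128 member directly invites the compiler to emit aligned movaps accesses, which fault on a misaligned address; _mm_loadu_ps(&v.x) reads the same four floats with no alignment assumption, and dropping the (float *)&vec cast is tidier besides. A minimal sketch of the pattern, using illustrative names (Float4, LoadFloat4) that are not from the patch:

#include <new>
#include <cstdio>
#include <xmmintrin.h>

// A 4-float struct with no 16-byte alignment guarantee, standing in for a
// vector embedded in a packed or heap-allocated structure.
struct Float4 {
	float x, y, z, w;
};

static __m128 LoadFloat4(const Float4 &f) {
	return _mm_loadu_ps(&f.x);  // unaligned load: safe at any address
}

int main() {
	// Deliberately construct a Float4 at a 4-byte-misaligned offset.
	alignas(16) unsigned char buf[sizeof(Float4) + 4];
	Float4 *f = new (buf + 4) Float4{1.0f, 2.0f, 3.0f, 4.0f};

	float out[4];
	_mm_storeu_ps(out, _mm_mul_ps(LoadFloat4(*f), _mm_set_ps1(2.0f)));
	printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);  // 2 4 6 8
	return 0;
}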
@@ -1297,7 +1312,7 @@ __forceinline unsigned int Vec4<float>::ToRGBA() const
 #if PPSSPP_ARCH(64BIT)
 	__m128i c = _mm_cvtps_epi32(_mm_mul_ps(vec, _mm_set_ps1(255.0f)));
 #else
-	__m128i c = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps((float *)&vec), _mm_set_ps1(255.0f)));
+	__m128i c = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(&x), _mm_set_ps1(255.0f)));
 #endif
 	__m128i c16 = _mm_packs_epi32(c, c);
 	return _mm_cvtsi128_si32(_mm_packus_epi16(c16, c16));
@@ -1343,60 +1358,91 @@ __forceinline void Vec4<float>::ToRGBA(u8 *rgba) const
 
 // Vec3 operation
 template<>
-inline void Vec3<float>::operator += (const Vec3<float> &other)
-{
+inline void Vec3<float>::operator += (const Vec3<float> &other) {
+#if PPSSPP_ARCH(X86)
+	*this = _mm_add_ps(_mm_loadu_ps(&x), _mm_loadu_ps(&other.x));
+#else
 	vec = _mm_add_ps(vec, other.vec);
+#endif
 }
 
 template<>
-inline Vec3<float> Vec3<float>::operator + (const Vec3<float> &other) const
-{
+inline Vec3<float> Vec3<float>::operator + (const Vec3<float> &other) const {
+#if PPSSPP_ARCH(X86)
+	return Vec3<float>(_mm_add_ps(_mm_loadu_ps(&x), _mm_loadu_ps(&other.x)));
+#else
 	return Vec3<float>(_mm_add_ps(vec, other.vec));
+#endif
 }
 
 template<>
-inline Vec3<float> Vec3<float>::operator * (const Vec3<float> &other) const
-{
+inline Vec3<float> Vec3<float>::operator * (const Vec3<float> &other) const {
+#if PPSSPP_ARCH(X86)
+	return Vec3<float>(_mm_mul_ps(_mm_loadu_ps(&x), _mm_loadu_ps(&other.x)));
+#else
 	return Vec3<float>(_mm_mul_ps(vec, other.vec));
+#endif
 }
 
 template<> template<>
-inline Vec3<float> Vec3<float>::operator * (const float &other) const
-{
+inline Vec3<float> Vec3<float>::operator * (const float &other) const {
+#if PPSSPP_ARCH(X86)
+	return Vec3<float>(_mm_mul_ps(_mm_loadu_ps(&x), _mm_set_ps1(other)));
+#else
 	return Vec3<float>(_mm_mul_ps(vec, _mm_set_ps1(other)));
+#endif
 }
 
 // Vec4 operation
 template<>
-inline void Vec4<float>::operator += (const Vec4<float> &other)
-{
+inline void Vec4<float>::operator += (const Vec4<float> &other) {
+#if PPSSPP_ARCH(X86)
+	*this = _mm_add_ps(_mm_loadu_ps(&x), _mm_loadu_ps(&other.x));
+#else
 	vec = _mm_add_ps(vec, other.vec);
+#endif
 }
 
 template<>
-inline Vec4<float> Vec4<float>::operator + (const Vec4<float> &other) const
-{
+inline Vec4<float> Vec4<float>::operator + (const Vec4<float> &other) const {
+#if PPSSPP_ARCH(X86)
+	return Vec4<float>(_mm_add_ps(_mm_loadu_ps(&x), _mm_loadu_ps(&other.x)));
+#else
 	return Vec4<float>(_mm_add_ps(vec, other.vec));
+#endif
 }
 
 template<>
-inline Vec4<float> Vec4<float>::operator * (const Vec4<float> &other) const
-{
+inline Vec4<float> Vec4<float>::operator * (const Vec4<float> &other) const {
+#if PPSSPP_ARCH(X86)
+	return Vec4<float>(_mm_mul_ps(_mm_loadu_ps(&x), _mm_loadu_ps(&other.x)));
+#else
 	return Vec4<float>(_mm_mul_ps(vec, other.vec));
+#endif
 }
 
 template<> template<>
-inline Vec4<float> Vec4<float>::operator * (const float &other) const
-{
+inline Vec4<float> Vec4<float>::operator * (const float &other) const {
+#if PPSSPP_ARCH(X86)
+	return Vec4<float>(_mm_mul_ps(_mm_loadu_ps(&x), _mm_set_ps1(other)));
+#else
 	return Vec4<float>(_mm_mul_ps(vec, _mm_set_ps1(other)));
+#endif
 }
 
 // Vec3 cross product
 template<>
 inline Vec3<float> Cross(const Vec3<float> &a, const Vec3<float> &b)
 {
-	const __m128 left = _mm_mul_ps(_mm_shuffle_ps(a.vec, a.vec, _MM_SHUFFLE(3, 0, 2, 1)), _mm_shuffle_ps(b.vec, b.vec, _MM_SHUFFLE(3, 1, 0, 2)));
-	const __m128 right = _mm_mul_ps(_mm_shuffle_ps(a.vec, a.vec, _MM_SHUFFLE(3, 1, 0, 2)), _mm_shuffle_ps(b.vec, b.vec, _MM_SHUFFLE(3, 0, 2, 1)));
+#if PPSSPP_ARCH(X86)
+	__m128 avec = _mm_loadu_ps(&a.x);
+	__m128 bvec = _mm_loadu_ps(&b.x);
+#else
+	__m128 avec = a.vec;
+	__m128 bvec = b.vec;
+#endif
+	const __m128 left = _mm_mul_ps(_mm_shuffle_ps(avec, avec, _MM_SHUFFLE(3, 0, 2, 1)), _mm_shuffle_ps(bvec, bvec, _MM_SHUFFLE(3, 1, 0, 2)));
+	const __m128 right = _mm_mul_ps(_mm_shuffle_ps(avec, avec, _MM_SHUFFLE(3, 1, 0, 2)), _mm_shuffle_ps(bvec, bvec, _MM_SHUFFLE(3, 0, 2, 1)));
 	return _mm_sub_ps(left, right);
 }
 
 #endif
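The operator overloads in PATCH 3/3 apply the same rule to stores: on X86 both operands come in through _mm_loadu_ps, and the result is written back via assignment to *this rather than a direct write to the vec member, so no aligned access is generated on either side. A condensed sketch of the idea, with SafeVec4 as an illustrative stand-in and an explicit _mm_storeu_ps in place of the patch's assignment round-trip:

#include <xmmintrin.h>

struct SafeVec4 {
	union {
		struct { float x, y, z, w; };
		__m128 vec;  // touched directly only where alignment is trusted
	};

	SafeVec4 &operator+=(const SafeVec4 &other) {
#if defined(_M_IX86) || defined(__i386__)  // 32-bit x86: assume no alignment
		_mm_storeu_ps(&x, _mm_add_ps(_mm_loadu_ps(&x), _mm_loadu_ps(&other.x)));
#else
		vec = _mm_add_ps(vec, other.vec);
#endif
		return *this;
	}
};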
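For the Cross() rewrite it helps to spell out the shuffle constants: _MM_SHUFFLE(3, 0, 2, 1) turns (x, y, z, w) into (y, z, x, w), and _MM_SHUFFLE(3, 1, 0, 2) turns it into (z, x, y, w), so left - right evaluates the textbook cross product (a.y*b.z - a.z*b.y, a.z*b.x - a.x*b.z, a.x*b.y - a.y*b.x) lane by lane, with the w lane cancelling to zero. A standalone check (illustrative, not part of the patch):

#include <xmmintrin.h>
#include <cstdio>

int main() {
	__m128 a = _mm_set_ps(0.0f, 3.0f, 2.0f, 1.0f);  // (x, y, z, w) = (1, 2, 3, 0)
	__m128 b = _mm_set_ps(0.0f, 6.0f, 5.0f, 4.0f);  // (x, y, z, w) = (4, 5, 6, 0)
	const __m128 left = _mm_mul_ps(_mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 0, 2, 1)),
	                               _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 1, 0, 2)));
	const __m128 right = _mm_mul_ps(_mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 1, 0, 2)),
	                                _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 0, 2, 1)));
	float c[4];
	_mm_storeu_ps(c, _mm_sub_ps(left, right));
	printf("%g %g %g\n", c[0], c[1], c[2]);  // (1,2,3) x (4,5,6) = (-3, 6, -3)
	return 0;
}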