From 795de9b164296b6f112ac9be25575b3c0843e7d7 Mon Sep 17 00:00:00 2001
From: "Unknown W. Brackets"
Date: Sat, 24 Jun 2023 12:36:07 -0700
Subject: [PATCH 1/3] softgpu: Use SIMD for more Vec4 casts.

A number of these were falling back to some pretty terrible code.
Thanks to fp64 for noticing.
---
 GPU/Math3D.h | 21 +++++++++++++++++++--
 1 file changed, 19 insertions(+), 2 deletions(-)

diff --git a/GPU/Math3D.h b/GPU/Math3D.h
index 6309801c4e49..96e3ae61eba6 100644
--- a/GPU/Math3D.h
+++ b/GPU/Math3D.h
@@ -580,8 +580,25 @@ class Vec4
 #endif
 
 	template <typename T2>
-	Vec4<T2> Cast() const
-	{
+	Vec4<T2> Cast() const {
+		if constexpr (std::is_same<T, float>::value && std::is_same<T2, int>::value) {
+#if defined(_M_SSE) && !PPSSPP_ARCH(X86)
+			return _mm_cvtps_epi32(vec);
+#elif defined(_M_SSE)
+			return _mm_cvtps_epi32(_mm_loadu_ps((float *)&vec));
+#elif PPSSPP_ARCH(ARM64_NEON)
+			return vcvtq_s32_f32(vec);
+#endif
+		}
+		if constexpr (std::is_same<T, int>::value && std::is_same<T2, float>::value) {
+#if defined(_M_SSE) && !PPSSPP_ARCH(X86)
+			return _mm_cvtepi32_ps(ivec);
+#elif defined(_M_SSE)
+			return _mm_cvtepi32_ps(_mm_loadu_si128(&ivec));
+#elif PPSSPP_ARCH(ARM64_NEON)
+			return vcvtq_f32_s32(ivec);
+#endif
+		}
 		return Vec4<T2>((T2)x, (T2)y, (T2)z, (T2)w);
 	}

From ae9d34370e35e40bc27bd5a3df31687291f81b93 Mon Sep 17 00:00:00 2001
From: "Unknown W. Brackets"
Date: Sat, 24 Jun 2023 12:37:08 -0700
Subject: [PATCH 2/3] softgpu: Move wsum_recip out of the triangle loop.

Seems like a small benefit, but not seeing any issues from this.
Noticed by fp64.
---
 GPU/Software/Rasterizer.cpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/GPU/Software/Rasterizer.cpp b/GPU/Software/Rasterizer.cpp
index dbb53dea7648..4e375d16a74c 100644
--- a/GPU/Software/Rasterizer.cpp
+++ b/GPU/Software/Rasterizer.cpp
@@ -901,6 +901,9 @@ void DrawTriangleSlice(
 	Vec4<int> w1_base = e1.Start(v2.screenpos, v0.screenpos, pprime);
 	Vec4<int> w2_base = e2.Start(v0.screenpos, v1.screenpos, pprime);
 
+	// The sum of weights should remain constant as we move toward/away from the edges.
+	const Vec4<float> wsum_recip = EdgeRecip(w0_base, w1_base, w2_base);
+
 	// All the z values are the same, no interpolation required.
 	// This is common, and when we interpolate, we lose accuracy.
 	const bool flatZ = v0.screenpos.z == v1.screenpos.z && v0.screenpos.z == v2.screenpos.z;
@@ -963,8 +966,6 @@ void DrawTriangleSlice(
 			// If p is on or inside all edges, render pixel
 			Vec4<int> mask = MakeMask(w0, w1, w2, bias0, bias1, bias2, scissor_mask);
 			if (AnyMask<useSSE4>(mask)) {
-				Vec4<float> wsum_recip = EdgeRecip(w0, w1, w2);
-
 				Vec4<float> z;
 				if (flatZ) {
 					z = Vec4<float>::AssignToAll(v2.screenpos.z);
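The wsum_recip hoist in PATCH 2/3 works because, per the patch's own comment, the three edge weights vary across the triangle but their sum stays constant, so EdgeRecip over the base weights equals EdgeRecip over the per-pixel weights.

One note on the Cast() branches in PATCH 1/3: the three paths do not round identically. _mm_cvtps_epi32 converts using the current MXCSR rounding mode (round-to-nearest-even by default), while vcvtq_s32_f32 and the scalar (T2)x fallback truncate toward zero, so a value like 3.5f can convert differently on SSE than on NEON or the plain C++ path. The following standalone sketch (illustrative only, not part of the patch) makes the difference visible; _mm_cvttps_epi32 is the truncating SSE variant that matches the scalar cast:

#include <emmintrin.h>
#include <cstdio>

int main() {
	// Lanes are (1.25, 2.5, 3.5, -2.5) from low to high.
	__m128 v = _mm_set_ps(-2.5f, 3.5f, 2.5f, 1.25f);
	__m128i rounded = _mm_cvtps_epi32(v);     // nearest-even: 1, 2, 4, -2
	__m128i truncated = _mm_cvttps_epi32(v);  // toward zero:  1, 2, 3, -2

	int r[4], t[4];
	_mm_storeu_si128((__m128i *)r, rounded);
	_mm_storeu_si128((__m128i *)t, truncated);
	for (int i = 0; i < 4; ++i)
		printf("round=%d trunc=%d\n", r[i], t[i]);
	return 0;
}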
From 15b66ba6c0aae3278cc5560b89ecf55e1456b256 Mon Sep 17 00:00:00 2001
From: "Unknown W. Brackets"
Date: Sat, 24 Jun 2023 14:49:23 -0700
Subject: [PATCH 3/3] softgpu: Make SIMD on x86_32 a bit safer.

---
 GPU/Math3D.h | 112 ++++++++++++++++++++++++++++++++++++---------------
 1 file changed, 79 insertions(+), 33 deletions(-)

diff --git a/GPU/Math3D.h b/GPU/Math3D.h
index 96e3ae61eba6..ecb0aa27440f 100644
--- a/GPU/Math3D.h
+++ b/GPU/Math3D.h
@@ -585,7 +585,7 @@ class Vec4
 #if defined(_M_SSE) && !PPSSPP_ARCH(X86)
 			return _mm_cvtps_epi32(vec);
 #elif defined(_M_SSE)
-			return _mm_cvtps_epi32(_mm_loadu_ps((float *)&vec));
+			return _mm_cvtps_epi32(_mm_loadu_ps(&x));
 #elif PPSSPP_ARCH(ARM64_NEON)
 			return vcvtq_s32_f32(vec);
 #endif
@@ -928,10 +928,15 @@ inline void Vec3ByMatrix43(float vecOut[3], const float v[3], const float m[12])
 }
 
 inline Vec3f MATH3D_CALL Vec3ByMatrix43(const Vec3f v, const float m[12]) {
-#if defined(_M_SSE) && PPSSPP_ARCH(64BIT)
-	__m128 x = _mm_shuffle_ps(v.vec, v.vec, _MM_SHUFFLE(0, 0, 0, 0));
-	__m128 y = _mm_shuffle_ps(v.vec, v.vec, _MM_SHUFFLE(1, 1, 1, 1));
-	__m128 z = _mm_shuffle_ps(v.vec, v.vec, _MM_SHUFFLE(2, 2, 2, 2));
+#if defined(_M_SSE)
+#if PPSSPP_ARCH(X86)
+	const __m128 vv = _mm_loadu_ps(&v.x);
+#else
+	const __m128 vv = v.vec;
+#endif
+	__m128 x = _mm_shuffle_ps(vv, vv, _MM_SHUFFLE(0, 0, 0, 0));
+	__m128 y = _mm_shuffle_ps(vv, vv, _MM_SHUFFLE(1, 1, 1, 1));
+	__m128 z = _mm_shuffle_ps(vv, vv, _MM_SHUFFLE(2, 2, 2, 2));
 	return Vec3ByMatrix43Internal(x, y, z, m);
 #elif PPSSPP_ARCH(ARM64_NEON)
 	return Vec3ByMatrix43Internal(v.vec, m);
@@ -987,10 +992,15 @@ inline void Vec3ByMatrix44(float vecOut[4], const float v[3], const float m[16])
 }
 
 inline Vec4f MATH3D_CALL Vec3ByMatrix44(const Vec3f v, const float m[16]) {
-#if defined(_M_SSE) && PPSSPP_ARCH(64BIT)
-	__m128 x = _mm_shuffle_ps(v.vec, v.vec, _MM_SHUFFLE(0, 0, 0, 0));
-	__m128 y = _mm_shuffle_ps(v.vec, v.vec, _MM_SHUFFLE(1, 1, 1, 1));
-	__m128 z = _mm_shuffle_ps(v.vec, v.vec, _MM_SHUFFLE(2, 2, 2, 2));
+#if defined(_M_SSE)
+#if PPSSPP_ARCH(X86)
+	const __m128 vv = _mm_loadu_ps(&v.x);
+#else
+	const __m128 vv = v.vec;
+#endif
+	__m128 x = _mm_shuffle_ps(vv, vv, _MM_SHUFFLE(0, 0, 0, 0));
+	__m128 y = _mm_shuffle_ps(vv, vv, _MM_SHUFFLE(1, 1, 1, 1));
+	__m128 z = _mm_shuffle_ps(vv, vv, _MM_SHUFFLE(2, 2, 2, 2));
 	return Vec3ByMatrix44Internal(x, y, z, m);
 #elif PPSSPP_ARCH(ARM64_NEON)
 	return Vec3ByMatrix44Internal(v.vec, m);
@@ -1046,10 +1056,15 @@ inline void Norm3ByMatrix43(float vecOut[3], const float v[3], const float m[12])
 }
 
 inline Vec3f MATH3D_CALL Norm3ByMatrix43(const Vec3f v, const float m[12]) {
-#if defined(_M_SSE) && PPSSPP_ARCH(64BIT)
-	__m128 x = _mm_shuffle_ps(v.vec, v.vec, _MM_SHUFFLE(0, 0, 0, 0));
-	__m128 y = _mm_shuffle_ps(v.vec, v.vec, _MM_SHUFFLE(1, 1, 1, 1));
-	__m128 z = _mm_shuffle_ps(v.vec, v.vec, _MM_SHUFFLE(2, 2, 2, 2));
+#if defined(_M_SSE)
+#if PPSSPP_ARCH(X86)
+	const __m128 vv = _mm_loadu_ps(&v.x);
+#else
+	const __m128 vv = v.vec;
+#endif
+	__m128 x = _mm_shuffle_ps(vv, vv, _MM_SHUFFLE(0, 0, 0, 0));
+	__m128 y = _mm_shuffle_ps(vv, vv, _MM_SHUFFLE(1, 1, 1, 1));
+	__m128 z = _mm_shuffle_ps(vv, vv, _MM_SHUFFLE(2, 2, 2, 2));
 	return Norm3ByMatrix43Internal(x, y, z, m);
 #elif PPSSPP_ARCH(ARM64_NEON)
 	return Norm3ByMatrix43Internal(v.vec, m);
@@ -1213,7 +1228,7 @@ __forceinline unsigned int Vec3<float>::ToRGB() const
 #if PPSSPP_ARCH(64BIT)
 	__m128i c = _mm_cvtps_epi32(_mm_mul_ps(vec, _mm_set_ps1(255.0f)));
 #else
-	__m128i c = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps((float *)&vec), _mm_set_ps1(255.0f)));
+	__m128i c = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(&x), _mm_set_ps1(255.0f)));
 #endif
 	__m128i c16 = _mm_packs_epi32(c, c);
 	return _mm_cvtsi128_si32(_mm_packus_epi16(c16, c16)) & 0x00FFFFFF;
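A note on the X86 guards in this patch: the commit message only promises "a bit safer", but the apparent rationale is that 32-bit builds cannot assume these vector objects sit at 16-byte-aligned addresses. Reading the __m128 member directly invites the compiler to emit aligned movaps accesses, which fault on a misaligned address; _mm_loadu_ps(&v.x) reads the same four floats with no alignment assumption, and dropping the (float *)&vec cast is tidier besides. A minimal sketch of the pattern, using illustrative names (Float4, LoadFloat4) that are not from the patch:

#include <new>
#include <cstdio>
#include <xmmintrin.h>

// A 4-float struct with no 16-byte alignment guarantee, standing in for a
// vector embedded in a packed or heap-allocated structure.
struct Float4 {
	float x, y, z, w;
};

static __m128 LoadFloat4(const Float4 &f) {
	return _mm_loadu_ps(&f.x);  // unaligned load: safe at any address
}

int main() {
	// Deliberately construct a Float4 at a 4-byte-misaligned offset.
	alignas(16) unsigned char buf[sizeof(Float4) + 4];
	Float4 *f = new (buf + 4) Float4{1.0f, 2.0f, 3.0f, 4.0f};

	float out[4];
	_mm_storeu_ps(out, _mm_mul_ps(LoadFloat4(*f), _mm_set_ps1(2.0f)));
	printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);  // 2 4 6 8
	return 0;
}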
@@ -1297,7 +1312,7 @@ __forceinline unsigned int Vec4<float>::ToRGBA() const
 #if PPSSPP_ARCH(64BIT)
 	__m128i c = _mm_cvtps_epi32(_mm_mul_ps(vec, _mm_set_ps1(255.0f)));
 #else
-	__m128i c = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps((float *)&vec), _mm_set_ps1(255.0f)));
+	__m128i c = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(&x), _mm_set_ps1(255.0f)));
 #endif
 	__m128i c16 = _mm_packs_epi32(c, c);
 	return _mm_cvtsi128_si32(_mm_packus_epi16(c16, c16));
@@ -1343,60 +1358,91 @@ __forceinline void Vec4<float>::ToRGBA(u8 *rgba) const
 
 // Vec3 operation
 template<>
-inline void Vec3<float>::operator += (const Vec3<float> &other)
-{
+inline void Vec3<float>::operator += (const Vec3<float> &other) {
+#if PPSSPP_ARCH(X86)
+	*this = _mm_add_ps(_mm_loadu_ps(&x), _mm_loadu_ps(&other.x));
+#else
 	vec = _mm_add_ps(vec, other.vec);
+#endif
 }
 
 template<>
-inline Vec3<float> Vec3<float>::operator + (const Vec3<float> &other) const
-{
+inline Vec3<float> Vec3<float>::operator + (const Vec3<float> &other) const {
+#if PPSSPP_ARCH(X86)
+	return Vec3<float>(_mm_add_ps(_mm_loadu_ps(&x), _mm_loadu_ps(&other.x)));
+#else
 	return Vec3<float>(_mm_add_ps(vec, other.vec));
+#endif
 }
 
 template<>
-inline Vec3<float> Vec3<float>::operator * (const Vec3<float> &other) const
-{
+inline Vec3<float> Vec3<float>::operator * (const Vec3<float> &other) const {
+#if PPSSPP_ARCH(X86)
+	return Vec3<float>(_mm_mul_ps(_mm_loadu_ps(&x), _mm_loadu_ps(&other.x)));
+#else
 	return Vec3<float>(_mm_mul_ps(vec, other.vec));
+#endif
 }
 
 template<> template<>
-inline Vec3<float> Vec3<float>::operator * (const float &other) const
-{
+inline Vec3<float> Vec3<float>::operator * (const float &other) const {
+#if PPSSPP_ARCH(X86)
+	return Vec3<float>(_mm_mul_ps(_mm_loadu_ps(&x), _mm_set_ps1(other)));
+#else
 	return Vec3<float>(_mm_mul_ps(vec, _mm_set_ps1(other)));
+#endif
 }
 
 // Vec4 operation
 template<>
-inline void Vec4<float>::operator += (const Vec4<float> &other)
-{
+inline void Vec4<float>::operator += (const Vec4<float> &other) {
+#if PPSSPP_ARCH(X86)
+	*this = _mm_add_ps(_mm_loadu_ps(&x), _mm_loadu_ps(&other.x));
+#else
 	vec = _mm_add_ps(vec, other.vec);
+#endif
 }
 
 template<>
-inline Vec4<float> Vec4<float>::operator + (const Vec4<float> &other) const
-{
+inline Vec4<float> Vec4<float>::operator + (const Vec4<float> &other) const {
+#if PPSSPP_ARCH(X86)
+	return Vec4<float>(_mm_add_ps(_mm_loadu_ps(&x), _mm_loadu_ps(&other.x)));
+#else
 	return Vec4<float>(_mm_add_ps(vec, other.vec));
+#endif
 }
 
 template<>
-inline Vec4<float> Vec4<float>::operator * (const Vec4<float> &other) const
-{
+inline Vec4<float> Vec4<float>::operator * (const Vec4<float> &other) const {
+#if PPSSPP_ARCH(X86)
+	return Vec4<float>(_mm_mul_ps(_mm_loadu_ps(&x), _mm_loadu_ps(&other.x)));
+#else
 	return Vec4<float>(_mm_mul_ps(vec, other.vec));
+#endif
 }
 
 template<> template<>
-inline Vec4<float> Vec4<float>::operator * (const float &other) const
-{
+inline Vec4<float> Vec4<float>::operator * (const float &other) const {
+#if PPSSPP_ARCH(X86)
+	return Vec4<float>(_mm_mul_ps(_mm_loadu_ps(&x), _mm_set_ps1(other)));
+#else
 	return Vec4<float>(_mm_mul_ps(vec, _mm_set_ps1(other)));
+#endif
 }
 
 // Vec3 cross product
 template<>
 inline Vec3<float> Cross(const Vec3<float> &a, const Vec3<float> &b)
 {
-	const __m128 left = _mm_mul_ps(_mm_shuffle_ps(a.vec, a.vec, _MM_SHUFFLE(3, 0, 2, 1)), _mm_shuffle_ps(b.vec, b.vec, _MM_SHUFFLE(3, 1, 0, 2)));
-	const __m128 right = _mm_mul_ps(_mm_shuffle_ps(a.vec, a.vec, _MM_SHUFFLE(3, 1, 0, 2)), _mm_shuffle_ps(b.vec, b.vec, _MM_SHUFFLE(3, 0, 2, 1)));
+#if PPSSPP_ARCH(X86)
+	__m128 avec = _mm_loadu_ps(&a.x);
+	__m128 bvec = _mm_loadu_ps(&b.x);
+#else
+	__m128 avec = a.vec;
+	__m128 bvec = b.vec;
+#endif
+	const __m128 left = _mm_mul_ps(_mm_shuffle_ps(avec, avec, _MM_SHUFFLE(3, 0, 2, 1)), _mm_shuffle_ps(bvec, bvec, _MM_SHUFFLE(3, 1, 0, 2)));
+	const __m128 right = _mm_mul_ps(_mm_shuffle_ps(avec, avec, _MM_SHUFFLE(3, 1, 0, 2)), _mm_shuffle_ps(bvec, bvec, _MM_SHUFFLE(3, 0, 2, 1)));
 	return _mm_sub_ps(left, right);
 }
 
 #endif
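The operator overloads in PATCH 3/3 apply the same rule to stores: on X86 both operands come in through _mm_loadu_ps, and the result is written back via assignment to *this rather than a direct write to the vec member, so no aligned access is generated on either side. A condensed sketch of the idea, with SafeVec4 as an illustrative stand-in and an explicit _mm_storeu_ps in place of the patch's assignment round-trip:

#include <xmmintrin.h>

struct SafeVec4 {
	union {
		struct { float x, y, z, w; };
		__m128 vec;  // touched directly only where alignment is trusted
	};

	SafeVec4 &operator+=(const SafeVec4 &other) {
#if defined(_M_IX86) || defined(__i386__)  // 32-bit x86: assume no alignment
		_mm_storeu_ps(&x, _mm_add_ps(_mm_loadu_ps(&x), _mm_loadu_ps(&other.x)));
#else
		vec = _mm_add_ps(vec, other.vec);
#endif
		return *this;
	}
};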
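For the Cross() rewrite it helps to spell out the shuffle constants: _MM_SHUFFLE(3, 0, 2, 1) turns (x, y, z, w) into (y, z, x, w), and _MM_SHUFFLE(3, 1, 0, 2) turns it into (z, x, y, w), so left - right evaluates the textbook cross product (a.y*b.z - a.z*b.y, a.z*b.x - a.x*b.z, a.x*b.y - a.y*b.x) lane by lane, with the w lane cancelling to zero. A standalone check (illustrative, not part of the patch):

#include <xmmintrin.h>
#include <cstdio>

int main() {
	__m128 a = _mm_set_ps(0.0f, 3.0f, 2.0f, 1.0f);  // (x, y, z, w) = (1, 2, 3, 0)
	__m128 b = _mm_set_ps(0.0f, 6.0f, 5.0f, 4.0f);  // (x, y, z, w) = (4, 5, 6, 0)
	const __m128 left = _mm_mul_ps(_mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 0, 2, 1)),
	                               _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 1, 0, 2)));
	const __m128 right = _mm_mul_ps(_mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 1, 0, 2)),
	                                _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 0, 2, 1)));
	float c[4];
	_mm_storeu_ps(c, _mm_sub_ps(left, right));
	printf("%g %g %g\n", c[0], c[1], c[2]);  // (1,2,3) x (4,5,6) = (-3, 6, -3)
	return 0;
}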