From 9f7802a8876564f4db5141fa91214cdb2b7fab8a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= Date: Fri, 20 Dec 2024 10:30:23 +0100 Subject: [PATCH] Reorganize the depth vertex pipeline for future optimizations --- GPU/Common/DepthRaster.cpp | 267 +++++++++++--------------------- GPU/Common/DepthRaster.h | 13 +- GPU/Common/DrawEngineCommon.cpp | 74 ++++++++- GPU/Common/DrawEngineCommon.h | 15 ++ GPU/GPUCommonHW.cpp | 9 -- GPU/Vulkan/DrawEngineVulkan.cpp | 9 ++ 6 files changed, 201 insertions(+), 186 deletions(-) diff --git a/GPU/Common/DepthRaster.cpp b/GPU/Common/DepthRaster.cpp index 5b132fbbb304..a1ddab0727e3 100644 --- a/GPU/Common/DepthRaster.cpp +++ b/GPU/Common/DepthRaster.cpp @@ -8,13 +8,6 @@ #include "Common/Math/math_util.h" #include "GPU/Common/VertexDecoderCommon.h" -struct ScreenVert { - int x; - int y; - uint16_t z; - uint16_t behind; -}; - void DepthRasterRect(uint16_t *dest, int stride, int x1, int y1, int x2, int y2, short depthValue, GEComparison depthCompare) { // Swap coordinates if needed, we don't back-face-cull rects. // We also ignore the UV rotation here. @@ -84,7 +77,7 @@ void DepthRasterRect(uint16_t *dest, int stride, int x1, int y1, int x2, int y2, // Adapted from Intel's depth rasterizer example. // Started with the scalar version, will SIMD-ify later. // x1/y1 etc are the scissor rect. -void DepthRasterTriangle(uint16_t *depthBuf, int stride, int x1, int y1, int x2, int y2, const ScreenVert vertsSub[3], GEComparison compareMode) { +void DepthRasterTriangle(uint16_t *depthBuf, int stride, int x1, int y1, int x2, int y2, const DepthScreenVertex vertsSub[3], GEComparison compareMode) { int tileStartX = x1; int tileEndX = x2; @@ -96,7 +89,7 @@ void DepthRasterTriangle(uint16_t *depthBuf, int stride, int x1, int y1, int x2, // are slow on SSE2. // Convert to whole pixels for now. Later subpixel precision. - ScreenVert verts[3]; + DepthScreenVertex verts[3]; verts[0].x = vertsSub[0].x; verts[0].y = vertsSub[0].y; verts[0].z = vertsSub[0].z; @@ -214,179 +207,123 @@ void DepthRasterTriangle(uint16_t *depthBuf, int stride, int x1, int y1, int x2, } // for each row } -// We ignore lots of primitive types for now. -void DepthRasterPrim(uint16_t *depth, int depthStride, int x1, int y1, int x2, int y2, void *bufferData, - const void *vertexData, const void *indexData, GEPrimitiveType prim, int count, VertexDecoder *dec, u32 vertTypeID, bool clockwise) { +void DecodeAndTransformForDepthRaster(float *dest, GEPrimitiveType prim, const float *worldviewproj, const void *vertexData, int count, VertexDecoder *dec, u32 vertTypeID) { + // TODO: Ditch skinned and morphed prims for now since we don't have a fast way to skin without running the full decoder. + _dbg_assert_((vertTypeID & (GE_VTYPE_WEIGHT_MASK | GE_VTYPE_MORPHCOUNT_MASK)) == 0); - GEComparison compareMode = gstate.getDepthTestFunction(); - if (gstate.isModeClear()) { - if (!gstate.isClearModeDepthMask()) { - return; - } - compareMode = GE_COMP_ALWAYS; - } else { - if (!gstate.isDepthTestEnabled() || !gstate.isDepthWriteEnabled()) - return; - } + int vertexStride = dec->VertexSize(); + int offset = dec->posoff; - switch (prim) { - case GE_PRIM_INVALID: - case GE_PRIM_KEEP_PREVIOUS: - case GE_PRIM_LINES: - case GE_PRIM_LINE_STRIP: - case GE_PRIM_POINTS: - return; - default: + float temp[3]; + switch (vertTypeID & GE_VTYPE_POS_MASK) { + case GE_VTYPE_POS_8BIT: + for (int i = 0; i < count; i++) { + const s8 *data = (const s8 *)vertexData + i * vertexStride + offset; + for (int j = 0; j < 3; j++) { + temp[j] = data[j] * (1.0f / 128.0f); // TODO: Can we bake this factor in somewhere? + } + Vec3ByMatrix44(dest + i * 4, temp, worldviewproj); + } + break; + case GE_VTYPE_POS_16BIT: + for (int i = 0; i < count; i++) { + const s16 *data = ((const s16 *)((const s8 *)vertexData + i * vertexStride + offset)); + for (int j = 0; j < 3; j++) { + temp[j] = data[j] * (1.0f / 32768.0f); // TODO: Can we bake this factor in somewhere? + } + Vec3ByMatrix44(dest + i * 4, temp, worldviewproj); + } + break; + case GE_VTYPE_POS_FLOAT: + for (int i = 0; i < count; i++) { + const float *data = (const float *)((const u8 *)vertexData + vertexStride * i + offset); + Vec3ByMatrix44(dest + i * 4, data, worldviewproj); + } break; } +} - // TODO: Ditch indexed primitives for now, also ditched skinned ones since we don't have a fast way to skin without - // running the full decoder. - if (vertTypeID & (GE_VTYPE_IDX_MASK | GE_VTYPE_WEIGHT_MASK)) { - return; +void DepthRasterConvertTransformed(DepthScreenVertex *screenVerts, const TransformedVertex *transformed, int count) { + for (int i = 0; i < count; i++) { + screenVerts[i].x = (int)transformed[i].pos[0]; + screenVerts[i].y = (int)transformed[i].pos[1]; + screenVerts[i].z = (u16)transformed[i].pos[2]; } +} - bool isThroughMode = (vertTypeID & GE_VTYPE_THROUGH_MASK) != 0; - bool cullEnabled = false; - bool cullCCW = false; - - // Turn the input data into a raw float array that we can pass to an optimized triangle rasterizer. - float *transformed = (float *)bufferData; +int DepthRasterClipIndexedTriangles(DepthScreenVertex *screenVerts, const float *transformed, const uint16_t *indexBuffer, int count) { + bool cullEnabled = gstate.isCullEnabled(); - ScreenVert *screenVerts = (ScreenVert *)((uint8_t *)bufferData + 65536 * 8); + const float viewportX = gstate.getViewportXCenter(); + const float viewportY = gstate.getViewportYCenter(); + const float viewportZ = gstate.getViewportZCenter(); + const float viewportScaleX = gstate.getViewportXScale(); + const float viewportScaleY = gstate.getViewportYScale(); + const float viewportScaleZ = gstate.getViewportZScale(); - // Simple, most common case. - int vertexStride = dec->VertexSize(); - int offset = dec->posoff; + bool cullCCW = false; // OK, we now have the coordinates. Let's transform, we can actually do this in-place. - if (!(vertTypeID & GE_VTYPE_THROUGH_MASK)) { - float world[16]; - float view[16]; - float worldview[16]; - float worldviewproj[16]; - ConvertMatrix4x3To4x4(world, gstate.worldMatrix); - ConvertMatrix4x3To4x4(view, gstate.viewMatrix); - Matrix4ByMatrix4(worldview, world, view); - Matrix4ByMatrix4(worldviewproj, worldview, gstate.projMatrix); // TODO: Include adjustments to the proj matrix? - - cullEnabled = gstate.isCullEnabled(); - - float viewportX = gstate.getViewportXCenter(); - float viewportY = gstate.getViewportYCenter(); - float viewportZ = gstate.getViewportZCenter(); - float viewportScaleX = gstate.getViewportXScale(); - float viewportScaleY = gstate.getViewportYScale(); - float viewportScaleZ = gstate.getViewportZScale(); - - bool allBehind = true; - - float temp[3]; - for (int i = 0; i < count; i++) { - switch (vertTypeID & GE_VTYPE_POS_MASK) { - case GE_VTYPE_POS_8BIT: - for (int i = 0; i < count; i++) { - const s8 *data = (const s8 *)vertexData + i * vertexStride + offset; - for (int j = 0; j < 3; j++) { - temp[j] = data[j] * (1.0f / 128.0f); // TODO: Can we bake this factor in somewhere? - } - Vec3ByMatrix44(transformed + i * 4, temp, worldviewproj); - } - break; - case GE_VTYPE_POS_16BIT: - for (int i = 0; i < count; i++) { - const s16 *data = ((const s16 *)((const s8 *)vertexData + i * vertexStride + offset)); - for (int j = 0; j < 3; j++) { - temp[j] = data[j] * (1.0f / 32768.0f); // TODO: Can we bake this factor in somewhere? - } - Vec3ByMatrix44(transformed + i * 4, temp, worldviewproj); - } - break; - case GE_VTYPE_POS_FLOAT: - for (int i = 0; i < count; i++) { - const float *data = (const float *)((const u8 *)vertexData + vertexStride * i + offset); - Vec3ByMatrix44(transformed + i * 4, data, worldviewproj); - } - break; - } - } - for (int i = 0; i < count; i++) { - float proj[4]; - memcpy(proj, transformed + i * 4, 4 * sizeof(float)); + int outCount = 0; - float w = proj[3]; + for (int i = 0; i < count; i += 3) { + const float *verts[3] = { + transformed + indexBuffer[i] * 4, + transformed + indexBuffer[i + 1] * 4, + transformed + indexBuffer[i + 2] * 4, + }; - bool inFront = w > 0.0f; - screenVerts[i].behind = !inFront; - if (inFront) { - allBehind = false; - } + // Check if any vertex is behind the 0 plane. + if (verts[0][3] < 0.0f || verts[1][3] < 0.0f || verts[2][3] < 0.0f) { + // Ditch this triangle. Later we should clip here. + continue; + } + + for (int c = 0; c < 3; c++) { + const float *src = verts[c]; + float invW = 1.0f / src[3]; - // Clip to the w=0 plane. - proj[0] /= w; - proj[1] /= w; - proj[2] /= w; + float x = src[0] * invW; + float y = src[1] * invW; + float z = src[2] * invW; - // Then transform by the viewport and offset to finally get subpixel coordinates. Normally, this is done by the viewport - // and offset params. float screen[3]; - screen[0] = (proj[0] * viewportScaleX + viewportX) * 16.0f - gstate.getOffsetX16(); - screen[1] = (proj[1] * viewportScaleY + viewportY) * 16.0f - gstate.getOffsetY16(); - screen[2] = (proj[2] * viewportScaleZ + viewportZ); + screen[0] = (x * viewportScaleX + viewportX) * 16.0f - gstate.getOffsetX16(); + screen[1] = (y * viewportScaleY + viewportY) * 16.0f - gstate.getOffsetY16(); + screen[2] = (z * viewportScaleZ + viewportZ); if (screen[2] < 0.0f) { screen[2] = 0.0f; } if (screen[2] >= 65535.0f) { screen[2] = 65535.0f; } - screenVerts[i].x = screen[0] * (1.0f / 16.0f); // We ditch the subpixel precision here. - screenVerts[i].y = screen[1] * (1.0f / 16.0f); - screenVerts[i].z = screen[2]; + screenVerts[outCount].x = screen[0] * (1.0f / 16.0f); // We ditch the subpixel precision here. + screenVerts[outCount].y = screen[1] * (1.0f / 16.0f); + screenVerts[outCount].z = screen[2]; + + outCount++; } - if (allBehind) { - // Cull the whole draw. + } + return outCount; +} + +// Rasterizes screen-space vertices. +void DepthRasterScreenVerts(uint16_t *depth, int depthStride, GEPrimitiveType prim, int x1, int y1, int x2, int y2, const DepthScreenVertex *screenVerts, int count) { + // Prim should now be either TRIANGLES or RECTs. + _dbg_assert_(prim == GE_PRIM_RECTANGLES || prim == GE_PRIM_TRIANGLES); + + GEComparison compareMode = gstate.getDepthTestFunction(); + if (gstate.isModeClear()) { + if (!gstate.isClearModeDepthMask()) { return; } + compareMode = GE_COMP_ALWAYS; } else { - float factor = 1.0f; - switch (vertTypeID & GE_VTYPE_POS_MASK) { - case GE_VTYPE_POS_8BIT: - for (int i = 0; i < count; i++) { - const s8 *data = (const s8 *)vertexData + i * vertexStride + offset; - for (int j = 0; j < 3; j++) { - transformed[i * 4 + j] = data[j] * factor; - } - transformed[i * 4 + 3] = 1.0f; - } - break; - case GE_VTYPE_POS_16BIT: - for (int i = 0; i < count; i++) { - const s16 *data = ((const s16 *)((const s8 *)vertexData + i * vertexStride + offset)); - for (int j = 0; j < 3; j++) { - transformed[i * 4 + j] = data[j] * factor; - } - transformed[i * 4 + 3] = 1.0f; - } - break; - case GE_VTYPE_POS_FLOAT: - for (int i = 0; i < count; i++) { - memcpy(&transformed[i * 4], (const u8 *)vertexData + vertexStride * i + offset, sizeof(float) * 3); - transformed[i * 4 + 3] = 1.0f; - } - break; - } - - for (int i = 0; i < count; i++) { - screenVerts[i].x = (int)transformed[i * 4 + 0]; - screenVerts[i].y = (int)transformed[i * 4 + 1]; - screenVerts[i].z = (u16)clamp_value(transformed[i * 4 + 2], 0.0f, 65535.0f); - } + if (!gstate.isDepthTestEnabled() || !gstate.isDepthWriteEnabled()) + return; } - // Then we need to stitch primitives from strips, etc etc... - // For now we'll just do it tri by tri. Later let's be more efficient. - switch (prim) { case GE_PRIM_RECTANGLES: for (int i = 0; i < count / 2; i++) { @@ -399,30 +336,10 @@ void DepthRasterPrim(uint16_t *depth, int depthStride, int x1, int y1, int x2, i break; case GE_PRIM_TRIANGLES: for (int i = 0; i < count / 3; i++) { - if (screenVerts[i * 3].behind || screenVerts[i * 3 + 1].behind || screenVerts[i * 3 + 2].behind) { - continue; - } DepthRasterTriangle(depth, depthStride, x1, y1, x2, y2, screenVerts + i * 3, compareMode); } break; - case GE_PRIM_TRIANGLE_STRIP: - { - int wind = 2; - for (int i = 0; i < count - 2; i++) { - int i0 = i; - int i1 = i + wind; - wind ^= 3; - int i2 = i + wind; - if (screenVerts[i0].behind || screenVerts[i1].behind || screenVerts[i2].behind) { - continue; - } - ScreenVert v[3]; - v[0] = screenVerts[i0]; - v[1] = screenVerts[i1]; - v[2] = screenVerts[i2]; - DepthRasterTriangle(depth, depthStride, x1, y1, x2, y2, v, compareMode); - } - break; - } + default: + _dbg_assert_(false); } } diff --git a/GPU/Common/DepthRaster.h b/GPU/Common/DepthRaster.h index 01fa60e257d1..d639103aaaa2 100644 --- a/GPU/Common/DepthRaster.h +++ b/GPU/Common/DepthRaster.h @@ -3,10 +3,21 @@ #include "Common/CommonTypes.h" #include "GPU/ge_constants.h" +struct DepthScreenVertex { + int x; + int y; + uint16_t z; +}; + // Specialized, very limited depth-only rasterizer. // Meant to run in parallel with hardware rendering, in games that read back the depth buffer // for effects like lens flare. // So, we can be quite inaccurate without any issues, and skip a lot of functionality. class VertexDecoder; -void DepthRasterPrim(uint16_t *dest, int stride, int x1, int x2, int y1, int y2, void *bufferData, const void *vertexData, const void *indexData, GEPrimitiveType prim, int count, VertexDecoder *decoder, u32 vertexTypeID, bool clockwise); +struct TransformedVertex; + +int DepthRasterClipIndexedTriangles(DepthScreenVertex *screenVerts, const float *transformed, const uint16_t *indexBuffer, int count); +void DecodeAndTransformForDepthRaster(float *dest, GEPrimitiveType prim, const float *worldviewproj, const void *vertexData, int count, VertexDecoder *dec, u32 vertTypeID); +void DepthRasterConvertTransformed(DepthScreenVertex *screenVerts, const TransformedVertex *transformed, int count); +void DepthRasterScreenVerts(uint16_t *depth, int depthStride, GEPrimitiveType prim, int x1, int y1, int x2, int y2, const DepthScreenVertex *screenVerts, int count); diff --git a/GPU/Common/DrawEngineCommon.cpp b/GPU/Common/DrawEngineCommon.cpp index 818021a79b3a..df80511b4691 100644 --- a/GPU/Common/DrawEngineCommon.cpp +++ b/GPU/Common/DrawEngineCommon.cpp @@ -23,9 +23,11 @@ #include "Common/LogReporting.h" #include "Common/Math/SIMDHeaders.h" #include "Common/Math/lin/matrix4x4.h" +#include "Core/System.h" #include "Core/Config.h" #include "GPU/Common/DrawEngineCommon.h" #include "GPU/Common/SplineCommon.h" +#include "GPU/Common/DepthRaster.h" #include "GPU/Common/VertexDecoderCommon.h" #include "GPU/Common/SoftwareTransformCommon.h" #include "GPU/ge_constants.h" @@ -34,7 +36,9 @@ #define QUAD_INDICES_MAX 65536 enum { - TRANSFORMED_VERTEX_BUFFER_SIZE = VERTEX_BUFFER_MAX * sizeof(TransformedVertex) + TRANSFORMED_VERTEX_BUFFER_SIZE = VERTEX_BUFFER_MAX * sizeof(TransformedVertex), + DEPTH_TRANSFORMED_SIZE = VERTEX_BUFFER_MAX * 4, + DEPTH_SCREENVERTS_SIZE = VERTEX_BUFFER_MAX * sizeof(DepthScreenVertex), }; DrawEngineCommon::DrawEngineCommon() : decoderMap_(32) { @@ -46,6 +50,12 @@ DrawEngineCommon::DrawEngineCommon() : decoderMap_(32) { decoded_ = (u8 *)AllocateMemoryPages(DECODED_VERTEX_BUFFER_SIZE, MEM_PROT_READ | MEM_PROT_WRITE); decIndex_ = (u16 *)AllocateMemoryPages(DECODED_INDEX_BUFFER_SIZE, MEM_PROT_READ | MEM_PROT_WRITE); indexGen.Setup(decIndex_); + + useDepthRaster_ = PSP_CoreParameter().compat.flags().SoftwareRasterDepth; + if (useDepthRaster_) { + depthTransformed_ = (float *)AllocateMemoryPages(DEPTH_TRANSFORMED_SIZE, MEM_PROT_READ | MEM_PROT_WRITE); + depthScreenVerts_ = (DepthScreenVertex *)AllocateMemoryPages(DEPTH_SCREENVERTS_SIZE, MEM_PROT_READ | MEM_PROT_WRITE); + } } DrawEngineCommon::~DrawEngineCommon() { @@ -53,6 +63,10 @@ DrawEngineCommon::~DrawEngineCommon() { FreeMemoryPages(decIndex_, DECODED_INDEX_BUFFER_SIZE); FreeMemoryPages(transformed_, TRANSFORMED_VERTEX_BUFFER_SIZE); FreeMemoryPages(transformedExpanded_, 3 * TRANSFORMED_VERTEX_BUFFER_SIZE); + if (depthTransformed_) { + FreeMemoryPages(depthTransformed_, DEPTH_TRANSFORMED_SIZE); + FreeMemoryPages(depthScreenVerts_, DEPTH_SCREENVERTS_SIZE); + } delete decJitCache_; decoderMap_.Iterate([&](const uint32_t vtype, VertexDecoder *decoder) { delete decoder; @@ -886,3 +900,61 @@ bool DrawEngineCommon::DescribeCodePtr(const u8 *ptr, std::string &name) const { return false; } } + +void DrawEngineCommon::DepthRasterTransform(GEPrimitiveType prim, VertexDecoder *dec, uint32_t vertTypeID) { + switch (prim) { + case GE_PRIM_INVALID: + case GE_PRIM_KEEP_PREVIOUS: + case GE_PRIM_LINES: + case GE_PRIM_LINE_STRIP: + case GE_PRIM_POINTS: + return; + default: + break; + } + + if (vertTypeID & (GE_VTYPE_WEIGHT_MASK | GE_VTYPE_MORPHCOUNT_MASK)) { + return; + } + + float world[16]; + float view[16]; + float worldview[16]; + float worldviewproj[16]; + ConvertMatrix4x3To4x4(world, gstate.worldMatrix); + ConvertMatrix4x3To4x4(view, gstate.viewMatrix); + Matrix4ByMatrix4(worldview, world, view); + Matrix4ByMatrix4(worldviewproj, worldview, gstate.projMatrix); // TODO: Include adjustments to the proj matrix? + + // Decode. + int numDec = 0; + for (int i = 0; i < numDrawVerts_; i++) { + DecodeAndTransformForDepthRaster(depthTransformed_ + numDec * 4, prim, worldviewproj, drawVerts_[i].verts, drawVerts_[i].vertexCount, dec, vertTypeID); + numDec += drawVerts_[i].vertexCount; + } + + // Clip and triangulate using the index buffer. + int outVertCount = DepthRasterClipIndexedTriangles(depthScreenVerts_, depthTransformed_, decIndex_, numDec); + + DepthRasterScreenVerts((uint16_t *)Memory::GetPointerWrite(gstate.getDepthBufRawAddress() | 0x04000000), gstate.DepthBufStride(), + GE_PRIM_TRIANGLES, gstate.getScissorX1(), gstate.getScissorY1(), gstate.getScissorX2(), gstate.getScissorY2(), + depthScreenVerts_, outVertCount); +} + +void DrawEngineCommon::DepthRasterPretransformed(GEPrimitiveType prim, const TransformedVertex *inVerts, int count) { + switch (prim) { + case GE_PRIM_INVALID: + case GE_PRIM_KEEP_PREVIOUS: + case GE_PRIM_LINES: + case GE_PRIM_LINE_STRIP: + case GE_PRIM_POINTS: + return; + default: + break; + } + + DepthRasterConvertTransformed(depthScreenVerts_, inVerts, count); + DepthRasterScreenVerts((uint16_t *)Memory::GetPointerWrite(gstate.getDepthBufRawAddress() | 0x04000000), gstate.DepthBufStride(), + prim, gstate.getScissorX1(), gstate.getScissorY1(), gstate.getScissorX2(), gstate.getScissorY2(), + depthScreenVerts_, count); +} diff --git a/GPU/Common/DrawEngineCommon.h b/GPU/Common/DrawEngineCommon.h index 595ab929aab4..64e8478cd06e 100644 --- a/GPU/Common/DrawEngineCommon.h +++ b/GPU/Common/DrawEngineCommon.h @@ -27,6 +27,7 @@ #include "GPU/Common/GPUStateUtils.h" #include "GPU/Common/IndexGenerator.h" #include "GPU/Common/VertexDecoderCommon.h" +#include "GPU/Common/DepthRaster.h" class VertexDecoder; @@ -174,6 +175,9 @@ class DrawEngineCommon { void ApplyFramebufferRead(FBOTexState *fboTexState); + void DepthRasterTransform(GEPrimitiveType prim, VertexDecoder *dec, uint32_t vertTypeID); + void DepthRasterPretransformed(GEPrimitiveType prim, const TransformedVertex *inVerts, int count); + static inline int IndexSize(u32 vtype) { const u32 indexType = (vtype & GE_VTYPE_IDX_MASK); if (indexType == GE_VTYPE_IDX_16BIT) { @@ -228,6 +232,11 @@ class DrawEngineCommon { } inline bool CollectedPureDraw() const { + // TODO: Do something faster. + if (useDepthRaster_) { + return false; + } + switch (seenPrims_) { case 1 << GE_PRIM_TRIANGLE_STRIP: return !anyCCWOrIndexed_ && numDrawInds_ == 1; @@ -343,4 +352,10 @@ class DrawEngineCommon { bool offsetOutsideEdge_; GPUCommon *gpuCommon_; + + // Software depth raster + bool useDepthRaster_ = false; + + float *depthTransformed_ = nullptr; + DepthScreenVertex *depthScreenVerts_ = nullptr; }; diff --git a/GPU/GPUCommonHW.cpp b/GPU/GPUCommonHW.cpp index f5383cf6be59..9b5389c8750a 100644 --- a/GPU/GPUCommonHW.cpp +++ b/GPU/GPUCommonHW.cpp @@ -13,7 +13,6 @@ #include "GPU/Common/DrawEngineCommon.h" #include "GPU/Common/TextureCacheCommon.h" #include "GPU/Common/FramebufferManagerCommon.h" -#include "GPU/Common/DepthRaster.h" struct CommonCommandTableEntry { uint8_t cmd; @@ -1040,10 +1039,6 @@ void GPUCommonHW::Execute_Prim(u32 op, u32 diff) { if (passCulling) { if (!drawEngineCommon_->SubmitPrim(verts, inds, prim, count, decoder, vertTypeID, true, &bytesRead)) { canExtend = false; - } else if (PSP_CoreParameter().compat.flags().SoftwareRasterDepth) { - DepthRasterPrim((uint16_t *)Memory::GetPointerWrite(gstate.getDepthBufRawAddress() | 0x04000000), gstate.DepthBufStride(), - gstate.getScissorX1(), gstate.getScissorY1(), gstate.getScissorX2(), gstate.getScissorY2(), drawEngineCommon_->GetTempSpace(), - verts, inds, prim, count, decoder, vertTypeID, false); } onePassed = true; } else { @@ -1122,10 +1117,6 @@ void GPUCommonHW::Execute_Prim(u32 op, u32 diff) { if (passCulling) { if (!drawEngineCommon_->SubmitPrim(verts, inds, newPrim, count, decoder, vertTypeID, clockwise, &bytesRead)) { canExtend = false; - } else if (PSP_CoreParameter().compat.flags().SoftwareRasterDepth) { - DepthRasterPrim((uint16_t *)Memory::GetPointerWrite(gstate.getDepthBufRawAddress() | 0x04000000), gstate.DepthBufStride(), - gstate.getScissorX1(), gstate.getScissorY1(), gstate.getScissorX2(), gstate.getScissorY2(), drawEngineCommon_->GetTempSpace(), - verts, inds, newPrim, count, decoder, vertTypeID, clockwise); } // As soon as one passes, assume we don't need to check the rest of this batch. onePassed = true; diff --git a/GPU/Vulkan/DrawEngineVulkan.cpp b/GPU/Vulkan/DrawEngineVulkan.cpp index f1279b855a69..e1775722ee3e 100644 --- a/GPU/Vulkan/DrawEngineVulkan.cpp +++ b/GPU/Vulkan/DrawEngineVulkan.cpp @@ -370,6 +370,9 @@ void DrawEngineVulkan::Flush() { } else { renderManager->Draw(descSetIndex, ARRAY_SIZE(dynamicUBOOffsets), dynamicUBOOffsets, vbuf, vbOffset, vertexCount); } + if (useDepthRaster_) { + DepthRasterTransform(prim, dec_, dec_->VertexType()); + } } else { PROFILE_THIS_SCOPE("soft"); VertexDecoder *swDec = dec_; @@ -438,6 +441,12 @@ void DrawEngineVulkan::Flush() { swTransform.SetProjMatrix(gstate.projMatrix, gstate_c.vpWidth < 0, gstate_c.vpHeight < 0, trans, scale); swTransform.Transform(prim, swDec->VertexType(), swDec->GetDecVtxFmt(), numDecodedVerts_, &result); + + // At this point, rect and line primitives are still preserved as such. So, it's the best time to do software depth raster. + if (useDepthRaster_) { + DepthRasterPretransformed(prim, transformed_, numDecodedVerts_); + } + // Non-zero depth clears are unusual, but some drivers don't match drawn depth values to cleared values. // Games sometimes expect exact matches (see #12626, for example) for equal comparisons. if (result.action == SW_CLEAR && everUsedEqualDepth_ && gstate.isClearModeDepthMask() && result.depth > 0.0f && result.depth < 1.0f)