diff --git a/Common/GPU/Shader.h b/Common/GPU/Shader.h index 12a7ddb8492d..75e1a9b2e95c 100644 --- a/Common/GPU/Shader.h +++ b/Common/GPU/Shader.h @@ -85,6 +85,12 @@ struct UniformBufferDesc { std::vector uniforms; }; +struct UniformDef { + const char *type; + const char *name; + int index; +}; + struct SamplerDef { const char *name; // TODO: Might need unsigned samplers, 3d samplers, or other types in the future. diff --git a/Common/GPU/ShaderWriter.h b/Common/GPU/ShaderWriter.h index a2d80a69101c..7464ebc652a3 100644 --- a/Common/GPU/ShaderWriter.h +++ b/Common/GPU/ShaderWriter.h @@ -22,12 +22,6 @@ struct InputDef { int semantic; }; -struct UniformDef { - const char *type; - const char *name; - int index; -}; - struct VaryingDef { const char *type; const char *name; diff --git a/Common/Math/math_util.h b/Common/Math/math_util.h index 0807d47003b1..fd47662b5409 100644 --- a/Common/Math/math_util.h +++ b/Common/Math/math_util.h @@ -28,6 +28,7 @@ inline bool isPowerOf2(int n) { return n == 1 || (n & (n - 1)) == 0; } +// Next power of 2. inline uint32_t RoundUpToPowerOf2(uint32_t v) { v--; v |= v >> 1; diff --git a/GPU/Common/DepalettizeShaderCommon.cpp b/GPU/Common/DepalettizeShaderCommon.cpp index e98c42485804..1746215ada46 100644 --- a/GPU/Common/DepalettizeShaderCommon.cpp +++ b/GPU/Common/DepalettizeShaderCommon.cpp @@ -26,6 +26,7 @@ #include "Core/Reporting.h" #include "GPU/Common/GPUStateUtils.h" #include "GPU/Common/DepalettizeShaderCommon.h" +#include "GPU/Common/Draw2D.h" static const InputDef vsInputs[2] = { { "vec2", "a_position", Draw::SEM_POSITION, }, @@ -47,10 +48,23 @@ void GenerateDepalShader300(ShaderWriter &writer, const DepalConfig &config) { const int shift = config.shift; const int mask = config.mask; + writer.C(" vec2 texcoord = v_texcoord;\n"); + + // Implement the swizzle we need to simulate, if a game uses 8888 framebuffers and any other mode than "6" to access depth textures. 
+ // This implements the "2" mode swizzle (it fixes up the Y direction but not X. See comments on issue #15898) + // NOTE: This swizzle can be made to work with any power-of-2 resolution scaleFactor by shifting + // the bits around, but not sure how to handle 3x scaling. For now this is 1x-only (rough edges at higher resolutions). if (config.bufferFormat == GE_FORMAT_DEPTH16) { DepthScaleFactors factors = GetDepthScaleFactors(); writer.ConstFloat("z_scale", factors.scale); writer.ConstFloat("z_offset", factors.offset); + if (config.depthUpperBits == 0x2) { + writer.C(R"( + int x = int((texcoord.x / scaleFactor) * texSize.x); + int temp = (x & 0xFFFFFE0F) | ((x >> 1) & 0xF0) | ((x << 4) & 0x100); + texcoord.x = (float(temp) / texSize.x) * scaleFactor; +)"); + } } // Sampling turns our texture into floating point. To avoid this, might be able @@ -66,7 +80,7 @@ void GenerateDepalShader300(ShaderWriter &writer, const DepalConfig &config) { // An alternative would be to have a special mode where we keep some extra precision here and sample the CLUT linearly - works for ramps such // as those that Test Drive uses for its color remapping. But would need game specific flagging. - writer.C(" vec4 color = ").SampleTexture2D("tex", "v_texcoord").C(";\n"); + writer.C(" vec4 color = ").SampleTexture2D("tex", "texcoord").C(";\n"); int shiftedMask = mask << shift; switch (config.bufferFormat) { @@ -103,6 +117,7 @@ void GenerateDepalShader300(ShaderWriter &writer, const DepalConfig &config) { if (config.bufferFormat == GE_FORMAT_DEPTH16 && config.textureFormat == GE_TFMT_5650) { // Convert depth to 565, without going through a CLUT. + // TODO: Make "depal without a CLUT" a separate concept, to avoid redundantly creating a CLUT texture. 
writer.C(" int idepth = int(clamp(depth, 0.0, 65535.0));\n"); writer.C(" float r = float(idepth & 31) / 31.0f;\n"); writer.C(" float g = float((idepth >> 5) & 63) / 63.0f;\n"); @@ -323,7 +338,7 @@ void GenerateDepalSmoothed(ShaderWriter &writer, const DepalConfig &config) { void GenerateDepalFs(ShaderWriter &writer, const DepalConfig &config) { writer.DeclareSamplers(samplers); writer.HighPrecisionFloat(); - writer.BeginFSMain(Slice::empty(), varyings, FSFLAG_NONE); + writer.BeginFSMain(config.bufferFormat == GE_FORMAT_DEPTH16 ? g_draw2Duniforms : Slice::empty(), varyings, FSFLAG_NONE); if (config.smoothedDepal) { // Handles a limited set of cases, but doesn't need any integer math so we don't // need two variants. diff --git a/GPU/Common/DepalettizeShaderCommon.h b/GPU/Common/DepalettizeShaderCommon.h index 433dfa74df82..0f72afe27c36 100644 --- a/GPU/Common/DepalettizeShaderCommon.h +++ b/GPU/Common/DepalettizeShaderCommon.h @@ -27,13 +27,14 @@ class ShaderWriter; static const int DEPAL_TEXTURE_OLD_AGE = 120; struct DepalConfig { - int mask; - int shift; u32 startPos; + u8 mask; + u8 shift; + bool smoothedDepal; + u8 depthUpperBits; GEPaletteFormat clutFormat; GETextureFormat textureFormat; GEBufferFormat bufferFormat; - bool smoothedDepal; }; void GenerateDepalFs(ShaderWriter &writer, const DepalConfig &config); diff --git a/GPU/Common/Draw2D.cpp b/GPU/Common/Draw2D.cpp index 1043858891ec..cd54704e9ab0 100644 --- a/GPU/Common/Draw2D.cpp +++ b/GPU/Common/Draw2D.cpp @@ -40,7 +40,7 @@ static const SamplerDef samplers[1] = { { "tex" }, }; -static const UniformDef uniforms[2] = { +const UniformDef g_draw2Duniforms[2] = { { "vec2", "texSize", 0 }, { "float", "scaleFactor", 1}, }; @@ -53,7 +53,7 @@ struct Draw2DUB { const UniformBufferDesc draw2DUBDesc{ sizeof(Draw2DUB), { { "texSize", -1, 0, UniformType::FLOAT2, 0 }, - { "scaleFactor", -1, 1, UniformType::FLOAT1, 0 }, + { "scaleFactor", -1, 1, UniformType::FLOAT1, 8 }, } }; @@ -102,7 +102,7 @@ Draw2DPipelineInfo 
GenerateDraw2D565ToDepthFs(ShaderWriter &writer) { Draw2DPipelineInfo GenerateDraw2D565ToDepthDeswizzleFs(ShaderWriter &writer) { writer.DeclareSamplers(samplers); - writer.BeginFSMain(uniforms, varyings, FSFLAG_WRITEDEPTH); + writer.BeginFSMain(g_draw2Duniforms, varyings, FSFLAG_WRITEDEPTH); writer.C(" vec4 outColor = vec4(0.0, 0.0, 0.0, 0.0);\n"); // Unlike when just copying a depth buffer, here we're generating new depth values so we'll // have to apply the scaling. @@ -253,6 +253,20 @@ Draw2DPipeline *Draw2D::Create2DPipeline(std::function samplers; }; +extern const UniformDef g_draw2Duniforms[2]; + struct Draw2DPipeline { Draw::Pipeline *pipeline; Draw2DPipelineInfo info; @@ -58,6 +60,8 @@ class Draw2D { Draw2DPipeline *Create2DPipeline(std::function generate); void DrawStrip2D(Draw::Texture *tex, Draw2DVertex *verts, int vertexCount, bool linearFilter, Draw2DPipeline *pipeline, float texW = 0.0f, float texH = 0.0f, int scaleFactor = 0); + + void Blit(Draw2DPipeline *pipeline, float srcX1, float srcY1, float srcX2, float srcY2, float dstX1, float dstY1, float dstX2, float dstY2, float srcWidth, float srcHeight, float dstWidth, float dstHeight, bool linear, int scaleFactor); void Ensure2DResources(); private: diff --git a/GPU/Common/FramebufferManagerCommon.cpp b/GPU/Common/FramebufferManagerCommon.cpp index a75b0d4b851a..1b25a4608538 100644 --- a/GPU/Common/FramebufferManagerCommon.cpp +++ b/GPU/Common/FramebufferManagerCommon.cpp @@ -27,6 +27,7 @@ #include "Common/Math/math_util.h" #include "Common/System/Display.h" #include "Common/CommonTypes.h" +#include "Common/StringUtils.h" #include "Core/Config.h" #include "Core/ConfigValues.h" #include "Core/Core.h" @@ -348,7 +349,7 @@ VirtualFramebuffer *FramebufferManagerCommon::DoSetRenderFrameBuffer(const Frame const int x_offset = (params.fb_address - v->fb_address) / bpp; if (x_offset < params.fb_stride && v->height >= drawing_height) { // Pretty certainly a pure render-to-X-offset. 
- WARN_LOG_REPORT_ONCE(renderoffset, HLE, "Rendering to framebuffer offset: %08x +%dx%d", v->fb_address, x_offset, 0); + WARN_LOG_REPORT_ONCE(renderoffset, HLE, "Rendering to framebuffer offset at %08x +%dx%d (stride %d)", v->fb_address, x_offset, 0, v->fb_stride); vfb = v; gstate_c.SetCurRTOffset(x_offset, 0); vfb->width = std::max((int)vfb->width, x_offset + drawing_width); @@ -446,32 +447,6 @@ VirtualFramebuffer *FramebufferManagerCommon::DoSetRenderFrameBuffer(const Frame // TODO: Is it worth trying to upload the depth buffer (only if it wasn't copied above..?) } - // Let's check for depth buffer overlap. Might be interesting (not that interesting anymore..) - bool sharingReported = false; - for (size_t i = 0, end = vfbs_.size(); i < end; ++i) { - if (vfbs_[i]->z_stride != 0 && params.fb_address == vfbs_[i]->z_address) { - // If it's clearing it, most likely it just needs more video memory. - // Technically it could write something interesting and the other might not clear, but that's not likely. - if (params.isDrawing) { - if (params.fb_address != params.z_address && vfbs_[i]->fb_address != vfbs_[i]->z_address) { - WARN_LOG_REPORT(SCEGE, "FBO created from existing depthbuffer as color, %08x/%08x and %08x/%08x", params.fb_address, params.z_address, vfbs_[i]->fb_address, vfbs_[i]->z_address); - } - } - } else if (params.z_stride != 0 && params.z_address == vfbs_[i]->fb_address) { - // If it's clearing it, then it's probably just the reverse of the above case. - if (params.isWritingDepth) { - WARN_LOG_REPORT(SCEGE, "FBO using existing buffer as depthbuffer, %08x/%08x and %08x/%08x", params.fb_address, params.z_address, vfbs_[i]->fb_address, vfbs_[i]->z_address); - } - } else if (vfbs_[i]->z_stride != 0 && params.z_address == vfbs_[i]->z_address && params.fb_address != vfbs_[i]->fb_address && !sharingReported) { - // This happens a lot, but virtually always it's cleared. - // It's possible the other might not clear, but when every game is reported it's not useful. 
- if (params.isWritingDepth && (vfbs_[i]->usageFlags & FB_USAGE_RENDER_DEPTH)) { - WARN_LOG(SCEGE, "FBO reusing depthbuffer, c=%08x/d=%08x and c=%08x/d=%08x", params.fb_address, params.z_address, vfbs_[i]->fb_address, vfbs_[i]->z_address); - sharingReported = true; - } - } - } - // We already have it! } else if (vfb != currentRenderVfb_) { // Use it as a render target. @@ -604,21 +579,30 @@ void FramebufferManagerCommon::CopyToDepthFromOverlappingFramebuffers(VirtualFra } // Can't easily dynamically create these strings, we just pass along the pointer. -static const char *reinterpretStrings[3][3] = { +static const char *reinterpretStrings[4][4] = { { "self_reinterpret_565", "reinterpret_565_to_5551", "reinterpret_565_to_4444", + "reinterpret_565_to_8888", }, { "reinterpret_5551_to_565", "self_reinterpret_5551", "reinterpret_5551_to_4444", + "reinterpret_5551_to_8888", }, { "reinterpret_4444_to_565", "reinterpret_4444_to_5551", "self_reinterpret_4444", + "reinterpret_4444_to_8888", + }, + { + "reinterpret_8888_to_565", + "reinterpret_8888_to_5551", + "reinterpret_8888_to_4444", + "self_reinterpret_8888", }, }; @@ -676,6 +660,17 @@ void FramebufferManagerCommon::CopyToColorFromOverlappingFramebuffers(VirtualFra continue; } sources.push_back(CopySource{ src, RASTER_COLOR, xOffset, yOffset }); + } else if (src->fb_address == dst->fb_address && src->FbStrideInBytes() == dst->FbStrideInBytes()) { + if (src->fb_stride == dst->fb_stride * 2) { + // Reinterpret from 16-bit to 32-bit. + sources.push_back(CopySource{ src, RASTER_COLOR, 0, 0 }); + } else if (src->fb_stride * 2 == dst->fb_stride) { + // Reinterpret from 32-bit to 16-bit. + sources.push_back(CopySource{ src, RASTER_COLOR, 0, 0 }); + } else { + // 16-to-16 reinterpret, should have been caught above already. 
+ _assert_msg_(false, "Reinterpret: Shouldn't get here"); + } } } @@ -685,11 +680,15 @@ void FramebufferManagerCommon::CopyToColorFromOverlappingFramebuffers(VirtualFra bool tookActions = false; + // TODO: Only do the latest one. for (const CopySource &source : sources) { VirtualFramebuffer *src = source.vfb; // Copy a rectangle from the original to the new buffer. // Yes, we mean to look at src->width/height for the dest rectangle. + + // TODO: Try to bound the blit using gstate_c.vertBounds like depal does. + int srcWidth = src->width * src->renderScaleFactor; int srcHeight = src->height * src->renderScaleFactor; int dstWidth = src->width * dst->renderScaleFactor; @@ -707,44 +706,55 @@ void FramebufferManagerCommon::CopyToColorFromOverlappingFramebuffers(VirtualFra gpuStats.numColorCopies++; pipeline = Get2DPipeline(DRAW2D_COPY_COLOR); pass_name = "copy_color"; - } else if (IsBufferFormat16Bit(src->fb_format) && IsBufferFormat16Bit(dst->fb_format)) { - if (PSP_CoreParameter().compat.flags().ReinterpretFramebuffers) { - if (PSP_CoreParameter().compat.flags().BlueToAlpha) { - WARN_LOG_ONCE(bta, G3D, "WARNING: Reinterpret encountered with BlueToAlpha on"); - } + } else if (PSP_CoreParameter().compat.flags().ReinterpretFramebuffers) { + if (PSP_CoreParameter().compat.flags().BlueToAlpha) { + WARN_LOG_ONCE(bta, G3D, "WARNING: Reinterpret encountered with BlueToAlpha on"); + } - // Reinterpret! 
- WARN_LOG_N_TIMES(reint, 20, G3D, "Reinterpret detected from %08x_%s to %08x_%s", - src->fb_address, GeBufferFormatToString(src->fb_format), - dst->fb_address, GeBufferFormatToString(dst->fb_format)); - pipeline = reinterpretFromTo_[(int)src->fb_format][(int)dst->fb_format]; - pass_name = reinterpretStrings[(int)src->fb_format][(int)dst->fb_format]; - if (!pipeline) { - pipeline = draw2D_.Create2DPipeline([=](ShaderWriter &shaderWriter) -> Draw2DPipelineInfo { - return GenerateReinterpretFragmentShader(shaderWriter, src->fb_format, dst->fb_format); - }); - reinterpretFromTo_[(int)src->fb_format][(int)dst->fb_format] = pipeline; - } - gpuStats.numReinterpretCopies++; - } else { - // Fake reinterpret - just clear the way we always did on Vulkan. Just clear color and stencil. - if (src->fb_format == GE_FORMAT_565) { - // We have to bind here instead of clear, since it can be that no framebuffer is bound. - // The backend can sometimes directly optimize it to a clear. - - // Games that are marked as doing reinterpret just ignore this - better to keep the data than to clear. - // Fixes #13717. - if (!PSP_CoreParameter().compat.flags().ReinterpretFramebuffers && !PSP_CoreParameter().compat.flags().BlueToAlpha) { - draw_->BindFramebufferAsRenderTarget(dst->fbo, { Draw::RPAction::CLEAR, Draw::RPAction::KEEP, Draw::RPAction::CLEAR }, "FakeReinterpret"); - // Need to dirty anything that has command buffer dynamic state, in case we started a new pass above. - // Should find a way to feed that information back, maybe... Or simply correct the issue in the rendermanager. - gstate_c.Dirty(DIRTY_DEPTHSTENCIL_STATE | DIRTY_VIEWPORTSCISSOR_STATE | DIRTY_BLEND_STATE); - } + if (IsBufferFormat16Bit(src->fb_format) && !IsBufferFormat16Bit(dst->fb_format)) { + // We halve the X coordinates in the destination framebuffer. + // The shader will collect two pixels worth of input data and merge into one. 
+ dstX1 *= 0.5f; + dstX2 *= 0.5f; + } else if (!IsBufferFormat16Bit(src->fb_format) && IsBufferFormat16Bit(dst->fb_format)) { + // We double the X coordinates in the destination framebuffer. + // The shader will sample and depending on the X coordinate & 1, use the upper or lower bits. + dstX1 *= 2.0f; + dstX2 *= 2.0f; + } + + // Reinterpret! + WARN_LOG_N_TIMES(reint, 5, G3D, "Reinterpret detected from %08x_%s to %08x_%s", + src->fb_address, GeBufferFormatToString(src->fb_format), + dst->fb_address, GeBufferFormatToString(dst->fb_format)); + pipeline = reinterpretFromTo_[(int)src->fb_format][(int)dst->fb_format]; + pass_name = reinterpretStrings[(int)src->fb_format][(int)dst->fb_format]; + if (!pipeline) { + pipeline = draw2D_.Create2DPipeline([=](ShaderWriter &shaderWriter) -> Draw2DPipelineInfo { + return GenerateReinterpretFragmentShader(shaderWriter, src->fb_format, dst->fb_format); + }); + reinterpretFromTo_[(int)src->fb_format][(int)dst->fb_format] = pipeline; + } + + gpuStats.numReinterpretCopies++; + } else if (IsBufferFormat16Bit(src->fb_format) && IsBufferFormat16Bit(dst->fb_format)) { + // Fake reinterpret - just clear the way we always did on Vulkan. Just clear color and stencil. + if (src->fb_format == GE_FORMAT_565) { + // We have to bind here instead of clear, since it can be that no framebuffer is bound. + // The backend can sometimes directly optimize it to a clear. + + // Games that are marked as doing reinterpret just ignore this - better to keep the data than to clear. + // Fixes #13717. + if (!PSP_CoreParameter().compat.flags().ReinterpretFramebuffers && !PSP_CoreParameter().compat.flags().BlueToAlpha) { + draw_->BindFramebufferAsRenderTarget(dst->fbo, { Draw::RPAction::CLEAR, Draw::RPAction::KEEP, Draw::RPAction::CLEAR }, "FakeReinterpret"); + // Need to dirty anything that has command buffer dynamic state, in case we started a new pass above. + // Should find a way to feed that information back, maybe... 
Or simply correct the issue in the rendermanager. + gstate_c.Dirty(DIRTY_DEPTHSTENCIL_STATE | DIRTY_VIEWPORTSCISSOR_STATE | DIRTY_BLEND_STATE); + tookActions = true; } - tookActions = true; } } - + if (pipeline) { tookActions = true; // OK we have the pipeline, now just do the blit. @@ -1435,6 +1445,9 @@ void FramebufferManagerCommon::ResizeFramebufFBO(VirtualFramebuffer *vfb, int w, vfb->renderHeight = (u16)(vfb->bufferHeight * renderScaleFactor_); } + bool creating = old.bufferWidth == 0; + WARN_LOG(FRAMEBUF, "%s %s FBO at %08x/%d from %dx%d to %dx%d (force=%d)", creating ? "Creating" : "Resizing", GeBufferFormatToString(vfb->fb_format), vfb->fb_address, vfb->fb_stride, old.bufferWidth, old.bufferHeight, vfb->bufferWidth, vfb->bufferHeight, (int)force); + // During hardware rendering, we always render at full color depth even if the game wouldn't on real hardware. // It's not worth the trouble trying to support lower bit-depth rendering, just // more cases to test that nobody will ever use. @@ -1608,68 +1621,121 @@ bool FramebufferManagerCommon::NotifyFramebufferCopy(u32 src, u32 dst, int size, } } -void FramebufferManagerCommon::FindTransferFramebuffer(VirtualFramebuffer *&buffer, u32 basePtr, int stride, int &x, int &y, int &width, int &height, int bpp, bool destination) { - u32 xOffset = -1; - u32 yOffset = -1; - int transferWidth = width; - int transferHeight = height; +std::string BlockTransferRect::ToString() const { + int bpp = BufferFormatBytesPerPixel(vfb->fb_format); + return StringFromFormat("%08x/%d/%s seq:%d %d,%d %dx%d", vfb->fb_address, vfb->FbStrideInBytes(), GeBufferFormatToString(vfb->fb_format), vfb->colorBindSeq, x_bytes / bpp, y, w_bytes / bpp, h); +} +// Only looks for color buffers. Due to swizzling and other concerns, games have not been seen using block copies +// for depth data yet. 
+bool FramebufferManagerCommon::FindTransferFramebuffer(u32 basePtr, int stride_pixels, int x_pixels, int y, int w_pixels, int h, int bpp, bool destination, BlockTransferRect *rect) { basePtr &= 0x3FFFFFFF; + rect->vfb = nullptr; + + if (!stride_pixels) { + WARN_LOG(G3D, "Zero stride in FindTransferFrameBuffer, ignoring"); + return false; + } + + const u32 byteStride = stride_pixels * bpp; + int x_bytes = x_pixels * bpp; + int w_bytes = w_pixels * bpp; + + std::vector candidates; + + // We work entirely in bytes when we do the matching, because games don't consistently use bpps that match + // that of their buffers. Then after matching we try to map the copy to the simplest operation that does + // what we need. for (auto vfb : vfbs_) { const u32 vfb_address = vfb->fb_address & 0x3FFFFFFF; const u32 vfb_size = ColorBufferByteSize(vfb); + + if (basePtr < vfb_address || basePtr >= vfb_address + vfb_size) { + continue; + } + const u32 vfb_bpp = BufferFormatBytesPerPixel(vfb->fb_format); - const u32 vfb_byteStride = vfb->fb_stride * vfb_bpp; - const u32 vfb_byteWidth = vfb->width * vfb_bpp; - - if (vfb_address <= basePtr && basePtr < vfb_address + vfb_size) { - const u32 byteOffset = basePtr - vfb_address; - const u32 byteStride = stride * bpp; - const u32 memYOffset = byteOffset / byteStride; - - // Some games use mismatching bitdepths. But make sure the stride matches. - // If it doesn't, generally this means we detected the framebuffer with too large a height. - // Use bufferHeight in case of buffers that resize up and down often per frame (Valkyrie Profile.) - - // TODO: Surely this first comparison should be <= ? - // Or does the exact match (byteOffset == 0) case get handled elsewhere? - bool match = memYOffset < yOffset && (int)memYOffset <= (int)vfb->bufferHeight - height; - if (match && vfb_byteStride != byteStride) { - // Grand Knights History copies with a mismatching stride but a full line at a time. 
- // That's why we multiply by height, not width - this copy is a rectangle with the wrong stride but a line with the correct one. - // Makes it hard to detect the wrong transfers in e.g. God of War. - if (transferWidth != stride || (byteStride * transferHeight != vfb_byteStride && byteStride * transferHeight != vfb_byteWidth)) { - if (destination) { - // However, some other games write cluts to framebuffers. - // Let's catch this and upload. Otherwise reject the match. - match = (vfb->usageFlags & FB_USAGE_CLUT) != 0; - if (match) { - width = byteStride * transferHeight / vfb_bpp; - height = 1; - } + const u32 vfb_byteStride = vfb->FbStrideInBytes(); + const u32 vfb_byteWidth = vfb->WidthInBytes(); + + BlockTransferRect candidate{ vfb }; + candidate.w_bytes = w_pixels * bpp; + candidate.h = h; + + const u32 byteOffset = basePtr - vfb_address; + const int memXOffset = byteOffset % byteStride; + const int memYOffset = byteOffset / byteStride; + + // Some games use mismatching bitdepths. But make sure the stride matches. + // If it doesn't, generally this means we detected the framebuffer with too large a height. + // Use bufferHeight in case of buffers that resize up and down often per frame (Valkyrie Profile.) + + // If it's outside the vfb by a single pixel, we currently disregard it. + if (memYOffset > vfb->bufferHeight - h) { + continue; + } + + if (byteOffset == vfb->WidthInBytes() && w_bytes < vfb->FbStrideInBytes()) { + // Looks like we're in a margin texture of the vfb, which is not the vfb itself. + // Ignore the match. + continue; + } + + if (vfb_byteStride != byteStride) { + // Grand Knights History occasionally copies with a mismatching stride but a full line at a time. + // That's why we multiply by height, not width - this copy is a rectangle with the wrong stride but a line with the correct one. + // Makes it hard to detect the wrong transfers in e.g. God of War. 
+ if (w_pixels != stride_pixels || (byteStride * h != vfb_byteStride && byteStride * h != vfb_byteWidth)) { + if (destination) { + // However, some other games write cluts to framebuffers. + // Let's catch this and upload. Otherwise reject the match. + bool match = (vfb->usageFlags & FB_USAGE_CLUT) != 0; + if (match) { + candidate.w_bytes = byteStride * h; + h = 1; } else { - match = false; + continue; } } else { - width = byteStride * transferHeight / vfb_bpp; - height = 1; + continue; } - } else if (match) { - width = transferWidth; - height = transferHeight; - } - if (match) { - xOffset = stride == 0 ? 0 : (byteOffset / bpp) % stride; - yOffset = memYOffset; - buffer = vfb; + } else { + // This is the Grand Knights History case. + candidate.w_bytes = byteStride * h; + candidate.h = 1; } + } else { + candidate.w_bytes = w_bytes; + candidate.h = h; } + + candidate.x_bytes = x_bytes + memXOffset; + candidate.y = y + memYOffset; + candidate.vfb = vfb; + candidates.push_back(candidate); } - if (yOffset != (u32)-1) { - x += xOffset; - y += yOffset; + // Sort candidates by just recency for now, we might add other. + std::sort(candidates.begin(), candidates.end()); + + if (candidates.size() > 1) { + std::string log; + for (auto &candidate : candidates) { + log += " - " + candidate.ToString() + "\n"; + } + WARN_LOG_N_TIMES(mulblock, 5, G3D, "Multiple framebuffer candidates for %08x/%d/%d %d,%d %dx%d (dest = %d):\n%s", basePtr, stride_pixels, bpp, x_pixels, y, w_pixels, h, (int)destination, log.c_str()); + } + + if (!candidates.empty()) { + // Pick the last candidate. 
+ *rect = candidates.back(); + return true; + } else { + if (Memory::IsVRAMAddress(basePtr) && destination && h >= 128) { + WARN_LOG_N_TIMES(nocands, 5, G3D, "Didn't find a destination candidate for %08x/%d/%d %d,%d %dx%d", basePtr, stride_pixels, bpp, x_pixels, y, w_pixels, h); + } + return false; } } @@ -1866,92 +1932,125 @@ bool FramebufferManagerCommon::NotifyBlockTransferBefore(u32 dstBasePtr, int dst return false; } - VirtualFramebuffer *dstBuffer = 0; - VirtualFramebuffer *srcBuffer = 0; - int srcWidth = width; - int srcHeight = height; - int dstWidth = width; - int dstHeight = height; + BlockTransferRect dstRect{}; + BlockTransferRect srcRect{}; // These modify the X/Y/W/H parameters depending on the memory offset of the base pointers from the actual buffers. - FindTransferFramebuffer(srcBuffer, srcBasePtr, srcStride, srcX, srcY, srcWidth, srcHeight, bpp, false); - FindTransferFramebuffer(dstBuffer, dstBasePtr, dstStride, dstX, dstY, dstWidth, dstHeight, bpp, true); + bool srcBuffer = FindTransferFramebuffer(srcBasePtr, srcStride, srcX, srcY, width, height, bpp, false, &srcRect); + bool dstBuffer = FindTransferFramebuffer(dstBasePtr, dstStride, dstX, dstY, width, height, bpp, true, &dstRect); if (srcBuffer && !dstBuffer) { + // In here, we can't read from dstRect. if (PSP_CoreParameter().compat.flags().BlockTransferAllowCreateFB || (PSP_CoreParameter().compat.flags().IntraVRAMBlockTransferAllowCreateFB && - Memory::IsVRAMAddress(srcBuffer->fb_address) && Memory::IsVRAMAddress(dstBasePtr))) { + Memory::IsVRAMAddress(srcRect.vfb->fb_address) && Memory::IsVRAMAddress(dstBasePtr))) { GEBufferFormat ramFormat; // Try to guess the appropriate format. We only know the bpp from the block transfer command (16 or 32 bit). if (bpp == 4) { // Only one possibility unless it's doing split pixel tricks (which we could detect through stride maybe). 
ramFormat = GE_FORMAT_8888; - } else if (srcBuffer->fb_format != GE_FORMAT_8888) { + } else if (srcRect.vfb->fb_format != GE_FORMAT_8888) { // We guess that the game will interpret the data the same as it was in the source of the copy. // Seems like a likely good guess, and works in Test Drive Unlimited. - ramFormat = srcBuffer->fb_format; + ramFormat = srcRect.vfb->fb_format; } else { // No info left - just fall back to something. But this is definitely split pixel tricks. ramFormat = GE_FORMAT_5551; } - dstBuffer = CreateRAMFramebuffer(dstBasePtr, dstWidth, dstHeight, dstStride, ramFormat); + dstBuffer = true; + dstRect.vfb = CreateRAMFramebuffer(dstBasePtr, width, height, dstStride, ramFormat); } } - if (dstBuffer) - dstBuffer->last_frame_used = gpuStats.numFlips; + if (dstBuffer) { + dstRect.vfb->last_frame_used = gpuStats.numFlips; + // Mark the destination as fresh. + dstRect.vfb->colorBindSeq = GetBindSeqCount(); + } if (dstBuffer && srcBuffer) { - if (srcBuffer == dstBuffer) { - if (srcX != dstX || srcY != dstY) { - WARN_LOG_N_TIMES(dstsrc, 100, G3D, "Intra-buffer block transfer %dx%d %dbpp from %08x (x:%d y:%d stride:%d) -> %08x (x:%d y:%d stride:%d)", - width, height, bpp, - srcBasePtr, srcX, srcY, srcStride, - dstBasePtr, dstX, dstY, dstStride); - FlushBeforeCopy(); - // Some backends can handle blitting within a framebuffer. Others will just have to deal with it or ignore it, apparently. - BlitFramebuffer(dstBuffer, dstX, dstY, srcBuffer, srcX, srcY, dstWidth, dstHeight, bpp, "Blit_IntraBufferBlockTransfer"); - RebindFramebuffer("rebind after intra block transfer"); - SetColorUpdated(dstBuffer, skipDrawReason); - return true; // Skip the memory copy. - } else { + if (srcRect.vfb == dstRect.vfb) { + // Transfer within the same buffer. + // This is a simple case because there will be no format conversion or similar shenanigans needed. + // However, the BPP might still mismatch, but in such a case we can convert the coordinates. 
+ if (srcX == dstX && srcY == dstY) { // Ignore, nothing to do. Tales of Phantasia X does this by accident. - return true; // Skip the memory copy. + // Returning true to also skip the memory copy. + return true; } - } else { - WARN_LOG_N_TIMES(dstnotsrc, 100, G3D, "Inter-buffer block transfer %dx%d %dbpp from %08x (x:%d y:%d stride:%d) -> %08x (x:%d y:%d stride:%d)", + + int buffer_bpp = BufferFormatBytesPerPixel(srcRect.vfb->fb_format); + + if (bpp != buffer_bpp) { + WARN_LOG_ONCE(intrabpp, G3D, "Mismatched transfer bpp in intra-buffer block transfer. Was %d, expected %d.", bpp, buffer_bpp); + // We just switch to using the buffer's bpp, since we've already converted the rectangle to byte offsets. + bpp = buffer_bpp; + } + + WARN_LOG_N_TIMES(dstsrc, 5, G3D, "Intra-buffer block transfer %dx%d %dbpp from %08x (x:%d y:%d stride:%d) -> %08x (x:%d y:%d stride:%d)", + width, height, bpp, + srcBasePtr, srcRect.x_bytes / bpp, srcRect.y, srcStride, + dstBasePtr, dstRect.x_bytes / bpp, dstRect.y, dstStride); + FlushBeforeCopy(); + // Some backends can handle blitting within a framebuffer. Others will just have to deal with it or ignore it, apparently. + BlitFramebuffer(dstRect.vfb, dstX, dstY, srcRect.vfb, srcX, srcY, dstRect.w_bytes / bpp, dstRect.h / bpp, bpp, "Blit_IntraBufferBlockTransfer"); + RebindFramebuffer("rebind after intra block transfer"); + SetColorUpdated(dstRect.vfb, skipDrawReason); + return true; // Skip the memory copy. + } + + // Straightforward blit between two same-format framebuffers. + if (srcRect.vfb->fb_format == dstRect.vfb->fb_format) { + WARN_LOG_N_TIMES(dstnotsrc, 5, G3D, "Inter-buffer block transfer %dx%d %dbpp from %08x (x:%d y:%d stride:%d %s) -> %08x (x:%d y:%d stride:%d %s)", width, height, bpp, - srcBasePtr, srcX, srcY, srcStride, - dstBasePtr, dstX, dstY, dstStride); - // Straightforward blit between two framebuffers. 
+ srcBasePtr, srcRect.x_bytes / bpp, srcRect.y, srcStride, GeBufferFormatToString(srcRect.vfb->fb_format), + dstBasePtr, dstRect.x_bytes / bpp, dstRect.y, dstStride, GeBufferFormatToString(dstRect.vfb->fb_format)); + + // Straight blit will do, but check the bpp, we might need to convert coordinates differently. + int buffer_bpp = BufferFormatBytesPerPixel(srcRect.vfb->fb_format); + if (bpp != buffer_bpp) { + WARN_LOG_ONCE(intrabpp, G3D, "Mismatched transfer bpp in inter-buffer block transfer. Was %d, expected %d.", bpp, buffer_bpp); + // We just switch to using the buffer's bpp, since we've already converted the rectangle to byte offsets. + bpp = buffer_bpp; + } FlushBeforeCopy(); - BlitFramebuffer(dstBuffer, dstX, dstY, srcBuffer, srcX, srcY, dstWidth, dstHeight, bpp, "Blit_InterBufferBlockTransfer"); + BlitFramebuffer(dstRect.vfb, dstRect.x_bytes / bpp, dstRect.y, srcRect.vfb, srcRect.x_bytes / bpp, srcRect.y, srcRect.w_bytes / bpp, height, bpp, "Blit_InterBufferBlockTransfer"); RebindFramebuffer("RebindFramebuffer - Inter-buffer block transfer"); - SetColorUpdated(dstBuffer, skipDrawReason); - return true; // No need to actually do the memory copy behind, probably. + SetColorUpdated(dstRect.vfb, skipDrawReason); + return true; } - return false; + + // Getting to the more complex cases. Have not actually seen much of these yet. + WARN_LOG_N_TIMES(blockformat, 5, G3D, "Mismatched buffer formats in block transfer: %s->%s (%dx%d)", + GeBufferFormatToString(srcRect.vfb->fb_format), GeBufferFormatToString(dstRect.vfb->fb_format), + width, height); + + // TODO + + // No need to actually do the memory copy behind, probably. + return true; + } else if (dstBuffer) { // Here we should just draw the pixels into the buffer. Copy first. 
return false; } else if (srcBuffer) { - WARN_LOG_N_TIMES(btd, 100, G3D, "Block transfer readback %dx%d %dbpp from %08x (x:%d y:%d stride:%d) -> %08x (x:%d y:%d stride:%d)", + WARN_LOG_N_TIMES(btd, 10, G3D, "Block transfer readback %dx%d %dbpp from %08x (x:%d y:%d stride:%d) -> %08x (x:%d y:%d stride:%d)", width, height, bpp, - srcBasePtr, srcX, srcY, srcStride, - dstBasePtr, dstX, dstY, dstStride); + srcBasePtr, srcRect.x_bytes / bpp, srcRect.y, srcStride, + dstBasePtr, dstRect.x_bytes / bpp, dstRect.y, dstStride); FlushBeforeCopy(); - if (g_Config.bBlockTransferGPU && !srcBuffer->memoryUpdated) { - const int srcBpp = BufferFormatBytesPerPixel(srcBuffer->fb_format); + if (g_Config.bBlockTransferGPU && !srcRect.vfb->memoryUpdated) { + const int srcBpp = BufferFormatBytesPerPixel(srcRect.vfb->fb_format); const float srcXFactor = (float)bpp / srcBpp; - const bool tooTall = srcY + srcHeight > srcBuffer->bufferHeight; - if (srcHeight <= 0 || (tooTall && srcY != 0)) { - WARN_LOG_ONCE(btdheight, G3D, "Block transfer download %08x -> %08x skipped, %d+%d is taller than %d", srcBasePtr, dstBasePtr, srcY, srcHeight, srcBuffer->bufferHeight); + const bool tooTall = srcY + srcRect.h > srcRect.vfb->bufferHeight; + if (srcRect.h <= 0 || (tooTall && srcY != 0)) { + WARN_LOG_ONCE(btdheight, G3D, "Block transfer download %08x -> %08x skipped, %d+%d is taller than %d", srcBasePtr, dstBasePtr, srcRect.y, srcRect.h, srcRect.vfb->bufferHeight); } else { if (tooTall) { - WARN_LOG_ONCE(btdheight, G3D, "Block transfer download %08x -> %08x dangerous, %d+%d is taller than %d", srcBasePtr, dstBasePtr, srcY, srcHeight, srcBuffer->bufferHeight); + WARN_LOG_ONCE(btdheight, G3D, "Block transfer download %08x -> %08x dangerous, %d+%d is taller than %d", srcBasePtr, dstBasePtr, srcRect.y, srcRect.h, srcRect.vfb->bufferHeight); } - ReadFramebufferToMemory(srcBuffer, static_cast(srcX * srcXFactor), srcY, static_cast(srcWidth * srcXFactor), srcHeight); - srcBuffer->usageFlags = (srcBuffer->usageFlags 
| FB_USAGE_DOWNLOAD) & ~FB_USAGE_DOWNLOAD_CLEAR; + ReadFramebufferToMemory(srcRect.vfb, static_cast(srcX * srcXFactor), srcY, static_cast(srcRect.w_bytes * srcXFactor), srcRect.h); + srcRect.vfb->usageFlags = (srcRect.vfb->usageFlags | FB_USAGE_DOWNLOAD) & ~FB_USAGE_DOWNLOAD_CLEAR; } } return false; // Let the bit copy happen @@ -1975,18 +2074,17 @@ void FramebufferManagerCommon::NotifyBlockTransferAfter(u32 dstBasePtr, int dstS if (MayIntersectFramebuffer(srcBasePtr) || MayIntersectFramebuffer(dstBasePtr)) { // TODO: Figure out how we can avoid repeating the search here. - VirtualFramebuffer *dstBuffer = 0; - VirtualFramebuffer *srcBuffer = 0; - int srcWidth = width; - int srcHeight = height; - int dstWidth = width; - int dstHeight = height; - FindTransferFramebuffer(srcBuffer, srcBasePtr, srcStride, srcX, srcY, srcWidth, srcHeight, bpp, false); - FindTransferFramebuffer(dstBuffer, dstBasePtr, dstStride, dstX, dstY, dstWidth, dstHeight, bpp, true); + + BlockTransferRect dstRect{}; + BlockTransferRect srcRect{}; + + // These modify the X/Y/W/H parameters depending on the memory offset of the base pointers from the actual buffers. + bool srcBuffer = FindTransferFramebuffer(srcBasePtr, srcStride, srcX, srcY, width, height, bpp, false, &srcRect); + bool dstBuffer = FindTransferFramebuffer(dstBasePtr, dstStride, dstX, dstY, width, height, bpp, true, &dstRect); // A few games use this INSTEAD of actually drawing the video image to the screen, they just blast it to // the backbuffer. Detect this and have the framebuffermanager draw the pixels. 
- if (!useBufferedRendering_ && currentRenderVfb_ != dstBuffer) { + if (!useBufferedRendering_ && currentRenderVfb_ != dstRect.vfb) { return; } @@ -1994,21 +2092,21 @@ void FramebufferManagerCommon::NotifyBlockTransferAfter(u32 dstBasePtr, int dstS WARN_LOG_ONCE(btu, G3D, "Block transfer upload %08x -> %08x", srcBasePtr, dstBasePtr); FlushBeforeCopy(); const u8 *srcBase = Memory::GetPointerUnchecked(srcBasePtr) + (srcX + srcY * srcStride) * bpp; - int dstBpp = BufferFormatBytesPerPixel(dstBuffer->fb_format); + int dstBpp = BufferFormatBytesPerPixel(dstRect.vfb->fb_format); float dstXFactor = (float)bpp / dstBpp; - if (dstWidth > dstBuffer->width || dstHeight > dstBuffer->height) { + if (dstRect.w_bytes / bpp > dstRect.vfb->width || dstRect.h > dstRect.vfb->height) { // The buffer isn't big enough, and we have a clear hint of size. Resize. // This happens in Valkyrie Profile when uploading video at the ending. - ResizeFramebufFBO(dstBuffer, dstWidth, dstHeight, false, true); + ResizeFramebufFBO(dstRect.vfb, dstRect.w_bytes / bpp, dstRect.h, false, true); // Make sure we don't flop back and forth. - dstBuffer->newWidth = std::max(dstWidth, (int)dstBuffer->width); - dstBuffer->newHeight = std::max(dstHeight, (int)dstBuffer->height); - dstBuffer->lastFrameNewSize = gpuStats.numFlips; + dstRect.vfb->newWidth = std::max(dstRect.w_bytes / bpp, (int)dstRect.vfb->width); + dstRect.vfb->newHeight = std::max(dstRect.h, (int)dstRect.vfb->height); + dstRect.vfb->lastFrameNewSize = gpuStats.numFlips; // Resizing may change the viewport/etc. 
gstate_c.Dirty(DIRTY_VIEWPORTSCISSOR_STATE | DIRTY_CULLRANGE); } - DrawPixels(dstBuffer, static_cast(dstX * dstXFactor), dstY, srcBase, dstBuffer->fb_format, static_cast(srcStride * dstXFactor), static_cast(dstWidth * dstXFactor), dstHeight); - SetColorUpdated(dstBuffer, skipDrawReason); + DrawPixels(dstRect.vfb, static_cast(dstX * dstXFactor), dstY, srcBase, dstRect.vfb->fb_format, static_cast(srcStride * dstXFactor), static_cast(dstRect.w_bytes / bpp * dstXFactor), dstRect.h); + SetColorUpdated(dstRect.vfb, skipDrawReason); RebindFramebuffer("RebindFramebuffer - NotifyBlockTransferAfter"); } } @@ -2469,8 +2567,8 @@ void FramebufferManagerCommon::DeviceLost() { presentation_->DeviceLost(); - for (int i = 0; i < 3; i++) { - for (int j = 0; j < 3; j++) { + for (int i = 0; i < ARRAY_SIZE(reinterpretFromTo_); i++) { + for (int j = 0; j < ARRAY_SIZE(reinterpretFromTo_); j++) { DoRelease(reinterpretFromTo_[i][j]); } } @@ -2666,17 +2764,6 @@ void FramebufferManagerCommon::BlitUsingRaster( draw_->GetFramebufferDimensions(src, &srcW, &srcH); draw_->GetFramebufferDimensions(dest, &destW, &destH); - float dX = 1.0f / (float)destW; - float dY = 1.0f / (float)destH; - float sX = 1.0f / (float)srcW; - float sY = 1.0f / (float)srcH; - Draw2DVertex vtx[4] = { - { -1.0f + 2.0f * dX * destX1, -(1.0f - 2.0f * dY * destY1), sX * srcX1, sY * srcY1 }, - { -1.0f + 2.0f * dX * destX2, -(1.0f - 2.0f * dY * destY1), sX * srcX2, sY * srcY1 }, - { -1.0f + 2.0f * dX * destX1, -(1.0f - 2.0f * dY * destY2), sX * srcX1, sY * srcY2 }, - { -1.0f + 2.0f * dX * destX2, -(1.0f - 2.0f * dY * destY2), sX * srcX2, sY * srcY2 }, - }; - // Unbind the texture first to avoid the D3D11 hazard check (can't set render target to things bound as textures and vice versa, not even temporarily). draw_->BindTexture(0, nullptr); // This will get optimized away in case it's already bound (in VK and GL at least..) 
@@ -2687,7 +2774,7 @@ void FramebufferManagerCommon::BlitUsingRaster( draw_->SetViewports(1, &vp); draw_->SetScissorRect(0, 0, (int)dest->Width(), (int)dest->Height()); - draw2D_.DrawStrip2D(nullptr, vtx, 4, linearFilter, pipeline, src->Width(), src->Height(), renderScaleFactor_); + draw2D_.Blit(pipeline, srcX1, srcY1, srcX2, srcY2, destX1, destY1, destX2, destY2, (float)srcW, (float)srcH, (float)destW, (float)destH, linearFilter , renderScaleFactor_); gstate_c.Dirty(DIRTY_BLEND_STATE | DIRTY_DEPTHSTENCIL_STATE | DIRTY_RASTER_STATE | DIRTY_VIEWPORTSCISSOR_STATE | DIRTY_VERTEXSHADER_STATE | DIRTY_FRAGMENTSHADER_STATE); } @@ -2702,19 +2789,31 @@ VirtualFramebuffer *FramebufferManagerCommon::ResolveFramebufferColorToFormat(Vi continue; } - if (dest->fb_address == src->fb_address && dest->fb_stride == src->fb_stride && dest->fb_format == newFormat) { + if (dest->fb_address == src->fb_address && dest->FbStrideInBytes() == src->FbStrideInBytes() && dest->fb_format == newFormat) { vfb = dest; break; } } if (!vfb) { - WARN_LOG(G3D, "Creating %s clone of %08x/%08x/%s", GeBufferFormatToString(newFormat), src->fb_address, src->z_address, GeBufferFormatToString(src->fb_format)); - // Create a clone! vfb = new VirtualFramebuffer(); *vfb = *src; // Copies everything, but watch out! Can't copy fbo. + + // Adjust width by bpp. 
+ float widthFactor = (float)BufferFormatBytesPerPixel(vfb->fb_format) / (float)BufferFormatBytesPerPixel(newFormat); + + vfb->width *= widthFactor; + vfb->bufferWidth *= widthFactor; + vfb->renderWidth *= widthFactor; + vfb->drawnWidth *= widthFactor; + vfb->newWidth *= widthFactor; + vfb->safeWidth *= widthFactor; + vfb->fb_format = newFormat; + + WARN_LOG(G3D, "Creating %s clone of %08x/%08x/%s (%dx%d -> %dx%d)", GeBufferFormatToString(newFormat), src->fb_address, src->z_address, GeBufferFormatToString(src->fb_format), src->width, src->height, vfb->width, vfb->height); + char tag[128]; FormatFramebufferName(vfb, tag, sizeof(tag)); vfb->fbo = draw_->CreateFramebuffer({ vfb->renderWidth, vfb->renderHeight, 1, 1, true, tag }); diff --git a/GPU/Common/FramebufferManagerCommon.h b/GPU/Common/FramebufferManagerCommon.h index eb070b8347c1..deb13e7f6acb 100644 --- a/GPU/Common/FramebufferManagerCommon.h +++ b/GPU/Common/FramebufferManagerCommon.h @@ -142,6 +142,11 @@ struct VirtualFramebuffer { int last_frame_failed; int last_frame_depth_updated; int last_frame_depth_render; + + // Convenience methods + inline int WidthInBytes() const { return width * BufferFormatBytesPerPixel(fb_format); } + inline int FbStrideInBytes() const { return fb_stride * BufferFormatBytesPerPixel(fb_format); } + inline int ZStrideInBytes() const { return z_stride * 2; } }; struct FramebufferHeuristicParams { @@ -213,6 +218,31 @@ inline Draw::DataFormat GEFormatToThin3D(int geFormat) { } } +// Dimensions are in bytes, later steps get to convert back into real coordinates as appropriate. +// Makes it easy to see if blits match etc. +struct BlockTransferRect { + VirtualFramebuffer *vfb; + // RasterChannel channel; // We currently only deal with color for block copies. 
+ + int x_bytes; + int y; + int w_bytes; + int h; + + std::string ToString() const; + + int w_pixels() const { + return w_bytes / BufferFormatBytesPerPixel(vfb->fb_format); + } + int x_pixels() const { + return x_bytes / BufferFormatBytesPerPixel(vfb->fb_format); + } + + bool operator < (const BlockTransferRect &other) const { + return vfb->colorBindSeq < other.vfb->colorBindSeq; + } +}; + namespace Draw { class DrawContext; } @@ -418,7 +448,7 @@ class FramebufferManagerCommon { bool ShouldDownloadFramebuffer(const VirtualFramebuffer *vfb) const; void DownloadFramebufferOnSwitch(VirtualFramebuffer *vfb); - void FindTransferFramebuffer(VirtualFramebuffer *&srcBuffer, u32 srcBasePtr, int srcStride, int &srcX, int &srcY, int &srcWidth, int &srcHeight, int bpp, bool destination); + bool FindTransferFramebuffer(u32 basePtr, int stride, int x, int y, int w, int h, int bpp, bool destination, BlockTransferRect *rect); VirtualFramebuffer *FindDownloadTempBuffer(VirtualFramebuffer *vfb); virtual void UpdateDownloadTempBuffer(VirtualFramebuffer *nvfb) {} @@ -503,10 +533,10 @@ class FramebufferManagerCommon { FBO_OLD_USAGE_FLAG = 15, }; - // Thin3D stuff for reinterpreting image data between the various 16-bit formats. + // Thin3D stuff for reinterpreting image data between the various 16-bit color formats. // Safe, not optimal - there might be input attachment tricks, etc, but we can't use them // since we don't want N different implementations. - Draw2DPipeline *reinterpretFromTo_[3][3]{}; + Draw2DPipeline *reinterpretFromTo_[4][4]{}; // Common implementation of stencil buffer upload. Also not 100% optimal, but not performance // critical either. 
diff --git a/GPU/Common/GPUStateUtils.h b/GPU/Common/GPUStateUtils.h index 41ee3adc73dc..dcf3cd7c051e 100644 --- a/GPU/Common/GPUStateUtils.h +++ b/GPU/Common/GPUStateUtils.h @@ -4,6 +4,7 @@ #include "Common/CommonTypes.h" #include "GPU/ge_constants.h" +#include "GPU/GPUState.h" // TODO: Replace enums and structs with same from thin3d.h, for convenient mapping. @@ -198,3 +199,21 @@ struct GenericStencilFuncState { }; void ConvertStencilFuncState(GenericStencilFuncState &stencilFuncState); + +// See issue #15898 +inline bool SpongebobDepthInverseConditions(const GenericStencilFuncState &stencilState) { + // Check that the depth/stencil state matches the conditions exactly + return gstate.isDepthTestEnabled() && !gstate.isDepthWriteEnabled() && + gstate.getDepthTestFunction() == GE_COMP_GEQUAL && + stencilState.zFail == GE_STENCILOP_ZERO && stencilState.sFail == GE_STENCILOP_KEEP && stencilState.zPass == GE_STENCILOP_KEEP && + stencilState.testFunc == GE_COMP_ALWAYS && stencilState.writeMask == 0xFF && + // And also verify no color is written. The game does this through simple alpha blending with a constant zero alpha. + // We also check for color mask, since it's more natural, in case another game does it. 
+ /* Color writes must be suppressed one way or the other; the outer parentheses keep the || from bypassing the && chain. */ ((gstate.isAlphaBlendEnabled() && + gstate.getBlendFuncA() == GE_SRCBLEND_SRCALPHA && + gstate.getBlendFuncB() == GE_DSTBLEND_INVSRCALPHA && + gstate.getMaterialAmbientA() == 0x0 && // our accessor is kinda misnamed here, but material diffuse A is both used as default color and as ambient alpha + gstate.getMaterialUpdate() == 0 && + !gstate.isTextureMapEnabled() + ) || gstate.getColorMask() == 0xFFFFFF00); // note that PSP masks are "inverted" +} diff --git a/GPU/Common/ReinterpretFramebuffer.cpp b/GPU/Common/ReinterpretFramebuffer.cpp index cbcf463c25de..b30c71e504b5 100644 --- a/GPU/Common/ReinterpretFramebuffer.cpp +++ b/GPU/Common/ReinterpretFramebuffer.cpp @@ -24,80 +24,136 @@ Draw2DPipelineInfo GenerateReinterpretFragmentShader(ShaderWriter &writer, GEBuf writer.DeclareSamplers(samplers); - writer.BeginFSMain(Slice::empty(), varyings, FSFLAG_NONE); - - writer.C(" vec4 val = ").SampleTexture2D("tex", "v_texcoord.xy").C(";\n"); - if (writer.Lang().bitwiseOps) { switch (from) { case GE_FORMAT_4444: - writer.C(" uint color = uint(val.r * 15.99) | (uint(val.g * 15.99) << 4u) | (uint(val.b * 15.99) << 8u) | (uint(val.a * 15.99) << 12u);\n"); + writer.C("uint packColor(vec4 val) {\n"); + writer.C(" return uint(val.r * 15.99) | (uint(val.g * 15.99) << 4u) | (uint(val.b * 15.99) << 8u) | (uint(val.a * 15.99) << 12u);\n"); + writer.C("}\n"); break; case GE_FORMAT_5551: + writer.C("uint packColor(vec4 val) {\n"); writer.C(" uint color = uint(val.r * 31.99) | (uint(val.g * 31.99) << 5u) | (uint(val.b * 31.99) << 10u);\n"); writer.C(" if (val.a >= 0.5) color |= 0x8000U;\n"); + writer.C(" return color;\n"); + writer.C("}\n"); + break; + case GE_FORMAT_565: + writer.C("uint packColor(vec4 val) {\n"); + writer.C(" return uint(val.r * 31.99) | (uint(val.g * 63.99) << 5u) | (uint(val.b * 31.99) << 11u);\n"); + writer.C("}\n"); + break; + case GE_FORMAT_8888: + writer.C("uint packColor(vec2 val) {\n"); + writer.C(" return uint(val.r * 255.99) | (uint(val.g * 255.99) << 
8u);\n"); + writer.C("}\n"); + break; + default: + _assert_(false); + break; + } + } else { + // Floating point can comfortably represent integers up to 16 million, we only need 65536 since these textures are 16-bit. + switch (from) { + case GE_FORMAT_4444: + writer.C("float packColor(vec4 val) {\n"); + writer.C(" return (floor(val.r * 15.99) + floor(val.g * 15.99) * 16.0) + (floor(val.b * 15.99) * 256.0 + floor(val.a * 15.99) * 4096.0);\n"); + writer.C("}\n"); + break; + case GE_FORMAT_5551: + writer.C("float packColor(vec4 val) {\n"); + writer.C(" float color = floor(val.r * 31.99) + floor(val.g * 31.99) * 32.0 + floor(val.b * 31.99) * 1024.0;\n"); + writer.C(" if (val.a >= 0.5) color += 32768.0;\n"); + writer.C(" return color;\n"); + writer.C("}\n"); break; case GE_FORMAT_565: - writer.C(" uint color = uint(val.r * 31.99) | (uint(val.g * 63.99) << 5u) | (uint(val.b * 31.99) << 11u);\n"); + writer.C("float packColor(vec4 val) {\n"); + writer.C(" return floor(val.r * 31.99) + floor(val.g * 63.99) * 32.0 + floor(val.b * 31.99) * 2048.0;\n"); + writer.C("}\n"); + break; + case GE_FORMAT_8888: + writer.C("float packColor(vec2 val) {\n"); + writer.C(" return floor(val.r * 255.99) + floor(val.g * 255.99) * 256.0;\n"); + writer.C("}\n"); break; default: _assert_(false); break; } + } + if (writer.Lang().bitwiseOps) { switch (to) { case GE_FORMAT_4444: + writer.C("vec4 unpackColor(uint color) {\n"); writer.C(" vec4 outColor = vec4(float(color & 0xFU), float((color >> 4u) & 0xFU), float((color >> 8u) & 0xFU), float((color >> 12u) & 0xFU));\n"); writer.C(" outColor *= 1.0 / 15.0;\n"); + writer.C(" return outColor;\n"); + writer.C("}\n"); break; case GE_FORMAT_5551: + writer.C("vec4 unpackColor(uint color) {\n"); writer.C(" vec4 outColor = vec4(float(color & 0x1FU), float((color >> 5u) & 0x1FU), float((color >> 10u) & 0x1FU), 0.0);\n"); writer.C(" outColor.rgb *= 1.0 / 31.0;\n"); writer.C(" outColor.a = float(color >> 15);\n"); + writer.C(" return outColor;\n"); + 
writer.C("}\n"); break; case GE_FORMAT_565: + writer.C("vec4 unpackColor(uint color) {\n"); writer.C(" vec4 outColor = vec4(float(color & 0x1FU), float((color >> 5u) & 0x3FU), float((color >> 11u) & 0x1FU), 1.0);\n"); writer.C(" outColor.rb *= 1.0 / 31.0;\n"); writer.C(" outColor.g *= 1.0 / 63.0;\n"); + writer.C(" return outColor;\n"); + writer.C("}\n"); break; - default: - _assert_(false); - break; - } - } else { - // Floating point can comfortably represent integers up to 16 million, we only need 65536 since these textures are 16-bit. - switch (from) { - case GE_FORMAT_4444: - writer.C(" float color = (floor(val.r * 15.99) + floor(val.g * 15.99) * 16.0) + (floor(val.b * 15.99) * 256.0 + floor(val.a * 15.99) * 4096.0);\n"); - break; - case GE_FORMAT_5551: - writer.C(" float color = floor(val.r * 31.99) + floor(val.g * 31.99) * 32.0 + floor(val.b * 31.99) * 1024.0;\n"); - writer.C(" if (val.a >= 0.5) color += 32768.0;\n"); - break; - case GE_FORMAT_565: - writer.C(" float color = floor(val.r * 31.99) + floor(val.g * 63.99) * 32.0 + floor(val.b * 31.99) * 2048.0;\n"); + case GE_FORMAT_8888: + writer.C("vec4 unpackColor(uint colorLeft, uint colorRight) {\n"); + writer.C(" vec4 outColor = vec4(float(colorLeft & 0xFFu), float((colorLeft >> 8u) & 0xFFu),\n"); + writer.C(" float(colorRight & 0xFFu), float((colorRight >> 8u) & 0xFFu));\n"); + writer.C(" outColor *= 1.0 / 255.0;\n"); + writer.C(" return outColor;\n"); + writer.C("}\n"); break; default: _assert_(false); break; } - + } else { switch (to) { case GE_FORMAT_4444: + writer.C("vec4 unpackColor(float color) {\n"); writer.C(" vec4 outColor = vec4(mod(floor(color), 16.0), mod(floor(color / 16.0), 16.0),"); writer.C(" mod(floor(color / 256.0), 16.0), mod(floor(color / 4096.0), 16.0)); \n"); writer.C(" outColor *= 1.0 / 15.0;\n"); + writer.C(" return outColor;\n"); + writer.C("}\n"); break; case GE_FORMAT_5551: + writer.C("vec4 unpackColor(float color) {\n"); writer.C(" vec4 outColor = vec4(mod(floor(color), 32.0), 
mod(floor(color / 32.0), 32.0), mod(floor(color / 1024.0), 32.0), 0.0);\n"); writer.C(" outColor.rgb *= 1.0 / 31.0;\n"); writer.C(" outColor.a = floor(color / 32768.0);\n"); + writer.C(" return outColor;\n"); + writer.C("}\n"); break; case GE_FORMAT_565: + writer.C("vec4 unpackColor(float color) {\n"); writer.C(" vec4 outColor = vec4(mod(floor(color), 32.0), mod(floor(color / 32.0), 64.0), mod(floor(color / 2048.0), 32.0), 0.0);\n"); writer.C(" outColor.rb *= 1.0 / 31.0;\n"); writer.C(" outColor.g *= 1.0 / 63.0;\n"); writer.C(" outColor.a = 1.0;\n"); + writer.C(" return outColor;\n"); + writer.C("}\n"); + break; + case GE_FORMAT_8888: + writer.C("vec4 unpackColor(float colorLeft, float colorRight) {\n"); + writer.C(" vec4 outColor = vec4(mod(floor(colorLeft), 256.0), mod(floor(colorLeft / 256.0), 256.0),\n"); + writer.C(" mod(floor(colorRight), 256.0), mod(floor(colorRight / 256.0), 256.0));\n"); + writer.C(" outColor *= 1.0 / 255.0;\n"); + writer.C(" return outColor;\n"); + writer.C("}\n"); break; default: _assert_(false); @@ -105,6 +161,27 @@ Draw2DPipelineInfo GenerateReinterpretFragmentShader(ShaderWriter &writer, GEBuf } } + writer.BeginFSMain(g_draw2Duniforms, varyings, FSFLAG_NONE); + + if (IsBufferFormat16Bit(from) && IsBufferFormat16Bit(to)) { + writer.C(" vec4 val = ").SampleTexture2D("tex", "v_texcoord.xy").C(";\n"); + writer.C(" vec4 outColor = unpackColor(packColor(val));\n"); + } else if (IsBufferFormat16Bit(from) && !IsBufferFormat16Bit(to)) { + // 16-to-32-bit (two pixels, draw size is halved) + + writer.C(" vec4 valLeft = ").SampleTexture2D("tex", "v_texcoord.xy + vec2(-0.25 / texSize.x, 0.0)").C(";\n"); + writer.C(" vec4 valRight = ").SampleTexture2D("tex", "v_texcoord.xy + vec2(0.25 / texSize.x, 0.0)").C(";\n"); + writer.C(" vec4 outColor = unpackColor(packColor(valLeft), packColor(valRight));\n"); + + _assert_("not yet implemented"); + } else if (!IsBufferFormat16Bit(from) && IsBufferFormat16Bit(to)) { + // 32-to-16-bit (half of the pixel, draw 
size is doubled). + + writer.C(" vec4 val = ").SampleTexture2D("tex", "v_texcoord.xy").C(";\n"); + writer.C(" float u = mod(floor(v_texcoord.x * texSize.x * 2.0), 2.0);\n"); + writer.C(" vec4 outColor = unpackColor(u == 0.0 ? packColor(val.rg) : packColor(val.ba));\n"); + } + writer.EndFSMain("outColor", FSFLAG_NONE); return Draw2DPipelineInfo{ diff --git a/GPU/Common/TextureCacheCommon.cpp b/GPU/Common/TextureCacheCommon.cpp index 87746043cbb3..e6ce09ae62cf 100644 --- a/GPU/Common/TextureCacheCommon.cpp +++ b/GPU/Common/TextureCacheCommon.cpp @@ -662,28 +662,11 @@ int TextureCacheCommon::GetBestCandidateIndex(const std::vector int bestRelevancy = -1; int bestIndex = -1; - // TODO: Instead of scores, we probably want to use std::min_element to pick the top element, using - // a comparison function. + // We simply use the sequence counter as relevancy nowadays. for (int i = 0; i < (int)candidates.size(); i++) { const AttachCandidate &candidate = candidates[i]; int relevancy = candidate.seqCount; - // Bonus point for matching stride. - if (candidate.channel == RASTER_COLOR && candidate.fb->fb_stride == candidate.entry.bufw) { - relevancy += 1000; - } - - // Bonus points for no offset. - if (candidate.match.xOffset == 0 && candidate.match.yOffset == 0) { - relevancy += 100; - } - - if (candidate.channel == RASTER_COLOR && candidate.fb->last_frame_render == gpuStats.numFlips) { - relevancy += 50; - } else if (candidate.channel == RASTER_DEPTH && candidate.fb->last_frame_depth_render == gpuStats.numFlips) { - relevancy += 50; - } - if (relevancy > bestRelevancy) { bestRelevancy = relevancy; bestIndex = i; @@ -907,20 +890,18 @@ bool TextureCacheCommon::MatchFramebuffer( // If they match "exactly", it's non-CLUT and from the top left. if (exactMatch) { + // TODO: Better checks for compatible strides here. 
if (fb_stride != entry.bufw) { - WARN_LOG_ONCE(diffStrides1, G3D, "Found matching framebuffer with different strides %d != %d", entry.bufw, (int)fb_stride); + WARN_LOG_ONCE(diffStrides1, G3D, "Found matching framebuffer at %08x with different strides %d != %d", fb_address, entry.bufw, (int)fb_stride); } // NOTE: This check is okay because the first texture formats are the same as the buffer formats. if (IsTextureFormatBufferCompatible(entry.format)) { if (TextureFormatMatchesBufferFormat(entry.format, fb_format) || (framebuffer->usageFlags & FB_USAGE_BLUE_TO_ALPHA)) { return true; - } else if (IsTextureFormat16Bit(entry.format) && IsBufferFormat16Bit(fb_format) && channel == RASTER_COLOR) { - WARN_LOG_ONCE(diffFormat1, G3D, "Found matching framebuffer with reinterpretable fb_format: %s != %s", GeTextureFormatToString(entry.format), GeBufferFormatToString(fb_format)); + } else { + WARN_LOG_ONCE(diffFormat1, G3D, "Found matching framebuffer with reinterpretable fb_format: %s != %s at %08x", GeTextureFormatToString(entry.format), GeBufferFormatToString(fb_format), fb_address); *matchInfo = FramebufferMatchInfo{ 0, 0, true, TextureFormatToBufferFormat(entry.format) }; return true; - } else { - WARN_LOG_ONCE(diffFormat2, G3D, "Rejecting framebuffer with incompatible formats %s != %s", GeTextureFormatToString(entry.format), GeBufferFormatToString(fb_format)); - return false; } } else { // Format incompatible, ignoring without comment. (maybe some really gnarly hacks will end up here...) @@ -954,7 +935,7 @@ bool TextureCacheCommon::MatchFramebuffer( if (fb_stride != entry.bufw) { if (noOffset) { - WARN_LOG_ONCE(diffStrides2, G3D, "Matching framebuffer(matching_clut = % s) different strides % d != % d", matchingClutFormat ? "yes" : "no", entry.bufw, fb_stride); + WARN_LOG_ONCE(diffStrides2, G3D, "Matching framebuffer(matching_clut = %s) different strides %d != %d", matchingClutFormat ? "yes" : "no", entry.bufw, fb_stride); // Continue on with other checks. 
// Not actually sure why we even try here. There's no way it'll go well if the strides are different. } else { @@ -984,7 +965,7 @@ bool TextureCacheCommon::MatchFramebuffer( } return true; } else if (IsClutFormat((GETextureFormat)(entry.format)) || IsDXTFormat((GETextureFormat)(entry.format))) { - WARN_LOG_ONCE(fourEightBit, G3D, "%s fb_format not supported when texturing from framebuffer of format %s", GeTextureFormatToString(entry.format), GeBufferFormatToString(fb_format)); + WARN_LOG_ONCE(fourEightBit, G3D, "%s fb_format not matching framebuffer of format %s at %08x/%d", GeTextureFormatToString(entry.format), GeBufferFormatToString(fb_format), fb_address, fb_stride); return false; } @@ -1024,9 +1005,18 @@ void TextureCacheCommon::SetTextureFramebuffer(const AttachCandidate &candidate) nextFramebufferTextureChannel_ = RASTER_COLOR; if (framebufferManager_->UseBufferedRendering()) { + // Detect when we need to apply the horizontal texture swizzle. + u64 depthUpperBits = (channel == RASTER_DEPTH && framebuffer->fb_format == GE_FORMAT_8888) ? ((gstate.getTextureAddress(0) & 0x600000) >> 20) : 0; + bool needsDepthXSwizzle = depthUpperBits == 2; + // We need to force it, since we may have set it on a texture before attaching. 
gstate_c.curTextureWidth = framebuffer->bufferWidth; gstate_c.curTextureHeight = framebuffer->bufferHeight; + + if (needsDepthXSwizzle) { + gstate_c.curTextureWidth = RoundUpToPowerOf2(gstate_c.curTextureWidth); + } + if (gstate_c.bgraTexture) { gstate_c.Dirty(DIRTY_FRAGMENTSHADER_STATE); } else if ((gstate_c.curTextureXOffset == 0) != (fbInfo.xOffset == 0) || (gstate_c.curTextureYOffset == 0) != (fbInfo.yOffset == 0)) { @@ -1884,6 +1874,7 @@ static bool CanUseSmoothDepal(const GPUgstate &gstate, GEBufferFormat framebuffe return false; } + void TextureCacheCommon::ApplyTextureFramebuffer(VirtualFramebuffer *framebuffer, GETextureFormat texFormat, RasterChannel channel) { Draw2DPipeline *textureShader = nullptr; uint32_t clutMode = gstate.clutformat & 0xFFFFFF; @@ -1910,6 +1901,7 @@ void TextureCacheCommon::ApplyTextureFramebuffer(VirtualFramebuffer *framebuffer const GEPaletteFormat clutFormat = gstate.getClutPaletteFormat(); ClutTexture clutTexture{}; bool smoothedDepal = false; + u32 depthUpperBits = 0; if (need_depalettize && !g_Config.bDisableSlowFramebufEffects) { clutTexture = textureShaderCache_->GetClutTexture(clutFormat, clutHash_, clutBufRaw_); @@ -1944,20 +1936,33 @@ void TextureCacheCommon::ApplyTextureFramebuffer(VirtualFramebuffer *framebuffer return; } - textureShader = textureShaderCache_->GetDepalettizeShader(clutMode, texFormat, depth ? GE_FORMAT_DEPTH16 : framebuffer->fb_format, smoothedDepal); + depthUpperBits = (depth && framebuffer->fb_format == GE_FORMAT_8888) ? ((gstate.getTextureAddress(0) & 0x600000) >> 20) : 0; + + textureShader = textureShaderCache_->GetDepalettizeShader(clutMode, texFormat, depth ? 
GE_FORMAT_DEPTH16 : framebuffer->fb_format, smoothedDepal, depthUpperBits); gstate_c.SetUseShaderDepal(false, false); } if (textureShader) { const GEPaletteFormat clutFormat = gstate.getClutPaletteFormat(); ClutTexture clutTexture = textureShaderCache_->GetClutTexture(clutFormat, clutHash_, clutBufRaw_); - Draw::Framebuffer *depalFBO = framebufferManager_->GetTempFBO(TempFBO::DEPAL, framebuffer->renderWidth, framebuffer->renderHeight); + + bool needsDepthXSwizzle = depthUpperBits == 2; + + int depalWidth = framebuffer->renderWidth; + int texWidth = framebuffer->width; + if (needsDepthXSwizzle) { + texWidth = RoundUpToPowerOf2(framebuffer->width); + depalWidth = texWidth * framebuffer->renderScaleFactor; + gstate_c.Dirty(DIRTY_UVSCALEOFFSET); + } + + Draw::Framebuffer *depalFBO = framebufferManager_->GetTempFBO(TempFBO::DEPAL, depalWidth, framebuffer->renderHeight); draw_->BindTexture(0, nullptr); draw_->BindTexture(1, nullptr); draw_->BindFramebufferAsRenderTarget(depalFBO, { Draw::RPAction::DONT_CARE, Draw::RPAction::DONT_CARE, Draw::RPAction::DONT_CARE }, "Depal"); - draw_->SetScissorRect(0, 0, (int)framebuffer->renderWidth, (int)framebuffer->renderHeight); - Draw::Viewport vp{ 0.0f, 0.0f, (float)framebuffer->renderWidth, (float)framebuffer->renderHeight, 0.0f, 1.0f }; + draw_->SetScissorRect(0, 0, (int)depalWidth, (int)framebuffer->renderHeight); + Draw::Viewport vp{ 0.0f, 0.0f, (float)depalWidth, (float)framebuffer->renderHeight, 0.0f, 1.0f }; draw_->SetViewports(1, &vp); draw_->BindFramebufferAsTexture(framebuffer->fbo, 0, depth ? 
Draw::FB_DEPTH_BIT : Draw::FB_COLOR_BIT, 0); @@ -1967,9 +1972,28 @@ void TextureCacheCommon::ApplyTextureFramebuffer(VirtualFramebuffer *framebuffer draw_->BindSamplerStates(0, 1, &nearest); draw_->BindSamplerStates(1, 1, &clutSampler); - textureShaderCache_->ApplyShader(textureShader, - framebuffer->bufferWidth, framebuffer->bufferHeight, framebuffer->renderWidth, framebuffer->renderHeight, - gstate_c.vertBounds, gstate_c.curTextureXOffset, gstate_c.curTextureYOffset); + // If min is not < max, then we don't have values (wasn't set during decode.) + const KnownVertexBounds &bounds = gstate_c.vertBounds; + float u1 = 0.0f; + float v1 = 0.0f; + float u2 = depalWidth; + float v2 = framebuffer->renderHeight; + if (bounds.minV < bounds.maxV) { + u1 = bounds.minU + gstate_c.curTextureXOffset; + v1 = bounds.minV + gstate_c.curTextureYOffset; + u2 = bounds.maxU + gstate_c.curTextureXOffset; + v2 = bounds.maxV + gstate_c.curTextureYOffset; + // We need to reapply the texture next time since we cropped UV. 
+ gstate_c.Dirty(DIRTY_TEXTURE_PARAMS); + } + u1 *= framebuffer->renderScaleFactor; + v1 *= framebuffer->renderScaleFactor; + u2 *= framebuffer->renderScaleFactor; + v2 *= framebuffer->renderScaleFactor; + + draw2D_->Blit(textureShader, u1, v1, u2, v2, u1, v1, u2, v2, framebuffer->renderWidth, framebuffer->renderHeight, depalWidth, framebuffer->renderHeight, false, framebuffer->renderScaleFactor); + + gstate_c.curTextureWidth = texWidth; draw_->BindTexture(0, nullptr); framebufferManager_->RebindFramebuffer("ApplyTextureFramebuffer"); diff --git a/GPU/Common/TextureShaderCommon.cpp b/GPU/Common/TextureShaderCommon.cpp index ab7a280a53df..2584dc595b31 100644 --- a/GPU/Common/TextureShaderCommon.cpp +++ b/GPU/Common/TextureShaderCommon.cpp @@ -187,11 +187,11 @@ void TextureShaderCache::Decimate() { } } -Draw2DPipeline *TextureShaderCache::GetDepalettizeShader(uint32_t clutMode, GETextureFormat textureFormat, GEBufferFormat bufferFormat, bool smoothedDepal) { +Draw2DPipeline *TextureShaderCache::GetDepalettizeShader(uint32_t clutMode, GETextureFormat textureFormat, GEBufferFormat bufferFormat, bool smoothedDepal, u32 depthUpperBits) { using namespace Draw; // Generate an ID for depal shaders. 
- u32 id = (clutMode & 0xFFFFFF) | (textureFormat << 24) | (bufferFormat << 28); + u64 id = ((u64)depthUpperBits << 32) | (clutMode & 0xFFFFFF) | (textureFormat << 24) | (bufferFormat << 28); /* depthUpperBits is widened first: shifting a u32 by 32 is undefined and would not reach the upper half of the key. */ auto shader = depalCache_.find(id); if (shader != depalCache_.end()) { @@ -207,6 +207,7 @@ Draw2DPipeline *TextureShaderCache::GetDepalettizeShader(uint32_t clutMode, GETe config.bufferFormat = bufferFormat; config.textureFormat = textureFormat; config.smoothedDepal = smoothedDepal; + config.depthUpperBits = depthUpperBits; char *buffer = new char[4096]; Draw2DPipeline *ts = draw2D_->Create2DPipeline([=](ShaderWriter &writer) -> Draw2DPipelineInfo { @@ -247,51 +248,3 @@ std::string TextureShaderCache::DebugGetShaderString(std::string idstr, DebugSha return ""; } } - -void TextureShaderCache::ApplyShader(Draw2DPipeline *pipeline, float bufferW, float bufferH, int renderW, int renderH, const KnownVertexBounds &bounds, u32 uoff, u32 voff) { - Draw2DVertex verts[4] = { - {-1, -1, 0, 0 }, - { 1, -1, 1, 0 }, - {-1, 1, 0, 1 }, - { 1, 1, 1, 1 }, - }; - - // If min is not < max, then we don't have values (wasn't set during decode.) - if (bounds.minV < bounds.maxV) { - const float invWidth = 1.0f / bufferW; - const float invHeight = 1.0f / bufferH; - // Inverse of half = double. - const float invHalfWidth = invWidth * 2.0f; - const float invHalfHeight = invHeight * 2.0f; - - const int u1 = bounds.minU + uoff; - const int v1 = bounds.minV + voff; - const int u2 = bounds.maxU + uoff; - const int v2 = bounds.maxV + voff; - - const float left = u1 * invHalfWidth - 1.0f; - const float right = u2 * invHalfWidth - 1.0f; - const float top = v1 * invHalfHeight - 1.0f; - const float bottom = v2 * invHalfHeight - 1.0f; - - const float uvleft = u1 * invWidth; - const float uvright = u2 * invWidth; - const float uvtop = v1 * invHeight; - const float uvbottom = v2 * invHeight; - - // Points are: BL, BR, TR, TL. 
- verts[0] = Draw2DVertex{ left, bottom, uvleft, uvbottom }; - verts[1] = Draw2DVertex{ right, bottom, uvright, uvbottom }; - verts[2] = Draw2DVertex{ left, top, uvleft, uvtop }; - verts[3] = Draw2DVertex{ right, top, uvright, uvtop }; - - // We need to reapply the texture next time since we cropped UV. - gstate_c.Dirty(DIRTY_TEXTURE_PARAMS); - } - - Draw::Viewport vp{ 0.0f, 0.0f, (float)renderW, (float)renderH, 0.0f, 1.0f }; - draw_->BindPipeline(pipeline->pipeline); - draw_->SetViewports(1, &vp); - draw_->SetScissorRect(0, 0, renderW, renderH); - draw_->DrawUP((const uint8_t *)verts, 4); -} diff --git a/GPU/Common/TextureShaderCommon.h b/GPU/Common/TextureShaderCommon.h index 5e0c5fe4765b..a11ee812101f 100644 --- a/GPU/Common/TextureShaderCommon.h +++ b/GPU/Common/TextureShaderCommon.h @@ -43,13 +43,11 @@ class TextureShaderCache { TextureShaderCache(Draw::DrawContext *draw, Draw2D *draw2D); ~TextureShaderCache(); - Draw2DPipeline *GetDepalettizeShader(uint32_t clutMode, GETextureFormat texFormat, GEBufferFormat pixelFormat, bool smoothedDepal); + Draw2DPipeline *GetDepalettizeShader(uint32_t clutMode, GETextureFormat texFormat, GEBufferFormat pixelFormat, bool smoothedDepal, u32 depthUpperBits); ClutTexture GetClutTexture(GEPaletteFormat clutFormat, const u32 clutHash, u32 *rawClut); Draw::SamplerState *GetSampler(bool linearFilter); - void ApplyShader(Draw2DPipeline *pipeline, float bufferW, float bufferH, int renderW, int renderH, const KnownVertexBounds &bounds, u32 uoff, u32 voff); - void Clear(); void Decimate(); std::vector DebugGetShaderIDs(DebugShaderType type); @@ -64,6 +62,6 @@ class TextureShaderCache { Draw::SamplerState *linearSampler_ = nullptr; Draw2D *draw2D_; - std::map depalCache_; + std::map depalCache_; std::map texCache_; }; diff --git a/GPU/D3D11/StateMappingD3D11.cpp b/GPU/D3D11/StateMappingD3D11.cpp index 11db65c55348..b75edef5d59a 100644 --- a/GPU/D3D11/StateMappingD3D11.cpp +++ b/GPU/D3D11/StateMappingD3D11.cpp @@ -217,44 +217,6 @@ void 
DrawEngineD3D11::ApplyDrawState(int prim) { keys_.blend.colorWriteMask = (maskState.rgba[0] ? 1 : 0) | (maskState.rgba[1] ? 2 : 0) | (maskState.rgba[2] ? 4 : 0) | (maskState.rgba[3] ? 8 : 0); } - - if (!device1_) { - ID3D11BlendState *bs = blendCache_.Get(keys_.blend.value); - if (bs == nullptr) { - D3D11_BLEND_DESC desc{}; - D3D11_RENDER_TARGET_BLEND_DESC &rt = desc.RenderTarget[0]; - rt.BlendEnable = keys_.blend.blendEnable; - rt.BlendOp = (D3D11_BLEND_OP)keys_.blend.blendOpColor; - rt.BlendOpAlpha = (D3D11_BLEND_OP)keys_.blend.blendOpAlpha; - rt.SrcBlend = (D3D11_BLEND)keys_.blend.srcColor; - rt.DestBlend = (D3D11_BLEND)keys_.blend.destColor; - rt.SrcBlendAlpha = (D3D11_BLEND)keys_.blend.srcAlpha; - rt.DestBlendAlpha = (D3D11_BLEND)keys_.blend.destAlpha; - rt.RenderTargetWriteMask = keys_.blend.colorWriteMask; - ASSERT_SUCCESS(device_->CreateBlendState(&desc, &bs)); - blendCache_.Insert(keys_.blend.value, bs); - } - blendState_ = bs; - } else { - ID3D11BlendState1 *bs1 = blendCache1_.Get(keys_.blend.value); - if (bs1 == nullptr) { - D3D11_BLEND_DESC1 desc1{}; - D3D11_RENDER_TARGET_BLEND_DESC1 &rt = desc1.RenderTarget[0]; - rt.BlendEnable = keys_.blend.blendEnable; - rt.BlendOp = (D3D11_BLEND_OP)keys_.blend.blendOpColor; - rt.BlendOpAlpha = (D3D11_BLEND_OP)keys_.blend.blendOpAlpha; - rt.SrcBlend = (D3D11_BLEND)keys_.blend.srcColor; - rt.DestBlend = (D3D11_BLEND)keys_.blend.destColor; - rt.SrcBlendAlpha = (D3D11_BLEND)keys_.blend.srcAlpha; - rt.DestBlendAlpha = (D3D11_BLEND)keys_.blend.destAlpha; - rt.RenderTargetWriteMask = keys_.blend.colorWriteMask; - rt.LogicOpEnable = keys_.blend.logicOpEnable; - rt.LogicOp = (D3D11_LOGIC_OP)keys_.blend.logicOp; - ASSERT_SUCCESS(device1_->CreateBlendState1(&desc1, &bs1)); - blendCache1_.Insert(keys_.blend.value, bs1); - } - blendState1_ = bs1; - } } if (gstate_c.IsDirty(DIRTY_RASTER_STATE)) { @@ -275,18 +237,6 @@ void DrawEngineD3D11::ApplyDrawState(int prim) { keys_.raster.depthClipEnable = 1; } } - ID3D11RasterizerState *rs 
= rasterCache_.Get(keys_.raster.value); - if (rs == nullptr) { - D3D11_RASTERIZER_DESC desc{}; - desc.CullMode = (D3D11_CULL_MODE)(keys_.raster.cullMode); - desc.FillMode = D3D11_FILL_SOLID; - desc.ScissorEnable = TRUE; - desc.FrontCounterClockwise = TRUE; - desc.DepthClipEnable = keys_.raster.depthClipEnable; - ASSERT_SUCCESS(device_->CreateRasterizerState(&desc, &rs)); - rasterCache_.Insert(keys_.raster.value, rs); - } - rasterState_ = rs; } if (gstate_c.IsDirty(DIRTY_DEPTHSTENCIL_STATE)) { @@ -343,29 +293,36 @@ void DrawEngineD3D11::ApplyDrawState(int prim) { keys_.depthStencil.stencilWriteMask = stencilState.writeMask; dynState_.useStencil = true; dynState_.stencilRef = stencilState.testRef; + + // Nasty special case for Spongebob and similar where it tries to write zeros to alpha/stencil during + // depth-fail. We can't write to alpha then because the pixel is killed. However, we can invert the depth + // test and modify the alpha function... + if (SpongebobDepthInverseConditions(stencilState)) { + keys_.blend.blendEnable = true; + keys_.blend.blendOpAlpha = D3D11_BLEND_OP_ADD; + keys_.blend.blendOpColor = D3D11_BLEND_OP_ADD; + keys_.blend.srcColor = D3D11_BLEND_ZERO; + keys_.blend.destColor = D3D11_BLEND_ZERO; + keys_.blend.logicOpEnable = false; + keys_.blend.srcAlpha = D3D11_BLEND_ZERO; + keys_.blend.destAlpha = D3D11_BLEND_ZERO; + keys_.blend.colorWriteMask = D3D11_COLOR_WRITE_ENABLE_ALPHA; + + keys_.depthStencil.depthCompareOp = D3D11_COMPARISON_LESS; // Inverse of GREATER_EQUAL + keys_.depthStencil.stencilCompareFunc = D3D11_COMPARISON_ALWAYS; + // Invert + keys_.depthStencil.stencilPassOp = D3D11_STENCIL_OP_ZERO; + keys_.depthStencil.stencilFailOp = D3D11_STENCIL_OP_ZERO; + keys_.depthStencil.stencilDepthFailOp = D3D11_STENCIL_OP_KEEP; + + // TODO: Need to set in a way that carries over to the next draw.. 
+ gstate_c.Dirty(DIRTY_BLEND_STATE); + } } else { keys_.depthStencil.stencilTestEnable = false; dynState_.useStencil = false; } } - ID3D11DepthStencilState *ds = depthStencilCache_.Get(keys_.depthStencil.value); - if (ds == nullptr) { - D3D11_DEPTH_STENCIL_DESC desc{}; - desc.DepthEnable = keys_.depthStencil.depthTestEnable; - desc.DepthWriteMask = keys_.depthStencil.depthWriteEnable ? D3D11_DEPTH_WRITE_MASK_ALL : D3D11_DEPTH_WRITE_MASK_ZERO; - desc.DepthFunc = (D3D11_COMPARISON_FUNC)keys_.depthStencil.depthCompareOp; - desc.StencilEnable = keys_.depthStencil.stencilTestEnable; - desc.StencilReadMask = keys_.depthStencil.stencilCompareMask; - desc.StencilWriteMask = keys_.depthStencil.stencilWriteMask; - desc.FrontFace.StencilFailOp = (D3D11_STENCIL_OP)keys_.depthStencil.stencilFailOp; - desc.FrontFace.StencilPassOp = (D3D11_STENCIL_OP)keys_.depthStencil.stencilPassOp; - desc.FrontFace.StencilDepthFailOp = (D3D11_STENCIL_OP)keys_.depthStencil.stencilDepthFailOp; - desc.FrontFace.StencilFunc = (D3D11_COMPARISON_FUNC)keys_.depthStencil.stencilCompareFunc; - desc.BackFace = desc.FrontFace; - ASSERT_SUCCESS(device_->CreateDepthStencilState(&desc, &ds)); - depthStencilCache_.Insert(keys_.depthStencil.value, ds); - } - depthStencilState_ = ds; } if (gstate_c.IsDirty(DIRTY_VIEWPORTSCISSOR_STATE)) { @@ -397,6 +354,84 @@ void DrawEngineD3D11::ApplyDrawState(int prim) { scissor.bottom = vpAndScissor.scissorY + std::max(0, vpAndScissor.scissorH); } + // Actually create/set the state objects only after we're done mapping all the state. + // There might have been interactions between depth and blend above. 
+ if (gstate_c.IsDirty(DIRTY_BLEND_STATE)) { + if (!device1_) { + ID3D11BlendState *bs = blendCache_.Get(keys_.blend.value); + if (bs == nullptr) { + D3D11_BLEND_DESC desc{}; + D3D11_RENDER_TARGET_BLEND_DESC &rt = desc.RenderTarget[0]; + rt.BlendEnable = keys_.blend.blendEnable; + rt.BlendOp = (D3D11_BLEND_OP)keys_.blend.blendOpColor; + rt.BlendOpAlpha = (D3D11_BLEND_OP)keys_.blend.blendOpAlpha; + rt.SrcBlend = (D3D11_BLEND)keys_.blend.srcColor; + rt.DestBlend = (D3D11_BLEND)keys_.blend.destColor; + rt.SrcBlendAlpha = (D3D11_BLEND)keys_.blend.srcAlpha; + rt.DestBlendAlpha = (D3D11_BLEND)keys_.blend.destAlpha; + rt.RenderTargetWriteMask = keys_.blend.colorWriteMask; + ASSERT_SUCCESS(device_->CreateBlendState(&desc, &bs)); + blendCache_.Insert(keys_.blend.value, bs); + } + blendState_ = bs; + } else { + ID3D11BlendState1 *bs1 = blendCache1_.Get(keys_.blend.value); + if (bs1 == nullptr) { + D3D11_BLEND_DESC1 desc1{}; + D3D11_RENDER_TARGET_BLEND_DESC1 &rt = desc1.RenderTarget[0]; + rt.BlendEnable = keys_.blend.blendEnable; + rt.BlendOp = (D3D11_BLEND_OP)keys_.blend.blendOpColor; + rt.BlendOpAlpha = (D3D11_BLEND_OP)keys_.blend.blendOpAlpha; + rt.SrcBlend = (D3D11_BLEND)keys_.blend.srcColor; + rt.DestBlend = (D3D11_BLEND)keys_.blend.destColor; + rt.SrcBlendAlpha = (D3D11_BLEND)keys_.blend.srcAlpha; + rt.DestBlendAlpha = (D3D11_BLEND)keys_.blend.destAlpha; + rt.RenderTargetWriteMask = keys_.blend.colorWriteMask; + rt.LogicOpEnable = keys_.blend.logicOpEnable; + rt.LogicOp = (D3D11_LOGIC_OP)keys_.blend.logicOp; + ASSERT_SUCCESS(device1_->CreateBlendState1(&desc1, &bs1)); + blendCache1_.Insert(keys_.blend.value, bs1); + } + blendState1_ = bs1; + } + } + + if (gstate_c.IsDirty(DIRTY_RASTER_STATE)) { + ID3D11RasterizerState *rs = rasterCache_.Get(keys_.raster.value); + if (rs == nullptr) { + D3D11_RASTERIZER_DESC desc{}; + desc.CullMode = (D3D11_CULL_MODE)(keys_.raster.cullMode); + desc.FillMode = D3D11_FILL_SOLID; + desc.ScissorEnable = TRUE; + desc.FrontCounterClockwise = 
TRUE; + desc.DepthClipEnable = keys_.raster.depthClipEnable; + ASSERT_SUCCESS(device_->CreateRasterizerState(&desc, &rs)); + rasterCache_.Insert(keys_.raster.value, rs); + } + rasterState_ = rs; + } + + if (gstate_c.IsDirty(DIRTY_DEPTHSTENCIL_STATE)) { + ID3D11DepthStencilState *ds = depthStencilCache_.Get(keys_.depthStencil.value); + if (ds == nullptr) { + D3D11_DEPTH_STENCIL_DESC desc{}; + desc.DepthEnable = keys_.depthStencil.depthTestEnable; + desc.DepthWriteMask = keys_.depthStencil.depthWriteEnable ? D3D11_DEPTH_WRITE_MASK_ALL : D3D11_DEPTH_WRITE_MASK_ZERO; + desc.DepthFunc = (D3D11_COMPARISON_FUNC)keys_.depthStencil.depthCompareOp; + desc.StencilEnable = keys_.depthStencil.stencilTestEnable; + desc.StencilReadMask = keys_.depthStencil.stencilCompareMask; + desc.StencilWriteMask = keys_.depthStencil.stencilWriteMask; + desc.FrontFace.StencilFailOp = (D3D11_STENCIL_OP)keys_.depthStencil.stencilFailOp; + desc.FrontFace.StencilPassOp = (D3D11_STENCIL_OP)keys_.depthStencil.stencilPassOp; + desc.FrontFace.StencilDepthFailOp = (D3D11_STENCIL_OP)keys_.depthStencil.stencilDepthFailOp; + desc.FrontFace.StencilFunc = (D3D11_COMPARISON_FUNC)keys_.depthStencil.stencilCompareFunc; + desc.BackFace = desc.FrontFace; + ASSERT_SUCCESS(device_->CreateDepthStencilState(&desc, &ds)); + depthStencilCache_.Insert(keys_.depthStencil.value, ds); + } + depthStencilState_ = ds; + } + if (gstate_c.IsDirty(DIRTY_TEXTURE_IMAGE | DIRTY_TEXTURE_PARAMS) && !gstate.isModeClear() && gstate.isTextureMapEnabled()) { textureCache_->SetTexture(); gstate_c.Clean(DIRTY_TEXTURE_IMAGE | DIRTY_TEXTURE_PARAMS); diff --git a/GPU/Directx9/StateMappingDX9.cpp b/GPU/Directx9/StateMappingDX9.cpp index 9fb92db6fe7f..a82ccb4605c9 100644 --- a/GPU/Directx9/StateMappingDX9.cpp +++ b/GPU/Directx9/StateMappingDX9.cpp @@ -229,7 +229,6 @@ void DrawEngineDX9::ApplyDrawState(int prim) { } else { dxstate.stencilTest.disable(); } - } else { // Depth Test if (gstate.isDepthTestEnabled()) { @@ -248,6 +247,24 @@ void 
DrawEngineDX9::ApplyDrawState(int prim) { dxstate.stencilCompareMask.set(stencilState.testMask); dxstate.stencilOp.set(stencilOps[stencilState.sFail], stencilOps[stencilState.zFail], stencilOps[stencilState.zPass]); dxstate.stencilWriteMask.set(stencilState.writeMask); + + // Nasty special case for Spongebob and similar where it tries to write zeros to alpha/stencil during + // depth-fail. We can't write to alpha then because the pixel is killed. However, we can invert the depth + // test and modify the alpha function... + if (SpongebobDepthInverseConditions(stencilState)) { + dxstate.blend.set(true); + dxstate.blendEquation.set(D3DBLENDOP_ADD, D3DBLENDOP_ADD); + dxstate.blendFunc.set(D3DBLEND_ZERO, D3DBLEND_ZERO, D3DBLEND_ZERO, D3DBLEND_ZERO); + dxstate.colorMask.set(8); + + dxstate.depthFunc.set(D3DCMP_LESS); + dxstate.stencilFunc.set(D3DCMP_ALWAYS); + // Invert + dxstate.stencilOp.set(D3DSTENCILOP_ZERO, D3DSTENCILOP_KEEP, D3DSTENCILOP_ZERO); + + // TODO: Need to set in a way that carries over to the next draw.. 
+ gstate_c.Dirty(DIRTY_BLEND_STATE); + } } else { dxstate.stencilTest.disable(); } diff --git a/GPU/GLES/StateMappingGLES.cpp b/GPU/GLES/StateMappingGLES.cpp index 4ebdd165d29b..a06bac650c4f 100644 --- a/GPU/GLES/StateMappingGLES.cpp +++ b/GPU/GLES/StateMappingGLES.cpp @@ -144,7 +144,6 @@ void DrawEngineGLES::ApplyDrawState(int prim) { bool useBufferedRendering = framebufferManager_->UseBufferedRendering(); if (gstate_c.IsDirty(DIRTY_BLEND_STATE)) { - gstate_c.Clean(DIRTY_BLEND_STATE); gstate_c.SetAllowFramebufferRead(!g_Config.bDisableSlowFramebufEffects); if (gstate.isModeClear()) { @@ -208,7 +207,6 @@ void DrawEngineGLES::ApplyDrawState(int prim) { } else { renderManager->SetNoBlendAndMask(mask); } - #ifndef USING_GLES2 if (gstate_c.Supports(GPU_SUPPORTS_LOGIC_OP)) { renderManager->SetLogicOp(gstate.isLogicOpEnabled() && gstate.getLogicOp() != GE_LOGIC_COPY, @@ -219,8 +217,6 @@ void DrawEngineGLES::ApplyDrawState(int prim) { } if (gstate_c.IsDirty(DIRTY_RASTER_STATE)) { - gstate_c.Clean(DIRTY_RASTER_STATE); - // Dither bool dither = gstate.isDitherEnabled(); bool cullEnable; @@ -247,7 +243,6 @@ void DrawEngineGLES::ApplyDrawState(int prim) { } if (gstate_c.IsDirty(DIRTY_DEPTHSTENCIL_STATE)) { - gstate_c.Clean(DIRTY_DEPTHSTENCIL_STATE); GenericStencilFuncState stencilState; ConvertStencilFuncState(stencilState); @@ -264,6 +259,19 @@ void DrawEngineGLES::ApplyDrawState(int prim) { if (stencilState.enabled) { renderManager->SetStencilFunc(stencilState.enabled, compareOps[stencilState.testFunc], stencilState.testRef, stencilState.testMask); renderManager->SetStencilOp(stencilState.writeMask, stencilOps[stencilState.sFail], stencilOps[stencilState.zFail], stencilOps[stencilState.zPass]); + + // Nasty special case for Spongebob and similar where it tries to write zeros to alpha/stencil during + // depth-fail. We can't write to alpha then because the pixel is killed. However, we can invert the depth + // test and modify the alpha function... 
+ if (SpongebobDepthInverseConditions(stencilState)) { + renderManager->SetBlendAndMask(0x8, true, GL_ZERO, GL_ZERO, GL_ZERO, GL_ZERO, GL_FUNC_ADD, GL_FUNC_ADD); + renderManager->SetDepth(true, false, GL_LESS); + renderManager->SetStencilFunc(true, GL_ALWAYS, 0xFF, 0xFF); + renderManager->SetStencilOp(0xFF, GL_ZERO, GL_KEEP, GL_ZERO); + + // TODO: Need to set in a way that carries over to the next draw.. + gstate_c.Dirty(DIRTY_BLEND_STATE); + } } else { renderManager->SetStencilDisabled(); } @@ -271,7 +279,6 @@ void DrawEngineGLES::ApplyDrawState(int prim) { } if (gstate_c.IsDirty(DIRTY_VIEWPORTSCISSOR_STATE)) { - gstate_c.Clean(DIRTY_VIEWPORTSCISSOR_STATE); ConvertViewportAndScissor(useBufferedRendering, framebufferManager_->GetRenderWidth(), framebufferManager_->GetRenderHeight(), framebufferManager_->GetTargetBufferWidth(), framebufferManager_->GetTargetBufferHeight(), @@ -284,6 +291,8 @@ void DrawEngineGLES::ApplyDrawState(int prim) { vpAndScissor.viewportW, vpAndScissor.viewportH, vpAndScissor.depthRangeMin, vpAndScissor.depthRangeMax }); } + + gstate_c.Clean(DIRTY_VIEWPORTSCISSOR_STATE | DIRTY_DEPTHSTENCIL_STATE | DIRTY_RASTER_STATE | DIRTY_BLEND_STATE); } void DrawEngineGLES::ApplyDrawStateLate(bool setStencilValue, int stencilValue) { diff --git a/GPU/Vulkan/ShaderManagerVulkan.cpp b/GPU/Vulkan/ShaderManagerVulkan.cpp index 26057328cfe3..0d4b31b96d52 100644 --- a/GPU/Vulkan/ShaderManagerVulkan.cpp +++ b/GPU/Vulkan/ShaderManagerVulkan.cpp @@ -358,7 +358,7 @@ VulkanFragmentShader *ShaderManagerVulkan::GetFragmentShaderFromModule(VkShaderM // instantaneous. 
#define CACHE_HEADER_MAGIC 0xff51f420 -#define CACHE_VERSION 19 +#define CACHE_VERSION 20 struct VulkanCacheHeader { uint32_t magic; uint32_t version; diff --git a/GPU/Vulkan/StateMappingVulkan.cpp b/GPU/Vulkan/StateMappingVulkan.cpp index 566708480abb..9ca4a81b9d95 100644 --- a/GPU/Vulkan/StateMappingVulkan.cpp +++ b/GPU/Vulkan/StateMappingVulkan.cpp @@ -301,6 +301,30 @@ void DrawEngineVulkan::ConvertStateToVulkanKey(FramebufferManagerVulkan &fbManag dynState.stencilRef = stencilState.testRef; dynState.stencilCompareMask = stencilState.testMask; dynState.stencilWriteMask = stencilState.writeMask; + + // Nasty special case for Spongebob and similar where it tries to write zeros to alpha/stencil during + // depth-fail. We can't write to alpha then because the pixel is killed. However, we can invert the depth + // test and modify the alpha function... + if (SpongebobDepthInverseConditions(stencilState)) { + key.blendEnable = true; + key.blendOpAlpha = VK_BLEND_OP_ADD; + key.blendOpColor = VK_BLEND_OP_ADD; + key.srcColor = VK_BLEND_FACTOR_ZERO; + key.destColor = VK_BLEND_FACTOR_ZERO; + key.logicOpEnable = false; + key.srcAlpha = VK_BLEND_FACTOR_ZERO; + key.destAlpha = VK_BLEND_FACTOR_ZERO; + key.colorWriteMask = VK_COLOR_COMPONENT_A_BIT; + key.depthCompareOp = VK_COMPARE_OP_LESS; // Inverse of GREATER_EQUAL + key.stencilCompareOp = VK_COMPARE_OP_ALWAYS; + // Invert + key.stencilPassOp = VK_STENCIL_OP_ZERO; + key.stencilFailOp = VK_STENCIL_OP_ZERO; + key.stencilDepthFailOp = VK_STENCIL_OP_KEEP; + + // TODO: Need to set in a way that carries over to the next draw.. 
+ gstate_c.Dirty(DIRTY_BLEND_STATE); + } } else { key.stencilTestEnable = false; key.stencilCompareOp = VK_COMPARE_OP_ALWAYS; diff --git a/assets/compat.ini b/assets/compat.ini index 974a6216defd..289a5d21bb71 100644 --- a/assets/compat.ini +++ b/assets/compat.ini @@ -745,6 +745,12 @@ ULJM05412 = true NPJH50083 = true ULJM05570 = true +# Cars Race-o-rama +ULUS10428 = true +# MX vs ATV Reflex +ULES01375 = true +ULUS10429 = true + [IntraVRAMBlockTransferAllowCreateFB] # Final Fantasy - Type 0 ULJM05900 = true @@ -1149,6 +1155,46 @@ ULES01441 = true ULJM05600 = true ULJM05775 = true +# Spongebob - The Yellow Avenger (see #15898) +ULUS10092 = true +ULES00280 = true + +# MX vs ATV Reflex +ULES01375 = true +ULUS10429 = true + +# MX vs ATV Untamed +ULES00993 = true +ULES00994 = true +ULUS10330 = true + +# Cars race-o-rama +ULES01333 = true +ULUS10428 = true + +# God of War: Chains of Olympus +# The old hack for the shadows isn't working anymore since the framebuffers don't match. +# This is nicer anyway. +UCUS98653 = true +UCES00842 = true +UCKS45084 = true +UCUS98705 = true +ULJM05348 = true +ULJM05438 = true +NPUG80325 = true +NPEG00023 = true +NPHG00028 = true + +# God of War: Ghost of Sparta +UCUS98737 = true +UCAS40323 = true +UCKS45161 = true +NPHG00092 = true +NPEG00044 = true +UCJS10114 = true +UCES01401 = true +NPJG00120 = true + [ShaderColorBitmask] # No users right now, but keeping it around as a more accurate option than BlueToAlpha, for debugging mainly Outrun. diff --git a/unittest/TestShaderGenerators.cpp b/unittest/TestShaderGenerators.cpp index 5e722e3148f1..28ab16e2c486 100644 --- a/unittest/TestShaderGenerators.cpp +++ b/unittest/TestShaderGenerators.cpp @@ -286,6 +286,7 @@ bool TestDepalShaders() { config.mask = 0xFF; config.bufferFormat = GE_FORMAT_8888; config.textureFormat = GE_TFMT_CLUT32; + config.depthUpperBits = 0; ShaderWriter writer(buffer, desc, ShaderStage::Fragment); GenerateDepalFs(writer, config);