diff --git a/Common/GPU/Shader.h b/Common/GPU/Shader.h
index 12a7ddb8492d..75e1a9b2e95c 100644
--- a/Common/GPU/Shader.h
+++ b/Common/GPU/Shader.h
@@ -85,6 +85,12 @@ struct UniformBufferDesc {
 	std::vector<UniformDesc> uniforms;
 };
 
+struct UniformDef {
+	const char *type;  // Shader type name of the uniform, e.g. "vec2" or "float".
+	const char *name;  // Identifier the uniform is declared and referenced by in the shader source.
+	int index;  // Position of the uniform in its list (0, 1, ...) - NOTE(review): confirm how each backend interprets it.
+};
+
 struct SamplerDef {
 	const char *name;
 	// TODO: Might need unsigned samplers, 3d samplers, or other types in the future.
diff --git a/Common/GPU/ShaderWriter.h b/Common/GPU/ShaderWriter.h
index a2d80a69101c..7464ebc652a3 100644
--- a/Common/GPU/ShaderWriter.h
+++ b/Common/GPU/ShaderWriter.h
@@ -22,12 +22,6 @@ struct InputDef {
 	int semantic;
 };
 
-struct UniformDef {
-	const char *type;
-	const char *name;
-	int index;
-};
-
 struct VaryingDef {
 	const char *type;
 	const char *name;
diff --git a/Common/Math/math_util.h b/Common/Math/math_util.h
index 0807d47003b1..fd47662b5409 100644
--- a/Common/Math/math_util.h
+++ b/Common/Math/math_util.h
@@ -28,6 +28,7 @@ inline bool isPowerOf2(int n) {
 	return n == 1 || (n & (n - 1)) == 0;
 }
 
+// Next power of 2.
 inline uint32_t RoundUpToPowerOf2(uint32_t v) {
 	v--;
 	v |= v >> 1;
diff --git a/GPU/Common/DepalettizeShaderCommon.cpp b/GPU/Common/DepalettizeShaderCommon.cpp
index e98c42485804..1746215ada46 100644
--- a/GPU/Common/DepalettizeShaderCommon.cpp
+++ b/GPU/Common/DepalettizeShaderCommon.cpp
@@ -26,6 +26,7 @@
 #include "Core/Reporting.h"
 #include "GPU/Common/GPUStateUtils.h"
 #include "GPU/Common/DepalettizeShaderCommon.h"
+#include "GPU/Common/Draw2D.h"
 
 static const InputDef vsInputs[2] = {
 	{ "vec2", "a_position", Draw::SEM_POSITION, },
@@ -47,10 +48,23 @@ void GenerateDepalShader300(ShaderWriter &writer, const DepalConfig &config) {
 	const int shift = config.shift;
 	const int mask = config.mask;
 
+	writer.C("  vec2 texcoord = v_texcoord;\n");
+
+	// Implement the swizzle we need to simulate, if a game uses 8888 framebuffers and any other mode than "6" to access depth textures.
+	// This implements the "2" mode swizzle (it fixes up the Y direction but not X. See comments on issue #15898)
+	// NOTE: This swizzle can be made to work with any power-of-2 resolution scaleFactor by shifting
+	// the bits around, but not sure how to handle 3x scaling. For now this is 1x-only (rough edges at higher resolutions).
 	if (config.bufferFormat == GE_FORMAT_DEPTH16) {
 		DepthScaleFactors factors = GetDepthScaleFactors();
 		writer.ConstFloat("z_scale", factors.scale);
 		writer.ConstFloat("z_offset", factors.offset);
+		if (config.depthUpperBits == 0x2) {
+			writer.C(R"(
+  int x = int((texcoord.x / scaleFactor) * texSize.x);
+  int temp = (x & 0xFFFFFE0F) | ((x >> 1) & 0xF0) | ((x << 4) & 0x100);
+  texcoord.x = (float(temp) / texSize.x) * scaleFactor;
+)");
+		}
 	}
 
 	// Sampling turns our texture into floating point. To avoid this, might be able
@@ -66,7 +80,7 @@ void GenerateDepalShader300(ShaderWriter &writer, const DepalConfig &config) {
 	// An alternative would be to have a special mode where we keep some extra precision here and sample the CLUT linearly - works for ramps such
 	// as those that Test Drive uses for its color remapping. But would need game specific flagging.
 
-	writer.C("  vec4 color = ").SampleTexture2D("tex", "v_texcoord").C(";\n");
+	writer.C("  vec4 color = ").SampleTexture2D("tex", "texcoord").C(";\n");
 
 	int shiftedMask = mask << shift;
 	switch (config.bufferFormat) {
@@ -103,6 +117,7 @@ void GenerateDepalShader300(ShaderWriter &writer, const DepalConfig &config) {
 
 		if (config.bufferFormat == GE_FORMAT_DEPTH16 && config.textureFormat == GE_TFMT_5650) {
 			// Convert depth to 565, without going through a CLUT.
+			// TODO: Make "depal without a CLUT" a separate concept, to avoid redundantly creating a CLUT texture.
 			writer.C("  int idepth = int(clamp(depth, 0.0, 65535.0));\n");
 			writer.C("  float r = float(idepth & 31) / 31.0f;\n");
 			writer.C("  float g = float((idepth >> 5) & 63) / 63.0f;\n");
@@ -323,7 +338,7 @@ void GenerateDepalSmoothed(ShaderWriter &writer, const DepalConfig &config) {
 void GenerateDepalFs(ShaderWriter &writer, const DepalConfig &config) {
 	writer.DeclareSamplers(samplers);
 	writer.HighPrecisionFloat();
-	writer.BeginFSMain(Slice<UniformDef>::empty(), varyings, FSFLAG_NONE);
+	writer.BeginFSMain(config.bufferFormat == GE_FORMAT_DEPTH16 ? g_draw2Duniforms : Slice<UniformDef>::empty(), varyings, FSFLAG_NONE);
 	if (config.smoothedDepal) {
 		// Handles a limited set of cases, but doesn't need any integer math so we don't
 		// need two variants.
diff --git a/GPU/Common/DepalettizeShaderCommon.h b/GPU/Common/DepalettizeShaderCommon.h
index 433dfa74df82..0f72afe27c36 100644
--- a/GPU/Common/DepalettizeShaderCommon.h
+++ b/GPU/Common/DepalettizeShaderCommon.h
@@ -27,13 +27,14 @@ class ShaderWriter;
 static const int DEPAL_TEXTURE_OLD_AGE = 120;
 
 struct DepalConfig {
-	int mask;
-	int shift;
 	u32 startPos;
+	u8 mask;  // Index mask; the shader generator combines it with shift as (mask << shift).
+	u8 shift;  // Bit shift applied together with mask.
+	bool smoothedDepal;  // When set, GenerateDepalFs takes its smoothed-depal path.
+	u8 depthUpperBits;  // For DEPTH16 sources, the value 0x2 makes the generated shader swizzle the X texcoord.
 	GEPaletteFormat clutFormat;
 	GETextureFormat textureFormat;
 	GEBufferFormat bufferFormat;
-	bool smoothedDepal;
 };
 
 void GenerateDepalFs(ShaderWriter &writer, const DepalConfig &config);
diff --git a/GPU/Common/Draw2D.cpp b/GPU/Common/Draw2D.cpp
index 1043858891ec..cd54704e9ab0 100644
--- a/GPU/Common/Draw2D.cpp
+++ b/GPU/Common/Draw2D.cpp
@@ -40,7 +40,7 @@ static const SamplerDef samplers[1] = {
 	{ "tex" },
 };
 
-static const UniformDef uniforms[2] = {
+const UniformDef g_draw2Duniforms[2] = {  // External linkage: also used by the depal shader generator for DEPTH16 sources.
 	{ "vec2", "texSize", 0 },
 	{ "float", "scaleFactor", 1},
 };
@@ -53,7 +53,7 @@ struct Draw2DUB {
 
 const UniformBufferDesc draw2DUBDesc{ sizeof(Draw2DUB), {
 	{ "texSize", -1, 0, UniformType::FLOAT2, 0 },
-	{ "scaleFactor", -1, 1, UniformType::FLOAT1, 0 },
+	{ "scaleFactor", -1, 1, UniformType::FLOAT1, 8 },  // Last value is presumably the byte offset: 8 places it after the FLOAT2 texSize at 0 - TODO confirm against Draw2DUB.
 } };
 
 
@@ -102,7 +102,7 @@ Draw2DPipelineInfo GenerateDraw2D565ToDepthFs(ShaderWriter &writer) {
 
 Draw2DPipelineInfo GenerateDraw2D565ToDepthDeswizzleFs(ShaderWriter &writer) {
 	writer.DeclareSamplers(samplers);
-	writer.BeginFSMain(uniforms, varyings, FSFLAG_WRITEDEPTH);
+	writer.BeginFSMain(g_draw2Duniforms, varyings, FSFLAG_WRITEDEPTH);
 	writer.C("  vec4 outColor = vec4(0.0, 0.0, 0.0, 0.0);\n");
 	// Unlike when just copying a depth buffer, here we're generating new depth values so we'll
 	// have to apply the scaling.
@@ -253,6 +253,20 @@ Draw2DPipeline *Draw2D::Create2DPipeline(std::function<Draw2DPipelineInfo (Shade
 	};
 }
 
+void Draw2D::Blit(Draw2DPipeline *pipeline, float srcX1, float srcY1, float srcX2, float srcY2, float dstX1, float dstY1, float dstX2, float dstY2, float srcWidth, float srcHeight, float dstWidth, float dstHeight, bool linear, int scaleFactor) {
+	// Destination pixel coordinates are scaled to the -1..1 range, source pixel coordinates to the 0..1 range.
+	const float invDstW = 1.0f / dstWidth, invDstH = 1.0f / dstHeight;
+	const float invSrcW = 1.0f / srcWidth, invSrcH = 1.0f / srcHeight;
+	const float x1 = -1.0f + 2.0f * invDstW * dstX1, x2 = -1.0f + 2.0f * invDstW * dstX2;
+	const float y1 = -(1.0f - 2.0f * invDstH * dstY1), y2 = -(1.0f - 2.0f * invDstH * dstY2);
+	const float u1 = invSrcW * srcX1, u2 = invSrcW * srcX2;
+	const float v1 = invSrcH * srcY1, v2 = invSrcH * srcY2;
+	// Vertex order: (x1,y1), (x2,y1), (x1,y2), (x2,y2).
+	Draw2DVertex quad[4] = { { x1, y1, u1, v1 }, { x2, y1, u2, v1 }, { x1, y2, u1, v2 }, { x2, y2, u2, v2 } };
+
+	// No texture object is passed (nullptr); the source size and scale factor are forwarded as-is.
+	DrawStrip2D(nullptr, quad, 4, linear, pipeline, srcWidth, srcHeight, scaleFactor);
+}
 
 void Draw2D::DrawStrip2D(Draw::Texture *tex, Draw2DVertex *verts, int vertexCount, bool linearFilter, Draw2DPipeline *pipeline, float texW, float texH, int scaleFactor) {
 	using namespace Draw;
diff --git a/GPU/Common/Draw2D.h b/GPU/Common/Draw2D.h
index ad489dfce6ca..2df15581b0cd 100644
--- a/GPU/Common/Draw2D.h
+++ b/GPU/Common/Draw2D.h
@@ -36,6 +36,8 @@ struct Draw2DPipelineInfo {
 	Slice<SamplerDef> samplers;
 };
 
+extern const UniformDef g_draw2Duniforms[2];
+
 struct Draw2DPipeline {
 	Draw::Pipeline *pipeline;
 	Draw2DPipelineInfo info;
@@ -58,6 +60,8 @@ class Draw2D {
 	Draw2DPipeline *Create2DPipeline(std::function<Draw2DPipelineInfo(ShaderWriter &)> generate);
 
 	void DrawStrip2D(Draw::Texture *tex, Draw2DVertex *verts, int vertexCount, bool linearFilter, Draw2DPipeline *pipeline, float texW = 0.0f, float texH = 0.0f, int scaleFactor = 0);
+
+	void Blit(Draw2DPipeline *pipeline, float srcX1, float srcY1, float srcX2, float srcY2, float dstX1, float dstY1, float dstX2, float dstY2, float srcWidth, float srcHeight, float dstWidth, float dstHeight, bool linear, int scaleFactor);
 	void Ensure2DResources();
 
 private:
diff --git a/GPU/Common/FramebufferManagerCommon.cpp b/GPU/Common/FramebufferManagerCommon.cpp
index a75b0d4b851a..1b25a4608538 100644
--- a/GPU/Common/FramebufferManagerCommon.cpp
+++ b/GPU/Common/FramebufferManagerCommon.cpp
@@ -27,6 +27,7 @@
 #include "Common/Math/math_util.h"
 #include "Common/System/Display.h"
 #include "Common/CommonTypes.h"
+#include "Common/StringUtils.h"
 #include "Core/Config.h"
 #include "Core/ConfigValues.h"
 #include "Core/Core.h"
@@ -348,7 +349,7 @@ VirtualFramebuffer *FramebufferManagerCommon::DoSetRenderFrameBuffer(const Frame
 				const int x_offset = (params.fb_address - v->fb_address) / bpp;
 				if (x_offset < params.fb_stride && v->height >= drawing_height) {
 					// Pretty certainly a pure render-to-X-offset.
-					WARN_LOG_REPORT_ONCE(renderoffset, HLE, "Rendering to framebuffer offset: %08x +%dx%d", v->fb_address, x_offset, 0);
+					WARN_LOG_REPORT_ONCE(renderoffset, HLE, "Rendering to framebuffer offset at %08x +%dx%d (stride %d)", v->fb_address, x_offset, 0, v->fb_stride);
 					vfb = v;
 					gstate_c.SetCurRTOffset(x_offset, 0);
 					vfb->width = std::max((int)vfb->width, x_offset + drawing_width);
@@ -446,32 +447,6 @@ VirtualFramebuffer *FramebufferManagerCommon::DoSetRenderFrameBuffer(const Frame
 			// TODO: Is it worth trying to upload the depth buffer (only if it wasn't copied above..?)
 		}
 
-		// Let's check for depth buffer overlap. Might be interesting (not that interesting anymore..)
-		bool sharingReported = false;
-		for (size_t i = 0, end = vfbs_.size(); i < end; ++i) {
-			if (vfbs_[i]->z_stride != 0 && params.fb_address == vfbs_[i]->z_address) {
-				// If it's clearing it, most likely it just needs more video memory.
-				// Technically it could write something interesting and the other might not clear, but that's not likely.
-				if (params.isDrawing) {
-					if (params.fb_address != params.z_address && vfbs_[i]->fb_address != vfbs_[i]->z_address) {
-						WARN_LOG_REPORT(SCEGE, "FBO created from existing depthbuffer as color, %08x/%08x and %08x/%08x", params.fb_address, params.z_address, vfbs_[i]->fb_address, vfbs_[i]->z_address);
-					}
-				}
-			} else if (params.z_stride != 0 && params.z_address == vfbs_[i]->fb_address) {
-				// If it's clearing it, then it's probably just the reverse of the above case.
-				if (params.isWritingDepth) {
-					WARN_LOG_REPORT(SCEGE, "FBO using existing buffer as depthbuffer, %08x/%08x and %08x/%08x", params.fb_address, params.z_address, vfbs_[i]->fb_address, vfbs_[i]->z_address);
-				}
-			} else if (vfbs_[i]->z_stride != 0 && params.z_address == vfbs_[i]->z_address && params.fb_address != vfbs_[i]->fb_address && !sharingReported) {
-				// This happens a lot, but virtually always it's cleared.
-				// It's possible the other might not clear, but when every game is reported it's not useful.
-				if (params.isWritingDepth && (vfbs_[i]->usageFlags & FB_USAGE_RENDER_DEPTH)) {
-					WARN_LOG(SCEGE, "FBO reusing depthbuffer, c=%08x/d=%08x and c=%08x/d=%08x", params.fb_address, params.z_address, vfbs_[i]->fb_address, vfbs_[i]->z_address);
-					sharingReported = true;
-				}
-			}
-		}
-
 		// We already have it!
 	} else if (vfb != currentRenderVfb_) {
 		// Use it as a render target.
@@ -604,21 +579,30 @@ void FramebufferManagerCommon::CopyToDepthFromOverlappingFramebuffers(VirtualFra
 }
 
 // Can't easily dynamically create these strings, we just pass along the pointer.
-static const char *reinterpretStrings[3][3] = {
+static const char *reinterpretStrings[4][4] = {  // Pass names, indexed as [source fb_format][destination fb_format].
 	{
 		"self_reinterpret_565",
 		"reinterpret_565_to_5551",
 		"reinterpret_565_to_4444",
+		"reinterpret_565_to_8888",
 	},
 	{
 		"reinterpret_5551_to_565",
 		"self_reinterpret_5551",
 		"reinterpret_5551_to_4444",
+		"reinterpret_5551_to_8888",
 	},
 	{
 		"reinterpret_4444_to_565",
 		"reinterpret_4444_to_5551",
 		"self_reinterpret_4444",
+		"reinterpret_4444_to_8888",
+	},
+	{
+		"reinterpret_8888_to_565",
+		"reinterpret_8888_to_5551",
+		"reinterpret_8888_to_4444",
+		"self_reinterpret_8888",
 	},
 };
 
@@ -676,6 +660,17 @@ void FramebufferManagerCommon::CopyToColorFromOverlappingFramebuffers(VirtualFra
 				continue;
 			}
 			sources.push_back(CopySource{ src, RASTER_COLOR, xOffset, yOffset });
+		} else if (src->fb_address == dst->fb_address && src->FbStrideInBytes() == dst->FbStrideInBytes()) {
+			if (src->fb_stride == dst->fb_stride * 2) {
+				// Reinterpret from 16-bit to 32-bit.
+				sources.push_back(CopySource{ src, RASTER_COLOR, 0, 0 });
+			} else if (src->fb_stride * 2 == dst->fb_stride) {
+				// Reinterpret from 32-bit to 16-bit.
+				sources.push_back(CopySource{ src, RASTER_COLOR, 0, 0 });
+			} else {
+				// 16-to-16 reinterpret, should have been caught above already.
+				_assert_msg_(false, "Reinterpret: Shouldn't get here");
+			}
 		}
 	}
 
@@ -685,11 +680,15 @@ void FramebufferManagerCommon::CopyToColorFromOverlappingFramebuffers(VirtualFra
 
 	bool tookActions = false;
 
+	// TODO: Only do the latest one.
 	for (const CopySource &source : sources) {
 		VirtualFramebuffer *src = source.vfb;
 
 		// Copy a rectangle from the original to the new buffer.
 		// Yes, we mean to look at src->width/height for the dest rectangle.
+
+		// TODO: Try to bound the blit using gstate_c.vertBounds like depal does.
+
 		int srcWidth = src->width * src->renderScaleFactor;
 		int srcHeight = src->height * src->renderScaleFactor;
 		int dstWidth = src->width * dst->renderScaleFactor;
@@ -707,44 +706,55 @@ void FramebufferManagerCommon::CopyToColorFromOverlappingFramebuffers(VirtualFra
 				gpuStats.numColorCopies++;
 				pipeline = Get2DPipeline(DRAW2D_COPY_COLOR);
 				pass_name = "copy_color";
-			} else if (IsBufferFormat16Bit(src->fb_format) && IsBufferFormat16Bit(dst->fb_format)) {
-				if (PSP_CoreParameter().compat.flags().ReinterpretFramebuffers) {
-					if (PSP_CoreParameter().compat.flags().BlueToAlpha) {
-						WARN_LOG_ONCE(bta, G3D, "WARNING: Reinterpret encountered with BlueToAlpha on");
-					}
+			} else if (PSP_CoreParameter().compat.flags().ReinterpretFramebuffers) {
+				if (PSP_CoreParameter().compat.flags().BlueToAlpha) {
+					WARN_LOG_ONCE(bta, G3D, "WARNING: Reinterpret encountered with BlueToAlpha on");
+				}
 
-					// Reinterpret!
-					WARN_LOG_N_TIMES(reint, 20, G3D, "Reinterpret detected from %08x_%s to %08x_%s",
-						src->fb_address, GeBufferFormatToString(src->fb_format),
-						dst->fb_address, GeBufferFormatToString(dst->fb_format));
-					pipeline = reinterpretFromTo_[(int)src->fb_format][(int)dst->fb_format];
-					pass_name = reinterpretStrings[(int)src->fb_format][(int)dst->fb_format];
-					if (!pipeline) {
-						pipeline = draw2D_.Create2DPipeline([=](ShaderWriter &shaderWriter) -> Draw2DPipelineInfo {
-							return GenerateReinterpretFragmentShader(shaderWriter, src->fb_format, dst->fb_format);
-						});
-						reinterpretFromTo_[(int)src->fb_format][(int)dst->fb_format] = pipeline;
-					}
-					gpuStats.numReinterpretCopies++;
-				} else {
-					// Fake reinterpret - just clear the way we always did on Vulkan. Just clear color and stencil.
-					if (src->fb_format == GE_FORMAT_565) {
-						// We have to bind here instead of clear, since it can be that no framebuffer is bound.
-						// The backend can sometimes directly optimize it to a clear.
-
-						// Games that are marked as doing reinterpret just ignore this - better to keep the data than to clear.
-						// Fixes #13717.
-						if (!PSP_CoreParameter().compat.flags().ReinterpretFramebuffers && !PSP_CoreParameter().compat.flags().BlueToAlpha) {
-							draw_->BindFramebufferAsRenderTarget(dst->fbo, { Draw::RPAction::CLEAR, Draw::RPAction::KEEP, Draw::RPAction::CLEAR }, "FakeReinterpret");
-							// Need to dirty anything that has command buffer dynamic state, in case we started a new pass above.
-							// Should find a way to feed that information back, maybe... Or simply correct the issue in the rendermanager.
-							gstate_c.Dirty(DIRTY_DEPTHSTENCIL_STATE | DIRTY_VIEWPORTSCISSOR_STATE | DIRTY_BLEND_STATE);
-						}
+				if (IsBufferFormat16Bit(src->fb_format) && !IsBufferFormat16Bit(dst->fb_format)) {
+					// We halve the X coordinates in the destination framebuffer.
+					// The shader will collect two pixels worth of input data and merge into one.
+					dstX1 *= 0.5f;
+					dstX2 *= 0.5f;
+				} else if (!IsBufferFormat16Bit(src->fb_format) && IsBufferFormat16Bit(dst->fb_format)) {
+					// We double the X coordinates in the destination framebuffer.
+					// The shader will sample and depending on the X coordinate & 1, use the upper or lower bits.
+					dstX1 *= 2.0f;
+					dstX2 *= 2.0f;
+				}
+
+				// Reinterpret!
+				WARN_LOG_N_TIMES(reint, 5, G3D, "Reinterpret detected from %08x_%s to %08x_%s",
+					src->fb_address, GeBufferFormatToString(src->fb_format),
+					dst->fb_address, GeBufferFormatToString(dst->fb_format));
+				pipeline = reinterpretFromTo_[(int)src->fb_format][(int)dst->fb_format];
+				pass_name = reinterpretStrings[(int)src->fb_format][(int)dst->fb_format];
+				if (!pipeline) {
+					pipeline = draw2D_.Create2DPipeline([=](ShaderWriter &shaderWriter) -> Draw2DPipelineInfo {
+						return GenerateReinterpretFragmentShader(shaderWriter, src->fb_format, dst->fb_format);
+					});
+					reinterpretFromTo_[(int)src->fb_format][(int)dst->fb_format] = pipeline;
+				}
+
+				gpuStats.numReinterpretCopies++;
+			} else if (IsBufferFormat16Bit(src->fb_format) && IsBufferFormat16Bit(dst->fb_format)) {
+				// Fake reinterpret - just clear the way we always did on Vulkan. Just clear color and stencil.
+				if (src->fb_format == GE_FORMAT_565) {
+					// We have to bind here instead of clear, since it can be that no framebuffer is bound.
+					// The backend can sometimes directly optimize it to a clear.
+
+					// Games that are marked as doing reinterpret just ignore this - better to keep the data than to clear.
+					// Fixes #13717.
+					if (!PSP_CoreParameter().compat.flags().ReinterpretFramebuffers && !PSP_CoreParameter().compat.flags().BlueToAlpha) {
+						draw_->BindFramebufferAsRenderTarget(dst->fbo, { Draw::RPAction::CLEAR, Draw::RPAction::KEEP, Draw::RPAction::CLEAR }, "FakeReinterpret");
+						// Need to dirty anything that has command buffer dynamic state, in case we started a new pass above.
+						// Should find a way to feed that information back, maybe... Or simply correct the issue in the rendermanager.
+						gstate_c.Dirty(DIRTY_DEPTHSTENCIL_STATE | DIRTY_VIEWPORTSCISSOR_STATE | DIRTY_BLEND_STATE);
+						tookActions = true;
 					}
-					tookActions = true;
 				}
 			}
-
+			
 			if (pipeline) {
 				tookActions = true;
 				// OK we have the pipeline, now just do the blit.
@@ -1435,6 +1445,9 @@ void FramebufferManagerCommon::ResizeFramebufFBO(VirtualFramebuffer *vfb, int w,
 		vfb->renderHeight = (u16)(vfb->bufferHeight * renderScaleFactor_);
 	}
 
+	bool creating = old.bufferWidth == 0;
+	WARN_LOG(FRAMEBUF, "%s %s FBO at %08x/%d from %dx%d to %dx%d (force=%d)", creating ? "Creating" : "Resizing", GeBufferFormatToString(vfb->fb_format), vfb->fb_address, vfb->fb_stride, old.bufferWidth, old.bufferHeight, vfb->bufferWidth, vfb->bufferHeight, (int)force);
+
 	// During hardware rendering, we always render at full color depth even if the game wouldn't on real hardware.
 	// It's not worth the trouble trying to support lower bit-depth rendering, just
 	// more cases to test that nobody will ever use.
@@ -1608,68 +1621,121 @@ bool FramebufferManagerCommon::NotifyFramebufferCopy(u32 src, u32 dst, int size,
 	}
 }
 
-void FramebufferManagerCommon::FindTransferFramebuffer(VirtualFramebuffer *&buffer, u32 basePtr, int stride, int &x, int &y, int &width, int &height, int bpp, bool destination) {
-	u32 xOffset = -1;
-	u32 yOffset = -1;
-	int transferWidth = width;
-	int transferHeight = height;
+std::string BlockTransferRect::ToString() const {
+	const int bytesPerPixel = BufferFormatBytesPerPixel(vfb->fb_format);  // x_bytes/w_bytes are byte counts; they are printed in pixels.
+	return StringFromFormat("%08x/%d/%s seq:%d  %d,%d %dx%d", vfb->fb_address, vfb->FbStrideInBytes(), GeBufferFormatToString(vfb->fb_format), vfb->colorBindSeq, x_bytes / bytesPerPixel, y, w_bytes / bytesPerPixel, h);
+}
 
+// Finds the color framebuffer containing basePtr and fills *rect with the transfer rectangle inside it (x/w in bytes). Returns false, with rect->vfb == nullptr, if none matches.
+// Only looks for color buffers. Due to swizzling and other concerns, games have not been seen using block copies for depth data yet.
+bool FramebufferManagerCommon::FindTransferFramebuffer(u32 basePtr, int stride_pixels, int x_pixels, int y, int w_pixels, int h, int bpp, bool destination, BlockTransferRect *rect) {
 	basePtr &= 0x3FFFFFFF;
+	rect->vfb = nullptr;
+
+	if (!stride_pixels) {
+		WARN_LOG(G3D, "Zero stride in FindTransferFrameBuffer, ignoring");
+		return false;
+	}
+
+	const u32 byteStride = stride_pixels * bpp;
+	int x_bytes = x_pixels * bpp;
+	int w_bytes = w_pixels * bpp;
+
+	std::vector<BlockTransferRect> candidates;
+
+	// We work entirely in bytes when we do the matching, because games don't consistently use bpps that match
+	// that of their buffers. Then after matching we try to map the copy to the simplest operation that does
+	// what we need.
 
 	for (auto vfb : vfbs_) {
 		const u32 vfb_address = vfb->fb_address & 0x3FFFFFFF;
 		const u32 vfb_size = ColorBufferByteSize(vfb);
+
+		if (basePtr < vfb_address || basePtr >= vfb_address + vfb_size) {
+			continue;
+		}
+
 		const u32 vfb_bpp = BufferFormatBytesPerPixel(vfb->fb_format);
-		const u32 vfb_byteStride = vfb->fb_stride * vfb_bpp;
-		const u32 vfb_byteWidth = vfb->width * vfb_bpp;
-
-		if (vfb_address <= basePtr && basePtr < vfb_address + vfb_size) {
-			const u32 byteOffset = basePtr - vfb_address;
-			const u32 byteStride = stride * bpp;
-			const u32 memYOffset = byteOffset / byteStride;
-
-			// Some games use mismatching bitdepths. But make sure the stride matches.
-			// If it doesn't, generally this means we detected the framebuffer with too large a height.
-			// Use bufferHeight in case of buffers that resize up and down often per frame (Valkyrie Profile.)
-
-			// TODO: Surely this first comparison should be <= ?
-			// Or does the exact match (byteOffset == 0) case get handled elsewhere?
-			bool match = memYOffset < yOffset && (int)memYOffset <= (int)vfb->bufferHeight - height;
-			if (match && vfb_byteStride != byteStride) {
-				// Grand Knights History copies with a mismatching stride but a full line at a time.
-				// That's why we multiply by height, not width - this copy is a rectangle with the wrong stride but a line with the correct one.
-				// Makes it hard to detect the wrong transfers in e.g. God of War.
-				if (transferWidth != stride || (byteStride * transferHeight != vfb_byteStride && byteStride * transferHeight != vfb_byteWidth)) {
-					if (destination) {
-						// However, some other games write cluts to framebuffers.
-						// Let's catch this and upload.  Otherwise reject the match.
-						match = (vfb->usageFlags & FB_USAGE_CLUT) != 0;
-						if (match) {
-							width = byteStride * transferHeight / vfb_bpp;
-							height = 1;
-						}
+		const u32 vfb_byteStride = vfb->FbStrideInBytes();
+		const u32 vfb_byteWidth = vfb->WidthInBytes();
+
+		BlockTransferRect candidate{ vfb };
+		candidate.w_bytes = w_pixels * bpp;
+		candidate.h = h;
+
+		const u32 byteOffset = basePtr - vfb_address;
+		const int memXOffset = byteOffset % byteStride;
+		const int memYOffset = byteOffset / byteStride;
+
+		// Some games use mismatching bitdepths. But make sure the stride matches.
+		// If it doesn't, generally this means we detected the framebuffer with too large a height.
+		// Use bufferHeight in case of buffers that resize up and down often per frame (Valkyrie Profile.)
+
+		// If it's outside the vfb by a single pixel, we currently disregard it.
+		if (memYOffset > vfb->bufferHeight - h) {
+			continue;
+		}
+
+		if (byteOffset == vfb->WidthInBytes() && w_bytes < vfb->FbStrideInBytes()) {
+			// Looks like we're in a margin texture of the vfb, which is not the vfb itself.
+			// Ignore the match.
+			continue;
+		}
+
+		if (vfb_byteStride != byteStride) {
+			// Grand Knights History occasionally copies with a mismatching stride but a full line at a time.
+			// That's why we multiply by height, not width - this copy is a rectangle with the wrong stride but a line with the correct one.
+			// Makes it hard to detect the wrong transfers in e.g. God of War.
+			if (w_pixels != stride_pixels || (byteStride * h != vfb_byteStride && byteStride * h != vfb_byteWidth)) {
+				if (destination) {
+					// However, some other games write cluts to framebuffers.
+					// Let's catch this and upload.  Otherwise reject the match.
+					bool match = (vfb->usageFlags & FB_USAGE_CLUT) != 0;
+					if (match) {
+						candidate.w_bytes = byteStride * h;
+						candidate.h = 1;  // Flatten to a single line on the candidate; the h parameter must stay intact for the remaining vfbs and the logging below.
 					} else {
-						match = false;
+						continue;
 					}
 				} else {
-					width = byteStride * transferHeight / vfb_bpp;
-					height = 1;
+					continue;
 				}
-			} else if (match) {
-				width = transferWidth;
-				height = transferHeight;
-			}
-			if (match) {
-				xOffset = stride == 0 ? 0 : (byteOffset / bpp) % stride;
-				yOffset = memYOffset;
-				buffer = vfb;
+			} else {
+				// This is the Grand Knights History case.
+				candidate.w_bytes = byteStride * h;
+				candidate.h = 1;
 			}
+		} else {
+			candidate.w_bytes = w_bytes;
+			candidate.h = h;
 		}
+
+		candidate.x_bytes = x_bytes + memXOffset;
+		candidate.y = y + memYOffset;
+		candidate.vfb = vfb;
+		candidates.push_back(candidate);
 	}
 
-	if (yOffset != (u32)-1) {
-		x += xOffset;
-		y += yOffset;
+	// Sort candidates by recency only for now; other criteria might be added later.
+	std::sort(candidates.begin(), candidates.end());
+
+	if (candidates.size() > 1) {
+		std::string log;
+		for (auto &candidate : candidates) {
+			log += " - " + candidate.ToString() + "\n";
+		}
+		WARN_LOG_N_TIMES(mulblock, 5, G3D, "Multiple framebuffer candidates for %08x/%d/%d %d,%d %dx%d (dest = %d):\n%s", basePtr, stride_pixels, bpp, x_pixels, y, w_pixels, h, (int)destination, log.c_str());
+	}
+
+	if (!candidates.empty()) {
+		// Pick the last candidate.
+		*rect = candidates.back();
+		return true;
+	} else {
+		if (Memory::IsVRAMAddress(basePtr) && destination && h >= 128) {
+			WARN_LOG_N_TIMES(nocands, 5, G3D, "Didn't find a destination candidate for %08x/%d/%d %d,%d %dx%d", basePtr, stride_pixels, bpp, x_pixels, y, w_pixels, h);
+		}
+		return false;
 	}
 }
 
@@ -1866,92 +1932,125 @@ bool FramebufferManagerCommon::NotifyBlockTransferBefore(u32 dstBasePtr, int dst
 		return false;
 	}
 
-	VirtualFramebuffer *dstBuffer = 0;
-	VirtualFramebuffer *srcBuffer = 0;
-	int srcWidth = width;
-	int srcHeight = height;
-	int dstWidth = width;
-	int dstHeight = height;
+	BlockTransferRect dstRect{};
+	BlockTransferRect srcRect{};
 
 	// These modify the X/Y/W/H parameters depending on the memory offset of the base pointers from the actual buffers.
-	FindTransferFramebuffer(srcBuffer, srcBasePtr, srcStride, srcX, srcY, srcWidth, srcHeight, bpp, false);
-	FindTransferFramebuffer(dstBuffer, dstBasePtr, dstStride, dstX, dstY, dstWidth, dstHeight, bpp, true);
+	bool srcBuffer = FindTransferFramebuffer(srcBasePtr, srcStride, srcX, srcY, width, height, bpp, false, &srcRect);
+	bool dstBuffer = FindTransferFramebuffer(dstBasePtr, dstStride, dstX, dstY, width, height, bpp, true, &dstRect);
 
 	if (srcBuffer && !dstBuffer) {
+		// In here, we can't read from dstRect.
 		if (PSP_CoreParameter().compat.flags().BlockTransferAllowCreateFB ||
 			(PSP_CoreParameter().compat.flags().IntraVRAMBlockTransferAllowCreateFB &&
-				Memory::IsVRAMAddress(srcBuffer->fb_address) && Memory::IsVRAMAddress(dstBasePtr))) {
+				Memory::IsVRAMAddress(srcRect.vfb->fb_address) && Memory::IsVRAMAddress(dstBasePtr))) {
 			GEBufferFormat ramFormat;
 			// Try to guess the appropriate format. We only know the bpp from the block transfer command (16 or 32 bit).
 			if (bpp == 4) {
 				// Only one possibility unless it's doing split pixel tricks (which we could detect through stride maybe).
 				ramFormat = GE_FORMAT_8888;
-			} else if (srcBuffer->fb_format != GE_FORMAT_8888) {
+			} else if (srcRect.vfb->fb_format != GE_FORMAT_8888) {
 				// We guess that the game will interpret the data the same as it was in the source of the copy.
 				// Seems like a likely good guess, and works in Test Drive Unlimited.
-				ramFormat = srcBuffer->fb_format;
+				ramFormat = srcRect.vfb->fb_format;
 			} else {
 				// No info left - just fall back to something. But this is definitely split pixel tricks.
 				ramFormat = GE_FORMAT_5551;
 			}
-			dstBuffer = CreateRAMFramebuffer(dstBasePtr, dstWidth, dstHeight, dstStride, ramFormat);
+			dstBuffer = true;
+			dstRect.vfb = CreateRAMFramebuffer(dstBasePtr, width, height, dstStride, ramFormat);
 		}
 	}
 
-	if (dstBuffer)
-		dstBuffer->last_frame_used = gpuStats.numFlips;
+	if (dstBuffer) {
+		dstRect.vfb->last_frame_used = gpuStats.numFlips;
+		// Mark the destination as fresh.
+		dstRect.vfb->colorBindSeq = GetBindSeqCount();
+	}
 
 	if (dstBuffer && srcBuffer) {
-		if (srcBuffer == dstBuffer) {
-			if (srcX != dstX || srcY != dstY) {
-				WARN_LOG_N_TIMES(dstsrc, 100, G3D, "Intra-buffer block transfer %dx%d %dbpp from %08x (x:%d y:%d stride:%d) -> %08x (x:%d y:%d stride:%d)",
-					width, height, bpp,
-					srcBasePtr, srcX, srcY, srcStride,
-					dstBasePtr, dstX, dstY, dstStride);
-				FlushBeforeCopy();
-				// Some backends can handle blitting within a framebuffer. Others will just have to deal with it or ignore it, apparently.
-				BlitFramebuffer(dstBuffer, dstX, dstY, srcBuffer, srcX, srcY, dstWidth, dstHeight, bpp, "Blit_IntraBufferBlockTransfer");
-				RebindFramebuffer("rebind after intra block transfer");
-				SetColorUpdated(dstBuffer, skipDrawReason);
-				return true;  // Skip the memory copy.
-			} else {
+		if (srcRect.vfb == dstRect.vfb) {
+			// Transfer within the same buffer.
+			// This is a simple case because there will be no format conversion or similar shenanigans needed.
+			// However, the BPP might still mismatch, but in such a case we can convert the coordinates.
+			if (srcX == dstX && srcY == dstY) {
 				// Ignore, nothing to do.  Tales of Phantasia X does this by accident.
-				return true;  // Skip the memory copy.
+				// Returning true to also skip the memory copy.
+				return true;
 			}
-		} else {
-			WARN_LOG_N_TIMES(dstnotsrc, 100, G3D, "Inter-buffer block transfer %dx%d %dbpp from %08x (x:%d y:%d stride:%d) -> %08x (x:%d y:%d stride:%d)",
+
+			int buffer_bpp = BufferFormatBytesPerPixel(srcRect.vfb->fb_format);
+
+			if (bpp != buffer_bpp) {
+				WARN_LOG_ONCE(intrabpp, G3D, "Mismatched transfer bpp in intra-buffer block transfer. Was %d, expected %d.", bpp, buffer_bpp);
+				// We just switch to using the buffer's bpp, since we've already converted the rectangle to byte offsets.
+				bpp = buffer_bpp;
+			}
+
+			WARN_LOG_N_TIMES(dstsrc, 5, G3D, "Intra-buffer block transfer %dx%d %dbpp from %08x (x:%d y:%d stride:%d) -> %08x (x:%d y:%d stride:%d)",
+				width, height, bpp,
+				srcBasePtr, srcRect.x_bytes / bpp, srcRect.y, srcStride,
+				dstBasePtr, dstRect.x_bytes / bpp, dstRect.y, dstStride);
+			FlushBeforeCopy();
+			// Some backends can handle blitting within a framebuffer. Others will just have to deal with it or ignore it, apparently.
+			BlitFramebuffer(dstRect.vfb, dstX, dstY, srcRect.vfb, srcX, srcY, dstRect.w_bytes / bpp, dstRect.h / bpp, bpp, "Blit_IntraBufferBlockTransfer");
+			RebindFramebuffer("rebind after intra block transfer");
+			SetColorUpdated(dstRect.vfb, skipDrawReason);
+			return true;  // Skip the memory copy.
+		}
+
+		// Straightforward blit between two same-format framebuffers.
+		if (srcRect.vfb->fb_format == dstRect.vfb->fb_format) {
+			WARN_LOG_N_TIMES(dstnotsrc, 5, G3D, "Inter-buffer block transfer %dx%d %dbpp from %08x (x:%d y:%d stride:%d %s) -> %08x (x:%d y:%d stride:%d %s)",
 				width, height, bpp,
-				srcBasePtr, srcX, srcY, srcStride,
-				dstBasePtr, dstX, dstY, dstStride);
-			// Straightforward blit between two framebuffers.
+				srcBasePtr, srcRect.x_bytes / bpp, srcRect.y, srcStride, GeBufferFormatToString(srcRect.vfb->fb_format),
+				dstBasePtr, dstRect.x_bytes / bpp, dstRect.y, dstStride, GeBufferFormatToString(dstRect.vfb->fb_format));
+
+			// Straight blit will do, but check the bpp, we might need to convert coordinates differently.
+			int buffer_bpp = BufferFormatBytesPerPixel(srcRect.vfb->fb_format);
+			if (bpp != buffer_bpp) {
+				WARN_LOG_ONCE(intrabpp, G3D, "Mismatched transfer bpp in inter-buffer block transfer. Was %d, expected %d.", bpp, buffer_bpp);
+				// We just switch to using the buffer's bpp, since we've already converted the rectangle to byte offsets.
+				bpp = buffer_bpp;
+			}
 			FlushBeforeCopy();
-			BlitFramebuffer(dstBuffer, dstX, dstY, srcBuffer, srcX, srcY, dstWidth, dstHeight, bpp, "Blit_InterBufferBlockTransfer");
+			BlitFramebuffer(dstRect.vfb, dstRect.x_bytes / bpp, dstRect.y, srcRect.vfb, srcRect.x_bytes / bpp, srcRect.y, srcRect.w_bytes / bpp, height, bpp, "Blit_InterBufferBlockTransfer");
 			RebindFramebuffer("RebindFramebuffer - Inter-buffer block transfer");
-			SetColorUpdated(dstBuffer, skipDrawReason);
-			return true;  // No need to actually do the memory copy behind, probably.
+			SetColorUpdated(dstRect.vfb, skipDrawReason);
+			return true;
 		}
-		return false;
+
+		// Getting to the more complex cases. Have not actually seen much of these yet.
+		WARN_LOG_N_TIMES(blockformat, 5, G3D, "Mismatched buffer formats in block transfer: %s->%s (%dx%d)",
+			GeBufferFormatToString(srcRect.vfb->fb_format), GeBufferFormatToString(dstRect.vfb->fb_format),
+			width, height);
+
+		// TODO
+
+		// No need to actually do the memory copy behind, probably.
+		return true;
+
 	} else if (dstBuffer) {
 		// Here we should just draw the pixels into the buffer.  Copy first.
 		return false;
 	} else if (srcBuffer) {
-		WARN_LOG_N_TIMES(btd, 100, G3D, "Block transfer readback %dx%d %dbpp from %08x (x:%d y:%d stride:%d) -> %08x (x:%d y:%d stride:%d)",
+		WARN_LOG_N_TIMES(btd, 10, G3D, "Block transfer readback %dx%d %dbpp from %08x (x:%d y:%d stride:%d) -> %08x (x:%d y:%d stride:%d)",
 			width, height, bpp,
-			srcBasePtr, srcX, srcY, srcStride,
-			dstBasePtr, dstX, dstY, dstStride);
+			srcBasePtr, srcRect.x_bytes / bpp, srcRect.y, srcStride,
+			dstBasePtr, dstRect.x_bytes / bpp, dstRect.y, dstStride);
 		FlushBeforeCopy();
-		if (g_Config.bBlockTransferGPU && !srcBuffer->memoryUpdated) {
-			const int srcBpp = BufferFormatBytesPerPixel(srcBuffer->fb_format);
+		if (g_Config.bBlockTransferGPU && !srcRect.vfb->memoryUpdated) {
+			const int srcBpp = BufferFormatBytesPerPixel(srcRect.vfb->fb_format);
 			const float srcXFactor = (float)bpp / srcBpp;
-			const bool tooTall = srcY + srcHeight > srcBuffer->bufferHeight;
-			if (srcHeight <= 0 || (tooTall && srcY != 0)) {
-				WARN_LOG_ONCE(btdheight, G3D, "Block transfer download %08x -> %08x skipped, %d+%d is taller than %d", srcBasePtr, dstBasePtr, srcY, srcHeight, srcBuffer->bufferHeight);
+			const bool tooTall = srcY + srcRect.h > srcRect.vfb->bufferHeight;
+			if (srcRect.h <= 0 || (tooTall && srcY != 0)) {
+				WARN_LOG_ONCE(btdheight, G3D, "Block transfer download %08x -> %08x skipped, %d+%d is taller than %d", srcBasePtr, dstBasePtr, srcRect.y, srcRect.h, srcRect.vfb->bufferHeight);
 			} else {
 				if (tooTall) {
-					WARN_LOG_ONCE(btdheight, G3D, "Block transfer download %08x -> %08x dangerous, %d+%d is taller than %d", srcBasePtr, dstBasePtr, srcY, srcHeight, srcBuffer->bufferHeight);
+					WARN_LOG_ONCE(btdheight, G3D, "Block transfer download %08x -> %08x dangerous, %d+%d is taller than %d", srcBasePtr, dstBasePtr, srcRect.y, srcRect.h, srcRect.vfb->bufferHeight);
 				}
-				ReadFramebufferToMemory(srcBuffer, static_cast<int>(srcX * srcXFactor), srcY, static_cast<int>(srcWidth * srcXFactor), srcHeight);
-				srcBuffer->usageFlags = (srcBuffer->usageFlags | FB_USAGE_DOWNLOAD) & ~FB_USAGE_DOWNLOAD_CLEAR;
+				ReadFramebufferToMemory(srcRect.vfb, static_cast<int>(srcX * srcXFactor), srcY, static_cast<int>(srcRect.w_bytes / bpp * srcXFactor), srcRect.h);  // w_bytes -> transfer pixels -> buffer pixels
+				srcRect.vfb->usageFlags = (srcRect.vfb->usageFlags | FB_USAGE_DOWNLOAD) & ~FB_USAGE_DOWNLOAD_CLEAR;
 			}
 		}
 		return false;  // Let the bit copy happen
@@ -1975,18 +2074,17 @@ void FramebufferManagerCommon::NotifyBlockTransferAfter(u32 dstBasePtr, int dstS
 
 	if (MayIntersectFramebuffer(srcBasePtr) || MayIntersectFramebuffer(dstBasePtr)) {
 		// TODO: Figure out how we can avoid repeating the search here.
-		VirtualFramebuffer *dstBuffer = 0;
-		VirtualFramebuffer *srcBuffer = 0;
-		int srcWidth = width;
-		int srcHeight = height;
-		int dstWidth = width;
-		int dstHeight = height;
-		FindTransferFramebuffer(srcBuffer, srcBasePtr, srcStride, srcX, srcY, srcWidth, srcHeight, bpp, false);
-		FindTransferFramebuffer(dstBuffer, dstBasePtr, dstStride, dstX, dstY, dstWidth, dstHeight, bpp, true);
+
+		BlockTransferRect dstRect{};
+		BlockTransferRect srcRect{};
+
+		// These take X/Y/W/H by value and fill in srcRect/dstRect with the found framebuffer and the adjusted rectangle, depending on the memory offset of the base pointers from the actual buffers.
+		bool srcBuffer = FindTransferFramebuffer(srcBasePtr, srcStride, srcX, srcY, width, height, bpp, false, &srcRect);
+		bool dstBuffer = FindTransferFramebuffer(dstBasePtr, dstStride, dstX, dstY, width, height, bpp, true, &dstRect);
 
 		// A few games use this INSTEAD of actually drawing the video image to the screen, they just blast it to
 		// the backbuffer. Detect this and have the framebuffermanager draw the pixels.
-		if (!useBufferedRendering_ && currentRenderVfb_ != dstBuffer) {
+		if (!useBufferedRendering_ && currentRenderVfb_ != dstRect.vfb) {
 			return;
 		}
 
@@ -1994,21 +2092,21 @@ void FramebufferManagerCommon::NotifyBlockTransferAfter(u32 dstBasePtr, int dstS
 			WARN_LOG_ONCE(btu, G3D, "Block transfer upload %08x -> %08x", srcBasePtr, dstBasePtr);
 			FlushBeforeCopy();
 			const u8 *srcBase = Memory::GetPointerUnchecked(srcBasePtr) + (srcX + srcY * srcStride) * bpp;
-			int dstBpp = BufferFormatBytesPerPixel(dstBuffer->fb_format);
+			int dstBpp = BufferFormatBytesPerPixel(dstRect.vfb->fb_format);
 			float dstXFactor = (float)bpp / dstBpp;
-			if (dstWidth > dstBuffer->width || dstHeight > dstBuffer->height) {
+			if (dstRect.w_bytes / bpp > dstRect.vfb->width || dstRect.h > dstRect.vfb->height) {
 				// The buffer isn't big enough, and we have a clear hint of size.  Resize.
 				// This happens in Valkyrie Profile when uploading video at the ending.
-				ResizeFramebufFBO(dstBuffer, dstWidth, dstHeight, false, true);
+				ResizeFramebufFBO(dstRect.vfb, dstRect.w_bytes / bpp, dstRect.h, false, true);
 				// Make sure we don't flop back and forth.
-				dstBuffer->newWidth = std::max(dstWidth, (int)dstBuffer->width);
-				dstBuffer->newHeight = std::max(dstHeight, (int)dstBuffer->height);
-				dstBuffer->lastFrameNewSize = gpuStats.numFlips;
+				dstRect.vfb->newWidth = std::max(dstRect.w_bytes / bpp, (int)dstRect.vfb->width);
+				dstRect.vfb->newHeight = std::max(dstRect.h, (int)dstRect.vfb->height);
+				dstRect.vfb->lastFrameNewSize = gpuStats.numFlips;
 				// Resizing may change the viewport/etc.
 				gstate_c.Dirty(DIRTY_VIEWPORTSCISSOR_STATE | DIRTY_CULLRANGE);
 			}
-			DrawPixels(dstBuffer, static_cast<int>(dstX * dstXFactor), dstY, srcBase, dstBuffer->fb_format, static_cast<int>(srcStride * dstXFactor), static_cast<int>(dstWidth * dstXFactor), dstHeight);
-			SetColorUpdated(dstBuffer, skipDrawReason);
+			DrawPixels(dstRect.vfb, static_cast<int>(dstX * dstXFactor), dstY, srcBase, dstRect.vfb->fb_format, static_cast<int>(srcStride * dstXFactor), static_cast<int>(dstRect.w_bytes / bpp * dstXFactor), dstRect.h);
+			SetColorUpdated(dstRect.vfb, skipDrawReason);
 			RebindFramebuffer("RebindFramebuffer - NotifyBlockTransferAfter");
 		}
 	}
@@ -2469,8 +2567,8 @@ void FramebufferManagerCommon::DeviceLost() {
 
 	presentation_->DeviceLost();
 
-	for (int i = 0; i < 3; i++) {
-		for (int j = 0; j < 3; j++) {
+	for (int i = 0; i < ARRAY_SIZE(reinterpretFromTo_); i++) {
+		for (int j = 0; j < ARRAY_SIZE(reinterpretFromTo_); j++) {
 			DoRelease(reinterpretFromTo_[i][j]);
 		}
 	}
@@ -2666,17 +2764,6 @@ void FramebufferManagerCommon::BlitUsingRaster(
 	draw_->GetFramebufferDimensions(src, &srcW, &srcH);
 	draw_->GetFramebufferDimensions(dest, &destW, &destH);
 
-	float dX = 1.0f / (float)destW;
-	float dY = 1.0f / (float)destH;
-	float sX = 1.0f / (float)srcW;
-	float sY = 1.0f / (float)srcH;
-	Draw2DVertex vtx[4] = {
-		{ -1.0f + 2.0f * dX * destX1, -(1.0f - 2.0f * dY * destY1), sX * srcX1, sY * srcY1 },
-		{ -1.0f + 2.0f * dX * destX2, -(1.0f - 2.0f * dY * destY1), sX * srcX2, sY * srcY1 },
-		{ -1.0f + 2.0f * dX * destX1, -(1.0f - 2.0f * dY * destY2), sX * srcX1, sY * srcY2 },
-		{ -1.0f + 2.0f * dX * destX2, -(1.0f - 2.0f * dY * destY2), sX * srcX2, sY * srcY2 },
-	};
-
 	// Unbind the texture first to avoid the D3D11 hazard check (can't set render target to things bound as textures and vice versa, not even temporarily).
 	draw_->BindTexture(0, nullptr);
 	// This will get optimized away in case it's already bound (in VK and GL at least..)
@@ -2687,7 +2774,7 @@ void FramebufferManagerCommon::BlitUsingRaster(
 	draw_->SetViewports(1, &vp);
 	draw_->SetScissorRect(0, 0, (int)dest->Width(), (int)dest->Height());
 
-	draw2D_.DrawStrip2D(nullptr, vtx, 4, linearFilter, pipeline, src->Width(), src->Height(), renderScaleFactor_);
+	draw2D_.Blit(pipeline, srcX1, srcY1, srcX2, srcY2, destX1, destY1, destX2, destY2, (float)srcW, (float)srcH, (float)destW, (float)destH, linearFilter , renderScaleFactor_);
 
 	gstate_c.Dirty(DIRTY_BLEND_STATE | DIRTY_DEPTHSTENCIL_STATE | DIRTY_RASTER_STATE | DIRTY_VIEWPORTSCISSOR_STATE | DIRTY_VERTEXSHADER_STATE | DIRTY_FRAGMENTSHADER_STATE);
 }
@@ -2702,19 +2789,31 @@ VirtualFramebuffer *FramebufferManagerCommon::ResolveFramebufferColorToFormat(Vi
 			continue;
 		}
 
-		if (dest->fb_address == src->fb_address && dest->fb_stride == src->fb_stride && dest->fb_format == newFormat) {
+		if (dest->fb_address == src->fb_address && dest->FbStrideInBytes() == src->FbStrideInBytes() && dest->fb_format == newFormat) {
 			vfb = dest;
 			break;
 		}
 	}
 
 	if (!vfb) {
-		WARN_LOG(G3D, "Creating %s clone of %08x/%08x/%s", GeBufferFormatToString(newFormat), src->fb_address, src->z_address, GeBufferFormatToString(src->fb_format));
-
 		// Create a clone!
 		vfb = new VirtualFramebuffer();
 		*vfb = *src;  // Copies everything, but watch out! Can't copy fbo.
+
+		// Adjust width by bpp.
+		float widthFactor = (float)BufferFormatBytesPerPixel(vfb->fb_format) / (float)BufferFormatBytesPerPixel(newFormat);
+
+		vfb->width *= widthFactor;
+		vfb->bufferWidth *= widthFactor;
+		vfb->renderWidth *= widthFactor;
+		vfb->drawnWidth *= widthFactor;
+		vfb->newWidth *= widthFactor;
+		vfb->safeWidth *= widthFactor;
+
 		vfb->fb_format = newFormat;
+
+		WARN_LOG(G3D, "Creating %s clone of %08x/%08x/%s (%dx%d -> %dx%d)", GeBufferFormatToString(newFormat), src->fb_address, src->z_address, GeBufferFormatToString(src->fb_format), src->width, src->height, vfb->width, vfb->height);
+
 		char tag[128];
 		FormatFramebufferName(vfb, tag, sizeof(tag));
 		vfb->fbo = draw_->CreateFramebuffer({ vfb->renderWidth, vfb->renderHeight, 1, 1, true, tag });
diff --git a/GPU/Common/FramebufferManagerCommon.h b/GPU/Common/FramebufferManagerCommon.h
index eb070b8347c1..deb13e7f6acb 100644
--- a/GPU/Common/FramebufferManagerCommon.h
+++ b/GPU/Common/FramebufferManagerCommon.h
@@ -142,6 +142,11 @@ struct VirtualFramebuffer {
 	int last_frame_failed;
 	int last_frame_depth_updated;
 	int last_frame_depth_render;
+
+	// Convenience methods: width and color stride multiplied by the color format's bytes per pixel; the Z stride uses a fixed factor of 2.
+	inline int WidthInBytes() const { return width * BufferFormatBytesPerPixel(fb_format); }
+	inline int FbStrideInBytes() const { return fb_stride * BufferFormatBytesPerPixel(fb_format); }
+	inline int ZStrideInBytes() const { return z_stride * 2; }
 };
 
 struct FramebufferHeuristicParams {
@@ -213,6 +218,31 @@ inline Draw::DataFormat GEFormatToThin3D(int geFormat) {
 	}
 }
 
+// Rectangle of a block transfer together with the framebuffer it refers to. Horizontal extents are in bytes, later steps get to convert back into real coordinates as appropriate.
+// Makes it easy to see if blits match etc. NOTE: w_pixels(), x_pixels() and operator < dereference vfb, so it must be non-null when they are called.
+struct BlockTransferRect {
+	VirtualFramebuffer *vfb;  // Framebuffer this rectangle refers to.
+	// RasterChannel channel;  // We currently only deal with color for block copies.
+
+	int x_bytes;  // Horizontal offset, in bytes.
+	int y;  // Vertical offset; presumably in rows (no _bytes suffix) - confirm against FindTransferFramebuffer.
+	int w_bytes;  // Width, in bytes.
+	int h;  // Height; presumably in rows (no _bytes suffix) - confirm against FindTransferFramebuffer.
+
+	std::string ToString() const;  // Declared here, defined elsewhere.
+
+	int w_pixels() const {  // Width converted to pixels of vfb's own color format.
+		return w_bytes / BufferFormatBytesPerPixel(vfb->fb_format);
+	}
+	int x_pixels() const {  // Horizontal offset converted to pixels of vfb's own color format.
+		return x_bytes / BufferFormatBytesPerPixel(vfb->fb_format);
+	}
+
+	bool operator < (const BlockTransferRect &other) const {  // Orders rectangles by their framebuffers' colorBindSeq.
+		return vfb->colorBindSeq < other.vfb->colorBindSeq;
+	}
+};
+
 namespace Draw {
 class DrawContext;
 }
@@ -418,7 +448,7 @@ class FramebufferManagerCommon {
 	bool ShouldDownloadFramebuffer(const VirtualFramebuffer *vfb) const;
 	void DownloadFramebufferOnSwitch(VirtualFramebuffer *vfb);
 
-	void FindTransferFramebuffer(VirtualFramebuffer *&srcBuffer, u32 srcBasePtr, int srcStride, int &srcX, int &srcY, int &srcWidth, int &srcHeight, int bpp, bool destination);
+	bool FindTransferFramebuffer(u32 basePtr, int stride, int x, int y, int w, int h, int bpp, bool destination, BlockTransferRect *rect);
 
 	VirtualFramebuffer *FindDownloadTempBuffer(VirtualFramebuffer *vfb);
 	virtual void UpdateDownloadTempBuffer(VirtualFramebuffer *nvfb) {}
@@ -503,10 +533,10 @@ class FramebufferManagerCommon {
 		FBO_OLD_USAGE_FLAG = 15,
 	};
 
-	// Thin3D stuff for reinterpreting image data between the various 16-bit formats.
+	// Thin3D stuff for reinterpreting image data between the various 16-bit color formats.
 	// Safe, not optimal - there might be input attachment tricks, etc, but we can't use them
 	// since we don't want N different implementations.
-	Draw2DPipeline *reinterpretFromTo_[3][3]{};
+	Draw2DPipeline *reinterpretFromTo_[4][4]{};
 
 	// Common implementation of stencil buffer upload. Also not 100% optimal, but not performance
 	// critical either.
diff --git a/GPU/Common/GPUStateUtils.h b/GPU/Common/GPUStateUtils.h
index 41ee3adc73dc..dcf3cd7c051e 100644
--- a/GPU/Common/GPUStateUtils.h
+++ b/GPU/Common/GPUStateUtils.h
@@ -4,6 +4,7 @@
 #include "Common/CommonTypes.h"
 
 #include "GPU/ge_constants.h"
+#include "GPU/GPUState.h"
 
 // TODO: Replace enums and structs with same from thin3d.h, for convenient mapping.
 
@@ -198,3 +199,21 @@ struct GenericStencilFuncState {
 };
 
 void ConvertStencilFuncState(GenericStencilFuncState &stencilFuncState);
+
+// See issue #15898
+inline bool SpongebobDepthInverseConditions(const GenericStencilFuncState &stencilState) {
+	// Returns true only if the depth/stencil state matches the conditions exactly AND no color can be written (see below).
+	return gstate.isDepthTestEnabled() && !gstate.isDepthWriteEnabled() &&
+		gstate.getDepthTestFunction() == GE_COMP_GEQUAL &&
+		stencilState.zFail == GE_STENCILOP_ZERO && stencilState.sFail == GE_STENCILOP_KEEP && stencilState.zPass == GE_STENCILOP_KEEP &&
+		stencilState.testFunc == GE_COMP_ALWAYS && stencilState.writeMask == 0xFF &&
+		// And also verify no color is written. The game does this through simple alpha blending with a constant zero alpha.
+		// We also accept a color mask, since it's more natural, in case another game does it. Both alternatives share one parenthesis: && binds tighter than ||, so ungrouped, the mask alone would bypass every check above.
+		((gstate.isAlphaBlendEnabled() &&
+			gstate.getBlendFuncA() == GE_SRCBLEND_SRCALPHA &&
+			gstate.getBlendFuncB() == GE_DSTBLEND_INVSRCALPHA &&
+			gstate.getMaterialAmbientA() == 0x0 &&  // our accessor is kinda misnamed here, but material diffuse A is both used as default color and as ambient alpha
+			gstate.getMaterialUpdate() == 0 &&
+			!gstate.isTextureMapEnabled()
+		) || gstate.getColorMask() == 0xFFFFFF00);  // note that PSP masks are "inverted"
+}
diff --git a/GPU/Common/ReinterpretFramebuffer.cpp b/GPU/Common/ReinterpretFramebuffer.cpp
index cbcf463c25de..b30c71e504b5 100644
--- a/GPU/Common/ReinterpretFramebuffer.cpp
+++ b/GPU/Common/ReinterpretFramebuffer.cpp
@@ -24,80 +24,136 @@ Draw2DPipelineInfo GenerateReinterpretFragmentShader(ShaderWriter &writer, GEBuf
 
 	writer.DeclareSamplers(samplers);
 
-	writer.BeginFSMain(Slice<UniformDef>::empty(), varyings, FSFLAG_NONE);
-
-	writer.C("  vec4 val = ").SampleTexture2D("tex", "v_texcoord.xy").C(";\n");
-
 	if (writer.Lang().bitwiseOps) {
 		switch (from) {
 		case GE_FORMAT_4444:
-			writer.C("  uint color = uint(val.r * 15.99) | (uint(val.g * 15.99) << 4u) | (uint(val.b * 15.99) << 8u) | (uint(val.a * 15.99) << 12u);\n");
+			writer.C("uint packColor(vec4 val) {\n");
+			writer.C("  return uint(val.r * 15.99) | (uint(val.g * 15.99) << 4u) | (uint(val.b * 15.99) << 8u) | (uint(val.a * 15.99) << 12u);\n");
+			writer.C("}\n");
 			break;
 		case GE_FORMAT_5551:
+			writer.C("uint packColor(vec4 val) {\n");
 			writer.C("  uint color = uint(val.r * 31.99) | (uint(val.g * 31.99) << 5u) | (uint(val.b * 31.99) << 10u);\n");
 			writer.C("  if (val.a >= 0.5) color |= 0x8000U;\n");
+			writer.C("  return color;\n");
+			writer.C("}\n");
+			break;
+		case GE_FORMAT_565:
+			writer.C("uint packColor(vec4 val) {\n");
+			writer.C("  return uint(val.r * 31.99) | (uint(val.g * 63.99) << 5u) | (uint(val.b * 31.99) << 11u);\n");
+			writer.C("}\n");
+			break;
+		case GE_FORMAT_8888:
+			writer.C("uint packColor(vec2 val) {\n");
+			writer.C("  return uint(val.r * 255.99) | (uint(val.g * 255.99) << 8u);\n");
+			writer.C("}\n");
+			break;
+		default:
+			_assert_(false);
+			break;
+		}
+	} else {
+		// Floating point can comfortably represent integers up to 16 million, we only need 65536 since these textures are 16-bit.
+		switch (from) {
+		case GE_FORMAT_4444:
+			writer.C("float packColor(vec4 val) {\n");
+			writer.C("  return (floor(val.r * 15.99) + floor(val.g * 15.99) * 16.0) + (floor(val.b * 15.99) * 256.0 + floor(val.a * 15.99) * 4096.0);\n");
+			writer.C("}\n");
+			break;
+		case GE_FORMAT_5551:
+			writer.C("float packColor(vec4 val) {\n");
+			writer.C("  float color = floor(val.r * 31.99) + floor(val.g * 31.99) * 32.0 + floor(val.b * 31.99) * 1024.0;\n");
+			writer.C("  if (val.a >= 0.5) color += 32768.0;\n");
+			writer.C("  return color;\n");
+			writer.C("}\n");
 			break;
 		case GE_FORMAT_565:
-			writer.C("  uint color = uint(val.r * 31.99) | (uint(val.g * 63.99) << 5u) | (uint(val.b * 31.99) << 11u);\n");
+			writer.C("float packColor(vec4 val) {\n");
+			writer.C("  return floor(val.r * 31.99) + floor(val.g * 63.99) * 32.0 + floor(val.b * 31.99) * 2048.0;\n");
+			writer.C("}\n");
+			break;
+		case GE_FORMAT_8888:
+			writer.C("float packColor(vec2 val) {\n");
+			writer.C("  return floor(val.r * 255.99) + floor(val.g * 255.99) * 256.0;\n");
+			writer.C("}\n");
 			break;
 		default:
 			_assert_(false);
 			break;
 		}
+	}
 
+	if (writer.Lang().bitwiseOps) {
 		switch (to) {
 		case GE_FORMAT_4444:
+			writer.C("vec4 unpackColor(uint color) {\n");
 			writer.C("  vec4 outColor = vec4(float(color & 0xFU), float((color >> 4u) & 0xFU), float((color >> 8u) & 0xFU), float((color >> 12u) & 0xFU));\n");
 			writer.C("  outColor *= 1.0 / 15.0;\n");
+			writer.C("  return outColor;\n");
+			writer.C("}\n");
 			break;
 		case GE_FORMAT_5551:
+			writer.C("vec4 unpackColor(uint color) {\n");
 			writer.C("  vec4 outColor = vec4(float(color & 0x1FU), float((color >> 5u) & 0x1FU), float((color >> 10u) & 0x1FU), 0.0);\n");
 			writer.C("  outColor.rgb *= 1.0 / 31.0;\n");
 			writer.C("  outColor.a = float(color >> 15);\n");
+			writer.C("  return outColor;\n");
+			writer.C("}\n");
 			break;
 		case GE_FORMAT_565:
+			writer.C("vec4 unpackColor(uint color) {\n");
 			writer.C("  vec4 outColor = vec4(float(color & 0x1FU), float((color >> 5u) & 0x3FU), float((color >> 11u) & 0x1FU), 1.0);\n");
 			writer.C("  outColor.rb *= 1.0 / 31.0;\n");
 			writer.C("  outColor.g *= 1.0 / 63.0;\n");
+			writer.C("  return outColor;\n");
+			writer.C("}\n");
 			break;
-		default:
-			_assert_(false);
-			break;
-		}
-	} else {
-		// Floating point can comfortably represent integers up to 16 million, we only need 65536 since these textures are 16-bit.
-		switch (from) {
-		case GE_FORMAT_4444:
-			writer.C("  float color = (floor(val.r * 15.99) + floor(val.g * 15.99) * 16.0) + (floor(val.b * 15.99) * 256.0 + floor(val.a * 15.99) * 4096.0);\n");
-			break;
-		case GE_FORMAT_5551:
-			writer.C("  float color = floor(val.r * 31.99) + floor(val.g * 31.99) * 32.0 + floor(val.b * 31.99) * 1024.0;\n");
-			writer.C("  if (val.a >= 0.5) color += 32768.0;\n");
-			break;
-		case GE_FORMAT_565:
-			writer.C("  float color = floor(val.r * 31.99) + floor(val.g * 63.99) * 32.0 + floor(val.b * 31.99) * 2048.0;\n");
+		case GE_FORMAT_8888:
+			writer.C("vec4 unpackColor(uint colorLeft, uint colorRight) {\n");
+			writer.C("  vec4 outColor = vec4(float(colorLeft & 0xFFu),  float((colorLeft >> 8u)  & 0xFFu),\n");
+			writer.C("                       float(colorRight & 0xFFu), float((colorRight >> 8u) & 0xFFu));\n");
+			writer.C("  outColor *= 1.0 / 255.0;\n");
+			writer.C("  return outColor;\n");
+			writer.C("}\n");
 			break;
 		default:
 			_assert_(false);
 			break;
 		}
-
+	} else {
 		switch (to) {
 		case GE_FORMAT_4444:
+			writer.C("vec4 unpackColor(float color) {\n");
 			writer.C("  vec4 outColor = vec4(mod(floor(color), 16.0), mod(floor(color / 16.0), 16.0),");
 			writer.C("                       mod(floor(color / 256.0), 16.0), mod(floor(color / 4096.0), 16.0)); \n");
 			writer.C("  outColor *= 1.0 / 15.0;\n");
+			writer.C("  return outColor;\n");
+			writer.C("}\n");
 			break;
 		case GE_FORMAT_5551:
+			writer.C("vec4 unpackColor(float color) {\n");
 			writer.C("  vec4 outColor = vec4(mod(floor(color), 32.0), mod(floor(color / 32.0), 32.0), mod(floor(color / 1024.0), 32.0), 0.0);\n");
 			writer.C("  outColor.rgb *= 1.0 / 31.0;\n");
 			writer.C("  outColor.a = floor(color / 32768.0);\n");
+			writer.C("  return outColor;\n");
+			writer.C("}\n");
 			break;
 		case GE_FORMAT_565:
+			writer.C("vec4 unpackColor(float color) {\n");
 			writer.C("  vec4 outColor = vec4(mod(floor(color), 32.0), mod(floor(color / 32.0), 64.0), mod(floor(color / 2048.0), 32.0), 0.0);\n");
 			writer.C("  outColor.rb *= 1.0 / 31.0;\n");
 			writer.C("  outColor.g *= 1.0 / 63.0;\n");
 			writer.C("  outColor.a = 1.0;\n");
+			writer.C("  return outColor;\n");
+			writer.C("}\n");
+			break;
+		case GE_FORMAT_8888:
+			writer.C("vec4 unpackColor(float colorLeft, float colorRight) {\n");
+			writer.C("  vec4 outColor = vec4(mod(floor(colorLeft), 256.0), mod(floor(colorLeft / 256.0), 256.0),\n");
+			writer.C("                       mod(floor(colorRight), 256.0), mod(floor(colorRight / 256.0), 256.0));\n");
+			writer.C("  outColor *= 1.0 / 255.0;\n");
+			writer.C("  return outColor;\n");
+			writer.C("}\n");
 			break;
 		default:
 			_assert_(false);
@@ -105,6 +161,27 @@ Draw2DPipelineInfo GenerateReinterpretFragmentShader(ShaderWriter &writer, GEBuf
 		}
 	}
 
+	writer.BeginFSMain(g_draw2Duniforms, varyings, FSFLAG_NONE);
+
+	if (IsBufferFormat16Bit(from) && IsBufferFormat16Bit(to)) {
+		writer.C("  vec4 val = ").SampleTexture2D("tex", "v_texcoord.xy").C(";\n");
+		writer.C("  vec4 outColor = unpackColor(packColor(val));\n");
+	} else if (IsBufferFormat16Bit(from) && !IsBufferFormat16Bit(to)) {
+		// 16-to-32-bit (two pixels, draw size is halved)
+
+		writer.C("  vec4 valLeft = ").SampleTexture2D("tex", "v_texcoord.xy + vec2(-0.25 / texSize.x, 0.0)").C(";\n");
+		writer.C("  vec4 valRight = ").SampleTexture2D("tex", "v_texcoord.xy + vec2(0.25 / texSize.x, 0.0)").C(";\n");
+		writer.C("  vec4 outColor = unpackColor(packColor(valLeft), packColor(valRight));\n");
+
+		_assert_("not yet implemented");
+	} else if (!IsBufferFormat16Bit(from) && IsBufferFormat16Bit(to)) {
+		// 32-to-16-bit (half of the pixel, draw size is doubled).
+
+		writer.C("  vec4 val = ").SampleTexture2D("tex", "v_texcoord.xy").C(";\n");
+		writer.C("  float u = mod(floor(v_texcoord.x * texSize.x * 2.0), 2.0);\n");
+		writer.C("  vec4 outColor = unpackColor(u == 0.0 ? packColor(val.rg) : packColor(val.ba));\n");
+	}
+
 	writer.EndFSMain("outColor", FSFLAG_NONE);
 
 	return Draw2DPipelineInfo{
diff --git a/GPU/Common/TextureCacheCommon.cpp b/GPU/Common/TextureCacheCommon.cpp
index 87746043cbb3..e6ce09ae62cf 100644
--- a/GPU/Common/TextureCacheCommon.cpp
+++ b/GPU/Common/TextureCacheCommon.cpp
@@ -662,28 +662,11 @@ int TextureCacheCommon::GetBestCandidateIndex(const std::vector<AttachCandidate>
 	int bestRelevancy = -1;
 	int bestIndex = -1;
 
-	// TODO: Instead of scores, we probably want to use std::min_element to pick the top element, using 
-	// a comparison function.
+	// We simply use the sequence counter as relevancy nowadays.
 	for (int i = 0; i < (int)candidates.size(); i++) {
 		const AttachCandidate &candidate = candidates[i];
 		int relevancy = candidate.seqCount;
 
-		// Bonus point for matching stride.
-		if (candidate.channel == RASTER_COLOR && candidate.fb->fb_stride == candidate.entry.bufw) {
-			relevancy += 1000;
-		}
-
-		// Bonus points for no offset.
-		if (candidate.match.xOffset == 0 && candidate.match.yOffset == 0) {
-			relevancy += 100;
-		}
-
-		if (candidate.channel == RASTER_COLOR && candidate.fb->last_frame_render == gpuStats.numFlips) {
-			relevancy += 50;
-		} else if (candidate.channel == RASTER_DEPTH && candidate.fb->last_frame_depth_render == gpuStats.numFlips) {
-			relevancy += 50;
-		}
-
 		if (relevancy > bestRelevancy) {
 			bestRelevancy = relevancy;
 			bestIndex = i;
@@ -907,20 +890,18 @@ bool TextureCacheCommon::MatchFramebuffer(
 
 	// If they match "exactly", it's non-CLUT and from the top left.
 	if (exactMatch) {
+		// TODO: Better checks for compatible strides here.
 		if (fb_stride != entry.bufw) {
-			WARN_LOG_ONCE(diffStrides1, G3D, "Found matching framebuffer with different strides %d != %d", entry.bufw, (int)fb_stride);
+			WARN_LOG_ONCE(diffStrides1, G3D, "Found matching framebuffer at %08x with different strides %d != %d", fb_address, entry.bufw, (int)fb_stride);
 		}
 		// NOTE: This check is okay because the first texture formats are the same as the buffer formats.
 		if (IsTextureFormatBufferCompatible(entry.format)) {
 			if (TextureFormatMatchesBufferFormat(entry.format, fb_format) || (framebuffer->usageFlags & FB_USAGE_BLUE_TO_ALPHA)) {
 				return true;
-			} else if (IsTextureFormat16Bit(entry.format) && IsBufferFormat16Bit(fb_format) && channel == RASTER_COLOR) {
-				WARN_LOG_ONCE(diffFormat1, G3D, "Found matching framebuffer with reinterpretable fb_format: %s != %s", GeTextureFormatToString(entry.format), GeBufferFormatToString(fb_format));
+			} else {
+				WARN_LOG_ONCE(diffFormat1, G3D, "Found matching framebuffer with reinterpretable fb_format: %s != %s at %08x", GeTextureFormatToString(entry.format), GeBufferFormatToString(fb_format), fb_address);
 				*matchInfo = FramebufferMatchInfo{ 0, 0, true, TextureFormatToBufferFormat(entry.format) };
 				return true;
-			} else {
-				WARN_LOG_ONCE(diffFormat2, G3D, "Rejecting framebuffer with incompatible formats %s != %s", GeTextureFormatToString(entry.format), GeBufferFormatToString(fb_format));
-				return false;
 			}
 		} else {
 			// Format incompatible, ignoring without comment. (maybe some really gnarly hacks will end up here...)
@@ -954,7 +935,7 @@ bool TextureCacheCommon::MatchFramebuffer(
 
 		if (fb_stride != entry.bufw) {
 			if (noOffset) {
-				WARN_LOG_ONCE(diffStrides2, G3D, "Matching framebuffer(matching_clut = % s) different strides % d != % d", matchingClutFormat ? "yes" : "no", entry.bufw, fb_stride);
+				WARN_LOG_ONCE(diffStrides2, G3D, "Matching framebuffer(matching_clut = %s) different strides %d != %d", matchingClutFormat ? "yes" : "no", entry.bufw, fb_stride);
 				// Continue on with other checks.
 				// Not actually sure why we even try here. There's no way it'll go well if the strides are different.
 			} else {
@@ -984,7 +965,7 @@ bool TextureCacheCommon::MatchFramebuffer(
 			}
 			return true;
 		} else if (IsClutFormat((GETextureFormat)(entry.format)) || IsDXTFormat((GETextureFormat)(entry.format))) {
-			WARN_LOG_ONCE(fourEightBit, G3D, "%s fb_format not supported when texturing from framebuffer of format %s", GeTextureFormatToString(entry.format), GeBufferFormatToString(fb_format));
+			WARN_LOG_ONCE(fourEightBit, G3D, "%s fb_format not matching framebuffer of format %s at %08x/%d", GeTextureFormatToString(entry.format), GeBufferFormatToString(fb_format), fb_address, fb_stride);
 			return false;
 		}
 
@@ -1024,9 +1005,18 @@ void TextureCacheCommon::SetTextureFramebuffer(const AttachCandidate &candidate)
 	nextFramebufferTextureChannel_ = RASTER_COLOR;
 
 	if (framebufferManager_->UseBufferedRendering()) {
+		// Detect when we need to apply the horizontal texture swizzle.
+		u64 depthUpperBits = (channel == RASTER_DEPTH && framebuffer->fb_format == GE_FORMAT_8888) ? ((gstate.getTextureAddress(0) & 0x600000) >> 20) : 0;
+		bool needsDepthXSwizzle = depthUpperBits == 2;
+
 		// We need to force it, since we may have set it on a texture before attaching.
 		gstate_c.curTextureWidth = framebuffer->bufferWidth;
 		gstate_c.curTextureHeight = framebuffer->bufferHeight;
+
+		if (needsDepthXSwizzle) {
+			gstate_c.curTextureWidth = RoundUpToPowerOf2(gstate_c.curTextureWidth);
+		}
+
 		if (gstate_c.bgraTexture) {
 			gstate_c.Dirty(DIRTY_FRAGMENTSHADER_STATE);
 		} else if ((gstate_c.curTextureXOffset == 0) != (fbInfo.xOffset == 0) || (gstate_c.curTextureYOffset == 0) != (fbInfo.yOffset == 0)) {
@@ -1884,6 +1874,7 @@ static bool CanUseSmoothDepal(const GPUgstate &gstate, GEBufferFormat framebuffe
 	return false;
 }
 
+
 void TextureCacheCommon::ApplyTextureFramebuffer(VirtualFramebuffer *framebuffer, GETextureFormat texFormat, RasterChannel channel) {
 	Draw2DPipeline *textureShader = nullptr;
 	uint32_t clutMode = gstate.clutformat & 0xFFFFFF;
@@ -1910,6 +1901,7 @@ void TextureCacheCommon::ApplyTextureFramebuffer(VirtualFramebuffer *framebuffer
 	const GEPaletteFormat clutFormat = gstate.getClutPaletteFormat();
 	ClutTexture clutTexture{};
 	bool smoothedDepal = false;
+	u32 depthUpperBits = 0;
 
 	if (need_depalettize && !g_Config.bDisableSlowFramebufEffects) {
 		clutTexture = textureShaderCache_->GetClutTexture(clutFormat, clutHash_, clutBufRaw_);
@@ -1944,20 +1936,33 @@ void TextureCacheCommon::ApplyTextureFramebuffer(VirtualFramebuffer *framebuffer
 			return;
 		}
 
-		textureShader = textureShaderCache_->GetDepalettizeShader(clutMode, texFormat, depth ? GE_FORMAT_DEPTH16 : framebuffer->fb_format, smoothedDepal);
+		depthUpperBits = (depth && framebuffer->fb_format == GE_FORMAT_8888) ? ((gstate.getTextureAddress(0) & 0x600000) >> 20) : 0;
+
+		textureShader = textureShaderCache_->GetDepalettizeShader(clutMode, texFormat, depth ? GE_FORMAT_DEPTH16 : framebuffer->fb_format, smoothedDepal, depthUpperBits);
 		gstate_c.SetUseShaderDepal(false, false);
 	}
 
 	if (textureShader) {
 		const GEPaletteFormat clutFormat = gstate.getClutPaletteFormat();
 		ClutTexture clutTexture = textureShaderCache_->GetClutTexture(clutFormat, clutHash_, clutBufRaw_);
-		Draw::Framebuffer *depalFBO = framebufferManager_->GetTempFBO(TempFBO::DEPAL, framebuffer->renderWidth, framebuffer->renderHeight);
+
+		bool needsDepthXSwizzle = depthUpperBits == 2;
+
+		int depalWidth = framebuffer->renderWidth;
+		int texWidth = framebuffer->width;
+		if (needsDepthXSwizzle) {
+			texWidth = RoundUpToPowerOf2(framebuffer->width);
+			depalWidth = texWidth * framebuffer->renderScaleFactor;
+			gstate_c.Dirty(DIRTY_UVSCALEOFFSET);
+		}
+
+		Draw::Framebuffer *depalFBO = framebufferManager_->GetTempFBO(TempFBO::DEPAL, depalWidth, framebuffer->renderHeight);
 		draw_->BindTexture(0, nullptr);
 		draw_->BindTexture(1, nullptr);
 		draw_->BindFramebufferAsRenderTarget(depalFBO, { Draw::RPAction::DONT_CARE, Draw::RPAction::DONT_CARE, Draw::RPAction::DONT_CARE }, "Depal");
 
-		draw_->SetScissorRect(0, 0, (int)framebuffer->renderWidth, (int)framebuffer->renderHeight);
-		Draw::Viewport vp{ 0.0f, 0.0f, (float)framebuffer->renderWidth, (float)framebuffer->renderHeight, 0.0f, 1.0f };
+		draw_->SetScissorRect(0, 0, (int)depalWidth, (int)framebuffer->renderHeight);
+		Draw::Viewport vp{ 0.0f, 0.0f, (float)depalWidth, (float)framebuffer->renderHeight, 0.0f, 1.0f };
 		draw_->SetViewports(1, &vp);
 
 		draw_->BindFramebufferAsTexture(framebuffer->fbo, 0, depth ? Draw::FB_DEPTH_BIT : Draw::FB_COLOR_BIT, 0);
@@ -1967,9 +1972,28 @@ void TextureCacheCommon::ApplyTextureFramebuffer(VirtualFramebuffer *framebuffer
 		draw_->BindSamplerStates(0, 1, &nearest);
 		draw_->BindSamplerStates(1, 1, &clutSampler);
 
-		textureShaderCache_->ApplyShader(textureShader,
-			framebuffer->bufferWidth, framebuffer->bufferHeight, framebuffer->renderWidth, framebuffer->renderHeight,
-			gstate_c.vertBounds, gstate_c.curTextureXOffset, gstate_c.curTextureYOffset);
+		// If min is not < max, then we don't have values (wasn't set during decode.)
+		const KnownVertexBounds &bounds = gstate_c.vertBounds;
+		float u1 = 0.0f;
+		float v1 = 0.0f;
+		float u2 = depalWidth;
+		float v2 = framebuffer->renderHeight;
+		if (bounds.minV < bounds.maxV) {
+			u1 = bounds.minU + gstate_c.curTextureXOffset;
+			v1 = bounds.minV + gstate_c.curTextureYOffset;
+			u2 = bounds.maxU + gstate_c.curTextureXOffset;
+			v2 = bounds.maxV + gstate_c.curTextureYOffset;
+			// We need to reapply the texture next time since we cropped UV.
+			gstate_c.Dirty(DIRTY_TEXTURE_PARAMS);
+		}
+		u1 *= framebuffer->renderScaleFactor;
+		v1 *= framebuffer->renderScaleFactor;
+		u2 *= framebuffer->renderScaleFactor;
+		v2 *= framebuffer->renderScaleFactor;
+
+		draw2D_->Blit(textureShader, u1, v1, u2, v2, u1, v1, u2, v2, framebuffer->renderWidth, framebuffer->renderHeight, depalWidth, framebuffer->renderHeight, false, framebuffer->renderScaleFactor);
+
+		gstate_c.curTextureWidth = texWidth;
 
 		draw_->BindTexture(0, nullptr);
 		framebufferManager_->RebindFramebuffer("ApplyTextureFramebuffer");
diff --git a/GPU/Common/TextureShaderCommon.cpp b/GPU/Common/TextureShaderCommon.cpp
index ab7a280a53df..2584dc595b31 100644
--- a/GPU/Common/TextureShaderCommon.cpp
+++ b/GPU/Common/TextureShaderCommon.cpp
@@ -187,11 +187,11 @@ void TextureShaderCache::Decimate() {
 	}
 }
 
-Draw2DPipeline *TextureShaderCache::GetDepalettizeShader(uint32_t clutMode, GETextureFormat textureFormat, GEBufferFormat bufferFormat, bool smoothedDepal) {
+Draw2DPipeline *TextureShaderCache::GetDepalettizeShader(uint32_t clutMode, GETextureFormat textureFormat, GEBufferFormat bufferFormat, bool smoothedDepal, u32 depthUpperBits) {
 	using namespace Draw;
 
 	// Generate an ID for depal shaders.
-	u32 id = (clutMode & 0xFFFFFF) | (textureFormat << 24) | (bufferFormat << 28);
+	u64 id = ((u64)depthUpperBits << 32) | (clutMode & 0xFFFFFF) | (textureFormat << 24) | (bufferFormat << 28);  // Widen first: shifting a u32 by 32 is undefined and would drop depthUpperBits from the key.
 
 	auto shader = depalCache_.find(id);
 	if (shader != depalCache_.end()) {
@@ -207,6 +207,7 @@ Draw2DPipeline *TextureShaderCache::GetDepalettizeShader(uint32_t clutMode, GETe
 	config.bufferFormat = bufferFormat;
 	config.textureFormat = textureFormat;
 	config.smoothedDepal = smoothedDepal;
+	config.depthUpperBits = depthUpperBits;
 
 	char *buffer = new char[4096];
 	Draw2DPipeline *ts = draw2D_->Create2DPipeline([=](ShaderWriter &writer) -> Draw2DPipelineInfo {
@@ -247,51 +248,3 @@ std::string TextureShaderCache::DebugGetShaderString(std::string idstr, DebugSha
 		return "";
 	}
 }
-
-void TextureShaderCache::ApplyShader(Draw2DPipeline *pipeline, float bufferW, float bufferH, int renderW, int renderH, const KnownVertexBounds &bounds, u32 uoff, u32 voff) {
-	Draw2DVertex verts[4] = {
-		{-1, -1, 0, 0 },
-		{ 1, -1, 1, 0 },
-		{-1,  1, 0, 1 },
-		{ 1,  1, 1, 1 },
-	};
-
-	// If min is not < max, then we don't have values (wasn't set during decode.)
-	if (bounds.minV < bounds.maxV) {
-		const float invWidth = 1.0f / bufferW;
-		const float invHeight = 1.0f / bufferH;
-		// Inverse of half = double.
-		const float invHalfWidth = invWidth * 2.0f;
-		const float invHalfHeight = invHeight * 2.0f;
-
-		const int u1 = bounds.minU + uoff;
-		const int v1 = bounds.minV + voff;
-		const int u2 = bounds.maxU + uoff;
-		const int v2 = bounds.maxV + voff;
-
-		const float left = u1 * invHalfWidth - 1.0f;
-		const float right = u2 * invHalfWidth - 1.0f;
-		const float top = v1 * invHalfHeight - 1.0f;
-		const float bottom = v2 * invHalfHeight - 1.0f;
-
-		const float uvleft = u1 * invWidth;
-		const float uvright = u2 * invWidth;
-		const float uvtop = v1 * invHeight;
-		const float uvbottom = v2 * invHeight;
-
-		// Points are: BL, BR, TR, TL.
-		verts[0] = Draw2DVertex{ left, bottom, uvleft, uvbottom };
-		verts[1] = Draw2DVertex{ right, bottom, uvright, uvbottom };
-		verts[2] = Draw2DVertex{ left, top, uvleft, uvtop };
-		verts[3] = Draw2DVertex{ right, top, uvright, uvtop };
-
-		// We need to reapply the texture next time since we cropped UV.
-		gstate_c.Dirty(DIRTY_TEXTURE_PARAMS);
-	}
-
-	Draw::Viewport vp{ 0.0f, 0.0f, (float)renderW, (float)renderH, 0.0f, 1.0f };
-	draw_->BindPipeline(pipeline->pipeline);
-	draw_->SetViewports(1, &vp);
-	draw_->SetScissorRect(0, 0, renderW, renderH);
-	draw_->DrawUP((const uint8_t *)verts, 4);
-}
diff --git a/GPU/Common/TextureShaderCommon.h b/GPU/Common/TextureShaderCommon.h
index 5e0c5fe4765b..a11ee812101f 100644
--- a/GPU/Common/TextureShaderCommon.h
+++ b/GPU/Common/TextureShaderCommon.h
@@ -43,13 +43,11 @@ class TextureShaderCache {
 	TextureShaderCache(Draw::DrawContext *draw, Draw2D *draw2D);
 	~TextureShaderCache();
 
-	Draw2DPipeline *GetDepalettizeShader(uint32_t clutMode, GETextureFormat texFormat, GEBufferFormat pixelFormat, bool smoothedDepal);
+	Draw2DPipeline *GetDepalettizeShader(uint32_t clutMode, GETextureFormat texFormat, GEBufferFormat pixelFormat, bool smoothedDepal, u32 depthUpperBits);
 	ClutTexture GetClutTexture(GEPaletteFormat clutFormat, const u32 clutHash, u32 *rawClut);
 
 	Draw::SamplerState *GetSampler(bool linearFilter);
 
-	void ApplyShader(Draw2DPipeline *pipeline, float bufferW, float bufferH, int renderW, int renderH, const KnownVertexBounds &bounds, u32 uoff, u32 voff);
-
 	void Clear();
 	void Decimate();
 	std::vector<std::string> DebugGetShaderIDs(DebugShaderType type);
@@ -64,6 +62,6 @@ class TextureShaderCache {
 	Draw::SamplerState *linearSampler_ = nullptr;
 	Draw2D *draw2D_;
 
-	std::map<u32, Draw2DPipeline *> depalCache_;
+	std::map<u64, Draw2DPipeline *> depalCache_;
 	std::map<u32, ClutTexture *> texCache_;
 };
diff --git a/GPU/D3D11/StateMappingD3D11.cpp b/GPU/D3D11/StateMappingD3D11.cpp
index 11db65c55348..b75edef5d59a 100644
--- a/GPU/D3D11/StateMappingD3D11.cpp
+++ b/GPU/D3D11/StateMappingD3D11.cpp
@@ -217,44 +217,6 @@ void DrawEngineD3D11::ApplyDrawState(int prim) {
 
 			keys_.blend.colorWriteMask = (maskState.rgba[0] ? 1 : 0) | (maskState.rgba[1] ? 2 : 0) | (maskState.rgba[2] ? 4 : 0) | (maskState.rgba[3] ? 8 : 0);
 		}
-
-		if (!device1_) {
-			ID3D11BlendState *bs = blendCache_.Get(keys_.blend.value);
-			if (bs == nullptr) {
-				D3D11_BLEND_DESC desc{};
-				D3D11_RENDER_TARGET_BLEND_DESC &rt = desc.RenderTarget[0];
-				rt.BlendEnable = keys_.blend.blendEnable;
-				rt.BlendOp = (D3D11_BLEND_OP)keys_.blend.blendOpColor;
-				rt.BlendOpAlpha = (D3D11_BLEND_OP)keys_.blend.blendOpAlpha;
-				rt.SrcBlend = (D3D11_BLEND)keys_.blend.srcColor;
-				rt.DestBlend = (D3D11_BLEND)keys_.blend.destColor;
-				rt.SrcBlendAlpha = (D3D11_BLEND)keys_.blend.srcAlpha;
-				rt.DestBlendAlpha = (D3D11_BLEND)keys_.blend.destAlpha;
-				rt.RenderTargetWriteMask = keys_.blend.colorWriteMask;
-				ASSERT_SUCCESS(device_->CreateBlendState(&desc, &bs));
-				blendCache_.Insert(keys_.blend.value, bs);
-			}
-			blendState_ = bs;
-		} else {
-			ID3D11BlendState1 *bs1 = blendCache1_.Get(keys_.blend.value);
-			if (bs1 == nullptr) {
-				D3D11_BLEND_DESC1 desc1{};
-				D3D11_RENDER_TARGET_BLEND_DESC1 &rt = desc1.RenderTarget[0];
-				rt.BlendEnable = keys_.blend.blendEnable;
-				rt.BlendOp = (D3D11_BLEND_OP)keys_.blend.blendOpColor;
-				rt.BlendOpAlpha = (D3D11_BLEND_OP)keys_.blend.blendOpAlpha;
-				rt.SrcBlend = (D3D11_BLEND)keys_.blend.srcColor;
-				rt.DestBlend = (D3D11_BLEND)keys_.blend.destColor;
-				rt.SrcBlendAlpha = (D3D11_BLEND)keys_.blend.srcAlpha;
-				rt.DestBlendAlpha = (D3D11_BLEND)keys_.blend.destAlpha;
-				rt.RenderTargetWriteMask = keys_.blend.colorWriteMask;
-				rt.LogicOpEnable = keys_.blend.logicOpEnable;
-				rt.LogicOp = (D3D11_LOGIC_OP)keys_.blend.logicOp;
-				ASSERT_SUCCESS(device1_->CreateBlendState1(&desc1, &bs1));
-				blendCache1_.Insert(keys_.blend.value, bs1);
-			}
-			blendState1_ = bs1;
-		}
 	}
 
 	if (gstate_c.IsDirty(DIRTY_RASTER_STATE)) {
@@ -275,18 +237,6 @@ void DrawEngineD3D11::ApplyDrawState(int prim) {
 				keys_.raster.depthClipEnable = 1;
 			}
 		}
-		ID3D11RasterizerState *rs = rasterCache_.Get(keys_.raster.value);
-		if (rs == nullptr) {
-			D3D11_RASTERIZER_DESC desc{};
-			desc.CullMode = (D3D11_CULL_MODE)(keys_.raster.cullMode);
-			desc.FillMode = D3D11_FILL_SOLID;
-			desc.ScissorEnable = TRUE;
-			desc.FrontCounterClockwise = TRUE;
-			desc.DepthClipEnable = keys_.raster.depthClipEnable;
-			ASSERT_SUCCESS(device_->CreateRasterizerState(&desc, &rs));
-			rasterCache_.Insert(keys_.raster.value, rs);
-		}
-		rasterState_ = rs;
 	}
 
 	if (gstate_c.IsDirty(DIRTY_DEPTHSTENCIL_STATE)) {
@@ -343,29 +293,36 @@ void DrawEngineD3D11::ApplyDrawState(int prim) {
 				keys_.depthStencil.stencilWriteMask = stencilState.writeMask;
 				dynState_.useStencil = true;
 				dynState_.stencilRef = stencilState.testRef;
+
+				// Nasty special case for Spongebob and similar where it tries to write zeros to alpha/stencil during
+				// depth-fail. We can't write to alpha then because the pixel is killed. However, we can invert the depth
+				// test and modify the alpha function...
+				if (SpongebobDepthInverseConditions(stencilState)) {
+					keys_.blend.blendEnable = true;
+					keys_.blend.blendOpAlpha = D3D11_BLEND_OP_ADD;
+					keys_.blend.blendOpColor = D3D11_BLEND_OP_ADD;
+					keys_.blend.srcColor = D3D11_BLEND_ZERO;
+					keys_.blend.destColor = D3D11_BLEND_ZERO;
+					keys_.blend.logicOpEnable = false;
+					keys_.blend.srcAlpha = D3D11_BLEND_ZERO;
+					keys_.blend.destAlpha = D3D11_BLEND_ZERO;
+					keys_.blend.colorWriteMask = D3D11_COLOR_WRITE_ENABLE_ALPHA;
+
+					keys_.depthStencil.depthCompareOp = D3D11_COMPARISON_LESS;  // Inverse of GREATER_EQUAL
+					keys_.depthStencil.stencilCompareFunc = D3D11_COMPARISON_ALWAYS;
+					// Invert
+					keys_.depthStencil.stencilPassOp = D3D11_STENCIL_OP_ZERO;
+					keys_.depthStencil.stencilFailOp = D3D11_STENCIL_OP_ZERO;
+					keys_.depthStencil.stencilDepthFailOp = D3D11_STENCIL_OP_KEEP;
+
+					// TODO: Need to set in a way that carries over to the next draw..
+					gstate_c.Dirty(DIRTY_BLEND_STATE);
+				}
 			} else {
 				keys_.depthStencil.stencilTestEnable = false;
 				dynState_.useStencil = false;
 			}
 		}
-		ID3D11DepthStencilState *ds = depthStencilCache_.Get(keys_.depthStencil.value);
-		if (ds == nullptr) {
-			D3D11_DEPTH_STENCIL_DESC desc{};
-			desc.DepthEnable = keys_.depthStencil.depthTestEnable;
-			desc.DepthWriteMask = keys_.depthStencil.depthWriteEnable ? D3D11_DEPTH_WRITE_MASK_ALL : D3D11_DEPTH_WRITE_MASK_ZERO;
-			desc.DepthFunc = (D3D11_COMPARISON_FUNC)keys_.depthStencil.depthCompareOp;
-			desc.StencilEnable = keys_.depthStencil.stencilTestEnable;
-			desc.StencilReadMask = keys_.depthStencil.stencilCompareMask;
-			desc.StencilWriteMask = keys_.depthStencil.stencilWriteMask;
-			desc.FrontFace.StencilFailOp = (D3D11_STENCIL_OP)keys_.depthStencil.stencilFailOp;
-			desc.FrontFace.StencilPassOp = (D3D11_STENCIL_OP)keys_.depthStencil.stencilPassOp;
-			desc.FrontFace.StencilDepthFailOp = (D3D11_STENCIL_OP)keys_.depthStencil.stencilDepthFailOp;
-			desc.FrontFace.StencilFunc = (D3D11_COMPARISON_FUNC)keys_.depthStencil.stencilCompareFunc;
-			desc.BackFace = desc.FrontFace;
-			ASSERT_SUCCESS(device_->CreateDepthStencilState(&desc, &ds));
-			depthStencilCache_.Insert(keys_.depthStencil.value, ds);
-		}
-		depthStencilState_ = ds;
 	}
 
 	if (gstate_c.IsDirty(DIRTY_VIEWPORTSCISSOR_STATE)) {
@@ -397,6 +354,84 @@ void DrawEngineD3D11::ApplyDrawState(int prim) {
 		scissor.bottom = vpAndScissor.scissorY + std::max(0, vpAndScissor.scissorH);
 	}
 
+	// Actually create/set the state objects only after we're done mapping all the state.
+	// There might have been interactions between depth and blend above.
+	if (gstate_c.IsDirty(DIRTY_BLEND_STATE)) {
+		if (!device1_) {
+			ID3D11BlendState *bs = blendCache_.Get(keys_.blend.value);
+			if (bs == nullptr) {
+				D3D11_BLEND_DESC desc{};
+				D3D11_RENDER_TARGET_BLEND_DESC &rt = desc.RenderTarget[0];
+				rt.BlendEnable = keys_.blend.blendEnable;
+				rt.BlendOp = (D3D11_BLEND_OP)keys_.blend.blendOpColor;
+				rt.BlendOpAlpha = (D3D11_BLEND_OP)keys_.blend.blendOpAlpha;
+				rt.SrcBlend = (D3D11_BLEND)keys_.blend.srcColor;
+				rt.DestBlend = (D3D11_BLEND)keys_.blend.destColor;
+				rt.SrcBlendAlpha = (D3D11_BLEND)keys_.blend.srcAlpha;
+				rt.DestBlendAlpha = (D3D11_BLEND)keys_.blend.destAlpha;
+				rt.RenderTargetWriteMask = keys_.blend.colorWriteMask;
+				ASSERT_SUCCESS(device_->CreateBlendState(&desc, &bs));
+				blendCache_.Insert(keys_.blend.value, bs);
+			}
+			blendState_ = bs;
+		} else {
+			ID3D11BlendState1 *bs1 = blendCache1_.Get(keys_.blend.value);
+			if (bs1 == nullptr) {
+				D3D11_BLEND_DESC1 desc1{};
+				D3D11_RENDER_TARGET_BLEND_DESC1 &rt = desc1.RenderTarget[0];
+				rt.BlendEnable = keys_.blend.blendEnable;
+				rt.BlendOp = (D3D11_BLEND_OP)keys_.blend.blendOpColor;
+				rt.BlendOpAlpha = (D3D11_BLEND_OP)keys_.blend.blendOpAlpha;
+				rt.SrcBlend = (D3D11_BLEND)keys_.blend.srcColor;
+				rt.DestBlend = (D3D11_BLEND)keys_.blend.destColor;
+				rt.SrcBlendAlpha = (D3D11_BLEND)keys_.blend.srcAlpha;
+				rt.DestBlendAlpha = (D3D11_BLEND)keys_.blend.destAlpha;
+				rt.RenderTargetWriteMask = keys_.blend.colorWriteMask;
+				rt.LogicOpEnable = keys_.blend.logicOpEnable;
+				rt.LogicOp = (D3D11_LOGIC_OP)keys_.blend.logicOp;
+				ASSERT_SUCCESS(device1_->CreateBlendState1(&desc1, &bs1));
+				blendCache1_.Insert(keys_.blend.value, bs1);
+			}
+			blendState1_ = bs1;
+		}
+	}
+
+	if (gstate_c.IsDirty(DIRTY_RASTER_STATE)) {
+		ID3D11RasterizerState *rs = rasterCache_.Get(keys_.raster.value);
+		if (rs == nullptr) {
+			D3D11_RASTERIZER_DESC desc{};
+			desc.CullMode = (D3D11_CULL_MODE)(keys_.raster.cullMode);
+			desc.FillMode = D3D11_FILL_SOLID;
+			desc.ScissorEnable = TRUE;
+			desc.FrontCounterClockwise = TRUE;
+			desc.DepthClipEnable = keys_.raster.depthClipEnable;
+			ASSERT_SUCCESS(device_->CreateRasterizerState(&desc, &rs));
+			rasterCache_.Insert(keys_.raster.value, rs);
+		}
+		rasterState_ = rs;
+	}
+
+	if (gstate_c.IsDirty(DIRTY_DEPTHSTENCIL_STATE)) {
+		ID3D11DepthStencilState *ds = depthStencilCache_.Get(keys_.depthStencil.value);
+		if (ds == nullptr) {
+			D3D11_DEPTH_STENCIL_DESC desc{};
+			desc.DepthEnable = keys_.depthStencil.depthTestEnable;
+			desc.DepthWriteMask = keys_.depthStencil.depthWriteEnable ? D3D11_DEPTH_WRITE_MASK_ALL : D3D11_DEPTH_WRITE_MASK_ZERO;
+			desc.DepthFunc = (D3D11_COMPARISON_FUNC)keys_.depthStencil.depthCompareOp;
+			desc.StencilEnable = keys_.depthStencil.stencilTestEnable;
+			desc.StencilReadMask = keys_.depthStencil.stencilCompareMask;
+			desc.StencilWriteMask = keys_.depthStencil.stencilWriteMask;
+			desc.FrontFace.StencilFailOp = (D3D11_STENCIL_OP)keys_.depthStencil.stencilFailOp;
+			desc.FrontFace.StencilPassOp = (D3D11_STENCIL_OP)keys_.depthStencil.stencilPassOp;
+			desc.FrontFace.StencilDepthFailOp = (D3D11_STENCIL_OP)keys_.depthStencil.stencilDepthFailOp;
+			desc.FrontFace.StencilFunc = (D3D11_COMPARISON_FUNC)keys_.depthStencil.stencilCompareFunc;
+			desc.BackFace = desc.FrontFace;
+			ASSERT_SUCCESS(device_->CreateDepthStencilState(&desc, &ds));
+			depthStencilCache_.Insert(keys_.depthStencil.value, ds);
+		}
+		depthStencilState_ = ds;
+	}
+
 	if (gstate_c.IsDirty(DIRTY_TEXTURE_IMAGE | DIRTY_TEXTURE_PARAMS) && !gstate.isModeClear() && gstate.isTextureMapEnabled()) {
 		textureCache_->SetTexture();
 		gstate_c.Clean(DIRTY_TEXTURE_IMAGE | DIRTY_TEXTURE_PARAMS);
diff --git a/GPU/Directx9/StateMappingDX9.cpp b/GPU/Directx9/StateMappingDX9.cpp
index 9fb92db6fe7f..a82ccb4605c9 100644
--- a/GPU/Directx9/StateMappingDX9.cpp
+++ b/GPU/Directx9/StateMappingDX9.cpp
@@ -229,7 +229,6 @@ void DrawEngineDX9::ApplyDrawState(int prim) {
 			} else {
 				dxstate.stencilTest.disable();
 			}
-
 		} else {
 			// Depth Test
 			if (gstate.isDepthTestEnabled()) {
@@ -248,6 +247,24 @@ void DrawEngineDX9::ApplyDrawState(int prim) {
 				dxstate.stencilCompareMask.set(stencilState.testMask);
 				dxstate.stencilOp.set(stencilOps[stencilState.sFail], stencilOps[stencilState.zFail], stencilOps[stencilState.zPass]);
 				dxstate.stencilWriteMask.set(stencilState.writeMask);
+
+				// Nasty special case for Spongebob and similar where it tries to write zeros to alpha/stencil during
+				// depth-fail. We can't write to alpha then because the pixel is killed. However, we can invert the depth
+				// test and modify the alpha function...
+				if (SpongebobDepthInverseConditions(stencilState)) {
+					dxstate.blend.set(true);
+					dxstate.blendEquation.set(D3DBLENDOP_ADD, D3DBLENDOP_ADD);
+					dxstate.blendFunc.set(D3DBLEND_ZERO, D3DBLEND_ZERO, D3DBLEND_ZERO, D3DBLEND_ZERO);
+					dxstate.colorMask.set(8);
+
+					dxstate.depthFunc.set(D3DCMP_LESS);
+					dxstate.stencilFunc.set(D3DCMP_ALWAYS);
+					// Invert
+					dxstate.stencilOp.set(D3DSTENCILOP_ZERO, D3DSTENCILOP_KEEP, D3DSTENCILOP_ZERO);
+
+					// TODO: Need to set in a way that carries over to the next draw..
+					gstate_c.Dirty(DIRTY_BLEND_STATE);
+				}
 			} else {
 				dxstate.stencilTest.disable();
 			}
diff --git a/GPU/GLES/StateMappingGLES.cpp b/GPU/GLES/StateMappingGLES.cpp
index 4ebdd165d29b..a06bac650c4f 100644
--- a/GPU/GLES/StateMappingGLES.cpp
+++ b/GPU/GLES/StateMappingGLES.cpp
@@ -144,7 +144,6 @@ void DrawEngineGLES::ApplyDrawState(int prim) {
 	bool useBufferedRendering = framebufferManager_->UseBufferedRendering();
 
 	if (gstate_c.IsDirty(DIRTY_BLEND_STATE)) {
-		gstate_c.Clean(DIRTY_BLEND_STATE);
 		gstate_c.SetAllowFramebufferRead(!g_Config.bDisableSlowFramebufEffects);
 
 		if (gstate.isModeClear()) {
@@ -208,7 +207,6 @@ void DrawEngineGLES::ApplyDrawState(int prim) {
 			} else {
 				renderManager->SetNoBlendAndMask(mask);
 			}
-
 #ifndef USING_GLES2
 			if (gstate_c.Supports(GPU_SUPPORTS_LOGIC_OP)) {
 				renderManager->SetLogicOp(gstate.isLogicOpEnabled() && gstate.getLogicOp() != GE_LOGIC_COPY,
@@ -219,8 +217,6 @@ void DrawEngineGLES::ApplyDrawState(int prim) {
 	}
 
 	if (gstate_c.IsDirty(DIRTY_RASTER_STATE)) {
-		gstate_c.Clean(DIRTY_RASTER_STATE);
-
 		// Dither
 		bool dither = gstate.isDitherEnabled();
 		bool cullEnable;
@@ -247,7 +243,6 @@ void DrawEngineGLES::ApplyDrawState(int prim) {
 	}
 
 	if (gstate_c.IsDirty(DIRTY_DEPTHSTENCIL_STATE)) {
-		gstate_c.Clean(DIRTY_DEPTHSTENCIL_STATE);
 		GenericStencilFuncState stencilState;
 		ConvertStencilFuncState(stencilState);
 
@@ -264,6 +259,19 @@ void DrawEngineGLES::ApplyDrawState(int prim) {
 			if (stencilState.enabled) {
 				renderManager->SetStencilFunc(stencilState.enabled, compareOps[stencilState.testFunc], stencilState.testRef, stencilState.testMask);
 				renderManager->SetStencilOp(stencilState.writeMask, stencilOps[stencilState.sFail], stencilOps[stencilState.zFail], stencilOps[stencilState.zPass]);
+
+				// Nasty special case for Spongebob and similar where it tries to write zeros to alpha/stencil during
+				// depth-fail. We can't write to alpha then because the pixel is killed. However, we can invert the depth
+				// test and modify the alpha function...
+				if (SpongebobDepthInverseConditions(stencilState)) {
+					renderManager->SetBlendAndMask(0x8, true, GL_ZERO, GL_ZERO, GL_ZERO, GL_ZERO, GL_FUNC_ADD, GL_FUNC_ADD);
+					renderManager->SetDepth(true, false, GL_LESS);
+					renderManager->SetStencilFunc(true, GL_ALWAYS, 0xFF, 0xFF);
+					renderManager->SetStencilOp(0xFF, GL_ZERO, GL_KEEP, GL_ZERO);
+
+					// TODO: Need to set in a way that carries over to the next draw..
+					gstate_c.Dirty(DIRTY_BLEND_STATE);
+				}
 			} else {
 				renderManager->SetStencilDisabled();
 			}
@@ -271,7 +279,6 @@ void DrawEngineGLES::ApplyDrawState(int prim) {
 	}
 
 	if (gstate_c.IsDirty(DIRTY_VIEWPORTSCISSOR_STATE)) {
-		gstate_c.Clean(DIRTY_VIEWPORTSCISSOR_STATE);
 		ConvertViewportAndScissor(useBufferedRendering,
 			framebufferManager_->GetRenderWidth(), framebufferManager_->GetRenderHeight(),
 			framebufferManager_->GetTargetBufferWidth(), framebufferManager_->GetTargetBufferHeight(),
@@ -284,6 +291,8 @@ void DrawEngineGLES::ApplyDrawState(int prim) {
 			vpAndScissor.viewportW, vpAndScissor.viewportH,
 			vpAndScissor.depthRangeMin, vpAndScissor.depthRangeMax });
 	}
+
+	gstate_c.Clean(DIRTY_VIEWPORTSCISSOR_STATE | DIRTY_DEPTHSTENCIL_STATE | DIRTY_RASTER_STATE | DIRTY_BLEND_STATE);
 }
 
 void DrawEngineGLES::ApplyDrawStateLate(bool setStencilValue, int stencilValue) {
diff --git a/GPU/Vulkan/ShaderManagerVulkan.cpp b/GPU/Vulkan/ShaderManagerVulkan.cpp
index 26057328cfe3..0d4b31b96d52 100644
--- a/GPU/Vulkan/ShaderManagerVulkan.cpp
+++ b/GPU/Vulkan/ShaderManagerVulkan.cpp
@@ -358,7 +358,7 @@ VulkanFragmentShader *ShaderManagerVulkan::GetFragmentShaderFromModule(VkShaderM
 // instantaneous.
 
 #define CACHE_HEADER_MAGIC 0xff51f420 
-#define CACHE_VERSION 19
+#define CACHE_VERSION 20
 struct VulkanCacheHeader {
 	uint32_t magic;
 	uint32_t version;
diff --git a/GPU/Vulkan/StateMappingVulkan.cpp b/GPU/Vulkan/StateMappingVulkan.cpp
index 566708480abb..9ca4a81b9d95 100644
--- a/GPU/Vulkan/StateMappingVulkan.cpp
+++ b/GPU/Vulkan/StateMappingVulkan.cpp
@@ -301,6 +301,30 @@ void DrawEngineVulkan::ConvertStateToVulkanKey(FramebufferManagerVulkan &fbManag
 				dynState.stencilRef = stencilState.testRef;
 				dynState.stencilCompareMask = stencilState.testMask;
 				dynState.stencilWriteMask = stencilState.writeMask;
+
+				// Nasty special case for Spongebob and similar where it tries to write zeros to alpha/stencil during
+				// depth-fail. We can't write to alpha then because the pixel is killed. However, we can invert the depth
+				// test and modify the alpha function...
+				if (SpongebobDepthInverseConditions(stencilState)) {
+					key.blendEnable = true;
+					key.blendOpAlpha = VK_BLEND_OP_ADD;
+					key.blendOpColor = VK_BLEND_OP_ADD;
+					key.srcColor = VK_BLEND_FACTOR_ZERO;
+					key.destColor = VK_BLEND_FACTOR_ZERO;
+					key.logicOpEnable = false;
+					key.srcAlpha = VK_BLEND_FACTOR_ZERO;
+					key.destAlpha = VK_BLEND_FACTOR_ZERO;
+					key.colorWriteMask = VK_COLOR_COMPONENT_A_BIT;
+					key.depthCompareOp = VK_COMPARE_OP_LESS;  // Inverse of GREATER_EQUAL
+					key.stencilCompareOp = VK_COMPARE_OP_ALWAYS;
+					// Invert
+					key.stencilPassOp = VK_STENCIL_OP_ZERO;
+					key.stencilFailOp = VK_STENCIL_OP_ZERO;
+					key.stencilDepthFailOp = VK_STENCIL_OP_KEEP;
+
+					// TODO: Need to set in a way that carries over to the next draw..
+					gstate_c.Dirty(DIRTY_BLEND_STATE);
+				}
 			} else {
 				key.stencilTestEnable = false;
 				key.stencilCompareOp = VK_COMPARE_OP_ALWAYS;
diff --git a/assets/compat.ini b/assets/compat.ini
index 974a6216defd..289a5d21bb71 100644
--- a/assets/compat.ini
+++ b/assets/compat.ini
@@ -745,6 +745,12 @@ ULJM05412 = true
 NPJH50083 = true
 ULJM05570 = true
 
+# Cars Race-o-rama
+ULUS10428 = true
+# MX vs ATV Reflex
+ULES01375 = true
+ULUS10429 = true
+
 [IntraVRAMBlockTransferAllowCreateFB]
 # Final Fantasy - Type 0
 ULJM05900 = true
@@ -1149,6 +1155,46 @@ ULES01441 = true
 ULJM05600 = true
 ULJM05775 = true
 
+# Spongebob - The Yellow Avenger (see #15898)
+ULUS10092 = true
+ULES00280 = true
+
+# MX vs ATV Reflex
+ULES01375 = true
+ULUS10429 = true
+
+# MX vs ATV Untamed
+ULES00993 = true
+ULES00994 = true
+ULUS10330 = true
+
+# Cars Race-o-rama
+ULES01333 = true
+ULUS10428 = true
+
+# God of War: Chains of Olympus
+# The old hack for the shadows isn't working anymore since the framebuffers don't match.
+# This is nicer anyway.
+UCUS98653 = true
+UCES00842 = true
+UCKS45084 = true
+UCUS98705 = true
+ULJM05348 = true
+ULJM05438 = true
+NPUG80325 = true
+NPEG00023 = true
+NPHG00028 = true
+
+# God of War: Ghost of Sparta
+UCUS98737 = true
+UCAS40323 = true
+UCKS45161 = true
+NPHG00092 = true
+NPEG00044 = true
+UCJS10114 = true
+UCES01401 = true
+NPJG00120 = true
+
 [ShaderColorBitmask]
 # No users right now, but keeping it around as a more accurate option than BlueToAlpha, for debugging mainly Outrun.
 
diff --git a/unittest/TestShaderGenerators.cpp b/unittest/TestShaderGenerators.cpp
index 5e722e3148f1..28ab16e2c486 100644
--- a/unittest/TestShaderGenerators.cpp
+++ b/unittest/TestShaderGenerators.cpp
@@ -286,6 +286,7 @@ bool TestDepalShaders() {
 		config.mask = 0xFF;
 		config.bufferFormat = GE_FORMAT_8888;
 		config.textureFormat = GE_TFMT_CLUT32;
+		config.depthUpperBits = 0;
 
 		ShaderWriter writer(buffer, desc, ShaderStage::Fragment);
 		GenerateDepalFs(writer, config);