Skip to content

Commit

Permalink
Reorganize the depth vertex pipeline for future optimizations
Browse files Browse the repository at this point in the history
  • Loading branch information
hrydgard committed Dec 20, 2024
1 parent 86c9ecc commit 9f7802a
Show file tree
Hide file tree
Showing 6 changed files with 201 additions and 186 deletions.
267 changes: 92 additions & 175 deletions GPU/Common/DepthRaster.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,6 @@
#include "Common/Math/math_util.h"
#include "GPU/Common/VertexDecoderCommon.h"

// Integer screen-space vertex handed to the depth rasterizer.
struct ScreenVert {
int x;  // Screen X in whole pixels (subpixel precision is dropped when this is filled in).
int y;  // Screen Y in whole pixels.
uint16_t z;  // Depth value; the code filling it in clamps to the 0..65535 range first.
uint16_t behind;  // Nonzero when the vertex's w was not > 0. Triangles using such a vertex are skipped.
};

void DepthRasterRect(uint16_t *dest, int stride, int x1, int y1, int x2, int y2, short depthValue, GEComparison depthCompare) {
// Swap coordinates if needed, we don't back-face-cull rects.
// We also ignore the UV rotation here.
Expand Down Expand Up @@ -84,7 +77,7 @@ void DepthRasterRect(uint16_t *dest, int stride, int x1, int y1, int x2, int y2,
// Adapted from Intel's depth rasterizer example.
// Started with the scalar version, will SIMD-ify later.
// x1/y1 etc are the scissor rect.
void DepthRasterTriangle(uint16_t *depthBuf, int stride, int x1, int y1, int x2, int y2, const ScreenVert vertsSub[3], GEComparison compareMode) {
void DepthRasterTriangle(uint16_t *depthBuf, int stride, int x1, int y1, int x2, int y2, const DepthScreenVertex vertsSub[3], GEComparison compareMode) {
int tileStartX = x1;
int tileEndX = x2;

Expand All @@ -96,7 +89,7 @@ void DepthRasterTriangle(uint16_t *depthBuf, int stride, int x1, int y1, int x2,
// are slow on SSE2.

// Convert to whole pixels for now. Later subpixel precision.
ScreenVert verts[3];
DepthScreenVertex verts[3];
verts[0].x = vertsSub[0].x;
verts[0].y = vertsSub[0].y;
verts[0].z = vertsSub[0].z;
Expand Down Expand Up @@ -214,179 +207,123 @@ void DepthRasterTriangle(uint16_t *depthBuf, int stride, int x1, int y1, int x2,
} // for each row
}

// We ignore lots of primitive types for now.
void DepthRasterPrim(uint16_t *depth, int depthStride, int x1, int y1, int x2, int y2, void *bufferData,
const void *vertexData, const void *indexData, GEPrimitiveType prim, int count, VertexDecoder *dec, u32 vertTypeID, bool clockwise) {
void DecodeAndTransformForDepthRaster(float *dest, GEPrimitiveType prim, const float *worldviewproj, const void *vertexData, int count, VertexDecoder *dec, u32 vertTypeID) {
// TODO: Ditch skinned and morphed prims for now since we don't have a fast way to skin without running the full decoder.
_dbg_assert_((vertTypeID & (GE_VTYPE_WEIGHT_MASK | GE_VTYPE_MORPHCOUNT_MASK)) == 0);

GEComparison compareMode = gstate.getDepthTestFunction();
if (gstate.isModeClear()) {
if (!gstate.isClearModeDepthMask()) {
return;
}
compareMode = GE_COMP_ALWAYS;
} else {
if (!gstate.isDepthTestEnabled() || !gstate.isDepthWriteEnabled())
return;
}
int vertexStride = dec->VertexSize();
int offset = dec->posoff;

switch (prim) {
case GE_PRIM_INVALID:
case GE_PRIM_KEEP_PREVIOUS:
case GE_PRIM_LINES:
case GE_PRIM_LINE_STRIP:
case GE_PRIM_POINTS:
return;
default:
float temp[3];
switch (vertTypeID & GE_VTYPE_POS_MASK) {
case GE_VTYPE_POS_8BIT:
for (int i = 0; i < count; i++) {
const s8 *data = (const s8 *)vertexData + i * vertexStride + offset;
for (int j = 0; j < 3; j++) {
temp[j] = data[j] * (1.0f / 128.0f); // TODO: Can we bake this factor in somewhere?
}
Vec3ByMatrix44(dest + i * 4, temp, worldviewproj);
}
break;
case GE_VTYPE_POS_16BIT:
for (int i = 0; i < count; i++) {
const s16 *data = ((const s16 *)((const s8 *)vertexData + i * vertexStride + offset));
for (int j = 0; j < 3; j++) {
temp[j] = data[j] * (1.0f / 32768.0f); // TODO: Can we bake this factor in somewhere?
}
Vec3ByMatrix44(dest + i * 4, temp, worldviewproj);
}
break;
case GE_VTYPE_POS_FLOAT:
for (int i = 0; i < count; i++) {
const float *data = (const float *)((const u8 *)vertexData + vertexStride * i + offset);
Vec3ByMatrix44(dest + i * 4, data, worldviewproj);
}
break;
}
}

// TODO: Ditch indexed primitives for now, also ditched skinned ones since we don't have a fast way to skin without
// running the full decoder.
if (vertTypeID & (GE_VTYPE_IDX_MASK | GE_VTYPE_WEIGHT_MASK)) {
return;
// Converts `count` already-transformed vertices into the integer screen-space
// format consumed by the depth rasterizer.
//
// screenVerts: output array, must have room for `count` entries.
// transformed: input vertices; pos[0]/pos[1] are truncated to whole pixels and
//              pos[2] becomes the 16-bit depth value.
// count:       number of vertices to convert.
void DepthRasterConvertTransformed(DepthScreenVertex *screenVerts, const TransformedVertex *transformed, int count) {
	for (int i = 0; i < count; i++) {
		// Clamp depth to the representable range before narrowing. Converting an
		// out-of-range floating-point value to an integer type is undefined behavior,
		// and the viewport-transform path in this file clamps the same way.
		// NOTE(review): assumes pos[] holds floats - confirm against TransformedVertex.
		float z = transformed[i].pos[2];
		if (z < 0.0f) {
			z = 0.0f;
		}
		if (z > 65535.0f) {
			z = 65535.0f;
		}

		screenVerts[i].x = (int)transformed[i].pos[0];
		screenVerts[i].y = (int)transformed[i].pos[1];
		screenVerts[i].z = (u16)z;
	}
}

bool isThroughMode = (vertTypeID & GE_VTYPE_THROUGH_MASK) != 0;
bool cullEnabled = false;
bool cullCCW = false;

// Turn the input data into a raw float array that we can pass to an optimized triangle rasterizer.
float *transformed = (float *)bufferData;
int DepthRasterClipIndexedTriangles(DepthScreenVertex *screenVerts, const float *transformed, const uint16_t *indexBuffer, int count) {
bool cullEnabled = gstate.isCullEnabled();

ScreenVert *screenVerts = (ScreenVert *)((uint8_t *)bufferData + 65536 * 8);
const float viewportX = gstate.getViewportXCenter();
const float viewportY = gstate.getViewportYCenter();
const float viewportZ = gstate.getViewportZCenter();
const float viewportScaleX = gstate.getViewportXScale();
const float viewportScaleY = gstate.getViewportYScale();
const float viewportScaleZ = gstate.getViewportZScale();

// Simple, most common case.
int vertexStride = dec->VertexSize();
int offset = dec->posoff;
bool cullCCW = false;

// OK, we now have the coordinates. Let's transform, we can actually do this in-place.
if (!(vertTypeID & GE_VTYPE_THROUGH_MASK)) {
float world[16];
float view[16];
float worldview[16];
float worldviewproj[16];
ConvertMatrix4x3To4x4(world, gstate.worldMatrix);
ConvertMatrix4x3To4x4(view, gstate.viewMatrix);
Matrix4ByMatrix4(worldview, world, view);
Matrix4ByMatrix4(worldviewproj, worldview, gstate.projMatrix); // TODO: Include adjustments to the proj matrix?

cullEnabled = gstate.isCullEnabled();

float viewportX = gstate.getViewportXCenter();
float viewportY = gstate.getViewportYCenter();
float viewportZ = gstate.getViewportZCenter();
float viewportScaleX = gstate.getViewportXScale();
float viewportScaleY = gstate.getViewportYScale();
float viewportScaleZ = gstate.getViewportZScale();

bool allBehind = true;

float temp[3];
for (int i = 0; i < count; i++) {
switch (vertTypeID & GE_VTYPE_POS_MASK) {
case GE_VTYPE_POS_8BIT:
for (int i = 0; i < count; i++) {
const s8 *data = (const s8 *)vertexData + i * vertexStride + offset;
for (int j = 0; j < 3; j++) {
temp[j] = data[j] * (1.0f / 128.0f); // TODO: Can we bake this factor in somewhere?
}
Vec3ByMatrix44(transformed + i * 4, temp, worldviewproj);
}
break;
case GE_VTYPE_POS_16BIT:
for (int i = 0; i < count; i++) {
const s16 *data = ((const s16 *)((const s8 *)vertexData + i * vertexStride + offset));
for (int j = 0; j < 3; j++) {
temp[j] = data[j] * (1.0f / 32768.0f); // TODO: Can we bake this factor in somewhere?
}
Vec3ByMatrix44(transformed + i * 4, temp, worldviewproj);
}
break;
case GE_VTYPE_POS_FLOAT:
for (int i = 0; i < count; i++) {
const float *data = (const float *)((const u8 *)vertexData + vertexStride * i + offset);
Vec3ByMatrix44(transformed + i * 4, data, worldviewproj);
}
break;
}
}

for (int i = 0; i < count; i++) {
float proj[4];
memcpy(proj, transformed + i * 4, 4 * sizeof(float));
int outCount = 0;

float w = proj[3];
for (int i = 0; i < count; i += 3) {
const float *verts[3] = {
transformed + indexBuffer[i] * 4,
transformed + indexBuffer[i + 1] * 4,
transformed + indexBuffer[i + 2] * 4,
};

bool inFront = w > 0.0f;
screenVerts[i].behind = !inFront;
if (inFront) {
allBehind = false;
}
// Check if any vertex is behind the 0 plane.
if (verts[0][3] < 0.0f || verts[1][3] < 0.0f || verts[2][3] < 0.0f) {
// Ditch this triangle. Later we should clip here.
continue;
}

for (int c = 0; c < 3; c++) {
const float *src = verts[c];
float invW = 1.0f / src[3];

// Clip to the w=0 plane.
proj[0] /= w;
proj[1] /= w;
proj[2] /= w;
float x = src[0] * invW;
float y = src[1] * invW;
float z = src[2] * invW;

// Then transform by the viewport and offset to finally get subpixel coordinates. Normally, this is done by the viewport
// and offset params.
float screen[3];
screen[0] = (proj[0] * viewportScaleX + viewportX) * 16.0f - gstate.getOffsetX16();
screen[1] = (proj[1] * viewportScaleY + viewportY) * 16.0f - gstate.getOffsetY16();
screen[2] = (proj[2] * viewportScaleZ + viewportZ);
screen[0] = (x * viewportScaleX + viewportX) * 16.0f - gstate.getOffsetX16();
screen[1] = (y * viewportScaleY + viewportY) * 16.0f - gstate.getOffsetY16();
screen[2] = (z * viewportScaleZ + viewportZ);
if (screen[2] < 0.0f) {
screen[2] = 0.0f;
}
if (screen[2] >= 65535.0f) {
screen[2] = 65535.0f;
}
screenVerts[i].x = screen[0] * (1.0f / 16.0f); // We ditch the subpixel precision here.
screenVerts[i].y = screen[1] * (1.0f / 16.0f);
screenVerts[i].z = screen[2];
screenVerts[outCount].x = screen[0] * (1.0f / 16.0f); // We ditch the subpixel precision here.
screenVerts[outCount].y = screen[1] * (1.0f / 16.0f);
screenVerts[outCount].z = screen[2];

outCount++;
}
if (allBehind) {
// Cull the whole draw.
}
return outCount;
}

// Rasterizes screen-space vertices.
void DepthRasterScreenVerts(uint16_t *depth, int depthStride, GEPrimitiveType prim, int x1, int y1, int x2, int y2, const DepthScreenVertex *screenVerts, int count) {
// Prim should now be either TRIANGLES or RECTs.
_dbg_assert_(prim == GE_PRIM_RECTANGLES || prim == GE_PRIM_TRIANGLES);

GEComparison compareMode = gstate.getDepthTestFunction();
if (gstate.isModeClear()) {
if (!gstate.isClearModeDepthMask()) {
return;
}
compareMode = GE_COMP_ALWAYS;
} else {
float factor = 1.0f;
switch (vertTypeID & GE_VTYPE_POS_MASK) {
case GE_VTYPE_POS_8BIT:
for (int i = 0; i < count; i++) {
const s8 *data = (const s8 *)vertexData + i * vertexStride + offset;
for (int j = 0; j < 3; j++) {
transformed[i * 4 + j] = data[j] * factor;
}
transformed[i * 4 + 3] = 1.0f;
}
break;
case GE_VTYPE_POS_16BIT:
for (int i = 0; i < count; i++) {
const s16 *data = ((const s16 *)((const s8 *)vertexData + i * vertexStride + offset));
for (int j = 0; j < 3; j++) {
transformed[i * 4 + j] = data[j] * factor;
}
transformed[i * 4 + 3] = 1.0f;
}
break;
case GE_VTYPE_POS_FLOAT:
for (int i = 0; i < count; i++) {
memcpy(&transformed[i * 4], (const u8 *)vertexData + vertexStride * i + offset, sizeof(float) * 3);
transformed[i * 4 + 3] = 1.0f;
}
break;
}

for (int i = 0; i < count; i++) {
screenVerts[i].x = (int)transformed[i * 4 + 0];
screenVerts[i].y = (int)transformed[i * 4 + 1];
screenVerts[i].z = (u16)clamp_value(transformed[i * 4 + 2], 0.0f, 65535.0f);
}
if (!gstate.isDepthTestEnabled() || !gstate.isDepthWriteEnabled())
return;
}

// Then we need to stitch primitives from strips, etc etc...
// For now we'll just do it tri by tri. Later let's be more efficient.

switch (prim) {
case GE_PRIM_RECTANGLES:
for (int i = 0; i < count / 2; i++) {
Expand All @@ -399,30 +336,10 @@ void DepthRasterPrim(uint16_t *depth, int depthStride, int x1, int y1, int x2, i
break;
case GE_PRIM_TRIANGLES:
for (int i = 0; i < count / 3; i++) {
if (screenVerts[i * 3].behind || screenVerts[i * 3 + 1].behind || screenVerts[i * 3 + 2].behind) {
continue;
}
DepthRasterTriangle(depth, depthStride, x1, y1, x2, y2, screenVerts + i * 3, compareMode);
}
break;
case GE_PRIM_TRIANGLE_STRIP:
{
int wind = 2;
for (int i = 0; i < count - 2; i++) {
int i0 = i;
int i1 = i + wind;
wind ^= 3;
int i2 = i + wind;
if (screenVerts[i0].behind || screenVerts[i1].behind || screenVerts[i2].behind) {
continue;
}
ScreenVert v[3];
v[0] = screenVerts[i0];
v[1] = screenVerts[i1];
v[2] = screenVerts[i2];
DepthRasterTriangle(depth, depthStride, x1, y1, x2, y2, v, compareMode);
}
break;
}
default:
_dbg_assert_(false);
}
}
13 changes: 12 additions & 1 deletion GPU/Common/DepthRaster.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,21 @@
#include "Common/CommonTypes.h"
#include "GPU/ge_constants.h"

// Screen-space vertex consumed by the depth-only rasterizer.
struct DepthScreenVertex {
int x;  // Screen X in whole pixels.
int y;  // Screen Y in whole pixels.
uint16_t z;  // 16-bit depth value.
};

// Specialized, very limited depth-only rasterizer.
// Meant to run in parallel with hardware rendering, in games that read back the depth buffer
// for effects like lens flare.
// So, we can be quite inaccurate without any issues, and skip a lot of functionality.

class VertexDecoder;
void DepthRasterPrim(uint16_t *dest, int stride, int x1, int x2, int y1, int y2, void *bufferData, const void *vertexData, const void *indexData, GEPrimitiveType prim, int count, VertexDecoder *decoder, u32 vertexTypeID, bool clockwise);
struct TransformedVertex;

// Builds screen-space vertices for an indexed triangle list.
// `transformed` holds 4 floats (x, y, z, w) per vertex and is looked up through `indexBuffer`;
// `count` is the number of indices, consumed three at a time.
// Triangles with any vertex whose w is negative are dropped (no clipping yet).
// Returns the number of vertices written to `screenVerts`.
int DepthRasterClipIndexedTriangles(DepthScreenVertex *screenVerts, const float *transformed, const uint16_t *indexBuffer, int count);
// Reads `count` vertex positions (8-bit, 16-bit or float, selected by `vertTypeID`) from `vertexData`,
// multiplies each by `worldviewproj`, and stores the result at a stride of 4 floats per vertex in `dest`.
// Skinned and morphed vertex types are not supported (debug-asserted).
void DecodeAndTransformForDepthRaster(float *dest, GEPrimitiveType prim, const float *worldviewproj, const void *vertexData, int count, VertexDecoder *dec, u32 vertTypeID);
// Converts `count` already-transformed vertices to integer screen-space vertices:
// pos[0]/pos[1] become x/y and pos[2] becomes the 16-bit z.
void DepthRasterConvertTransformed(DepthScreenVertex *screenVerts, const TransformedVertex *transformed, int count);
// Rasterizes `count` screen-space vertices into the `depth` buffer.
// `prim` must be GE_PRIM_RECTANGLES or GE_PRIM_TRIANGLES (debug-asserted).
// x1/y1/x2/y2 are forwarded to the triangle rasterizer, which treats them as the scissor rect.
void DepthRasterScreenVerts(uint16_t *depth, int depthStride, GEPrimitiveType prim, int x1, int y1, int x2, int y2, const DepthScreenVertex *screenVerts, int count);
Loading

0 comments on commit 9f7802a

Please sign in to comment.