Viewport transform performed based on per-prim viewport index if available.
Signed-off-by: Tim Rowley <timothy.o.rowley@intel.com>
if (pMatrices != nullptr)
{
- //memcpy(&pState->vpMatrix[0], pMatrices, sizeof(SWR_VIEWPORT_MATRIX) * numViewports);
// @todo Faster to copy portions of the SOA or just copy all of it?
memcpy(&pState->vpMatrices, pMatrices, sizeof(SWR_VIEWPORT_MATRICES));
}
return;
}
-void ClipTriangles(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId)
+void ClipTriangles(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId, simdscalari viewportIdx)
{
RDTSC_START(FEClipTriangles);
Clipper<3> clipper(workerId, pDC);
- clipper.ExecuteStage(pa, prims, primMask, primId);
+ clipper.ExecuteStage(pa, prims, primMask, primId, viewportIdx);
RDTSC_STOP(FEClipTriangles, 1, 0);
}
-void ClipLines(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId)
+void ClipLines(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId, simdscalari viewportIdx)
{
RDTSC_START(FEClipLines);
Clipper<2> clipper(workerId, pDC);
- clipper.ExecuteStage(pa, prims, primMask, primId);
+ clipper.ExecuteStage(pa, prims, primMask, primId, viewportIdx);
RDTSC_STOP(FEClipLines, 1, 0);
}
-void ClipPoints(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId)
+void ClipPoints(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId, simdscalari viewportIdx)
{
RDTSC_START(FEClipPoints);
Clipper<1> clipper(workerId, pDC);
- clipper.ExecuteStage(pa, prims, primMask, primId);
+ clipper.ExecuteStage(pa, prims, primMask, primId, viewportIdx);
RDTSC_STOP(FEClipPoints, 1, 0);
}
}
// clip SIMD primitives
- void ClipSimd(const simdscalar& vPrimMask, const simdscalar& vClipMask, PA_STATE& pa, const simdscalari& vPrimId)
+ void ClipSimd(const simdscalar& vPrimMask, const simdscalar& vClipMask, PA_STATE& pa, const simdscalari& vPrimId, const simdscalari& vViewportIdx)
{
// input/output vertex store for clipper
simdvertex vertices[7]; // maximum 7 verts generated per triangle
uint32_t* pVertexCount = (uint32_t*)&vNumClippedVerts;
uint32_t* pPrimitiveId = (uint32_t*)&vPrimId;
+ uint32_t* pViewportIdx = (uint32_t*)&vViewportIdx;
const simdscalari vOffsets = _mm256_set_epi32(
0 * sizeof(simdvertex), // unused lane
if (assemble)
{
static const uint32_t primMaskMap[] = { 0x0, 0x1, 0x3, 0x7, 0xf, 0x1f, 0x3f, 0x7f, 0xff };
- pfnBinFunc(this->pDC, clipPa, this->workerId, attrib, primMaskMap[numEmittedPrims], _simd_set1_epi32(pPrimitiveId[inputPrim]));
+ pfnBinFunc(this->pDC, clipPa, this->workerId, attrib, primMaskMap[numEmittedPrims], _simd_set1_epi32(pPrimitiveId[inputPrim]), _simd_set1_epi32(pViewportIdx[inputPrim]));
}
} while (clipPa.NextPrim());
}
}
// execute the clipper stage
- void ExecuteStage(PA_STATE& pa, simdvector prim[], uint32_t primMask, simdscalari primId)
+ void ExecuteStage(PA_STATE& pa, simdvector prim[], uint32_t primMask, simdscalari primId, simdscalari viewportIdx)
{
// set up binner based on PA state
PFN_PROCESS_PRIMS pfnBinner;
RDTSC_START(FEGuardbandClip);
// we have to clip tris, execute the clipper, which will also
// call the binner
- ClipSimd(vMask(primMask), vMask(clipMask), pa, primId);
+ ClipSimd(vMask(primMask), vMask(clipMask), pa, primId, viewportIdx);
RDTSC_STOP(FEGuardbandClip, 1, 0);
}
else if (validMask)
UPDATE_STAT_FE(CPrimitives, _mm_popcnt_u32(validMask));
// forward valid prims directly to binner
- pfnBinner(this->pDC, pa, this->workerId, prim, validMask, primId);
+ pfnBinner(this->pDC, pa, this->workerId, prim, validMask, primId, viewportIdx);
}
}
// pipeline stage functions
-void ClipTriangles(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId);
-void ClipLines(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId);
-void ClipPoints(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId);
+void ClipTriangles(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId, simdscalari viewportIdx);
+void ClipLines(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId, simdscalari viewportIdx);
+void ClipPoints(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId, simdscalari viewportIdx);
// function signature for pipeline stages that execute after primitive assembly
typedef void(*PFN_PROCESS_PRIMS)(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[],
- uint32_t primMask, simdscalari primID);
+ uint32_t primMask, simdscalari primID, simdscalari viewportIdx);
OSALIGNLINE(struct) API_STATE
{
vPrimId = _simd_set1_epi32(pPrimitiveId[inputPrim]);
}
- pfnClipFunc(pDC, gsPa, workerId, attrib, GenMask(gsPa.NumPrims()), vPrimId);
+ // use viewport array index if GS declares it as an output attribute. Otherwise use index 0.
+ simdscalari vViewPortIdx;
+ if (state.gsState.emitsViewportArrayIndex)
+ {
+ simdvector vpiAttrib[3];
+ gsPa.Assemble(VERTEX_VIEWPORT_ARRAY_INDEX_SLOT, vpiAttrib);
+
+ // OOB indices => forced to zero.
+ simdscalari vNumViewports = _simd_set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
+ simdscalar vClearMask = _simd_cmplt_ps(vpiAttrib[0].x, _simd_castsi_ps(vNumViewports));
+ vpiAttrib[0].x = _simd_and_ps(vClearMask, vpiAttrib[0].x);
+
+ vViewPortIdx = _simd_castps_si(vpiAttrib[0].x);
+ }
+ else
+ {
+ vViewPortIdx = _simd_set1_epi32(0);
+ }
+
+ pfnClipFunc(pDC, gsPa, workerId, attrib, GenMask(gsPa.NumPrims()), vPrimId, vViewPortIdx);
}
}
} while (gsPa.NextPrim());
SWR_ASSERT(pfnClipFunc);
pfnClipFunc(pDC, tessPa, workerId, prim,
- GenMask(tessPa.NumPrims()), _simd_set1_epi32(dsContext.PrimitiveID));
+ GenMask(tessPa.NumPrims()), _simd_set1_epi32(dsContext.PrimitiveID), _simd_set1_epi32(0));
}
}
{
SWR_ASSERT(pDC->pState->pfnProcessPrims);
pDC->pState->pfnProcessPrims(pDC, pa, workerId, prim,
- GenMask(pa.NumPrims()), pa.GetPrimID(work.startPrimID));
+ GenMask(pa.NumPrims()), pa.GetPrimID(work.startPrimID), _simd_set1_epi32(0));
}
}
}
/// @param workerId - thread's worker id. Even thread has a unique id.
/// @param tri - Contains triangle position data for SIMDs worth of triangles.
/// @param primID - Primitive ID for each triangle.
+/// @param viewportIdx - viewport array index for each triangle.
/// @tparam CT - ConservativeRastFETraits
template <typename CT>
void BinTriangles(
uint32_t workerId,
simdvector tri[3],
uint32_t triMask,
- simdscalari primID)
+ simdscalari primID,
+ simdscalari viewportIdx)
{
RDTSC_START(FEBinTriangles);
tri[2].v[2] = _simd_mul_ps(tri[2].v[2], vRecipW2);
// viewport transform to screen coords
- viewportTransform<3>(tri, state.vpMatrices);
+ if (state.gsState.emitsViewportArrayIndex)
+ {
+ viewportTransform<3>(tri, state.vpMatrices, viewportIdx);
+ }
+ else
+ {
+ viewportTransform<3>(tri, state.vpMatrices);
+ }
}
// adjust for pixel center location
uint32_t workerId,
simdvector prim[3],
uint32_t primMask,
- simdscalari primID)
+ simdscalari primID,
+ simdscalari viewportIdx)
{
RDTSC_START(FEBinPoints);
primVerts.z = _simd_mul_ps(primVerts.z, vRecipW0);
// viewport transform to screen coords
- viewportTransform<1>(&primVerts, state.vpMatrices);
+ if (state.gsState.emitsViewportArrayIndex)
+ {
+ viewportTransform<1>(&primVerts, state.vpMatrices, viewportIdx);
+ }
+ else
+ {
+ viewportTransform<1>(&primVerts, state.vpMatrices);
+ }
}
// adjust for pixel center location
uint32_t workerId,
simdvector prim[],
uint32_t primMask,
- simdscalari primID)
+ simdscalari primID,
+ simdscalari viewportIdx)
{
RDTSC_START(FEBinLines);
prim[1].v[2] = _simd_mul_ps(prim[1].v[2], vRecipW1);
// viewport transform to screen coords
- viewportTransform<2>(prim, state.vpMatrices);
+ if (state.gsState.emitsViewportArrayIndex)
+ {
+ viewportTransform<2>(prim, state.vpMatrices, viewportIdx);
+ }
+ else
+ {
+ viewportTransform<2>(prim, state.vpMatrices);
+ }
}
// adjust for pixel center location
}
}
+template<uint32_t NumVerts>
+INLINE
+void viewportTransform(simdvector *v, const SWR_VIEWPORT_MATRICES & vpMatrices, simdscalari vViewportIdx)
+{
+ // perform a gather of each matrix element based on the viewport array indexes
+ simdscalar m00 = _simd_i32gather_ps(&vpMatrices.m00[0], vViewportIdx, 1);
+ simdscalar m30 = _simd_i32gather_ps(&vpMatrices.m30[0], vViewportIdx, 1);
+ simdscalar m11 = _simd_i32gather_ps(&vpMatrices.m11[0], vViewportIdx, 1);
+ simdscalar m31 = _simd_i32gather_ps(&vpMatrices.m31[0], vViewportIdx, 1);
+ simdscalar m22 = _simd_i32gather_ps(&vpMatrices.m22[0], vViewportIdx, 1);
+ simdscalar m32 = _simd_i32gather_ps(&vpMatrices.m32[0], vViewportIdx, 1);
+
+ for (uint32_t i = 0; i < NumVerts; ++i)
+ {
+ v[i].x = _simd_fmadd_ps(v[i].x, m00, m30);
+ v[i].y = _simd_fmadd_ps(v[i].y, m11, m31);
+ v[i].z = _simd_fmadd_ps(v[i].z, m22, m32);
+ }
+}
+
INLINE
void calcBoundingBoxInt(const __m128i &vX, const __m128i &vY, BBOX &bbox)
{
PFN_PROCESS_PRIMS GetBinTrianglesFunc(bool IsConservative);
struct PA_STATE_BASE; // forward decl
-void BinPoints(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[3], uint32_t primMask, simdscalari primID);
-void BinLines(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[3], uint32_t primMask, simdscalari primID);
+void BinPoints(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[3], uint32_t primMask, simdscalari primID, simdscalari viewportIdx);
+void BinLines(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[3], uint32_t primMask, simdscalari primID, simdscalari viewportIdx);