From 92621ac5d526e73469c43d524068315a81bbc869 Mon Sep 17 00:00:00 2001 From: Tim Rowley Date: Mon, 8 Aug 2016 13:08:39 -0600 Subject: [PATCH] swr: [rasterizer core] routing of viewport indexes through frontend Viewport transform performed based on per-prim viewport index if available. Signed-off-by: Tim Rowley --- .../drivers/swr/rasterizer/core/api.cpp | 1 - .../drivers/swr/rasterizer/core/clip.cpp | 12 ++-- .../drivers/swr/rasterizer/core/clip.h | 17 ++--- .../drivers/swr/rasterizer/core/context.h | 2 +- .../drivers/swr/rasterizer/core/frontend.cpp | 62 ++++++++++++++++--- .../drivers/swr/rasterizer/core/frontend.h | 24 ++++++- 6 files changed, 91 insertions(+), 27 deletions(-) diff --git a/src/gallium/drivers/swr/rasterizer/core/api.cpp b/src/gallium/drivers/swr/rasterizer/core/api.cpp index d6aa80d678f..15485012a08 100644 --- a/src/gallium/drivers/swr/rasterizer/core/api.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/api.cpp @@ -676,7 +676,6 @@ void SwrSetViewports( if (pMatrices != nullptr) { - //memcpy(&pState->vpMatrix[0], pMatrices, sizeof(SWR_VIEWPORT_MATRIX) * numViewports); // @todo Faster to copy portions of the SOA or just copy all of it? memcpy(&pState->vpMatrices, pMatrices, sizeof(SWR_VIEWPORT_MATRICES)); } diff --git a/src/gallium/drivers/swr/rasterizer/core/clip.cpp b/src/gallium/drivers/swr/rasterizer/core/clip.cpp index e624fd8f674..21cbb0a0629 100644 --- a/src/gallium/drivers/swr/rasterizer/core/clip.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/clip.cpp @@ -179,26 +179,26 @@ void Clip(const float *pTriangle, const float *pAttribs, int numAttribs, float * return; } -void ClipTriangles(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId) +void ClipTriangles(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId, simdscalari viewportIdx) { RDTSC_START(FEClipTriangles); Clipper<3> clipper(workerId, pDC); - clipper.ExecuteStage(pa, prims, primMask, primId); + clipper.ExecuteStage(pa, prims, primMask, primId, viewportIdx); RDTSC_STOP(FEClipTriangles, 1, 0); } -void ClipLines(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId) +void ClipLines(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId, simdscalari viewportIdx) { RDTSC_START(FEClipLines); Clipper<2> clipper(workerId, pDC); - clipper.ExecuteStage(pa, prims, primMask, primId); + clipper.ExecuteStage(pa, prims, primMask, primId, viewportIdx); RDTSC_STOP(FEClipLines, 1, 0); } -void ClipPoints(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId) +void ClipPoints(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId, simdscalari viewportIdx) { RDTSC_START(FEClipPoints); Clipper<1> clipper(workerId, pDC); - clipper.ExecuteStage(pa, prims, primMask, primId); + clipper.ExecuteStage(pa, prims, primMask, primId, viewportIdx); RDTSC_STOP(FEClipPoints, 1, 0); } diff --git a/src/gallium/drivers/swr/rasterizer/core/clip.h b/src/gallium/drivers/swr/rasterizer/core/clip.h index a2ba76967fe..b173ae59b45 100644 --- a/src/gallium/drivers/swr/rasterizer/core/clip.h +++ b/src/gallium/drivers/swr/rasterizer/core/clip.h @@ -302,7 +302,7 @@ public: } // clip SIMD primitives - void ClipSimd(const simdscalar& vPrimMask, const simdscalar& vClipMask, PA_STATE& pa, const simdscalari& vPrimId) + void ClipSimd(const simdscalar& vPrimMask, const simdscalar& vClipMask, PA_STATE& pa, const simdscalari& vPrimId, const simdscalari& vViewportIdx) { // input/output vertex store for clipper simdvertex vertices[7]; // maximum 7 verts generated per triangle @@ -402,6 +402,7 @@ public: uint32_t* pVertexCount = (uint32_t*)&vNumClippedVerts; uint32_t* pPrimitiveId = (uint32_t*)&vPrimId; + uint32_t* pViewportIdx = (uint32_t*)&vViewportIdx; const simdscalari vOffsets = _mm256_set_epi32( 0 * sizeof(simdvertex), // unused lane @@ -487,7 +488,7 @@ public: if (assemble) { static const uint32_t primMaskMap[] = { 0x0, 0x1, 0x3, 0x7, 0xf, 0x1f, 0x3f, 0x7f, 0xff }; - pfnBinFunc(this->pDC, clipPa, this->workerId, attrib, primMaskMap[numEmittedPrims], _simd_set1_epi32(pPrimitiveId[inputPrim])); + pfnBinFunc(this->pDC, clipPa, this->workerId, attrib, primMaskMap[numEmittedPrims], _simd_set1_epi32(pPrimitiveId[inputPrim]), _simd_set1_epi32(pViewportIdx[inputPrim])); } } while (clipPa.NextPrim()); } @@ -499,7 +500,7 @@ public: } // execute the clipper stage - void ExecuteStage(PA_STATE& pa, simdvector prim[], uint32_t primMask, simdscalari primId) + void ExecuteStage(PA_STATE& pa, simdvector prim[], uint32_t primMask, simdscalari primId, simdscalari viewportIdx) { // set up binner based on PA state PFN_PROCESS_PRIMS pfnBinner; @@ -552,7 +553,7 @@ public: RDTSC_START(FEGuardbandClip); // we have to clip tris, execute the clipper, which will also // call the binner - ClipSimd(vMask(primMask), vMask(clipMask), pa, primId); + ClipSimd(vMask(primMask), vMask(clipMask), pa, primId, viewportIdx); RDTSC_STOP(FEGuardbandClip, 1, 0); } else if (validMask) @@ -562,7 +563,7 @@ public: UPDATE_STAT_FE(CPrimitives, _mm_popcnt_u32(validMask)); // forward valid prims directly to binner - pfnBinner(this->pDC, pa, this->workerId, prim, validMask, primId); + pfnBinner(this->pDC, pa, this->workerId, prim, validMask, primId, viewportIdx); } } @@ -948,6 +949,6 @@ private: // pipeline stage functions -void ClipTriangles(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId); -void ClipLines(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId); -void ClipPoints(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId); +void ClipTriangles(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId, simdscalari viewportIdx); +void ClipLines(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId, simdscalari viewportIdx); +void ClipPoints(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId, simdscalari viewportIdx); diff --git a/src/gallium/drivers/swr/rasterizer/core/context.h b/src/gallium/drivers/swr/rasterizer/core/context.h index 144fcefb208..320aa924c4f 100644 --- a/src/gallium/drivers/swr/rasterizer/core/context.h +++ b/src/gallium/drivers/swr/rasterizer/core/context.h @@ -215,7 +215,7 @@ struct PA_STATE; // function signature for pipeline stages that execute after primitive assembly typedef void(*PFN_PROCESS_PRIMS)(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], - uint32_t primMask, simdscalari primID); + uint32_t primMask, simdscalari primID, simdscalari viewportIdx); OSALIGNLINE(struct) API_STATE { diff --git a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp index 3014c7defc8..a62aa966c01 100644 --- a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp @@ -833,7 +833,26 @@ static void GeometryShaderStage( vPrimId = _simd_set1_epi32(pPrimitiveId[inputPrim]); } - pfnClipFunc(pDC, gsPa, workerId, attrib, GenMask(gsPa.NumPrims()), vPrimId); + // use viewport array index if GS declares it as an output attribute. Otherwise use index 0. + simdscalari vViewPortIdx; + if (state.gsState.emitsViewportArrayIndex) + { + simdvector vpiAttrib[3]; + gsPa.Assemble(VERTEX_VIEWPORT_ARRAY_INDEX_SLOT, vpiAttrib); + + // OOB indices => forced to zero. + simdscalari vNumViewports = _simd_set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS); + simdscalar vClearMask = _simd_cmplt_ps(vpiAttrib[0].x, _simd_castsi_ps(vNumViewports)); + vpiAttrib[0].x = _simd_and_ps(vClearMask, vpiAttrib[0].x); + + vViewPortIdx = _simd_castps_si(vpiAttrib[0].x); + } + else + { + vViewPortIdx = _simd_set1_epi32(0); + } + + pfnClipFunc(pDC, gsPa, workerId, attrib, GenMask(gsPa.NumPrims()), vPrimId, vViewPortIdx); } } } while (gsPa.NextPrim()); @@ -1104,7 +1123,7 @@ static void TessellationStages( SWR_ASSERT(pfnClipFunc); pfnClipFunc(pDC, tessPa, workerId, prim, - GenMask(tessPa.NumPrims()), _simd_set1_epi32(dsContext.PrimitiveID)); + GenMask(tessPa.NumPrims()), _simd_set1_epi32(dsContext.PrimitiveID), _simd_set1_epi32(0)); } } @@ -1359,7 +1378,7 @@ void ProcessDraw( { SWR_ASSERT(pDC->pState->pfnProcessPrims); pDC->pState->pfnProcessPrims(pDC, pa, workerId, prim, - GenMask(pa.NumPrims()), pa.GetPrimID(work.startPrimID)); + GenMask(pa.NumPrims()), pa.GetPrimID(work.startPrimID), _simd_set1_epi32(0)); } } } @@ -1727,6 +1746,7 @@ INLINE void calcBoundingBoxIntVertical(const simdvector * c /// @param workerId - thread's worker id. Even thread has a unique id. /// @param tri - Contains triangle position data for SIMDs worth of triangles. /// @param primID - Primitive ID for each triangle. +/// @param viewportIdx - viewport array index for each triangle. /// @tparam CT - ConservativeRastFETraits template void BinTriangles( @@ -1735,7 +1755,8 @@ void BinTriangles( uint32_t workerId, simdvector tri[3], uint32_t triMask, - simdscalari primID) + simdscalari primID, + simdscalari viewportIdx) { RDTSC_START(FEBinTriangles); @@ -1770,7 +1791,14 @@ void BinTriangles( tri[2].v[2] = _simd_mul_ps(tri[2].v[2], vRecipW2); // viewport transform to screen coords - viewportTransform<3>(tri, state.vpMatrices); + if (state.gsState.emitsViewportArrayIndex) + { + viewportTransform<3>(tri, state.vpMatrices, viewportIdx); + } + else + { + viewportTransform<3>(tri, state.vpMatrices); + } } // adjust for pixel center location @@ -2119,7 +2147,8 @@ void BinPoints( uint32_t workerId, simdvector prim[3], uint32_t primMask, - simdscalari primID) + simdscalari primID, + simdscalari viewportIdx) { RDTSC_START(FEBinPoints); @@ -2143,7 +2172,14 @@ void BinPoints( primVerts.z = _simd_mul_ps(primVerts.z, vRecipW0); // viewport transform to screen coords - viewportTransform<1>(&primVerts, state.vpMatrices); + if (state.gsState.emitsViewportArrayIndex) + { + viewportTransform<1>(&primVerts, state.vpMatrices, viewportIdx); + } + else + { + viewportTransform<1>(&primVerts, state.vpMatrices); + } } // adjust for pixel center location @@ -2429,7 +2465,8 @@ void BinLines( uint32_t workerId, simdvector prim[], uint32_t primMask, - simdscalari primID) + simdscalari primID, + simdscalari viewportIdx) { RDTSC_START(FEBinLines); @@ -2461,7 +2498,14 @@ void BinLines( prim[1].v[2] = _simd_mul_ps(prim[1].v[2], vRecipW1); // viewport transform to screen coords - viewportTransform<2>(prim, state.vpMatrices); + if (state.gsState.emitsViewportArrayIndex) + { + viewportTransform<2>(prim, state.vpMatrices, viewportIdx); + } + else + { + viewportTransform<2>(prim, state.vpMatrices); + } } // adjust for pixel center location diff --git a/src/gallium/drivers/swr/rasterizer/core/frontend.h b/src/gallium/drivers/swr/rasterizer/core/frontend.h index d47f17f4235..5e7762af2d5 100644 --- a/src/gallium/drivers/swr/rasterizer/core/frontend.h +++ b/src/gallium/drivers/swr/rasterizer/core/frontend.h @@ -219,6 +219,26 @@ void viewportTransform(simdvector *v, const SWR_VIEWPORT_MATRICES & vpMatrices) } } +template +INLINE +void viewportTransform(simdvector *v, const SWR_VIEWPORT_MATRICES & vpMatrices, simdscalari vViewportIdx) +{ + // perform a gather of each matrix element based on the viewport array indexes + simdscalar m00 = _simd_i32gather_ps(&vpMatrices.m00[0], vViewportIdx, 1); + simdscalar m30 = _simd_i32gather_ps(&vpMatrices.m30[0], vViewportIdx, 1); + simdscalar m11 = _simd_i32gather_ps(&vpMatrices.m11[0], vViewportIdx, 1); + simdscalar m31 = _simd_i32gather_ps(&vpMatrices.m31[0], vViewportIdx, 1); + simdscalar m22 = _simd_i32gather_ps(&vpMatrices.m22[0], vViewportIdx, 1); + simdscalar m32 = _simd_i32gather_ps(&vpMatrices.m32[0], vViewportIdx, 1); + + for (uint32_t i = 0; i < NumVerts; ++i) + { + v[i].x = _simd_fmadd_ps(v[i].x, m00, m30); + v[i].y = _simd_fmadd_ps(v[i].y, m11, m31); + v[i].z = _simd_fmadd_ps(v[i].z, m22, m32); + } +} + INLINE void calcBoundingBoxInt(const __m128i &vX, const __m128i &vY, BBOX &bbox) { @@ -288,6 +308,6 @@ void ProcessSync(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, vo PFN_PROCESS_PRIMS GetBinTrianglesFunc(bool IsConservative); struct PA_STATE_BASE; // forward decl -void BinPoints(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[3], uint32_t primMask, simdscalari primID); -void BinLines(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[3], uint32_t primMask, simdscalari primID); +void BinPoints(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[3], uint32_t primMask, simdscalari primID, simdscalari viewportIdx); +void BinLines(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[3], uint32_t primMask, simdscalari primID, simdscalari viewportIdx); -- 2.30.2