From 62e2d657c868ee7c7ad6a24269c81a9827c66b8f Mon Sep 17 00:00:00 2001 From: Tim Rowley Date: Fri, 29 Sep 2017 14:45:16 -0500 Subject: [PATCH] swr/rast: Miscellaneous viewport array code changes Reviewed-by: Bruce Cherniak --- .../drivers/swr/rasterizer/core/binner.cpp | 45 ++++++++++++++----- .../drivers/swr/rasterizer/core/clip.h | 14 ++++-- .../drivers/swr/rasterizer/core/frontend.cpp | 22 +++++---- src/gallium/drivers/swr/rasterizer/core/pa.h | 24 +++++----- .../drivers/swr/rasterizer/core/pa_avx.cpp | 4 +- 5 files changed, 71 insertions(+), 38 deletions(-) diff --git a/src/gallium/drivers/swr/rasterizer/core/binner.cpp b/src/gallium/drivers/swr/rasterizer/core/binner.cpp index e08e4896f3b..b624ae69b34 100644 --- a/src/gallium/drivers/swr/rasterizer/core/binner.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/binner.cpp @@ -450,16 +450,22 @@ void SIMDCALL BinTrianglesImpl( typename SIMD_T::Float vRecipW1 = SIMD_T::set1_ps(1.0f); typename SIMD_T::Float vRecipW2 = SIMD_T::set1_ps(1.0f); - typename SIMD_T::Integer viewportIdx = SIMD_T::set1_epi32(0); + typename SIMD_T::Integer viewportIdx = SIMD_T::setzero_si(); + typename SIMD_T::Vec4 vpiAttrib[3]; + typename SIMD_T::Integer vpai = SIMD_T::setzero_si(); if (state.backendState.readViewportArrayIndex) { - typename SIMD_T::Vec4 vpiAttrib[3]; pa.Assemble(VERTEX_SGV_SLOT, vpiAttrib); + vpai = SIMD_T::castps_si(vpiAttrib[0][VERTEX_SGV_VAI_COMP]); + } + + + if (state.backendState.readViewportArrayIndex) // VPAIOffsets are guaranteed 0-15 -- no OOB issues if they are offsets from 0 + { // OOB indices => forced to zero. - typename SIMD_T::Integer vpai = SIMD_T::castps_si(vpiAttrib[0][VERTEX_SGV_VAI_COMP]); - vpai = SIMD_T::max_epi32(SIMD_T::setzero_si(), vpai); + vpai = SIMD_T::max_epi32(vpai, SIMD_T::setzero_si()); typename SIMD_T::Integer vNumViewports = SIMD_T::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS); typename SIMD_T::Integer vClearMask = SIMD_T::cmplt_epi32(vpai, vNumViewports); viewportIdx = SIMD_T::and_si(vClearMask, vpai); @@ -815,6 +821,7 @@ endBinTriangles: SIMD_T::store_si(reinterpret_cast(aRTAI), SIMD_T::setzero_si()); } + // scan remaining valid triangles and bin each separately while (_BitScanForward(&triIndex, triMask)) { @@ -1299,15 +1306,22 @@ void BinPointsImpl( const SWR_RASTSTATE& rastState = state.rastState; // Read back viewport index if required - typename SIMD_T::Integer viewportIdx = SIMD_T::set1_epi32(0); + typename SIMD_T::Integer viewportIdx = SIMD_T::setzero_si(); + typename SIMD_T::Vec4 vpiAttrib[1]; + typename SIMD_T::Integer vpai = SIMD_T::setzero_si(); + if (state.backendState.readViewportArrayIndex) { - typename SIMD_T::Vec4 vpiAttrib[1]; pa.Assemble(VERTEX_SGV_SLOT, vpiAttrib); + vpai = SIMD_T::castps_si(vpiAttrib[0][VERTEX_SGV_VAI_COMP]); + } + + + if (state.backendState.readViewportArrayIndex) // VPAIOffsets are guaranteed 0-15 -- no OOB issues if they are offsets from 0 + { // OOB indices => forced to zero. - typename SIMD_T::Integer vpai = SIMD_T::castps_si(vpiAttrib[0][VERTEX_SGV_VAI_COMP]); - vpai = SIMD_T::max_epi32(SIMD_T::setzero_si(), vpai); + vpai = SIMD_T::max_epi32(vpai, SIMD_T::setzero_si()); typename SIMD_T::Integer vNumViewports = SIMD_T::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS); typename SIMD_T::Integer vClearMask = SIMD_T::cmplt_epi32(vpai, vNumViewports); viewportIdx = SIMD_T::and_si(vClearMask, vpai); @@ -1626,15 +1640,22 @@ void SIMDCALL BinLinesImpl( typename SIMD_T::Float vRecipW[2] = { SIMD_T::set1_ps(1.0f), SIMD_T::set1_ps(1.0f) }; - typename SIMD_T::Integer viewportIdx = SIMD_T::set1_epi32(0); + typename SIMD_T::Integer viewportIdx = SIMD_T::setzero_si(); + typename SIMD_T::Vec4 vpiAttrib[2]; + typename SIMD_T::Integer vpai = SIMD_T::setzero_si(); + if (state.backendState.readViewportArrayIndex) { - typename SIMD_T::Vec4 vpiAttrib[2]; pa.Assemble(VERTEX_SGV_SLOT, vpiAttrib); + vpai = SIMD_T::castps_si(vpiAttrib[0][VERTEX_SGV_VAI_COMP]); + } + + + if (state.backendState.readViewportArrayIndex) // VPAIOffsets are guaranteed 0-15 -- no OOB issues if they are offsets from 0 + { // OOB indices => forced to zero. - typename SIMD_T::Integer vpai = SIMD_T::castps_si(vpiAttrib[0][VERTEX_SGV_VAI_COMP]); - vpai = SIMD_T::max_epi32(SIMD_T::setzero_si(), vpai); + vpai = SIMD_T::max_epi32(vpai, SIMD_T::setzero_si()); typename SIMD_T::Integer vNumViewports = SIMD_T::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS); typename SIMD_T::Integer vClearMask = SIMD_T::cmplt_epi32(vpai, vNumViewports); viewportIdx = SIMD_T::and_si(vClearMask, vpai); diff --git a/src/gallium/drivers/swr/rasterizer/core/clip.h b/src/gallium/drivers/swr/rasterizer/core/clip.h index e9a410daa31..0d3d78057ff 100644 --- a/src/gallium/drivers/swr/rasterizer/core/clip.h +++ b/src/gallium/drivers/swr/rasterizer/core/clip.h @@ -641,7 +641,7 @@ public: } } - PA_STATE_OPT clipPA(pDC, numEmittedPrims, reinterpret_cast(&transposedPrims[0]), numEmittedVerts, SWR_VTX_NUM_SLOTS, true, clipTopology); + PA_STATE_OPT clipPA(pDC, numEmittedPrims, reinterpret_cast(&transposedPrims[0]), numEmittedVerts, SWR_VTX_NUM_SLOTS, true, NumVertsPerPrim, clipTopology); static const uint32_t primMaskMap[] = { 0x0, 0x1, 0x3, 0x7, 0xf, 0x1f, 0x3f, 0x7f }; @@ -687,15 +687,21 @@ public: UPDATE_STAT_FE(CInvocations, numInvoc); // Read back viewport index if required - typename SIMD_T::Integer viewportIdx = SIMD_T::set1_epi32(0); + typename SIMD_T::Integer viewportIdx = SIMD_T::setzero_si(); + typename SIMD_T::Vec4 vpiAttrib[NumVertsPerPrim]; + typename SIMD_T::Integer vpai = SIMD_T::setzero_si(); if (state.backendState.readViewportArrayIndex) { - typename SIMD_T::Vec4 vpiAttrib[NumVertsPerPrim]; pa.Assemble(VERTEX_SGV_SLOT, vpiAttrib); + vpai = SIMD_T::castps_si(vpiAttrib[0][VERTEX_SGV_VAI_COMP]); + } + + + if (state.backendState.readViewportArrayIndex) // VPAIOffsets are guaranteed 0-15 -- no OOB issues if they are offsets from 0 + { // OOB indices => forced to zero. - typename SIMD_T::Integer vpai = SIMD_T::castps_si(vpiAttrib[0][VERTEX_SGV_VAI_COMP]); vpai = SIMD_T::max_epi32(vpai, SIMD_T::setzero_si()); typename SIMD_T::Integer vNumViewports = SIMD_T::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS); typename SIMD_T::Integer vClearMask = SIMD_T::cmplt_epi32(vpai, vNumViewports); diff --git a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp index aea8e88de4d..a803512b7cc 100644 --- a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp @@ -951,7 +951,7 @@ static void GeometryShaderStage( } #if USE_SIMD16_FRONTEND - PA_STATE_CUT gsPa(pDC, (uint8_t*)pGsBuffers->pGsTransposed, numEmittedVerts, pState->outputVertexSize, reinterpret_cast(pCutBuffer), numEmittedVerts, numAttribs, pState->outputTopology, processCutVerts); + PA_STATE_CUT gsPa(pDC, (uint8_t*)pGsBuffers->pGsTransposed, numEmittedVerts, pState->outputVertexSize, reinterpret_cast(pCutBuffer), numEmittedVerts, numAttribs, pState->outputTopology, processCutVerts, pa.numVertsPerPrim); #else PA_STATE_CUT gsPa(pDC, (uint8_t*)pGsBuffers->pGsTransposed, numEmittedVerts, pState->outputVertexSize, pCutBuffer, numEmittedVerts, numAttribs, pState->outputTopology, processCutVerts); @@ -986,9 +986,10 @@ static void GeometryShaderStage( { #if USE_SIMD16_FRONTEND simd16scalari vPrimId = _simd16_set1_epi32(pPrimitiveId[inputPrim]); - - gsPa.useAlternateOffset = false; - pfnClipFunc(pDC, gsPa, workerId, attrib_simd16, GenMask(gsPa.NumPrims()), vPrimId); + { + gsPa.useAlternateOffset = false; + pfnClipFunc(pDC, gsPa, workerId, attrib_simd16, GenMask(gsPa.NumPrims()), vPrimId); + } #else simdscalari vPrimId = _simd_set1_epi32(pPrimitiveId[inputPrim]); pfnClipFunc(pDC, gsPa, workerId, attrib, GenMask(gsPa.NumPrims()), vPrimId); @@ -1273,7 +1274,8 @@ static void TessellationStages( tsState.numDsOutputAttribs, tsData.ppIndices, tsData.NumPrimitives, - tsState.postDSTopology); + tsState.postDSTopology, + numVertsPerPrim); while (tessPa.HasWork()) { @@ -1498,7 +1500,8 @@ void ProcessDraw( } // choose primitive assembler - PA_FACTORY paFactory(pDC, state.topology, work.numVerts, gpVertexStore, numVerts, state.frontendState.vsVertexSize); + + PA_FACTORY paFactory(pDC, state.topology, work.numVerts, gpVertexStore, numVerts, state.frontendState.vsVertexSize, GetNumVerts(state.topology, 1)); PA_STATE& pa = paFactory.GetPA(); #if USE_SIMD16_FRONTEND @@ -1727,9 +1730,10 @@ void ProcessDraw( if (HasRastT::value) { SWR_ASSERT(pDC->pState->pfnProcessPrims_simd16); - - pa.useAlternateOffset = false; - pDC->pState->pfnProcessPrims_simd16(pDC, pa, workerId, prim_simd16, GenMask(numPrims), primID); + { + pa.useAlternateOffset = false; + pDC->pState->pfnProcessPrims_simd16(pDC, pa, workerId, prim_simd16, GenMask(numPrims), primID); + } } } } diff --git a/src/gallium/drivers/swr/rasterizer/core/pa.h b/src/gallium/drivers/swr/rasterizer/core/pa.h index e76dc044d7c..13f99cb5461 100644 --- a/src/gallium/drivers/swr/rasterizer/core/pa.h +++ b/src/gallium/drivers/swr/rasterizer/core/pa.h @@ -77,11 +77,12 @@ struct PA_STATE #if ENABLE_AVX512_SIMD16 bool useAlternateOffset{ false }; + uint32_t numVertsPerPrim{ 0 }; #endif - PA_STATE() {} - PA_STATE(DRAW_CONTEXT *in_pDC, uint8_t* in_pStreamBase, uint32_t in_streamSizeInVerts, uint32_t in_vertexStride) : - pDC(in_pDC), pStreamBase(in_pStreamBase), streamSizeInVerts(in_streamSizeInVerts), vertexStride(in_vertexStride) {} + PA_STATE(){} + PA_STATE(DRAW_CONTEXT *in_pDC, uint8_t* in_pStreamBase, uint32_t in_streamSizeInVerts, uint32_t in_vertexStride, uint32_t in_numVertsPerPrim) : + pDC(in_pDC), pStreamBase(in_pStreamBase), streamSizeInVerts(in_streamSizeInVerts), vertexStride(in_vertexStride), numVertsPerPrim(in_numVertsPerPrim) {} virtual bool HasWork() = 0; virtual simdvector& GetSimdVector(uint32_t index, uint32_t slot) = 0; @@ -165,7 +166,7 @@ struct PA_STATE_OPT : public PA_STATE PA_STATE_OPT() {} PA_STATE_OPT(DRAW_CONTEXT* pDC, uint32_t numPrims, uint8_t* pStream, uint32_t streamSizeInVerts, - uint32_t vertexStride, bool in_isStreaming, PRIMITIVE_TOPOLOGY topo = TOP_UNKNOWN); + uint32_t vertexStride, bool in_isStreaming, uint32_t numVertsPerPrim, PRIMITIVE_TOPOLOGY topo = TOP_UNKNOWN); bool HasWork() { @@ -430,8 +431,8 @@ struct PA_STATE_CUT : public PA_STATE PA_STATE_CUT() {} PA_STATE_CUT(DRAW_CONTEXT* pDC, uint8_t* in_pStream, uint32_t in_streamSizeInVerts, uint32_t in_vertexStride, SIMDMASK* in_pIndices, uint32_t in_numVerts, - uint32_t in_numAttribs, PRIMITIVE_TOPOLOGY topo, bool in_processCutVerts) - : PA_STATE(pDC, in_pStream, in_streamSizeInVerts, in_vertexStride) + uint32_t in_numAttribs, PRIMITIVE_TOPOLOGY topo, bool in_processCutVerts, uint32_t in_numVertsPerPrim) + : PA_STATE(pDC, in_pStream, in_streamSizeInVerts, in_vertexStride, in_numVertsPerPrim) { numVerts = in_streamSizeInVerts; numAttribs = in_numAttribs; @@ -1144,9 +1145,10 @@ struct PA_TESS : PA_STATE uint32_t in_numAttributes, uint32_t* (&in_ppIndices)[3], uint32_t in_numPrims, - PRIMITIVE_TOPOLOGY in_binTopology) : + PRIMITIVE_TOPOLOGY in_binTopology, + uint32_t numVertsPerPrim) : - PA_STATE(in_pDC, nullptr, 0, in_vertexStride), + PA_STATE(in_pDC, nullptr, 0, in_vertexStride, numVertsPerPrim), m_pVertexData(in_pVertData), m_attributeStrideInVectors(in_attributeStrideInVectors), m_numAttributes(in_numAttributes), @@ -1416,7 +1418,7 @@ private: template struct PA_FACTORY { - PA_FACTORY(DRAW_CONTEXT* pDC, PRIMITIVE_TOPOLOGY in_topo, uint32_t numVerts, PA_STATE::SIMDVERTEX *pVertexStore, uint32_t vertexStoreSize, uint32_t vertexStride) : topo(in_topo) + PA_FACTORY(DRAW_CONTEXT* pDC, PRIMITIVE_TOPOLOGY in_topo, uint32_t numVerts, PA_STATE::SIMDVERTEX *pVertexStore, uint32_t vertexStoreSize, uint32_t vertexStride, uint32_t numVertsPerPrim) : topo(in_topo) { #if KNOB_ENABLE_CUT_AWARE_PA == TRUE const API_STATE& state = GetApiState(pDC); @@ -1433,14 +1435,14 @@ struct PA_FACTORY uint32_t numAttribs = state.feNumAttributes; new (&this->paCut) PA_STATE_CUT(pDC, reinterpret_cast(pVertexStore), vertexStoreSize * PA_STATE::SIMD_WIDTH, - vertexStride, &this->indexStore[0], numVerts, numAttribs, state.topology, false); + vertexStride, &this->indexStore[0], numVerts, numAttribs, state.topology, false, numVertsPerPrim); cutPA = true; } else #endif { uint32_t numPrims = GetNumPrims(in_topo, numVerts); - new (&this->paOpt) PA_STATE_OPT(pDC, numPrims, reinterpret_cast(pVertexStore), vertexStoreSize * PA_STATE::SIMD_WIDTH, vertexStride, false); + new (&this->paOpt) PA_STATE_OPT(pDC, numPrims, reinterpret_cast(pVertexStore), vertexStoreSize * PA_STATE::SIMD_WIDTH, vertexStride, false, numVertsPerPrim); cutPA = false; } diff --git a/src/gallium/drivers/swr/rasterizer/core/pa_avx.cpp b/src/gallium/drivers/swr/rasterizer/core/pa_avx.cpp index e53389b63fc..3bf66b382b9 100644 --- a/src/gallium/drivers/swr/rasterizer/core/pa_avx.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/pa_avx.cpp @@ -2588,8 +2588,8 @@ void PaRectListSingle0( } PA_STATE_OPT::PA_STATE_OPT(DRAW_CONTEXT *in_pDC, uint32_t in_numPrims, uint8_t* pStream, uint32_t in_streamSizeInVerts, - uint32_t in_vertexStride, bool in_isStreaming, PRIMITIVE_TOPOLOGY topo) : - PA_STATE(in_pDC, pStream, in_streamSizeInVerts, in_vertexStride), numPrims(in_numPrims), numPrimsComplete(0), numSimdPrims(0), + uint32_t in_vertexStride, bool in_isStreaming, uint32_t numVertsPerPrim, PRIMITIVE_TOPOLOGY topo) : + PA_STATE(in_pDC, pStream, in_streamSizeInVerts, in_vertexStride, numVertsPerPrim), numPrims(in_numPrims), numPrimsComplete(0), numSimdPrims(0), cur(0), prev(0), first(0), counter(0), reset(false), pfnPaFunc(nullptr), isStreaming(in_isStreaming) { const API_STATE& state = GetApiState(pDC); -- 2.30.2