swr/rast: Miscellaneous viewport array code changes
author Tim Rowley <timothy.o.rowley@intel.com>
Fri, 29 Sep 2017 19:45:16 +0000 (14:45 -0500)
committer Tim Rowley <timothy.o.rowley@intel.com>
Thu, 19 Oct 2017 18:10:55 +0000 (13:10 -0500)
Reviewed-by: Bruce Cherniak <bruce.cherniak@intel.com>
src/gallium/drivers/swr/rasterizer/core/binner.cpp
src/gallium/drivers/swr/rasterizer/core/clip.h
src/gallium/drivers/swr/rasterizer/core/frontend.cpp
src/gallium/drivers/swr/rasterizer/core/pa.h
src/gallium/drivers/swr/rasterizer/core/pa_avx.cpp

index e08e4896f3b1891d802cf475128721a444469fa2..b624ae69b34990af53bccf398566eb2eedb1ef5f 100644 (file)
@@ -450,16 +450,22 @@ void SIMDCALL BinTrianglesImpl(
     typename SIMD_T::Float vRecipW1 = SIMD_T::set1_ps(1.0f);
     typename SIMD_T::Float vRecipW2 = SIMD_T::set1_ps(1.0f);
 
-    typename SIMD_T::Integer viewportIdx = SIMD_T::set1_epi32(0);
+    typename SIMD_T::Integer viewportIdx = SIMD_T::setzero_si();
+    typename SIMD_T::Vec4 vpiAttrib[3];
+    typename SIMD_T::Integer vpai = SIMD_T::setzero_si();
 
     if (state.backendState.readViewportArrayIndex)
     {
-        typename SIMD_T::Vec4 vpiAttrib[3];
         pa.Assemble(VERTEX_SGV_SLOT, vpiAttrib);
 
+        vpai = SIMD_T::castps_si(vpiAttrib[0][VERTEX_SGV_VAI_COMP]);
+    }
+
+
+    if (state.backendState.readViewportArrayIndex) // VPAIOffsets are guaranteed 0-15 -- no OOB issues if they are offsets from 0 
+    {
         // OOB indices => forced to zero.
-        typename SIMD_T::Integer vpai = SIMD_T::castps_si(vpiAttrib[0][VERTEX_SGV_VAI_COMP]);
-        vpai = SIMD_T::max_epi32(SIMD_T::setzero_si(), vpai);
+        vpai = SIMD_T::max_epi32(vpai, SIMD_T::setzero_si());
         typename SIMD_T::Integer vNumViewports = SIMD_T::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
         typename SIMD_T::Integer vClearMask = SIMD_T::cmplt_epi32(vpai, vNumViewports);
         viewportIdx = SIMD_T::and_si(vClearMask, vpai);
@@ -815,6 +821,7 @@ endBinTriangles:
         SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aRTAI), SIMD_T::setzero_si());
     }
 
+
     // scan remaining valid triangles and bin each separately
     while (_BitScanForward(&triIndex, triMask))
     {
@@ -1299,15 +1306,22 @@ void BinPointsImpl(
     const SWR_RASTSTATE& rastState = state.rastState;
 
     // Read back viewport index if required
-    typename SIMD_T::Integer viewportIdx = SIMD_T::set1_epi32(0);
+    typename SIMD_T::Integer viewportIdx = SIMD_T::setzero_si();
+    typename SIMD_T::Vec4 vpiAttrib[1];
+    typename SIMD_T::Integer vpai = SIMD_T::setzero_si();
+
     if (state.backendState.readViewportArrayIndex)
     {
-        typename SIMD_T::Vec4 vpiAttrib[1];
         pa.Assemble(VERTEX_SGV_SLOT, vpiAttrib);
 
+        vpai = SIMD_T::castps_si(vpiAttrib[0][VERTEX_SGV_VAI_COMP]);
+    }
+
+
+    if (state.backendState.readViewportArrayIndex) // VPAIOffsets are guaranteed 0-15 -- no OOB issues if they are offsets from 0 
+    {
         // OOB indices => forced to zero.
-        typename SIMD_T::Integer vpai = SIMD_T::castps_si(vpiAttrib[0][VERTEX_SGV_VAI_COMP]);
-        vpai = SIMD_T::max_epi32(SIMD_T::setzero_si(), vpai);
+        vpai = SIMD_T::max_epi32(vpai, SIMD_T::setzero_si());
         typename SIMD_T::Integer vNumViewports = SIMD_T::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
         typename SIMD_T::Integer vClearMask = SIMD_T::cmplt_epi32(vpai, vNumViewports);
         viewportIdx = SIMD_T::and_si(vClearMask, vpai);
@@ -1626,15 +1640,22 @@ void SIMDCALL BinLinesImpl(
 
     typename SIMD_T::Float vRecipW[2] = { SIMD_T::set1_ps(1.0f), SIMD_T::set1_ps(1.0f) };
 
-    typename SIMD_T::Integer viewportIdx = SIMD_T::set1_epi32(0);
+    typename SIMD_T::Integer viewportIdx = SIMD_T::setzero_si();
+    typename SIMD_T::Vec4 vpiAttrib[2];
+    typename SIMD_T::Integer vpai = SIMD_T::setzero_si();
+
     if (state.backendState.readViewportArrayIndex)
     {
-        typename SIMD_T::Vec4 vpiAttrib[2];
         pa.Assemble(VERTEX_SGV_SLOT, vpiAttrib);
 
+        vpai = SIMD_T::castps_si(vpiAttrib[0][VERTEX_SGV_VAI_COMP]);
+    }
+
+
+    if (state.backendState.readViewportArrayIndex) // VPAIOffsets are guaranteed 0-15 -- no OOB issues if they are offsets from 0 
+    {
         // OOB indices => forced to zero.
-        typename SIMD_T::Integer vpai = SIMD_T::castps_si(vpiAttrib[0][VERTEX_SGV_VAI_COMP]);
-        vpai = SIMD_T::max_epi32(SIMD_T::setzero_si(), vpai);
+        vpai = SIMD_T::max_epi32(vpai, SIMD_T::setzero_si());
         typename SIMD_T::Integer vNumViewports = SIMD_T::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
         typename SIMD_T::Integer vClearMask = SIMD_T::cmplt_epi32(vpai, vNumViewports);
         viewportIdx = SIMD_T::and_si(vClearMask, vpai);
index e9a410daa31967ba20ef7484790dfabfd0fa3180..0d3d78057ff29df57d445cc33df878203aa07f71 100644 (file)
@@ -641,7 +641,7 @@ public:
                 }
             }
 
-            PA_STATE_OPT clipPA(pDC, numEmittedPrims, reinterpret_cast<uint8_t *>(&transposedPrims[0]), numEmittedVerts, SWR_VTX_NUM_SLOTS, true, clipTopology);
+            PA_STATE_OPT clipPA(pDC, numEmittedPrims, reinterpret_cast<uint8_t *>(&transposedPrims[0]), numEmittedVerts, SWR_VTX_NUM_SLOTS, true, NumVertsPerPrim, clipTopology);
 
             static const uint32_t primMaskMap[] = { 0x0, 0x1, 0x3, 0x7, 0xf, 0x1f, 0x3f, 0x7f };
 
@@ -687,15 +687,21 @@ public:
         UPDATE_STAT_FE(CInvocations, numInvoc);
 
         // Read back viewport index if required
-        typename SIMD_T::Integer viewportIdx = SIMD_T::set1_epi32(0);
+        typename SIMD_T::Integer viewportIdx = SIMD_T::setzero_si();
+        typename SIMD_T::Vec4 vpiAttrib[NumVertsPerPrim];
+        typename SIMD_T::Integer vpai = SIMD_T::setzero_si();
 
         if (state.backendState.readViewportArrayIndex)
         {
-            typename SIMD_T::Vec4 vpiAttrib[NumVertsPerPrim];
             pa.Assemble(VERTEX_SGV_SLOT, vpiAttrib);
 
+            vpai = SIMD_T::castps_si(vpiAttrib[0][VERTEX_SGV_VAI_COMP]);
+        }
+
+
+        if (state.backendState.readViewportArrayIndex) // VPAIOffsets are guaranteed 0-15 -- no OOB issues if they are offsets from 0 
+        {
             // OOB indices => forced to zero.
-            typename SIMD_T::Integer vpai = SIMD_T::castps_si(vpiAttrib[0][VERTEX_SGV_VAI_COMP]);
             vpai = SIMD_T::max_epi32(vpai, SIMD_T::setzero_si());
             typename SIMD_T::Integer vNumViewports = SIMD_T::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
             typename SIMD_T::Integer vClearMask = SIMD_T::cmplt_epi32(vpai, vNumViewports);
index aea8e88de4d87d653e9be0cbc6d3dcf0491690a6..a803512b7cc85052d75b249d15d232008d5676cc 100644 (file)
@@ -951,7 +951,7 @@ static void GeometryShaderStage(
                 }
 
 #if USE_SIMD16_FRONTEND
-                PA_STATE_CUT gsPa(pDC, (uint8_t*)pGsBuffers->pGsTransposed, numEmittedVerts, pState->outputVertexSize, reinterpret_cast<simd16mask *>(pCutBuffer), numEmittedVerts, numAttribs, pState->outputTopology, processCutVerts);
+                PA_STATE_CUT gsPa(pDC, (uint8_t*)pGsBuffers->pGsTransposed, numEmittedVerts, pState->outputVertexSize, reinterpret_cast<simd16mask *>(pCutBuffer), numEmittedVerts, numAttribs, pState->outputTopology, processCutVerts, pa.numVertsPerPrim);
 
 #else
                 PA_STATE_CUT gsPa(pDC, (uint8_t*)pGsBuffers->pGsTransposed, numEmittedVerts, pState->outputVertexSize, pCutBuffer, numEmittedVerts, numAttribs, pState->outputTopology, processCutVerts);
@@ -986,9 +986,10 @@ static void GeometryShaderStage(
                             {
 #if USE_SIMD16_FRONTEND
                                 simd16scalari vPrimId = _simd16_set1_epi32(pPrimitiveId[inputPrim]);
-
-                                gsPa.useAlternateOffset = false;
-                                pfnClipFunc(pDC, gsPa, workerId, attrib_simd16, GenMask(gsPa.NumPrims()), vPrimId);
+                                {
+                                    gsPa.useAlternateOffset = false;
+                                    pfnClipFunc(pDC, gsPa, workerId, attrib_simd16, GenMask(gsPa.NumPrims()), vPrimId);
+                                }
 #else
                                 simdscalari vPrimId = _simd_set1_epi32(pPrimitiveId[inputPrim]);
                                 pfnClipFunc(pDC, gsPa, workerId, attrib, GenMask(gsPa.NumPrims()), vPrimId);
@@ -1273,7 +1274,8 @@ static void TessellationStages(
             tsState.numDsOutputAttribs,
             tsData.ppIndices,
             tsData.NumPrimitives,
-            tsState.postDSTopology);
+            tsState.postDSTopology,
+            numVertsPerPrim);
 
         while (tessPa.HasWork())
         {
@@ -1498,7 +1500,8 @@ void ProcessDraw(
     }
 
     // choose primitive assembler
-    PA_FACTORY<IsIndexedT, IsCutIndexEnabledT> paFactory(pDC, state.topology, work.numVerts, gpVertexStore, numVerts, state.frontendState.vsVertexSize);
+    
+    PA_FACTORY<IsIndexedT, IsCutIndexEnabledT> paFactory(pDC, state.topology, work.numVerts, gpVertexStore, numVerts, state.frontendState.vsVertexSize, GetNumVerts(state.topology, 1));
     PA_STATE& pa = paFactory.GetPA();
 
 #if USE_SIMD16_FRONTEND
@@ -1727,9 +1730,10 @@ void ProcessDraw(
                                 if (HasRastT::value)
                                 {
                                     SWR_ASSERT(pDC->pState->pfnProcessPrims_simd16);
-
-                                    pa.useAlternateOffset = false;
-                                    pDC->pState->pfnProcessPrims_simd16(pDC, pa, workerId, prim_simd16, GenMask(numPrims), primID);
+                                    {
+                                        pa.useAlternateOffset = false;
+                                        pDC->pState->pfnProcessPrims_simd16(pDC, pa, workerId, prim_simd16, GenMask(numPrims), primID);
+                                    }
                                 }
                             }
                         }
index e76dc044d7c9f26fd775880e8d57e8314e977e70..13f99cb5461dc9f21285867b8c95578962590abc 100644 (file)
@@ -77,11 +77,12 @@ struct PA_STATE
 
 #if ENABLE_AVX512_SIMD16
     bool useAlternateOffset{ false };
+    uint32_t numVertsPerPrim{ 0 };
 
 #endif
-    PA_STATE() {}
-    PA_STATE(DRAW_CONTEXT *in_pDC, uint8_t* in_pStreamBase, uint32_t in_streamSizeInVerts, uint32_t in_vertexStride) :
-        pDC(in_pDC), pStreamBase(in_pStreamBase), streamSizeInVerts(in_streamSizeInVerts), vertexStride(in_vertexStride) {}
+    PA_STATE(){}
+    PA_STATE(DRAW_CONTEXT *in_pDC, uint8_t* in_pStreamBase, uint32_t in_streamSizeInVerts, uint32_t in_vertexStride, uint32_t in_numVertsPerPrim) :
+        pDC(in_pDC), pStreamBase(in_pStreamBase), streamSizeInVerts(in_streamSizeInVerts), vertexStride(in_vertexStride), numVertsPerPrim(in_numVertsPerPrim) {}
 
     virtual bool HasWork() = 0;
     virtual simdvector& GetSimdVector(uint32_t index, uint32_t slot) = 0;
@@ -165,7 +166,7 @@ struct PA_STATE_OPT : public PA_STATE
 
     PA_STATE_OPT() {}
     PA_STATE_OPT(DRAW_CONTEXT* pDC, uint32_t numPrims, uint8_t* pStream, uint32_t streamSizeInVerts,
-        uint32_t vertexStride, bool in_isStreaming, PRIMITIVE_TOPOLOGY topo = TOP_UNKNOWN);
+        uint32_t vertexStride, bool in_isStreaming, uint32_t numVertsPerPrim, PRIMITIVE_TOPOLOGY topo = TOP_UNKNOWN);
 
     bool HasWork()
     {
@@ -430,8 +431,8 @@ struct PA_STATE_CUT : public PA_STATE
 
     PA_STATE_CUT() {}
     PA_STATE_CUT(DRAW_CONTEXT* pDC, uint8_t* in_pStream, uint32_t in_streamSizeInVerts, uint32_t in_vertexStride, SIMDMASK* in_pIndices, uint32_t in_numVerts,
-        uint32_t in_numAttribs, PRIMITIVE_TOPOLOGY topo, bool in_processCutVerts)
-        : PA_STATE(pDC, in_pStream, in_streamSizeInVerts, in_vertexStride)
+        uint32_t in_numAttribs, PRIMITIVE_TOPOLOGY topo, bool in_processCutVerts, uint32_t in_numVertsPerPrim)
+        : PA_STATE(pDC, in_pStream, in_streamSizeInVerts, in_vertexStride, in_numVertsPerPrim)
     {
         numVerts = in_streamSizeInVerts;
         numAttribs = in_numAttribs;
@@ -1144,9 +1145,10 @@ struct PA_TESS : PA_STATE
         uint32_t in_numAttributes,
         uint32_t* (&in_ppIndices)[3],
         uint32_t in_numPrims,
-        PRIMITIVE_TOPOLOGY in_binTopology) :
+        PRIMITIVE_TOPOLOGY in_binTopology,
+        uint32_t numVertsPerPrim) :
 
-        PA_STATE(in_pDC, nullptr, 0, in_vertexStride),
+        PA_STATE(in_pDC, nullptr, 0, in_vertexStride, numVertsPerPrim),
         m_pVertexData(in_pVertData),
         m_attributeStrideInVectors(in_attributeStrideInVectors),
         m_numAttributes(in_numAttributes),
@@ -1416,7 +1418,7 @@ private:
 template <typename IsIndexedT, typename IsCutIndexEnabledT>
 struct PA_FACTORY
 {
-    PA_FACTORY(DRAW_CONTEXT* pDC, PRIMITIVE_TOPOLOGY in_topo, uint32_t numVerts, PA_STATE::SIMDVERTEX *pVertexStore, uint32_t vertexStoreSize, uint32_t vertexStride) : topo(in_topo)
+    PA_FACTORY(DRAW_CONTEXT* pDC, PRIMITIVE_TOPOLOGY in_topo, uint32_t numVerts, PA_STATE::SIMDVERTEX *pVertexStore, uint32_t vertexStoreSize, uint32_t vertexStride, uint32_t numVertsPerPrim) : topo(in_topo)
     {
 #if KNOB_ENABLE_CUT_AWARE_PA == TRUE
         const API_STATE& state = GetApiState(pDC);
@@ -1433,14 +1435,14 @@ struct PA_FACTORY
             uint32_t numAttribs = state.feNumAttributes;
 
             new (&this->paCut) PA_STATE_CUT(pDC, reinterpret_cast<uint8_t *>(pVertexStore), vertexStoreSize * PA_STATE::SIMD_WIDTH,
-                vertexStride, &this->indexStore[0], numVerts, numAttribs, state.topology, false);
+                vertexStride, &this->indexStore[0], numVerts, numAttribs, state.topology, false, numVertsPerPrim);
             cutPA = true;
         }
         else
 #endif
         {
             uint32_t numPrims = GetNumPrims(in_topo, numVerts);
-            new (&this->paOpt) PA_STATE_OPT(pDC, numPrims, reinterpret_cast<uint8_t *>(pVertexStore), vertexStoreSize * PA_STATE::SIMD_WIDTH, vertexStride, false);
+            new (&this->paOpt) PA_STATE_OPT(pDC, numPrims, reinterpret_cast<uint8_t *>(pVertexStore), vertexStoreSize * PA_STATE::SIMD_WIDTH, vertexStride, false, numVertsPerPrim);
             cutPA = false;
         }
 
index e53389b63fcc743ad348a9955686f23f7459b770..3bf66b382b9acdf3704e6bff5e26e4f8ba75b58c 100644 (file)
@@ -2588,8 +2588,8 @@ void PaRectListSingle0(
 }
 
 PA_STATE_OPT::PA_STATE_OPT(DRAW_CONTEXT *in_pDC, uint32_t in_numPrims, uint8_t* pStream, uint32_t in_streamSizeInVerts, 
-    uint32_t in_vertexStride, bool in_isStreaming, PRIMITIVE_TOPOLOGY topo) : 
-    PA_STATE(in_pDC, pStream, in_streamSizeInVerts, in_vertexStride), numPrims(in_numPrims), numPrimsComplete(0), numSimdPrims(0), 
+    uint32_t in_vertexStride, bool in_isStreaming, uint32_t numVertsPerPrim, PRIMITIVE_TOPOLOGY topo) : 
+    PA_STATE(in_pDC, pStream, in_streamSizeInVerts, in_vertexStride, numVertsPerPrim), numPrims(in_numPrims), numPrimsComplete(0), numSimdPrims(0), 
     cur(0), prev(0), first(0), counter(0), reset(false), pfnPaFunc(nullptr), isStreaming(in_isStreaming)
 {
     const API_STATE& state = GetApiState(pDC);