swr/rast: Add support to PA for variable sized vertices
author Tim Rowley <timothy.o.rowley@intel.com>
Mon, 5 Jun 2017 21:13:25 +0000 (16:13 -0500)
committer Tim Rowley <timothy.o.rowley@intel.com>
Fri, 16 Jun 2017 21:20:16 +0000 (16:20 -0500)
Reviewed-by: Bruce Cherniak <bruce.cherniak@intel.com>
src/gallium/drivers/swr/rasterizer/core/clip.h
src/gallium/drivers/swr/rasterizer/core/frontend.cpp
src/gallium/drivers/swr/rasterizer/core/pa.h
src/gallium/drivers/swr/rasterizer/core/pa_avx.cpp

index 3e8ea33b21bf032e4ce8a4dbea5c2452d4c99c96..92356189673bf81604423045a9a217c3394eb5f1 100644 (file)
@@ -673,7 +673,7 @@ public:
                 }
             }
 
-            PA_STATE_OPT clipPa(this->pDC, numEmittedPrims, (uint8_t*)&transposedPrims[0], numEmittedVerts, true, clipTopology);
+            PA_STATE_OPT clipPa(this->pDC, numEmittedPrims, (uint8_t*)&transposedPrims[0], numEmittedVerts, SWR_VTX_NUM_SLOTS, true, clipTopology);
 
             while (clipPa.GetNextStreamOutput())
             {
@@ -914,7 +914,7 @@ public:
                 }
             }
 
-            PA_STATE_OPT clipPa(this->pDC, numEmittedPrims, (uint8_t*)&transposedPrims[0], numEmittedVerts, true, clipTopology);
+            PA_STATE_OPT clipPa(this->pDC, numEmittedPrims, (uint8_t*)&transposedPrims[0], numEmittedVerts, SWR_VTX_NUM_SLOTS, true, clipTopology);
 
             while (clipPa.GetNextStreamOutput())
             {
index 676a4456575cbb901947c26d4bfb799655b0459a..b9cee0e2c09198f9c167ac6b54c1dac0520c6d80 100644 (file)
@@ -916,10 +916,10 @@ static void GeometryShaderStage(
                 }
 
 #if USE_SIMD16_FRONTEND
-                PA_STATE_CUT gsPa(pDC, pBase, numEmittedVerts, reinterpret_cast<simd16mask *>(pCutBuffer), numEmittedVerts, numAttribs, pState->outputTopology, processCutVerts);
+                PA_STATE_CUT gsPa(pDC, pBase, numEmittedVerts, SWR_VTX_NUM_SLOTS, reinterpret_cast<simd16mask *>(pCutBuffer), numEmittedVerts, numAttribs, pState->outputTopology, processCutVerts);
 
 #else
-                PA_STATE_CUT gsPa(pDC, pBase, numEmittedVerts, pCutBuffer, numEmittedVerts, numAttribs, pState->outputTopology, processCutVerts);
+                PA_STATE_CUT gsPa(pDC, pBase, numEmittedVerts, SWR_VTX_NUM_SLOTS, pCutBuffer, numEmittedVerts, numAttribs, pState->outputTopology, processCutVerts);
 
 #endif
                 while (gsPa.GetNextStreamOutput())
@@ -1277,6 +1277,7 @@ static void TessellationStages(
             dsContext.pOutputData,
             dsContext.vectorStride,
 #endif
+            SWR_VTX_NUM_SLOTS,
             tsState.numDsOutputAttribs,
             tsData.ppIndices,
             tsData.NumPrimitives,
@@ -1503,7 +1504,7 @@ void ProcessDraw(
     }
 
     // choose primitive assembler
-    PA_FACTORY<IsIndexedT, IsCutIndexEnabledT> paFactory(pDC, state.topology, work.numVerts, pVertexStore, gVertexStoreSize);
+    PA_FACTORY<IsIndexedT, IsCutIndexEnabledT> paFactory(pDC, state.topology, work.numVerts, pVertexStore, gVertexStoreSize, SWR_VTX_NUM_SLOTS);
     PA_STATE& pa = paFactory.GetPA();
 
 #if USE_SIMD16_FRONTEND
index 020399d39b9967c5d8b2cd327f88bbc9115749d3..bdd01beedacf95d894b99c531eec88fc97891106 100644 (file)
@@ -67,9 +67,10 @@ struct PA_STATE
     typedef         simdscalari         SIMDSCALARI;
 
 #endif
-    DRAW_CONTEXT *pDC{ nullptr };              // draw context
-    uint8_t* pStreamBase{ nullptr };           // vertex stream
-    uint32_t streamSizeInVerts{ 0 };     // total size of the input stream in verts
+    DRAW_CONTEXT *pDC{ nullptr };       // draw context
+    uint8_t* pStreamBase{ nullptr };    // vertex stream
+    uint32_t streamSizeInVerts{ 0 };    // total size of the input stream in verts
+    uint32_t vertexStride{ 0 };         // stride of a vertex in simdvector units
 
     // The topology the binner will use. In some cases the FE changes the topology from the api state.
     PRIMITIVE_TOPOLOGY binTopology{ TOP_UNKNOWN };
@@ -79,8 +80,8 @@ struct PA_STATE
 
 #endif
     PA_STATE() {}
-    PA_STATE(DRAW_CONTEXT *in_pDC, uint8_t* in_pStreamBase, uint32_t in_streamSizeInVerts) :
-        pDC(in_pDC), pStreamBase(in_pStreamBase), streamSizeInVerts(in_streamSizeInVerts) {}
+    PA_STATE(DRAW_CONTEXT *in_pDC, uint8_t* in_pStreamBase, uint32_t in_streamSizeInVerts, uint32_t in_vertexStride) :
+        pDC(in_pDC), pStreamBase(in_pStreamBase), streamSizeInVerts(in_streamSizeInVerts), vertexStride(in_vertexStride) {}
 
     virtual bool HasWork() = 0;
     virtual simdvector& GetSimdVector(uint32_t index, uint32_t slot) = 0;
@@ -164,7 +165,7 @@ struct PA_STATE_OPT : public PA_STATE
     
     PA_STATE_OPT() {}
     PA_STATE_OPT(DRAW_CONTEXT* pDC, uint32_t numPrims, uint8_t* pStream, uint32_t streamSizeInVerts,
-        bool in_isStreaming, PRIMITIVE_TOPOLOGY topo = TOP_UNKNOWN);
+        uint32_t vertexStride, bool in_isStreaming, PRIMITIVE_TOPOLOGY topo = TOP_UNKNOWN);
 
     bool HasWork()
     {
@@ -173,15 +174,19 @@ struct PA_STATE_OPT : public PA_STATE
 
     simdvector& GetSimdVector(uint32_t index, uint32_t slot)
     {
-        simdvertex* pVertex = (simdvertex*)pStreamBase;
-        return pVertex[index].attrib[slot];
+        SWR_ASSERT(slot < vertexStride);
+        uint32_t offset = index * vertexStride + slot;
+        simdvector& vertexSlot = ((simdvector*)pStreamBase)[offset];
+        return vertexSlot;
     }
 
 #if ENABLE_AVX512_SIMD16
     simd16vector& GetSimdVector_simd16(uint32_t index, uint32_t slot)
     {
-        simd16vertex* pVertex = (simd16vertex*)pStreamBase;
-        return pVertex[index].attrib[slot];
+        SWR_ASSERT(slot < vertexStride);
+        uint32_t offset = index * vertexStride + slot;
+        simd16vector& vertexSlot = ((simd16vector*)pStreamBase)[offset];
+        return vertexSlot;
     }
 
 #endif
@@ -262,8 +267,9 @@ struct PA_STATE_OPT : public PA_STATE
         }
 
         SWR_ASSERT(cur < numSimdVerts);
+        SIMDVECTOR* pVertex = &((SIMDVECTOR*)pStreamBase)[cur * vertexStride];
 
-        return reinterpret_cast<SIMDVERTEX *>(pStreamBase)[cur];
+        return *(SIMDVERTEX*)pVertex;
     }
 
     SIMDMASK& GetNextVsIndices()
@@ -423,9 +429,9 @@ struct PA_STATE_CUT : public PA_STATE
     PFN_PA_FUNC pfnPa{ nullptr };        // per-topology function that processes a single vert
 
     PA_STATE_CUT() {}
-    PA_STATE_CUT(DRAW_CONTEXT* pDC, uint8_t* in_pStream, uint32_t in_streamSizeInVerts, SIMDMASK* in_pIndices, uint32_t in_numVerts,
+    PA_STATE_CUT(DRAW_CONTEXT* pDC, uint8_t* in_pStream, uint32_t in_streamSizeInVerts, uint32_t in_vertexStride, SIMDMASK* in_pIndices, uint32_t in_numVerts,
         uint32_t in_numAttribs, PRIMITIVE_TOPOLOGY topo, bool in_processCutVerts)
-        : PA_STATE(pDC, in_pStream, in_streamSizeInVerts)
+        : PA_STATE(pDC, in_pStream, in_streamSizeInVerts, in_vertexStride)
     {
         numVerts = in_streamSizeInVerts;
         numAttribs = in_numAttribs;
@@ -480,7 +486,9 @@ struct PA_STATE_CUT : public PA_STATE
         uint32_t vertexIndex = this->headVertex / SIMD_WIDTH;
         this->headVertex = (this->headVertex + SIMD_WIDTH) % this->numVerts;
         this->needOffsets = true;
-        return ((SIMDVERTEX*)pStreamBase)[vertexIndex];
+        SIMDVECTOR* pVertex = &((SIMDVECTOR*)pStreamBase)[vertexIndex * vertexStride];
+
+        return *(SIMDVERTEX*)pVertex;
     }
 
     SIMDMASK& GetNextVsIndices()
@@ -635,16 +643,17 @@ struct PA_STATE_CUT : public PA_STATE
     {
         for (uint32_t v = 0; v < this->vertsPerPrim; ++v)
         {
+            uint32_t vertexStrideBytes = vertexStride * sizeof(SIMDVECTOR);
             SIMDSCALARI vIndices = *(SIMDSCALARI*)&this->indices[v][0];
 
             // step to simdvertex batch
             const uint32_t simdShift = SIMD_WIDTH_LOG2;
 #if USE_SIMD16_FRONTEND
             SIMDSCALARI vVertexBatch = _simd16_srai_epi32(vIndices, simdShift);
-            this->vOffsets[v] = _simd16_mullo_epi32(vVertexBatch, _simd16_set1_epi32(sizeof(SIMDVERTEX)));
+            this->vOffsets[v] = _simd16_mullo_epi32(vVertexBatch, _simd16_set1_epi32(vertexStrideBytes));
 #else
             SIMDSCALARI vVertexBatch = _simd_srai_epi32(vIndices, simdShift);
-            this->vOffsets[v] = _simd_mullo_epi32(vVertexBatch, _simd_set1_epi32(sizeof(SIMDVERTEX)));
+            this->vOffsets[v] = _simd_mullo_epi32(vVertexBatch, _simd_set1_epi32(vertexStrideBytes));
 #endif
 
             // step to index
@@ -1132,12 +1141,13 @@ struct PA_TESS : PA_STATE
         DRAW_CONTEXT *in_pDC,
         const SIMDSCALAR* in_pVertData,
         uint32_t in_attributeStrideInVectors,
+        uint32_t in_vertexStride,
         uint32_t in_numAttributes,
         uint32_t* (&in_ppIndices)[3],
         uint32_t in_numPrims,
         PRIMITIVE_TOPOLOGY in_binTopology) :
 
-        PA_STATE(in_pDC, nullptr, 0),
+        PA_STATE(in_pDC, nullptr, 0, in_vertexStride),
         m_pVertexData(in_pVertData),
         m_attributeStrideInVectors(in_attributeStrideInVectors),
         m_numAttributes(in_numAttributes),
@@ -1407,7 +1417,7 @@ private:
 template <typename IsIndexedT, typename IsCutIndexEnabledT>
 struct PA_FACTORY
 {
-    PA_FACTORY(DRAW_CONTEXT* pDC, PRIMITIVE_TOPOLOGY in_topo, uint32_t numVerts, PA_STATE::SIMDVERTEX *pVertexStore, uint32_t vertexStoreSize) : topo(in_topo)
+    PA_FACTORY(DRAW_CONTEXT* pDC, PRIMITIVE_TOPOLOGY in_topo, uint32_t numVerts, PA_STATE::SIMDVERTEX *pVertexStore, uint32_t vertexStoreSize, uint32_t vertexStride) : topo(in_topo)
     {
 #if KNOB_ENABLE_CUT_AWARE_PA == TRUE
         const API_STATE& state = GetApiState(pDC);
@@ -1424,14 +1434,14 @@ struct PA_FACTORY
             uint32_t numAttribs = state.feNumAttributes;
 
             new (&this->paCut) PA_STATE_CUT(pDC, reinterpret_cast<uint8_t *>(pVertexStore), vertexStoreSize * PA_STATE::SIMD_WIDTH,
-                &this->indexStore[0], numVerts, numAttribs, state.topology, false);
+                vertexStride, &this->indexStore[0], numVerts, numAttribs, state.topology, false);
             cutPA = true;
         }
         else
 #endif
         {
             uint32_t numPrims = GetNumPrims(in_topo, numVerts);
-            new (&this->paOpt) PA_STATE_OPT(pDC, numPrims, reinterpret_cast<uint8_t *>(pVertexStore), vertexStoreSize * PA_STATE::SIMD_WIDTH, false);
+            new (&this->paOpt) PA_STATE_OPT(pDC, numPrims, reinterpret_cast<uint8_t *>(pVertexStore), vertexStoreSize * PA_STATE::SIMD_WIDTH, vertexStride, false);
             cutPA = false;
         }
 
index 897079cb414136ee12f2eef588f0e2d9cd2a6d8e..e710746296cdb04dad6f50fdfaee61cc35b84065 100644 (file)
@@ -2588,7 +2588,8 @@ void PaRectListSingle0(
 }
 
 PA_STATE_OPT::PA_STATE_OPT(DRAW_CONTEXT *in_pDC, uint32_t in_numPrims, uint8_t* pStream, uint32_t in_streamSizeInVerts, 
-    bool in_isStreaming, PRIMITIVE_TOPOLOGY topo) : PA_STATE(in_pDC, pStream, in_streamSizeInVerts), numPrims(in_numPrims), numPrimsComplete(0), numSimdPrims(0), 
+    uint32_t in_vertexStride, bool in_isStreaming, PRIMITIVE_TOPOLOGY topo) : 
+    PA_STATE(in_pDC, pStream, in_streamSizeInVerts, in_vertexStride), numPrims(in_numPrims), numPrimsComplete(0), numSimdPrims(0), 
     cur(0), prev(0), first(0), counter(0), reset(false), pfnPaFunc(nullptr), isStreaming(in_isStreaming)
 {
     const API_STATE& state = GetApiState(pDC);