swr/rasterizer: enable using AOS vertex data format
author Jan Zielinski <jan.zielinski@intel.com>
Fri, 26 Jul 2019 14:43:50 +0000 (16:43 +0200)
committer Jan Zielinski <jan.zielinski@intel.com>
Thu, 8 Aug 2019 08:16:20 +0000 (10:16 +0200)
Reviewed-by: Alok Hota <alok.hota@intel.com>
src/gallium/drivers/swr/rasterizer/core/frontend.cpp
src/gallium/drivers/swr/rasterizer/core/frontend.h
src/gallium/drivers/swr/rasterizer/core/pa.h

index d8703e57ea3f9661663a05ec124766c86e73ea9b..a27b33d2051184d79676ea0473484ae9d51ca9f8 100644 (file)
 #include <limits>
 #include <iostream>
 
-//////////////////////////////////////////////////////////////////////////
-/// @brief Helper macro to generate a bitmask
-static INLINE uint32_t GenMask(uint32_t numBits)
-{
-    SWR_ASSERT(
-        numBits <= (sizeof(uint32_t) * 8), "Too many bits (%d) for %s", numBits, __FUNCTION__);
-    return ((1U << numBits) - 1);
-}
-
 //////////////////////////////////////////////////////////////////////////
 /// @brief FE handler for SwrSync.
 /// @param pContext - pointer to SWR context.
@@ -400,7 +391,7 @@ uint32_t GetNumVerts(PRIMITIVE_TOPOLOGY mode, uint32_t numPrims)
 /// @brief Return number of verts per primitive.
 /// @param topology - topology
 /// @param includeAdjVerts - include adjacent verts in primitive vertices
-INLINE uint32_t NumVertsPerPrim(PRIMITIVE_TOPOLOGY topology, bool includeAdjVerts)
+uint32_t NumVertsPerPrim(PRIMITIVE_TOPOLOGY topology, bool includeAdjVerts)
 {
     uint32_t numVerts = 0;
     switch (topology)
index 38fe77e240de01fc54d5c409627adbdfedd4763a..a6d9fb5ba52e3cbdc1aecd10555258569872a641 100644 (file)
 #include "common/simdintrin.h"
 #include <type_traits>
 
+//////////////////////////////////////////////////////////////////////////
+/// @brief Helper function to generate a bitmask with the low numBits bits set.
+/// @param numBits - number of low-order bits to set; asserted to be at most
+///                  the bit width of uint32_t.
+/// @return mask with bits [0, numBits) set; all ones when numBits covers the
+///         full width of uint32_t.
+static INLINE uint32_t
+              GenMask(uint32_t numBits)
+{
+    const uint32_t maxBits = sizeof(uint32_t) * 8;
+
+    SWR_ASSERT(numBits <= maxBits, "Too many bits (%d) for %s", numBits, __FUNCTION__);
+
+    // The assert permits numBits == maxBits, but shifting a 32-bit value by
+    // its full width is undefined behavior, so the full-width mask is
+    // produced explicitly instead of through (1U << numBits) - 1.
+    if (numBits >= maxBits)
+    {
+        return ~0U;
+    }
+
+    return ((1U << numBits) - 1);
+}
+
 // Calculates the A and B coefficients for the 3 edges of the triangle
 //
 // maths for edge equations:
index e19c8ea4a798017410f2f9e1b3e6b108800a2336..635bf195e4bef7835130fc3f97bb0781dce1ff10 100644 (file)
@@ -1282,11 +1282,12 @@ struct PA_TESS : PA_STATE
             uint32_t* (&in_ppIndices)[3],
             uint32_t           in_numPrims,
             PRIMITIVE_TOPOLOGY in_binTopology,
-            uint32_t           numVertsPerPrim) :
+            uint32_t           numVertsPerPrim,
+            bool               SOA = true) :
 
         PA_STATE(in_pDC, nullptr, 0, in_vertexStride, numVertsPerPrim),
         m_pVertexData(in_pVertData), m_attributeStrideInVectors(in_attributeStrideInVectors),
-        m_numAttributes(in_numAttributes), m_numPrims(in_numPrims)
+        m_numAttributes(in_numAttributes), m_numPrims(in_numPrims), m_SOA(SOA)
     {
 #if USE_SIMD16_FRONTEND
         m_vPrimId = _simd16_setzero_si();
@@ -1363,8 +1364,17 @@ struct PA_TESS : PA_STATE
 
         SIMDSCALARI mask = GenPrimMask(numPrimsToAssemble);
 
-        const float* pBaseAttrib =
-            (const float*)&m_pVertexData[slot * m_attributeStrideInVectors * 4];
+        const float* pBaseAttrib;
+        if (m_SOA)
+        {
+            pBaseAttrib = (const float*)&m_pVertexData[slot * m_attributeStrideInVectors * 4];
+        }
+        else
+        {
+            const float* pVertData = (const float*)m_pVertexData;
+            pBaseAttrib            = pVertData + slot * 4;
+        }
+
         for (uint32_t i = 0; i < m_numVertsPerPrim; ++i)
         {
 #if USE_SIMD16_FRONTEND
@@ -1393,7 +1403,14 @@ struct PA_TESS : PA_STATE
                                                         _simd_castsi_ps(mask),
                                                         4); // gcc doesn't like sizeof(float)
 #endif
-                pBase += m_attributeStrideInVectors * SIMD_WIDTH;
+                if (m_SOA)
+                {
+                    pBase += m_attributeStrideInVectors * SIMD_WIDTH;
+                }
+                else
+                {
+                    pBase += sizeof(float);
+                }
             }
         }
 
@@ -1413,12 +1430,25 @@ struct PA_TESS : PA_STATE
 
         SIMDSCALARI mask = GenPrimMask(numPrimsToAssemble);
 
-        const float* pBaseAttrib =
-            (const float*)&m_pVertexData[slot * m_attributeStrideInVectors * 4];
+        const float* pBaseAttrib;
+        if (m_SOA)
+        {
+            pBaseAttrib = (const float*)&m_pVertexData[slot * m_attributeStrideInVectors * 4];
+        }
+        else
+        {
+            const float* pVertData = (const float*)m_pVertexData;
+            pBaseAttrib            = pVertData + slot * 4;
+        }
+
         for (uint32_t i = 0; i < m_numVertsPerPrim; ++i)
         {
 #if USE_SIMD16_FRONTEND
             SIMDSCALARI indices = _simd16_load_si((const SIMDSCALARI*)m_ppIndices[i]);
+            if (!m_SOA)
+            {
+                indices = _simd16_mul_epi32(indices, _simd16_set1_epi32(vertexStride / 4));
+            }
 #else
             SIMDSCALARI indices = _simd_load_si((const SIMDSCALARI*)m_ppIndices[i]);
 #endif
@@ -1440,7 +1470,14 @@ struct PA_TESS : PA_STATE
                                                           4 /* gcc doesn't like sizeof(float) */);
                 verts[i].v[c]   = _simd16_insert_ps(_simd16_setzero_ps(), temp, 0);
 #endif
-                pBase += m_attributeStrideInVectors * SIMD_WIDTH;
+                if (m_SOA)
+                {
+                    pBase += m_attributeStrideInVectors * SIMD_WIDTH;
+                }
+                else
+                {
+                    pBase++;
+                }
             }
         }
 
@@ -1455,13 +1492,25 @@ struct PA_TESS : PA_STATE
 
         SWR_ASSERT(primIndex < PA_TESS::NumPrims());
 
-        const float* pVertDataBase =
-            (const float*)&m_pVertexData[slot * m_attributeStrideInVectors * 4];
+        const float* pVertDataBase;
+        if (m_SOA)
+        {
+            pVertDataBase = (const float*)&m_pVertexData[slot * m_attributeStrideInVectors * 4];
+        }
+        else
+        {
+            const float* pVertData = (const float*)m_pVertexData;
+            pVertDataBase          = pVertData + slot * 4;
+        };
         for (uint32_t i = 0; i < m_numVertsPerPrim; ++i)
         {
 #if USE_SIMD16_FRONTEND
             uint32_t index = useAlternateOffset ? m_ppIndices[i][primIndex + SIMD_WIDTH_DIV2]
                                                 : m_ppIndices[i][primIndex];
+            if (!m_SOA)
+            {
+                index *= (vertexStride / 4);
+            }
 #else
             uint32_t index = m_ppIndices[i][primIndex];
 #endif
@@ -1471,8 +1520,16 @@ struct PA_TESS : PA_STATE
             for (uint32_t c = 0; c < 4; ++c)
             {
                 pVert[c] = pVertData[index];
-                pVertData += m_attributeStrideInVectors * SIMD_WIDTH;
+                if (m_SOA)
+                {
+                    pVertData += m_attributeStrideInVectors * SIMD_WIDTH;
+                }
+                else
+                {
+                    pVertData++;
+                }
             }
+
         }
     }
 
@@ -1535,6 +1592,8 @@ private:
 #endif
     SIMDVERTEX junkVertex;  // junk SIMDVERTEX for unimplemented API
     SIMDMASK   junkIndices; // temporary index store for unused virtual function
+
+    bool m_SOA;
 };
 
 // Primitive Assembler factory class, responsible for creating and initializing the correct