swr: [rasterizer core] SIMD16 Frontend WIP
author Tim Rowley <timothy.o.rowley@intel.com>
Wed, 22 Mar 2017 17:36:49 +0000 (12:36 -0500)
committer Tim Rowley <timothy.o.rowley@intel.com>
Tue, 28 Mar 2017 16:24:33 +0000 (11:24 -0500)
Implement widened clipper and binner interfaces for SIMD16.

Reviewed-by: George Kyriazis <george.kyriazis@intel.com>
src/gallium/drivers/swr/rasterizer/core/api.cpp
src/gallium/drivers/swr/rasterizer/core/binner.cpp
src/gallium/drivers/swr/rasterizer/core/clip.cpp
src/gallium/drivers/swr/rasterizer/core/clip.h
src/gallium/drivers/swr/rasterizer/core/context.h
src/gallium/drivers/swr/rasterizer/core/frontend.cpp
src/gallium/drivers/swr/rasterizer/core/frontend.h
src/gallium/drivers/swr/rasterizer/core/pa_avx.cpp

index bd63796d1382b8f2c3f25a3b04583a3cc69254b0..dabd0616d3bd1337818a3927318ff8222de1631a 100644 (file)
@@ -839,11 +839,18 @@ void SetupPipeline(DRAW_CONTEXT *pDC)
     }
     
     PFN_PROCESS_PRIMS pfnBinner;
+#if USE_SIMD16_FRONTEND
+    PFN_PROCESS_PRIMS_SIMD16 pfnBinner_simd16;
+#endif
     switch (pState->state.topology)
     {
     case TOP_POINT_LIST:
         pState->pfnProcessPrims = ClipPoints;
         pfnBinner = BinPoints;
+#if USE_SIMD16_FRONTEND
+        pState->pfnProcessPrims_simd16 = ClipPoints_simd16;
+        pfnBinner_simd16 = BinPoints_simd16;
+#endif
         break;
     case TOP_LINE_LIST:
     case TOP_LINE_STRIP:
@@ -852,10 +859,18 @@ void SetupPipeline(DRAW_CONTEXT *pDC)
     case TOP_LISTSTRIP_ADJ:
         pState->pfnProcessPrims = ClipLines;
         pfnBinner = BinLines;
+#if USE_SIMD16_FRONTEND
+        pState->pfnProcessPrims_simd16 = ClipLines_simd16;
+        pfnBinner_simd16 = BinLines_simd16;
+#endif
         break;
     default:
         pState->pfnProcessPrims = ClipTriangles;
         pfnBinner = GetBinTrianglesFunc((rastState.conservativeRast > 0));
+#if USE_SIMD16_FRONTEND
+        pState->pfnProcessPrims_simd16 = ClipTriangles_simd16;
+        pfnBinner_simd16 = GetBinTrianglesFunc_simd16((rastState.conservativeRast > 0));
+#endif
         break;
     };
 
@@ -864,6 +879,9 @@ void SetupPipeline(DRAW_CONTEXT *pDC)
     if (pState->state.frontendState.vpTransformDisable)
     {
         pState->pfnProcessPrims = pfnBinner;
+#if USE_SIMD16_FRONTEND
+        pState->pfnProcessPrims_simd16 = pfnBinner_simd16;
+#endif
     }
 
     if ((pState->state.psState.pfnPixelShader == nullptr) &&
@@ -874,11 +892,17 @@ void SetupPipeline(DRAW_CONTEXT *pDC)
         (pState->state.backendState.numAttributes == 0))
     {
         pState->pfnProcessPrims = nullptr;
+#if USE_SIMD16_FRONTEND
+        pState->pfnProcessPrims_simd16 = nullptr;
+#endif
     }
 
     if (pState->state.soState.rasterizerDisable == true)
     {
         pState->pfnProcessPrims = nullptr;
+#if USE_SIMD16_FRONTEND
+        pState->pfnProcessPrims_simd16 = nullptr;
+#endif
     }
 
 
index 490a86804fc324341025ef4e27609f0380849a4e..63eab33ac0b81409f74f81043aef1f1847d4c658 100644 (file)
@@ -856,6 +856,58 @@ endBinTriangles:
     AR_END(FEBinTriangles, 1);
 }
 
+#if USE_SIMD16_FRONTEND
+inline uint32_t GetPrimMaskLo(uint32_t primMask)  // returns bits 0-7 of primMask
+{
+    return primMask & 255;  // 255 == 0xFF: keep only the low eight mask bits
+}
+
+inline uint32_t GetPrimMaskHi(uint32_t primMask)  // returns bits 8-15 of primMask, moved down to bits 0-7
+{
+    return (primMask >> 8) & 255;  // shift first, then mask, so bits 16 and above are discarded
+}
+
+template <typename CT>  // CT is forwarded unchanged as the template argument of BinTriangles
+void BinTriangles_simd16(  // bins a simd16 triangle batch by calling BinTriangles once per half
+    DRAW_CONTEXT *pDC,
+    PA_STATE& pa,  // useAlternateOffset is written here before each BinTriangles call
+    uint32_t workerId,
+    simd16vector tri[3],
+    uint32_t triMask,  // bits 0-7 are passed to the first call, bits 8-15 to the second
+    simd16scalari primID,
+    simd16scalari viewportIdx)
+{
+    enum { VERTS_PER_PRIM = 3 };
+
+    simdvector verts[VERTS_PER_PRIM];  // scratch vertices; holds one extracted half at a time
+
+    for (uint32_t i = 0; i < VERTS_PER_PRIM; i += 1)
+    {
+        for (uint32_t j = 0; j < 4; j += 1)  // entries 0..3 of each vertex
+        {
+            verts[i][j] = _simd16_extract_ps(tri[i][j], 0);  // half selected by index 0 (presumably the lower lanes; confirm against _simd16_extract_ps)
+        }
+    }
+
+    pa.useAlternateOffset = false;  // cleared for the index-0 half
+    BinTriangles<CT>(pDC, pa, workerId, verts, GetPrimMaskLo(triMask), _simd16_extract_si(primID, 0), _simd16_extract_si(viewportIdx, 0));  // issued unconditionally, even when the low mask is zero
+
+    if (GetPrimMaskHi(triMask))  // second call only when at least one of bits 8-15 is set
+    {
+        for (uint32_t i = 0; i < VERTS_PER_PRIM; i += 1)
+        {
+            for (uint32_t j = 0; j < 4; j += 1)
+            {
+                verts[i][j] = _simd16_extract_ps(tri[i][j], 1);  // half selected by index 1, overwriting the scratch vertices
+            }
+        }
+
+        pa.useAlternateOffset = true;  // set for the index-1 half; not reset before returning
+        BinTriangles<CT>(pDC, pa, workerId, verts, GetPrimMaskHi(triMask), _simd16_extract_si(primID, 1), _simd16_extract_si(viewportIdx, 1));
+    }
+}
+
+#endif
 struct FEBinTrianglesChooser
 {
     typedef PFN_PROCESS_PRIMS FuncType;
@@ -873,6 +925,25 @@ PFN_PROCESS_PRIMS GetBinTrianglesFunc(bool IsConservative)
     return TemplateArgUnroller<FEBinTrianglesChooser>::GetFunc(IsConservative);
 }
 
+#if USE_SIMD16_FRONTEND
+struct FEBinTrianglesChooser_simd16  // chooser handed to TemplateArgUnroller by GetBinTrianglesFunc_simd16
+{
+    typedef PFN_PROCESS_PRIMS_SIMD16 FuncType;  // function-pointer type returned by GetFunc
+
+    template <typename... ArgsB>  // ArgsB are wrapped in ConservativeRastFETraits below
+    static FuncType GetFunc()
+    {
+        return BinTriangles_simd16<ConservativeRastFETraits<ArgsB...>>;  // address of the matching BinTriangles_simd16 instantiation
+    }
+};
+
+// Selector for the correctly templated BinTriangles_simd16 function
+PFN_PROCESS_PRIMS_SIMD16 GetBinTrianglesFunc_simd16(bool IsConservative)
+{
+    return TemplateArgUnroller<FEBinTrianglesChooser_simd16>::GetFunc(IsConservative);  // runtime bool is handed to TemplateArgUnroller, which yields a FEBinTrianglesChooser_simd16::FuncType
+}
+
+#endif
 
 //////////////////////////////////////////////////////////////////////////
 /// @brief Bin SIMD points to the backend.  Only supports point size of 1
@@ -1217,6 +1288,47 @@ void BinPoints(
     AR_END(FEBinPoints, 1);
 }
 
+#if USE_SIMD16_FRONTEND
+void BinPoints_simd16(  // bins a simd16 point batch by calling BinPoints once per half
+    DRAW_CONTEXT *pDC,
+    PA_STATE& pa,  // useAlternateOffset is written here before each BinPoints call
+    uint32_t workerId,
+    simd16vector prim[3],  // only prim[0] is read (VERTS_PER_PRIM == 1)
+    uint32_t primMask,  // bits 0-7 are passed to the first call, bits 8-15 to the second
+    simd16scalari primID,
+    simd16scalari viewportIdx)
+{
+    enum { VERTS_PER_PRIM = 1 };
+
+    simdvector verts[VERTS_PER_PRIM];  // scratch vertex; holds one extracted half at a time
+
+    for (uint32_t i = 0; i < VERTS_PER_PRIM; i += 1)
+    {
+        for (uint32_t j = 0; j < 4; j += 1)  // entries 0..3 of the vertex
+        {
+            verts[i][j] = _simd16_extract_ps(prim[i][j], 0);  // half selected by index 0 (presumably the lower lanes; confirm against _simd16_extract_ps)
+        }
+    }
+
+    pa.useAlternateOffset = false;  // cleared for the index-0 half
+    BinPoints(pDC, pa, workerId, verts, GetPrimMaskLo(primMask), _simd16_extract_si(primID, 0), _simd16_extract_si(viewportIdx, 0));  // issued unconditionally, even when the low mask is zero
+
+    if (GetPrimMaskHi(primMask))  // second call only when at least one of bits 8-15 is set
+    {
+        for (uint32_t i = 0; i < VERTS_PER_PRIM; i += 1)
+        {
+            for (uint32_t j = 0; j < 4; j += 1)
+            {
+                verts[i][j] = _simd16_extract_ps(prim[i][j], 1);  // half selected by index 1, overwriting the scratch vertex
+            }
+        }
+
+        pa.useAlternateOffset = true;  // set for the index-1 half; not reset before returning
+        BinPoints(pDC, pa, workerId, verts, GetPrimMaskHi(primMask), _simd16_extract_si(primID, 1), _simd16_extract_si(viewportIdx, 1));
+    }
+}
+
+#endif
 //////////////////////////////////////////////////////////////////////////
 /// @brief Bin SIMD lines to the backend.
 /// @param pDC - pointer to draw context.
@@ -1503,3 +1615,45 @@ void BinLines(
         primID,
         viewportIdx);
 }
+
+#if USE_SIMD16_FRONTEND
+void BinLines_simd16(  // bins a simd16 line batch by calling BinLines once per half
+    DRAW_CONTEXT *pDC,
+    PA_STATE& pa,  // useAlternateOffset is written here before each BinLines call
+    uint32_t workerId,
+    simd16vector prim[3],  // only prim[0] and prim[1] are read (VERTS_PER_PRIM == 2)
+    uint32_t primMask,  // bits 0-7 are passed to the first call, bits 8-15 to the second
+    simd16scalari primID,
+    simd16scalari viewportIdx)
+{
+    enum { VERTS_PER_PRIM = 2 };
+
+    simdvector verts[VERTS_PER_PRIM];  // scratch vertices; holds one extracted half at a time
+
+    for (uint32_t i = 0; i < VERTS_PER_PRIM; i += 1)
+    {
+        for (uint32_t j = 0; j < 4; j += 1)  // entries 0..3 of each vertex
+        {
+            verts[i][j] = _simd16_extract_ps(prim[i][j], 0);  // half selected by index 0 (presumably the lower lanes; confirm against _simd16_extract_ps)
+        }
+    }
+
+    pa.useAlternateOffset = false;  // cleared for the index-0 half
+    BinLines(pDC, pa, workerId, verts, GetPrimMaskLo(primMask), _simd16_extract_si(primID, 0), _simd16_extract_si(viewportIdx, 0));  // issued unconditionally, even when the low mask is zero
+
+    if (GetPrimMaskHi(primMask))  // second call only when at least one of bits 8-15 is set
+    {
+        for (uint32_t i = 0; i < VERTS_PER_PRIM; i += 1)
+        {
+            for (uint32_t j = 0; j < 4; j += 1)
+            {
+                verts[i][j] = _simd16_extract_ps(prim[i][j], 1);  // half selected by index 1, overwriting the scratch vertices
+            }
+        }
+
+        pa.useAlternateOffset = true;  // set for the index-1 half; not reset before returning
+        BinLines(pDC, pa, workerId, verts, GetPrimMaskHi(primMask), _simd16_extract_si(primID, 1), _simd16_extract_si(viewportIdx, 1));
+    }
+}
+
+#endif
index 9e919d3a252727e2c3d5b8be237c701b389cb8c4..6fc7e162b4f99c60388e6888d5cbf79997ecba49 100644 (file)
@@ -174,6 +174,7 @@ void ClipLines(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector pr
     clipper.ExecuteStage(pa, prims, primMask, primId, viewportIdx);
     AR_END(FEClipLines, 1);
 }
+
 void ClipPoints(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId, simdscalari viewportIdx)
 {
     SWR_CONTEXT *pContext = pDC->pContext;
@@ -183,3 +184,133 @@ void ClipPoints(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector p
     AR_END(FEClipPoints, 1);
 }
 
+#if USE_SIMD16_FRONTEND
+inline uint32_t GetPrimMaskLo(uint32_t primMask)  // returns bits 0-7 of primMask
+{
+    return primMask & 255;  // 255 == 0xFF: keep only the low eight mask bits
+}
+
+inline uint32_t GetPrimMaskHi(uint32_t primMask)  // returns bits 8-15 of primMask, moved down to bits 0-7
+{
+    return (primMask >> 8) & 255;  // shift first, then mask, so bits 16 and above are discarded
+}
+
+void ClipTriangles_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari primId, simd16scalari viewportIdx)  // runs the 3-vertex Clipper on a simd16 batch, one half per ExecuteStage call
+{
+    SWR_CONTEXT *pContext = pDC->pContext;  // not referenced by name below; presumably consumed by the AR_BEGIN/AR_END macros (confirm)
+    AR_BEGIN(FEClipTriangles, pDC->drawId);
+
+    enum { VERTS_PER_PRIM = 3 };
+
+    Clipper<VERTS_PER_PRIM> clipper(workerId, pDC);  // one clipper instance is reused for both halves
+
+    simdvector verts[VERTS_PER_PRIM];  // scratch vertices; holds one extracted half at a time
+
+    for (uint32_t i = 0; i < VERTS_PER_PRIM; i += 1)
+    {
+        for (uint32_t j = 0; j < 4; j += 1)  // entries 0..3 of each vertex
+        {
+            verts[i][j] = _simd16_extract_ps(prims[i][j], 0);  // half selected by index 0 (presumably the lower lanes; confirm against _simd16_extract_ps)
+        }
+    }
+
+    pa.useAlternateOffset = false;  // cleared for the index-0 half
+    clipper.ExecuteStage(pa, verts, GetPrimMaskLo(primMask), _simd16_extract_si(primId, 0), _simd16_extract_si(viewportIdx, 0));  // issued unconditionally, even when the low mask is zero
+
+    if (GetPrimMaskHi(primMask))  // second call only when at least one of bits 8-15 is set
+    {
+        for (uint32_t i = 0; i < VERTS_PER_PRIM; i += 1)
+        {
+            for (uint32_t j = 0; j < 4; j += 1)
+            {
+                verts[i][j] = _simd16_extract_ps(prims[i][j], 1);  // half selected by index 1, overwriting the scratch vertices
+            }
+        }
+
+        pa.useAlternateOffset = true;  // set for the index-1 half; not reset before returning
+        clipper.ExecuteStage(pa, verts, GetPrimMaskHi(primMask), _simd16_extract_si(primId, 1), _simd16_extract_si(viewportIdx, 1));
+    }
+
+    AR_END(FEClipTriangles, 1);  // closes the FEClipTriangles bucket opened by AR_BEGIN above
+}
+
+void ClipLines_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari primId, simd16scalari viewportIdx)  // runs the 2-vertex Clipper on a simd16 batch, one half per ExecuteStage call
+{
+    SWR_CONTEXT *pContext = pDC->pContext;  // not referenced by name below; presumably consumed by the AR_BEGIN/AR_END macros (confirm)
+    AR_BEGIN(FEClipLines, pDC->drawId);
+
+    enum { VERTS_PER_PRIM = 2 };
+
+    Clipper<VERTS_PER_PRIM> clipper(workerId, pDC);  // one clipper instance is reused for both halves
+
+    simdvector verts[VERTS_PER_PRIM];  // scratch vertices; holds one extracted half at a time
+
+    for (uint32_t i = 0; i < VERTS_PER_PRIM; i += 1)
+    {
+        for (uint32_t j = 0; j < 4; j += 1)  // entries 0..3 of each vertex
+        {
+            verts[i][j] = _simd16_extract_ps(prims[i][j], 0);  // half selected by index 0 (presumably the lower lanes; confirm against _simd16_extract_ps)
+        }
+    }
+
+    pa.useAlternateOffset = false;  // cleared for the index-0 half
+    clipper.ExecuteStage(pa, verts, GetPrimMaskLo(primMask), _simd16_extract_si(primId, 0), _simd16_extract_si(viewportIdx, 0));  // issued unconditionally, even when the low mask is zero
+
+    if (GetPrimMaskHi(primMask))  // second call only when at least one of bits 8-15 is set
+    {
+        for (uint32_t i = 0; i < VERTS_PER_PRIM; i += 1)
+        {
+            for (uint32_t j = 0; j < 4; j += 1)
+            {
+                verts[i][j] = _simd16_extract_ps(prims[i][j], 1);  // half selected by index 1, overwriting the scratch vertices
+            }
+        }
+
+        pa.useAlternateOffset = true;  // set for the index-1 half; not reset before returning
+        clipper.ExecuteStage(pa, verts, GetPrimMaskHi(primMask), _simd16_extract_si(primId, 1), _simd16_extract_si(viewportIdx, 1));
+    }
+
+    AR_END(FEClipLines, 1);  // closes the FEClipLines bucket opened by AR_BEGIN above
+}
+
+void ClipPoints_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari primId, simd16scalari viewportIdx)  // runs the 1-vertex Clipper on a simd16 batch, one half per ExecuteStage call
+{
+    SWR_CONTEXT *pContext = pDC->pContext;  // not referenced by name below; presumably consumed by the AR_BEGIN/AR_END macros (confirm)
+    AR_BEGIN(FEClipPoints, pDC->drawId);
+
+    enum { VERTS_PER_PRIM = 1 };
+
+    Clipper<VERTS_PER_PRIM> clipper(workerId, pDC);  // one clipper instance is reused for both halves
+
+    simdvector verts[VERTS_PER_PRIM];  // scratch vertex; holds one extracted half at a time
+
+    for (uint32_t i = 0; i < VERTS_PER_PRIM; i += 1)
+    {
+        for (uint32_t j = 0; j < 4; j += 1)  // entries 0..3 of the vertex
+        {
+            verts[i][j] = _simd16_extract_ps(prims[i][j], 0);  // half selected by index 0 (presumably the lower lanes; confirm against _simd16_extract_ps)
+        }
+    }
+
+    pa.useAlternateOffset = false;  // cleared for the index-0 half
+    clipper.ExecuteStage(pa, verts, GetPrimMaskLo(primMask), _simd16_extract_si(primId, 0), _simd16_extract_si(viewportIdx, 0));  // issued unconditionally, even when the low mask is zero
+
+    if (GetPrimMaskHi(primMask))  // second call only when at least one of bits 8-15 is set
+    {
+        for (uint32_t i = 0; i < VERTS_PER_PRIM; i += 1)
+        {
+            for (uint32_t j = 0; j < 4; j += 1)
+            {
+                verts[i][j] = _simd16_extract_ps(prims[i][j], 1);  // half selected by index 1, overwriting the scratch vertex
+            }
+        }
+
+        pa.useAlternateOffset = true;  // set for the index-1 half; not reset before returning
+        clipper.ExecuteStage(pa, verts, GetPrimMaskHi(primMask), _simd16_extract_si(primId, 1), _simd16_extract_si(viewportIdx, 1));
+    }
+
+    AR_END(FEClipPoints, 1);  // closes the FEClipPoints bucket opened by AR_BEGIN above
+}
+
+#endif
+
index 3a79d6a34c41b0da18b5fae55f8e9317082215b2..017f5e795c49b9aa7c4c785c1aabd470aa66f386 100644 (file)
@@ -969,3 +969,9 @@ private:
 void ClipTriangles(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId, simdscalari viewportIdx);
 void ClipLines(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId, simdscalari viewportIdx);
 void ClipPoints(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId, simdscalari viewportIdx);
+#if USE_SIMD16_FRONTEND
+void ClipTriangles_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari primId, simd16scalari viewportIdx);
+void ClipLines_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari primId, simd16scalari viewportIdx);
+void ClipPoints_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari primId, simd16scalari viewportIdx);
+#endif
+
index 9da7962826ca2fa1b100204f466889d1fbe3a7c2..b520df225b8ffb321c917ee9da39cd65a7d404af 100644 (file)
@@ -363,6 +363,9 @@ struct DRAW_STATE
     // pipeline function pointers, filled in by API thread when setting up the draw
     BACKEND_FUNCS backendFuncs;
     PFN_PROCESS_PRIMS pfnProcessPrims;
+#if USE_SIMD16_FRONTEND
+    PFN_PROCESS_PRIMS_SIMD16 pfnProcessPrims_simd16;
+#endif
 
     CachingArena* pArena;     // This should only be used by API thread.
 };
index eb52594af508514ce8003f5a873f3d8318116f0f..9df7eeadc1020b9cbde69eed2a76d280d70f1da7 100644 (file)
@@ -841,6 +841,20 @@ static void GeometryShaderStage(
     }
 
     // set up new binner and state for the GS output topology
+#if USE_SIMD16_FRONTEND
+    PFN_PROCESS_PRIMS_SIMD16 pfnClipFunc = nullptr;
+    if (HasRastT::value)
+    {
+        switch (pState->outputTopology)
+        {
+        case TOP_TRIANGLE_STRIP:    pfnClipFunc = ClipTriangles_simd16; break;
+        case TOP_LINE_STRIP:        pfnClipFunc = ClipLines_simd16; break;
+        case TOP_POINT_LIST:        pfnClipFunc = ClipPoints_simd16; break;
+        default: SWR_INVALID("Unexpected GS output topology: %d", pState->outputTopology);
+        }
+    }
+
+#else
     PFN_PROCESS_PRIMS pfnClipFunc = nullptr;
     if (HasRastT::value)
     {
@@ -853,6 +867,7 @@ static void GeometryShaderStage(
         }
     }
 
+#endif
     // foreach input prim:
     // - setup a new PA based on the emitted verts for that prim
     // - loop over the new verts, calling PA to assemble each prim
@@ -997,39 +1012,8 @@ static void GeometryShaderStage(
                                     vViewPortIdx = _simd16_set1_epi32(0);
                                 }
 
-                                const uint32_t primMask = GenMask(gsPa.NumPrims());
-                                const uint32_t primMask_lo = primMask & 255;
-                                const uint32_t primMask_hi = (primMask >> 8) & 255;
-
-                                const simd16scalari primID = vPrimId;
-                                const simdscalari primID_lo = _simd16_extract_si(primID, 0);
-                                const simdscalari primID_hi = _simd16_extract_si(primID, 1);
-
-                                for (uint32_t i = 0; i < 3; i += 1)
-                                {
-                                    for (uint32_t j = 0; j < 4; j += 1)
-                                    {
-                                        attrib[i][j] = _simd16_extract_ps(attrib_simd16[i][j], 0);
-                                    }
-                                }
-
                                 gsPa.useAlternateOffset = false;
-                                pfnClipFunc(pDC, gsPa, workerId, attrib, primMask_lo, primID_lo, _simd16_extract_si(vViewPortIdx, 0));
-
-                                if (primMask_hi)
-                                {
-                                    for (uint32_t i = 0; i < 3; i += 1)
-                                    {
-                                        for (uint32_t j = 0; j < 4; j += 1)
-                                        {
-                                            attrib[i][j] = _simd16_extract_ps(attrib_simd16[i][j], 1);
-                                        }
-                                    }
-
-                                    gsPa.useAlternateOffset = true;
-                                    pfnClipFunc(pDC, gsPa, workerId, attrib, primMask_hi, primID_hi, _simd16_extract_si(vViewPortIdx, 1));
-                                }
-
+                                pfnClipFunc(pDC, gsPa, workerId, attrib_simd16, GenMask(gsPa.NumPrims()), vPrimId, vViewPortIdx);
 #else
                                 simdscalari vPrimId;
                                 // pull primitiveID from the GS output if available
@@ -1202,6 +1186,20 @@ static void TessellationStages(
     }
     SWR_ASSERT(tsCtx);
 
+#if USE_SIMD16_FRONTEND
+    PFN_PROCESS_PRIMS_SIMD16 pfnClipFunc = nullptr;
+    if (HasRastT::value)
+    {
+        switch (tsState.postDSTopology)
+        {
+        case TOP_TRIANGLE_LIST: pfnClipFunc = ClipTriangles_simd16; break;
+        case TOP_LINE_LIST:     pfnClipFunc = ClipLines_simd16; break;
+        case TOP_POINT_LIST:    pfnClipFunc = ClipPoints_simd16; break;
+        default: SWR_INVALID("Unexpected DS output topology: %d", tsState.postDSTopology);
+        }
+    }
+
+#else
     PFN_PROCESS_PRIMS pfnClipFunc = nullptr;
     if (HasRastT::value)
     {
@@ -1214,6 +1212,7 @@ static void TessellationStages(
         }
     }
 
+#endif
     SWR_HS_CONTEXT& hsContext = gt_pTessellationThreadData->hsContext;
     hsContext.pCPout = gt_pTessellationThreadData->patchData;
     hsContext.PrimitiveID = primID;
@@ -1408,30 +1407,8 @@ static void TessellationStages(
 
                     SWR_ASSERT(pfnClipFunc);
 #if USE_SIMD16_FRONTEND
-                    for (uint32_t i = 0; i < 3; i += 1)
-                    {
-                        for (uint32_t j = 0; j < 4; j += 1)
-                        {
-                            prim[i][j] = _simd16_extract_ps(prim_simd16[i][j], 0);
-                        }
-                    }
-
                     tessPa.useAlternateOffset = false;
-                    pfnClipFunc(pDC, tessPa, workerId, prim, primMask_lo, primID_lo, _simd_set1_epi32(0));
-
-                    if (primMask_hi)
-                    {
-                        for (uint32_t i = 0; i < 3; i += 1)
-                        {
-                            for (uint32_t j = 0; j < 4; j += 1)
-                            {
-                                prim[i][j] = _simd16_extract_ps(prim_simd16[i][j], 1);
-                            }
-                        }
-
-                        tessPa.useAlternateOffset = true;
-                        pfnClipFunc(pDC, tessPa, workerId, prim, primMask_hi, primID_hi, _simd_set1_epi32(0));
-                    }
+                    pfnClipFunc(pDC, tessPa, workerId, prim_simd16, primMask, primID, _simd16_set1_epi32(0));
 #else
                     pfnClipFunc(pDC, tessPa, workerId, prim,
                         GenMask(tessPa.NumPrims()), _simd_set1_epi32(dsContext.PrimitiveID), _simd_set1_epi32(0));
@@ -1791,34 +1768,10 @@ void ProcessDraw(
 
                                 if (HasRastT::value)
                                 {
-                                    SWR_ASSERT(pDC->pState->pfnProcessPrims);
-
-                                    simdvector prim[MAX_NUM_VERTS_PER_PRIM];
-
-                                    for (uint32_t i = 0; i < 3; i += 1)
-                                    {
-                                        for (uint32_t j = 0; j < 4; j += 1)
-                                        {
-                                            prim[i][j] = _simd16_extract_ps(prim_simd16[i][j], 0);
-                                        }
-                                    }
+                                    SWR_ASSERT(pDC->pState->pfnProcessPrims_simd16);
 
                                     pa.useAlternateOffset = false;
-                                    pDC->pState->pfnProcessPrims(pDC, pa, workerId, prim, primMask_lo, primID_lo, _simd_setzero_si());
-
-                                    if (primMask_hi)
-                                    {
-                                        for (uint32_t i = 0; i < 3; i += 1)
-                                        {
-                                            for (uint32_t j = 0; j < 4; j += 1)
-                                            {
-                                                prim[i][j] = _simd16_extract_ps(prim_simd16[i][j], 1);
-                                            }
-                                        }
-
-                                        pa.useAlternateOffset = true;
-                                        pDC->pState->pfnProcessPrims(pDC, pa, workerId, prim, primMask_hi, primID_hi, _simd_setzero_si());
-                                    }
+                                    pDC->pState->pfnProcessPrims_simd16(pDC, pa, workerId, prim_simd16, primMask, primID, _simd16_setzero_si());
                                 }
                             }
                         }
index 58d6901a819182dbbc660a1bd96cbfcc0d1f889e..37b7215c5162661c3f142b7043d6c07ba449d077 100644 (file)
@@ -315,8 +315,15 @@ void ProcessSync(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, vo
 void ProcessShutdown(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData);
 
 PFN_PROCESS_PRIMS GetBinTrianglesFunc(bool IsConservative);
+#if USE_SIMD16_FRONTEND
+PFN_PROCESS_PRIMS_SIMD16 GetBinTrianglesFunc_simd16(bool IsConservative);
+#endif
 
 struct PA_STATE_BASE;  // forward decl
 void BinPoints(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[3], uint32_t primMask, simdscalari primID, simdscalari viewportIdx);
 void BinLines(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[3], uint32_t primMask, simdscalari primID, simdscalari viewportIdx);
+#if USE_SIMD16_FRONTEND
+void BinPoints_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[3], uint32_t primMask, simd16scalari primID, simd16scalari viewportIdx);
+void BinLines_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[3], uint32_t primMask, simd16scalari primID, simd16scalari viewportIdx);
+#endif
 
index 511a1fc0df30076299c91dfb9870cda595e17255..3e3b7abab534d62ed372603b8e474b88cbc5c343 100644 (file)
@@ -1228,7 +1228,11 @@ bool PaTriFan1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
     simdvector a;
     simdvector b;
 
+#if 1
+    const simd16vector &leadvert_16 = PaGetSimdVector_simd16(pa, pa.first, slot);
+#else
     const simd16vector &leadvert_16 = pa.leadingVertex.attrib[slot];
+#endif
 
     if (!pa.useAlternateOffset)
     {
@@ -1298,7 +1302,11 @@ bool PaTriFan0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
 bool PaTriFan1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
 {
 #if USE_SIMD16_FRONTEND
+#if 1
+    const simd16vector &a = PaGetSimdVector_simd16(pa, pa.first, slot);
+#else
     const simd16vector &a = pa.leadingVertex.attrib[slot];
+#endif
 #else
     simd16vector a;
 
@@ -1345,7 +1353,11 @@ bool PaTriFan1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
 void PaTriFanSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[])
 {
 #if USE_SIMD16_FRONTEND
+#if 1
+    const simd16vector &a = PaGetSimdVector_simd16(pa, pa.first, slot);
+#else
     const simd16vector &a = pa.leadingVertex.attrib[slot];
+#endif
     const simd16vector &b = PaGetSimdVector_simd16(pa, pa.prev, slot);
     const simd16vector &c = PaGetSimdVector_simd16(pa, pa.cur, slot);