swr/rast: Switch intrinsic usage to SIMDLib
[mesa.git] / src / gallium / drivers / swr / rasterizer / core / binner.cpp
index 29d2f1ce42ce49a1d151073c288f2e4bad268e12..34789cf03566c566f995f7633bd7150343018ffc 100644 (file)
@@ -88,7 +88,7 @@ INLINE void ProcessAttributes(
             inputSlot = backendState.vertexAttribOffset + i;
         }
 
-        __m128 attrib[3];    // triangle attribs (always 4 wide)
+        simd4scalar attrib[3];    // triangle attribs (always 4 wide)
         float* pAttribStart = pBuffer;
 
         if (HasConstantInterpT::value || IsDegenerate::value)
@@ -128,7 +128,7 @@ INLINE void ProcessAttributes(
 
                 for (uint32_t i = 0; i < NumVertsT::value; ++i)
                 {
-                    _mm_store_ps(pBuffer, attrib[vid]);
+                    SIMD128::store_ps(pBuffer, attrib[vid]);
                     pBuffer += 4;
                 }
             }
@@ -138,7 +138,7 @@ INLINE void ProcessAttributes(
 
                 for (uint32_t i = 0; i < NumVertsT::value; ++i)
                 {
-                    _mm_store_ps(pBuffer, attrib[i]);
+                    SIMD128::store_ps(pBuffer, attrib[i]);
                     pBuffer += 4;
                 }
             }
@@ -149,7 +149,7 @@ INLINE void ProcessAttributes(
 
             for (uint32_t i = 0; i < NumVertsT::value; ++i)
             {
-                _mm_store_ps(pBuffer, attrib[i]);
+                SIMD128::store_ps(pBuffer, attrib[i]);
                 pBuffer += 4;
             }
         }
@@ -160,7 +160,7 @@ INLINE void ProcessAttributes(
         // effect of the missing vertices in the triangle interpolation.
         for (uint32_t v = NumVertsT::value; v < 3; ++v)
         {
-            _mm_store_ps(pBuffer, attrib[NumVertsT::value - 1]);
+            SIMD128::store_ps(pBuffer, attrib[NumVertsT::value - 1]);
             pBuffer += 4;
         }
 
@@ -279,8 +279,7 @@ struct GatherScissors_simd16<16>
 {
     static void Gather(const SWR_RECT* pScissorsInFixedPoint, const uint32_t* pViewportIndex,
         simd16scalari &scisXmin, simd16scalari &scisYmin,
-        simd16scalari &scisXmax, simd16scalari &scisYmax)
-    {
+        simd16scalari &scisXmax, simd16scalari &scisYmax) {
         scisXmin = _simd16_set_epi32(pScissorsInFixedPoint[pViewportIndex[0]].xmin,
             pScissorsInFixedPoint[pViewportIndex[1]].xmin,
             pScissorsInFixedPoint[pViewportIndex[2]].xmin,
@@ -390,14 +389,14 @@ void ProcessUserClipDist(PA_STATE& pa, uint32_t primIndex, uint8_t clipDistMask,
         uint32_t clipAttribSlot = clipSlot == 0 ?
             VERTEX_CLIPCULL_DIST_LO_SLOT : VERTEX_CLIPCULL_DIST_HI_SLOT;
 
-        __m128 primClipDist[3];
+        simd4scalar primClipDist[3];
         pa.AssembleSingle(clipAttribSlot, primIndex, primClipDist);
 
         float vertClipDist[NumVerts];
         for (uint32_t e = 0; e < NumVerts; ++e)
         {
             OSALIGNSIMD(float) aVertClipDist[4];
-            _mm_store_ps(aVertClipDist, primClipDist[e]);
+            SIMD128::store_ps(aVertClipDist, primClipDist[e]);
             vertClipDist[e] = aVertClipDist[clipComp];
         };
 
@@ -625,13 +624,14 @@ void BinTriangles(
             (SWR_INPUT_COVERAGE)pDC->pState->state.psState.inputCoverage, EdgeValToEdgeState(ALL_EDGES_VALID), (state.scissorsTileAligned == false));
     }
 
+    simdBBox bbox;
+
     if (!triMask)
     {
         goto endBinTriangles;
     }
 
     // Calc bounding box of triangles
-    simdBBox bbox;
     calcBoundingBoxIntVertical<CT>(tri, vXi, vYi, bbox);
 
     // determine if triangle falls between pixel centers and discard
@@ -673,28 +673,30 @@ void BinTriangles(
     // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
     // Gather the AOS effective scissor rects based on the per-prim VP index.
     /// @todo:  Look at speeding this up -- weigh against corresponding costs in rasterizer.
-    simdscalari scisXmin, scisYmin, scisXmax, scisYmax;
-    if (state.backendState.readViewportArrayIndex)
-    {
-        GatherScissors<KNOB_SIMD_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
-            scisXmin, scisYmin, scisXmax, scisYmax);
-    }
-    else // broadcast fast path for non-VPAI case.
     {
-        scisXmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmin);
-        scisYmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymin);
-        scisXmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmax);
-        scisYmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymax);
-    }
+        simdscalari scisXmin, scisYmin, scisXmax, scisYmax;
+        if (state.backendState.readViewportArrayIndex)
+        {
+            GatherScissors<KNOB_SIMD_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
+                scisXmin, scisYmin, scisXmax, scisYmax);
+        }
+        else // broadcast fast path for non-VPAI case.
+        {
+            scisXmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmin);
+            scisYmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymin);
+            scisXmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmax);
+            scisYmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymax);
+        }
 
-    // Make triangle bbox inclusive
-    bbox.xmax = _simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1));
-    bbox.ymax = _simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1));
+        // Make triangle bbox inclusive
+        bbox.xmax = _simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1));
+        bbox.ymax = _simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1));
 
-    bbox.xmin = _simd_max_epi32(bbox.xmin, scisXmin);
-    bbox.ymin = _simd_max_epi32(bbox.ymin, scisYmin);
-    bbox.xmax = _simd_min_epi32(bbox.xmax, scisXmax);
-    bbox.ymax = _simd_min_epi32(bbox.ymax, scisYmax);
+        bbox.xmin = _simd_max_epi32(bbox.xmin, scisXmin);
+        bbox.ymin = _simd_max_epi32(bbox.ymin, scisYmin);
+        bbox.xmax = _simd_min_epi32(bbox.xmax, scisXmax);
+        bbox.ymax = _simd_min_epi32(bbox.ymax, scisYmax);
+    }
 
     if (CT::IsConservativeT::value)
     {
@@ -768,7 +770,7 @@ endBinTriangles:
 
     // transpose verts needed for backend
     /// @todo modify BE to take non-transformed verts
-    __m128 vHorizX[8], vHorizY[8], vHorizZ[8], vHorizW[8];
+    simd4scalar vHorizX[8], vHorizY[8], vHorizZ[8], vHorizW[8];
     vTranspose3x8(vHorizX, tri[0].x, tri[1].x, tri[2].x);
     vTranspose3x8(vHorizY, tri[0].y, tri[1].y, tri[2].y);
     vTranspose3x8(vHorizZ, tri[0].z, tri[1].z, tri[2].z);
@@ -837,10 +839,10 @@ endBinTriangles:
         // store triangle vertex data
         desc.pTriBuffer = (float*)pArena->AllocAligned(4 * 4 * sizeof(float), 16);
 
-        _mm_store_ps(&desc.pTriBuffer[0], vHorizX[triIndex]);
-        _mm_store_ps(&desc.pTriBuffer[4], vHorizY[triIndex]);
-        _mm_store_ps(&desc.pTriBuffer[8], vHorizZ[triIndex]);
-        _mm_store_ps(&desc.pTriBuffer[12], vHorizW[triIndex]);
+        SIMD128::store_ps(&desc.pTriBuffer[0], vHorizX[triIndex]);
+        SIMD128::store_ps(&desc.pTriBuffer[4], vHorizY[triIndex]);
+        SIMD128::store_ps(&desc.pTriBuffer[8], vHorizZ[triIndex]);
+        SIMD128::store_ps(&desc.pTriBuffer[12], vHorizW[triIndex]);
 
         // store user clip distances
         if (rastState.clipDistanceMask)
@@ -870,7 +872,7 @@ endBinTriangles:
 
 #if USE_SIMD16_FRONTEND
 template <typename CT>
-void SIMDAPI BinTriangles_simd16(
+void SIMDCALL BinTriangles_simd16(
     DRAW_CONTEXT *pDC,
     PA_STATE& pa,
     uint32_t workerId,
@@ -1124,29 +1126,31 @@ void SIMDAPI BinTriangles_simd16(
     // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
     // Gather the AOS effective scissor rects based on the per-prim VP index.
     /// @todo:  Look at speeding this up -- weigh against corresponding costs in rasterizer.
-    simd16scalari scisXmin, scisYmin, scisXmax, scisYmax;
-
-    if (state.backendState.readViewportArrayIndex)
-    {
-        GatherScissors_simd16<KNOB_SIMD16_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
-            scisXmin, scisYmin, scisXmax, scisYmax);
-    }
-    else // broadcast fast path for non-VPAI case.
     {
-        scisXmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmin);
-        scisYmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymin);
-        scisXmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmax);
-        scisYmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymax);
-    }
+        simd16scalari scisXmin, scisYmin, scisXmax, scisYmax;
+
+        if (state.backendState.readViewportArrayIndex)
+        {
+            GatherScissors_simd16<KNOB_SIMD16_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
+                scisXmin, scisYmin, scisXmax, scisYmax);
+        }
+        else // broadcast fast path for non-VPAI case.
+        {
+            scisXmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmin);
+            scisYmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymin);
+            scisXmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmax);
+            scisYmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymax);
+        }
 
-    // Make triangle bbox inclusive
-    bbox.xmax = _simd16_sub_epi32(bbox.xmax, _simd16_set1_epi32(1));
-    bbox.ymax = _simd16_sub_epi32(bbox.ymax, _simd16_set1_epi32(1));
+        // Make triangle bbox inclusive
+        bbox.xmax = _simd16_sub_epi32(bbox.xmax, _simd16_set1_epi32(1));
+        bbox.ymax = _simd16_sub_epi32(bbox.ymax, _simd16_set1_epi32(1));
 
-    bbox.xmin = _simd16_max_epi32(bbox.xmin, scisXmin);
-    bbox.ymin = _simd16_max_epi32(bbox.ymin, scisYmin);
-    bbox.xmax = _simd16_min_epi32(bbox.xmax, scisXmax);
-    bbox.ymax = _simd16_min_epi32(bbox.ymax, scisYmax);
+        bbox.xmin = _simd16_max_epi32(bbox.xmin, scisXmin);
+        bbox.ymin = _simd16_max_epi32(bbox.ymin, scisYmin);
+        bbox.xmax = _simd16_min_epi32(bbox.xmax, scisXmax);
+        bbox.ymax = _simd16_min_epi32(bbox.ymax, scisYmax);
+    }
 
     if (CT::IsConservativeT::value)
     {
@@ -1221,10 +1225,10 @@ endBinTriangles:
 
     // transpose verts needed for backend
     /// @todo modify BE to take non-transformed verts
-    __m128 vHorizX[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
-    __m128 vHorizY[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
-    __m128 vHorizZ[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
-    __m128 vHorizW[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
+    simd4scalar vHorizX[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
+    simd4scalar vHorizY[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
+    simd4scalar vHorizZ[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
+    simd4scalar vHorizW[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
 
     vTranspose3x8(vHorizX[0], _simd16_extract_ps(tri[0].x, 0), _simd16_extract_ps(tri[1].x, 0), _simd16_extract_ps(tri[2].x, 0));
     vTranspose3x8(vHorizY[0], _simd16_extract_ps(tri[0].y, 0), _simd16_extract_ps(tri[1].y, 0), _simd16_extract_ps(tri[2].y, 0));
@@ -1547,24 +1551,26 @@ void BinPostSetupPoints(
         // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
         // Gather the AOS effective scissor rects based on the per-prim VP index.
         /// @todo:  Look at speeding this up -- weigh against corresponding costs in rasterizer.
-        simdscalari scisXmin, scisYmin, scisXmax, scisYmax;
-        if (state.backendState.readViewportArrayIndex)
-        {
-            GatherScissors<KNOB_SIMD_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
-                scisXmin, scisYmin, scisXmax, scisYmax);
-        }
-        else // broadcast fast path for non-VPAI case.
         {
-            scisXmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmin);
-            scisYmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymin);
-            scisXmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmax);
-            scisYmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymax);
-        }
+            simdscalari scisXmin, scisYmin, scisXmax, scisYmax;
+            if (state.backendState.readViewportArrayIndex)
+            {
+                GatherScissors<KNOB_SIMD_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
+                    scisXmin, scisYmin, scisXmax, scisYmax);
+            }
+            else // broadcast fast path for non-VPAI case.
+            {
+                scisXmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmin);
+                scisYmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymin);
+                scisXmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmax);
+                scisYmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymax);
+            }
 
-        bbox.xmin = _simd_max_epi32(bbox.xmin, scisXmin);
-        bbox.ymin = _simd_max_epi32(bbox.ymin, scisYmin);
-        bbox.xmax = _simd_min_epi32(_simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1)), scisXmax);
-        bbox.ymax = _simd_min_epi32(_simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1)), scisYmax);
+            bbox.xmin = _simd_max_epi32(bbox.xmin, scisXmin);
+            bbox.ymin = _simd_max_epi32(bbox.ymin, scisYmin);
+            bbox.xmax = _simd_min_epi32(_simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1)), scisXmax);
+            bbox.ymax = _simd_min_epi32(_simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1)), scisYmax);
+        }
 
         // Cull bloated points completely outside scissor
         simdscalari maskOutsideScissorX = _simd_cmpgt_epi32(bbox.xmin, bbox.xmax);
@@ -1934,24 +1940,26 @@ void BinPostSetupPoints_simd16(
         // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
         // Gather the AOS effective scissor rects based on the per-prim VP index.
         /// @todo:  Look at speeding this up -- weigh against corresponding costs in rasterizer.
-        simd16scalari scisXmin, scisYmin, scisXmax, scisYmax;
-        if (state.backendState.readViewportArrayIndex)
         {
-            GatherScissors_simd16<KNOB_SIMD16_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
-                scisXmin, scisYmin, scisXmax, scisYmax);
-        }
-        else // broadcast fast path for non-VPAI case.
-        {
-            scisXmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmin);
-            scisYmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymin);
-            scisXmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmax);
-            scisYmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymax);
-        }
+            simd16scalari scisXmin, scisYmin, scisXmax, scisYmax;
+            if (state.backendState.readViewportArrayIndex)
+            {
+                GatherScissors_simd16<KNOB_SIMD16_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
+                    scisXmin, scisYmin, scisXmax, scisYmax);
+            }
+            else // broadcast fast path for non-VPAI case.
+            {
+                scisXmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmin);
+                scisYmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymin);
+                scisXmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmax);
+                scisYmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymax);
+            }
 
-        bbox.xmin = _simd16_max_epi32(bbox.xmin, scisXmin);
-        bbox.ymin = _simd16_max_epi32(bbox.ymin, scisYmin);
-        bbox.xmax = _simd16_min_epi32(_simd16_sub_epi32(bbox.xmax, _simd16_set1_epi32(1)), scisXmax);
-        bbox.ymax = _simd16_min_epi32(_simd16_sub_epi32(bbox.ymax, _simd16_set1_epi32(1)), scisYmax);
+            bbox.xmin = _simd16_max_epi32(bbox.xmin, scisXmin);
+            bbox.ymin = _simd16_max_epi32(bbox.ymin, scisYmin);
+            bbox.xmax = _simd16_min_epi32(_simd16_sub_epi32(bbox.xmax, _simd16_set1_epi32(1)), scisXmax);
+            bbox.ymax = _simd16_min_epi32(_simd16_sub_epi32(bbox.ymax, _simd16_set1_epi32(1)), scisYmax);
+        }
 
         // Cull bloated points completely outside scissor
         simd16scalari maskOutsideScissorX = _simd16_cmpgt_epi32(bbox.xmin, bbox.xmax);
@@ -2071,7 +2079,7 @@ void BinPostSetupPoints_simd16(
     AR_END(FEBinPoints, 1);
 }
 
-void SIMDAPI BinPoints_simd16(
+void SIMDCALL BinPoints_simd16(
     DRAW_CONTEXT *pDC,
     PA_STATE& pa,
     uint32_t workerId,
@@ -2168,6 +2176,8 @@ void BinPostSetupLines(
     simdscalar& vRecipW0 = recipW[0];
     simdscalar& vRecipW1 = recipW[1];
 
+    simd4scalar vHorizX[8], vHorizY[8], vHorizZ[8], vHorizW[8];
+
     // convert to fixed point
     simdscalari vXi[2], vYi[2];
     vXi[0] = fpToFixedPointVertical(prim[0].x);
@@ -2214,24 +2224,26 @@ void BinPostSetupLines(
     bbox.ymax = _simd_blendv_epi32(bloatBox.ymax, bbox.ymax, vYmajorMask);
 
     // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
-    simdscalari scisXmin, scisYmin, scisXmax, scisYmax;
-    if (state.backendState.readViewportArrayIndex)
-    {
-        GatherScissors<KNOB_SIMD_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
-            scisXmin, scisYmin, scisXmax, scisYmax);
-    }
-    else // broadcast fast path for non-VPAI case.
     {
-        scisXmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmin);
-        scisYmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymin);
-        scisXmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmax);
-        scisYmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymax);
-    }
+        simdscalari scisXmin, scisYmin, scisXmax, scisYmax;
+        if (state.backendState.readViewportArrayIndex)
+        {
+            GatherScissors<KNOB_SIMD_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
+                scisXmin, scisYmin, scisXmax, scisYmax);
+        }
+        else // broadcast fast path for non-VPAI case.
+        {
+            scisXmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmin);
+            scisYmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymin);
+            scisXmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmax);
+            scisYmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymax);
+        }
 
-    bbox.xmin = _simd_max_epi32(bbox.xmin, scisXmin);
-    bbox.ymin = _simd_max_epi32(bbox.ymin, scisYmin);
-    bbox.xmax = _simd_min_epi32(_simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1)), scisXmax);
-    bbox.ymax = _simd_min_epi32(_simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1)), scisYmax);
+        bbox.xmin = _simd_max_epi32(bbox.xmin, scisXmin);
+        bbox.ymin = _simd_max_epi32(bbox.ymin, scisYmin);
+        bbox.xmax = _simd_min_epi32(_simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1)), scisXmax);
+        bbox.ymax = _simd_min_epi32(_simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1)), scisYmax);
+    }
 
     // Cull prims completely outside scissor
     {
@@ -2261,7 +2273,6 @@ void BinPostSetupLines(
 
     // transpose verts needed for backend
     /// @todo modify BE to take non-transformed verts
-    __m128 vHorizX[8], vHorizY[8], vHorizZ[8], vHorizW[8];
     vTranspose3x8(vHorizX, prim[0].x, prim[1].x, vUnused);
     vTranspose3x8(vHorizY, prim[0].y, prim[1].y, vUnused);
     vTranspose3x8(vHorizZ, prim[0].z, prim[1].z, vUnused);
@@ -2310,10 +2321,10 @@ void BinPostSetupLines(
 
         // store line vertex data
         desc.pTriBuffer = (float*)pArena->AllocAligned(4 * 4 * sizeof(float), 16);
-        _mm_store_ps(&desc.pTriBuffer[0], vHorizX[primIndex]);
-        _mm_store_ps(&desc.pTriBuffer[4], vHorizY[primIndex]);
-        _mm_store_ps(&desc.pTriBuffer[8], vHorizZ[primIndex]);
-        _mm_store_ps(&desc.pTriBuffer[12], vHorizW[primIndex]);
+        SIMD128::store_ps(&desc.pTriBuffer[0], vHorizX[primIndex]);
+        SIMD128::store_ps(&desc.pTriBuffer[4], vHorizY[primIndex]);
+        SIMD128::store_ps(&desc.pTriBuffer[8], vHorizZ[primIndex]);
+        SIMD128::store_ps(&desc.pTriBuffer[12], vHorizW[primIndex]);
 
         // store user clip distances
         if (rastState.clipDistanceMask)
@@ -2417,25 +2428,27 @@ void BinPostSetupLines_simd16(
     bbox.ymax = _simd16_blendv_epi32(bloatBox.ymax, bbox.ymax, vYmajorMask);
 
     // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
-    simd16scalari scisXmin, scisYmin, scisXmax, scisYmax;
-
-    if (state.backendState.readViewportArrayIndex)
-    {
-        GatherScissors_simd16<KNOB_SIMD16_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
-            scisXmin, scisYmin, scisXmax, scisYmax);
-    }
-    else // broadcast fast path for non-VPAI case.
     {
-        scisXmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmin);
-        scisYmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymin);
-        scisXmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmax);
-        scisYmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymax);
-    }
+        simd16scalari scisXmin, scisYmin, scisXmax, scisYmax;
+
+        if (state.backendState.readViewportArrayIndex)
+        {
+            GatherScissors_simd16<KNOB_SIMD16_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
+                scisXmin, scisYmin, scisXmax, scisYmax);
+        }
+        else // broadcast fast path for non-VPAI case.
+        {
+            scisXmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmin);
+            scisYmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymin);
+            scisXmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmax);
+            scisYmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymax);
+        }
 
-    bbox.xmin = _simd16_max_epi32(bbox.xmin, scisXmin);
-    bbox.ymin = _simd16_max_epi32(bbox.ymin, scisYmin);
-    bbox.xmax = _simd16_min_epi32(_simd16_sub_epi32(bbox.xmax, _simd16_set1_epi32(1)), scisXmax);
-    bbox.ymax = _simd16_min_epi32(_simd16_sub_epi32(bbox.ymax, _simd16_set1_epi32(1)), scisYmax);
+        bbox.xmin = _simd16_max_epi32(bbox.xmin, scisXmin);
+        bbox.ymin = _simd16_max_epi32(bbox.ymin, scisYmin);
+        bbox.xmax = _simd16_min_epi32(_simd16_sub_epi32(bbox.xmax, _simd16_set1_epi32(1)), scisXmax);
+        bbox.ymax = _simd16_min_epi32(_simd16_sub_epi32(bbox.ymax, _simd16_set1_epi32(1)), scisYmax);
+    }
 
     // Cull prims completely outside scissor
     {
@@ -2468,10 +2481,10 @@ void BinPostSetupLines_simd16(
 
     // transpose verts needed for backend
     /// @todo modify BE to take non-transformed verts
-    __m128 vHorizX[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
-    __m128 vHorizY[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
-    __m128 vHorizZ[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
-    __m128 vHorizW[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
+    simd4scalar vHorizX[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
+    simd4scalar vHorizY[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
+    simd4scalar vHorizZ[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
+    simd4scalar vHorizW[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
 
     vTranspose3x8(vHorizX[0], _simd16_extract_ps(prim[0].x, 0), _simd16_extract_ps(prim[1].x, 0), unused);
     vTranspose3x8(vHorizY[0], _simd16_extract_ps(prim[0].y, 0), _simd16_extract_ps(prim[1].y, 0), unused);
@@ -2650,7 +2663,7 @@ void BinLines(
 }
 
 #if USE_SIMD16_FRONTEND
-void SIMDAPI BinLines_simd16(
+void SIMDCALL BinLines_simd16(
     DRAW_CONTEXT *pDC,
     PA_STATE& pa,
     uint32_t workerId,