swr/rast: Make SIMDLib templated types easier to use

author George Kyriazis <george.kyriazis@intel.com>
Wed, 7 Feb 2018 22:51:41 +0000 (16:51 -0600)
committer George Kyriazis <george.kyriazis@intel.com>
Fri, 16 Feb 2018 16:54:01 +0000 (10:54 -0600)
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib.hpp b/src/gallium/drivers/swr/rasterizer/common/simdlib.hpp

index 500cf8a87e3d6c8a9985a4b3c3e81bcc88b425b3..4114645d92e90be0d66a1eff388f28335e6a4069 100644 (file)
--- a/src/gallium/drivers/swr/rasterizer/common/simdlib.hpp
+++ b/src/gallium/drivers/swr/rasterizer/common/simdlib.hpp
@@ -571,3 +571,12 @@ struct SIMDBase : Traits::IsaImpl
  using SIMD128 = SIMDBase<SIMDImpl::SIMD128Impl::Traits>;
  using SIMD256 = SIMDBase<SIMDImpl::SIMD256Impl::Traits>;
  using SIMD512 = SIMDBase<SIMDImpl::SIMD512Impl::Traits>;
+
+template <typename SIMD_T> using CompareType    = typename SIMD_T::CompareType;
+template <typename SIMD_T> using ScaleFactor    = typename SIMD_T::ScaleFactor;
+template <typename SIMD_T> using RoundMode      = typename SIMD_T::RoundMode;
+template <typename SIMD_T> using Float          = typename SIMD_T::Float;
+template <typename SIMD_T> using Double         = typename SIMD_T::Double;
+template <typename SIMD_T> using Integer        = typename SIMD_T::Integer;
+template <typename SIMD_T> using Vec4           = typename SIMD_T::Vec4;
+template <typename SIMD_T> using Mask           = typename SIMD_T::Mask;
diff --git a/src/gallium/drivers/swr/rasterizer/core/binner.cpp b/src/gallium/drivers/swr/rasterizer/core/binner.cpp

index 8447bc4dc292a7488a92caaa408b8ebcf4e89281..3b093cefc04f5a41c1584fdf97409124d7d492af 100644 (file)
--- a/src/gallium/drivers/swr/rasterizer/core/binner.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/binner.cpp
@@ -41,23 +41,23 @@ void BinPostSetupLinesImpl(
      DRAW_CONTEXT *pDC,
      PA_STATE &pa,
      uint32_t workerId,
-    typename SIMD_T::Vec4 prim[],
-    typename SIMD_T::Float recipW[],
+    Vec4<SIMD_T> prim[],
+    Float<SIMD_T> recipW[],
      uint32_t primMask,
-    typename SIMD_T::Integer const &primID,
-    typename SIMD_T::Integer const &viewportIdx,
-    typename SIMD_T::Integer const &rtIdx);
+    Integer<SIMD_T> const &primID,
+    Integer<SIMD_T> const &viewportIdx,
+    Integer<SIMD_T> const &rtIdx);
  
  template <typename SIMD_T, uint32_t SIMD_WIDTH>
  void BinPostSetupPointsImpl(
      DRAW_CONTEXT *pDC,
      PA_STATE &pa,
      uint32_t workerId,
-    typename SIMD_T::Vec4 prim[],
+    Vec4<SIMD_T> prim[],
      uint32_t primMask,
-    typename SIMD_T::Integer const &primID,
-    typename SIMD_T::Integer const &viewportIdx,
-    typename SIMD_T::Integer const &rtIdx);
+    Integer<SIMD_T> const &primID,
+    Integer<SIMD_T> const &viewportIdx,
+    Integer<SIMD_T> const &rtIdx);
  
  //////////////////////////////////////////////////////////////////////////
  /// @brief Processes attributes for the backend based on linkage mask and
@@ -327,34 +327,34 @@ struct EarlyRastHelper<SIMD512>
  template <typename SIMD_T, uint32_t SIMD_WIDTH, typename CT>
  uint32_t SIMDCALL EarlyRasterizer(
          SIMDBBOX_T<SIMD_T> &er_bbox,
-        typename SIMD_T::Integer (&vAi)[3],
-        typename SIMD_T::Integer (&vBi)[3],
-        typename SIMD_T::Integer (&vXi)[3],
-        typename SIMD_T::Integer (&vYi)[3],
+        Integer<SIMD_T> (&vAi)[3],
+        Integer<SIMD_T> (&vBi)[3],
+        Integer<SIMD_T> (&vXi)[3],
+        Integer<SIMD_T> (&vYi)[3],
          uint32_t cwTrisMask,
          uint32_t triMask,
          uint32_t oneTileMask)
  {
      // step to pixel center of top-left pixel of the triangle bbox
-    typename SIMD_T::Integer vTopLeftX = SIMD_T::template slli_epi32<ER_SIMD_TILE_X_SHIFT + FIXED_POINT_SHIFT>(er_bbox.xmin);
+    Integer<SIMD_T> vTopLeftX = SIMD_T::template slli_epi32<ER_SIMD_TILE_X_SHIFT + FIXED_POINT_SHIFT>(er_bbox.xmin);
      vTopLeftX = SIMD_T::add_epi32(vTopLeftX, SIMD_T::set1_epi32(FIXED_POINT_SCALE / 2));
  
-    typename SIMD_T::Integer vTopLeftY = SIMD_T::template slli_epi32<ER_SIMD_TILE_Y_SHIFT + FIXED_POINT_SHIFT>(er_bbox.ymin);
+    Integer<SIMD_T> vTopLeftY = SIMD_T::template slli_epi32<ER_SIMD_TILE_Y_SHIFT + FIXED_POINT_SHIFT>(er_bbox.ymin);
      vTopLeftY = SIMD_T::add_epi32(vTopLeftY, SIMD_T::set1_epi32(FIXED_POINT_SCALE / 2));
  
      // negate A and B for CW tris
-    typename SIMD_T::Integer vNegA0 = SIMD_T::mullo_epi32(vAi[0], SIMD_T::set1_epi32(-1));
-    typename SIMD_T::Integer vNegA1 = SIMD_T::mullo_epi32(vAi[1], SIMD_T::set1_epi32(-1));
-    typename SIMD_T::Integer vNegA2 = SIMD_T::mullo_epi32(vAi[2], SIMD_T::set1_epi32(-1));
-    typename SIMD_T::Integer vNegB0 = SIMD_T::mullo_epi32(vBi[0], SIMD_T::set1_epi32(-1));
-    typename SIMD_T::Integer vNegB1 = SIMD_T::mullo_epi32(vBi[1], SIMD_T::set1_epi32(-1));
-    typename SIMD_T::Integer vNegB2 = SIMD_T::mullo_epi32(vBi[2], SIMD_T::set1_epi32(-1));
+    Integer<SIMD_T> vNegA0 = SIMD_T::mullo_epi32(vAi[0], SIMD_T::set1_epi32(-1));
+    Integer<SIMD_T> vNegA1 = SIMD_T::mullo_epi32(vAi[1], SIMD_T::set1_epi32(-1));
+    Integer<SIMD_T> vNegA2 = SIMD_T::mullo_epi32(vAi[2], SIMD_T::set1_epi32(-1));
+    Integer<SIMD_T> vNegB0 = SIMD_T::mullo_epi32(vBi[0], SIMD_T::set1_epi32(-1));
+    Integer<SIMD_T> vNegB1 = SIMD_T::mullo_epi32(vBi[1], SIMD_T::set1_epi32(-1));
+    Integer<SIMD_T> vNegB2 = SIMD_T::mullo_epi32(vBi[2], SIMD_T::set1_epi32(-1));
  
      RDTSC_EVENT(FEEarlyRastEnter, _mm_popcnt_u32(oneTileMask & triMask), 0);
  
-    typename SIMD_T::Integer vShiftCntrl = EarlyRastHelper <SIMD_T>::InitShiftCntrl();
-    typename SIMD_T::Integer vCwTris = SIMD_T::set1_epi32(cwTrisMask);
-    typename SIMD_T::Integer vMask = SIMD_T::sllv_epi32(vCwTris, vShiftCntrl);
+    Integer<SIMD_T> vShiftCntrl = EarlyRastHelper <SIMD_T>::InitShiftCntrl();
+    Integer<SIMD_T> vCwTris = SIMD_T::set1_epi32(cwTrisMask);
+    Integer<SIMD_T> vMask = SIMD_T::sllv_epi32(vCwTris, vShiftCntrl);
  
      vAi[0] = SIMD_T::castps_si(SIMD_T::blendv_ps(SIMD_T::castsi_ps(vAi[0]), SIMD_T::castsi_ps(vNegA0), SIMD_T::castsi_ps(vMask)));
      vAi[1] = SIMD_T::castps_si(SIMD_T::blendv_ps(SIMD_T::castsi_ps(vAi[1]), SIMD_T::castsi_ps(vNegA1), SIMD_T::castsi_ps(vMask)));
@@ -364,34 +364,34 @@ uint32_t SIMDCALL EarlyRasterizer(
      vBi[2] = SIMD_T::castps_si(SIMD_T::blendv_ps(SIMD_T::castsi_ps(vBi[2]), SIMD_T::castsi_ps(vNegB2), SIMD_T::castsi_ps(vMask)));
  
      // evaluate edge equations at top-left pixel
-    typename SIMD_T::Integer vDeltaX0 = SIMD_T::sub_epi32(vTopLeftX, vXi[0]);
-    typename SIMD_T::Integer vDeltaX1 = SIMD_T::sub_epi32(vTopLeftX, vXi[1]);
-    typename SIMD_T::Integer vDeltaX2 = SIMD_T::sub_epi32(vTopLeftX, vXi[2]);
+    Integer<SIMD_T> vDeltaX0 = SIMD_T::sub_epi32(vTopLeftX, vXi[0]);
+    Integer<SIMD_T> vDeltaX1 = SIMD_T::sub_epi32(vTopLeftX, vXi[1]);
+    Integer<SIMD_T> vDeltaX2 = SIMD_T::sub_epi32(vTopLeftX, vXi[2]);
  
-    typename SIMD_T::Integer vDeltaY0 = SIMD_T::sub_epi32(vTopLeftY, vYi[0]);
-    typename SIMD_T::Integer vDeltaY1 = SIMD_T::sub_epi32(vTopLeftY, vYi[1]);
-    typename SIMD_T::Integer vDeltaY2 = SIMD_T::sub_epi32(vTopLeftY, vYi[2]);
+    Integer<SIMD_T> vDeltaY0 = SIMD_T::sub_epi32(vTopLeftY, vYi[0]);
+    Integer<SIMD_T> vDeltaY1 = SIMD_T::sub_epi32(vTopLeftY, vYi[1]);
+    Integer<SIMD_T> vDeltaY2 = SIMD_T::sub_epi32(vTopLeftY, vYi[2]);
  
-    typename SIMD_T::Integer vAX0 = SIMD_T::mullo_epi32(vAi[0], vDeltaX0);
-    typename SIMD_T::Integer vAX1 = SIMD_T::mullo_epi32(vAi[1], vDeltaX1);
-    typename SIMD_T::Integer vAX2 = SIMD_T::mullo_epi32(vAi[2], vDeltaX2);
+    Integer<SIMD_T> vAX0 = SIMD_T::mullo_epi32(vAi[0], vDeltaX0);
+    Integer<SIMD_T> vAX1 = SIMD_T::mullo_epi32(vAi[1], vDeltaX1);
+    Integer<SIMD_T> vAX2 = SIMD_T::mullo_epi32(vAi[2], vDeltaX2);
  
-    typename SIMD_T::Integer vBY0 = SIMD_T::mullo_epi32(vBi[0], vDeltaY0);
-    typename SIMD_T::Integer vBY1 = SIMD_T::mullo_epi32(vBi[1], vDeltaY1);
-    typename SIMD_T::Integer vBY2 = SIMD_T::mullo_epi32(vBi[2], vDeltaY2);
+    Integer<SIMD_T> vBY0 = SIMD_T::mullo_epi32(vBi[0], vDeltaY0);
+    Integer<SIMD_T> vBY1 = SIMD_T::mullo_epi32(vBi[1], vDeltaY1);
+    Integer<SIMD_T> vBY2 = SIMD_T::mullo_epi32(vBi[2], vDeltaY2);
  
-    typename SIMD_T::Integer vEdge0 = SIMD_T::add_epi32(vAX0, vBY0);
-    typename SIMD_T::Integer vEdge1 = SIMD_T::add_epi32(vAX1, vBY1);
-    typename SIMD_T::Integer vEdge2 = SIMD_T::add_epi32(vAX2, vBY2);
+    Integer<SIMD_T> vEdge0 = SIMD_T::add_epi32(vAX0, vBY0);
+    Integer<SIMD_T> vEdge1 = SIMD_T::add_epi32(vAX1, vBY1);
+    Integer<SIMD_T> vEdge2 = SIMD_T::add_epi32(vAX2, vBY2);
  
      vEdge0 = SIMD_T::template srai_epi32<FIXED_POINT_SHIFT>(vEdge0);
      vEdge1 = SIMD_T::template srai_epi32<FIXED_POINT_SHIFT>(vEdge1);
      vEdge2 = SIMD_T::template srai_epi32<FIXED_POINT_SHIFT>(vEdge2);
  
      // top left rule
-    typename SIMD_T::Integer vEdgeAdjust0 = SIMD_T::sub_epi32(vEdge0, SIMD_T::set1_epi32(1));
-    typename SIMD_T::Integer vEdgeAdjust1 = SIMD_T::sub_epi32(vEdge1, SIMD_T::set1_epi32(1));
-    typename SIMD_T::Integer vEdgeAdjust2 = SIMD_T::sub_epi32(vEdge2, SIMD_T::set1_epi32(1));
+    Integer<SIMD_T> vEdgeAdjust0 = SIMD_T::sub_epi32(vEdge0, SIMD_T::set1_epi32(1));
+    Integer<SIMD_T> vEdgeAdjust1 = SIMD_T::sub_epi32(vEdge1, SIMD_T::set1_epi32(1));
+    Integer<SIMD_T> vEdgeAdjust2 = SIMD_T::sub_epi32(vEdge2, SIMD_T::set1_epi32(1));
  
      // vA < 0
      vEdge0 = SIMD_T::castps_si(SIMD_T::blendv_ps(SIMD_T::castsi_ps(vEdge0), SIMD_T::castsi_ps(vEdgeAdjust0), SIMD_T::castsi_ps(vAi[0])));
@@ -399,9 +399,9 @@ uint32_t SIMDCALL EarlyRasterizer(
      vEdge2 = SIMD_T::castps_si(SIMD_T::blendv_ps(SIMD_T::castsi_ps(vEdge2), SIMD_T::castsi_ps(vEdgeAdjust2), SIMD_T::castsi_ps(vAi[2])));
  
      // vA == 0 && vB < 0
-    typename SIMD_T::Integer vCmp0 = SIMD_T::cmpeq_epi32(vAi[0], SIMD_T::setzero_si());
-    typename SIMD_T::Integer vCmp1 = SIMD_T::cmpeq_epi32(vAi[1], SIMD_T::setzero_si());
-    typename SIMD_T::Integer vCmp2 = SIMD_T::cmpeq_epi32(vAi[2], SIMD_T::setzero_si());
+    Integer<SIMD_T> vCmp0 = SIMD_T::cmpeq_epi32(vAi[0], SIMD_T::setzero_si());
+    Integer<SIMD_T> vCmp1 = SIMD_T::cmpeq_epi32(vAi[1], SIMD_T::setzero_si());
+    Integer<SIMD_T> vCmp2 = SIMD_T::cmpeq_epi32(vAi[2], SIMD_T::setzero_si());
  
      vCmp0 = SIMD_T::and_si(vCmp0, vBi[0]);
      vCmp1 = SIMD_T::and_si(vCmp1, vBi[1]);
@@ -415,28 +415,28 @@ uint32_t SIMDCALL EarlyRasterizer(
  #if ER_SIMD_TILE_X_DIM == 4 && ER_SIMD_TILE_Y_DIM == 4
      // Go down
      // coverage pixel 0
-    typename SIMD_T::Integer vMask0 = SIMD_T::and_si(vEdge0, vEdge1);
+    Integer<SIMD_T> vMask0 = SIMD_T::and_si(vEdge0, vEdge1);
      vMask0 = SIMD_T::and_si(vMask0, vEdge2);
  
      // coverage pixel 1
-    typename SIMD_T::Integer vEdge0N = SIMD_T::add_epi32(vEdge0, vBi[0]);
-    typename SIMD_T::Integer vEdge1N = SIMD_T::add_epi32(vEdge1, vBi[1]);
-    typename SIMD_T::Integer vEdge2N = SIMD_T::add_epi32(vEdge2, vBi[2]);
-    typename SIMD_T::Integer vMask1 = SIMD_T::and_si(vEdge0N, vEdge1N);
+    Integer<SIMD_T> vEdge0N = SIMD_T::add_epi32(vEdge0, vBi[0]);
+    Integer<SIMD_T> vEdge1N = SIMD_T::add_epi32(vEdge1, vBi[1]);
+    Integer<SIMD_T> vEdge2N = SIMD_T::add_epi32(vEdge2, vBi[2]);
+    Integer<SIMD_T> vMask1 = SIMD_T::and_si(vEdge0N, vEdge1N);
      vMask1 = SIMD_T::and_si(vMask1, vEdge2N);
  
      // coverage pixel 2
      vEdge0N = SIMD_T::add_epi32(vEdge0N, vBi[0]);
      vEdge1N = SIMD_T::add_epi32(vEdge1N, vBi[1]);
      vEdge2N = SIMD_T::add_epi32(vEdge2N, vBi[2]);
-    typename SIMD_T::Integer vMask2 = SIMD_T::and_si(vEdge0N, vEdge1N);
+    Integer<SIMD_T> vMask2 = SIMD_T::and_si(vEdge0N, vEdge1N);
      vMask2 = SIMD_T::and_si(vMask2, vEdge2N);
  
      // coverage pixel 3
      vEdge0N = SIMD_T::add_epi32(vEdge0N, vBi[0]);
      vEdge1N = SIMD_T::add_epi32(vEdge1N, vBi[1]);
      vEdge2N = SIMD_T::add_epi32(vEdge2N, vBi[2]);
-    typename SIMD_T::Integer vMask3 = SIMD_T::and_si(vEdge0N, vEdge1N);
+    Integer<SIMD_T> vMask3 = SIMD_T::and_si(vEdge0N, vEdge1N);
      vMask3 = SIMD_T::and_si(vMask3, vEdge2N);
  
      // One step to the right and then up
@@ -445,31 +445,31 @@ uint32_t SIMDCALL EarlyRasterizer(
      vEdge0N = SIMD_T::add_epi32(vEdge0N, vAi[0]);
      vEdge1N = SIMD_T::add_epi32(vEdge1N, vAi[1]);
      vEdge2N = SIMD_T::add_epi32(vEdge2N, vAi[2]);
-    typename SIMD_T::Integer vMask4 = SIMD_T::and_si(vEdge0N, vEdge1N);
+    Integer<SIMD_T> vMask4 = SIMD_T::and_si(vEdge0N, vEdge1N);
      vMask4 = SIMD_T::and_si(vMask4, vEdge2N);
  
      // coverage pixel 5
      vEdge0N = SIMD_T::sub_epi32(vEdge0N, vBi[0]);
      vEdge1N = SIMD_T::sub_epi32(vEdge1N, vBi[1]);
      vEdge2N = SIMD_T::sub_epi32(vEdge2N, vBi[2]);
-    typename SIMD_T::Integer vMask5 = SIMD_T::and_si(vEdge0N, vEdge1N);
+    Integer<SIMD_T> vMask5 = SIMD_T::and_si(vEdge0N, vEdge1N);
      vMask5 = SIMD_T::and_si(vMask5, vEdge2N);
  
      // coverage pixel 6
      vEdge0N = SIMD_T::sub_epi32(vEdge0N, vBi[0]);
      vEdge1N = SIMD_T::sub_epi32(vEdge1N, vBi[1]);
      vEdge2N = SIMD_T::sub_epi32(vEdge2N, vBi[2]);
-    typename SIMD_T::Integer vMask6 = SIMD_T::and_si(vEdge0N, vEdge1N);
+    Integer<SIMD_T> vMask6 = SIMD_T::and_si(vEdge0N, vEdge1N);
      vMask6 = SIMD_T::and_si(vMask6, vEdge2N);
  
      // coverage pixel 7
      vEdge0N = SIMD_T::sub_epi32(vEdge0N, vBi[0]);
      vEdge1N = SIMD_T::sub_epi32(vEdge1N, vBi[1]);
      vEdge2N = SIMD_T::sub_epi32(vEdge2N, vBi[2]);
-    typename SIMD_T::Integer vMask7 = SIMD_T::and_si(vEdge0N, vEdge1N);
+    Integer<SIMD_T> vMask7 = SIMD_T::and_si(vEdge0N, vEdge1N);
      vMask7 = SIMD_T::and_si(vMask7, vEdge2N);
  
-    typename SIMD_T::Integer vLit1 = SIMD_T::or_si(vMask0, vMask1);
+    Integer<SIMD_T> vLit1 = SIMD_T::or_si(vMask0, vMask1);
      vLit1 = SIMD_T::or_si(vLit1, vMask2);
      vLit1 = SIMD_T::or_si(vLit1, vMask3);
      vLit1 = SIMD_T::or_si(vLit1, vMask4);
@@ -537,7 +537,7 @@ uint32_t SIMDCALL EarlyRasterizer(
      vMask7 = SIMD_T::and_si(vEdge0N, vEdge1N);
      vMask7 = SIMD_T::and_si(vMask7, vEdge2N);
  
-    typename SIMD_T::Integer vLit2 = SIMD_T::or_si(vMask0, vMask1);
+    Integer<SIMD_T> vLit2 = SIMD_T::or_si(vMask0, vMask1);
      vLit2 = SIMD_T::or_si(vLit2, vMask2);
      vLit2 = SIMD_T::or_si(vLit2, vMask3);
      vLit2 = SIMD_T::or_si(vLit2, vMask4);
@@ -545,24 +545,24 @@ uint32_t SIMDCALL EarlyRasterizer(
      vLit2 = SIMD_T::or_si(vLit2, vMask6);
      vLit2 = SIMD_T::or_si(vLit2, vMask7);
  
-    typename SIMD_T::Integer vLit = SIMD_T::or_si(vLit1, vLit2);
+    Integer<SIMD_T> vLit = SIMD_T::or_si(vLit1, vLit2);
  
  #else
      // Generic algorithm sweeping in row by row order
-    typename SIMD_T::Integer vRowMask[ER_SIMD_TILE_Y_DIM];
+    Integer<SIMD_T> vRowMask[ER_SIMD_TILE_Y_DIM];
  
-    typename SIMD_T::Integer vEdge0N = vEdge0;
-    typename SIMD_T::Integer vEdge1N = vEdge1;
-    typename SIMD_T::Integer vEdge2N = vEdge2;
+    Integer<SIMD_T> vEdge0N = vEdge0;
+    Integer<SIMD_T> vEdge1N = vEdge1;
+    Integer<SIMD_T> vEdge2N = vEdge2;
  
      for (uint32_t row = 0; row < ER_SIMD_TILE_Y_DIM; row++)
      {
          // Store edge values at the beginning of the row
-        typename SIMD_T::Integer vRowEdge0 = vEdge0N;
-        typename SIMD_T::Integer vRowEdge1 = vEdge1N;
-        typename SIMD_T::Integer vRowEdge2 = vEdge2N;
+        Integer<SIMD_T> vRowEdge0 = vEdge0N;
+        Integer<SIMD_T> vRowEdge1 = vEdge1N;
+        Integer<SIMD_T> vRowEdge2 = vEdge2N;
  
-        typename SIMD_T::Integer vColMask[ER_SIMD_TILE_X_DIM];
+        Integer<SIMD_T> vColMask[ER_SIMD_TILE_X_DIM];
  
          for (uint32_t col = 0; col < ER_SIMD_TILE_X_DIM; col++)
          {
@@ -589,7 +589,7 @@ uint32_t SIMDCALL EarlyRasterizer(
      }
  
      // compress all masks
-    typename SIMD_T::Integer vLit = vRowMask[0];
+    Integer<SIMD_T> vLit = vRowMask[0];
      for (uint32_t row = 1; row < ER_SIMD_TILE_Y_DIM; row++)
      {
          vLit = SIMD_T::or_si(vLit, vRowMask[row]);
@@ -627,11 +627,11 @@ void SIMDCALL BinTrianglesImpl(
      DRAW_CONTEXT *pDC,
      PA_STATE &pa,
      uint32_t workerId,
-    typename SIMD_T::Vec4 tri[3],
+    Vec4<SIMD_T> tri[3],
      uint32_t triMask,
-    typename SIMD_T::Integer const &primID,
-    typename SIMD_T::Integer const &viewportIdx,
-    typename SIMD_T::Integer const &rtIdx)
+    Integer<SIMD_T> const &primID,
+    Integer<SIMD_T> const &viewportIdx,
+    Integer<SIMD_T> const &rtIdx)
  {
      const uint32_t *aRTAI = reinterpret_cast<const uint32_t *>(&rtIdx);
  
@@ -643,9 +643,9 @@ void SIMDCALL BinTrianglesImpl(
  
      MacroTileMgr *pTileMgr = pDC->pTileMgr;
  
-    typename SIMD_T::Float vRecipW0 = SIMD_T::set1_ps(1.0f);
-    typename SIMD_T::Float vRecipW1 = SIMD_T::set1_ps(1.0f);
-    typename SIMD_T::Float vRecipW2 = SIMD_T::set1_ps(1.0f);
+    Float<SIMD_T> vRecipW0 = SIMD_T::set1_ps(1.0f);
+    Float<SIMD_T> vRecipW1 = SIMD_T::set1_ps(1.0f);
+    Float<SIMD_T> vRecipW2 = SIMD_T::set1_ps(1.0f);
  
      if (feState.vpTransformDisable)
      {
@@ -685,7 +685,7 @@ void SIMDCALL BinTrianglesImpl(
      }
  
      // Adjust for pixel center location
-    typename SIMD_T::Float offset = SwrPixelOffsets<SIMD_T>::GetOffset(rastState.pixelLocation);
+    Float<SIMD_T> offset = SwrPixelOffsets<SIMD_T>::GetOffset(rastState.pixelLocation);
  
      tri[0].x = SIMD_T::add_ps(tri[0].x, offset);
      tri[0].y = SIMD_T::add_ps(tri[0].y, offset);
@@ -697,15 +697,15 @@ void SIMDCALL BinTrianglesImpl(
      tri[2].y = SIMD_T::add_ps(tri[2].y, offset);
  
      // Set vXi, vYi to required fixed point precision
-    typename SIMD_T::Integer vXi[3], vYi[3];
+    Integer<SIMD_T> vXi[3], vYi[3];
      FPToFixedPoint<SIMD_T>(tri, vXi, vYi);
  
      // triangle setup
-    typename SIMD_T::Integer vAi[3], vBi[3];
+    Integer<SIMD_T> vAi[3], vBi[3];
      triangleSetupABIntVertical(vXi, vYi, vAi, vBi);
  
      // determinant
-    typename SIMD_T::Integer vDet[2];
+    Integer<SIMD_T> vDet[2];
      calcDeterminantIntVertical(vAi, vBi, vDet);
  
      // cull zero area
@@ -774,14 +774,14 @@ void SIMDCALL BinTrianglesImpl(
          if (cullZeroAreaMask > 0)
          {
              // e0 = v1-v0
-            const typename SIMD_T::Integer x0x1Mask = SIMD_T::cmpeq_epi32(vXi[0], vXi[1]);
-            const typename SIMD_T::Integer y0y1Mask = SIMD_T::cmpeq_epi32(vYi[0], vYi[1]);
+            const Integer<SIMD_T> x0x1Mask = SIMD_T::cmpeq_epi32(vXi[0], vXi[1]);
+            const Integer<SIMD_T> y0y1Mask = SIMD_T::cmpeq_epi32(vYi[0], vYi[1]);
  
              uint32_t e0Mask = SIMD_T::movemask_ps(SIMD_T::castsi_ps(SIMD_T::and_si(x0x1Mask, y0y1Mask)));
  
              // e1 = v2-v1
-            const typename SIMD_T::Integer x1x2Mask = SIMD_T::cmpeq_epi32(vXi[1], vXi[2]);
-            const typename SIMD_T::Integer y1y2Mask = SIMD_T::cmpeq_epi32(vYi[1], vYi[2]);
+            const Integer<SIMD_T> x1x2Mask = SIMD_T::cmpeq_epi32(vXi[1], vXi[2]);
+            const Integer<SIMD_T> y1y2Mask = SIMD_T::cmpeq_epi32(vYi[1], vYi[2]);
  
              uint32_t e1Mask = SIMD_T::movemask_ps(SIMD_T::castsi_ps(SIMD_T::and_si(x1x2Mask, y1y2Mask)));
  
@@ -836,19 +836,19 @@ void SIMDCALL BinTrianglesImpl(
          int cullCenterMask;
  
          {
-            typename SIMD_T::Integer xmin = SIMD_T::add_epi32(bbox.xmin, SIMD_T::set1_epi32(127));
+            Integer<SIMD_T> xmin = SIMD_T::add_epi32(bbox.xmin, SIMD_T::set1_epi32(127));
              xmin = SIMD_T::and_si(xmin, SIMD_T::set1_epi32(~255));
-            typename SIMD_T::Integer xmax = SIMD_T::add_epi32(bbox.xmax, SIMD_T::set1_epi32(128));
+            Integer<SIMD_T> xmax = SIMD_T::add_epi32(bbox.xmax, SIMD_T::set1_epi32(128));
              xmax = SIMD_T::and_si(xmax, SIMD_T::set1_epi32(~255));
  
-            typename SIMD_T::Integer vMaskH = SIMD_T::cmpeq_epi32(xmin, xmax);
+            Integer<SIMD_T> vMaskH = SIMD_T::cmpeq_epi32(xmin, xmax);
  
-            typename SIMD_T::Integer ymin = SIMD_T::add_epi32(bbox.ymin, SIMD_T::set1_epi32(127));
+            Integer<SIMD_T> ymin = SIMD_T::add_epi32(bbox.ymin, SIMD_T::set1_epi32(127));
              ymin = SIMD_T::and_si(ymin, SIMD_T::set1_epi32(~255));
-            typename SIMD_T::Integer ymax = SIMD_T::add_epi32(bbox.ymax, SIMD_T::set1_epi32(128));
+            Integer<SIMD_T> ymax = SIMD_T::add_epi32(bbox.ymax, SIMD_T::set1_epi32(128));
              ymax = SIMD_T::and_si(ymax, SIMD_T::set1_epi32(~255));
  
-            typename SIMD_T::Integer vMaskV = SIMD_T::cmpeq_epi32(ymin, ymax);
+            Integer<SIMD_T> vMaskV = SIMD_T::cmpeq_epi32(ymin, ymax);
  
              vMaskV = SIMD_T::or_si(vMaskH, vMaskV);
              cullCenterMask = SIMD_T::movemask_ps(SIMD_T::castsi_ps(vMaskV));
@@ -866,7 +866,7 @@ void SIMDCALL BinTrianglesImpl(
      // Gather the AOS effective scissor rects based on the per-prim VP index.
      /// @todo:  Look at speeding this up -- weigh against corresponding costs in rasterizer.
      {
-        typename SIMD_T::Integer scisXmin, scisYmin, scisXmax, scisYmax;
+        Integer<SIMD_T> scisXmin, scisYmin, scisXmax, scisYmax;
          if (pa.viewportArrayActive)
  
          {
@@ -895,18 +895,18 @@ void SIMDCALL BinTrianglesImpl(
          // in the case where a degenerate triangle is on a scissor edge, we need to make sure the primitive bbox has
          // some area. Bump the xmax/ymax edges out 
  
-        typename SIMD_T::Integer topEqualsBottom = SIMD_T::cmpeq_epi32(bbox.ymin, bbox.ymax);
+        Integer<SIMD_T> topEqualsBottom = SIMD_T::cmpeq_epi32(bbox.ymin, bbox.ymax);
          bbox.ymax = SIMD_T::blendv_epi32(bbox.ymax, SIMD_T::add_epi32(bbox.ymax, SIMD_T::set1_epi32(1)), topEqualsBottom);
  
-        typename SIMD_T::Integer leftEqualsRight = SIMD_T::cmpeq_epi32(bbox.xmin, bbox.xmax);
+        Integer<SIMD_T> leftEqualsRight = SIMD_T::cmpeq_epi32(bbox.xmin, bbox.xmax);
          bbox.xmax = SIMD_T::blendv_epi32(bbox.xmax, SIMD_T::add_epi32(bbox.xmax, SIMD_T::set1_epi32(1)), leftEqualsRight);
      }
  
      // Cull tris completely outside scissor
      {
-        typename SIMD_T::Integer maskOutsideScissorX = SIMD_T::cmpgt_epi32(bbox.xmin, bbox.xmax);
-        typename SIMD_T::Integer maskOutsideScissorY = SIMD_T::cmpgt_epi32(bbox.ymin, bbox.ymax);
-        typename SIMD_T::Integer maskOutsideScissorXY = SIMD_T::or_si(maskOutsideScissorX, maskOutsideScissorY);
+        Integer<SIMD_T> maskOutsideScissorX = SIMD_T::cmpgt_epi32(bbox.xmin, bbox.xmax);
+        Integer<SIMD_T> maskOutsideScissorY = SIMD_T::cmpgt_epi32(bbox.ymin, bbox.ymax);
+        Integer<SIMD_T> maskOutsideScissorXY = SIMD_T::or_si(maskOutsideScissorX, maskOutsideScissorY);
          uint32_t maskOutsideScissor = SIMD_T::movemask_ps(SIMD_T::castsi_ps(maskOutsideScissorXY));
          triMask = triMask & ~maskOutsideScissor;
      }
@@ -924,8 +924,8 @@ void SIMDCALL BinTrianglesImpl(
          er_bbox.ymin = SIMD_T::template srai_epi32<ER_SIMD_TILE_Y_SHIFT + FIXED_POINT_SHIFT>(bbox.ymin);
          er_bbox.ymax = SIMD_T::template srai_epi32<ER_SIMD_TILE_Y_SHIFT + FIXED_POINT_SHIFT>(bbox.ymax);
  
-        typename SIMD_T::Integer vTileX = SIMD_T::cmpeq_epi32(er_bbox.xmin, er_bbox.xmax);
-        typename SIMD_T::Integer vTileY = SIMD_T::cmpeq_epi32(er_bbox.ymin, er_bbox.ymax);
+        Integer<SIMD_T> vTileX = SIMD_T::cmpeq_epi32(er_bbox.xmin, er_bbox.xmax);
+        Integer<SIMD_T> vTileY = SIMD_T::cmpeq_epi32(er_bbox.ymin, er_bbox.ymax);
  
          // Take only triangles that fit into ER tile
          uint32_t oneTileMask = triMask & SIMD_T::movemask_ps(SIMD_T::castsi_ps(SIMD_T::and_si(vTileX, vTileY)));
@@ -958,8 +958,8 @@ endBinTriangles:
      {
          // Simple non-conformant wireframe mode, useful for debugging
          // construct 3 SIMD lines out of the triangle and call the line binner for each SIMD
-        typename SIMD_T::Vec4 line[2];
-        typename SIMD_T::Float recipW[2];
+        Vec4<SIMD_T> line[2];
+        Float<SIMD_T> recipW[2];
  
          line[0] = tri[0];
          line[1] = tri[1];
@@ -1004,10 +1004,10 @@ endBinTriangles:
  
      OSALIGNSIMD16(uint32_t) aMTLeft[SIMD_WIDTH], aMTRight[SIMD_WIDTH], aMTTop[SIMD_WIDTH], aMTBottom[SIMD_WIDTH];
  
-    SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTLeft),   bbox.xmin);
-    SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTRight),  bbox.xmax);
-    SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTTop),    bbox.ymin);
-    SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTBottom), bbox.ymax);
+    SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T> *>(aMTLeft),   bbox.xmin);
+    SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T> *>(aMTRight),  bbox.xmax);
+    SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T> *>(aMTTop),    bbox.ymin);
+    SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T> *>(aMTBottom), bbox.ymax);
  
      // transpose verts needed for backend
      /// @todo modify BE to take non-transformed verts
@@ -1173,15 +1173,15 @@ void BinPostSetupPointsImpl(
      DRAW_CONTEXT *pDC,
      PA_STATE &pa,
      uint32_t workerId,
-    typename SIMD_T::Vec4 prim[],
+    Vec4<SIMD_T> prim[],
      uint32_t primMask,
-    typename SIMD_T::Integer const &primID,
-    typename SIMD_T::Integer const &viewportIdx,
-    typename SIMD_T::Integer const &rtIdx)
+    Integer<SIMD_T> const &primID,
+    Integer<SIMD_T> const &viewportIdx,
+    Integer<SIMD_T> const &rtIdx)
  {
      RDTSC_BEGIN(FEBinPoints, pDC->drawId);
  
-    typename SIMD_T::Vec4 &primVerts = prim[0];
+    Vec4<SIMD_T> &primVerts = prim[0];
  
      const API_STATE& state = GetApiState(pDC);
      const SWR_RASTSTATE& rastState = state.rastState;
@@ -1192,7 +1192,7 @@ void BinPostSetupPointsImpl(
          state.backendState.swizzleEnable, state.backendState.constantInterpolationMask);
  
      // convert to fixed point
-    typename SIMD_T::Integer vXi, vYi;
+    Integer<SIMD_T> vXi, vYi;
  
      vXi = fpToFixedPointVertical<SIMD_T>(primVerts.x);
      vYi = fpToFixedPointVertical<SIMD_T>(primVerts.y);
@@ -1208,36 +1208,36 @@ void BinPostSetupPointsImpl(
          primMask &= ~SIMD_T::movemask_ps(SIMD_T::castsi_ps(vYi));
  
          // compute macro tile coordinates 
-        typename SIMD_T::Integer macroX = SIMD_T::template srai_epi32<KNOB_MACROTILE_X_DIM_FIXED_SHIFT>(vXi);
-        typename SIMD_T::Integer macroY = SIMD_T::template srai_epi32<KNOB_MACROTILE_Y_DIM_FIXED_SHIFT>(vYi);
+        Integer<SIMD_T> macroX = SIMD_T::template srai_epi32<KNOB_MACROTILE_X_DIM_FIXED_SHIFT>(vXi);
+        Integer<SIMD_T> macroY = SIMD_T::template srai_epi32<KNOB_MACROTILE_Y_DIM_FIXED_SHIFT>(vYi);
  
          OSALIGNSIMD16(uint32_t) aMacroX[SIMD_WIDTH], aMacroY[SIMD_WIDTH];
  
-        SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMacroX), macroX);
-        SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMacroY), macroY);
+        SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T> *>(aMacroX), macroX);
+        SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T> *>(aMacroY), macroY);
  
          // compute raster tile coordinates
-        typename SIMD_T::Integer rasterX = SIMD_T::template srai_epi32<KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT>(vXi);
-        typename SIMD_T::Integer rasterY = SIMD_T::template srai_epi32<KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT>(vYi);
+        Integer<SIMD_T> rasterX = SIMD_T::template srai_epi32<KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT>(vXi);
+        Integer<SIMD_T> rasterY = SIMD_T::template srai_epi32<KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT>(vYi);
  
          // compute raster tile relative x,y for coverage mask
-        typename SIMD_T::Integer tileAlignedX = SIMD_T::template slli_epi32<KNOB_TILE_X_DIM_SHIFT>(rasterX);
-        typename SIMD_T::Integer tileAlignedY = SIMD_T::template slli_epi32<KNOB_TILE_Y_DIM_SHIFT>(rasterY);
+        Integer<SIMD_T> tileAlignedX = SIMD_T::template slli_epi32<KNOB_TILE_X_DIM_SHIFT>(rasterX);
+        Integer<SIMD_T> tileAlignedY = SIMD_T::template slli_epi32<KNOB_TILE_Y_DIM_SHIFT>(rasterY);
  
-        typename SIMD_T::Integer tileRelativeX = SIMD_T::sub_epi32(SIMD_T::template srai_epi32<FIXED_POINT_SHIFT>(vXi), tileAlignedX);
-        typename SIMD_T::Integer tileRelativeY = SIMD_T::sub_epi32(SIMD_T::template srai_epi32<FIXED_POINT_SHIFT>(vYi), tileAlignedY);
+        Integer<SIMD_T> tileRelativeX = SIMD_T::sub_epi32(SIMD_T::template srai_epi32<FIXED_POINT_SHIFT>(vXi), tileAlignedX);
+        Integer<SIMD_T> tileRelativeY = SIMD_T::sub_epi32(SIMD_T::template srai_epi32<FIXED_POINT_SHIFT>(vYi), tileAlignedY);
  
          OSALIGNSIMD16(uint32_t) aTileRelativeX[SIMD_WIDTH];
          OSALIGNSIMD16(uint32_t) aTileRelativeY[SIMD_WIDTH];
  
-        SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aTileRelativeX), tileRelativeX);
-        SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aTileRelativeY), tileRelativeY);
+        SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T> *>(aTileRelativeX), tileRelativeX);
+        SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T> *>(aTileRelativeY), tileRelativeY);
  
          OSALIGNSIMD16(uint32_t) aTileAlignedX[SIMD_WIDTH];
          OSALIGNSIMD16(uint32_t) aTileAlignedY[SIMD_WIDTH];
  
-        SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aTileAlignedX), tileAlignedX);
-        SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aTileAlignedY), tileAlignedY);
+        SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T> *>(aTileAlignedX), tileAlignedX);
+        SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T> *>(aTileAlignedY), tileAlignedY);
  
          OSALIGNSIMD16(float) aZ[SIMD_WIDTH];
          SIMD_T::store_ps(reinterpret_cast<float *>(aZ), primVerts.z);
@@ -1307,11 +1307,11 @@ void BinPostSetupPointsImpl(
      else
      {
          // non simple points need to be potentially binned to multiple macro tiles
-        typename SIMD_T::Float vPointSize;
+        Float<SIMD_T> vPointSize;
  
          if (rastState.pointParam)
          {
-            typename SIMD_T::Vec4 size[3];
+            Vec4<SIMD_T> size[3];
              pa.Assemble(VERTEX_SGV_SLOT, size);
              vPointSize = size[0][VERTEX_SGV_POINT_SIZE_COMP];
          }
@@ -1326,8 +1326,8 @@ void BinPostSetupPointsImpl(
          bbox.xmin = bbox.xmax = vXi;
          bbox.ymin = bbox.ymax = vYi;
  
-        typename SIMD_T::Float vHalfWidth = SIMD_T::mul_ps(vPointSize, SIMD_T::set1_ps(0.5f));
-        typename SIMD_T::Integer vHalfWidthi = fpToFixedPointVertical<SIMD_T>(vHalfWidth);
+        Float<SIMD_T> vHalfWidth = SIMD_T::mul_ps(vPointSize, SIMD_T::set1_ps(0.5f));
+        Integer<SIMD_T> vHalfWidthi = fpToFixedPointVertical<SIMD_T>(vHalfWidth);
  
          bbox.xmin = SIMD_T::sub_epi32(bbox.xmin, vHalfWidthi);
          bbox.xmax = SIMD_T::add_epi32(bbox.xmax, vHalfWidthi);
@@ -1338,7 +1338,7 @@ void BinPostSetupPointsImpl(
          // Gather the AOS effective scissor rects based on the per-prim VP index.
          /// @todo:  Look at speeding this up -- weigh against corresponding costs in rasterizer.
          {
-            typename SIMD_T::Integer scisXmin, scisYmin, scisXmax, scisYmax;
+            Integer<SIMD_T> scisXmin, scisYmin, scisXmax, scisYmax;
  
              if (pa.viewportArrayActive)
              {
@@ -1359,9 +1359,9 @@ void BinPostSetupPointsImpl(
          }
  
          // Cull bloated points completely outside scissor
-        typename SIMD_T::Integer maskOutsideScissorX = SIMD_T::cmpgt_epi32(bbox.xmin, bbox.xmax);
-        typename SIMD_T::Integer maskOutsideScissorY = SIMD_T::cmpgt_epi32(bbox.ymin, bbox.ymax);
-        typename SIMD_T::Integer maskOutsideScissorXY = SIMD_T::or_si(maskOutsideScissorX, maskOutsideScissorY);
+        Integer<SIMD_T> maskOutsideScissorX = SIMD_T::cmpgt_epi32(bbox.xmin, bbox.xmax);
+        Integer<SIMD_T> maskOutsideScissorY = SIMD_T::cmpgt_epi32(bbox.ymin, bbox.ymax);
+        Integer<SIMD_T> maskOutsideScissorXY = SIMD_T::or_si(maskOutsideScissorX, maskOutsideScissorY);
          uint32_t maskOutsideScissor = SIMD_T::movemask_ps(SIMD_T::castsi_ps(maskOutsideScissorXY));
          primMask = primMask & ~maskOutsideScissor;
  
@@ -1373,10 +1373,10 @@ void BinPostSetupPointsImpl(
  
          OSALIGNSIMD16(uint32_t) aMTLeft[SIMD_WIDTH], aMTRight[SIMD_WIDTH], aMTTop[SIMD_WIDTH], aMTBottom[SIMD_WIDTH];
  
-        SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTLeft),   bbox.xmin);
-        SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTRight),  bbox.xmax);
-        SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTTop),    bbox.ymin);
-        SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTBottom), bbox.ymax);
+        SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T> *>(aMTLeft),   bbox.xmin);
+        SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T> *>(aMTRight),  bbox.xmax);
+        SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T> *>(aMTTop),    bbox.ymin);
+        SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T> *>(aMTBottom), bbox.ymax);
  
          // store render target array index
          const uint32_t *aRTAI = reinterpret_cast<const uint32_t *>(&rtIdx);
@@ -1477,11 +1477,11 @@ void BinPointsImpl(
      DRAW_CONTEXT *pDC,
      PA_STATE &pa,
      uint32_t workerId,
-    typename SIMD_T::Vec4 prim[3],
+    Vec4<SIMD_T> prim[3],
      uint32_t primMask,
-    typename SIMD_T::Integer const &primID,
-    typename SIMD_T::Integer const &viewportIdx,
-    typename SIMD_T::Integer const &rtIdx)
+    Integer<SIMD_T> const &primID,
+    Integer<SIMD_T> const &viewportIdx,
+    Integer<SIMD_T> const &rtIdx)
  {
      const API_STATE& state = GetApiState(pDC);
      const SWR_FRONTEND_STATE& feState = state.frontendState;
@@ -1490,7 +1490,7 @@ void BinPointsImpl(
      if (!feState.vpTransformDisable)
      {
          // perspective divide
-        typename SIMD_T::Float vRecipW0 = SIMD_T::div_ps(SIMD_T::set1_ps(1.0f), prim[0].w);
+        Float<SIMD_T> vRecipW0 = SIMD_T::div_ps(SIMD_T::set1_ps(1.0f), prim[0].w);
  
          prim[0].x = SIMD_T::mul_ps(prim[0].x, vRecipW0);
          prim[0].y = SIMD_T::mul_ps(prim[0].y, vRecipW0);
@@ -1507,7 +1507,7 @@ void BinPointsImpl(
          }
      }
  
-    typename SIMD_T::Float offset = SwrPixelOffsets<SIMD_T>::GetOffset(rastState.pixelLocation);
+    Float<SIMD_T> offset = SwrPixelOffsets<SIMD_T>::GetOffset(rastState.pixelLocation);
  
      prim[0].x = SIMD_T::add_ps(prim[0].x, offset);
      prim[0].y = SIMD_T::add_ps(prim[0].y, offset);
@@ -1580,12 +1580,12 @@ void BinPostSetupLinesImpl(
      DRAW_CONTEXT *pDC,
      PA_STATE &pa,
      uint32_t workerId,
-    typename SIMD_T::Vec4 prim[],
-    typename SIMD_T::Float recipW[],
+    Vec4<SIMD_T> prim[],
+    Float<SIMD_T> recipW[],
      uint32_t primMask,
-    typename SIMD_T::Integer const &primID,
-    typename SIMD_T::Integer const &viewportIdx,
-    typename SIMD_T::Integer const &rtIdx)
+    Integer<SIMD_T> const &primID,
+    Integer<SIMD_T> const &viewportIdx,
+    Integer<SIMD_T> const &rtIdx)
  {
      const uint32_t *aRTAI = reinterpret_cast<const uint32_t *>(&rtIdx);
  
@@ -1598,11 +1598,11 @@ void BinPostSetupLinesImpl(
      PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(2,
          state.backendState.swizzleEnable, state.backendState.constantInterpolationMask);
  
-    typename SIMD_T::Float &vRecipW0 = recipW[0];
-    typename SIMD_T::Float &vRecipW1 = recipW[1];
+    Float<SIMD_T> &vRecipW0 = recipW[0];
+    Float<SIMD_T> &vRecipW1 = recipW[1];
  
      // convert to fixed point
-    typename SIMD_T::Integer vXi[2], vYi[2];
+    Integer<SIMD_T> vXi[2], vYi[2];
  
      vXi[0] = fpToFixedPointVertical<SIMD_T>(prim[0].x);
      vYi[0] = fpToFixedPointVertical<SIMD_T>(prim[0].y);
@@ -1610,13 +1610,13 @@ void BinPostSetupLinesImpl(
      vYi[1] = fpToFixedPointVertical<SIMD_T>(prim[1].y);
  
      // compute x-major vs y-major mask
-    typename SIMD_T::Integer xLength = SIMD_T::abs_epi32(SIMD_T::sub_epi32(vXi[0], vXi[1]));
-    typename SIMD_T::Integer yLength = SIMD_T::abs_epi32(SIMD_T::sub_epi32(vYi[0], vYi[1]));
-    typename SIMD_T::Float vYmajorMask = SIMD_T::castsi_ps(SIMD_T::cmpgt_epi32(yLength, xLength));
+    Integer<SIMD_T> xLength = SIMD_T::abs_epi32(SIMD_T::sub_epi32(vXi[0], vXi[1]));
+    Integer<SIMD_T> yLength = SIMD_T::abs_epi32(SIMD_T::sub_epi32(vYi[0], vYi[1]));
+    Float<SIMD_T> vYmajorMask = SIMD_T::castsi_ps(SIMD_T::cmpgt_epi32(yLength, xLength));
      uint32_t yMajorMask = SIMD_T::movemask_ps(vYmajorMask);
  
      // cull zero-length lines
-    typename SIMD_T::Integer vZeroLengthMask = SIMD_T::cmpeq_epi32(xLength, SIMD_T::setzero_si());
+    Integer<SIMD_T> vZeroLengthMask = SIMD_T::cmpeq_epi32(xLength, SIMD_T::setzero_si());
      vZeroLengthMask = SIMD_T::and_si(vZeroLengthMask, SIMD_T::cmpeq_epi32(yLength, SIMD_T::setzero_si()));
  
      primMask &= ~SIMD_T::movemask_ps(SIMD_T::castsi_ps(vZeroLengthMask));
@@ -1632,8 +1632,8 @@ void BinPostSetupLinesImpl(
      bbox.ymax = SIMD_T::max_epi32(vYi[0], vYi[1]);
  
      // bloat bbox by line width along minor axis
-    typename SIMD_T::Float vHalfWidth = SIMD_T::set1_ps(rastState.lineWidth / 2.0f);
-    typename SIMD_T::Integer vHalfWidthi = fpToFixedPointVertical<SIMD_T>(vHalfWidth);
+    Float<SIMD_T> vHalfWidth = SIMD_T::set1_ps(rastState.lineWidth / 2.0f);
+    Integer<SIMD_T> vHalfWidthi = fpToFixedPointVertical<SIMD_T>(vHalfWidth);
  
      SIMDBBOX_T<SIMD_T> bloatBox;
  
@@ -1649,7 +1649,7 @@ void BinPostSetupLinesImpl(
  
      // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
      {
-        typename SIMD_T::Integer scisXmin, scisYmin, scisXmax, scisYmax;
+        Integer<SIMD_T> scisXmin, scisYmin, scisXmax, scisYmax;
  
          if (pa.viewportArrayActive)
          {
@@ -1671,9 +1671,9 @@ void BinPostSetupLinesImpl(
  
      // Cull prims completely outside scissor
      {
-        typename SIMD_T::Integer maskOutsideScissorX = SIMD_T::cmpgt_epi32(bbox.xmin, bbox.xmax);
-        typename SIMD_T::Integer maskOutsideScissorY = SIMD_T::cmpgt_epi32(bbox.ymin, bbox.ymax);
-        typename SIMD_T::Integer maskOutsideScissorXY = SIMD_T::or_si(maskOutsideScissorX, maskOutsideScissorY);
+        Integer<SIMD_T> maskOutsideScissorX = SIMD_T::cmpgt_epi32(bbox.xmin, bbox.xmax);
+        Integer<SIMD_T> maskOutsideScissorY = SIMD_T::cmpgt_epi32(bbox.ymin, bbox.ymax);
+        Integer<SIMD_T> maskOutsideScissorXY = SIMD_T::or_si(maskOutsideScissorX, maskOutsideScissorY);
          uint32_t maskOutsideScissor = SIMD_T::movemask_ps(SIMD_T::castsi_ps(maskOutsideScissorXY));
          primMask = primMask & ~maskOutsideScissor;
      }
@@ -1698,10 +1698,10 @@ void BinPostSetupLinesImpl(
  
      OSALIGNSIMD16(uint32_t) aMTLeft[SIMD_WIDTH], aMTRight[SIMD_WIDTH], aMTTop[SIMD_WIDTH], aMTBottom[SIMD_WIDTH];
  
-    SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTLeft),   bbox.xmin);
-    SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTRight),  bbox.xmax);
-    SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTTop),    bbox.ymin);
-    SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTBottom), bbox.ymax);
+    SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T> *>(aMTLeft),   bbox.xmin);
+    SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T> *>(aMTRight),  bbox.xmax);
+    SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T> *>(aMTTop),    bbox.ymin);
+    SIMD_T::store_si(reinterpret_cast<Integer<SIMD_T> *>(aMTBottom), bbox.ymax);
  
      TransposeVertices(vHorizX, prim[0].x, prim[1].x, SIMD_T::setzero_ps());
      TransposeVertices(vHorizY, prim[0].y, prim[1].y, SIMD_T::setzero_ps());
@@ -1786,17 +1786,17 @@ void SIMDCALL BinLinesImpl(
      DRAW_CONTEXT *pDC,
      PA_STATE &pa,
      uint32_t workerId,
-    typename SIMD_T::Vec4 prim[3],
+    Vec4<SIMD_T> prim[3],
      uint32_t primMask,
-    typename SIMD_T::Integer const &primID,
-    typename SIMD_T::Integer const &viewportIdx,
-    typename SIMD_T::Integer const & rtIdx)
+    Integer<SIMD_T> const &primID,
+    Integer<SIMD_T> const &viewportIdx,
+    Integer<SIMD_T> const & rtIdx)
  {
      const API_STATE& state = GetApiState(pDC);
      const SWR_RASTSTATE& rastState = state.rastState;
      const SWR_FRONTEND_STATE& feState = state.frontendState;
  
-    typename SIMD_T::Float vRecipW[2] = { SIMD_T::set1_ps(1.0f), SIMD_T::set1_ps(1.0f) };
+    Float<SIMD_T> vRecipW[2] = { SIMD_T::set1_ps(1.0f), SIMD_T::set1_ps(1.0f) };
  
      if (!feState.vpTransformDisable)
      {
@@ -1825,7 +1825,7 @@ void SIMDCALL BinLinesImpl(
      }
  
      // adjust for pixel center location
-    typename SIMD_T::Float offset = SwrPixelOffsets<SIMD_T>::GetOffset(rastState.pixelLocation);
+    Float<SIMD_T> offset = SwrPixelOffsets<SIMD_T>::GetOffset(rastState.pixelLocation);
  
      prim[0].x = SIMD_T::add_ps(prim[0].x, offset);
      prim[0].y = SIMD_T::add_ps(prim[0].y, offset);
diff --git a/src/gallium/drivers/swr/rasterizer/core/binner.h b/src/gallium/drivers/swr/rasterizer/core/binner.h

index 8d252350723d3d011b24f2d27cd3ccff6c7ff19d..443dac57fef6d73ad856a43cc357861e16076fba 100644 (file)
--- a/src/gallium/drivers/swr/rasterizer/core/binner.h
+++ b/src/gallium/drivers/swr/rasterizer/core/binner.h
@@ -38,7 +38,7 @@ template <typename SIMD_T>
  struct SwrPixelOffsets
  {
  public:
-    INLINE static typename SIMD_T::Float GetOffset(uint32_t loc)
+    INLINE static Float<SIMD_T> GetOffset(uint32_t loc)
      {
          SWR_ASSERT(loc <= 1);
  
@@ -50,7 +50,7 @@ public:
  /// @brief Convert the X,Y coords of a triangle to the requested Fixed 
  /// Point precision from FP32.
  template <typename SIMD_T, typename PT = FixedPointTraits<Fixed_16_8>>
-INLINE typename SIMD_T::Integer fpToFixedPointVertical(const typename SIMD_T::Float &vIn)
+INLINE Integer<SIMD_T> fpToFixedPointVertical(const Float<SIMD_T> &vIn)
  {
      return SIMD_T::cvtps_epi32(SIMD_T::mul_ps(vIn, SIMD_T::set1_ps(PT::ScaleT::value)));
  }
@@ -62,7 +62,7 @@ INLINE typename SIMD_T::Integer fpToFixedPointVertical(const typename SIMD_T::Fl
  /// @param vXi: fixed point X coords of tri verts
  /// @param vYi: fixed point Y coords of tri verts
  template <typename SIMD_T>
-INLINE static void FPToFixedPoint(const typename SIMD_T::Vec4 *const tri, typename SIMD_T::Integer(&vXi)[3], typename SIMD_T::Integer(&vYi)[3])
+INLINE static void FPToFixedPoint(const Vec4<SIMD_T> *const tri, Integer<SIMD_T>(&vXi)[3], Integer<SIMD_T>(&vYi)[3])
  {
      vXi[0] = fpToFixedPointVertical<SIMD_T>(tri[0].x);
      vYi[0] = fpToFixedPointVertical<SIMD_T>(tri[0].y);
@@ -81,24 +81,24 @@ INLINE static void FPToFixedPoint(const typename SIMD_T::Vec4 *const tri, typena
  /// *Note*: expects vX, vY to be in the correct precision for the type 
  /// of rasterization. This avoids unnecessary FP->fixed conversions.
  template <typename SIMD_T, typename CT>
-INLINE void calcBoundingBoxIntVertical(const typename SIMD_T::Integer(&vX)[3], const typename SIMD_T::Integer(&vY)[3], SIMDBBOX_T<SIMD_T> &bbox)
+INLINE void calcBoundingBoxIntVertical(const Integer<SIMD_T>(&vX)[3], const Integer<SIMD_T>(&vY)[3], SIMDBBOX_T<SIMD_T> &bbox)
  {
-    typename SIMD_T::Integer vMinX = vX[0];
+    Integer<SIMD_T> vMinX = vX[0];
  
      vMinX = SIMD_T::min_epi32(vMinX, vX[1]);
      vMinX = SIMD_T::min_epi32(vMinX, vX[2]);
  
-    typename SIMD_T::Integer vMaxX = vX[0];
+    Integer<SIMD_T> vMaxX = vX[0];
  
      vMaxX = SIMD_T::max_epi32(vMaxX, vX[1]);
      vMaxX = SIMD_T::max_epi32(vMaxX, vX[2]);
  
-    typename SIMD_T::Integer vMinY = vY[0];
+    Integer<SIMD_T> vMinY = vY[0];
  
      vMinY = SIMD_T::min_epi32(vMinY, vY[1]);
      vMinY = SIMD_T::min_epi32(vMinY, vY[2]);
  
-    typename SIMD_T::Integer vMaxY = vY[0];
+    Integer<SIMD_T> vMaxY = vY[0];
  
      vMaxY = SIMD_T::max_epi32(vMaxY, vY[1]);
      vMaxY = SIMD_T::max_epi32(vMaxY, vY[2]);
@@ -108,7 +108,7 @@ INLINE void calcBoundingBoxIntVertical(const typename SIMD_T::Integer(&vX)[3], c
          /// Bounding box needs to be expanded by 1/512 before snapping to 16.8 for conservative rasterization
          /// expand bbox by 1/256; coverage will be correctly handled in the rasterizer.
  
-        const typename SIMD_T::Integer value = SIMD_T::set1_epi32(CT::BoundingBoxOffsetT::value);
+        const Integer<SIMD_T> value = SIMD_T::set1_epi32(CT::BoundingBoxOffsetT::value);
  
          vMinX = SIMD_T::sub_epi32(vMinX, value);
          vMaxX = SIMD_T::add_epi32(vMaxX, value);
diff --git a/src/gallium/drivers/swr/rasterizer/core/clip.h b/src/gallium/drivers/swr/rasterizer/core/clip.h

index ddee3b1a94024251ece4220c49ad9c352419dcbc..8d2590a49816e66913310a9e42c43c6afddf2691 100644 (file)
--- a/src/gallium/drivers/swr/rasterizer/core/clip.h
+++ b/src/gallium/drivers/swr/rasterizer/core/clip.h
@@ -62,15 +62,15 @@ enum SWR_CLIPCODES
  #define GUARDBAND_CLIP_MASK (FRUSTUM_NEAR|FRUSTUM_FAR|GUARDBAND_LEFT|GUARDBAND_TOP|GUARDBAND_RIGHT|GUARDBAND_BOTTOM|NEGW)
  
  template<typename SIMD_T>
-void ComputeClipCodes(const API_STATE &state, const typename SIMD_T::Vec4 &vertex, typename SIMD_T::Float &clipCodes, typename SIMD_T::Integer const &viewportIndexes)
+void ComputeClipCodes(const API_STATE &state, const Vec4<SIMD_T> &vertex, Float<SIMD_T> &clipCodes, Integer<SIMD_T> const &viewportIndexes)
  {
      clipCodes = SIMD_T::setzero_ps();
  
      // -w
-    typename SIMD_T::Float vNegW = SIMD_T::mul_ps(vertex.w,SIMD_T::set1_ps(-1.0f));
+    Float<SIMD_T> vNegW = SIMD_T::mul_ps(vertex.w,SIMD_T::set1_ps(-1.0f));
  
      // FRUSTUM_LEFT
-    typename SIMD_T::Float vRes = SIMD_T::cmplt_ps(vertex.x, vNegW);
+    Float<SIMD_T> vRes = SIMD_T::cmplt_ps(vertex.x, vNegW);
      clipCodes = SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(FRUSTUM_LEFT)));
  
      // FRUSTUM_TOP
@@ -109,22 +109,22 @@ void ComputeClipCodes(const API_STATE &state, const typename SIMD_T::Vec4 &verte
      clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(NEGW))));
  
      // GUARDBAND_LEFT
-    typename SIMD_T::Float gbMult = SIMD_T::mul_ps(vNegW, SIMD_T::template i32gather_ps<typename SIMD_T::ScaleFactor(4)>(&state.gbState.left[0], viewportIndexes));
+    Float<SIMD_T> gbMult = SIMD_T::mul_ps(vNegW, SIMD_T::template i32gather_ps<ScaleFactor<SIMD_T>(4)>(&state.gbState.left[0], viewportIndexes));
      vRes = SIMD_T::cmplt_ps(vertex.x, gbMult);
      clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_LEFT))));
  
      // GUARDBAND_TOP
-    gbMult = SIMD_T::mul_ps(vNegW, SIMD_T::template i32gather_ps<typename SIMD_T::ScaleFactor(4)>(&state.gbState.top[0], viewportIndexes));
+    gbMult = SIMD_T::mul_ps(vNegW, SIMD_T::template i32gather_ps<ScaleFactor<SIMD_T>(4)>(&state.gbState.top[0], viewportIndexes));
      vRes = SIMD_T::cmplt_ps(vertex.y, gbMult);
      clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_TOP))));
  
      // GUARDBAND_RIGHT
-    gbMult = SIMD_T::mul_ps(vertex.w, SIMD_T::template i32gather_ps<typename SIMD_T::ScaleFactor(4)>(&state.gbState.right[0], viewportIndexes));
+    gbMult = SIMD_T::mul_ps(vertex.w, SIMD_T::template i32gather_ps<ScaleFactor<SIMD_T>(4)>(&state.gbState.right[0], viewportIndexes));
      vRes = SIMD_T::cmpgt_ps(vertex.x, gbMult);
      clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_RIGHT))));
  
      // GUARDBAND_BOTTOM
-    gbMult = SIMD_T::mul_ps(vertex.w, SIMD_T::template i32gather_ps<typename SIMD_T::ScaleFactor(4)>(&state.gbState.bottom[0], viewportIndexes));
+    gbMult = SIMD_T::mul_ps(vertex.w, SIMD_T::template i32gather_ps<ScaleFactor<SIMD_T>(4)>(&state.gbState.bottom[0], viewportIndexes));
      vRes = SIMD_T::cmpgt_ps(vertex.y, gbMult);
      clipCodes = SIMD_T::or_ps(clipCodes, SIMD_T::and_ps(vRes, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_BOTTOM))));
  }
@@ -311,7 +311,7 @@ public:
          static_assert(NumVertsPerPrim >= 1 && NumVertsPerPrim <= 3, "Invalid NumVertsPerPrim");
      }
  
-    void ComputeClipCodes(typename SIMD_T::Vec4 vertex[], const typename SIMD_T::Integer &viewportIndexes)
+    void ComputeClipCodes(Vec4<SIMD_T> vertex[], const Integer<SIMD_T> &viewportIndexes)
      {
          for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
          {
@@ -319,9 +319,9 @@ public:
          }
      }
  
-    typename SIMD_T::Float ComputeClipCodeIntersection()
+    Float<SIMD_T> ComputeClipCodeIntersection()
      {
-        typename SIMD_T::Float result = clipCodes[0];
+        Float<SIMD_T> result = clipCodes[0];
  
          for (uint32_t i = 1; i < NumVertsPerPrim; ++i)
          {
@@ -331,9 +331,9 @@ public:
          return result;
      }
  
-    typename SIMD_T::Float ComputeClipCodeUnion()
+    Float<SIMD_T> ComputeClipCodeUnion()
      {
-        typename SIMD_T::Float result = clipCodes[0];
+        Float<SIMD_T> result = clipCodes[0];
  
          for (uint32_t i = 1; i < NumVertsPerPrim; ++i)
          {
@@ -345,7 +345,7 @@ public:
  
      int ComputeClipMask()
      {
-        typename SIMD_T::Float clipUnion = ComputeClipCodeUnion();
+        Float<SIMD_T> clipUnion = ComputeClipCodeUnion();
  
          clipUnion = SIMD_T::and_ps(clipUnion, SIMD_T::castsi_ps(SIMD_T::set1_epi32(GUARDBAND_CLIP_MASK)));
  
@@ -353,31 +353,31 @@ public:
      }
  
      // clipper is responsible for culling any prims with NAN coordinates
-    int ComputeNaNMask(typename SIMD_T::Vec4 prim[])
+    int ComputeNaNMask(Vec4<SIMD_T> prim[])
      {
-        typename SIMD_T::Float vNanMask = SIMD_T::setzero_ps();
+        Float<SIMD_T> vNanMask = SIMD_T::setzero_ps();
  
          for (uint32_t e = 0; e < NumVertsPerPrim; ++e)
          {
-            typename SIMD_T::Float vNan01 = SIMD_T::template cmp_ps<SIMD_T::CompareType::UNORD_Q>(prim[e].v[0], prim[e].v[1]);
+            Float<SIMD_T> vNan01 = SIMD_T::template cmp_ps<SIMD_T::CompareType::UNORD_Q>(prim[e].v[0], prim[e].v[1]);
              vNanMask = SIMD_T::or_ps(vNanMask, vNan01);
  
-            typename SIMD_T::Float vNan23 = SIMD_T::template cmp_ps<SIMD_T::CompareType::UNORD_Q>(prim[e].v[2], prim[e].v[3]);
+            Float<SIMD_T> vNan23 = SIMD_T::template cmp_ps<SIMD_T::CompareType::UNORD_Q>(prim[e].v[2], prim[e].v[3]);
              vNanMask = SIMD_T::or_ps(vNanMask, vNan23);
          }
  
          return SIMD_T::movemask_ps(vNanMask);
      }
  
-    int ComputeUserClipCullMask(PA_STATE &pa, typename SIMD_T::Vec4 prim[])
+    int ComputeUserClipCullMask(PA_STATE &pa, Vec4<SIMD_T> prim[])
      {
          uint8_t cullMask = state.backendState.cullDistanceMask;
          uint32_t vertexClipCullOffset = state.backendState.vertexClipCullOffset;
  
-        typename SIMD_T::Float vClipCullMask = SIMD_T::setzero_ps();
+        Float<SIMD_T> vClipCullMask = SIMD_T::setzero_ps();
  
-        typename SIMD_T::Vec4 vClipCullDistLo[3];
-        typename SIMD_T::Vec4 vClipCullDistHi[3];
+        Vec4<SIMD_T> vClipCullDistLo[3];
+        Vec4<SIMD_T> vClipCullDistHi[3];
  
          pa.Assemble(vertexClipCullOffset, vClipCullDistLo);
          pa.Assemble(vertexClipCullOffset + 1, vClipCullDistHi);
@@ -389,10 +389,10 @@ public:
              uint32_t slot = index >> 2;
              uint32_t component = index & 0x3;
  
-            typename SIMD_T::Float vCullMaskElem = SIMD_T::set1_ps(-1.0f);
+            Float<SIMD_T> vCullMaskElem = SIMD_T::set1_ps(-1.0f);
              for (uint32_t e = 0; e < NumVertsPerPrim; ++e)
              {
-                typename SIMD_T::Float vCullComp;
+                Float<SIMD_T> vCullComp;
                  if (slot == 0)
                  {
                      vCullComp = vClipCullDistLo[e][component];
@@ -403,7 +403,7 @@ public:
                  }
  
                  // cull if cull distance < 0 || NAN
-                typename SIMD_T::Float vCull = SIMD_T::template cmp_ps<SIMD_T::CompareType::NLE_UQ>(SIMD_T::setzero_ps(), vCullComp);
+                Float<SIMD_T> vCull = SIMD_T::template cmp_ps<SIMD_T::CompareType::NLE_UQ>(SIMD_T::setzero_ps(), vCullComp);
                  vCullMaskElem = SIMD_T::and_ps(vCullMaskElem, vCull);
              }
              vClipCullMask = SIMD_T::or_ps(vClipCullMask, vCullMaskElem);
@@ -417,10 +417,10 @@ public:
              uint32_t slot = index >> 2;
              uint32_t component = index & 0x3;
  
-            typename SIMD_T::Float vCullMaskElem = SIMD_T::set1_ps(-1.0f);
+            Float<SIMD_T> vCullMaskElem = SIMD_T::set1_ps(-1.0f);
              for (uint32_t e = 0; e < NumVertsPerPrim; ++e)
              {
-                typename SIMD_T::Float vClipComp;
+                Float<SIMD_T> vClipComp;
                  if (slot == 0)
                  {
                      vClipComp = vClipCullDistLo[e][component];
@@ -430,8 +430,8 @@ public:
                      vClipComp = vClipCullDistHi[e][component];
                  }
  
-                typename SIMD_T::Float vClip = SIMD_T::template cmp_ps<SIMD_T::CompareType::UNORD_Q>(vClipComp, vClipComp);
-                typename SIMD_T::Float vCull = SIMD_T::template cmp_ps<SIMD_T::CompareType::NLE_UQ>(SIMD_T::setzero_ps(), vClipComp);
+                Float<SIMD_T> vClip = SIMD_T::template cmp_ps<SIMD_T::CompareType::UNORD_Q>(vClipComp, vClipComp);
+                Float<SIMD_T> vCull = SIMD_T::template cmp_ps<SIMD_T::CompareType::NLE_UQ>(SIMD_T::setzero_ps(), vClipComp);
                  vCullMaskElem = SIMD_T::and_ps(vCullMaskElem, vCull);
                  vClipCullMask = SIMD_T::or_ps(vClipCullMask, vClip);
              }
@@ -441,8 +441,8 @@ public:
          return SIMD_T::movemask_ps(vClipCullMask);
      }
  
-    void ClipSimd(const typename SIMD_T::Vec4 prim[], const typename SIMD_T::Float &vPrimMask, const typename SIMD_T::Float &vClipMask, PA_STATE &pa,
-                  const typename SIMD_T::Integer &vPrimId, const typename SIMD_T::Integer &vViewportIdx, const typename SIMD_T::Integer &vRtIdx)
+    void ClipSimd(const Vec4<SIMD_T> prim[], const Float<SIMD_T> &vPrimMask, const Float<SIMD_T> &vClipMask, PA_STATE &pa,
+                  const Integer<SIMD_T> &vPrimId, const Integer<SIMD_T> &vViewportIdx, const Integer<SIMD_T> &vRtIdx)
      {
          // input/output vertex store for clipper
          SIMDVERTEX_T<SIMD_T> vertices[7]; // maximum 7 verts generated per triangle
@@ -456,7 +456,7 @@ public:
          ///@todo: line topology for wireframe?
  
          // assemble pos
-        typename SIMD_T::Vec4 tmpVector[NumVertsPerPrim];
+        Vec4<SIMD_T> tmpVector[NumVertsPerPrim];
          for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
          {
              vertices[i].attrib[VERTEX_POSITION_SLOT] = prim[i];
@@ -515,7 +515,7 @@ public:
  
          uint32_t numAttribs = maxSlot + 1;
  
-        typename SIMD_T::Integer vNumClippedVerts = ClipPrims((float*)&vertices[0], vPrimMask, vClipMask, numAttribs);
+        Integer<SIMD_T> vNumClippedVerts = ClipPrims((float*)&vertices[0], vPrimMask, vClipMask, numAttribs);
  
          BinnerChooser<SIMD_T> binner(NumVertsPerPrim, pa.pDC->pState->state.rastState.conservativeRast);
  
@@ -602,9 +602,9 @@ public:
  #endif
              for (uint32_t c = 0; c < 4; ++c)
              {
-                SIMD256::Float temp = SIMD256::template mask_i32gather_ps<typename SIMD_T::ScaleFactor(1)>(SIMD256::setzero_ps(), reinterpret_cast<const float *>(pBase), vOffsets, vMask);
+                SIMD256::Float temp = SIMD256::template mask_i32gather_ps<ScaleFactor<SIMD_T>(1)>(SIMD256::setzero_ps(), reinterpret_cast<const float *>(pBase), vOffsets, vMask);
                  transposedPrims[0].attrib[VERTEX_POSITION_SLOT][c] = SimdHelper<SIMD_T>::insert_lo_ps(temp);
-                pBase += sizeof(typename SIMD_T::Float);
+                pBase += sizeof(Float<SIMD_T>);
              }
  
              // transpose attribs
@@ -616,9 +616,9 @@ public:
  
                  for (uint32_t c = 0; c < 4; ++c)
                  {
-                    SIMD256::Float temp = SIMD256::template mask_i32gather_ps<typename SIMD_T::ScaleFactor(1)>(SIMD256::setzero_ps(), reinterpret_cast<const float *>(pBase), vOffsets, vMask);
+                    SIMD256::Float temp = SIMD256::template mask_i32gather_ps<ScaleFactor<SIMD_T>(1)>(SIMD256::setzero_ps(), reinterpret_cast<const float *>(pBase), vOffsets, vMask);
                      transposedPrims[0].attrib[attribSlot][c] = SimdHelper<SIMD_T>::insert_lo_ps(temp);
-                    pBase += sizeof(typename SIMD_T::Float);
+                    pBase += sizeof(Float<SIMD_T>);
                  }
              }
  
@@ -630,9 +630,9 @@ public:
  
                  for (uint32_t c = 0; c < 4; ++c)
                  {
-                    SIMD256::Float temp = SIMD256::template mask_i32gather_ps<typename SIMD_T::ScaleFactor(1)>(SIMD256::setzero_ps(), reinterpret_cast<const float *>(pBase), vOffsets, vMask);
+                    SIMD256::Float temp = SIMD256::template mask_i32gather_ps<ScaleFactor<SIMD_T>(1)>(SIMD256::setzero_ps(), reinterpret_cast<const float *>(pBase), vOffsets, vMask);
                      transposedPrims[0].attrib[vertexClipCullSlot][c] = SimdHelper<SIMD_T>::insert_lo_ps(temp);
-                    pBase += sizeof(typename SIMD_T::Float);
+                    pBase += sizeof(Float<SIMD_T>);
                  }
              }
  
@@ -642,9 +642,9 @@ public:
  
                  for (uint32_t c = 0; c < 4; ++c)
                  {
-                    SIMD256::Float temp = SIMD256::template mask_i32gather_ps<typename SIMD_T::ScaleFactor(1)>(SIMD256::setzero_ps(), reinterpret_cast<const float *>(pBase), vOffsets, vMask);
+                    SIMD256::Float temp = SIMD256::template mask_i32gather_ps<ScaleFactor<SIMD_T>(1)>(SIMD256::setzero_ps(), reinterpret_cast<const float *>(pBase), vOffsets, vMask);
                      transposedPrims[0].attrib[vertexClipCullSlot + 1][c] = SimdHelper<SIMD_T>::insert_lo_ps(temp);
-                    pBase += sizeof(typename SIMD_T::Float);
+                    pBase += sizeof(Float<SIMD_T>);
                  }
              }
  
@@ -656,16 +656,16 @@ public:
  
              const uint32_t primMask = primMaskMap[numEmittedPrims];
  
-            const typename SIMD_T::Integer primID = SIMD_T::set1_epi32(pPrimitiveId[inputPrim]);
-            const typename SIMD_T::Integer viewportIdx = SIMD_T::set1_epi32(pViewportIdx[inputPrim]);
-            const typename SIMD_T::Integer rtIdx = SIMD_T::set1_epi32(pRtIdx[inputPrim]);
+            const Integer<SIMD_T> primID = SIMD_T::set1_epi32(pPrimitiveId[inputPrim]);
+            const Integer<SIMD_T> viewportIdx = SIMD_T::set1_epi32(pViewportIdx[inputPrim]);
+            const Integer<SIMD_T> rtIdx = SIMD_T::set1_epi32(pRtIdx[inputPrim]);
  
  
              while (clipPA.GetNextStreamOutput())
              {
                  do
                  {
-                    typename SIMD_T::Vec4 attrib[NumVertsPerPrim];
+                    Vec4<SIMD_T> attrib[NumVertsPerPrim];
  
                      bool assemble = clipPA.Assemble(VERTEX_POSITION_SLOT, attrib);
  
@@ -686,8 +686,8 @@ public:
          UPDATE_STAT_FE(CPrimitives, numClippedPrims);
      }
  
-    void ExecuteStage(PA_STATE &pa, typename SIMD_T::Vec4 prim[], uint32_t primMask,
-                      typename SIMD_T::Integer const &primId, typename SIMD_T::Integer const &viewportIdx, typename SIMD_T::Integer const &rtIdx)
+    void ExecuteStage(PA_STATE &pa, Vec4<SIMD_T> prim[], uint32_t primMask,
+                      Integer<SIMD_T> const &primId, Integer<SIMD_T> const &viewportIdx, Integer<SIMD_T> const &rtIdx)
      {
          SWR_ASSERT(pa.pDC != nullptr);
  
@@ -709,7 +709,7 @@ public:
          }
  
          // cull prims outside view frustum
-        typename SIMD_T::Float clipIntersection = ComputeClipCodeIntersection();
+        Float<SIMD_T> clipIntersection = ComputeClipCodeIntersection();
          int validMask = primMask & SimdHelper<SIMD_T>::cmpeq_ps_mask(clipIntersection, SIMD_T::setzero_ps());
  
          // skip clipping for points
@@ -740,16 +740,16 @@ public:
      }
  
  private:
-    typename SIMD_T::Float ComputeInterpFactor(typename SIMD_T::Float const &boundaryCoord0, typename SIMD_T::Float const &boundaryCoord1)
+    Float<SIMD_T> ComputeInterpFactor(Float<SIMD_T> const &boundaryCoord0, Float<SIMD_T> const &boundaryCoord1)
      {
          return SIMD_T::div_ps(boundaryCoord0, SIMD_T::sub_ps(boundaryCoord0, boundaryCoord1));
      }
  
-    typename SIMD_T::Integer ComputeOffsets(uint32_t attrib, typename SIMD_T::Integer const &vIndices, uint32_t component)
+    Integer<SIMD_T> ComputeOffsets(uint32_t attrib, Integer<SIMD_T> const &vIndices, uint32_t component)
      {
          const uint32_t simdVertexStride = sizeof(SIMDVERTEX_T<SIMD_T>);
-        const uint32_t componentStride  = sizeof(typename SIMD_T::Float);
-        const uint32_t attribStride     = sizeof(typename SIMD_T::Vec4);
+        const uint32_t componentStride  = sizeof(Float<SIMD_T>);
+        const uint32_t attribStride     = sizeof(Vec4<SIMD_T>);
  
          static const OSALIGNSIMD16(uint32_t) elemOffset[16] =
          {
@@ -771,12 +771,12 @@ private:
              15 * sizeof(float),
          };
  
-        static_assert(sizeof(typename SIMD_T::Integer) <= sizeof(elemOffset), "Clipper::ComputeOffsets, Increase number of element offsets.");
+        static_assert(sizeof(Integer<SIMD_T>) <= sizeof(elemOffset), "Clipper::ComputeOffsets, Increase number of element offsets.");
  
-        typename SIMD_T::Integer vElemOffset = SIMD_T::loadu_si(reinterpret_cast<const typename SIMD_T::Integer *>(elemOffset));
+        Integer<SIMD_T> vElemOffset = SIMD_T::loadu_si(reinterpret_cast<const Integer<SIMD_T> *>(elemOffset));
  
          // step to the simdvertex
-        typename SIMD_T::Integer vOffsets = SIMD_T::mullo_epi32(vIndices, SIMD_T::set1_epi32(simdVertexStride));
+        Integer<SIMD_T> vOffsets = SIMD_T::mullo_epi32(vIndices, SIMD_T::set1_epi32(simdVertexStride));
  
          // step to the attribute and component
          vOffsets = SIMD_T::add_epi32(vOffsets, SIMD_T::set1_epi32(attribStride * attrib + componentStride * component));
@@ -787,17 +787,17 @@ private:
          return vOffsets;
      }
  
-    typename SIMD_T::Float GatherComponent(const float* pBuffer, uint32_t attrib, typename SIMD_T::Float const &vMask, typename SIMD_T::Integer const &vIndices, uint32_t component)
+    Float<SIMD_T> GatherComponent(const float* pBuffer, uint32_t attrib, Float<SIMD_T> const &vMask, Integer<SIMD_T> const &vIndices, uint32_t component)
      {
-        typename SIMD_T::Integer vOffsets = ComputeOffsets(attrib, vIndices, component);
-        typename SIMD_T::Float vSrc = SIMD_T::setzero_ps();
+        Integer<SIMD_T> vOffsets = ComputeOffsets(attrib, vIndices, component);
+        Float<SIMD_T> vSrc = SIMD_T::setzero_ps();
  
-        return SIMD_T::template mask_i32gather_ps<typename SIMD_T::ScaleFactor(1)>(vSrc, pBuffer, vOffsets, vMask);
+        return SIMD_T::template mask_i32gather_ps<ScaleFactor<SIMD_T>(1)>(vSrc, pBuffer, vOffsets, vMask);
      }
  
-    void ScatterComponent(const float* pBuffer, uint32_t attrib, typename SIMD_T::Float const &vMask, typename SIMD_T::Integer const &vIndices, uint32_t component, typename SIMD_T::Float const &vSrc)
+    void ScatterComponent(const float* pBuffer, uint32_t attrib, Float<SIMD_T> const &vMask, Integer<SIMD_T> const &vIndices, uint32_t component, Float<SIMD_T> const &vSrc)
      {
-        typename SIMD_T::Integer vOffsets = ComputeOffsets(attrib, vIndices, component);
+        Integer<SIMD_T> vOffsets = ComputeOffsets(attrib, vIndices, component);
  
          const uint32_t *pOffsets = reinterpret_cast<const uint32_t *>(&vOffsets);
          const float *pSrc = reinterpret_cast<const float *>(&vSrc);
@@ -813,12 +813,12 @@ private:
  
      template<SWR_CLIPCODES ClippingPlane>
      void intersect(
-        const typename SIMD_T::Float &vActiveMask,  // active lanes to operate on
-        const typename SIMD_T::Integer &s,          // index to first edge vertex v0 in pInPts.
-        const typename SIMD_T::Integer &p,          // index to second edge vertex v1 in pInPts.
-        const typename SIMD_T::Vec4 &v1,            // vertex 0 position
-        const typename SIMD_T::Vec4 &v2,            // vertex 1 position
-        typename SIMD_T::Integer &outIndex,         // output index.
+        const Float<SIMD_T> &vActiveMask,  // active lanes to operate on
+        const Integer<SIMD_T> &s,          // index to first edge vertex v0 in pInPts.
+        const Integer<SIMD_T> &p,          // index to second edge vertex v1 in pInPts.
+        const Vec4<SIMD_T> &v1,            // vertex 0 position
+        const Vec4<SIMD_T> &v2,            // vertex 1 position
+        Integer<SIMD_T> &outIndex,         // output index.
          const float *pInVerts,                      // array of all the input positions.
          uint32_t numInAttribs,                      // number of attributes per vertex.
          float *pOutVerts)                           // array of output positions. We'll write our new intersection point at i*4.
@@ -827,7 +827,7 @@ private:
          uint32_t vertexClipCullOffset = this->state.backendState.vertexClipCullOffset;
  
          // compute interpolation factor
-        typename SIMD_T::Float t;
+        Float<SIMD_T> t;
          switch (ClippingPlane)
          {
          case FRUSTUM_LEFT:      t = ComputeInterpFactor(SIMD_T::add_ps(v1[3], v1[0]), SIMD_T::add_ps(v2[3], v2[0])); break;
@@ -852,7 +852,7 @@ private:
          // interpolate position and store
          for (uint32_t c = 0; c < 4; ++c)
          {
-            typename SIMD_T::Float vOutPos = SIMD_T::fmadd_ps(SIMD_T::sub_ps(v2[c], v1[c]), t, v1[c]);
+            Float<SIMD_T> vOutPos = SIMD_T::fmadd_ps(SIMD_T::sub_ps(v2[c], v1[c]), t, v1[c]);
              ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, vActiveMask, outIndex, c, vOutPos);
          }
  
@@ -862,9 +862,9 @@ private:
              uint32_t attribSlot = vertexAttribOffset + a;
              for (uint32_t c = 0; c < 4; ++c)
              {
-                typename SIMD_T::Float vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c);
-                typename SIMD_T::Float vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c);
-                typename SIMD_T::Float vOutAttrib = SIMD_T::fmadd_ps(SIMD_T::sub_ps(vAttrib1, vAttrib0), t, vAttrib0);
+                Float<SIMD_T> vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c);
+                Float<SIMD_T> vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c);
+                Float<SIMD_T> vOutAttrib = SIMD_T::fmadd_ps(SIMD_T::sub_ps(vAttrib1, vAttrib0), t, vAttrib0);
                  ScatterComponent(pOutVerts, attribSlot, vActiveMask, outIndex, c, vOutAttrib);
              }
          }
@@ -875,9 +875,9 @@ private:
              uint32_t attribSlot = vertexClipCullOffset;
              for (uint32_t c = 0; c < 4; ++c)
              {
-                typename SIMD_T::Float vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c);
-                typename SIMD_T::Float vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c);
-                typename SIMD_T::Float vOutAttrib = SIMD_T::fmadd_ps(SIMD_T::sub_ps(vAttrib1, vAttrib0), t, vAttrib0);
+                Float<SIMD_T> vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c);
+                Float<SIMD_T> vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c);
+                Float<SIMD_T> vOutAttrib = SIMD_T::fmadd_ps(SIMD_T::sub_ps(vAttrib1, vAttrib0), t, vAttrib0);
                  ScatterComponent(pOutVerts, attribSlot, vActiveMask, outIndex, c, vOutAttrib);
              }
          }
@@ -887,16 +887,16 @@ private:
              uint32_t attribSlot = vertexClipCullOffset + 1;
              for (uint32_t c = 0; c < 4; ++c)
              {
-                typename SIMD_T::Float vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c);
-                typename SIMD_T::Float vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c);
-                typename SIMD_T::Float vOutAttrib = SIMD_T::fmadd_ps(SIMD_T::sub_ps(vAttrib1, vAttrib0), t, vAttrib0);
+                Float<SIMD_T> vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c);
+                Float<SIMD_T> vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c);
+                Float<SIMD_T> vOutAttrib = SIMD_T::fmadd_ps(SIMD_T::sub_ps(vAttrib1, vAttrib0), t, vAttrib0);
                  ScatterComponent(pOutVerts, attribSlot, vActiveMask, outIndex, c, vOutAttrib);
              }
          }
      }
  
      template<SWR_CLIPCODES ClippingPlane>
-    typename SIMD_T::Float inside(const typename SIMD_T::Vec4 &v)
+    Float<SIMD_T> inside(const Vec4<SIMD_T> &v)
      {
          switch (ClippingPlane)
          {
@@ -913,23 +913,23 @@ private:
      }
  
      template<SWR_CLIPCODES ClippingPlane>
-    typename SIMD_T::Integer ClipTriToPlane(const float *pInVerts, const typename SIMD_T::Integer &vNumInPts, uint32_t numInAttribs, float *pOutVerts)
+    Integer<SIMD_T> ClipTriToPlane(const float *pInVerts, const Integer<SIMD_T> &vNumInPts, uint32_t numInAttribs, float *pOutVerts)
      {
          uint32_t vertexAttribOffset = this->state.backendState.vertexAttribOffset;
  
-        typename SIMD_T::Integer vCurIndex = SIMD_T::setzero_si();
-        typename SIMD_T::Integer vOutIndex = SIMD_T::setzero_si();
-        typename SIMD_T::Float vActiveMask = SIMD_T::castsi_ps(SIMD_T::cmplt_epi32(vCurIndex, vNumInPts));
+        Integer<SIMD_T> vCurIndex = SIMD_T::setzero_si();
+        Integer<SIMD_T> vOutIndex = SIMD_T::setzero_si();
+        Float<SIMD_T> vActiveMask = SIMD_T::castsi_ps(SIMD_T::cmplt_epi32(vCurIndex, vNumInPts));
  
          while (!SIMD_T::testz_ps(vActiveMask, vActiveMask)) // loop until activeMask is empty
          {
-            typename SIMD_T::Integer s = vCurIndex;
-            typename SIMD_T::Integer p = SIMD_T::add_epi32(s, SIMD_T::set1_epi32(1));
-            typename SIMD_T::Integer underFlowMask = SIMD_T::cmpgt_epi32(vNumInPts, p);
+            Integer<SIMD_T> s = vCurIndex;
+            Integer<SIMD_T> p = SIMD_T::add_epi32(s, SIMD_T::set1_epi32(1));
+            Integer<SIMD_T> underFlowMask = SIMD_T::cmpgt_epi32(vNumInPts, p);
              p = SIMD_T::castps_si(SIMD_T::blendv_ps(SIMD_T::setzero_ps(), SIMD_T::castsi_ps(p), SIMD_T::castsi_ps(underFlowMask)));
  
              // gather position
-            typename SIMD_T::Vec4 vInPos0, vInPos1;
+            Vec4<SIMD_T> vInPos0, vInPos1;
              for (uint32_t c = 0; c < 4; ++c)
              {
                  vInPos0[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, s, c);
@@ -937,11 +937,11 @@ private:
              }
  
              // compute inside mask
-            typename SIMD_T::Float s_in = inside<ClippingPlane>(vInPos0);
-            typename SIMD_T::Float p_in = inside<ClippingPlane>(vInPos1);
+            Float<SIMD_T> s_in = inside<ClippingPlane>(vInPos0);
+            Float<SIMD_T> p_in = inside<ClippingPlane>(vInPos1);
  
              // compute intersection mask (s_in != p_in)
-            typename SIMD_T::Float intersectMask = SIMD_T::xor_ps(s_in, p_in);
+            Float<SIMD_T> intersectMask = SIMD_T::xor_ps(s_in, p_in);
              intersectMask = SIMD_T::and_ps(intersectMask, vActiveMask);
  
              // store s if inside
@@ -960,7 +960,7 @@ private:
                      uint32_t attribSlot = vertexAttribOffset + a;
                      for (uint32_t c = 0; c < 4; ++c)
                      {
-                        typename SIMD_T::Float vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
+                        Float<SIMD_T> vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
                          ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib);
                      }
                  }
@@ -972,7 +972,7 @@ private:
                      uint32_t attribSlot = vertexClipCullSlot;
                      for (uint32_t c = 0; c < 4; ++c)
                      {
-                        typename SIMD_T::Float vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
+                        Float<SIMD_T> vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
                          ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib);
                      }
                  }
@@ -982,7 +982,7 @@ private:
                      uint32_t attribSlot = vertexClipCullSlot + 1;
                      for (uint32_t c = 0; c < 4; ++c)
                      {
-                        typename SIMD_T::Float vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
+                        Float<SIMD_T> vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
                          ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib);
                      }
                  }
@@ -1009,21 +1009,21 @@ private:
      }
  
      template<SWR_CLIPCODES ClippingPlane>
-    typename SIMD_T::Integer ClipLineToPlane(const float *pInVerts, const typename SIMD_T::Integer &vNumInPts, uint32_t numInAttribs, float *pOutVerts)
+    Integer<SIMD_T> ClipLineToPlane(const float *pInVerts, const Integer<SIMD_T> &vNumInPts, uint32_t numInAttribs, float *pOutVerts)
      {
          uint32_t vertexAttribOffset = this->state.backendState.vertexAttribOffset;
  
-        typename SIMD_T::Integer vCurIndex = SIMD_T::setzero_si();
-        typename SIMD_T::Integer vOutIndex = SIMD_T::setzero_si();
-        typename SIMD_T::Float vActiveMask = SIMD_T::castsi_ps(SIMD_T::cmplt_epi32(vCurIndex, vNumInPts));
+        Integer<SIMD_T> vCurIndex = SIMD_T::setzero_si();
+        Integer<SIMD_T> vOutIndex = SIMD_T::setzero_si();
+        Float<SIMD_T> vActiveMask = SIMD_T::castsi_ps(SIMD_T::cmplt_epi32(vCurIndex, vNumInPts));
  
          if (!SIMD_T::testz_ps(vActiveMask, vActiveMask))
          {
-            typename SIMD_T::Integer s = vCurIndex;
-            typename SIMD_T::Integer p = SIMD_T::add_epi32(s, SIMD_T::set1_epi32(1));
+            Integer<SIMD_T> s = vCurIndex;
+            Integer<SIMD_T> p = SIMD_T::add_epi32(s, SIMD_T::set1_epi32(1));
  
              // gather position
-            typename SIMD_T::Vec4 vInPos0, vInPos1;
+            Vec4<SIMD_T> vInPos0, vInPos1;
              for (uint32_t c = 0; c < 4; ++c)
              {
                  vInPos0[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, s, c);
@@ -1031,11 +1031,11 @@ private:
              }
  
              // compute inside mask
-            typename SIMD_T::Float s_in = inside<ClippingPlane>(vInPos0);
-            typename SIMD_T::Float p_in = inside<ClippingPlane>(vInPos1);
+            Float<SIMD_T> s_in = inside<ClippingPlane>(vInPos0);
+            Float<SIMD_T> p_in = inside<ClippingPlane>(vInPos1);
  
              // compute intersection mask (s_in != p_in)
-            typename SIMD_T::Float intersectMask = SIMD_T::xor_ps(s_in, p_in);
+            Float<SIMD_T> intersectMask = SIMD_T::xor_ps(s_in, p_in);
              intersectMask = SIMD_T::and_ps(intersectMask, vActiveMask);
  
              // store s if inside
@@ -1053,7 +1053,7 @@ private:
                      uint32_t attribSlot = vertexAttribOffset + a;
                      for (uint32_t c = 0; c < 4; ++c)
                      {
-                        typename SIMD_T::Float vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
+                        Float<SIMD_T> vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
                          ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib);
                      }
                  }
@@ -1086,7 +1086,7 @@ private:
                      uint32_t attribSlot = vertexAttribOffset + a;
                      for (uint32_t c = 0; c < 4; ++c)
                      {
-                        typename SIMD_T::Float vAttrib = GatherComponent(pInVerts, attribSlot, p_in, p, c);
+                        Float<SIMD_T> vAttrib = GatherComponent(pInVerts, attribSlot, p_in, p, c);
                          ScatterComponent(pOutVerts, attribSlot, p_in, vOutIndex, c, vAttrib);
                      }
                  }
@@ -1099,17 +1099,17 @@ private:
          return vOutIndex;
      }
  
-    typename SIMD_T::Integer ClipPrims(float *pVertices, const typename SIMD_T::Float &vPrimMask, const typename SIMD_T::Float &vClipMask, int numAttribs)
+    Integer<SIMD_T> ClipPrims(float *pVertices, const Float<SIMD_T> &vPrimMask, const Float<SIMD_T> &vClipMask, int numAttribs)
      {
          // temp storage
          float *pTempVerts = reinterpret_cast<float *>(ClipHelper<SIMD_T>::GetTempVertices());
  
          // zero out num input verts for non-active lanes
-        typename SIMD_T::Integer vNumInPts = SIMD_T::set1_epi32(NumVertsPerPrim);
+        Integer<SIMD_T> vNumInPts = SIMD_T::set1_epi32(NumVertsPerPrim);
          vNumInPts = SIMD_T::blendv_epi32(SIMD_T::setzero_si(), vNumInPts, vClipMask);
  
          // clip prims to frustum
-        typename SIMD_T::Integer vNumOutPts;
+        Integer<SIMD_T> vNumOutPts;
          if (NumVertsPerPrim == 3)
          {
              vNumOutPts = ClipTriToPlane<FRUSTUM_NEAR>(pVertices, vNumInPts, numAttribs, pTempVerts);
@@ -1131,7 +1131,7 @@ private:
          }
  
          // restore num verts for non-clipped, active lanes
-        typename SIMD_T::Float vNonClippedMask = SIMD_T::andnot_ps(vClipMask, vPrimMask);
+        Float<SIMD_T> vNonClippedMask = SIMD_T::andnot_ps(vClipMask, vPrimMask);
          vNumOutPts = SIMD_T::blendv_epi32(vNumOutPts, SIMD_T::set1_epi32(NumVertsPerPrim), vNonClippedMask);
  
          return vNumOutPts;
@@ -1140,7 +1140,7 @@ private:
      const uint32_t workerId{ 0 };
      DRAW_CONTEXT *pDC{ nullptr };
      const API_STATE &state;
-    typename SIMD_T::Float clipCodes[NumVertsPerPrim];
+    Float<SIMD_T> clipCodes[NumVertsPerPrim];
  };
  
  
diff --git a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp

index 13c9f3670f7d1a59b16cfb98925f060552bb7553..1c4b522e45ea67c7ce755c63568c791bf2c55e9e 100644 (file)
--- a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
@@ -733,7 +733,7 @@ template<typename SIMD_T, uint32_t SimdWidth>
  void TransposeSOAtoAOS(uint8_t* pDst, uint8_t* pSrc, uint32_t numVerts, uint32_t numAttribs)
  {
      uint32_t srcVertexStride = numAttribs * sizeof(float) * 4;
-    uint32_t dstVertexStride = numAttribs * sizeof(typename SIMD_T::Float) * 4;
+    uint32_t dstVertexStride = numAttribs * sizeof(Float<SIMD_T>) * 4;
  
      OSALIGNSIMD16(uint32_t) gatherOffsets[SimdWidth];
  
@@ -741,7 +741,7 @@ void TransposeSOAtoAOS(uint8_t* pDst, uint8_t* pSrc, uint32_t numVerts, uint32_t
      {
          gatherOffsets[i] = srcVertexStride * i;
      }
-    auto vGatherOffsets = SIMD_T::load_si((typename SIMD_T::Integer*)&gatherOffsets[0]);
+    auto vGatherOffsets = SIMD_T::load_si((Integer<SIMD_T>*)&gatherOffsets[0]);
  
      uint32_t numSimd = AlignUp(numVerts, SimdWidth) / SimdWidth;
      uint32_t remainingVerts = numVerts;
@@ -759,18 +759,18 @@ void TransposeSOAtoAOS(uint8_t* pDst, uint8_t* pSrc, uint32_t numVerts, uint32_t
  
          for (uint32_t a = 0; a < numAttribs; ++a)
          {
-            auto attribGatherX = SIMD_T::template mask_i32gather_ps<typename SIMD_T::ScaleFactor(1)>(SIMD_T::setzero_ps(), (const float*)pSrcBase, vGatherOffsets, vMask);
-            auto attribGatherY = SIMD_T::template mask_i32gather_ps<typename SIMD_T::ScaleFactor(1)>(SIMD_T::setzero_ps(), (const float*)(pSrcBase + sizeof(float)), vGatherOffsets, vMask);
-            auto attribGatherZ = SIMD_T::template mask_i32gather_ps<typename SIMD_T::ScaleFactor(1)>(SIMD_T::setzero_ps(), (const float*)(pSrcBase + sizeof(float) * 2), vGatherOffsets, vMask);
-            auto attribGatherW = SIMD_T::template mask_i32gather_ps<typename SIMD_T::ScaleFactor(1)>(SIMD_T::setzero_ps(), (const float*)(pSrcBase + sizeof(float) * 3), vGatherOffsets, vMask);
+            auto attribGatherX = SIMD_T::template mask_i32gather_ps<ScaleFactor<SIMD_T>(1)>(SIMD_T::setzero_ps(), (const float*)pSrcBase, vGatherOffsets, vMask);
+            auto attribGatherY = SIMD_T::template mask_i32gather_ps<ScaleFactor<SIMD_T>(1)>(SIMD_T::setzero_ps(), (const float*)(pSrcBase + sizeof(float)), vGatherOffsets, vMask);
+            auto attribGatherZ = SIMD_T::template mask_i32gather_ps<ScaleFactor<SIMD_T>(1)>(SIMD_T::setzero_ps(), (const float*)(pSrcBase + sizeof(float) * 2), vGatherOffsets, vMask);
+            auto attribGatherW = SIMD_T::template mask_i32gather_ps<ScaleFactor<SIMD_T>(1)>(SIMD_T::setzero_ps(), (const float*)(pSrcBase + sizeof(float) * 3), vGatherOffsets, vMask);
  
              SIMD_T::maskstore_ps((float*)pDstBase, viMask, attribGatherX);
-            SIMD_T::maskstore_ps((float*)(pDstBase + sizeof(typename SIMD_T::Float)), viMask, attribGatherY);
-            SIMD_T::maskstore_ps((float*)(pDstBase + sizeof(typename SIMD_T::Float) * 2), viMask, attribGatherZ);
-            SIMD_T::maskstore_ps((float*)(pDstBase + sizeof(typename SIMD_T::Float) * 3), viMask, attribGatherW);
+            SIMD_T::maskstore_ps((float*)(pDstBase + sizeof(Float<SIMD_T>)), viMask, attribGatherY);
+            SIMD_T::maskstore_ps((float*)(pDstBase + sizeof(Float<SIMD_T>) * 2), viMask, attribGatherZ);
+            SIMD_T::maskstore_ps((float*)(pDstBase + sizeof(Float<SIMD_T>) * 3), viMask, attribGatherW);
  
              pSrcBase += sizeof(float) * 4;
-            pDstBase += sizeof(typename SIMD_T::Float) * 4;
+            pDstBase += sizeof(Float<SIMD_T>) * 4;
          }
          remainingVerts -= SimdWidth;
      }
@@ -1101,7 +1101,7 @@ static INLINE void AllocateGsBuffers(DRAW_CONTEXT* pDC, const API_STATE& state,
  
      // Allocate storage for transposed GS output
      uint32_t numSimdBatches = AlignUp(gsState.maxNumVerts, SIMD_WIDTH) / SIMD_WIDTH;
-    uint32_t transposedBufferSize = numSimdBatches * gsState.outputVertexSize * sizeof(typename SIMD_T::Vec4);
+    uint32_t transposedBufferSize = numSimdBatches * gsState.outputVertexSize * sizeof(Vec4<SIMD_T>);
      pGsBuffers->pGsTransposed = (uint8_t*)pArena->AllocAligned(transposedBufferSize, 32);
  
      // Allocate storage to hold temporary stream->cut buffer, if necessary
author	George Kyriazis <george.kyriazis@intel.com>
	Wed, 7 Feb 2018 22:51:41 +0000 (16:51 -0600)
committer	George Kyriazis <george.kyriazis@intel.com>
	Fri, 16 Feb 2018 16:54:01 +0000 (10:54 -0600)
src/gallium/drivers/swr/rasterizer/common/simdlib.hpp		patch | blob | history
src/gallium/drivers/swr/rasterizer/core/binner.cpp		patch | blob | history
src/gallium/drivers/swr/rasterizer/core/binner.h		patch | blob | history
src/gallium/drivers/swr/rasterizer/core/clip.h		patch | blob | history
src/gallium/drivers/swr/rasterizer/core/frontend.cpp		patch | blob | history