1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
25 * @brief Implementation for the macrotile binner
27 ******************************************************************************/
32 #include "conservativeRast.h"
34 #include "rasterizer.h"
35 #include "rdtsc_core.h"
// Forward declaration: bins post-setup LINE primitives to macro tiles.
// NOTE(review): several leading parameters (between '(' and recipW) are
// missing from this extraction of the declaration -- verify the full
// signature against the definition before relying on it.
template <typename SIMD_T, uint32_t SIMD_WIDTH>
void BinPostSetupLinesImpl(
    Float<SIMD_T> recipW[],              // presumably per-vertex reciprocal W -- confirm at definition
    Integer<SIMD_T> const &primID,       // per-lane primitive IDs
    Integer<SIMD_T> const &viewportIdx,  // per-lane viewport array indices
    Integer<SIMD_T> const &rtIdx);       // per-lane render-target array indices
// Forward declaration: bins post-setup POINT primitives to macro tiles.
// NOTE(review): leading parameters are missing from this extraction of the
// declaration -- verify the full signature against the definition.
template <typename SIMD_T, uint32_t SIMD_WIDTH>
void BinPostSetupPointsImpl(
    Integer<SIMD_T> const &primID,       // per-lane primitive IDs
    Integer<SIMD_T> const &viewportIdx,  // per-lane viewport array indices
    Integer<SIMD_T> const &rtIdx);       // per-lane render-target array indices
62 //////////////////////////////////////////////////////////////////////////
63 /// @brief Processes attributes for the backend based on linkage mask and
64 /// linkage map. Essentially just doing an SOA->AOS conversion and pack.
65 /// @param pDC - Draw context
66 /// @param pa - Primitive Assembly state
67 /// @param linkageMask - Specifies which VS outputs are routed to PS.
68 /// @param pLinkageMap - maps VS attribute slot to PS slot
69 /// @param triIndex - Triangle to process attributes for
70 /// @param pBuffer - Output result
template<typename NumVertsT, typename IsSwizzledT, typename HasConstantInterpT, typename IsDegenerate>
INLINE
// NOTE(review): this extraction is missing the parameter list of
// ProcessAttributes (draw context, PA state, triIndex, primId, pBuffer, ...)
// as well as several braces, 'switch (topo)' headers, case labels, 'break's
// and local declarations (inputSlot, vid, comp) -- verify structure against
// the full header. Comments below document only what the visible code shows.
void ProcessAttributes(
    // Only point (1), line (2) or triangle (3) vertex counts are supported.
    static_assert(NumVertsT::value > 0 && NumVertsT::value <= 3, "Invalid value for NumVertsT");
    const SWR_BACKEND_STATE& backendState = pDC->pState->state.backendState;
    // Conservative Rasterization requires degenerate tris to have constant attribute interpolation
    uint32_t constantInterpMask = IsDegenerate::value ? 0xFFFFFFFF : backendState.constantInterpolationMask;
    const uint32_t provokingVertex = pDC->pState->state.frontendState.topologyProvokingVertex;
    const PRIMITIVE_TOPOLOGY topo = pDC->pState->state.topology;

    // Constant values for component overrides; rows correspond to the
    // SWR_CONSTANT_SOURCE_CONST_* enum values used to index below.
    static const float constTable[3][4] = {
        { 0.0f, 0.0f, 0.0f, 0.0f },
        { 0.0f, 0.0f, 0.0f, 1.0f },
        { 1.0f, 1.0f, 1.0f, 1.0f }

    // SOA->AOS pack: one iteration per backend attribute slot.
    for (uint32_t i = 0; i < backendState.numAttributes; ++i)
        if (IsSwizzledT::value)
            // Remap the source VS output slot through the swizzle map.
            SWR_ATTRIB_SWIZZLE attribSwizzle = backendState.swizzleMap[i];
            inputSlot = backendState.vertexAttribOffset + attribSwizzle.sourceAttrib;
            // Unswizzled path: attributes route 1:1.
            inputSlot = backendState.vertexAttribOffset + i;

        simd4scalar attrib[3]; // triangle attribs (always 4 wide)
        // Remember the output start so component overrides below can patch in place.
        float* pAttribStart = pBuffer;

        if (HasConstantInterpT::value || IsDegenerate::value)
            if (CheckBit(constantInterpMask, i))
                // Constant interpolation: replicate the provoking vertex's value.
                uint32_t adjustedTriIndex;
                // Provoking-vertex remap tables for strip/quad topologies;
                // [triIndex & 1] selects even/odd tri within the quad.
                static const uint32_t tristripProvokingVertex[] = { 0, 2, 1 };
                static const int32_t quadProvokingTri[2][4] = { { 0, 0, 0, 1 },{ 0, -1, 0, 0 } };
                static const uint32_t quadProvokingVertex[2][4] = { { 0, 1, 2, 2 },{ 0, 1, 1, 2 } };
                static const int32_t qstripProvokingTri[2][4] = { { 0, 0, 0, 1 },{ -1, 0, 0, 0 } };
                static const uint32_t qstripProvokingVertex[2][4] = { { 0, 1, 2, 1 },{ 0, 0, 2, 1 } };

                // quad-list case: pick the tri/vertex holding the provoking vertex
                adjustedTriIndex = triIndex + quadProvokingTri[triIndex & 1][provokingVertex];
                vid = quadProvokingVertex[triIndex & 1][provokingVertex];
                // quad-strip case
                adjustedTriIndex = triIndex + qstripProvokingTri[triIndex & 1][provokingVertex];
                vid = qstripProvokingVertex[triIndex & 1][provokingVertex];
            case TOP_TRIANGLE_STRIP:
                adjustedTriIndex = triIndex;
                    ? tristripProvokingVertex[provokingVertex]
                // default case: use the provoking vertex index directly
                adjustedTriIndex = triIndex;
                vid = provokingVertex;

            pa.AssembleSingle(inputSlot, adjustedTriIndex, attrib);
            // Broadcast the provoking vertex's attribute to every output vertex.
            for (uint32_t i = 0; i < NumVertsT::value; ++i)
                SIMD128::store_ps(pBuffer, attrib[vid]);
            // Linear-interpolation path (constant-interp bit not set for slot i).
            pa.AssembleSingle(inputSlot, triIndex, attrib);
            for (uint32_t i = 0; i < NumVertsT::value; ++i)
                SIMD128::store_ps(pBuffer, attrib[i]);
        // No constant interpolation anywhere: straight per-vertex copy.
        pa.AssembleSingle(inputSlot, triIndex, attrib);
        for (uint32_t i = 0; i < NumVertsT::value; ++i)
            SIMD128::store_ps(pBuffer, attrib[i]);

        // pad out the attrib buffer to 3 verts to ensure the triangle
        // interpolation code in the pixel shader works correctly for the
        // 3 topologies - point, line, tri. This effectively zeros out the
        // effect of the missing vertices in the triangle interpolation.
        for (uint32_t v = NumVertsT::value; v < 3; ++v)
            SIMD128::store_ps(pBuffer, attrib[NumVertsT::value - 1]);

        // check for constant source overrides
        if (IsSwizzledT::value)
            uint32_t mask = backendState.swizzleMap[i].componentOverrideMask;
            // Process each overridden component (one bit per component).
            while (_BitScanForward(&comp, mask))
                mask &= ~(1 << comp);  // consume this component's bit
                float constantValue = 0.0f;
                switch ((SWR_CONSTANT_SOURCE)backendState.swizzleMap[i].constantSource)
                case SWR_CONSTANT_SOURCE_CONST_0000:
                case SWR_CONSTANT_SOURCE_CONST_0001_FLOAT:
                case SWR_CONSTANT_SOURCE_CONST_1111_FLOAT:
                    // Table lookup keyed by the constant-source enum value.
                    constantValue = constTable[backendState.swizzleMap[i].constantSource][comp];
                case SWR_CONSTANT_SOURCE_PRIM_ID:
                    // Reinterpret the integer primitive ID's bits as a float.
                    constantValue = *(float*)&primId;

                // apply constant value to all 3 vertices
                for (uint32_t v = 0; v < 3; ++v)
                    pAttribStart[comp + v * 4] = constantValue;
217 typedef void(*PFN_PROCESS_ATTRIBUTES
)(DRAW_CONTEXT
*, PA_STATE
&, uint32_t, uint32_t, float*);
219 struct ProcessAttributesChooser
221 typedef PFN_PROCESS_ATTRIBUTES FuncType
;
223 template <typename
... ArgsB
>
224 static FuncType
GetFunc()
226 return ProcessAttributes
<ArgsB
...>;
230 PFN_PROCESS_ATTRIBUTES
GetProcessAttributesFunc(uint32_t NumVerts
, bool IsSwizzled
, bool HasConstantInterp
, bool IsDegenerate
= false)
232 return TemplateArgUnroller
<ProcessAttributesChooser
>::GetFunc(IntArg
<1, 3>{NumVerts
}, IsSwizzled
, HasConstantInterp
, IsDegenerate
);
235 //////////////////////////////////////////////////////////////////////////
236 /// @brief Processes enabled user clip distances. Loads the active clip
237 /// distances from the PA, sets up barycentric equations, and
238 /// stores the results to the output buffer
239 /// @param pa - Primitive Assembly state
240 /// @param primIndex - primitive index to process
241 /// @param clipDistMask - mask of enabled clip distances
242 /// @param pUserClipBuffer - buffer to store results
// NOTE(review): the declaration of 'clipDist' (output of _BitScanForward)
// and the surrounding braces are missing from this extraction -- verify
// against the full header before relying on exact structure.
template<uint32_t NumVerts>
void ProcessUserClipDist(const SWR_BACKEND_STATE& state, PA_STATE& pa, uint32_t primIndex, float *pRecipW, float* pUserClipBuffer)
    uint32_t clipDistMask = state.clipDistanceMask;
    // One iteration per enabled clip distance (one bit each in the mask).
    while (_BitScanForward(&clipDist, clipDistMask))
        clipDistMask &= ~(1 << clipDist);   // consume this distance's bit
        uint32_t clipSlot = clipDist >> 2;  // which vec4 clip/cull slot (0 or 1)
        uint32_t clipComp = clipDist & 0x3; // component within that slot
        uint32_t clipAttribSlot = clipSlot == 0 ?
            state.vertexClipCullOffset : state.vertexClipCullOffset + 1;

        // Gather the clip-distance vec4 for each vertex of this primitive.
        simd4scalar primClipDist[3];
        pa.AssembleSingle(clipAttribSlot, primIndex, primClipDist);

        // Extract the scalar distance for each of the primitive's vertices.
        float vertClipDist[NumVerts];
        for (uint32_t e = 0; e < NumVerts; ++e)
            OSALIGNSIMD(float) aVertClipDist[4];
            SIMD128::store_ps(aVertClipDist, primClipDist[e]);
            vertClipDist[e] = aVertClipDist[clipComp];

        // setup plane equations for barycentric interpolation in the backend:
        // each distance is scaled by pRecipW, and the last vertex's term is
        // subtracted from the others so the backend can evaluate the plane
        // with barycentric weights.
        float baryCoeff[NumVerts];
        float last = vertClipDist[NumVerts - 1] * pRecipW[NumVerts - 1];
        for (uint32_t e = 0; e < NumVerts - 1; ++e)
            baryCoeff[e] = vertClipDist[e] * pRecipW[e] - last;
        baryCoeff[NumVerts - 1] = last;

        // Emit the coefficients sequentially to the output buffer.
        for (uint32_t e = 0; e < NumVerts; ++e)
            *(pUserClipBuffer++) = baryCoeff[e];
284 void TransposeVertices(simd4scalar(&dst
)[8], const simdscalar
&src0
, const simdscalar
&src1
, const simdscalar
&src2
)
286 vTranspose3x8(dst
, src0
, src1
, src2
);
290 void TransposeVertices(simd4scalar(&dst
)[16], const simd16scalar
&src0
, const simd16scalar
&src1
, const simd16scalar
&src2
)
292 vTranspose4x16(reinterpret_cast<simd16scalar(&)[4]>(dst
), src0
, src1
, src2
, _simd16_setzero_ps());
296 #if KNOB_ENABLE_EARLY_RAST
298 #define ER_SIMD_TILE_X_DIM (1 << ER_SIMD_TILE_X_SHIFT)
299 #define ER_SIMD_TILE_Y_DIM (1 << ER_SIMD_TILE_Y_SHIFT)
302 template<typename SIMD_T
>
303 struct EarlyRastHelper
308 struct EarlyRastHelper
<SIMD256
>
310 static SIMD256::Integer
InitShiftCntrl()
312 return SIMD256::set_epi32(24, 25, 26, 27, 28, 29, 30, 31);
316 #if USE_SIMD16_FRONTEND
318 struct EarlyRastHelper
<SIMD512
>
320 static SIMD512::Integer
InitShiftCntrl()
322 return SIMD512::set_epi32(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
327 //////////////////////////////////////////////////////////////////////////
328 /// @brief Early Rasterizer (ER); triangles that fit small (e.g. 4x4) tile
329 /// (ER tile) can be rasterized as early as in binner to check if
330 /// they cover any pixels. If not - the triangles can be
331 /// culled in binner.
333 /// @param er_bbox - coordinates of ER tile for each triangle
334 /// @param vAi - A coefficients of triangle edges
335 /// @param vBi - B coefficients of triangle edges
336 /// @param vXi - X coordinates of triangle vertices
337 /// @param vYi - Y coordinates of triangle vertices
338 /// @param frontWindingTris - mask indicating CCW/CW triangles
339 /// @param triMask - mask for valid SIMD lanes (triangles)
340 /// @param oneTileMask - defines triangles for ER to work on
341 /// (tris that fit into ER tile)
// NOTE(review): this extraction is missing the 'triMask' and 'cwTrisMask'
// parameters, the #else/#endif separating the specialized 4x4-tile path
// from the generic path, most braces, the vRowEdge restore assignments in
// the generic loop, and the trailing 'return triMask;' -- verify structure
// against the full header. Comments document only what the visible code shows.
template <typename SIMD_T, uint32_t SIMD_WIDTH, typename CT>
uint32_t SIMDCALL EarlyRasterizer(
    SIMDBBOX_T<SIMD_T> &er_bbox,
    Integer<SIMD_T> (&vAi)[3],
    Integer<SIMD_T> (&vBi)[3],
    Integer<SIMD_T> (&vXi)[3],
    Integer<SIMD_T> (&vYi)[3],
    uint32_t oneTileMask)
    // step to pixel center of top-left pixel of the triangle bbox
    Integer<SIMD_T> vTopLeftX = SIMD_T::template slli_epi32<ER_SIMD_TILE_X_SHIFT + FIXED_POINT_SHIFT>(er_bbox.xmin);
    vTopLeftX = SIMD_T::add_epi32(vTopLeftX, SIMD_T::set1_epi32(FIXED_POINT_SCALE / 2));

    Integer<SIMD_T> vTopLeftY = SIMD_T::template slli_epi32<ER_SIMD_TILE_Y_SHIFT + FIXED_POINT_SHIFT>(er_bbox.ymin);
    vTopLeftY = SIMD_T::add_epi32(vTopLeftY, SIMD_T::set1_epi32(FIXED_POINT_SCALE / 2));

    // negate A and B for CW tris
    Integer<SIMD_T> vNegA0 = SIMD_T::mullo_epi32(vAi[0], SIMD_T::set1_epi32(-1));
    Integer<SIMD_T> vNegA1 = SIMD_T::mullo_epi32(vAi[1], SIMD_T::set1_epi32(-1));
    Integer<SIMD_T> vNegA2 = SIMD_T::mullo_epi32(vAi[2], SIMD_T::set1_epi32(-1));
    Integer<SIMD_T> vNegB0 = SIMD_T::mullo_epi32(vBi[0], SIMD_T::set1_epi32(-1));
    Integer<SIMD_T> vNegB1 = SIMD_T::mullo_epi32(vBi[1], SIMD_T::set1_epi32(-1));
    Integer<SIMD_T> vNegB2 = SIMD_T::mullo_epi32(vBi[2], SIMD_T::set1_epi32(-1));

    RDTSC_EVENT(FEEarlyRastEnter, _mm_popcnt_u32(oneTileMask & triMask), 0);

    // Shift each lane's CW bit into that lane's sign bit so blendv_ps can
    // select the negated coefficients per lane.
    Integer<SIMD_T> vShiftCntrl = EarlyRastHelper<SIMD_T>::InitShiftCntrl();
    Integer<SIMD_T> vCwTris = SIMD_T::set1_epi32(cwTrisMask);
    Integer<SIMD_T> vMask = SIMD_T::sllv_epi32(vCwTris, vShiftCntrl);

    // CW triangles take the negated A/B so all lanes share one sign convention.
    vAi[0] = SIMD_T::castps_si(SIMD_T::blendv_ps(SIMD_T::castsi_ps(vAi[0]), SIMD_T::castsi_ps(vNegA0), SIMD_T::castsi_ps(vMask)));
    vAi[1] = SIMD_T::castps_si(SIMD_T::blendv_ps(SIMD_T::castsi_ps(vAi[1]), SIMD_T::castsi_ps(vNegA1), SIMD_T::castsi_ps(vMask)));
    vAi[2] = SIMD_T::castps_si(SIMD_T::blendv_ps(SIMD_T::castsi_ps(vAi[2]), SIMD_T::castsi_ps(vNegA2), SIMD_T::castsi_ps(vMask)));
    vBi[0] = SIMD_T::castps_si(SIMD_T::blendv_ps(SIMD_T::castsi_ps(vBi[0]), SIMD_T::castsi_ps(vNegB0), SIMD_T::castsi_ps(vMask)));
    vBi[1] = SIMD_T::castps_si(SIMD_T::blendv_ps(SIMD_T::castsi_ps(vBi[1]), SIMD_T::castsi_ps(vNegB1), SIMD_T::castsi_ps(vMask)));
    vBi[2] = SIMD_T::castps_si(SIMD_T::blendv_ps(SIMD_T::castsi_ps(vBi[2]), SIMD_T::castsi_ps(vNegB2), SIMD_T::castsi_ps(vMask)));

    // evaluate edge equations at top-left pixel
    Integer<SIMD_T> vDeltaX0 = SIMD_T::sub_epi32(vTopLeftX, vXi[0]);
    Integer<SIMD_T> vDeltaX1 = SIMD_T::sub_epi32(vTopLeftX, vXi[1]);
    Integer<SIMD_T> vDeltaX2 = SIMD_T::sub_epi32(vTopLeftX, vXi[2]);

    Integer<SIMD_T> vDeltaY0 = SIMD_T::sub_epi32(vTopLeftY, vYi[0]);
    Integer<SIMD_T> vDeltaY1 = SIMD_T::sub_epi32(vTopLeftY, vYi[1]);
    Integer<SIMD_T> vDeltaY2 = SIMD_T::sub_epi32(vTopLeftY, vYi[2]);

    Integer<SIMD_T> vAX0 = SIMD_T::mullo_epi32(vAi[0], vDeltaX0);
    Integer<SIMD_T> vAX1 = SIMD_T::mullo_epi32(vAi[1], vDeltaX1);
    Integer<SIMD_T> vAX2 = SIMD_T::mullo_epi32(vAi[2], vDeltaX2);

    Integer<SIMD_T> vBY0 = SIMD_T::mullo_epi32(vBi[0], vDeltaY0);
    Integer<SIMD_T> vBY1 = SIMD_T::mullo_epi32(vBi[1], vDeltaY1);
    Integer<SIMD_T> vBY2 = SIMD_T::mullo_epi32(vBi[2], vDeltaY2);

    // edge value = A*dx + B*dy
    Integer<SIMD_T> vEdge0 = SIMD_T::add_epi32(vAX0, vBY0);
    Integer<SIMD_T> vEdge1 = SIMD_T::add_epi32(vAX1, vBY1);
    Integer<SIMD_T> vEdge2 = SIMD_T::add_epi32(vAX2, vBY2);

    // drop the fixed-point fraction
    vEdge0 = SIMD_T::template srai_epi32<FIXED_POINT_SHIFT>(vEdge0);
    vEdge1 = SIMD_T::template srai_epi32<FIXED_POINT_SHIFT>(vEdge1);
    vEdge2 = SIMD_T::template srai_epi32<FIXED_POINT_SHIFT>(vEdge2);

    // adjusted (edge - 1) values used below for tie-breaking; this appears
    // to implement a top-left-style fill rule -- confirm against upstream.
    Integer<SIMD_T> vEdgeAdjust0 = SIMD_T::sub_epi32(vEdge0, SIMD_T::set1_epi32(1));
    Integer<SIMD_T> vEdgeAdjust1 = SIMD_T::sub_epi32(vEdge1, SIMD_T::set1_epi32(1));
    Integer<SIMD_T> vEdgeAdjust2 = SIMD_T::sub_epi32(vEdge2, SIMD_T::set1_epi32(1));

    // select the adjusted value where A's sign bit is set
    vEdge0 = SIMD_T::castps_si(SIMD_T::blendv_ps(SIMD_T::castsi_ps(vEdge0), SIMD_T::castsi_ps(vEdgeAdjust0), SIMD_T::castsi_ps(vAi[0])));
    vEdge1 = SIMD_T::castps_si(SIMD_T::blendv_ps(SIMD_T::castsi_ps(vEdge1), SIMD_T::castsi_ps(vEdgeAdjust1), SIMD_T::castsi_ps(vAi[1])));
    vEdge2 = SIMD_T::castps_si(SIMD_T::blendv_ps(SIMD_T::castsi_ps(vEdge2), SIMD_T::castsi_ps(vEdgeAdjust2), SIMD_T::castsi_ps(vAi[2])));

    // horizontal edges: A == 0, use B's sign to decide the adjustment
    Integer<SIMD_T> vCmp0 = SIMD_T::cmpeq_epi32(vAi[0], SIMD_T::setzero_si());
    Integer<SIMD_T> vCmp1 = SIMD_T::cmpeq_epi32(vAi[1], SIMD_T::setzero_si());
    Integer<SIMD_T> vCmp2 = SIMD_T::cmpeq_epi32(vAi[2], SIMD_T::setzero_si());

    vCmp0 = SIMD_T::and_si(vCmp0, vBi[0]);
    vCmp1 = SIMD_T::and_si(vCmp1, vBi[1]);
    vCmp2 = SIMD_T::and_si(vCmp2, vBi[2]);

    vEdge0 = SIMD_T::castps_si(SIMD_T::blendv_ps(SIMD_T::castsi_ps(vEdge0), SIMD_T::castsi_ps(vEdgeAdjust0), SIMD_T::castsi_ps(vCmp0)));
    vEdge1 = SIMD_T::castps_si(SIMD_T::blendv_ps(SIMD_T::castsi_ps(vEdge1), SIMD_T::castsi_ps(vEdgeAdjust1), SIMD_T::castsi_ps(vCmp1)));
    vEdge2 = SIMD_T::castps_si(SIMD_T::blendv_ps(SIMD_T::castsi_ps(vEdge2), SIMD_T::castsi_ps(vEdgeAdjust2), SIMD_T::castsi_ps(vCmp2)));

#if ER_SIMD_TILE_X_DIM == 4 && ER_SIMD_TILE_Y_DIM == 4
    // Specialized 4x4 path: walk the 16 pixels in a serpentine order,
    // stepping edge values by A (x step) / B (y step). A pixel is covered
    // when the sign bits of all three edge values are set (and_si keeps the
    // common sign bit; movemask below reads it).
    Integer<SIMD_T> vMask0 = SIMD_T::and_si(vEdge0, vEdge1);
    vMask0 = SIMD_T::and_si(vMask0, vEdge2);

    Integer<SIMD_T> vEdge0N = SIMD_T::add_epi32(vEdge0, vBi[0]);
    Integer<SIMD_T> vEdge1N = SIMD_T::add_epi32(vEdge1, vBi[1]);
    Integer<SIMD_T> vEdge2N = SIMD_T::add_epi32(vEdge2, vBi[2]);
    Integer<SIMD_T> vMask1 = SIMD_T::and_si(vEdge0N, vEdge1N);
    vMask1 = SIMD_T::and_si(vMask1, vEdge2N);

    vEdge0N = SIMD_T::add_epi32(vEdge0N, vBi[0]);
    vEdge1N = SIMD_T::add_epi32(vEdge1N, vBi[1]);
    vEdge2N = SIMD_T::add_epi32(vEdge2N, vBi[2]);
    Integer<SIMD_T> vMask2 = SIMD_T::and_si(vEdge0N, vEdge1N);
    vMask2 = SIMD_T::and_si(vMask2, vEdge2N);

    vEdge0N = SIMD_T::add_epi32(vEdge0N, vBi[0]);
    vEdge1N = SIMD_T::add_epi32(vEdge1N, vBi[1]);
    vEdge2N = SIMD_T::add_epi32(vEdge2N, vBi[2]);
    Integer<SIMD_T> vMask3 = SIMD_T::and_si(vEdge0N, vEdge1N);
    vMask3 = SIMD_T::and_si(vMask3, vEdge2N);

    // One step to the right and then up
    vEdge0N = SIMD_T::add_epi32(vEdge0N, vAi[0]);
    vEdge1N = SIMD_T::add_epi32(vEdge1N, vAi[1]);
    vEdge2N = SIMD_T::add_epi32(vEdge2N, vAi[2]);
    Integer<SIMD_T> vMask4 = SIMD_T::and_si(vEdge0N, vEdge1N);
    vMask4 = SIMD_T::and_si(vMask4, vEdge2N);

    vEdge0N = SIMD_T::sub_epi32(vEdge0N, vBi[0]);
    vEdge1N = SIMD_T::sub_epi32(vEdge1N, vBi[1]);
    vEdge2N = SIMD_T::sub_epi32(vEdge2N, vBi[2]);
    Integer<SIMD_T> vMask5 = SIMD_T::and_si(vEdge0N, vEdge1N);
    vMask5 = SIMD_T::and_si(vMask5, vEdge2N);

    vEdge0N = SIMD_T::sub_epi32(vEdge0N, vBi[0]);
    vEdge1N = SIMD_T::sub_epi32(vEdge1N, vBi[1]);
    vEdge2N = SIMD_T::sub_epi32(vEdge2N, vBi[2]);
    Integer<SIMD_T> vMask6 = SIMD_T::and_si(vEdge0N, vEdge1N);
    vMask6 = SIMD_T::and_si(vMask6, vEdge2N);

    vEdge0N = SIMD_T::sub_epi32(vEdge0N, vBi[0]);
    vEdge1N = SIMD_T::sub_epi32(vEdge1N, vBi[1]);
    vEdge2N = SIMD_T::sub_epi32(vEdge2N, vBi[2]);
    Integer<SIMD_T> vMask7 = SIMD_T::and_si(vEdge0N, vEdge1N);
    vMask7 = SIMD_T::and_si(vMask7, vEdge2N);

    // OR together coverage of the first 8 pixels
    Integer<SIMD_T> vLit1 = SIMD_T::or_si(vMask0, vMask1);
    vLit1 = SIMD_T::or_si(vLit1, vMask2);
    vLit1 = SIMD_T::or_si(vLit1, vMask3);
    vLit1 = SIMD_T::or_si(vLit1, vMask4);
    vLit1 = SIMD_T::or_si(vLit1, vMask5);
    vLit1 = SIMD_T::or_si(vLit1, vMask6);
    vLit1 = SIMD_T::or_si(vLit1, vMask7);

    // Step to the right and go down again
    vEdge0N = SIMD_T::add_epi32(vEdge0N, vAi[0]);
    vEdge1N = SIMD_T::add_epi32(vEdge1N, vAi[1]);
    vEdge2N = SIMD_T::add_epi32(vEdge2N, vAi[2]);
    vMask0 = SIMD_T::and_si(vEdge0N, vEdge1N);
    vMask0 = SIMD_T::and_si(vMask0, vEdge2N);

    vEdge0N = SIMD_T::add_epi32(vEdge0N, vBi[0]);
    vEdge1N = SIMD_T::add_epi32(vEdge1N, vBi[1]);
    vEdge2N = SIMD_T::add_epi32(vEdge2N, vBi[2]);
    vMask1 = SIMD_T::and_si(vEdge0N, vEdge1N);
    vMask1 = SIMD_T::and_si(vMask1, vEdge2N);

    vEdge0N = SIMD_T::add_epi32(vEdge0N, vBi[0]);
    vEdge1N = SIMD_T::add_epi32(vEdge1N, vBi[1]);
    vEdge2N = SIMD_T::add_epi32(vEdge2N, vBi[2]);
    vMask2 = SIMD_T::and_si(vEdge0N, vEdge1N);
    vMask2 = SIMD_T::and_si(vMask2, vEdge2N);

    vEdge0N = SIMD_T::add_epi32(vEdge0N, vBi[0]);
    vEdge1N = SIMD_T::add_epi32(vEdge1N, vBi[1]);
    vEdge2N = SIMD_T::add_epi32(vEdge2N, vBi[2]);
    vMask3 = SIMD_T::and_si(vEdge0N, vEdge1N);
    vMask3 = SIMD_T::and_si(vMask3, vEdge2N);

    // And for the last time - to the right and up
    vEdge0N = SIMD_T::add_epi32(vEdge0N, vAi[0]);
    vEdge1N = SIMD_T::add_epi32(vEdge1N, vAi[1]);
    vEdge2N = SIMD_T::add_epi32(vEdge2N, vAi[2]);
    vMask4 = SIMD_T::and_si(vEdge0N, vEdge1N);
    vMask4 = SIMD_T::and_si(vMask4, vEdge2N);

    vEdge0N = SIMD_T::sub_epi32(vEdge0N, vBi[0]);
    vEdge1N = SIMD_T::sub_epi32(vEdge1N, vBi[1]);
    vEdge2N = SIMD_T::sub_epi32(vEdge2N, vBi[2]);
    vMask5 = SIMD_T::and_si(vEdge0N, vEdge1N);
    vMask5 = SIMD_T::and_si(vMask5, vEdge2N);

    vEdge0N = SIMD_T::sub_epi32(vEdge0N, vBi[0]);
    vEdge1N = SIMD_T::sub_epi32(vEdge1N, vBi[1]);
    vEdge2N = SIMD_T::sub_epi32(vEdge2N, vBi[2]);
    vMask6 = SIMD_T::and_si(vEdge0N, vEdge1N);
    vMask6 = SIMD_T::and_si(vMask6, vEdge2N);

    vEdge0N = SIMD_T::sub_epi32(vEdge0N, vBi[0]);
    vEdge1N = SIMD_T::sub_epi32(vEdge1N, vBi[1]);
    vEdge2N = SIMD_T::sub_epi32(vEdge2N, vBi[2]);
    vMask7 = SIMD_T::and_si(vEdge0N, vEdge1N);
    vMask7 = SIMD_T::and_si(vMask7, vEdge2N);

    // OR together coverage of the remaining 8 pixels
    Integer<SIMD_T> vLit2 = SIMD_T::or_si(vMask0, vMask1);
    vLit2 = SIMD_T::or_si(vLit2, vMask2);
    vLit2 = SIMD_T::or_si(vLit2, vMask3);
    vLit2 = SIMD_T::or_si(vLit2, vMask4);
    vLit2 = SIMD_T::or_si(vLit2, vMask5);
    vLit2 = SIMD_T::or_si(vLit2, vMask6);
    vLit2 = SIMD_T::or_si(vLit2, vMask7);

    Integer<SIMD_T> vLit = SIMD_T::or_si(vLit1, vLit2);

    // NOTE(review): an '#else' separating the path above from the generic
    // path below is missing from this extraction.
    // Generic algorithm sweeping in row by row order
    Integer<SIMD_T> vRowMask[ER_SIMD_TILE_Y_DIM];

    Integer<SIMD_T> vEdge0N = vEdge0;
    Integer<SIMD_T> vEdge1N = vEdge1;
    Integer<SIMD_T> vEdge2N = vEdge2;

    for (uint32_t row = 0; row < ER_SIMD_TILE_Y_DIM; row++)
        // Store edge values at the beginning of the row
        Integer<SIMD_T> vRowEdge0 = vEdge0N;
        Integer<SIMD_T> vRowEdge1 = vEdge1N;
        Integer<SIMD_T> vRowEdge2 = vEdge2N;

        Integer<SIMD_T> vColMask[ER_SIMD_TILE_X_DIM];

        // Step across the row; each column's coverage is the AND of the
        // three edge-value sign bits.
        for (uint32_t col = 0; col < ER_SIMD_TILE_X_DIM; col++)
            vColMask[col] = SIMD_T::and_si(vEdge0N, vEdge1N);
            vColMask[col] = SIMD_T::and_si(vColMask[col], vEdge2N);

            vEdge0N = SIMD_T::add_epi32(vEdge0N, vAi[0]);
            vEdge1N = SIMD_T::add_epi32(vEdge1N, vAi[1]);
            vEdge2N = SIMD_T::add_epi32(vEdge2N, vAi[2]);

        // Fold the row's column coverage into one mask per row.
        vRowMask[row] = vColMask[0];
        for (uint32_t col = 1; col < ER_SIMD_TILE_X_DIM; col++)
            vRowMask[row] = SIMD_T::or_si(vRowMask[row], vColMask[col]);

        // Restore values and go to the next row
        // NOTE(review): the assignments restoring vEdge*N from vRowEdge*
        // appear to be missing from this extraction (vRowEdge0..2 are
        // otherwise unused) -- verify against the full header.
        vEdge0N = SIMD_T::add_epi32(vEdge0N, vBi[0]);
        vEdge1N = SIMD_T::add_epi32(vEdge1N, vBi[1]);
        vEdge2N = SIMD_T::add_epi32(vEdge2N, vBi[2]);

    // compress all masks
    Integer<SIMD_T> vLit = vRowMask[0];
    for (uint32_t row = 1; row < ER_SIMD_TILE_Y_DIM; row++)
        vLit = SIMD_T::or_si(vLit, vRowMask[row]);
    // NOTE(review): the '#endif' closing the 4x4-vs-generic conditional is
    // missing from this extraction.

    // Check which triangles have any pixel lit
    uint32_t maskLit = SIMD_T::movemask_ps(SIMD_T::castsi_ps(vLit));
    uint32_t maskUnlit = ~maskLit & oneTileMask;

    // Drop fully-unlit one-tile triangles from the live-triangle mask.
    uint32_t oldTriMask = triMask;
    triMask &= ~maskUnlit;

    if (triMask ^ oldTriMask)
        RDTSC_EVENT(FEEarlyRastExit, _mm_popcnt_u32(triMask & oneTileMask), 0);
628 #endif // Early rasterizer
630 //////////////////////////////////////////////////////////////////////////
631 /// @brief Bin triangle primitives to macro tiles. Performs setup, clipping
632 /// culling, viewport transform, etc.
633 /// @param pDC - pointer to draw context.
634 /// @param pa - The primitive assembly object.
635 /// @param workerId - thread's worker id. Even thread has a unique id.
636 /// @param tri - Contains triangle position data for SIMDs worth of triangles.
637 /// @param primID - Primitive ID for each triangle.
638 /// @param viewportIdx - viewport array index for each triangle.
639 /// @tparam CT - ConservativeRastFETraits
640 template <typename SIMD_T
, uint32_t SIMD_WIDTH
, typename CT
>
641 void SIMDCALL
BinTrianglesImpl(
647 Integer
<SIMD_T
> const &primID
,
648 Integer
<SIMD_T
> const &viewportIdx
,
649 Integer
<SIMD_T
> const &rtIdx
)
651 const uint32_t *aRTAI
= reinterpret_cast<const uint32_t *>(&rtIdx
);
653 RDTSC_BEGIN(FEBinTriangles
, pDC
->drawId
);
655 const API_STATE
& state
= GetApiState(pDC
);
656 const SWR_RASTSTATE
& rastState
= state
.rastState
;
657 const SWR_FRONTEND_STATE
& feState
= state
.frontendState
;
659 MacroTileMgr
*pTileMgr
= pDC
->pTileMgr
;
661 Float
<SIMD_T
> vRecipW0
= SIMD_T::set1_ps(1.0f
);
662 Float
<SIMD_T
> vRecipW1
= SIMD_T::set1_ps(1.0f
);
663 Float
<SIMD_T
> vRecipW2
= SIMD_T::set1_ps(1.0f
);
665 if (feState
.vpTransformDisable
)
667 // RHW is passed in directly when VP transform is disabled
668 vRecipW0
= tri
[0].v
[3];
669 vRecipW1
= tri
[1].v
[3];
670 vRecipW2
= tri
[2].v
[3];
674 // Perspective divide
675 vRecipW0
= SIMD_T::div_ps(SIMD_T::set1_ps(1.0f
), tri
[0].w
);
676 vRecipW1
= SIMD_T::div_ps(SIMD_T::set1_ps(1.0f
), tri
[1].w
);
677 vRecipW2
= SIMD_T::div_ps(SIMD_T::set1_ps(1.0f
), tri
[2].w
);
679 tri
[0].v
[0] = SIMD_T::mul_ps(tri
[0].v
[0], vRecipW0
);
680 tri
[1].v
[0] = SIMD_T::mul_ps(tri
[1].v
[0], vRecipW1
);
681 tri
[2].v
[0] = SIMD_T::mul_ps(tri
[2].v
[0], vRecipW2
);
683 tri
[0].v
[1] = SIMD_T::mul_ps(tri
[0].v
[1], vRecipW0
);
684 tri
[1].v
[1] = SIMD_T::mul_ps(tri
[1].v
[1], vRecipW1
);
685 tri
[2].v
[1] = SIMD_T::mul_ps(tri
[2].v
[1], vRecipW2
);
687 tri
[0].v
[2] = SIMD_T::mul_ps(tri
[0].v
[2], vRecipW0
);
688 tri
[1].v
[2] = SIMD_T::mul_ps(tri
[1].v
[2], vRecipW1
);
689 tri
[2].v
[2] = SIMD_T::mul_ps(tri
[2].v
[2], vRecipW2
);
691 // Viewport transform to screen space coords
692 if (pa
.viewportArrayActive
)
694 viewportTransform
<3>(tri
, state
.vpMatrices
, viewportIdx
);
698 viewportTransform
<3>(tri
, state
.vpMatrices
);
702 // Adjust for pixel center location
703 Float
<SIMD_T
> offset
= SwrPixelOffsets
<SIMD_T
>::GetOffset(rastState
.pixelLocation
);
705 tri
[0].x
= SIMD_T::add_ps(tri
[0].x
, offset
);
706 tri
[0].y
= SIMD_T::add_ps(tri
[0].y
, offset
);
708 tri
[1].x
= SIMD_T::add_ps(tri
[1].x
, offset
);
709 tri
[1].y
= SIMD_T::add_ps(tri
[1].y
, offset
);
711 tri
[2].x
= SIMD_T::add_ps(tri
[2].x
, offset
);
712 tri
[2].y
= SIMD_T::add_ps(tri
[2].y
, offset
);
714 // Set vXi, vYi to required fixed point precision
715 Integer
<SIMD_T
> vXi
[3], vYi
[3];
716 FPToFixedPoint
<SIMD_T
>(tri
, vXi
, vYi
);
719 Integer
<SIMD_T
> vAi
[3], vBi
[3];
720 triangleSetupABIntVertical(vXi
, vYi
, vAi
, vBi
);
723 Integer
<SIMD_T
> vDet
[2];
724 calcDeterminantIntVertical(vAi
, vBi
, vDet
);
727 uint32_t maskLo
= SIMD_T::movemask_pd(SIMD_T::castsi_pd(SIMD_T::cmpeq_epi64(vDet
[0], SIMD_T::setzero_si())));
728 uint32_t maskHi
= SIMD_T::movemask_pd(SIMD_T::castsi_pd(SIMD_T::cmpeq_epi64(vDet
[1], SIMD_T::setzero_si())));
730 uint32_t cullZeroAreaMask
= maskLo
| (maskHi
<< (SIMD_WIDTH
/ 2));
732 // don't cull degenerate triangles if we're conservatively rasterizing
733 uint32_t origTriMask
= triMask
;
734 if (rastState
.fillMode
== SWR_FILLMODE_SOLID
&& !CT::IsConservativeT::value
)
736 triMask
&= ~cullZeroAreaMask
;
739 // determine front winding tris
742 // 0 area triangles are marked as backfacing regardless of winding order,
743 // which is required behavior for conservative rast and wireframe rendering
744 uint32_t frontWindingTris
;
745 if (rastState
.frontWinding
== SWR_FRONTWINDING_CW
)
747 maskLo
= SIMD_T::movemask_pd(SIMD_T::castsi_pd(SIMD_T::cmpgt_epi64(vDet
[0], SIMD_T::setzero_si())));
748 maskHi
= SIMD_T::movemask_pd(SIMD_T::castsi_pd(SIMD_T::cmpgt_epi64(vDet
[1], SIMD_T::setzero_si())));
752 maskLo
= SIMD_T::movemask_pd(SIMD_T::castsi_pd(SIMD_T::cmpgt_epi64(SIMD_T::setzero_si(), vDet
[0])));
753 maskHi
= SIMD_T::movemask_pd(SIMD_T::castsi_pd(SIMD_T::cmpgt_epi64(SIMD_T::setzero_si(), vDet
[1])));
755 frontWindingTris
= maskLo
| (maskHi
<< (SIMD_WIDTH
/ 2));
759 switch ((SWR_CULLMODE
)rastState
.cullMode
)
761 case SWR_CULLMODE_BOTH
: cullTris
= 0xffffffff; break;
762 case SWR_CULLMODE_NONE
: cullTris
= 0x0; break;
763 case SWR_CULLMODE_FRONT
: cullTris
= frontWindingTris
; break;
764 // 0 area triangles are marked as backfacing, which is required behavior for conservative rast
765 case SWR_CULLMODE_BACK
: cullTris
= ~frontWindingTris
; break;
766 default: SWR_INVALID("Invalid cull mode: %d", rastState
.cullMode
); cullTris
= 0x0; break;
769 triMask
&= ~cullTris
;
771 if (origTriMask
^ triMask
)
773 RDTSC_EVENT(FECullZeroAreaAndBackface
, _mm_popcnt_u32(origTriMask
^ triMask
), 0);
776 AR_EVENT(CullInfoEvent(pDC
->drawId
, cullZeroAreaMask
, cullTris
, origTriMask
));
778 /// Note: these variable initializations must stay above any 'goto endBenTriangles'
779 // compute per tri backface
780 uint32_t frontFaceMask
= frontWindingTris
;
781 uint32_t *pPrimID
= (uint32_t *)&primID
;
782 const uint32_t *pViewportIndex
= (uint32_t *)&viewportIdx
;
786 PFN_WORK_FUNC pfnWork
;
787 if (CT::IsConservativeT::value
)
789 // determine which edges of the degenerate tri, if any, are valid to rasterize.
790 // used to call the appropriate templated rasterizer function
791 if (cullZeroAreaMask
> 0)
794 const Integer
<SIMD_T
> x0x1Mask
= SIMD_T::cmpeq_epi32(vXi
[0], vXi
[1]);
795 const Integer
<SIMD_T
> y0y1Mask
= SIMD_T::cmpeq_epi32(vYi
[0], vYi
[1]);
797 uint32_t e0Mask
= SIMD_T::movemask_ps(SIMD_T::castsi_ps(SIMD_T::and_si(x0x1Mask
, y0y1Mask
)));
800 const Integer
<SIMD_T
> x1x2Mask
= SIMD_T::cmpeq_epi32(vXi
[1], vXi
[2]);
801 const Integer
<SIMD_T
> y1y2Mask
= SIMD_T::cmpeq_epi32(vYi
[1], vYi
[2]);
803 uint32_t e1Mask
= SIMD_T::movemask_ps(SIMD_T::castsi_ps(SIMD_T::and_si(x1x2Mask
, y1y2Mask
)));
806 // if v0 == v1 & v1 == v2, v0 == v2
807 uint32_t e2Mask
= e0Mask
& e1Mask
;
808 SWR_ASSERT(KNOB_SIMD_WIDTH
== 8, "Need to update degenerate mask code for avx512");
810 // edge order: e0 = v0v1, e1 = v1v2, e2 = v0v2
811 // 32 bit binary: 0000 0000 0010 0100 1001 0010 0100 1001
812 e0Mask
= pdep_u32(e0Mask
, 0x00249249);
814 // 32 bit binary: 0000 0000 0100 1001 0010 0100 1001 0010
815 e1Mask
= pdep_u32(e1Mask
, 0x00492492);
817 // 32 bit binary: 0000 0000 1001 0010 0100 1001 0010 0100
818 e2Mask
= pdep_u32(e2Mask
, 0x00924924);
820 edgeEnable
= (0x00FFFFFF & (~(e0Mask
| e1Mask
| e2Mask
)));
824 edgeEnable
= 0x00FFFFFF;
829 // degenerate triangles won't be sent to rasterizer; just enable all edges
830 pfnWork
= GetRasterizerFunc(rastState
.sampleCount
, rastState
.bIsCenterPattern
, (rastState
.conservativeRast
> 0),
831 (SWR_INPUT_COVERAGE
)pDC
->pState
->state
.psState
.inputCoverage
, EdgeValToEdgeState(ALL_EDGES_VALID
), (state
.scissorsTileAligned
== false));
834 SIMDBBOX_T
<SIMD_T
> bbox
;
838 goto endBinTriangles
;
841 // Calc bounding box of triangles
842 calcBoundingBoxIntVertical
<SIMD_T
, CT
>(vXi
, vYi
, bbox
);
844 // determine if triangle falls between pixel centers and discard
845 // only discard for non-MSAA case and when conservative rast is disabled
846 // (xmin + 127) & ~255
847 // (xmax + 128) & ~255
848 if ((rastState
.sampleCount
== SWR_MULTISAMPLE_1X
|| rastState
.bIsCenterPattern
) &&
849 (!CT::IsConservativeT::value
))
851 origTriMask
= triMask
;
856 Integer
<SIMD_T
> xmin
= SIMD_T::add_epi32(bbox
.xmin
, SIMD_T::set1_epi32(127));
857 xmin
= SIMD_T::and_si(xmin
, SIMD_T::set1_epi32(~255));
858 Integer
<SIMD_T
> xmax
= SIMD_T::add_epi32(bbox
.xmax
, SIMD_T::set1_epi32(128));
859 xmax
= SIMD_T::and_si(xmax
, SIMD_T::set1_epi32(~255));
861 Integer
<SIMD_T
> vMaskH
= SIMD_T::cmpeq_epi32(xmin
, xmax
);
863 Integer
<SIMD_T
> ymin
= SIMD_T::add_epi32(bbox
.ymin
, SIMD_T::set1_epi32(127));
864 ymin
= SIMD_T::and_si(ymin
, SIMD_T::set1_epi32(~255));
865 Integer
<SIMD_T
> ymax
= SIMD_T::add_epi32(bbox
.ymax
, SIMD_T::set1_epi32(128));
866 ymax
= SIMD_T::and_si(ymax
, SIMD_T::set1_epi32(~255));
868 Integer
<SIMD_T
> vMaskV
= SIMD_T::cmpeq_epi32(ymin
, ymax
);
870 vMaskV
= SIMD_T::or_si(vMaskH
, vMaskV
);
871 cullCenterMask
= SIMD_T::movemask_ps(SIMD_T::castsi_ps(vMaskV
));
874 triMask
&= ~cullCenterMask
;
876 if (origTriMask
^ triMask
)
878 RDTSC_EVENT(FECullBetweenCenters
, _mm_popcnt_u32(origTriMask
^ triMask
), 0);
882 // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
883 // Gather the AOS effective scissor rects based on the per-prim VP index.
884 /// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer.
886 Integer
<SIMD_T
> scisXmin
, scisYmin
, scisXmax
, scisYmax
;
887 if (pa
.viewportArrayActive
)
890 GatherScissors(&state
.scissorsInFixedPoint
[0], pViewportIndex
, scisXmin
, scisYmin
, scisXmax
, scisYmax
);
892 else // broadcast fast path for non-VPAI case.
894 scisXmin
= SIMD_T::set1_epi32(state
.scissorsInFixedPoint
[0].xmin
);
895 scisYmin
= SIMD_T::set1_epi32(state
.scissorsInFixedPoint
[0].ymin
);
896 scisXmax
= SIMD_T::set1_epi32(state
.scissorsInFixedPoint
[0].xmax
);
897 scisYmax
= SIMD_T::set1_epi32(state
.scissorsInFixedPoint
[0].ymax
);
900 // Make triangle bbox inclusive
901 bbox
.xmax
= SIMD_T::sub_epi32(bbox
.xmax
, SIMD_T::set1_epi32(1));
902 bbox
.ymax
= SIMD_T::sub_epi32(bbox
.ymax
, SIMD_T::set1_epi32(1));
904 bbox
.xmin
= SIMD_T::max_epi32(bbox
.xmin
, scisXmin
);
905 bbox
.ymin
= SIMD_T::max_epi32(bbox
.ymin
, scisYmin
);
906 bbox
.xmax
= SIMD_T::min_epi32(bbox
.xmax
, scisXmax
);
907 bbox
.ymax
= SIMD_T::min_epi32(bbox
.ymax
, scisYmax
);
910 if (CT::IsConservativeT::value
)
912 // in the case where a degenerate triangle is on a scissor edge, we need to make sure the primitive bbox has
913 // some area. Bump the xmax/ymax edges out
915 Integer
<SIMD_T
> topEqualsBottom
= SIMD_T::cmpeq_epi32(bbox
.ymin
, bbox
.ymax
);
916 bbox
.ymax
= SIMD_T::blendv_epi32(bbox
.ymax
, SIMD_T::add_epi32(bbox
.ymax
, SIMD_T::set1_epi32(1)), topEqualsBottom
);
918 Integer
<SIMD_T
> leftEqualsRight
= SIMD_T::cmpeq_epi32(bbox
.xmin
, bbox
.xmax
);
919 bbox
.xmax
= SIMD_T::blendv_epi32(bbox
.xmax
, SIMD_T::add_epi32(bbox
.xmax
, SIMD_T::set1_epi32(1)), leftEqualsRight
);
922 // Cull tris completely outside scissor
924 Integer
<SIMD_T
> maskOutsideScissorX
= SIMD_T::cmpgt_epi32(bbox
.xmin
, bbox
.xmax
);
925 Integer
<SIMD_T
> maskOutsideScissorY
= SIMD_T::cmpgt_epi32(bbox
.ymin
, bbox
.ymax
);
926 Integer
<SIMD_T
> maskOutsideScissorXY
= SIMD_T::or_si(maskOutsideScissorX
, maskOutsideScissorY
);
927 uint32_t maskOutsideScissor
= SIMD_T::movemask_ps(SIMD_T::castsi_ps(maskOutsideScissorXY
));
928 triMask
= triMask
& ~maskOutsideScissor
;
931 #if KNOB_ENABLE_EARLY_RAST
932 if (rastState
.sampleCount
== SWR_MULTISAMPLE_1X
&& !CT::IsConservativeT::value
)
934 // Try early rasterization - culling small triangles which do not cover any pixels
936 // convert to ER tiles
937 SIMDBBOX_T
<SIMD_T
> er_bbox
;
939 er_bbox
.xmin
= SIMD_T::template srai_epi32
<ER_SIMD_TILE_X_SHIFT
+ FIXED_POINT_SHIFT
>(bbox
.xmin
);
940 er_bbox
.xmax
= SIMD_T::template srai_epi32
<ER_SIMD_TILE_X_SHIFT
+ FIXED_POINT_SHIFT
>(bbox
.xmax
);
941 er_bbox
.ymin
= SIMD_T::template srai_epi32
<ER_SIMD_TILE_Y_SHIFT
+ FIXED_POINT_SHIFT
>(bbox
.ymin
);
942 er_bbox
.ymax
= SIMD_T::template srai_epi32
<ER_SIMD_TILE_Y_SHIFT
+ FIXED_POINT_SHIFT
>(bbox
.ymax
);
944 Integer
<SIMD_T
> vTileX
= SIMD_T::cmpeq_epi32(er_bbox
.xmin
, er_bbox
.xmax
);
945 Integer
<SIMD_T
> vTileY
= SIMD_T::cmpeq_epi32(er_bbox
.ymin
, er_bbox
.ymax
);
947 // Take only triangles that fit into ER tile
948 uint32_t oneTileMask
= triMask
& SIMD_T::movemask_ps(SIMD_T::castsi_ps(SIMD_T::and_si(vTileX
, vTileY
)));
952 // determine CW tris (det > 0)
953 uint32_t maskCwLo
= SIMD_T::movemask_pd(SIMD_T::castsi_pd(SIMD_T::cmpgt_epi64(vDet
[0], SIMD_T::setzero_si())));
954 uint32_t maskCwHi
= SIMD_T::movemask_pd(SIMD_T::castsi_pd(SIMD_T::cmpgt_epi64(vDet
[1], SIMD_T::setzero_si())));
955 uint32_t cwTrisMask
= maskCwLo
| (maskCwHi
<< (SIMD_WIDTH
/ 2));
957 // Try early rasterization
958 triMask
= EarlyRasterizer
<SIMD_T
, SIMD_WIDTH
, CT
>(er_bbox
, vAi
, vBi
, vXi
, vYi
, cwTrisMask
, triMask
, oneTileMask
);
962 RDTSC_END(FEBinTriangles
, 1);
973 // Send surviving triangles to the line or point binner based on fill mode
974 if (rastState
.fillMode
== SWR_FILLMODE_WIREFRAME
)
976 // Simple non-conformant wireframe mode, useful for debugging
977 // construct 3 SIMD lines out of the triangle and call the line binner for each SIMD
978 Vec4
<SIMD_T
> line
[2];
979 Float
<SIMD_T
> recipW
[2];
983 recipW
[0] = vRecipW0
;
984 recipW
[1] = vRecipW1
;
986 BinPostSetupLinesImpl
<SIMD_T
, SIMD_WIDTH
>(pDC
, pa
, workerId
, line
, recipW
, triMask
, primID
, viewportIdx
, rtIdx
);
990 recipW
[0] = vRecipW1
;
991 recipW
[1] = vRecipW2
;
993 BinPostSetupLinesImpl
<SIMD_T
, SIMD_WIDTH
>(pDC
, pa
, workerId
, line
, recipW
, triMask
, primID
, viewportIdx
, rtIdx
);
997 recipW
[0] = vRecipW2
;
998 recipW
[1] = vRecipW0
;
1000 BinPostSetupLinesImpl
<SIMD_T
, SIMD_WIDTH
>(pDC
, pa
, workerId
, line
, recipW
, triMask
, primID
, viewportIdx
, rtIdx
);
1002 RDTSC_END(FEBinTriangles
, 1);
1005 else if (rastState
.fillMode
== SWR_FILLMODE_POINT
)
1008 BinPostSetupPointsImpl
<SIMD_T
, SIMD_WIDTH
>(pDC
, pa
, workerId
, &tri
[0], triMask
, primID
, viewportIdx
, rtIdx
);
1009 BinPostSetupPointsImpl
<SIMD_T
, SIMD_WIDTH
>(pDC
, pa
, workerId
, &tri
[1], triMask
, primID
, viewportIdx
, rtIdx
);
1010 BinPostSetupPointsImpl
<SIMD_T
, SIMD_WIDTH
>(pDC
, pa
, workerId
, &tri
[2], triMask
, primID
, viewportIdx
, rtIdx
);
1012 RDTSC_END(FEBinTriangles
, 1);
1016 // Convert triangle bbox to macrotile units.
1017 bbox
.xmin
= SIMD_T::template srai_epi32
<KNOB_MACROTILE_X_DIM_FIXED_SHIFT
>(bbox
.xmin
);
1018 bbox
.ymin
= SIMD_T::template srai_epi32
<KNOB_MACROTILE_Y_DIM_FIXED_SHIFT
>(bbox
.ymin
);
1019 bbox
.xmax
= SIMD_T::template srai_epi32
<KNOB_MACROTILE_X_DIM_FIXED_SHIFT
>(bbox
.xmax
);
1020 bbox
.ymax
= SIMD_T::template srai_epi32
<KNOB_MACROTILE_Y_DIM_FIXED_SHIFT
>(bbox
.ymax
);
1022 OSALIGNSIMD16(uint32_t) aMTLeft
[SIMD_WIDTH
], aMTRight
[SIMD_WIDTH
], aMTTop
[SIMD_WIDTH
], aMTBottom
[SIMD_WIDTH
];
1024 SIMD_T::store_si(reinterpret_cast<Integer
<SIMD_T
> *>(aMTLeft
), bbox
.xmin
);
1025 SIMD_T::store_si(reinterpret_cast<Integer
<SIMD_T
> *>(aMTRight
), bbox
.xmax
);
1026 SIMD_T::store_si(reinterpret_cast<Integer
<SIMD_T
> *>(aMTTop
), bbox
.ymin
);
1027 SIMD_T::store_si(reinterpret_cast<Integer
<SIMD_T
> *>(aMTBottom
), bbox
.ymax
);
1029 // transpose verts needed for backend
1030 /// @todo modify BE to take non-transformed verts
1031 OSALIGNSIMD16(simd4scalar
) vHorizX
[SIMD_WIDTH
];
1032 OSALIGNSIMD16(simd4scalar
) vHorizY
[SIMD_WIDTH
];
1033 OSALIGNSIMD16(simd4scalar
) vHorizZ
[SIMD_WIDTH
];
1034 OSALIGNSIMD16(simd4scalar
) vHorizW
[SIMD_WIDTH
];
1036 TransposeVertices(vHorizX
, tri
[0].x
, tri
[1].x
, tri
[2].x
);
1037 TransposeVertices(vHorizY
, tri
[0].y
, tri
[1].y
, tri
[2].y
);
1038 TransposeVertices(vHorizZ
, tri
[0].z
, tri
[1].z
, tri
[2].z
);
1039 TransposeVertices(vHorizW
, vRecipW0
, vRecipW1
, vRecipW2
);
1041 // scan remaining valid triangles and bin each separately
1042 while (_BitScanForward(&triIndex
, triMask
))
1044 uint32_t linkageCount
= state
.backendState
.numAttributes
;
1045 uint32_t numScalarAttribs
= linkageCount
* 4;
1051 if (CT::IsConservativeT::value
)
1053 // only rasterize valid edges if we have a degenerate primitive
1054 int32_t triEdgeEnable
= (edgeEnable
>> (triIndex
* 3)) & ALL_EDGES_VALID
;
1055 work
.pfnWork
= GetRasterizerFunc(rastState
.sampleCount
, rastState
.bIsCenterPattern
, (rastState
.conservativeRast
> 0),
1056 (SWR_INPUT_COVERAGE
)pDC
->pState
->state
.psState
.inputCoverage
, EdgeValToEdgeState(triEdgeEnable
), (state
.scissorsTileAligned
== false));
1058 // Degenerate triangles are required to be constant interpolated
1059 isDegenerate
= (triEdgeEnable
!= ALL_EDGES_VALID
) ? true : false;
1063 isDegenerate
= false;
1064 work
.pfnWork
= pfnWork
;
1067 // Select attribute processor
1068 PFN_PROCESS_ATTRIBUTES pfnProcessAttribs
= GetProcessAttributesFunc(3,
1069 state
.backendState
.swizzleEnable
, state
.backendState
.constantInterpolationMask
, isDegenerate
);
1071 TRIANGLE_WORK_DESC
&desc
= work
.desc
.tri
;
1073 desc
.triFlags
.frontFacing
= state
.forceFront
? 1 : ((frontFaceMask
>> triIndex
) & 1);
1074 desc
.triFlags
.renderTargetArrayIndex
= aRTAI
[triIndex
];
1075 desc
.triFlags
.viewportIndex
= pViewportIndex
[triIndex
];
1077 auto pArena
= pDC
->pArena
;
1078 SWR_ASSERT(pArena
!= nullptr);
1080 // store active attribs
1081 float *pAttribs
= (float*)pArena
->AllocAligned(numScalarAttribs
* 3 * sizeof(float), 16);
1082 desc
.pAttribs
= pAttribs
;
1083 desc
.numAttribs
= linkageCount
;
1084 pfnProcessAttribs(pDC
, pa
, triIndex
, pPrimID
[triIndex
], desc
.pAttribs
);
1086 // store triangle vertex data
1087 desc
.pTriBuffer
= (float*)pArena
->AllocAligned(4 * 4 * sizeof(float), 16);
1089 SIMD128::store_ps(&desc
.pTriBuffer
[0], vHorizX
[triIndex
]);
1090 SIMD128::store_ps(&desc
.pTriBuffer
[4], vHorizY
[triIndex
]);
1091 SIMD128::store_ps(&desc
.pTriBuffer
[8], vHorizZ
[triIndex
]);
1092 SIMD128::store_ps(&desc
.pTriBuffer
[12], vHorizW
[triIndex
]);
1094 // store user clip distances
1095 if (state
.backendState
.clipDistanceMask
)
1097 uint32_t numClipDist
= _mm_popcnt_u32(state
.backendState
.clipDistanceMask
);
1098 desc
.pUserClipBuffer
= (float*)pArena
->Alloc(numClipDist
* 3 * sizeof(float));
1099 ProcessUserClipDist
<3>(state
.backendState
, pa
, triIndex
, &desc
.pTriBuffer
[12], desc
.pUserClipBuffer
);
1102 for (uint32_t y
= aMTTop
[triIndex
]; y
<= aMTBottom
[triIndex
]; ++y
)
1104 for (uint32_t x
= aMTLeft
[triIndex
]; x
<= aMTRight
[triIndex
]; ++x
)
1106 #if KNOB_ENABLE_TOSS_POINTS
1107 if (!KNOB_TOSS_SETUP_TRIS
)
1110 pTileMgr
->enqueue(x
, y
, &work
);
1115 triMask
&= ~(1 << triIndex
);
1118 RDTSC_END(FEBinTriangles
, 1);
// NOTE(review): SIMD256-width entry point; forwards to BinTrianglesImpl<SIMD256, KNOB_SIMD_WIDTH, CT>.
// The extraction that produced this chunk dropped interior lines (function name, several
// parameters, braces) and fused original line numbers into the text — tokens below are
// preserved verbatim; restore from upstream binner.cpp before compiling.
1121 template <typename CT
>
1128 simdscalari
const &primID
,
1129 simdscalari
const &viewportIdx
,
1130 simdscalari
const &rtIdx
)
1132 BinTrianglesImpl
<SIMD256
, KNOB_SIMD_WIDTH
, CT
>(pDC
, pa
, workerId
, tri
, triMask
, primID
, viewportIdx
, rtIdx
);
// NOTE(review): SIMD512 (16-wide frontend) entry point; forwards to
// BinTrianglesImpl<SIMD512, KNOB_SIMD16_WIDTH, CT>. Extraction-mangled text preserved verbatim.
1135 #if USE_SIMD16_FRONTEND
1136 template <typename CT
>
1137 void SIMDCALL
BinTriangles_simd16(
1141 simd16vector tri
[3],
1143 simd16scalari
const &primID
,
1144 simd16scalari
const &viewportIdx
,
1145 simd16scalari
const &rtIdx
)
1147 BinTrianglesImpl
<SIMD512
, KNOB_SIMD16_WIDTH
, CT
>(pDC
, pa
, workerId
, tri
, triMask
, primID
, viewportIdx
, rtIdx
);
// NOTE(review): Chooser used by TemplateArgUnroller to map a runtime bool (conservative
// rasterization on/off) onto the matching BinTriangles<ConservativeRastFETraits<...>>
// template instantiation. Mangled extraction; tokens preserved verbatim.
1151 struct FEBinTrianglesChooser
1153 typedef PFN_PROCESS_PRIMS FuncType
;
1155 template <typename
... ArgsB
>
1156 static FuncType
GetFunc()
1158 return BinTriangles
<ConservativeRastFETraits
<ArgsB
...>>;
1162 // Selector for correct templated BinTriangles function
// (returns the instantiation matching the IsConservative runtime flag)
1163 PFN_PROCESS_PRIMS
GetBinTrianglesFunc(bool IsConservative
)
1165 return TemplateArgUnroller
<FEBinTrianglesChooser
>::GetFunc(IsConservative
);
// NOTE(review): simd16 counterpart of FEBinTrianglesChooser — selects the
// BinTriangles_simd16 instantiation. Mangled extraction; tokens preserved verbatim.
1168 #if USE_SIMD16_FRONTEND
1169 struct FEBinTrianglesChooser_simd16
1171 typedef PFN_PROCESS_PRIMS_SIMD16 FuncType
;
1173 template <typename
... ArgsB
>
1174 static FuncType
GetFunc()
1176 return BinTriangles_simd16
<ConservativeRastFETraits
<ArgsB
...>>;
1180 // Selector for correct templated BinTriangles function
// (simd16 variant; returns the instantiation matching the IsConservative runtime flag)
1181 PFN_PROCESS_PRIMS_SIMD16
GetBinTrianglesFunc_simd16(bool IsConservative
)
1183 return TemplateArgUnroller
<FEBinTrianglesChooser_simd16
>::GetFunc(IsConservative
);
// NOTE(review): Bins SIMD points (post viewport/setup) to macrotiles. Two paths are
// visible below: a simple-point fast path (1-pixel points, one macrotile each, rasterized
// by RasterizeSimplePoint) and a bloated-point path (point size > 1, bbox built from the
// point size, possibly spanning multiple macrotiles, rasterized by RasterizeTriPoint).
// This chunk is a lossy extraction: braces, some parameter lines and the BE_WORK
// declarations were dropped, and original line numbers are fused into the text.
// All original tokens are preserved verbatim; restore from upstream binner.cpp.
1188 template <typename SIMD_T
, uint32_t SIMD_WIDTH
>
1189 void BinPostSetupPointsImpl(
1193 Vec4
<SIMD_T
> prim
[],
1195 Integer
<SIMD_T
> const &primID
,
1196 Integer
<SIMD_T
> const &viewportIdx
,
1197 Integer
<SIMD_T
> const &rtIdx
)
1199 RDTSC_BEGIN(FEBinPoints
, pDC
->drawId
);
1201 Vec4
<SIMD_T
> &primVerts
= prim
[0];
1203 const API_STATE
& state
= GetApiState(pDC
);
1204 const SWR_RASTSTATE
& rastState
= state
.rastState
;
1205 const uint32_t *pViewportIndex
= (uint32_t *)&viewportIdx
;
1207 // Select attribute processor
1208 PFN_PROCESS_ATTRIBUTES pfnProcessAttribs
= GetProcessAttributesFunc(1,
1209 state
.backendState
.swizzleEnable
, state
.backendState
.constantInterpolationMask
);
1211 // convert to fixed point
1212 Integer
<SIMD_T
> vXi
, vYi
;
1214 vXi
= fpToFixedPointVertical
<SIMD_T
>(primVerts
.x
);
1215 vYi
= fpToFixedPointVertical
<SIMD_T
>(primVerts
.y
);
// Fast path: simple 1-pixel points — each point lands in exactly one macrotile.
1217 if (CanUseSimplePoints(pDC
))
1219 // adjust for ymin-xmin rule
1220 vXi
= SIMD_T::sub_epi32(vXi
, SIMD_T::set1_epi32(1));
1221 vYi
= SIMD_T::sub_epi32(vYi
, SIMD_T::set1_epi32(1));
1223 // cull points off the ymin-xmin edge of the viewport
1224 primMask
&= ~SIMD_T::movemask_ps(SIMD_T::castsi_ps(vXi
));
1225 primMask
&= ~SIMD_T::movemask_ps(SIMD_T::castsi_ps(vYi
));
1227 // compute macro tile coordinates
1228 Integer
<SIMD_T
> macroX
= SIMD_T::template srai_epi32
<KNOB_MACROTILE_X_DIM_FIXED_SHIFT
>(vXi
);
1229 Integer
<SIMD_T
> macroY
= SIMD_T::template srai_epi32
<KNOB_MACROTILE_Y_DIM_FIXED_SHIFT
>(vYi
);
1231 OSALIGNSIMD16(uint32_t) aMacroX
[SIMD_WIDTH
], aMacroY
[SIMD_WIDTH
];
1233 SIMD_T::store_si(reinterpret_cast<Integer
<SIMD_T
> *>(aMacroX
), macroX
);
1234 SIMD_T::store_si(reinterpret_cast<Integer
<SIMD_T
> *>(aMacroY
), macroY
);
1236 // compute raster tile coordinates
1237 Integer
<SIMD_T
> rasterX
= SIMD_T::template srai_epi32
<KNOB_TILE_X_DIM_SHIFT
+ FIXED_POINT_SHIFT
>(vXi
);
1238 Integer
<SIMD_T
> rasterY
= SIMD_T::template srai_epi32
<KNOB_TILE_Y_DIM_SHIFT
+ FIXED_POINT_SHIFT
>(vYi
);
1240 // compute raster tile relative x,y for coverage mask
1241 Integer
<SIMD_T
> tileAlignedX
= SIMD_T::template slli_epi32
<KNOB_TILE_X_DIM_SHIFT
>(rasterX
);
1242 Integer
<SIMD_T
> tileAlignedY
= SIMD_T::template slli_epi32
<KNOB_TILE_Y_DIM_SHIFT
>(rasterY
);
1244 Integer
<SIMD_T
> tileRelativeX
= SIMD_T::sub_epi32(SIMD_T::template srai_epi32
<FIXED_POINT_SHIFT
>(vXi
), tileAlignedX
);
1245 Integer
<SIMD_T
> tileRelativeY
= SIMD_T::sub_epi32(SIMD_T::template srai_epi32
<FIXED_POINT_SHIFT
>(vYi
), tileAlignedY
);
1247 OSALIGNSIMD16(uint32_t) aTileRelativeX
[SIMD_WIDTH
];
1248 OSALIGNSIMD16(uint32_t) aTileRelativeY
[SIMD_WIDTH
];
1250 SIMD_T::store_si(reinterpret_cast<Integer
<SIMD_T
> *>(aTileRelativeX
), tileRelativeX
);
1251 SIMD_T::store_si(reinterpret_cast<Integer
<SIMD_T
> *>(aTileRelativeY
), tileRelativeY
);
1253 OSALIGNSIMD16(uint32_t) aTileAlignedX
[SIMD_WIDTH
];
1254 OSALIGNSIMD16(uint32_t) aTileAlignedY
[SIMD_WIDTH
];
1256 SIMD_T::store_si(reinterpret_cast<Integer
<SIMD_T
> *>(aTileAlignedX
), tileAlignedX
);
1257 SIMD_T::store_si(reinterpret_cast<Integer
<SIMD_T
> *>(aTileAlignedY
), tileAlignedY
);
1259 OSALIGNSIMD16(float) aZ
[SIMD_WIDTH
];
1260 SIMD_T::store_ps(reinterpret_cast<float *>(aZ
), primVerts
.z
);
1262 // store render target array index
1263 const uint32_t *aRTAI
= reinterpret_cast<const uint32_t *>(&rtIdx
);
1265 uint32_t *pPrimID
= (uint32_t *)&primID
;
1266 DWORD primIndex
= 0;
1268 const SWR_BACKEND_STATE
& backendState
= pDC
->pState
->state
.backendState
;
1270 // scan remaining valid triangles and bin each separately
1271 while (_BitScanForward(&primIndex
, primMask
))
1273 uint32_t linkageCount
= backendState
.numAttributes
;
1274 uint32_t numScalarAttribs
= linkageCount
* 4;
1279 TRIANGLE_WORK_DESC
&desc
= work
.desc
.tri
;
1281 // points are always front facing
1282 desc
.triFlags
.frontFacing
= 1;
1283 desc
.triFlags
.renderTargetArrayIndex
= aRTAI
[primIndex
];
1284 desc
.triFlags
.viewportIndex
= pViewportIndex
[primIndex
];
1286 work
.pfnWork
= RasterizeSimplePoint
;
1288 auto pArena
= pDC
->pArena
;
1289 SWR_ASSERT(pArena
!= nullptr);
1292 float *pAttribs
= (float*)pArena
->AllocAligned(3 * numScalarAttribs
* sizeof(float), 16);
1293 desc
.pAttribs
= pAttribs
;
1294 desc
.numAttribs
= linkageCount
;
1296 pfnProcessAttribs(pDC
, pa
, primIndex
, pPrimID
[primIndex
], pAttribs
);
1298 // store raster tile aligned x, y, perspective correct z
1299 float *pTriBuffer
= (float*)pArena
->AllocAligned(4 * sizeof(float), 16);
1300 desc
.pTriBuffer
= pTriBuffer
;
1301 *(uint32_t*)pTriBuffer
++ = aTileAlignedX
[primIndex
];
1302 *(uint32_t*)pTriBuffer
++ = aTileAlignedY
[primIndex
];
1303 *pTriBuffer
= aZ
[primIndex
];
1305 uint32_t tX
= aTileRelativeX
[primIndex
];
1306 uint32_t tY
= aTileRelativeY
[primIndex
];
1308 // pack the relative x,y into the coverageMask, the rasterizer will
1309 // generate the true coverage mask from it
1310 work
.desc
.tri
.triFlags
.coverageMask
= tX
| (tY
<< 4);
1313 MacroTileMgr
*pTileMgr
= pDC
->pTileMgr
;
1314 #if KNOB_ENABLE_TOSS_POINTS
1315 if (!KNOB_TOSS_SETUP_TRIS
)
1318 pTileMgr
->enqueue(aMacroX
[primIndex
], aMacroY
[primIndex
], &work
);
1321 primMask
&= ~(1 << primIndex
);
1326 // non simple points need to be potentially binned to multiple macro tiles
1327 Float
<SIMD_T
> vPointSize
;
// Point size comes either per-vertex (pointParam / VERTEX_SGV_SLOT) or from the
// constant rastState.pointSize.
1329 if (rastState
.pointParam
)
1331 Vec4
<SIMD_T
> size
[3];
1332 pa
.Assemble(VERTEX_SGV_SLOT
, size
);
1333 vPointSize
= size
[0][VERTEX_SGV_POINT_SIZE_COMP
];
1337 vPointSize
= SIMD_T::set1_ps(rastState
.pointSize
);
1340 // bloat point to bbox
1341 SIMDBBOX_T
<SIMD_T
> bbox
;
1343 bbox
.xmin
= bbox
.xmax
= vXi
;
1344 bbox
.ymin
= bbox
.ymax
= vYi
;
1346 Float
<SIMD_T
> vHalfWidth
= SIMD_T::mul_ps(vPointSize
, SIMD_T::set1_ps(0.5f
));
1347 Integer
<SIMD_T
> vHalfWidthi
= fpToFixedPointVertical
<SIMD_T
>(vHalfWidth
);
1349 bbox
.xmin
= SIMD_T::sub_epi32(bbox
.xmin
, vHalfWidthi
);
1350 bbox
.xmax
= SIMD_T::add_epi32(bbox
.xmax
, vHalfWidthi
);
1351 bbox
.ymin
= SIMD_T::sub_epi32(bbox
.ymin
, vHalfWidthi
);
1352 bbox
.ymax
= SIMD_T::add_epi32(bbox
.ymax
, vHalfWidthi
);
1354 // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
1355 // Gather the AOS effective scissor rects based on the per-prim VP index.
1356 /// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer.
1358 Integer
<SIMD_T
> scisXmin
, scisYmin
, scisXmax
, scisYmax
;
1360 if (pa
.viewportArrayActive
)
1362 GatherScissors(&state
.scissorsInFixedPoint
[0], pViewportIndex
, scisXmin
, scisYmin
, scisXmax
, scisYmax
);
1364 else // broadcast fast path for non-VPAI case.
1366 scisXmin
= SIMD_T::set1_epi32(state
.scissorsInFixedPoint
[0].xmin
);
1367 scisYmin
= SIMD_T::set1_epi32(state
.scissorsInFixedPoint
[0].ymin
);
1368 scisXmax
= SIMD_T::set1_epi32(state
.scissorsInFixedPoint
[0].xmax
);
1369 scisYmax
= SIMD_T::set1_epi32(state
.scissorsInFixedPoint
[0].ymax
);
1372 bbox
.xmin
= SIMD_T::max_epi32(bbox
.xmin
, scisXmin
);
1373 bbox
.ymin
= SIMD_T::max_epi32(bbox
.ymin
, scisYmin
);
1374 bbox
.xmax
= SIMD_T::min_epi32(SIMD_T::sub_epi32(bbox
.xmax
, SIMD_T::set1_epi32(1)), scisXmax
);
1375 bbox
.ymax
= SIMD_T::min_epi32(SIMD_T::sub_epi32(bbox
.ymax
, SIMD_T::set1_epi32(1)), scisYmax
);
1378 // Cull bloated points completely outside scissor
1379 Integer
<SIMD_T
> maskOutsideScissorX
= SIMD_T::cmpgt_epi32(bbox
.xmin
, bbox
.xmax
);
1380 Integer
<SIMD_T
> maskOutsideScissorY
= SIMD_T::cmpgt_epi32(bbox
.ymin
, bbox
.ymax
);
1381 Integer
<SIMD_T
> maskOutsideScissorXY
= SIMD_T::or_si(maskOutsideScissorX
, maskOutsideScissorY
);
1382 uint32_t maskOutsideScissor
= SIMD_T::movemask_ps(SIMD_T::castsi_ps(maskOutsideScissorXY
));
1383 primMask
= primMask
& ~maskOutsideScissor
;
1385 // Convert bbox to macrotile units.
1386 bbox
.xmin
= SIMD_T::template srai_epi32
<KNOB_MACROTILE_X_DIM_FIXED_SHIFT
>(bbox
.xmin
);
1387 bbox
.ymin
= SIMD_T::template srai_epi32
<KNOB_MACROTILE_Y_DIM_FIXED_SHIFT
>(bbox
.ymin
);
1388 bbox
.xmax
= SIMD_T::template srai_epi32
<KNOB_MACROTILE_X_DIM_FIXED_SHIFT
>(bbox
.xmax
);
1389 bbox
.ymax
= SIMD_T::template srai_epi32
<KNOB_MACROTILE_Y_DIM_FIXED_SHIFT
>(bbox
.ymax
);
1391 OSALIGNSIMD16(uint32_t) aMTLeft
[SIMD_WIDTH
], aMTRight
[SIMD_WIDTH
], aMTTop
[SIMD_WIDTH
], aMTBottom
[SIMD_WIDTH
];
1393 SIMD_T::store_si(reinterpret_cast<Integer
<SIMD_T
> *>(aMTLeft
), bbox
.xmin
);
1394 SIMD_T::store_si(reinterpret_cast<Integer
<SIMD_T
> *>(aMTRight
), bbox
.xmax
);
1395 SIMD_T::store_si(reinterpret_cast<Integer
<SIMD_T
> *>(aMTTop
), bbox
.ymin
);
1396 SIMD_T::store_si(reinterpret_cast<Integer
<SIMD_T
> *>(aMTBottom
), bbox
.ymax
);
1398 // store render target array index
1399 const uint32_t *aRTAI
= reinterpret_cast<const uint32_t *>(&rtIdx
);
1401 OSALIGNSIMD16(float) aPointSize
[SIMD_WIDTH
];
1402 SIMD_T::store_ps(reinterpret_cast<float *>(aPointSize
), vPointSize
);
1404 uint32_t *pPrimID
= (uint32_t *)&primID
;
1406 OSALIGNSIMD16(float) aPrimVertsX
[SIMD_WIDTH
];
1407 OSALIGNSIMD16(float) aPrimVertsY
[SIMD_WIDTH
];
1408 OSALIGNSIMD16(float) aPrimVertsZ
[SIMD_WIDTH
];
1410 SIMD_T::store_ps(reinterpret_cast<float *>(aPrimVertsX
), primVerts
.x
);
1411 SIMD_T::store_ps(reinterpret_cast<float *>(aPrimVertsY
), primVerts
.y
);
1412 SIMD_T::store_ps(reinterpret_cast<float *>(aPrimVertsZ
), primVerts
.z
);
1414 // scan remaining valid prims and bin each separately
1415 const SWR_BACKEND_STATE
& backendState
= state
.backendState
;
1417 while (_BitScanForward(&primIndex
, primMask
))
1419 uint32_t linkageCount
= backendState
.numAttributes
;
1420 uint32_t numScalarAttribs
= linkageCount
* 4;
1425 TRIANGLE_WORK_DESC
&desc
= work
.desc
.tri
;
1427 desc
.triFlags
.frontFacing
= 1;
1428 desc
.triFlags
.pointSize
= aPointSize
[primIndex
];
1429 desc
.triFlags
.renderTargetArrayIndex
= aRTAI
[primIndex
];
1430 desc
.triFlags
.viewportIndex
= pViewportIndex
[primIndex
];
1432 work
.pfnWork
= RasterizeTriPoint
;
1434 auto pArena
= pDC
->pArena
;
1435 SWR_ASSERT(pArena
!= nullptr);
1437 // store active attribs
1438 desc
.pAttribs
= (float*)pArena
->AllocAligned(numScalarAttribs
* 3 * sizeof(float), 16);
1439 desc
.numAttribs
= linkageCount
;
1440 pfnProcessAttribs(pDC
, pa
, primIndex
, pPrimID
[primIndex
], desc
.pAttribs
);
1442 // store point vertex data
1443 float *pTriBuffer
= (float*)pArena
->AllocAligned(4 * sizeof(float), 16);
1444 desc
.pTriBuffer
= pTriBuffer
;
1445 *pTriBuffer
++ = aPrimVertsX
[primIndex
];
1446 *pTriBuffer
++ = aPrimVertsY
[primIndex
];
1447 *pTriBuffer
= aPrimVertsZ
[primIndex
];
1449 // store user clip distances
// NOTE(review): `one` and `dists` below have no visible declarations in this chunk —
// presumably declared on original lines 1454-1455, dropped by the extraction.
1450 if (backendState
.clipDistanceMask
)
1452 uint32_t numClipDist
= _mm_popcnt_u32(backendState
.clipDistanceMask
);
1453 desc
.pUserClipBuffer
= (float*)pArena
->Alloc(numClipDist
* 3 * sizeof(float));
1456 ProcessUserClipDist
<1>(backendState
, pa
, primIndex
, &one
, dists
);
1457 for (uint32_t i
= 0; i
< numClipDist
; i
++) {
1458 desc
.pUserClipBuffer
[3 * i
+ 0] = 0.0f
;
1459 desc
.pUserClipBuffer
[3 * i
+ 1] = 0.0f
;
1460 desc
.pUserClipBuffer
[3 * i
+ 2] = dists
[i
];
1464 MacroTileMgr
*pTileMgr
= pDC
->pTileMgr
;
1465 for (uint32_t y
= aMTTop
[primIndex
]; y
<= aMTBottom
[primIndex
]; ++y
)
1467 for (uint32_t x
= aMTLeft
[primIndex
]; x
<= aMTRight
[primIndex
]; ++x
)
1469 #if KNOB_ENABLE_TOSS_POINTS
1470 if (!KNOB_TOSS_SETUP_TRIS
)
1473 pTileMgr
->enqueue(x
, y
, &work
);
1478 primMask
&= ~(1 << primIndex
);
1482 RDTSC_END(FEBinPoints
, 1);
1485 //////////////////////////////////////////////////////////////////////////
1486 /// @brief Bin SIMD points to the backend. Only supports point size of 1
1487 /// @param pDC - pointer to draw context.
1488 /// @param pa - The primitive assembly object.
1489 /// @param workerId - thread's worker id. Even thread has a unique id.
1490 /// @param tri - Contains point position data for SIMDs worth of points.
1491 /// @param primID - Primitive ID for each point.
// NOTE(review): performs perspective divide + viewport transform + pixel-center offset,
// then forwards to BinPostSetupPointsImpl. Extraction dropped the function-name line and
// several parameter lines; tokens below are preserved verbatim.
1492 template <typename SIMD_T
, uint32_t SIMD_WIDTH
>
1497 Vec4
<SIMD_T
> prim
[3],
1499 Integer
<SIMD_T
> const &primID
,
1500 Integer
<SIMD_T
> const &viewportIdx
,
1501 Integer
<SIMD_T
> const &rtIdx
)
1503 const API_STATE
& state
= GetApiState(pDC
);
1504 const SWR_FRONTEND_STATE
& feState
= state
.frontendState
;
1505 const SWR_RASTSTATE
& rastState
= state
.rastState
;
1507 if (!feState
.vpTransformDisable
)
1509 // perspective divide
1510 Float
<SIMD_T
> vRecipW0
= SIMD_T::div_ps(SIMD_T::set1_ps(1.0f
), prim
[0].w
);
1512 prim
[0].x
= SIMD_T::mul_ps(prim
[0].x
, vRecipW0
);
1513 prim
[0].y
= SIMD_T::mul_ps(prim
[0].y
, vRecipW0
);
1514 prim
[0].z
= SIMD_T::mul_ps(prim
[0].z
, vRecipW0
);
1516 // viewport transform to screen coords
1517 if (pa
.viewportArrayActive
)
1519 viewportTransform
<1>(prim
, state
.vpMatrices
, viewportIdx
);
1523 viewportTransform
<1>(prim
, state
.vpMatrices
);
1527 Float
<SIMD_T
> offset
= SwrPixelOffsets
<SIMD_T
>::GetOffset(rastState
.pixelLocation
);
1529 prim
[0].x
= SIMD_T::add_ps(prim
[0].x
, offset
);
1530 prim
[0].y
= SIMD_T::add_ps(prim
[0].y
, offset
);
1532 BinPostSetupPointsImpl
<SIMD_T
, SIMD_WIDTH
>(
// NOTE(review): tail of the SIMD256-width BinPoints wrapper; forwards to
// BinPointsImpl<SIMD256, KNOB_SIMD_WIDTH>. Leading lines of this definition were
// dropped by the extraction; tokens preserved verbatim.
1549 simdscalari
const &primID
,
1550 simdscalari
const &viewportIdx
,
1551 simdscalari
const &rtIdx
)
1553 BinPointsImpl
<SIMD256
, KNOB_SIMD_WIDTH
>(
// NOTE(review): simd16 point-binning wrapper; forwards to
// BinPointsImpl<SIMD512, KNOB_SIMD16_WIDTH>. Call arguments were dropped by the
// extraction; tokens preserved verbatim.
1564 #if USE_SIMD16_FRONTEND
1565 void SIMDCALL
BinPoints_simd16(
1569 simd16vector prim
[3],
1571 simd16scalari
const &primID
,
1572 simd16scalari
const &viewportIdx
,
1573 simd16scalari
const & rtIdx
)
1575 BinPointsImpl
<SIMD512
, KNOB_SIMD16_WIDTH
>(
1587 //////////////////////////////////////////////////////////////////////////
1588 /// @brief Bin SIMD lines to the backend.
1589 /// @param pDC - pointer to draw context.
1590 /// @param pa - The primitive assembly object.
1591 /// @param workerId - thread's worker id. Even thread has a unique id.
1592 /// @param tri - Contains line position data for SIMDs worth of points.
1593 /// @param primID - Primitive ID for each line.
1594 /// @param viewportIdx - Viewport Array Index for each line.
// NOTE(review): computes fixed-point endpoints, x/y-major classification, a line-width-
// bloated bbox clipped to scissor, then enqueues one RasterizeLine work item per covered
// macrotile. This chunk is a lossy extraction (braces, some parameters and declarations
// such as `work`/`primIndex` are missing); tokens preserved verbatim.
1595 template <typename SIMD_T
, uint32_t SIMD_WIDTH
>
1596 void BinPostSetupLinesImpl(
1600 Vec4
<SIMD_T
> prim
[],
1601 Float
<SIMD_T
> recipW
[],
1603 Integer
<SIMD_T
> const &primID
,
1604 Integer
<SIMD_T
> const &viewportIdx
,
1605 Integer
<SIMD_T
> const &rtIdx
)
1607 const uint32_t *aRTAI
= reinterpret_cast<const uint32_t *>(&rtIdx
);
1609 RDTSC_BEGIN(FEBinLines
, pDC
->drawId
);
1611 const API_STATE
&state
= GetApiState(pDC
);
1612 const SWR_RASTSTATE
&rastState
= state
.rastState
;
1614 // Select attribute processor
1615 PFN_PROCESS_ATTRIBUTES pfnProcessAttribs
= GetProcessAttributesFunc(2,
1616 state
.backendState
.swizzleEnable
, state
.backendState
.constantInterpolationMask
);
1618 Float
<SIMD_T
> &vRecipW0
= recipW
[0];
1619 Float
<SIMD_T
> &vRecipW1
= recipW
[1];
1621 // convert to fixed point
1622 Integer
<SIMD_T
> vXi
[2], vYi
[2];
1624 vXi
[0] = fpToFixedPointVertical
<SIMD_T
>(prim
[0].x
);
1625 vYi
[0] = fpToFixedPointVertical
<SIMD_T
>(prim
[0].y
);
1626 vXi
[1] = fpToFixedPointVertical
<SIMD_T
>(prim
[1].x
);
1627 vYi
[1] = fpToFixedPointVertical
<SIMD_T
>(prim
[1].y
);
1629 // compute x-major vs y-major mask
1630 Integer
<SIMD_T
> xLength
= SIMD_T::abs_epi32(SIMD_T::sub_epi32(vXi
[0], vXi
[1]));
1631 Integer
<SIMD_T
> yLength
= SIMD_T::abs_epi32(SIMD_T::sub_epi32(vYi
[0], vYi
[1]));
1632 Float
<SIMD_T
> vYmajorMask
= SIMD_T::castsi_ps(SIMD_T::cmpgt_epi32(yLength
, xLength
));
1633 uint32_t yMajorMask
= SIMD_T::movemask_ps(vYmajorMask
);
1635 // cull zero-length lines
1636 Integer
<SIMD_T
> vZeroLengthMask
= SIMD_T::cmpeq_epi32(xLength
, SIMD_T::setzero_si());
1637 vZeroLengthMask
= SIMD_T::and_si(vZeroLengthMask
, SIMD_T::cmpeq_epi32(yLength
, SIMD_T::setzero_si()));
1639 primMask
&= ~SIMD_T::movemask_ps(SIMD_T::castsi_ps(vZeroLengthMask
));
1641 uint32_t *pPrimID
= (uint32_t *)&primID
;
1642 const uint32_t *pViewportIndex
= (uint32_t *)&viewportIdx
;
1644 // Calc bounding box of lines
1645 SIMDBBOX_T
<SIMD_T
> bbox
;
1646 bbox
.xmin
= SIMD_T::min_epi32(vXi
[0], vXi
[1]);
1647 bbox
.xmax
= SIMD_T::max_epi32(vXi
[0], vXi
[1]);
1648 bbox
.ymin
= SIMD_T::min_epi32(vYi
[0], vYi
[1]);
1649 bbox
.ymax
= SIMD_T::max_epi32(vYi
[0], vYi
[1]);
1651 // bloat bbox by line width along minor axis
1652 Float
<SIMD_T
> vHalfWidth
= SIMD_T::set1_ps(rastState
.lineWidth
/ 2.0f
);
1653 Integer
<SIMD_T
> vHalfWidthi
= fpToFixedPointVertical
<SIMD_T
>(vHalfWidth
);
1655 SIMDBBOX_T
<SIMD_T
> bloatBox
;
1657 bloatBox
.xmin
= SIMD_T::sub_epi32(bbox
.xmin
, vHalfWidthi
);
1658 bloatBox
.xmax
= SIMD_T::add_epi32(bbox
.xmax
, vHalfWidthi
);
1659 bloatBox
.ymin
= SIMD_T::sub_epi32(bbox
.ymin
, vHalfWidthi
);
1660 bloatBox
.ymax
= SIMD_T::add_epi32(bbox
.ymax
, vHalfWidthi
);
// Select bloated extents only along the minor axis: x for y-major lines, y otherwise.
1662 bbox
.xmin
= SIMD_T::blendv_epi32(bbox
.xmin
, bloatBox
.xmin
, vYmajorMask
);
1663 bbox
.xmax
= SIMD_T::blendv_epi32(bbox
.xmax
, bloatBox
.xmax
, vYmajorMask
);
1664 bbox
.ymin
= SIMD_T::blendv_epi32(bloatBox
.ymin
, bbox
.ymin
, vYmajorMask
);
1665 bbox
.ymax
= SIMD_T::blendv_epi32(bloatBox
.ymax
, bbox
.ymax
, vYmajorMask
);
1667 // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
1669 Integer
<SIMD_T
> scisXmin
, scisYmin
, scisXmax
, scisYmax
;
1671 if (pa
.viewportArrayActive
)
1673 GatherScissors(&state
.scissorsInFixedPoint
[0], pViewportIndex
, scisXmin
, scisYmin
, scisXmax
, scisYmax
);
1675 else // broadcast fast path for non-VPAI case.
1677 scisXmin
= SIMD_T::set1_epi32(state
.scissorsInFixedPoint
[0].xmin
);
1678 scisYmin
= SIMD_T::set1_epi32(state
.scissorsInFixedPoint
[0].ymin
);
1679 scisXmax
= SIMD_T::set1_epi32(state
.scissorsInFixedPoint
[0].xmax
);
1680 scisYmax
= SIMD_T::set1_epi32(state
.scissorsInFixedPoint
[0].ymax
);
1683 bbox
.xmin
= SIMD_T::max_epi32(bbox
.xmin
, scisXmin
);
1684 bbox
.ymin
= SIMD_T::max_epi32(bbox
.ymin
, scisYmin
);
1685 bbox
.xmax
= SIMD_T::min_epi32(SIMD_T::sub_epi32(bbox
.xmax
, SIMD_T::set1_epi32(1)), scisXmax
);
1686 bbox
.ymax
= SIMD_T::min_epi32(SIMD_T::sub_epi32(bbox
.ymax
, SIMD_T::set1_epi32(1)), scisYmax
);
1689 // Cull prims completely outside scissor
1691 Integer
<SIMD_T
> maskOutsideScissorX
= SIMD_T::cmpgt_epi32(bbox
.xmin
, bbox
.xmax
);
1692 Integer
<SIMD_T
> maskOutsideScissorY
= SIMD_T::cmpgt_epi32(bbox
.ymin
, bbox
.ymax
);
1693 Integer
<SIMD_T
> maskOutsideScissorXY
= SIMD_T::or_si(maskOutsideScissorX
, maskOutsideScissorY
);
1694 uint32_t maskOutsideScissor
= SIMD_T::movemask_ps(SIMD_T::castsi_ps(maskOutsideScissorXY
));
1695 primMask
= primMask
& ~maskOutsideScissor
;
1698 // transpose verts needed for backend
1699 /// @todo modify BE to take non-transformed verts
1700 OSALIGNSIMD16(simd4scalar
) vHorizX
[SIMD_WIDTH
];
1701 OSALIGNSIMD16(simd4scalar
) vHorizY
[SIMD_WIDTH
];
1702 OSALIGNSIMD16(simd4scalar
) vHorizZ
[SIMD_WIDTH
];
1703 OSALIGNSIMD16(simd4scalar
) vHorizW
[SIMD_WIDTH
];
1710 // Convert triangle bbox to macrotile units.
1711 bbox
.xmin
= SIMD_T::template srai_epi32
<KNOB_MACROTILE_X_DIM_FIXED_SHIFT
>(bbox
.xmin
);
1712 bbox
.ymin
= SIMD_T::template srai_epi32
<KNOB_MACROTILE_Y_DIM_FIXED_SHIFT
>(bbox
.ymin
);
1713 bbox
.xmax
= SIMD_T::template srai_epi32
<KNOB_MACROTILE_X_DIM_FIXED_SHIFT
>(bbox
.xmax
);
1714 bbox
.ymax
= SIMD_T::template srai_epi32
<KNOB_MACROTILE_Y_DIM_FIXED_SHIFT
>(bbox
.ymax
);
1716 OSALIGNSIMD16(uint32_t) aMTLeft
[SIMD_WIDTH
], aMTRight
[SIMD_WIDTH
], aMTTop
[SIMD_WIDTH
], aMTBottom
[SIMD_WIDTH
];
1718 SIMD_T::store_si(reinterpret_cast<Integer
<SIMD_T
> *>(aMTLeft
), bbox
.xmin
);
1719 SIMD_T::store_si(reinterpret_cast<Integer
<SIMD_T
> *>(aMTRight
), bbox
.xmax
);
1720 SIMD_T::store_si(reinterpret_cast<Integer
<SIMD_T
> *>(aMTTop
), bbox
.ymin
);
1721 SIMD_T::store_si(reinterpret_cast<Integer
<SIMD_T
> *>(aMTBottom
), bbox
.ymax
);
// Lines have only two vertices; the third transposed lane is zero-filled.
1723 TransposeVertices(vHorizX
, prim
[0].x
, prim
[1].x
, SIMD_T::setzero_ps());
1724 TransposeVertices(vHorizY
, prim
[0].y
, prim
[1].y
, SIMD_T::setzero_ps());
1725 TransposeVertices(vHorizZ
, prim
[0].z
, prim
[1].z
, SIMD_T::setzero_ps());
1726 TransposeVertices(vHorizW
, vRecipW0
, vRecipW1
, SIMD_T::setzero_ps());
1728 // scan remaining valid prims and bin each separately
1730 while (_BitScanForward(&primIndex
, primMask
))
1732 uint32_t linkageCount
= state
.backendState
.numAttributes
;
1733 uint32_t numScalarAttribs
= linkageCount
* 4;
1738 TRIANGLE_WORK_DESC
&desc
= work
.desc
.tri
;
1740 desc
.triFlags
.frontFacing
= 1;
1741 desc
.triFlags
.yMajor
= (yMajorMask
>> primIndex
) & 1;
1742 desc
.triFlags
.renderTargetArrayIndex
= aRTAI
[primIndex
];
1743 desc
.triFlags
.viewportIndex
= pViewportIndex
[primIndex
];
1745 work
.pfnWork
= RasterizeLine
;
1747 auto pArena
= pDC
->pArena
;
1748 SWR_ASSERT(pArena
!= nullptr);
1750 // store active attribs
1751 desc
.pAttribs
= (float*)pArena
->AllocAligned(numScalarAttribs
* 3 * sizeof(float), 16);
1752 desc
.numAttribs
= linkageCount
;
1753 pfnProcessAttribs(pDC
, pa
, primIndex
, pPrimID
[primIndex
], desc
.pAttribs
);
1755 // store line vertex data
1756 desc
.pTriBuffer
= (float*)pArena
->AllocAligned(4 * 4 * sizeof(float), 16);
1758 _mm_store_ps(&desc
.pTriBuffer
[0], vHorizX
[primIndex
]);
1759 _mm_store_ps(&desc
.pTriBuffer
[4], vHorizY
[primIndex
]);
1760 _mm_store_ps(&desc
.pTriBuffer
[8], vHorizZ
[primIndex
]);
1761 _mm_store_ps(&desc
.pTriBuffer
[12], vHorizW
[primIndex
]);
1763 // store user clip distances
1764 if (state
.backendState
.clipDistanceMask
)
1766 uint32_t numClipDist
= _mm_popcnt_u32(state
.backendState
.clipDistanceMask
);
1767 desc
.pUserClipBuffer
= (float*)pArena
->Alloc(numClipDist
* 2 * sizeof(float));
1768 ProcessUserClipDist
<2>(state
.backendState
, pa
, primIndex
, &desc
.pTriBuffer
[12], desc
.pUserClipBuffer
);
1771 MacroTileMgr
*pTileMgr
= pDC
->pTileMgr
;
1772 for (uint32_t y
= aMTTop
[primIndex
]; y
<= aMTBottom
[primIndex
]; ++y
)
1774 for (uint32_t x
= aMTLeft
[primIndex
]; x
<= aMTRight
[primIndex
]; ++x
)
1776 #if KNOB_ENABLE_TOSS_POINTS
1777 if (!KNOB_TOSS_SETUP_TRIS
)
1780 pTileMgr
->enqueue(x
, y
, &work
);
1785 primMask
&= ~(1 << primIndex
);
1790 RDTSC_END(FEBinLines
, 1);
1793 //////////////////////////////////////////////////////////////////////////
1794 /// @brief Bin SIMD lines to the backend.
1795 /// @param pDC - pointer to draw context.
1796 /// @param pa - The primitive assembly object.
1797 /// @param workerId - thread's worker id. Even thread has a unique id.
1798 /// @param tri - Contains line position data for SIMDs worth of points.
1799 /// @param primID - Primitive ID for each line.
1800 /// @param viewportIdx - Viewport Array Index for each line.
// NOTE(review): performs perspective divide on both endpoints, viewport transform and
// pixel-center offset, then forwards to BinPostSetupLinesImpl. Extraction dropped some
// parameter lines, braces and the forwarded call arguments; tokens preserved verbatim.
1801 template <typename SIMD_T
, uint32_t SIMD_WIDTH
>
1802 void SIMDCALL
BinLinesImpl(
1806 Vec4
<SIMD_T
> prim
[3],
1808 Integer
<SIMD_T
> const &primID
,
1809 Integer
<SIMD_T
> const &viewportIdx
,
1810 Integer
<SIMD_T
> const & rtIdx
)
1812 const API_STATE
& state
= GetApiState(pDC
);
1813 const SWR_RASTSTATE
& rastState
= state
.rastState
;
1814 const SWR_FRONTEND_STATE
& feState
= state
.frontendState
;
// Default w-reciprocals of 1.0 are used when the viewport transform is disabled.
1816 Float
<SIMD_T
> vRecipW
[2] = { SIMD_T::set1_ps(1.0f
), SIMD_T::set1_ps(1.0f
) };
1818 if (!feState
.vpTransformDisable
)
1820 // perspective divide
1821 vRecipW
[0] = SIMD_T::div_ps(SIMD_T::set1_ps(1.0f
), prim
[0].w
);
1822 vRecipW
[1] = SIMD_T::div_ps(SIMD_T::set1_ps(1.0f
), prim
[1].w
);
1824 prim
[0].v
[0] = SIMD_T::mul_ps(prim
[0].v
[0], vRecipW
[0]);
1825 prim
[1].v
[0] = SIMD_T::mul_ps(prim
[1].v
[0], vRecipW
[1]);
1827 prim
[0].v
[1] = SIMD_T::mul_ps(prim
[0].v
[1], vRecipW
[0]);
1828 prim
[1].v
[1] = SIMD_T::mul_ps(prim
[1].v
[1], vRecipW
[1]);
1830 prim
[0].v
[2] = SIMD_T::mul_ps(prim
[0].v
[2], vRecipW
[0]);
1831 prim
[1].v
[2] = SIMD_T::mul_ps(prim
[1].v
[2], vRecipW
[1]);
1833 // viewport transform to screen coords
1834 if (pa
.viewportArrayActive
)
1836 viewportTransform
<2>(prim
, state
.vpMatrices
, viewportIdx
);
1840 viewportTransform
<2>(prim
, state
.vpMatrices
);
1844 // adjust for pixel center location
1845 Float
<SIMD_T
> offset
= SwrPixelOffsets
<SIMD_T
>::GetOffset(rastState
.pixelLocation
);
1847 prim
[0].x
= SIMD_T::add_ps(prim
[0].x
, offset
);
1848 prim
[0].y
= SIMD_T::add_ps(prim
[0].y
, offset
);
1850 prim
[1].x
= SIMD_T::add_ps(prim
[1].x
, offset
);
1851 prim
[1].y
= SIMD_T::add_ps(prim
[1].y
, offset
);
1853 BinPostSetupLinesImpl
<SIMD_T
, SIMD_WIDTH
>(
// NOTE(review): tail of the SIMD256-width BinLines wrapper; forwards to
// BinLinesImpl<SIMD256, KNOB_SIMD_WIDTH>. Leading lines of this definition were
// dropped by the extraction; tokens preserved verbatim.
1871 simdscalari
const &primID
,
1872 simdscalari
const &viewportIdx
,
1873 simdscalari
const &rtIdx
)
1875 BinLinesImpl
<SIMD256
, KNOB_SIMD_WIDTH
>(pDC
, pa
, workerId
, prim
, primMask
, primID
, viewportIdx
, rtIdx
);
1878 #if USE_SIMD16_FRONTEND
1879 void SIMDCALL
BinLines_simd16(
1883 simd16vector prim
[3],
1885 simd16scalari
const &primID
,
1886 simd16scalari
const &viewportIdx
,
1887 simd16scalari
const &rtIdx
)
1889 BinLinesImpl
<SIMD512
, KNOB_SIMD16_WIDTH
>(pDC
, pa
, workerId
, prim
, primMask
, primID
, viewportIdx
, rtIdx
);