src/gallium/drivers/swr/rasterizer/core/binner.cpp

   1 /****************************************************************************
   2 * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
   3 *
   4 * Permission is hereby granted, free of charge, to any person obtaining a
   5 * copy of this software and associated documentation files (the "Software"),
   6 * to deal in the Software without restriction, including without limitation
   7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8 * and/or sell copies of the Software, and to permit persons to whom the
   9 * Software is furnished to do so, subject to the following conditions:
  10 *
  11 * The above copyright notice and this permission notice (including the next
  12 * paragraph) shall be included in all copies or substantial portions of the
  13 * Software.
  14 *
  15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21 * IN THE SOFTWARE.
  22 *
  23 * @file binner.cpp
  24 *
  25 * @brief Implementation for the macrotile binner
  26 *
  27 ******************************************************************************/
  28
  29 #include "binner.h"
  30 #include "context.h"
  31 #include "frontend.h"
  32 #include "conservativeRast.h"
  33 #include "pa.h"
  34 #include "rasterizer.h"
  35 #include "rdtsc_core.h"
  36 #include "tilemgr.h"
  37
  38 // Function prototype: forward declaration of the post-setup line binner.
     // The definition is outside this excerpt. BinTrianglesImpl calls it once per
     // triangle edge when the fill mode is SWR_FILLMODE_WIREFRAME, passing the two
     // edge endpoints in prim[] and the matching vRecipW values in recipW[].
  39 template <typename SIMD_T, uint32_t SIMD_WIDTH>
  40 void BinPostSetupLinesImpl(
  41     DRAW_CONTEXT *pDC,
  42     PA_STATE &pa,
  43     uint32_t workerId,
  44     typename SIMD_T::Vec4 prim[],
  45     typename SIMD_T::Float recipW[],
  46     uint32_t primMask,
  47     typename SIMD_T::Integer const &primID,
  48     typename SIMD_T::Integer const &viewportIdx);
  49 // Forward declaration of the post-setup point binner (definition is outside
     // this excerpt). BinTrianglesImpl calls it once per triangle vertex when the
     // fill mode is SWR_FILLMODE_POINT, passing a pointer to that vertex as prim.
  50 template <typename SIMD_T, uint32_t SIMD_WIDTH>
  51 void BinPostSetupPointsImpl(
  52     DRAW_CONTEXT *pDC,
  53     PA_STATE &pa,
  54     uint32_t workerId,
  55     typename SIMD_T::Vec4 prim[],
  56     uint32_t primMask,
  57     typename SIMD_T::Integer const &primID,
  58     typename SIMD_T::Integer const &viewportIdx);
  59
  60 //////////////////////////////////////////////////////////////////////////
  61 /// @brief Processes attributes for the backend based on linkage mask and
  62 ///        linkage map.  Essentially just doing an SOA->AOS conversion and pack.
  63 /// @param pDC - Draw context
  64 /// @param pa - Primitive Assembly state
  65 /// @param linkageMask - Specifies which VS outputs are routed to PS.
  66 /// @param pLinkageMap - maps VS attribute slot to PS slot
  67 /// @param triIndex - Triangle to process attributes for
  68 /// @param pBuffer - Output result
  69 template<typename NumVertsT, typename IsSwizzledT, typename HasConstantInterpT, typename IsDegenerate>
  70 INLINE void ProcessAttributes(
  71     DRAW_CONTEXT *pDC,
  72     PA_STATE&pa,
  73     uint32_t triIndex,
  74     uint32_t primId,
  75     float *pBuffer)
  76 {
  77     static_assert(NumVertsT::value > 0 && NumVertsT::value <= 3, "Invalid value for NumVertsT");
  78     const SWR_BACKEND_STATE& backendState = pDC->pState->state.backendState;
  79     // Conservative Rasterization requires degenerate tris to have constant attribute interpolation
  80     uint32_t constantInterpMask = IsDegenerate::value ? 0xFFFFFFFF : backendState.constantInterpolationMask;
  81     const uint32_t provokingVertex = pDC->pState->state.frontendState.topologyProvokingVertex;
  82     const PRIMITIVE_TOPOLOGY topo = pDC->pState->state.topology;
  83
  84     static const float constTable[3][4] = {
  85         { 0.0f, 0.0f, 0.0f, 0.0f },
  86         { 0.0f, 0.0f, 0.0f, 1.0f },
  87         { 1.0f, 1.0f, 1.0f, 1.0f }
  88     };
  89
  90     for (uint32_t i = 0; i < backendState.numAttributes; ++i)
  91     {
  92         uint32_t inputSlot;
  93         if (IsSwizzledT::value)
  94         {
  95             SWR_ATTRIB_SWIZZLE attribSwizzle = backendState.swizzleMap[i];
  96             inputSlot = backendState.vertexAttribOffset + attribSwizzle.sourceAttrib;
  97
  98         }
  99         else
 100         {
 101             inputSlot = backendState.vertexAttribOffset + i;
 102         }
 103
 104         simd4scalar attrib[3];    // triangle attribs (always 4 wide)
 105         float* pAttribStart = pBuffer;
 106
 107         if (HasConstantInterpT::value || IsDegenerate::value)
 108         {
 109             if (CheckBit(constantInterpMask, i))
 110             {
 111                 uint32_t vid;
 112                 uint32_t adjustedTriIndex;
 113                 static const uint32_t tristripProvokingVertex[] = { 0, 2, 1 };
 114                 static const int32_t quadProvokingTri[2][4] = { { 0, 0, 0, 1 },{ 0, -1, 0, 0 } };
 115                 static const uint32_t quadProvokingVertex[2][4] = { { 0, 1, 2, 2 },{ 0, 1, 1, 2 } };
 116                 static const int32_t qstripProvokingTri[2][4] = { { 0, 0, 0, 1 },{ -1, 0, 0, 0 } };
 117                 static const uint32_t qstripProvokingVertex[2][4] = { { 0, 1, 2, 1 },{ 0, 0, 2, 1 } };
 118
 119                 switch (topo) {
 120                 case TOP_QUAD_LIST:
 121                     adjustedTriIndex = triIndex + quadProvokingTri[triIndex & 1][provokingVertex];
 122                     vid = quadProvokingVertex[triIndex & 1][provokingVertex];
 123                     break;
 124                 case TOP_QUAD_STRIP:
 125                     adjustedTriIndex = triIndex + qstripProvokingTri[triIndex & 1][provokingVertex];
 126                     vid = qstripProvokingVertex[triIndex & 1][provokingVertex];
 127                     break;
 128                 case TOP_TRIANGLE_STRIP:
 129                     adjustedTriIndex = triIndex;
 130                     vid = (triIndex & 1)
 131                         ? tristripProvokingVertex[provokingVertex]
 132                         : provokingVertex;
 133                     break;
 134                 default:
 135                     adjustedTriIndex = triIndex;
 136                     vid = provokingVertex;
 137                     break;
 138                 }
 139
 140                 pa.AssembleSingle(inputSlot, adjustedTriIndex, attrib);
 141
 142                 for (uint32_t i = 0; i < NumVertsT::value; ++i)
 143                 {
 144                     SIMD128::store_ps(pBuffer, attrib[vid]);
 145                     pBuffer += 4;
 146                 }
 147             }
 148             else
 149             {
 150                 pa.AssembleSingle(inputSlot, triIndex, attrib);
 151
 152                 for (uint32_t i = 0; i < NumVertsT::value; ++i)
 153                 {
 154                     SIMD128::store_ps(pBuffer, attrib[i]);
 155                     pBuffer += 4;
 156                 }
 157             }
 158         }
 159         else
 160         {
 161             pa.AssembleSingle(inputSlot, triIndex, attrib);
 162
 163             for (uint32_t i = 0; i < NumVertsT::value; ++i)
 164             {
 165                 SIMD128::store_ps(pBuffer, attrib[i]);
 166                 pBuffer += 4;
 167             }
 168         }
 169
 170         // pad out the attrib buffer to 3 verts to ensure the triangle
 171         // interpolation code in the pixel shader works correctly for the
 172         // 3 topologies - point, line, tri.  This effectively zeros out the
 173         // effect of the missing vertices in the triangle interpolation.
 174         for (uint32_t v = NumVertsT::value; v < 3; ++v)
 175         {
 176             SIMD128::store_ps(pBuffer, attrib[NumVertsT::value - 1]);
 177             pBuffer += 4;
 178         }
 179
 180         // check for constant source overrides
 181         if (IsSwizzledT::value)
 182         {
 183             uint32_t mask = backendState.swizzleMap[i].componentOverrideMask;
 184             if (mask)
 185             {
 186                 DWORD comp;
 187                 while (_BitScanForward(&comp, mask))
 188                 {
 189                     mask &= ~(1 << comp);
 190
 191                     float constantValue = 0.0f;
 192                     switch ((SWR_CONSTANT_SOURCE)backendState.swizzleMap[i].constantSource)
 193                     {
 194                     case SWR_CONSTANT_SOURCE_CONST_0000:
 195                     case SWR_CONSTANT_SOURCE_CONST_0001_FLOAT:
 196                     case SWR_CONSTANT_SOURCE_CONST_1111_FLOAT:
 197                         constantValue = constTable[backendState.swizzleMap[i].constantSource][comp];
 198                         break;
 199                     case SWR_CONSTANT_SOURCE_PRIM_ID:
 200                         constantValue = *(float*)&primId;
 201                         break;
 202                     }
 203
 204                     // apply constant value to all 3 vertices
 205                     for (uint32_t v = 0; v < 3; ++v)
 206                     {
 207                         pAttribStart[comp + v * 4] = constantValue;
 208                     }
 209                 }
 210             }
 211         }
 212     }
 213 }
 214
 215 typedef void(*PFN_PROCESS_ATTRIBUTES)(DRAW_CONTEXT*, PA_STATE&, uint32_t, uint32_t, float*);
 216
 217 struct ProcessAttributesChooser
 218 {
 219     typedef PFN_PROCESS_ATTRIBUTES FuncType;
 220
 221     template <typename... ArgsB>
 222     static FuncType GetFunc()
 223     {
 224         return ProcessAttributes<ArgsB...>;
 225     }
 226 };
 227
 228 PFN_PROCESS_ATTRIBUTES GetProcessAttributesFunc(uint32_t NumVerts, bool IsSwizzled, bool HasConstantInterp, bool IsDegenerate = false)
 229 {
 230     return TemplateArgUnroller<ProcessAttributesChooser>::GetFunc(IntArg<1, 3>{NumVerts}, IsSwizzled, HasConstantInterp, IsDegenerate);
 231 }
 232
 233 //////////////////////////////////////////////////////////////////////////
 234 /// @brief Processes enabled user clip distances. Loads the active clip
 235 ///        distances from the PA, sets up barycentric equations, and
 236 ///        stores the results to the output buffer
 237 /// @param pa - Primitive Assembly state
 238 /// @param primIndex - primitive index to process
 239 /// @param clipDistMask - mask of enabled clip distances
 240 /// @param pUserClipBuffer - buffer to store results
 241 template<uint32_t NumVerts>
 242 void ProcessUserClipDist(const SWR_BACKEND_STATE& state, PA_STATE& pa, uint32_t primIndex, float *pRecipW, float* pUserClipBuffer)
 243 {
 244     DWORD clipDist;
 245     uint32_t clipDistMask = state.clipDistanceMask;
 246     while (_BitScanForward(&clipDist, clipDistMask))
 247     {
 248         clipDistMask &= ~(1 << clipDist);
 249         uint32_t clipSlot = clipDist >> 2;
 250         uint32_t clipComp = clipDist & 0x3;
 251         uint32_t clipAttribSlot = clipSlot == 0 ?
 252             state.vertexClipCullOffset : state.vertexClipCullOffset + 1;
 253
 254         simd4scalar primClipDist[3];
 255         pa.AssembleSingle(clipAttribSlot, primIndex, primClipDist);
 256
 257         float vertClipDist[NumVerts];
 258         for (uint32_t e = 0; e < NumVerts; ++e)
 259         {
 260             OSALIGNSIMD(float) aVertClipDist[4];
 261             SIMD128::store_ps(aVertClipDist, primClipDist[e]);
 262             vertClipDist[e] = aVertClipDist[clipComp];
 263         };
 264
 265         // setup plane equations for barycentric interpolation in the backend
 266         float baryCoeff[NumVerts];
 267         float last = vertClipDist[NumVerts - 1] * pRecipW[NumVerts - 1];
 268         for (uint32_t e = 0; e < NumVerts - 1; ++e)
 269         {
 270             baryCoeff[e] = vertClipDist[e] * pRecipW[e] - last;
 271         }
 272         baryCoeff[NumVerts - 1] = last;
 273
 274         for (uint32_t e = 0; e < NumVerts; ++e)
 275         {
 276             *(pUserClipBuffer++) = baryCoeff[e];
 277         }
 278     }
 279 }
 280 // 8-wide overload: forwards dst and the three source vectors to vTranspose3x8.
     // BinTrianglesImpl indexes the result per triangle, so dst[i] presumably holds
     // lane i of src0/src1/src2 - confirm against vTranspose3x8.
 281 INLINE
 282 void TransposeVertices(simd4scalar(&dst)[8], const simdscalar &src0, const simdscalar &src1, const simdscalar &src2)
 283 {
 284     vTranspose3x8(dst, src0, src1, src2);
 285 }
 286 // 16-wide overload: reinterprets the sixteen simd4scalar entries of dst as four
     // simd16scalar rows and forwards to vTranspose4x16, supplying an all-zero
     // vector as the fourth source.
 287 INLINE
 288 void TransposeVertices(simd4scalar(&dst)[16], const simd16scalar &src0, const simd16scalar &src1, const simd16scalar &src2)
 289 {
 290     vTranspose4x16(reinterpret_cast<simd16scalar(&)[4]>(dst), src0, src1, src2, _simd16_setzero_ps());
 291 }
 292
 293 //////////////////////////////////////////////////////////////////////////
 294 /// @brief Bin triangle primitives to macro tiles. Performs setup, clipping
 295 ///        culling, viewport transform, etc.
 296 /// @param pDC - pointer to draw context.
 297 /// @param pa - The primitive assembly object.
 298 /// @param workerId - thread's worker id. Each thread has a unique id.
 299 /// @param tri - Contains triangle position data for SIMDs worth of triangles.
 300 /// @param primID - Primitive ID for each triangle.
 301 /// @param viewportIdx - viewport array index for each triangle.
 302 /// @tparam CT - ConservativeRastFETraits
 303 template <typename SIMD_T, uint32_t SIMD_WIDTH, typename CT>
 304 void SIMDCALL BinTrianglesImpl(
 305     DRAW_CONTEXT *pDC,
 306     PA_STATE &pa,
 307     uint32_t workerId,
 308     typename SIMD_T::Vec4 tri[3],
 309     uint32_t triMask,
 310     typename SIMD_T::Integer const &primID)
 311 {
 312     SWR_CONTEXT *pContext = pDC->pContext;
 313
 314     AR_BEGIN(FEBinTriangles, pDC->drawId);
 315
 316     const API_STATE& state = GetApiState(pDC);
 317     const SWR_RASTSTATE& rastState = state.rastState;
 318     const SWR_FRONTEND_STATE& feState = state.frontendState;
 319
 320     MacroTileMgr *pTileMgr = pDC->pTileMgr;
 321
 322     typename SIMD_T::Float vRecipW0 = SIMD_T::set1_ps(1.0f);
 323     typename SIMD_T::Float vRecipW1 = SIMD_T::set1_ps(1.0f);
 324     typename SIMD_T::Float vRecipW2 = SIMD_T::set1_ps(1.0f);
 325
 326     typename SIMD_T::Integer viewportIdx = SIMD_T::setzero_si();
 327     typename SIMD_T::Vec4 vpiAttrib[3];
 328     typename SIMD_T::Integer vpai = SIMD_T::setzero_si();
 329
 330     if (state.backendState.readViewportArrayIndex)
 331     {
 332         pa.Assemble(VERTEX_SGV_SLOT, vpiAttrib);
 333
 334         vpai = SIMD_T::castps_si(vpiAttrib[0][VERTEX_SGV_VAI_COMP]);
 335     }
 336
 337
 338     if (state.backendState.readViewportArrayIndex) // VPAIOffsets are guaranteed 0-15 -- no OOB issues if they are offsets from 0
 339     {
 340         // OOB indices => forced to zero.
 341         vpai = SIMD_T::max_epi32(vpai, SIMD_T::setzero_si());
 342         typename SIMD_T::Integer vNumViewports = SIMD_T::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
 343         typename SIMD_T::Integer vClearMask = SIMD_T::cmplt_epi32(vpai, vNumViewports);
 344         viewportIdx = SIMD_T::and_si(vClearMask, vpai);
 345     }
 346     else
 347     {
 348         viewportIdx = vpai;
 349     }
 350
 351     if (feState.vpTransformDisable)
 352     {
 353         // RHW is passed in directly when VP transform is disabled
 354         vRecipW0 = tri[0].v[3];
 355         vRecipW1 = tri[1].v[3];
 356         vRecipW2 = tri[2].v[3];
 357     }
 358     else
 359     {
 360         // Perspective divide
 361         vRecipW0 = SIMD_T::div_ps(SIMD_T::set1_ps(1.0f), tri[0].w);
 362         vRecipW1 = SIMD_T::div_ps(SIMD_T::set1_ps(1.0f), tri[1].w);
 363         vRecipW2 = SIMD_T::div_ps(SIMD_T::set1_ps(1.0f), tri[2].w);
 364
 365         tri[0].v[0] = SIMD_T::mul_ps(tri[0].v[0], vRecipW0);
 366         tri[1].v[0] = SIMD_T::mul_ps(tri[1].v[0], vRecipW1);
 367         tri[2].v[0] = SIMD_T::mul_ps(tri[2].v[0], vRecipW2);
 368
 369         tri[0].v[1] = SIMD_T::mul_ps(tri[0].v[1], vRecipW0);
 370         tri[1].v[1] = SIMD_T::mul_ps(tri[1].v[1], vRecipW1);
 371         tri[2].v[1] = SIMD_T::mul_ps(tri[2].v[1], vRecipW2);
 372
 373         tri[0].v[2] = SIMD_T::mul_ps(tri[0].v[2], vRecipW0);
 374         tri[1].v[2] = SIMD_T::mul_ps(tri[1].v[2], vRecipW1);
 375         tri[2].v[2] = SIMD_T::mul_ps(tri[2].v[2], vRecipW2);
 376
 377         // Viewport transform to screen space coords
 378         if (state.backendState.readViewportArrayIndex)
 379         {
 380             viewportTransform<3>(tri, state.vpMatrices, viewportIdx);
 381         }
 382         else
 383         {
 384             viewportTransform<3>(tri, state.vpMatrices);
 385         }
 386     }
 387
 388     // Adjust for pixel center location
 389     typename SIMD_T::Float offset = SwrPixelOffsets<SIMD_T>::GetOffset(rastState.pixelLocation);
 390
 391     tri[0].x = SIMD_T::add_ps(tri[0].x, offset);
 392     tri[0].y = SIMD_T::add_ps(tri[0].y, offset);
 393
 394     tri[1].x = SIMD_T::add_ps(tri[1].x, offset);
 395     tri[1].y = SIMD_T::add_ps(tri[1].y, offset);
 396
 397     tri[2].x = SIMD_T::add_ps(tri[2].x, offset);
 398     tri[2].y = SIMD_T::add_ps(tri[2].y, offset);
 399
 400     // Set vXi, vYi to required fixed point precision
 401     typename SIMD_T::Integer vXi[3], vYi[3];
 402     FPToFixedPoint<SIMD_T>(tri, vXi, vYi);
 403
 404     // triangle setup
 405     typename SIMD_T::Integer vAi[3], vBi[3];
 406     triangleSetupABIntVertical(vXi, vYi, vAi, vBi);
 407
 408     // determinant
 409     typename SIMD_T::Integer vDet[2];
 410     calcDeterminantIntVertical(vAi, vBi, vDet);
 411
 412     // cull zero area
 413     uint32_t maskLo = SIMD_T::movemask_pd(SIMD_T::castsi_pd(SIMD_T::cmpeq_epi64(vDet[0], SIMD_T::setzero_si())));
 414     uint32_t maskHi = SIMD_T::movemask_pd(SIMD_T::castsi_pd(SIMD_T::cmpeq_epi64(vDet[1], SIMD_T::setzero_si())));
 415
 416     uint32_t cullZeroAreaMask = maskLo | (maskHi << (SIMD_WIDTH / 2));
 417
 418     // don't cull degenerate triangles if we're conservatively rasterizing
 419     uint32_t origTriMask = triMask;
 420     if (rastState.fillMode == SWR_FILLMODE_SOLID && !CT::IsConservativeT::value)
 421     {
 422         triMask &= ~cullZeroAreaMask;
 423     }
 424
 425     // determine front winding tris
 426     // CW  +det
 427     // CCW det < 0;
 428     // 0 area triangles are marked as backfacing regardless of winding order,
 429     // which is required behavior for conservative rast and wireframe rendering
 430     uint32_t frontWindingTris;
 431     if (rastState.frontWinding == SWR_FRONTWINDING_CW)
 432     {
 433         maskLo = SIMD_T::movemask_pd(SIMD_T::castsi_pd(SIMD_T::cmpgt_epi64(vDet[0], SIMD_T::setzero_si())));
 434         maskHi = SIMD_T::movemask_pd(SIMD_T::castsi_pd(SIMD_T::cmpgt_epi64(vDet[1], SIMD_T::setzero_si())));
 435     }
 436     else
 437     {
 438         maskLo = SIMD_T::movemask_pd(SIMD_T::castsi_pd(SIMD_T::cmpgt_epi64(SIMD_T::setzero_si(), vDet[0])));
 439         maskHi = SIMD_T::movemask_pd(SIMD_T::castsi_pd(SIMD_T::cmpgt_epi64(SIMD_T::setzero_si(), vDet[1])));
 440     }
 441     frontWindingTris = maskLo | (maskHi << (SIMD_WIDTH / 2));
 442
 443     // cull
 444     uint32_t cullTris;
 445     switch ((SWR_CULLMODE)rastState.cullMode)
 446     {
 447     case SWR_CULLMODE_BOTH:  cullTris = 0xffffffff; break;
 448     case SWR_CULLMODE_NONE:  cullTris = 0x0; break;
 449     case SWR_CULLMODE_FRONT: cullTris = frontWindingTris; break;
 450         // 0 area triangles are marked as backfacing, which is required behavior for conservative rast
 451     case SWR_CULLMODE_BACK:  cullTris = ~frontWindingTris; break;
 452     default: SWR_INVALID("Invalid cull mode: %d", rastState.cullMode); cullTris = 0x0; break;
 453     }
 454
 455     triMask &= ~cullTris;
 456
 457     if (origTriMask ^ triMask)
 458     {
 459         RDTSC_EVENT(FECullZeroAreaAndBackface, _mm_popcnt_u32(origTriMask ^ triMask), 0);
 460     }
 461
 462     /// Note: these variable initializations must stay above any 'goto endBinTriangles'
 463     // compute per tri backface
 464     uint32_t frontFaceMask = frontWindingTris;
 465     uint32_t *pPrimID = (uint32_t *)&primID;
 466     const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx;
 467     DWORD triIndex = 0;
 468
 469     uint32_t edgeEnable;
 470     PFN_WORK_FUNC pfnWork;
 471     if (CT::IsConservativeT::value)
 472     {
 473         // determine which edges of the degenerate tri, if any, are valid to rasterize.
 474         // used to call the appropriate templated rasterizer function
 475         if (cullZeroAreaMask > 0)
 476         {
 477             // e0 = v1-v0
 478             const typename SIMD_T::Integer x0x1Mask = SIMD_T::cmpeq_epi32(vXi[0], vXi[1]);
 479             const typename SIMD_T::Integer y0y1Mask = SIMD_T::cmpeq_epi32(vYi[0], vYi[1]);
 480
 481             uint32_t e0Mask = SIMD_T::movemask_ps(SIMD_T::castsi_ps(SIMD_T::and_si(x0x1Mask, y0y1Mask)));
 482
 483             // e1 = v2-v1
 484             const typename SIMD_T::Integer x1x2Mask = SIMD_T::cmpeq_epi32(vXi[1], vXi[2]);
 485             const typename SIMD_T::Integer y1y2Mask = SIMD_T::cmpeq_epi32(vYi[1], vYi[2]);
 486
 487             uint32_t e1Mask = SIMD_T::movemask_ps(SIMD_T::castsi_ps(SIMD_T::and_si(x1x2Mask, y1y2Mask)));
 488
 489             // e2 = v0-v2
 490             // if v0 == v1 & v1 == v2, v0 == v2
 491             uint32_t e2Mask = e0Mask & e1Mask;
 492             SWR_ASSERT(KNOB_SIMD_WIDTH == 8, "Need to update degenerate mask code for avx512");
 493
 494             // edge order: e0 = v0v1, e1 = v1v2, e2 = v0v2
 495             // 32 bit binary: 0000 0000 0010 0100 1001 0010 0100 1001
 496             e0Mask = pdep_u32(e0Mask, 0x00249249);
 497
 498             // 32 bit binary: 0000 0000 0100 1001 0010 0100 1001 0010
 499             e1Mask = pdep_u32(e1Mask, 0x00492492);
 500
 501             // 32 bit binary: 0000 0000 1001 0010 0100 1001 0010 0100
 502             e2Mask = pdep_u32(e2Mask, 0x00924924);
 503
 504             edgeEnable = (0x00FFFFFF & (~(e0Mask | e1Mask | e2Mask)));
 505         }
 506         else
 507         {
 508             edgeEnable = 0x00FFFFFF;
 509         }
 510     }
 511     else
 512     {
 513         // degenerate triangles won't be sent to rasterizer; just enable all edges
 514         pfnWork = GetRasterizerFunc(rastState.sampleCount, rastState.bIsCenterPattern, (rastState.conservativeRast > 0),
 515             (SWR_INPUT_COVERAGE)pDC->pState->state.psState.inputCoverage, EdgeValToEdgeState(ALL_EDGES_VALID), (state.scissorsTileAligned == false));
 516     }
 517
 518     SIMDBBOX_T<SIMD_T> bbox;
 519
 520     if (!triMask)
 521     {
 522         goto endBinTriangles;
 523     }
 524
 525     // Calc bounding box of triangles
 526     calcBoundingBoxIntVertical<SIMD_T, CT>(vXi, vYi, bbox);
 527
 528     // determine if triangle falls between pixel centers and discard
 529     // only discard for non-MSAA case and when conservative rast is disabled
 530     // (xmin + 127) & ~255
 531     // (xmax + 128) & ~255
 532     if ((rastState.sampleCount == SWR_MULTISAMPLE_1X || rastState.bIsCenterPattern) &&
 533         (!CT::IsConservativeT::value))
 534     {
 535         origTriMask = triMask;
 536
 537         int cullCenterMask;
 538
 539         {
 540             typename SIMD_T::Integer xmin = SIMD_T::add_epi32(bbox.xmin, SIMD_T::set1_epi32(127));
 541             xmin = SIMD_T::and_si(xmin, SIMD_T::set1_epi32(~255));
 542             typename SIMD_T::Integer xmax = SIMD_T::add_epi32(bbox.xmax, SIMD_T::set1_epi32(128));
 543             xmax = SIMD_T::and_si(xmax, SIMD_T::set1_epi32(~255));
 544
 545             typename SIMD_T::Integer vMaskH = SIMD_T::cmpeq_epi32(xmin, xmax);
 546
 547             typename SIMD_T::Integer ymin = SIMD_T::add_epi32(bbox.ymin, SIMD_T::set1_epi32(127));
 548             ymin = SIMD_T::and_si(ymin, SIMD_T::set1_epi32(~255));
 549             typename SIMD_T::Integer ymax = SIMD_T::add_epi32(bbox.ymax, SIMD_T::set1_epi32(128));
 550             ymax = SIMD_T::and_si(ymax, SIMD_T::set1_epi32(~255));
 551
 552             typename SIMD_T::Integer vMaskV = SIMD_T::cmpeq_epi32(ymin, ymax);
 553
 554             vMaskV = SIMD_T::or_si(vMaskH, vMaskV);
 555             cullCenterMask = SIMD_T::movemask_ps(SIMD_T::castsi_ps(vMaskV));
 556         }
 557
 558         triMask &= ~cullCenterMask;
 559
 560         if (origTriMask ^ triMask)
 561         {
 562             RDTSC_EVENT(FECullBetweenCenters, _mm_popcnt_u32(origTriMask ^ triMask), 0);
 563         }
 564     }
 565
 566     // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
 567     // Gather the AOS effective scissor rects based on the per-prim VP index.
 568     /// @todo:  Look at speeding this up -- weigh against corresponding costs in rasterizer.
 569     {
 570         typename SIMD_T::Integer scisXmin, scisYmin, scisXmax, scisYmax;
 571
 572         if (state.backendState.readViewportArrayIndex)
 573         {
 574             GatherScissors(&state.scissorsInFixedPoint[0], pViewportIndex, scisXmin, scisYmin, scisXmax, scisYmax);
 575         }
 576         else // broadcast fast path for non-VPAI case.
 577         {
 578             scisXmin = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].xmin);
 579             scisYmin = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].ymin);
 580             scisXmax = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].xmax);
 581             scisYmax = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].ymax);
 582         }
 583
 584         // Make triangle bbox inclusive
 585         bbox.xmax = SIMD_T::sub_epi32(bbox.xmax, SIMD_T::set1_epi32(1));
 586         bbox.ymax = SIMD_T::sub_epi32(bbox.ymax, SIMD_T::set1_epi32(1));
 587
 588         bbox.xmin = SIMD_T::max_epi32(bbox.xmin, scisXmin);
 589         bbox.ymin = SIMD_T::max_epi32(bbox.ymin, scisYmin);
 590         bbox.xmax = SIMD_T::min_epi32(bbox.xmax, scisXmax);
 591         bbox.ymax = SIMD_T::min_epi32(bbox.ymax, scisYmax);
 592     }
 593
 594     if (CT::IsConservativeT::value)
 595     {
 596         // in the case where a degenerate triangle is on a scissor edge, we need to make sure the primitive bbox has
 597         // some area. Bump the xmax/ymax edges out
 598
 599         typename SIMD_T::Integer topEqualsBottom = SIMD_T::cmpeq_epi32(bbox.ymin, bbox.ymax);
 600         bbox.ymax = SIMD_T::blendv_epi32(bbox.ymax, SIMD_T::add_epi32(bbox.ymax, SIMD_T::set1_epi32(1)), topEqualsBottom);
 601
 602         typename SIMD_T::Integer leftEqualsRight = SIMD_T::cmpeq_epi32(bbox.xmin, bbox.xmax);
 603         bbox.xmax = SIMD_T::blendv_epi32(bbox.xmax, SIMD_T::add_epi32(bbox.xmax, SIMD_T::set1_epi32(1)), leftEqualsRight);
 604     }
 605
 606     // Cull tris completely outside scissor
 607     {
 608         typename SIMD_T::Integer maskOutsideScissorX = SIMD_T::cmpgt_epi32(bbox.xmin, bbox.xmax);
 609         typename SIMD_T::Integer maskOutsideScissorY = SIMD_T::cmpgt_epi32(bbox.ymin, bbox.ymax);
 610         typename SIMD_T::Integer maskOutsideScissorXY = SIMD_T::or_si(maskOutsideScissorX, maskOutsideScissorY);
 611         uint32_t maskOutsideScissor = SIMD_T::movemask_ps(SIMD_T::castsi_ps(maskOutsideScissorXY));
 612         triMask = triMask & ~maskOutsideScissor;
 613     }
 614
 615 endBinTriangles:
 616
 617
 618     // Send surviving triangles to the line or point binner based on fill mode
 619     if (rastState.fillMode == SWR_FILLMODE_WIREFRAME)
 620     {
 621         // Simple non-conformant wireframe mode, useful for debugging
 622         // construct 3 SIMD lines out of the triangle and call the line binner for each SIMD
 623         typename SIMD_T::Vec4 line[2];
 624         typename SIMD_T::Float recipW[2];
 625
 626         line[0] = tri[0];
 627         line[1] = tri[1];
 628         recipW[0] = vRecipW0;
 629         recipW[1] = vRecipW1;
 630
 631         BinPostSetupLinesImpl<SIMD_T, SIMD_WIDTH>(pDC, pa, workerId, line, recipW, triMask, primID, viewportIdx);
 632
 633         line[0] = tri[1];
 634         line[1] = tri[2];
 635         recipW[0] = vRecipW1;
 636         recipW[1] = vRecipW2;
 637
 638         BinPostSetupLinesImpl<SIMD_T, SIMD_WIDTH>(pDC, pa, workerId, line, recipW, triMask, primID, viewportIdx);
 639
 640         line[0] = tri[2];
 641         line[1] = tri[0];
 642         recipW[0] = vRecipW2;
 643         recipW[1] = vRecipW0;
 644
 645         BinPostSetupLinesImpl<SIMD_T, SIMD_WIDTH>(pDC, pa, workerId, line, recipW, triMask, primID, viewportIdx);
 646
 647         AR_END(FEBinTriangles, 1);
 648         return;
 649     }
 650     else if (rastState.fillMode == SWR_FILLMODE_POINT)
 651     {
 652         // Bin 3 points
 653         BinPostSetupPointsImpl<SIMD_T, SIMD_WIDTH>(pDC, pa, workerId, &tri[0], triMask, primID, viewportIdx);
 654         BinPostSetupPointsImpl<SIMD_T, SIMD_WIDTH>(pDC, pa, workerId, &tri[1], triMask, primID, viewportIdx);
 655         BinPostSetupPointsImpl<SIMD_T, SIMD_WIDTH>(pDC, pa, workerId, &tri[2], triMask, primID, viewportIdx);
 656
 657         AR_END(FEBinTriangles, 1);
 658         return;
 659     }
 660
 661     // Convert triangle bbox to macrotile units.
 662     bbox.xmin = SIMD_T::template srai_epi32<KNOB_MACROTILE_X_DIM_FIXED_SHIFT>(bbox.xmin);
 663     bbox.ymin = SIMD_T::template srai_epi32<KNOB_MACROTILE_Y_DIM_FIXED_SHIFT>(bbox.ymin);
 664     bbox.xmax = SIMD_T::template srai_epi32<KNOB_MACROTILE_X_DIM_FIXED_SHIFT>(bbox.xmax);
 665     bbox.ymax = SIMD_T::template srai_epi32<KNOB_MACROTILE_Y_DIM_FIXED_SHIFT>(bbox.ymax);
 666
 667     OSALIGNSIMD16(uint32_t) aMTLeft[SIMD_WIDTH], aMTRight[SIMD_WIDTH], aMTTop[SIMD_WIDTH], aMTBottom[SIMD_WIDTH];
 668
 669     SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTLeft),   bbox.xmin);
 670     SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTRight),  bbox.xmax);
 671     SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTTop),    bbox.ymin);
 672     SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTBottom), bbox.ymax);
 673
 674     // transpose verts needed for backend
 675     /// @todo modify BE to take non-transformed verts
 676     OSALIGNSIMD16(simd4scalar) vHorizX[SIMD_WIDTH];
 677     OSALIGNSIMD16(simd4scalar) vHorizY[SIMD_WIDTH];
 678     OSALIGNSIMD16(simd4scalar) vHorizZ[SIMD_WIDTH];
 679     OSALIGNSIMD16(simd4scalar) vHorizW[SIMD_WIDTH];
 680
 681     TransposeVertices(vHorizX, tri[0].x, tri[1].x, tri[2].x);
 682     TransposeVertices(vHorizY, tri[0].y, tri[1].y, tri[2].y);
 683     TransposeVertices(vHorizZ, tri[0].z, tri[1].z, tri[2].z);
 684     TransposeVertices(vHorizW, vRecipW0, vRecipW1, vRecipW2);
 685
 686     // store render target array index
 687     OSALIGNSIMD16(uint32_t) aRTAI[SIMD_WIDTH];
 688     if (state.backendState.readRenderTargetArrayIndex)
 689     {
 690         typename SIMD_T::Vec4 vRtai[3];
 691         pa.Assemble(VERTEX_SGV_SLOT, vRtai);
 692         typename SIMD_T::Integer vRtaii;
 693         vRtaii = SIMD_T::castps_si(vRtai[0][VERTEX_SGV_RTAI_COMP]);
 694         SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aRTAI), vRtaii);
 695     }
 696     else
 697     {
 698         SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aRTAI), SIMD_T::setzero_si());
 699     }
 700
 701
 702     // scan remaining valid triangles and bin each separately
 703     while (_BitScanForward(&triIndex, triMask))
 704     {
 705         uint32_t linkageCount = state.backendState.numAttributes;
 706         uint32_t numScalarAttribs = linkageCount * 4;
 707
 708         BE_WORK work;
 709         work.type = DRAW;
 710
 711         bool isDegenerate;
 712         if (CT::IsConservativeT::value)
 713         {
 714             // only rasterize valid edges if we have a degenerate primitive
 715             int32_t triEdgeEnable = (edgeEnable >> (triIndex * 3)) & ALL_EDGES_VALID;
 716             work.pfnWork = GetRasterizerFunc(rastState.sampleCount, rastState.bIsCenterPattern, (rastState.conservativeRast > 0),
 717                 (SWR_INPUT_COVERAGE)pDC->pState->state.psState.inputCoverage, EdgeValToEdgeState(triEdgeEnable), (state.scissorsTileAligned == false));
 718
 719             // Degenerate triangles are required to be constant interpolated
 720             isDegenerate = (triEdgeEnable != ALL_EDGES_VALID) ? true : false;
 721         }
 722         else
 723         {
 724             isDegenerate = false;
 725             work.pfnWork = pfnWork;
 726         }
 727
 728         // Select attribute processor
 729         PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(3,
 730             state.backendState.swizzleEnable, state.backendState.constantInterpolationMask, isDegenerate);
 731
 732         TRIANGLE_WORK_DESC &desc = work.desc.tri;
 733
 734         desc.triFlags.frontFacing = state.forceFront ? 1 : ((frontFaceMask >> triIndex) & 1);
 735         desc.triFlags.renderTargetArrayIndex = aRTAI[triIndex];
 736         desc.triFlags.viewportIndex = pViewportIndex[triIndex];
 737
 738         auto pArena = pDC->pArena;
 739         SWR_ASSERT(pArena != nullptr);
 740
 741         // store active attribs
 742         float *pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
 743         desc.pAttribs = pAttribs;
 744         desc.numAttribs = linkageCount;
 745         pfnProcessAttribs(pDC, pa, triIndex, pPrimID[triIndex], desc.pAttribs);
 746
 747         // store triangle vertex data
 748         desc.pTriBuffer = (float*)pArena->AllocAligned(4 * 4 * sizeof(float), 16);
 749
 750         SIMD128::store_ps(&desc.pTriBuffer[0],  vHorizX[triIndex]);
 751         SIMD128::store_ps(&desc.pTriBuffer[4],  vHorizY[triIndex]);
 752         SIMD128::store_ps(&desc.pTriBuffer[8],  vHorizZ[triIndex]);
 753         SIMD128::store_ps(&desc.pTriBuffer[12], vHorizW[triIndex]);
 754
 755         // store user clip distances
 756         if (state.backendState.clipDistanceMask)
 757         {
 758             uint32_t numClipDist = _mm_popcnt_u32(state.backendState.clipDistanceMask);
 759             desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 3 * sizeof(float));
 760             ProcessUserClipDist<3>(state.backendState, pa, triIndex, &desc.pTriBuffer[12], desc.pUserClipBuffer);
 761         }
 762
 763         for (uint32_t y = aMTTop[triIndex]; y <= aMTBottom[triIndex]; ++y)
 764         {
 765             for (uint32_t x = aMTLeft[triIndex]; x <= aMTRight[triIndex]; ++x)
 766             {
 767 #if KNOB_ENABLE_TOSS_POINTS
 768                 if (!KNOB_TOSS_SETUP_TRIS)
 769 #endif
 770                 {
 771                     pTileMgr->enqueue(x, y, &work);
 772                 }
 773             }
 774         }
 775
 776                      triMask &= ~(1 << triIndex);
 777     }
 778
 779     AR_END(FEBinTriangles, 1);
 780 }
 781
 782 template <typename CT>
 783 void BinTriangles(
 784     DRAW_CONTEXT *pDC,
 785     PA_STATE &pa,
 786     uint32_t workerId,
 787     simdvector tri[3],
 788     uint32_t triMask,
 789     simdscalari const &primID)
 790 {
 791     BinTrianglesImpl<SIMD256, KNOB_SIMD_WIDTH, CT>(pDC, pa, workerId, tri, triMask, primID);
 792 }
 793
 794 #if USE_SIMD16_FRONTEND
 795 template <typename CT>
 796 void SIMDCALL BinTriangles_simd16(
 797     DRAW_CONTEXT *pDC,
 798     PA_STATE &pa,
 799     uint32_t workerId,
 800     simd16vector tri[3],
 801     uint32_t triMask,
 802     simd16scalari const &primID)
 803 {
 804     BinTrianglesImpl<SIMD512, KNOB_SIMD16_WIDTH, CT>(pDC, pa, workerId, tri, triMask, primID);
 805 }
 806
 807 #endif
 808 struct FEBinTrianglesChooser
 809 {
 810     typedef PFN_PROCESS_PRIMS FuncType;
 811
 812     template <typename... ArgsB>
 813     static FuncType GetFunc()
 814     {
 815         return BinTriangles<ConservativeRastFETraits<ArgsB...>>;
 816     }
 817 };
 818
 819 // Selector for correct templated BinTrinagles function
 820 PFN_PROCESS_PRIMS GetBinTrianglesFunc(bool IsConservative)
 821 {
 822     return TemplateArgUnroller<FEBinTrianglesChooser>::GetFunc(IsConservative);
 823 }
 824
 825 #if USE_SIMD16_FRONTEND
 826 struct FEBinTrianglesChooser_simd16
 827 {
 828     typedef PFN_PROCESS_PRIMS_SIMD16 FuncType;
 829
 830     template <typename... ArgsB>
 831     static FuncType GetFunc()
 832     {
 833         return BinTriangles_simd16<ConservativeRastFETraits<ArgsB...>>;
 834     }
 835 };
 836
 837 // Selector for correct templated BinTrinagles function
 838 PFN_PROCESS_PRIMS_SIMD16 GetBinTrianglesFunc_simd16(bool IsConservative)
 839 {
 840     return TemplateArgUnroller<FEBinTrianglesChooser_simd16>::GetFunc(IsConservative);
 841 }
 842
 843 #endif
 844
 845 template <typename SIMD_T, uint32_t SIMD_WIDTH>
 846 void BinPostSetupPointsImpl(
 847     DRAW_CONTEXT *pDC,
 848     PA_STATE &pa,
 849     uint32_t workerId,
 850     typename SIMD_T::Vec4 prim[],
 851     uint32_t primMask,
 852     typename SIMD_T::Integer const &primID,
 853     typename SIMD_T::Integer const &viewportIdx)
 854 {
 855     SWR_CONTEXT *pContext = pDC->pContext;
 856
 857     AR_BEGIN(FEBinPoints, pDC->drawId);
 858
 859     typename SIMD_T::Vec4 &primVerts = prim[0];
 860
 861     const API_STATE& state = GetApiState(pDC);
 862     const SWR_RASTSTATE& rastState = state.rastState;
 863     const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx;
 864
 865     // Select attribute processor
 866     PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(1,
 867         state.backendState.swizzleEnable, state.backendState.constantInterpolationMask);
 868
 869     // convert to fixed point
 870     typename SIMD_T::Integer vXi, vYi;
 871
 872     vXi = fpToFixedPointVertical<SIMD_T>(primVerts.x);
 873     vYi = fpToFixedPointVertical<SIMD_T>(primVerts.y);
 874
 875     if (CanUseSimplePoints(pDC))
 876     {
 877         // adjust for ymin-xmin rule
 878         vXi = SIMD_T::sub_epi32(vXi, SIMD_T::set1_epi32(1));
 879         vYi = SIMD_T::sub_epi32(vYi, SIMD_T::set1_epi32(1));
 880
 881         // cull points off the ymin-xmin edge of the viewport
 882         primMask &= ~SIMD_T::movemask_ps(SIMD_T::castsi_ps(vXi));
 883         primMask &= ~SIMD_T::movemask_ps(SIMD_T::castsi_ps(vYi));
 884
 885         // compute macro tile coordinates
 886         typename SIMD_T::Integer macroX = SIMD_T::template srai_epi32<KNOB_MACROTILE_X_DIM_FIXED_SHIFT>(vXi);
 887         typename SIMD_T::Integer macroY = SIMD_T::template srai_epi32<KNOB_MACROTILE_Y_DIM_FIXED_SHIFT>(vYi);
 888
 889         OSALIGNSIMD16(uint32_t) aMacroX[SIMD_WIDTH], aMacroY[SIMD_WIDTH];
 890
 891         SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMacroX), macroX);
 892         SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMacroY), macroY);
 893
 894         // compute raster tile coordinates
 895         typename SIMD_T::Integer rasterX = SIMD_T::template srai_epi32<KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT>(vXi);
 896         typename SIMD_T::Integer rasterY = SIMD_T::template srai_epi32<KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT>(vYi);
 897
 898         // compute raster tile relative x,y for coverage mask
 899         typename SIMD_T::Integer tileAlignedX = SIMD_T::template slli_epi32<KNOB_TILE_X_DIM_SHIFT>(rasterX);
 900         typename SIMD_T::Integer tileAlignedY = SIMD_T::template slli_epi32<KNOB_TILE_Y_DIM_SHIFT>(rasterY);
 901
 902         typename SIMD_T::Integer tileRelativeX = SIMD_T::sub_epi32(SIMD_T::template srai_epi32<FIXED_POINT_SHIFT>(vXi), tileAlignedX);
 903         typename SIMD_T::Integer tileRelativeY = SIMD_T::sub_epi32(SIMD_T::template srai_epi32<FIXED_POINT_SHIFT>(vYi), tileAlignedY);
 904
 905         OSALIGNSIMD16(uint32_t) aTileRelativeX[SIMD_WIDTH];
 906         OSALIGNSIMD16(uint32_t) aTileRelativeY[SIMD_WIDTH];
 907
 908         SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aTileRelativeX), tileRelativeX);
 909         SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aTileRelativeY), tileRelativeY);
 910
 911         OSALIGNSIMD16(uint32_t) aTileAlignedX[SIMD_WIDTH];
 912         OSALIGNSIMD16(uint32_t) aTileAlignedY[SIMD_WIDTH];
 913
 914         SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aTileAlignedX), tileAlignedX);
 915         SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aTileAlignedY), tileAlignedY);
 916
 917         OSALIGNSIMD16(float) aZ[SIMD_WIDTH];
 918         SIMD_T::store_ps(reinterpret_cast<float *>(aZ), primVerts.z);
 919
 920         // store render target array index
 921         OSALIGNSIMD16(uint32_t) aRTAI[SIMD_WIDTH];
 922         if (state.backendState.readRenderTargetArrayIndex)
 923         {
 924             typename SIMD_T::Vec4 vRtai;
 925             pa.Assemble(VERTEX_SGV_SLOT, &vRtai);
 926             typename SIMD_T::Integer vRtaii = SIMD_T::castps_si(vRtai[VERTEX_SGV_RTAI_COMP]);
 927             SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aRTAI), vRtaii);
 928         }
 929         else
 930         {
 931             SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aRTAI), SIMD_T::setzero_si());
 932         }
 933
 934         uint32_t *pPrimID = (uint32_t *)&primID;
 935         DWORD primIndex = 0;
 936
 937         const SWR_BACKEND_STATE& backendState = pDC->pState->state.backendState;
 938
 939         // scan remaining valid triangles and bin each separately
 940         while (_BitScanForward(&primIndex, primMask))
 941         {
 942             uint32_t linkageCount = backendState.numAttributes;
 943             uint32_t numScalarAttribs = linkageCount * 4;
 944
 945             BE_WORK work;
 946             work.type = DRAW;
 947
 948             TRIANGLE_WORK_DESC &desc = work.desc.tri;
 949
 950             // points are always front facing
 951             desc.triFlags.frontFacing = 1;
 952             desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];
 953             desc.triFlags.viewportIndex = pViewportIndex[primIndex];
 954
 955             work.pfnWork = RasterizeSimplePoint;
 956
 957             auto pArena = pDC->pArena;
 958             SWR_ASSERT(pArena != nullptr);
 959
 960             // store attributes
 961             float *pAttribs = (float*)pArena->AllocAligned(3 * numScalarAttribs * sizeof(float), 16);
 962             desc.pAttribs = pAttribs;
 963             desc.numAttribs = linkageCount;
 964
 965             pfnProcessAttribs(pDC, pa, primIndex, pPrimID[primIndex], pAttribs);
 966
 967             // store raster tile aligned x, y, perspective correct z
 968             float *pTriBuffer = (float*)pArena->AllocAligned(4 * sizeof(float), 16);
 969             desc.pTriBuffer = pTriBuffer;
 970             *(uint32_t*)pTriBuffer++ = aTileAlignedX[primIndex];
 971             *(uint32_t*)pTriBuffer++ = aTileAlignedY[primIndex];
 972             *pTriBuffer = aZ[primIndex];
 973
 974             uint32_t tX = aTileRelativeX[primIndex];
 975             uint32_t tY = aTileRelativeY[primIndex];
 976
 977             // pack the relative x,y into the coverageMask, the rasterizer will
 978             // generate the true coverage mask from it
 979             work.desc.tri.triFlags.coverageMask = tX | (tY << 4);
 980
 981             // bin it
 982             MacroTileMgr *pTileMgr = pDC->pTileMgr;
 983 #if KNOB_ENABLE_TOSS_POINTS
 984             if (!KNOB_TOSS_SETUP_TRIS)
 985 #endif
 986             {
 987                 pTileMgr->enqueue(aMacroX[primIndex], aMacroY[primIndex], &work);
 988             }
 989
 990             primMask &= ~(1 << primIndex);
 991         }
 992     }
 993     else
 994     {
 995         // non simple points need to be potentially binned to multiple macro tiles
 996         typename SIMD_T::Float vPointSize;
 997
 998         if (rastState.pointParam)
 999         {
1000             typename SIMD_T::Vec4 size[3];
1001             pa.Assemble(VERTEX_SGV_SLOT, size);
1002             vPointSize = size[0][VERTEX_SGV_POINT_SIZE_COMP];
1003         }
1004         else
1005         {
1006             vPointSize = SIMD_T::set1_ps(rastState.pointSize);
1007         }
1008
1009         // bloat point to bbox
1010         SIMDBBOX_T<SIMD_T> bbox;
1011
1012         bbox.xmin = bbox.xmax = vXi;
1013         bbox.ymin = bbox.ymax = vYi;
1014
1015         typename SIMD_T::Float vHalfWidth = SIMD_T::mul_ps(vPointSize, SIMD_T::set1_ps(0.5f));
1016         typename SIMD_T::Integer vHalfWidthi = fpToFixedPointVertical<SIMD_T>(vHalfWidth);
1017
1018         bbox.xmin = SIMD_T::sub_epi32(bbox.xmin, vHalfWidthi);
1019         bbox.xmax = SIMD_T::add_epi32(bbox.xmax, vHalfWidthi);
1020         bbox.ymin = SIMD_T::sub_epi32(bbox.ymin, vHalfWidthi);
1021         bbox.ymax = SIMD_T::add_epi32(bbox.ymax, vHalfWidthi);
1022
1023         // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
1024         // Gather the AOS effective scissor rects based on the per-prim VP index.
1025         /// @todo:  Look at speeding this up -- weigh against corresponding costs in rasterizer.
1026         {
1027             typename SIMD_T::Integer scisXmin, scisYmin, scisXmax, scisYmax;
1028
1029             if (state.backendState.readViewportArrayIndex)
1030             {
1031                 GatherScissors(&state.scissorsInFixedPoint[0], pViewportIndex, scisXmin, scisYmin, scisXmax, scisYmax);
1032             }
1033             else // broadcast fast path for non-VPAI case.
1034             {
1035                 scisXmin = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].xmin);
1036                 scisYmin = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].ymin);
1037                 scisXmax = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].xmax);
1038                 scisYmax = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].ymax);
1039             }
1040
1041             bbox.xmin = SIMD_T::max_epi32(bbox.xmin, scisXmin);
1042             bbox.ymin = SIMD_T::max_epi32(bbox.ymin, scisYmin);
1043             bbox.xmax = SIMD_T::min_epi32(SIMD_T::sub_epi32(bbox.xmax, SIMD_T::set1_epi32(1)), scisXmax);
1044             bbox.ymax = SIMD_T::min_epi32(SIMD_T::sub_epi32(bbox.ymax, SIMD_T::set1_epi32(1)), scisYmax);
1045         }
1046
1047         // Cull bloated points completely outside scissor
1048         typename SIMD_T::Integer maskOutsideScissorX = SIMD_T::cmpgt_epi32(bbox.xmin, bbox.xmax);
1049         typename SIMD_T::Integer maskOutsideScissorY = SIMD_T::cmpgt_epi32(bbox.ymin, bbox.ymax);
1050         typename SIMD_T::Integer maskOutsideScissorXY = SIMD_T::or_si(maskOutsideScissorX, maskOutsideScissorY);
1051         uint32_t maskOutsideScissor = SIMD_T::movemask_ps(SIMD_T::castsi_ps(maskOutsideScissorXY));
1052         primMask = primMask & ~maskOutsideScissor;
1053
1054         // Convert bbox to macrotile units.
1055         bbox.xmin = SIMD_T::template srai_epi32<KNOB_MACROTILE_X_DIM_FIXED_SHIFT>(bbox.xmin);
1056         bbox.ymin = SIMD_T::template srai_epi32<KNOB_MACROTILE_Y_DIM_FIXED_SHIFT>(bbox.ymin);
1057         bbox.xmax = SIMD_T::template srai_epi32<KNOB_MACROTILE_X_DIM_FIXED_SHIFT>(bbox.xmax);
1058         bbox.ymax = SIMD_T::template srai_epi32<KNOB_MACROTILE_Y_DIM_FIXED_SHIFT>(bbox.ymax);
1059
1060         OSALIGNSIMD16(uint32_t) aMTLeft[SIMD_WIDTH], aMTRight[SIMD_WIDTH], aMTTop[SIMD_WIDTH], aMTBottom[SIMD_WIDTH];
1061
1062         SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTLeft),   bbox.xmin);
1063         SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTRight),  bbox.xmax);
1064         SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTTop),    bbox.ymin);
1065         SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTBottom), bbox.ymax);
1066
1067         // store render target array index
1068         OSALIGNSIMD16(uint32_t) aRTAI[SIMD_WIDTH];
1069         if (state.backendState.readRenderTargetArrayIndex)
1070         {
1071             typename SIMD_T::Vec4 vRtai[2];
1072             pa.Assemble(VERTEX_SGV_SLOT, vRtai);
1073             typename SIMD_T::Integer vRtaii = SIMD_T::castps_si(vRtai[0][VERTEX_SGV_RTAI_COMP]);
1074             SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aRTAI), vRtaii);
1075         }
1076         else
1077         {
1078             SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aRTAI), SIMD_T::setzero_si());
1079         }
1080
1081         OSALIGNSIMD16(float) aPointSize[SIMD_WIDTH];
1082         SIMD_T::store_ps(reinterpret_cast<float *>(aPointSize), vPointSize);
1083
1084         uint32_t *pPrimID = (uint32_t *)&primID;
1085
1086         OSALIGNSIMD16(float) aPrimVertsX[SIMD_WIDTH];
1087         OSALIGNSIMD16(float) aPrimVertsY[SIMD_WIDTH];
1088         OSALIGNSIMD16(float) aPrimVertsZ[SIMD_WIDTH];
1089
1090         SIMD_T::store_ps(reinterpret_cast<float *>(aPrimVertsX), primVerts.x);
1091         SIMD_T::store_ps(reinterpret_cast<float *>(aPrimVertsY), primVerts.y);
1092         SIMD_T::store_ps(reinterpret_cast<float *>(aPrimVertsZ), primVerts.z);
1093
1094         // scan remaining valid prims and bin each separately
1095         const SWR_BACKEND_STATE& backendState = state.backendState;
1096         DWORD primIndex;
1097         while (_BitScanForward(&primIndex, primMask))
1098         {
1099             uint32_t linkageCount = backendState.numAttributes;
1100             uint32_t numScalarAttribs = linkageCount * 4;
1101
1102             BE_WORK work;
1103             work.type = DRAW;
1104
1105             TRIANGLE_WORK_DESC &desc = work.desc.tri;
1106
1107             desc.triFlags.frontFacing = 1;
1108             desc.triFlags.pointSize = aPointSize[primIndex];
1109             desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];
1110             desc.triFlags.viewportIndex = pViewportIndex[primIndex];
1111
1112             work.pfnWork = RasterizeTriPoint;
1113
1114             auto pArena = pDC->pArena;
1115             SWR_ASSERT(pArena != nullptr);
1116
1117             // store active attribs
1118             desc.pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
1119             desc.numAttribs = linkageCount;
1120             pfnProcessAttribs(pDC, pa, primIndex, pPrimID[primIndex], desc.pAttribs);
1121
1122             // store point vertex data
1123             float *pTriBuffer = (float*)pArena->AllocAligned(4 * sizeof(float), 16);
1124             desc.pTriBuffer = pTriBuffer;
1125             *pTriBuffer++ = aPrimVertsX[primIndex];
1126             *pTriBuffer++ = aPrimVertsY[primIndex];
1127             *pTriBuffer = aPrimVertsZ[primIndex];
1128
1129             // store user clip distances
1130             if (backendState.clipDistanceMask)
1131             {
1132                 uint32_t numClipDist = _mm_popcnt_u32(backendState.clipDistanceMask);
1133                 desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 3 * sizeof(float));
1134                 float dists[8];
1135                 float one = 1.0f;
1136                 ProcessUserClipDist<1>(backendState, pa, primIndex, &one, dists);
1137                 for (uint32_t i = 0; i < numClipDist; i++) {
1138                     desc.pUserClipBuffer[3 * i + 0] = 0.0f;
1139                     desc.pUserClipBuffer[3 * i + 1] = 0.0f;
1140                     desc.pUserClipBuffer[3 * i + 2] = dists[i];
1141                 }
1142             }
1143
1144             MacroTileMgr *pTileMgr = pDC->pTileMgr;
1145             for (uint32_t y = aMTTop[primIndex]; y <= aMTBottom[primIndex]; ++y)
1146             {
1147                 for (uint32_t x = aMTLeft[primIndex]; x <= aMTRight[primIndex]; ++x)
1148                 {
1149 #if KNOB_ENABLE_TOSS_POINTS
1150                     if (!KNOB_TOSS_SETUP_TRIS)
1151 #endif
1152                     {
1153                         pTileMgr->enqueue(x, y, &work);
1154                     }
1155                 }
1156             }
1157
1158             primMask &= ~(1 << primIndex);
1159         }
1160     }
1161
1162     AR_END(FEBinPoints, 1);
1163 }
1164
1165 //////////////////////////////////////////////////////////////////////////
1166 /// @brief Bin SIMD points to the backend.  Only supports point size of 1
1167 /// @param pDC - pointer to draw context.
1168 /// @param pa - The primitive assembly object.
1169 /// @param workerId - thread's worker id. Even thread has a unique id.
1170 /// @param tri - Contains point position data for SIMDs worth of points.
1171 /// @param primID - Primitive ID for each point.
1172 template <typename SIMD_T, uint32_t SIMD_WIDTH>
1173 void BinPointsImpl(
1174     DRAW_CONTEXT *pDC,
1175     PA_STATE &pa,
1176     uint32_t workerId,
1177     typename SIMD_T::Vec4 prim[3],
1178     uint32_t primMask,
1179     typename SIMD_T::Integer const &primID)
1180 {
1181     const API_STATE& state = GetApiState(pDC);
1182     const SWR_FRONTEND_STATE& feState = state.frontendState;
1183     const SWR_RASTSTATE& rastState = state.rastState;
1184
1185     // Read back viewport index if required
1186     typename SIMD_T::Integer viewportIdx = SIMD_T::setzero_si();
1187     typename SIMD_T::Vec4 vpiAttrib[1];
1188     typename SIMD_T::Integer vpai = SIMD_T::setzero_si();
1189
1190     if (state.backendState.readViewportArrayIndex)
1191     {
1192         pa.Assemble(VERTEX_SGV_SLOT, vpiAttrib);
1193
1194         vpai = SIMD_T::castps_si(vpiAttrib[0][VERTEX_SGV_VAI_COMP]);
1195     }
1196
1197
1198     if (state.backendState.readViewportArrayIndex) // VPAIOffsets are guaranteed 0-15 -- no OOB issues if they are offsets from 0
1199     {
1200         // OOB indices => forced to zero.
1201         vpai = SIMD_T::max_epi32(vpai, SIMD_T::setzero_si());
1202         typename SIMD_T::Integer vNumViewports = SIMD_T::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
1203         typename SIMD_T::Integer vClearMask = SIMD_T::cmplt_epi32(vpai, vNumViewports);
1204         viewportIdx = SIMD_T::and_si(vClearMask, vpai);
1205     }
1206     else
1207     {
1208         viewportIdx = vpai;
1209     }
1210
1211     if (!feState.vpTransformDisable)
1212     {
1213         // perspective divide
1214         typename SIMD_T::Float vRecipW0 = SIMD_T::div_ps(SIMD_T::set1_ps(1.0f), prim[0].w);
1215
1216         prim[0].x = SIMD_T::mul_ps(prim[0].x, vRecipW0);
1217         prim[0].y = SIMD_T::mul_ps(prim[0].y, vRecipW0);
1218         prim[0].z = SIMD_T::mul_ps(prim[0].z, vRecipW0);
1219
1220         // viewport transform to screen coords
1221         if (state.backendState.readViewportArrayIndex)
1222         {
1223             viewportTransform<1>(prim, state.vpMatrices, viewportIdx);
1224         }
1225         else
1226         {
1227             viewportTransform<1>(prim, state.vpMatrices);
1228         }
1229     }
1230
1231     typename SIMD_T::Float offset = SwrPixelOffsets<SIMD_T>::GetOffset(rastState.pixelLocation);
1232
1233     prim[0].x = SIMD_T::add_ps(prim[0].x, offset);
1234     prim[0].y = SIMD_T::add_ps(prim[0].y, offset);
1235
1236     BinPostSetupPointsImpl<SIMD_T, SIMD_WIDTH>(
1237         pDC,
1238         pa,
1239         workerId,
1240         prim,
1241         primMask,
1242         primID,
1243         viewportIdx);
1244 }
1245
1246 void BinPoints(
1247     DRAW_CONTEXT *pDC,
1248     PA_STATE &pa,
1249     uint32_t workerId,
1250     simdvector prim[3],
1251     uint32_t primMask,
1252     simdscalari const &primID)
1253 {
1254     BinPointsImpl<SIMD256, KNOB_SIMD_WIDTH>(
1255         pDC,
1256         pa,
1257         workerId,
1258         prim,
1259         primMask,
1260         primID);
1261 }
1262
1263 #if USE_SIMD16_FRONTEND
1264 void SIMDCALL BinPoints_simd16(
1265     DRAW_CONTEXT *pDC,
1266     PA_STATE &pa,
1267     uint32_t workerId,
1268     simd16vector prim[3],
1269     uint32_t primMask,
1270     simd16scalari const &primID)
1271 {
1272     BinPointsImpl<SIMD512, KNOB_SIMD16_WIDTH>(
1273         pDC,
1274         pa,
1275         workerId,
1276         prim,
1277         primMask,
1278         primID);
1279 }
1280
1281 #endif
1282 //////////////////////////////////////////////////////////////////////////
1283 /// @brief Bin SIMD lines to the backend.
1284 /// @param pDC - pointer to draw context.
1285 /// @param pa - The primitive assembly object.
1286 /// @param workerId - thread's worker id. Even thread has a unique id.
1287 /// @param tri - Contains line position data for SIMDs worth of points.
1288 /// @param primID - Primitive ID for each line.
1289 /// @param viewportIdx - Viewport Array Index for each line.
1290 template <typename SIMD_T, uint32_t SIMD_WIDTH>
1291 void BinPostSetupLinesImpl(
1292     DRAW_CONTEXT *pDC,
1293     PA_STATE &pa,
1294     uint32_t workerId,
1295     typename SIMD_T::Vec4 prim[],
1296     typename SIMD_T::Float recipW[],
1297     uint32_t primMask,
1298     typename SIMD_T::Integer const &primID,
1299     typename SIMD_T::Integer const &viewportIdx)
1300 {
1301     SWR_CONTEXT *pContext = pDC->pContext;
1302
1303     AR_BEGIN(FEBinLines, pDC->drawId);
1304
1305     const API_STATE &state = GetApiState(pDC);
1306     const SWR_RASTSTATE &rastState = state.rastState;
1307
1308     // Select attribute processor
1309     PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(2,
1310         state.backendState.swizzleEnable, state.backendState.constantInterpolationMask);
1311
1312     typename SIMD_T::Float &vRecipW0 = recipW[0];
1313     typename SIMD_T::Float &vRecipW1 = recipW[1];
1314
1315     // convert to fixed point
1316     typename SIMD_T::Integer vXi[2], vYi[2];
1317
1318     vXi[0] = fpToFixedPointVertical<SIMD_T>(prim[0].x);
1319     vYi[0] = fpToFixedPointVertical<SIMD_T>(prim[0].y);
1320     vXi[1] = fpToFixedPointVertical<SIMD_T>(prim[1].x);
1321     vYi[1] = fpToFixedPointVertical<SIMD_T>(prim[1].y);
1322
1323     // compute x-major vs y-major mask
1324     typename SIMD_T::Integer xLength = SIMD_T::abs_epi32(SIMD_T::sub_epi32(vXi[0], vXi[1]));
1325     typename SIMD_T::Integer yLength = SIMD_T::abs_epi32(SIMD_T::sub_epi32(vYi[0], vYi[1]));
1326     typename SIMD_T::Float vYmajorMask = SIMD_T::castsi_ps(SIMD_T::cmpgt_epi32(yLength, xLength));
1327     uint32_t yMajorMask = SIMD_T::movemask_ps(vYmajorMask);
1328
1329     // cull zero-length lines
1330     typename SIMD_T::Integer vZeroLengthMask = SIMD_T::cmpeq_epi32(xLength, SIMD_T::setzero_si());
1331     vZeroLengthMask = SIMD_T::and_si(vZeroLengthMask, SIMD_T::cmpeq_epi32(yLength, SIMD_T::setzero_si()));
1332
1333     primMask &= ~SIMD_T::movemask_ps(SIMD_T::castsi_ps(vZeroLengthMask));
1334
1335     uint32_t *pPrimID = (uint32_t *)&primID;
1336     const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx;
1337
1338     // Calc bounding box of lines
1339     SIMDBBOX_T<SIMD_T> bbox;
1340     bbox.xmin = SIMD_T::min_epi32(vXi[0], vXi[1]);
1341     bbox.xmax = SIMD_T::max_epi32(vXi[0], vXi[1]);
1342     bbox.ymin = SIMD_T::min_epi32(vYi[0], vYi[1]);
1343     bbox.ymax = SIMD_T::max_epi32(vYi[0], vYi[1]);
1344
1345     // bloat bbox by line width along minor axis
1346     typename SIMD_T::Float vHalfWidth = SIMD_T::set1_ps(rastState.lineWidth / 2.0f);
1347     typename SIMD_T::Integer vHalfWidthi = fpToFixedPointVertical<SIMD_T>(vHalfWidth);
1348
1349     SIMDBBOX_T<SIMD_T> bloatBox;
1350
1351     bloatBox.xmin = SIMD_T::sub_epi32(bbox.xmin, vHalfWidthi);
1352     bloatBox.xmax = SIMD_T::add_epi32(bbox.xmax, vHalfWidthi);
1353     bloatBox.ymin = SIMD_T::sub_epi32(bbox.ymin, vHalfWidthi);
1354     bloatBox.ymax = SIMD_T::add_epi32(bbox.ymax, vHalfWidthi);
1355
1356     bbox.xmin = SIMD_T::blendv_epi32(bbox.xmin, bloatBox.xmin, vYmajorMask);
1357     bbox.xmax = SIMD_T::blendv_epi32(bbox.xmax, bloatBox.xmax, vYmajorMask);
1358     bbox.ymin = SIMD_T::blendv_epi32(bloatBox.ymin, bbox.ymin, vYmajorMask);
1359     bbox.ymax = SIMD_T::blendv_epi32(bloatBox.ymax, bbox.ymax, vYmajorMask);
1360
1361     // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
1362     {
1363         typename SIMD_T::Integer scisXmin, scisYmin, scisXmax, scisYmax;
1364
1365         if (state.backendState.readViewportArrayIndex)
1366         {
1367             GatherScissors(&state.scissorsInFixedPoint[0], pViewportIndex, scisXmin, scisYmin, scisXmax, scisYmax);
1368         }
1369         else // broadcast fast path for non-VPAI case.
1370         {
1371             scisXmin = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].xmin);
1372             scisYmin = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].ymin);
1373             scisXmax = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].xmax);
1374             scisYmax = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].ymax);
1375         }
1376
1377         bbox.xmin = SIMD_T::max_epi32(bbox.xmin, scisXmin);
1378         bbox.ymin = SIMD_T::max_epi32(bbox.ymin, scisYmin);
1379         bbox.xmax = SIMD_T::min_epi32(SIMD_T::sub_epi32(bbox.xmax, SIMD_T::set1_epi32(1)), scisXmax);
1380         bbox.ymax = SIMD_T::min_epi32(SIMD_T::sub_epi32(bbox.ymax, SIMD_T::set1_epi32(1)), scisYmax);
1381     }
1382
1383     // Cull prims completely outside scissor
1384     {
1385         typename SIMD_T::Integer maskOutsideScissorX = SIMD_T::cmpgt_epi32(bbox.xmin, bbox.xmax);
1386         typename SIMD_T::Integer maskOutsideScissorY = SIMD_T::cmpgt_epi32(bbox.ymin, bbox.ymax);
1387         typename SIMD_T::Integer maskOutsideScissorXY = SIMD_T::or_si(maskOutsideScissorX, maskOutsideScissorY);
1388         uint32_t maskOutsideScissor = SIMD_T::movemask_ps(SIMD_T::castsi_ps(maskOutsideScissorXY));
1389         primMask = primMask & ~maskOutsideScissor;
1390     }
1391
1392     // transpose verts needed for backend
1393     /// @todo modify BE to take non-transformed verts
1394     OSALIGNSIMD16(simd4scalar) vHorizX[SIMD_WIDTH];
1395     OSALIGNSIMD16(simd4scalar) vHorizY[SIMD_WIDTH];
1396     OSALIGNSIMD16(simd4scalar) vHorizZ[SIMD_WIDTH];
1397     OSALIGNSIMD16(simd4scalar) vHorizW[SIMD_WIDTH];
1398
1399     if (!primMask)
1400     {
1401         goto endBinLines;
1402     }
1403
1404     // Convert triangle bbox to macrotile units.
1405     bbox.xmin = SIMD_T::template srai_epi32<KNOB_MACROTILE_X_DIM_FIXED_SHIFT>(bbox.xmin);
1406     bbox.ymin = SIMD_T::template srai_epi32<KNOB_MACROTILE_Y_DIM_FIXED_SHIFT>(bbox.ymin);
1407     bbox.xmax = SIMD_T::template srai_epi32<KNOB_MACROTILE_X_DIM_FIXED_SHIFT>(bbox.xmax);
1408     bbox.ymax = SIMD_T::template srai_epi32<KNOB_MACROTILE_Y_DIM_FIXED_SHIFT>(bbox.ymax);
1409
1410     OSALIGNSIMD16(uint32_t) aMTLeft[SIMD_WIDTH], aMTRight[SIMD_WIDTH], aMTTop[SIMD_WIDTH], aMTBottom[SIMD_WIDTH];
1411
1412     SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTLeft),   bbox.xmin);
1413     SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTRight),  bbox.xmax);
1414     SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTTop),    bbox.ymin);
1415     SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTBottom), bbox.ymax);
1416
1417     TransposeVertices(vHorizX, prim[0].x, prim[1].x, SIMD_T::setzero_ps());
1418     TransposeVertices(vHorizY, prim[0].y, prim[1].y, SIMD_T::setzero_ps());
1419     TransposeVertices(vHorizZ, prim[0].z, prim[1].z, SIMD_T::setzero_ps());
1420     TransposeVertices(vHorizW, vRecipW0,  vRecipW1,  SIMD_T::setzero_ps());
1421
1422     // store render target array index
1423     OSALIGNSIMD16(uint32_t) aRTAI[SIMD_WIDTH];
1424     if (state.backendState.readRenderTargetArrayIndex)
1425     {
1426         typename SIMD_T::Vec4 vRtai[2];
1427         pa.Assemble(VERTEX_SGV_SLOT, vRtai);
1428         typename SIMD_T::Integer vRtaii = SIMD_T::castps_si(vRtai[0][VERTEX_SGV_RTAI_COMP]);
1429         SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aRTAI), vRtaii);
1430     }
1431     else
1432     {
1433         SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aRTAI), SIMD_T::setzero_si());
1434     }
1435
1436     // scan remaining valid prims and bin each separately
1437     DWORD primIndex;
1438     while (_BitScanForward(&primIndex, primMask))
1439     {
1440         uint32_t linkageCount = state.backendState.numAttributes;
1441         uint32_t numScalarAttribs = linkageCount * 4;
1442
1443         BE_WORK work;
1444         work.type = DRAW;
1445
1446         TRIANGLE_WORK_DESC &desc = work.desc.tri;
1447
1448         desc.triFlags.frontFacing = 1;
1449         desc.triFlags.yMajor = (yMajorMask >> primIndex) & 1;
1450         desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];
1451         desc.triFlags.viewportIndex = pViewportIndex[primIndex];
1452
1453         work.pfnWork = RasterizeLine;
1454
1455         auto pArena = pDC->pArena;
1456         SWR_ASSERT(pArena != nullptr);
1457
1458         // store active attribs
1459         desc.pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
1460         desc.numAttribs = linkageCount;
1461         pfnProcessAttribs(pDC, pa, primIndex, pPrimID[primIndex], desc.pAttribs);
1462
1463         // store line vertex data
1464         desc.pTriBuffer = (float*)pArena->AllocAligned(4 * 4 * sizeof(float), 16);
1465
1466         _mm_store_ps(&desc.pTriBuffer[0],  vHorizX[primIndex]);
1467         _mm_store_ps(&desc.pTriBuffer[4],  vHorizY[primIndex]);
1468         _mm_store_ps(&desc.pTriBuffer[8],  vHorizZ[primIndex]);
1469         _mm_store_ps(&desc.pTriBuffer[12], vHorizW[primIndex]);
1470
1471         // store user clip distances
1472         if (state.backendState.clipDistanceMask)
1473         {
1474             uint32_t numClipDist = _mm_popcnt_u32(state.backendState.clipDistanceMask);
1475             desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 2 * sizeof(float));
1476             ProcessUserClipDist<2>(state.backendState, pa, primIndex, &desc.pTriBuffer[12], desc.pUserClipBuffer);
1477         }
1478
1479         MacroTileMgr *pTileMgr = pDC->pTileMgr;
1480         for (uint32_t y = aMTTop[primIndex]; y <= aMTBottom[primIndex]; ++y)
1481         {
1482             for (uint32_t x = aMTLeft[primIndex]; x <= aMTRight[primIndex]; ++x)
1483             {
1484 #if KNOB_ENABLE_TOSS_POINTS
1485                 if (!KNOB_TOSS_SETUP_TRIS)
1486 #endif
1487                 {
1488                     pTileMgr->enqueue(x, y, &work);
1489                 }
1490             }
1491         }
1492
1493         primMask &= ~(1 << primIndex);
1494     }
1495
1496 endBinLines:
1497
1498     AR_END(FEBinLines, 1);
1499 }
1500
1501 //////////////////////////////////////////////////////////////////////////
1502 /// @brief Bin SIMD lines to the backend.
1503 /// @param pDC - pointer to draw context.
1504 /// @param pa - The primitive assembly object.
1505 /// @param workerId - thread's worker id. Even thread has a unique id.
1506 /// @param tri - Contains line position data for SIMDs worth of points.
1507 /// @param primID - Primitive ID for each line.
1508 /// @param viewportIdx - Viewport Array Index for each line.
1509 template <typename SIMD_T, uint32_t SIMD_WIDTH>
1510 void SIMDCALL BinLinesImpl(
1511     DRAW_CONTEXT *pDC,
1512     PA_STATE &pa,
1513     uint32_t workerId,
1514     typename SIMD_T::Vec4 prim[3],
1515     uint32_t primMask,
1516     typename SIMD_T::Integer const &primID)
1517 {
1518     const API_STATE& state = GetApiState(pDC);
1519     const SWR_RASTSTATE& rastState = state.rastState;
1520     const SWR_FRONTEND_STATE& feState = state.frontendState;
1521
1522     typename SIMD_T::Float vRecipW[2] = { SIMD_T::set1_ps(1.0f), SIMD_T::set1_ps(1.0f) };
1523
1524     typename SIMD_T::Integer viewportIdx = SIMD_T::setzero_si();
1525     typename SIMD_T::Vec4 vpiAttrib[2];
1526     typename SIMD_T::Integer vpai = SIMD_T::setzero_si();
1527
1528     if (state.backendState.readViewportArrayIndex)
1529     {
1530         pa.Assemble(VERTEX_SGV_SLOT, vpiAttrib);
1531         vpai = SIMD_T::castps_si(vpiAttrib[0][VERTEX_SGV_VAI_COMP]);
1532     }
1533
1534
1535     if (state.backendState.readViewportArrayIndex) // VPAIOffsets are guaranteed 0-15 -- no OOB issues if they are offsets from 0
1536     {
1537         // OOB indices => forced to zero.
1538         vpai = SIMD_T::max_epi32(vpai, SIMD_T::setzero_si());
1539         typename SIMD_T::Integer vNumViewports = SIMD_T::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
1540         typename SIMD_T::Integer vClearMask = SIMD_T::cmplt_epi32(vpai, vNumViewports);
1541         viewportIdx = SIMD_T::and_si(vClearMask, vpai);
1542     }
1543
1544     if (!feState.vpTransformDisable)
1545     {
1546         // perspective divide
1547         vRecipW[0] = SIMD_T::div_ps(SIMD_T::set1_ps(1.0f), prim[0].w);
1548         vRecipW[1] = SIMD_T::div_ps(SIMD_T::set1_ps(1.0f), prim[1].w);
1549
1550         prim[0].v[0] = SIMD_T::mul_ps(prim[0].v[0], vRecipW[0]);
1551         prim[1].v[0] = SIMD_T::mul_ps(prim[1].v[0], vRecipW[1]);
1552
1553         prim[0].v[1] = SIMD_T::mul_ps(prim[0].v[1], vRecipW[0]);
1554         prim[1].v[1] = SIMD_T::mul_ps(prim[1].v[1], vRecipW[1]);
1555
1556         prim[0].v[2] = SIMD_T::mul_ps(prim[0].v[2], vRecipW[0]);
1557         prim[1].v[2] = SIMD_T::mul_ps(prim[1].v[2], vRecipW[1]);
1558
1559         // viewport transform to screen coords
1560         if (state.backendState.readViewportArrayIndex)
1561         {
1562             viewportTransform<2>(prim, state.vpMatrices, viewportIdx);
1563         }
1564         else
1565         {
1566             viewportTransform<2>(prim, state.vpMatrices);
1567         }
1568     }
1569
1570     // adjust for pixel center location
1571     typename SIMD_T::Float offset = SwrPixelOffsets<SIMD_T>::GetOffset(rastState.pixelLocation);
1572
1573     prim[0].x = SIMD_T::add_ps(prim[0].x, offset);
1574     prim[0].y = SIMD_T::add_ps(prim[0].y, offset);
1575
1576     prim[1].x = SIMD_T::add_ps(prim[1].x, offset);
1577     prim[1].y = SIMD_T::add_ps(prim[1].y, offset);
1578
1579     BinPostSetupLinesImpl<SIMD_T, SIMD_WIDTH>(
1580         pDC,
1581         pa,
1582         workerId,
1583         prim,
1584         vRecipW,
1585         primMask,
1586         primID,
1587         viewportIdx);
1588 }
1589
1590 void BinLines(
1591     DRAW_CONTEXT *pDC,
1592     PA_STATE &pa,
1593     uint32_t workerId,
1594     simdvector prim[],
1595     uint32_t primMask,
1596     simdscalari const &primID)
1597 {
1598     BinLinesImpl<SIMD256, KNOB_SIMD_WIDTH>(pDC, pa, workerId, prim, primMask, primID);
1599 }
1600
1601 #if USE_SIMD16_FRONTEND
1602 void SIMDCALL BinLines_simd16(
1603     DRAW_CONTEXT *pDC,
1604     PA_STATE &pa,
1605     uint32_t workerId,
1606     simd16vector prim[3],
1607     uint32_t primMask,
1608     simd16scalari const &primID)
1609 {
1610     BinLinesImpl<SIMD512, KNOB_SIMD16_WIDTH>(pDC, pa, workerId, prim, primMask, primID);
1611 }
1612
1613 #endif