src/gallium/drivers/swr/rasterizer/core/binner.cpp

   1 /****************************************************************************
   2 * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
   3 *
   4 * Permission is hereby granted, free of charge, to any person obtaining a
   5 * copy of this software and associated documentation files (the "Software"),
   6 * to deal in the Software without restriction, including without limitation
   7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8 * and/or sell copies of the Software, and to permit persons to whom the
   9 * Software is furnished to do so, subject to the following conditions:
  10 *
  11 * The above copyright notice and this permission notice (including the next
  12 * paragraph) shall be included in all copies or substantial portions of the
  13 * Software.
  14 *
  15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21 * IN THE SOFTWARE.
  22 *
  23 * @file binner.cpp
  24 *
  25 * @brief Implementation for the macrotile binner
  26 *
  27 ******************************************************************************/
  28
#include "binner.h"
#include "context.h"
#include "frontend.h"
#include "conservativeRast.h"
#include "pa.h"
#include "rasterizer.h"
#include "rdtsc_core.h"
#include "tilemgr.h"

#include <cstring>
  37
  38 // Function Prototype
  39 template <typename SIMD_T, uint32_t SIMD_WIDTH>
  40 void BinPostSetupLinesImpl(
  41     DRAW_CONTEXT *pDC,
  42     PA_STATE &pa,
  43     uint32_t workerId,
  44     typename SIMD_T::Vec4 prim[],
  45     typename SIMD_T::Float recipW[],
  46     uint32_t primMask,
  47     typename SIMD_T::Integer const &primID,
  48     typename SIMD_T::Integer const &viewportIdx);
  49
  50 template <typename SIMD_T, uint32_t SIMD_WIDTH>
  51 void BinPostSetupPointsImpl(
  52     DRAW_CONTEXT *pDC,
  53     PA_STATE &pa,
  54     uint32_t workerId,
  55     typename SIMD_T::Vec4 prim[],
  56     uint32_t primMask,
  57     typename SIMD_T::Integer const &primID,
  58     typename SIMD_T::Integer const &viewportIdx);
  59
  60 //////////////////////////////////////////////////////////////////////////
  61 /// @brief Processes attributes for the backend based on linkage mask and
  62 ///        linkage map.  Essentially just doing an SOA->AOS conversion and pack.
  63 /// @param pDC - Draw context
  64 /// @param pa - Primitive Assembly state
  65 /// @param linkageMask - Specifies which VS outputs are routed to PS.
  66 /// @param pLinkageMap - maps VS attribute slot to PS slot
  67 /// @param triIndex - Triangle to process attributes for
  68 /// @param pBuffer - Output result
  69 template<typename NumVertsT, typename IsSwizzledT, typename HasConstantInterpT, typename IsDegenerate>
  70 INLINE void ProcessAttributes(
  71     DRAW_CONTEXT *pDC,
  72     PA_STATE&pa,
  73     uint32_t triIndex,
  74     uint32_t primId,
  75     float *pBuffer)
  76 {
  77     static_assert(NumVertsT::value > 0 && NumVertsT::value <= 3, "Invalid value for NumVertsT");
  78     const SWR_BACKEND_STATE& backendState = pDC->pState->state.backendState;
  79     // Conservative Rasterization requires degenerate tris to have constant attribute interpolation
  80     uint32_t constantInterpMask = IsDegenerate::value ? 0xFFFFFFFF : backendState.constantInterpolationMask;
  81     const uint32_t provokingVertex = pDC->pState->state.frontendState.topologyProvokingVertex;
  82     const PRIMITIVE_TOPOLOGY topo = pDC->pState->state.topology;
  83
  84     static const float constTable[3][4] = {
  85         { 0.0f, 0.0f, 0.0f, 0.0f },
  86         { 0.0f, 0.0f, 0.0f, 1.0f },
  87         { 1.0f, 1.0f, 1.0f, 1.0f }
  88     };
  89
  90     for (uint32_t i = 0; i < backendState.numAttributes; ++i)
  91     {
  92         uint32_t inputSlot;
  93         if (IsSwizzledT::value)
  94         {
  95             SWR_ATTRIB_SWIZZLE attribSwizzle = backendState.swizzleMap[i];
  96             inputSlot = backendState.vertexAttribOffset + attribSwizzle.sourceAttrib;
  97
  98         }
  99         else
 100         {
 101             inputSlot = backendState.vertexAttribOffset + i;
 102         }
 103
 104         simd4scalar attrib[3];    // triangle attribs (always 4 wide)
 105         float* pAttribStart = pBuffer;
 106
 107         if (HasConstantInterpT::value || IsDegenerate::value)
 108         {
 109             if (CheckBit(constantInterpMask, i))
 110             {
 111                 uint32_t vid;
 112                 uint32_t adjustedTriIndex;
 113                 static const uint32_t tristripProvokingVertex[] = { 0, 2, 1 };
 114                 static const int32_t quadProvokingTri[2][4] = { { 0, 0, 0, 1 },{ 0, -1, 0, 0 } };
 115                 static const uint32_t quadProvokingVertex[2][4] = { { 0, 1, 2, 2 },{ 0, 1, 1, 2 } };
 116                 static const int32_t qstripProvokingTri[2][4] = { { 0, 0, 0, 1 },{ -1, 0, 0, 0 } };
 117                 static const uint32_t qstripProvokingVertex[2][4] = { { 0, 1, 2, 1 },{ 0, 0, 2, 1 } };
 118
 119                 switch (topo) {
 120                 case TOP_QUAD_LIST:
 121                     adjustedTriIndex = triIndex + quadProvokingTri[triIndex & 1][provokingVertex];
 122                     vid = quadProvokingVertex[triIndex & 1][provokingVertex];
 123                     break;
 124                 case TOP_QUAD_STRIP:
 125                     adjustedTriIndex = triIndex + qstripProvokingTri[triIndex & 1][provokingVertex];
 126                     vid = qstripProvokingVertex[triIndex & 1][provokingVertex];
 127                     break;
 128                 case TOP_TRIANGLE_STRIP:
 129                     adjustedTriIndex = triIndex;
 130                     vid = (triIndex & 1)
 131                         ? tristripProvokingVertex[provokingVertex]
 132                         : provokingVertex;
 133                     break;
 134                 default:
 135                     adjustedTriIndex = triIndex;
 136                     vid = provokingVertex;
 137                     break;
 138                 }
 139
 140                 pa.AssembleSingle(inputSlot, adjustedTriIndex, attrib);
 141
 142                 for (uint32_t i = 0; i < NumVertsT::value; ++i)
 143                 {
 144                     SIMD128::store_ps(pBuffer, attrib[vid]);
 145                     pBuffer += 4;
 146                 }
 147             }
 148             else
 149             {
 150                 pa.AssembleSingle(inputSlot, triIndex, attrib);
 151
 152                 for (uint32_t i = 0; i < NumVertsT::value; ++i)
 153                 {
 154                     SIMD128::store_ps(pBuffer, attrib[i]);
 155                     pBuffer += 4;
 156                 }
 157             }
 158         }
 159         else
 160         {
 161             pa.AssembleSingle(inputSlot, triIndex, attrib);
 162
 163             for (uint32_t i = 0; i < NumVertsT::value; ++i)
 164             {
 165                 SIMD128::store_ps(pBuffer, attrib[i]);
 166                 pBuffer += 4;
 167             }
 168         }
 169
 170         // pad out the attrib buffer to 3 verts to ensure the triangle
 171         // interpolation code in the pixel shader works correctly for the
 172         // 3 topologies - point, line, tri.  This effectively zeros out the
 173         // effect of the missing vertices in the triangle interpolation.
 174         for (uint32_t v = NumVertsT::value; v < 3; ++v)
 175         {
 176             SIMD128::store_ps(pBuffer, attrib[NumVertsT::value - 1]);
 177             pBuffer += 4;
 178         }
 179
 180         // check for constant source overrides
 181         if (IsSwizzledT::value)
 182         {
 183             uint32_t mask = backendState.swizzleMap[i].componentOverrideMask;
 184             if (mask)
 185             {
 186                 DWORD comp;
 187                 while (_BitScanForward(&comp, mask))
 188                 {
 189                     mask &= ~(1 << comp);
 190
 191                     float constantValue = 0.0f;
 192                     switch ((SWR_CONSTANT_SOURCE)backendState.swizzleMap[i].constantSource)
 193                     {
 194                     case SWR_CONSTANT_SOURCE_CONST_0000:
 195                     case SWR_CONSTANT_SOURCE_CONST_0001_FLOAT:
 196                     case SWR_CONSTANT_SOURCE_CONST_1111_FLOAT:
 197                         constantValue = constTable[backendState.swizzleMap[i].constantSource][comp];
 198                         break;
 199                     case SWR_CONSTANT_SOURCE_PRIM_ID:
 200                         constantValue = *(float*)&primId;
 201                         break;
 202                     }
 203
 204                     // apply constant value to all 3 vertices
 205                     for (uint32_t v = 0; v < 3; ++v)
 206                     {
 207                         pAttribStart[comp + v * 4] = constantValue;
 208                     }
 209                 }
 210             }
 211         }
 212     }
 213 }
 214
 215 //////////////////////////////////////////////////////////////////////////
 216 /// @brief  Gather scissor rect data based on per-prim viewport indices.
 217 /// @param pScissorsInFixedPoint - array of scissor rects in 16.8 fixed point.
 218 /// @param pViewportIndex - array of per-primitive vewport indexes.
 219 /// @param scisXmin - output vector of per-prmitive scissor rect Xmin data.
 220 /// @param scisYmin - output vector of per-prmitive scissor rect Ymin data.
 221 /// @param scisXmax - output vector of per-prmitive scissor rect Xmax data.
 222 /// @param scisYmax - output vector of per-prmitive scissor rect Ymax data.
 223 //
 224 /// @todo:  Look at speeding this up -- weigh against corresponding costs in rasterizer.
 225 static void GatherScissors(const SWR_RECT *pScissorsInFixedPoint, const uint32_t *pViewportIndex,
 226     simdscalari &scisXmin, simdscalari &scisYmin, simdscalari &scisXmax, simdscalari &scisYmax)
 227 {
 228     scisXmin = _simd_set_epi32(
 229         pScissorsInFixedPoint[pViewportIndex[0]].xmin,
 230         pScissorsInFixedPoint[pViewportIndex[1]].xmin,
 231         pScissorsInFixedPoint[pViewportIndex[2]].xmin,
 232         pScissorsInFixedPoint[pViewportIndex[3]].xmin,
 233         pScissorsInFixedPoint[pViewportIndex[4]].xmin,
 234         pScissorsInFixedPoint[pViewportIndex[5]].xmin,
 235         pScissorsInFixedPoint[pViewportIndex[6]].xmin,
 236         pScissorsInFixedPoint[pViewportIndex[7]].xmin);
 237     scisYmin = _simd_set_epi32(
 238         pScissorsInFixedPoint[pViewportIndex[0]].ymin,
 239         pScissorsInFixedPoint[pViewportIndex[1]].ymin,
 240         pScissorsInFixedPoint[pViewportIndex[2]].ymin,
 241         pScissorsInFixedPoint[pViewportIndex[3]].ymin,
 242         pScissorsInFixedPoint[pViewportIndex[4]].ymin,
 243         pScissorsInFixedPoint[pViewportIndex[5]].ymin,
 244         pScissorsInFixedPoint[pViewportIndex[6]].ymin,
 245         pScissorsInFixedPoint[pViewportIndex[7]].ymin);
 246     scisXmax = _simd_set_epi32(
 247         pScissorsInFixedPoint[pViewportIndex[0]].xmax,
 248         pScissorsInFixedPoint[pViewportIndex[1]].xmax,
 249         pScissorsInFixedPoint[pViewportIndex[2]].xmax,
 250         pScissorsInFixedPoint[pViewportIndex[3]].xmax,
 251         pScissorsInFixedPoint[pViewportIndex[4]].xmax,
 252         pScissorsInFixedPoint[pViewportIndex[5]].xmax,
 253         pScissorsInFixedPoint[pViewportIndex[6]].xmax,
 254         pScissorsInFixedPoint[pViewportIndex[7]].xmax);
 255     scisYmax = _simd_set_epi32(
 256         pScissorsInFixedPoint[pViewportIndex[0]].ymax,
 257         pScissorsInFixedPoint[pViewportIndex[1]].ymax,
 258         pScissorsInFixedPoint[pViewportIndex[2]].ymax,
 259         pScissorsInFixedPoint[pViewportIndex[3]].ymax,
 260         pScissorsInFixedPoint[pViewportIndex[4]].ymax,
 261         pScissorsInFixedPoint[pViewportIndex[5]].ymax,
 262         pScissorsInFixedPoint[pViewportIndex[6]].ymax,
 263         pScissorsInFixedPoint[pViewportIndex[7]].ymax);
 264 }
 265
 266 static void GatherScissors(const SWR_RECT *pScissorsInFixedPoint, const uint32_t *pViewportIndex,
 267     simd16scalari &scisXmin, simd16scalari &scisYmin, simd16scalari &scisXmax, simd16scalari &scisYmax)
 268 {
 269     scisXmin = _simd16_set_epi32(
 270         pScissorsInFixedPoint[pViewportIndex[0]].xmin,
 271         pScissorsInFixedPoint[pViewportIndex[1]].xmin,
 272         pScissorsInFixedPoint[pViewportIndex[2]].xmin,
 273         pScissorsInFixedPoint[pViewportIndex[3]].xmin,
 274         pScissorsInFixedPoint[pViewportIndex[4]].xmin,
 275         pScissorsInFixedPoint[pViewportIndex[5]].xmin,
 276         pScissorsInFixedPoint[pViewportIndex[6]].xmin,
 277         pScissorsInFixedPoint[pViewportIndex[7]].xmin,
 278         pScissorsInFixedPoint[pViewportIndex[8]].xmin,
 279         pScissorsInFixedPoint[pViewportIndex[9]].xmin,
 280         pScissorsInFixedPoint[pViewportIndex[10]].xmin,
 281         pScissorsInFixedPoint[pViewportIndex[11]].xmin,
 282         pScissorsInFixedPoint[pViewportIndex[12]].xmin,
 283         pScissorsInFixedPoint[pViewportIndex[13]].xmin,
 284         pScissorsInFixedPoint[pViewportIndex[14]].xmin,
 285         pScissorsInFixedPoint[pViewportIndex[15]].xmin);
 286
 287     scisYmin = _simd16_set_epi32(
 288         pScissorsInFixedPoint[pViewportIndex[0]].ymin,
 289         pScissorsInFixedPoint[pViewportIndex[1]].ymin,
 290         pScissorsInFixedPoint[pViewportIndex[2]].ymin,
 291         pScissorsInFixedPoint[pViewportIndex[3]].ymin,
 292         pScissorsInFixedPoint[pViewportIndex[4]].ymin,
 293         pScissorsInFixedPoint[pViewportIndex[5]].ymin,
 294         pScissorsInFixedPoint[pViewportIndex[6]].ymin,
 295         pScissorsInFixedPoint[pViewportIndex[7]].ymin,
 296         pScissorsInFixedPoint[pViewportIndex[8]].ymin,
 297         pScissorsInFixedPoint[pViewportIndex[9]].ymin,
 298         pScissorsInFixedPoint[pViewportIndex[10]].ymin,
 299         pScissorsInFixedPoint[pViewportIndex[11]].ymin,
 300         pScissorsInFixedPoint[pViewportIndex[12]].ymin,
 301         pScissorsInFixedPoint[pViewportIndex[13]].ymin,
 302         pScissorsInFixedPoint[pViewportIndex[14]].ymin,
 303         pScissorsInFixedPoint[pViewportIndex[15]].ymin);
 304
 305     scisXmax = _simd16_set_epi32(
 306         pScissorsInFixedPoint[pViewportIndex[0]].xmax,
 307         pScissorsInFixedPoint[pViewportIndex[1]].xmax,
 308         pScissorsInFixedPoint[pViewportIndex[2]].xmax,
 309         pScissorsInFixedPoint[pViewportIndex[3]].xmax,
 310         pScissorsInFixedPoint[pViewportIndex[4]].xmax,
 311         pScissorsInFixedPoint[pViewportIndex[5]].xmax,
 312         pScissorsInFixedPoint[pViewportIndex[6]].xmax,
 313         pScissorsInFixedPoint[pViewportIndex[7]].xmax,
 314         pScissorsInFixedPoint[pViewportIndex[8]].xmax,
 315         pScissorsInFixedPoint[pViewportIndex[9]].xmax,
 316         pScissorsInFixedPoint[pViewportIndex[10]].xmax,
 317         pScissorsInFixedPoint[pViewportIndex[11]].xmax,
 318         pScissorsInFixedPoint[pViewportIndex[12]].xmax,
 319         pScissorsInFixedPoint[pViewportIndex[13]].xmax,
 320         pScissorsInFixedPoint[pViewportIndex[14]].xmax,
 321         pScissorsInFixedPoint[pViewportIndex[15]].xmax);
 322
 323     scisYmax = _simd16_set_epi32(
 324         pScissorsInFixedPoint[pViewportIndex[0]].ymax,
 325         pScissorsInFixedPoint[pViewportIndex[1]].ymax,
 326         pScissorsInFixedPoint[pViewportIndex[2]].ymax,
 327         pScissorsInFixedPoint[pViewportIndex[3]].ymax,
 328         pScissorsInFixedPoint[pViewportIndex[4]].ymax,
 329         pScissorsInFixedPoint[pViewportIndex[5]].ymax,
 330         pScissorsInFixedPoint[pViewportIndex[6]].ymax,
 331         pScissorsInFixedPoint[pViewportIndex[7]].ymax,
 332         pScissorsInFixedPoint[pViewportIndex[8]].ymax,
 333         pScissorsInFixedPoint[pViewportIndex[9]].ymax,
 334         pScissorsInFixedPoint[pViewportIndex[10]].ymax,
 335         pScissorsInFixedPoint[pViewportIndex[11]].ymax,
 336         pScissorsInFixedPoint[pViewportIndex[12]].ymax,
 337         pScissorsInFixedPoint[pViewportIndex[13]].ymax,
 338         pScissorsInFixedPoint[pViewportIndex[14]].ymax,
 339         pScissorsInFixedPoint[pViewportIndex[15]].ymax);
 340 }
 341
 342 typedef void(*PFN_PROCESS_ATTRIBUTES)(DRAW_CONTEXT*, PA_STATE&, uint32_t, uint32_t, float*);
 343
 344 struct ProcessAttributesChooser
 345 {
 346     typedef PFN_PROCESS_ATTRIBUTES FuncType;
 347
 348     template <typename... ArgsB>
 349     static FuncType GetFunc()
 350     {
 351         return ProcessAttributes<ArgsB...>;
 352     }
 353 };
 354
 355 PFN_PROCESS_ATTRIBUTES GetProcessAttributesFunc(uint32_t NumVerts, bool IsSwizzled, bool HasConstantInterp, bool IsDegenerate = false)
 356 {
 357     return TemplateArgUnroller<ProcessAttributesChooser>::GetFunc(IntArg<1, 3>{NumVerts}, IsSwizzled, HasConstantInterp, IsDegenerate);
 358 }
 359
 360 //////////////////////////////////////////////////////////////////////////
 361 /// @brief Processes enabled user clip distances. Loads the active clip
 362 ///        distances from the PA, sets up barycentric equations, and
 363 ///        stores the results to the output buffer
 364 /// @param pa - Primitive Assembly state
 365 /// @param primIndex - primitive index to process
 366 /// @param clipDistMask - mask of enabled clip distances
 367 /// @param pUserClipBuffer - buffer to store results
 368 template<uint32_t NumVerts>
 369 void ProcessUserClipDist(const SWR_BACKEND_STATE& state, PA_STATE& pa, uint32_t primIndex, float *pRecipW, float* pUserClipBuffer)
 370 {
 371     DWORD clipDist;
 372     uint32_t clipDistMask = state.clipDistanceMask;
 373     while (_BitScanForward(&clipDist, clipDistMask))
 374     {
 375         clipDistMask &= ~(1 << clipDist);
 376         uint32_t clipSlot = clipDist >> 2;
 377         uint32_t clipComp = clipDist & 0x3;
 378         uint32_t clipAttribSlot = clipSlot == 0 ?
 379             state.vertexClipCullOffset : state.vertexClipCullOffset + 1;
 380
 381         simd4scalar primClipDist[3];
 382         pa.AssembleSingle(clipAttribSlot, primIndex, primClipDist);
 383
 384         float vertClipDist[NumVerts];
 385         for (uint32_t e = 0; e < NumVerts; ++e)
 386         {
 387             OSALIGNSIMD(float) aVertClipDist[4];
 388             SIMD128::store_ps(aVertClipDist, primClipDist[e]);
 389             vertClipDist[e] = aVertClipDist[clipComp];
 390         };
 391
 392         // setup plane equations for barycentric interpolation in the backend
 393         float baryCoeff[NumVerts];
 394         float last = vertClipDist[NumVerts - 1] * pRecipW[NumVerts - 1];
 395         for (uint32_t e = 0; e < NumVerts - 1; ++e)
 396         {
 397             baryCoeff[e] = vertClipDist[e] * pRecipW[e] - last;
 398         }
 399         baryCoeff[NumVerts - 1] = last;
 400
 401         for (uint32_t e = 0; e < NumVerts; ++e)
 402         {
 403             *(pUserClipBuffer++) = baryCoeff[e];
 404         }
 405     }
 406 }
 407
 408 INLINE
 409 void TransposeVertices(simd4scalar(&dst)[8], const simdscalar &src0, const simdscalar &src1, const simdscalar &src2)
 410 {
 411     vTranspose3x8(dst, src0, src1, src2);
 412 }
 413
 414 INLINE
 415 void TransposeVertices(simd4scalar(&dst)[16], const simd16scalar &src0, const simd16scalar &src1, const simd16scalar &src2)
 416 {
 417     vTranspose4x16(reinterpret_cast<simd16scalar(&)[4]>(dst), src0, src1, src2, _simd16_setzero_ps());
 418 }
 419
 420 //////////////////////////////////////////////////////////////////////////
 421 /// @brief Bin triangle primitives to macro tiles. Performs setup, clipping
 422 ///        culling, viewport transform, etc.
 423 /// @param pDC - pointer to draw context.
 424 /// @param pa - The primitive assembly object.
/// @param workerId - thread's worker id. Every thread has a unique id.
 426 /// @param tri - Contains triangle position data for SIMDs worth of triangles.
 427 /// @param primID - Primitive ID for each triangle.
 428 /// @param viewportIdx - viewport array index for each triangle.
 429 /// @tparam CT - ConservativeRastFETraits
 430 template <typename SIMD_T, uint32_t SIMD_WIDTH, typename CT>
 431 void SIMDCALL BinTrianglesImpl(
 432     DRAW_CONTEXT *pDC,
 433     PA_STATE &pa,
 434     uint32_t workerId,
 435     typename SIMD_T::Vec4 tri[3],
 436     uint32_t triMask,
 437     typename SIMD_T::Integer const &primID)
 438 {
 439     SWR_CONTEXT *pContext = pDC->pContext;
 440
 441     AR_BEGIN(FEBinTriangles, pDC->drawId);
 442
 443     const API_STATE& state = GetApiState(pDC);
 444     const SWR_RASTSTATE& rastState = state.rastState;
 445     const SWR_FRONTEND_STATE& feState = state.frontendState;
 446
 447     MacroTileMgr *pTileMgr = pDC->pTileMgr;
 448
 449     typename SIMD_T::Float vRecipW0 = SIMD_T::set1_ps(1.0f);
 450     typename SIMD_T::Float vRecipW1 = SIMD_T::set1_ps(1.0f);
 451     typename SIMD_T::Float vRecipW2 = SIMD_T::set1_ps(1.0f);
 452
 453     typename SIMD_T::Integer viewportIdx = SIMD_T::set1_epi32(0);
 454
 455     if (state.backendState.readViewportArrayIndex)
 456     {
 457         typename SIMD_T::Vec4 vpiAttrib[3];
 458         pa.Assemble(VERTEX_SGV_SLOT, vpiAttrib);
 459
 460         // OOB indices => forced to zero.
 461         typename SIMD_T::Integer vpai = SIMD_T::castps_si(vpiAttrib[0][VERTEX_SGV_VAI_COMP]);
 462         vpai = SIMD_T::max_epi32(SIMD_T::setzero_si(), vpai);
 463         typename SIMD_T::Integer vNumViewports = SIMD_T::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
 464         typename SIMD_T::Integer vClearMask = SIMD_T::cmplt_epi32(vpai, vNumViewports);
 465         viewportIdx = SIMD_T::and_si(vClearMask, vpai);
 466     }
 467
 468     if (feState.vpTransformDisable)
 469     {
 470         // RHW is passed in directly when VP transform is disabled
 471         vRecipW0 = tri[0].v[3];
 472         vRecipW1 = tri[1].v[3];
 473         vRecipW2 = tri[2].v[3];
 474     }
 475     else
 476     {
 477         // Perspective divide
 478         vRecipW0 = SIMD_T::div_ps(SIMD_T::set1_ps(1.0f), tri[0].w);
 479         vRecipW1 = SIMD_T::div_ps(SIMD_T::set1_ps(1.0f), tri[1].w);
 480         vRecipW2 = SIMD_T::div_ps(SIMD_T::set1_ps(1.0f), tri[2].w);
 481
 482         tri[0].v[0] = SIMD_T::mul_ps(tri[0].v[0], vRecipW0);
 483         tri[1].v[0] = SIMD_T::mul_ps(tri[1].v[0], vRecipW1);
 484         tri[2].v[0] = SIMD_T::mul_ps(tri[2].v[0], vRecipW2);
 485
 486         tri[0].v[1] = SIMD_T::mul_ps(tri[0].v[1], vRecipW0);
 487         tri[1].v[1] = SIMD_T::mul_ps(tri[1].v[1], vRecipW1);
 488         tri[2].v[1] = SIMD_T::mul_ps(tri[2].v[1], vRecipW2);
 489
 490         tri[0].v[2] = SIMD_T::mul_ps(tri[0].v[2], vRecipW0);
 491         tri[1].v[2] = SIMD_T::mul_ps(tri[1].v[2], vRecipW1);
 492         tri[2].v[2] = SIMD_T::mul_ps(tri[2].v[2], vRecipW2);
 493
 494         // Viewport transform to screen space coords
 495         if (state.backendState.readViewportArrayIndex)
 496         {
 497             viewportTransform<3>(tri, state.vpMatrices, viewportIdx);
 498         }
 499         else
 500         {
 501             viewportTransform<3>(tri, state.vpMatrices);
 502         }
 503     }
 504
 505     // Adjust for pixel center location
 506     typename SIMD_T::Float offset = SwrPixelOffsets<SIMD_T>::GetOffset(rastState.pixelLocation);
 507
 508     tri[0].x = SIMD_T::add_ps(tri[0].x, offset);
 509     tri[0].y = SIMD_T::add_ps(tri[0].y, offset);
 510
 511     tri[1].x = SIMD_T::add_ps(tri[1].x, offset);
 512     tri[1].y = SIMD_T::add_ps(tri[1].y, offset);
 513
 514     tri[2].x = SIMD_T::add_ps(tri[2].x, offset);
 515     tri[2].y = SIMD_T::add_ps(tri[2].y, offset);
 516
 517     // Set vXi, vYi to required fixed point precision
 518     typename SIMD_T::Integer vXi[3], vYi[3];
 519     FPToFixedPoint<SIMD_T>(tri, vXi, vYi);
 520
 521     // triangle setup
 522     typename SIMD_T::Integer vAi[3], vBi[3];
 523     triangleSetupABIntVertical(vXi, vYi, vAi, vBi);
 524
 525     // determinant
 526     typename SIMD_T::Integer vDet[2];
 527     calcDeterminantIntVertical(vAi, vBi, vDet);
 528
 529     // cull zero area
 530     uint32_t maskLo = SIMD_T::movemask_pd(SIMD_T::castsi_pd(SIMD_T::cmpeq_epi64(vDet[0], SIMD_T::setzero_si())));
 531     uint32_t maskHi = SIMD_T::movemask_pd(SIMD_T::castsi_pd(SIMD_T::cmpeq_epi64(vDet[1], SIMD_T::setzero_si())));
 532
 533     uint32_t cullZeroAreaMask = maskLo | (maskHi << (SIMD_WIDTH / 2));
 534
 535     // don't cull degenerate triangles if we're conservatively rasterizing
 536     uint32_t origTriMask = triMask;
 537     if (rastState.fillMode == SWR_FILLMODE_SOLID && !CT::IsConservativeT::value)
 538     {
 539         triMask &= ~cullZeroAreaMask;
 540     }
 541
 542     // determine front winding tris
 543     // CW  +det
 544     // CCW det < 0;
 545     // 0 area triangles are marked as backfacing regardless of winding order,
 546     // which is required behavior for conservative rast and wireframe rendering
 547     uint32_t frontWindingTris;
 548     if (rastState.frontWinding == SWR_FRONTWINDING_CW)
 549     {
 550         maskLo = SIMD_T::movemask_pd(SIMD_T::castsi_pd(SIMD_T::cmpgt_epi64(vDet[0], SIMD_T::setzero_si())));
 551         maskHi = SIMD_T::movemask_pd(SIMD_T::castsi_pd(SIMD_T::cmpgt_epi64(vDet[1], SIMD_T::setzero_si())));
 552     }
 553     else
 554     {
 555         maskLo = SIMD_T::movemask_pd(SIMD_T::castsi_pd(SIMD_T::cmpgt_epi64(SIMD_T::setzero_si(), vDet[0])));
 556         maskHi = SIMD_T::movemask_pd(SIMD_T::castsi_pd(SIMD_T::cmpgt_epi64(SIMD_T::setzero_si(), vDet[1])));
 557     }
 558     frontWindingTris = maskLo | (maskHi << (SIMD_WIDTH / 2));
 559
 560     // cull
 561     uint32_t cullTris;
 562     switch ((SWR_CULLMODE)rastState.cullMode)
 563     {
 564     case SWR_CULLMODE_BOTH:  cullTris = 0xffffffff; break;
 565     case SWR_CULLMODE_NONE:  cullTris = 0x0; break;
 566     case SWR_CULLMODE_FRONT: cullTris = frontWindingTris; break;
 567         // 0 area triangles are marked as backfacing, which is required behavior for conservative rast
 568     case SWR_CULLMODE_BACK:  cullTris = ~frontWindingTris; break;
 569     default: SWR_INVALID("Invalid cull mode: %d", rastState.cullMode); cullTris = 0x0; break;
 570     }
 571
 572     triMask &= ~cullTris;
 573
 574     if (origTriMask ^ triMask)
 575     {
 576         RDTSC_EVENT(FECullZeroAreaAndBackface, _mm_popcnt_u32(origTriMask ^ triMask), 0);
 577     }
 578
 579     /// Note: these variable initializations must stay above any 'goto endBenTriangles'
 580     // compute per tri backface
 581     uint32_t frontFaceMask = frontWindingTris;
 582     uint32_t *pPrimID = (uint32_t *)&primID;
 583     const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx;
 584     DWORD triIndex = 0;
 585
 586     uint32_t edgeEnable;
 587     PFN_WORK_FUNC pfnWork;
 588     if (CT::IsConservativeT::value)
 589     {
 590         // determine which edges of the degenerate tri, if any, are valid to rasterize.
 591         // used to call the appropriate templated rasterizer function
 592         if (cullZeroAreaMask > 0)
 593         {
 594             // e0 = v1-v0
 595             const typename SIMD_T::Integer x0x1Mask = SIMD_T::cmpeq_epi32(vXi[0], vXi[1]);
 596             const typename SIMD_T::Integer y0y1Mask = SIMD_T::cmpeq_epi32(vYi[0], vYi[1]);
 597
 598             uint32_t e0Mask = SIMD_T::movemask_ps(SIMD_T::castsi_ps(SIMD_T::and_si(x0x1Mask, y0y1Mask)));
 599
 600             // e1 = v2-v1
 601             const typename SIMD_T::Integer x1x2Mask = SIMD_T::cmpeq_epi32(vXi[1], vXi[2]);
 602             const typename SIMD_T::Integer y1y2Mask = SIMD_T::cmpeq_epi32(vYi[1], vYi[2]);
 603
 604             uint32_t e1Mask = SIMD_T::movemask_ps(SIMD_T::castsi_ps(SIMD_T::and_si(x1x2Mask, y1y2Mask)));
 605
 606             // e2 = v0-v2
 607             // if v0 == v1 & v1 == v2, v0 == v2
 608             uint32_t e2Mask = e0Mask & e1Mask;
 609             SWR_ASSERT(KNOB_SIMD_WIDTH == 8, "Need to update degenerate mask code for avx512");
 610
 611             // edge order: e0 = v0v1, e1 = v1v2, e2 = v0v2
 612             // 32 bit binary: 0000 0000 0010 0100 1001 0010 0100 1001
 613             e0Mask = pdep_u32(e0Mask, 0x00249249);
 614
 615             // 32 bit binary: 0000 0000 0100 1001 0010 0100 1001 0010
 616             e1Mask = pdep_u32(e1Mask, 0x00492492);
 617
 618             // 32 bit binary: 0000 0000 1001 0010 0100 1001 0010 0100
 619             e2Mask = pdep_u32(e2Mask, 0x00924924);
 620
 621             edgeEnable = (0x00FFFFFF & (~(e0Mask | e1Mask | e2Mask)));
 622         }
 623         else
 624         {
 625             edgeEnable = 0x00FFFFFF;
 626         }
 627     }
 628     else
 629     {
 630         // degenerate triangles won't be sent to rasterizer; just enable all edges
 631         pfnWork = GetRasterizerFunc(rastState.sampleCount, rastState.bIsCenterPattern, (rastState.conservativeRast > 0),
 632             (SWR_INPUT_COVERAGE)pDC->pState->state.psState.inputCoverage, EdgeValToEdgeState(ALL_EDGES_VALID), (state.scissorsTileAligned == false));
 633     }
 634
 635     SIMDBBOX_T<SIMD_T> bbox;
 636
 637     if (!triMask)
 638     {
 639         goto endBinTriangles;
 640     }
 641
 642     // Calc bounding box of triangles
 643     calcBoundingBoxIntVertical<SIMD_T, CT>(vXi, vYi, bbox);
 644
 645     // determine if triangle falls between pixel centers and discard
 646     // only discard for non-MSAA case and when conservative rast is disabled
 647     // (xmin + 127) & ~255
 648     // (xmax + 128) & ~255
 649     if ((rastState.sampleCount == SWR_MULTISAMPLE_1X || rastState.bIsCenterPattern) &&
 650         (!CT::IsConservativeT::value))
 651     {
 652         origTriMask = triMask;
 653
 654         int cullCenterMask;
 655
 656         {
 657             typename SIMD_T::Integer xmin = SIMD_T::add_epi32(bbox.xmin, SIMD_T::set1_epi32(127));
 658             xmin = SIMD_T::and_si(xmin, SIMD_T::set1_epi32(~255));
 659             typename SIMD_T::Integer xmax = SIMD_T::add_epi32(bbox.xmax, SIMD_T::set1_epi32(128));
 660             xmax = SIMD_T::and_si(xmax, SIMD_T::set1_epi32(~255));
 661
 662             typename SIMD_T::Integer vMaskH = SIMD_T::cmpeq_epi32(xmin, xmax);
 663
 664             typename SIMD_T::Integer ymin = SIMD_T::add_epi32(bbox.ymin, SIMD_T::set1_epi32(127));
 665             ymin = SIMD_T::and_si(ymin, SIMD_T::set1_epi32(~255));
 666             typename SIMD_T::Integer ymax = SIMD_T::add_epi32(bbox.ymax, SIMD_T::set1_epi32(128));
 667             ymax = SIMD_T::and_si(ymax, SIMD_T::set1_epi32(~255));
 668
 669             typename SIMD_T::Integer vMaskV = SIMD_T::cmpeq_epi32(ymin, ymax);
 670
 671             vMaskV = SIMD_T::or_si(vMaskH, vMaskV);
 672             cullCenterMask = SIMD_T::movemask_ps(SIMD_T::castsi_ps(vMaskV));
 673         }
 674
 675         triMask &= ~cullCenterMask;
 676
 677         if (origTriMask ^ triMask)
 678         {
 679             RDTSC_EVENT(FECullBetweenCenters, _mm_popcnt_u32(origTriMask ^ triMask), 0);
 680         }
 681     }
 682
 683     // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
 684     // Gather the AOS effective scissor rects based on the per-prim VP index.
 685     /// @todo:  Look at speeding this up -- weigh against corresponding costs in rasterizer.
 686     {
 687         typename SIMD_T::Integer scisXmin, scisYmin, scisXmax, scisYmax;
 688
 689         if (state.backendState.readViewportArrayIndex)
 690         {
 691             GatherScissors(&state.scissorsInFixedPoint[0], pViewportIndex, scisXmin, scisYmin, scisXmax, scisYmax);
 692         }
 693         else // broadcast fast path for non-VPAI case.
 694         {
 695             scisXmin = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].xmin);
 696             scisYmin = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].ymin);
 697             scisXmax = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].xmax);
 698             scisYmax = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].ymax);
 699         }
 700
 701         // Make triangle bbox inclusive
 702         bbox.xmax = SIMD_T::sub_epi32(bbox.xmax, SIMD_T::set1_epi32(1));
 703         bbox.ymax = SIMD_T::sub_epi32(bbox.ymax, SIMD_T::set1_epi32(1));
 704
 705         bbox.xmin = SIMD_T::max_epi32(bbox.xmin, scisXmin);
 706         bbox.ymin = SIMD_T::max_epi32(bbox.ymin, scisYmin);
 707         bbox.xmax = SIMD_T::min_epi32(bbox.xmax, scisXmax);
 708         bbox.ymax = SIMD_T::min_epi32(bbox.ymax, scisYmax);
 709     }
 710
 711     if (CT::IsConservativeT::value)
 712     {
 713         // in the case where a degenerate triangle is on a scissor edge, we need to make sure the primitive bbox has
 714         // some area. Bump the xmax/ymax edges out
 715
 716         typename SIMD_T::Integer topEqualsBottom = SIMD_T::cmpeq_epi32(bbox.ymin, bbox.ymax);
 717         bbox.ymax = SIMD_T::blendv_epi32(bbox.ymax, SIMD_T::add_epi32(bbox.ymax, SIMD_T::set1_epi32(1)), topEqualsBottom);
 718
 719         typename SIMD_T::Integer leftEqualsRight = SIMD_T::cmpeq_epi32(bbox.xmin, bbox.xmax);
 720         bbox.xmax = SIMD_T::blendv_epi32(bbox.xmax, SIMD_T::add_epi32(bbox.xmax, SIMD_T::set1_epi32(1)), leftEqualsRight);
 721     }
 722
 723     // Cull tris completely outside scissor
 724     {
 725         typename SIMD_T::Integer maskOutsideScissorX = SIMD_T::cmpgt_epi32(bbox.xmin, bbox.xmax);
 726         typename SIMD_T::Integer maskOutsideScissorY = SIMD_T::cmpgt_epi32(bbox.ymin, bbox.ymax);
 727         typename SIMD_T::Integer maskOutsideScissorXY = SIMD_T::or_si(maskOutsideScissorX, maskOutsideScissorY);
 728         uint32_t maskOutsideScissor = SIMD_T::movemask_ps(SIMD_T::castsi_ps(maskOutsideScissorXY));
 729         triMask = triMask & ~maskOutsideScissor;
 730     }
 731
 732 endBinTriangles:
 733
 734
 735     // Send surviving triangles to the line or point binner based on fill mode
 736     if (rastState.fillMode == SWR_FILLMODE_WIREFRAME)
 737     {
 738         // Simple non-conformant wireframe mode, useful for debugging
 739         // construct 3 SIMD lines out of the triangle and call the line binner for each SIMD
 740         typename SIMD_T::Vec4 line[2];
 741         typename SIMD_T::Float recipW[2];
 742
 743         line[0] = tri[0];
 744         line[1] = tri[1];
 745         recipW[0] = vRecipW0;
 746         recipW[1] = vRecipW1;
 747
 748         BinPostSetupLinesImpl<SIMD_T, SIMD_WIDTH>(pDC, pa, workerId, line, recipW, triMask, primID, viewportIdx);
 749
 750         line[0] = tri[1];
 751         line[1] = tri[2];
 752         recipW[0] = vRecipW1;
 753         recipW[1] = vRecipW2;
 754
 755         BinPostSetupLinesImpl<SIMD_T, SIMD_WIDTH>(pDC, pa, workerId, line, recipW, triMask, primID, viewportIdx);
 756
 757         line[0] = tri[2];
 758         line[1] = tri[0];
 759         recipW[0] = vRecipW2;
 760         recipW[1] = vRecipW0;
 761
 762         BinPostSetupLinesImpl<SIMD_T, SIMD_WIDTH>(pDC, pa, workerId, line, recipW, triMask, primID, viewportIdx);
 763
 764         AR_END(FEBinTriangles, 1);
 765         return;
 766     }
 767     else if (rastState.fillMode == SWR_FILLMODE_POINT)
 768     {
 769         // Bin 3 points
 770         BinPostSetupPointsImpl<SIMD_T, SIMD_WIDTH>(pDC, pa, workerId, &tri[0], triMask, primID, viewportIdx);
 771         BinPostSetupPointsImpl<SIMD_T, SIMD_WIDTH>(pDC, pa, workerId, &tri[1], triMask, primID, viewportIdx);
 772         BinPostSetupPointsImpl<SIMD_T, SIMD_WIDTH>(pDC, pa, workerId, &tri[2], triMask, primID, viewportIdx);
 773
 774         AR_END(FEBinTriangles, 1);
 775         return;
 776     }
 777
 778     // Convert triangle bbox to macrotile units.
 779     bbox.xmin = SIMD_T::template srai_epi32<KNOB_MACROTILE_X_DIM_FIXED_SHIFT>(bbox.xmin);
 780     bbox.ymin = SIMD_T::template srai_epi32<KNOB_MACROTILE_Y_DIM_FIXED_SHIFT>(bbox.ymin);
 781     bbox.xmax = SIMD_T::template srai_epi32<KNOB_MACROTILE_X_DIM_FIXED_SHIFT>(bbox.xmax);
 782     bbox.ymax = SIMD_T::template srai_epi32<KNOB_MACROTILE_Y_DIM_FIXED_SHIFT>(bbox.ymax);
 783
 784     OSALIGNSIMD16(uint32_t) aMTLeft[SIMD_WIDTH], aMTRight[SIMD_WIDTH], aMTTop[SIMD_WIDTH], aMTBottom[SIMD_WIDTH];
 785
 786     SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTLeft),   bbox.xmin);
 787     SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTRight),  bbox.xmax);
 788     SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTTop),    bbox.ymin);
 789     SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTBottom), bbox.ymax);
 790
 791     // transpose verts needed for backend
 792     /// @todo modify BE to take non-transformed verts
 793     simd4scalar vHorizX[SIMD_WIDTH];
 794     simd4scalar vHorizY[SIMD_WIDTH];
 795     simd4scalar vHorizZ[SIMD_WIDTH];
 796     simd4scalar vHorizW[SIMD_WIDTH];
 797
 798     TransposeVertices(vHorizX, tri[0].x, tri[1].x, tri[2].x);
 799     TransposeVertices(vHorizY, tri[0].y, tri[1].y, tri[2].y);
 800     TransposeVertices(vHorizZ, tri[0].z, tri[1].z, tri[2].z);
 801     TransposeVertices(vHorizW, vRecipW0, vRecipW1, vRecipW2);
 802
 803     // store render target array index
 804     OSALIGNSIMD16(uint32_t) aRTAI[SIMD_WIDTH];
 805     if (state.backendState.readRenderTargetArrayIndex)
 806     {
 807         typename SIMD_T::Vec4 vRtai[3];
 808         pa.Assemble(VERTEX_SGV_SLOT, vRtai);
 809         typename SIMD_T::Integer vRtaii;
 810         vRtaii = SIMD_T::castps_si(vRtai[0][VERTEX_SGV_RTAI_COMP]);
 811         SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aRTAI), vRtaii);
 812     }
 813     else
 814     {
 815         SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aRTAI), SIMD_T::setzero_si());
 816     }
 817
 818     // scan remaining valid triangles and bin each separately
 819     while (_BitScanForward(&triIndex, triMask))
 820     {
 821         uint32_t linkageCount = state.backendState.numAttributes;
 822         uint32_t numScalarAttribs = linkageCount * 4;
 823
 824         BE_WORK work;
 825         work.type = DRAW;
 826
 827         bool isDegenerate;
 828         if (CT::IsConservativeT::value)
 829         {
 830             // only rasterize valid edges if we have a degenerate primitive
 831             int32_t triEdgeEnable = (edgeEnable >> (triIndex * 3)) & ALL_EDGES_VALID;
 832             work.pfnWork = GetRasterizerFunc(rastState.sampleCount, rastState.bIsCenterPattern, (rastState.conservativeRast > 0),
 833                 (SWR_INPUT_COVERAGE)pDC->pState->state.psState.inputCoverage, EdgeValToEdgeState(triEdgeEnable), (state.scissorsTileAligned == false));
 834
 835             // Degenerate triangles are required to be constant interpolated
 836             isDegenerate = (triEdgeEnable != ALL_EDGES_VALID) ? true : false;
 837         }
 838         else
 839         {
 840             isDegenerate = false;
 841             work.pfnWork = pfnWork;
 842         }
 843
 844         // Select attribute processor
 845         PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(3,
 846             state.backendState.swizzleEnable, state.backendState.constantInterpolationMask, isDegenerate);
 847
 848         TRIANGLE_WORK_DESC &desc = work.desc.tri;
 849
 850         desc.triFlags.frontFacing = state.forceFront ? 1 : ((frontFaceMask >> triIndex) & 1);
 851         desc.triFlags.renderTargetArrayIndex = aRTAI[triIndex];
 852         desc.triFlags.viewportIndex = pViewportIndex[triIndex];
 853
 854         auto pArena = pDC->pArena;
 855         SWR_ASSERT(pArena != nullptr);
 856
 857         // store active attribs
 858         float *pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
 859         desc.pAttribs = pAttribs;
 860         desc.numAttribs = linkageCount;
 861         pfnProcessAttribs(pDC, pa, triIndex, pPrimID[triIndex], desc.pAttribs);
 862
 863         // store triangle vertex data
 864         desc.pTriBuffer = (float*)pArena->AllocAligned(4 * 4 * sizeof(float), 16);
 865
 866         SIMD128::store_ps(&desc.pTriBuffer[0],  vHorizX[triIndex]);
 867         SIMD128::store_ps(&desc.pTriBuffer[4],  vHorizY[triIndex]);
 868         SIMD128::store_ps(&desc.pTriBuffer[8],  vHorizZ[triIndex]);
 869         SIMD128::store_ps(&desc.pTriBuffer[12], vHorizW[triIndex]);
 870
 871         // store user clip distances
 872         if (state.backendState.clipDistanceMask)
 873         {
 874             uint32_t numClipDist = _mm_popcnt_u32(state.backendState.clipDistanceMask);
 875             desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 3 * sizeof(float));
 876             ProcessUserClipDist<3>(state.backendState, pa, triIndex, &desc.pTriBuffer[12], desc.pUserClipBuffer);
 877         }
 878
 879         for (uint32_t y = aMTTop[triIndex]; y <= aMTBottom[triIndex]; ++y)
 880         {
 881             for (uint32_t x = aMTLeft[triIndex]; x <= aMTRight[triIndex]; ++x)
 882             {
 883 #if KNOB_ENABLE_TOSS_POINTS
 884                 if (!KNOB_TOSS_SETUP_TRIS)
 885 #endif
 886                 {
 887                     pTileMgr->enqueue(x, y, &work);
 888                 }
 889             }
 890         }
 891
 892                      triMask &= ~(1 << triIndex);
 893     }
 894
 895     AR_END(FEBinTriangles, 1);
 896 }
 897
 898 template <typename CT>
 899 void BinTriangles(
 900     DRAW_CONTEXT *pDC,
 901     PA_STATE &pa,
 902     uint32_t workerId,
 903     simdvector tri[3],
 904     uint32_t triMask,
 905     simdscalari const &primID)
 906 {
 907     BinTrianglesImpl<SIMD256, KNOB_SIMD_WIDTH, CT>(pDC, pa, workerId, tri, triMask, primID);
 908 }
 909
 910 #if USE_SIMD16_FRONTEND
 911 template <typename CT>
 912 void SIMDCALL BinTriangles_simd16(
 913     DRAW_CONTEXT *pDC,
 914     PA_STATE &pa,
 915     uint32_t workerId,
 916     simd16vector tri[3],
 917     uint32_t triMask,
 918     simd16scalari const &primID)
 919 {
 920     BinTrianglesImpl<SIMD512, KNOB_SIMD16_WIDTH, CT>(pDC, pa, workerId, tri, triMask, primID);
 921 }
 922
 923 #endif
 924 struct FEBinTrianglesChooser
 925 {
 926     typedef PFN_PROCESS_PRIMS FuncType;
 927
 928     template <typename... ArgsB>
 929     static FuncType GetFunc()
 930     {
 931         return BinTriangles<ConservativeRastFETraits<ArgsB...>>;
 932     }
 933 };
 934
 935 // Selector for correct templated BinTrinagles function
 936 PFN_PROCESS_PRIMS GetBinTrianglesFunc(bool IsConservative)
 937 {
 938     return TemplateArgUnroller<FEBinTrianglesChooser>::GetFunc(IsConservative);
 939 }
 940
 941 #if USE_SIMD16_FRONTEND
 942 struct FEBinTrianglesChooser_simd16
 943 {
 944     typedef PFN_PROCESS_PRIMS_SIMD16 FuncType;
 945
 946     template <typename... ArgsB>
 947     static FuncType GetFunc()
 948     {
 949         return BinTriangles_simd16<ConservativeRastFETraits<ArgsB...>>;
 950     }
 951 };
 952
 953 // Selector for correct templated BinTrinagles function
 954 PFN_PROCESS_PRIMS_SIMD16 GetBinTrianglesFunc_simd16(bool IsConservative)
 955 {
 956     return TemplateArgUnroller<FEBinTrianglesChooser_simd16>::GetFunc(IsConservative);
 957 }
 958
 959 #endif
 960
 961 template <typename SIMD_T, uint32_t SIMD_WIDTH>
 962 void BinPostSetupPointsImpl(
 963     DRAW_CONTEXT *pDC,
 964     PA_STATE &pa,
 965     uint32_t workerId,
 966     typename SIMD_T::Vec4 prim[],
 967     uint32_t primMask,
 968     typename SIMD_T::Integer const &primID,
 969     typename SIMD_T::Integer const &viewportIdx)
 970 {
 971     SWR_CONTEXT *pContext = pDC->pContext;
 972
 973     AR_BEGIN(FEBinPoints, pDC->drawId);
 974
 975     typename SIMD_T::Vec4 &primVerts = prim[0];
 976
 977     const API_STATE& state = GetApiState(pDC);
 978     const SWR_RASTSTATE& rastState = state.rastState;
 979     const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx;
 980
 981     // Select attribute processor
 982     PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(1,
 983         state.backendState.swizzleEnable, state.backendState.constantInterpolationMask);
 984
 985     // convert to fixed point
 986     typename SIMD_T::Integer vXi, vYi;
 987
 988     vXi = fpToFixedPointVertical<SIMD_T>(primVerts.x);
 989     vYi = fpToFixedPointVertical<SIMD_T>(primVerts.y);
 990
 991     if (CanUseSimplePoints(pDC))
 992     {
 993         // adjust for ymin-xmin rule
 994         vXi = SIMD_T::sub_epi32(vXi, SIMD_T::set1_epi32(1));
 995         vYi = SIMD_T::sub_epi32(vYi, SIMD_T::set1_epi32(1));
 996
 997         // cull points off the ymin-xmin edge of the viewport
 998         primMask &= ~SIMD_T::movemask_ps(SIMD_T::castsi_ps(vXi));
 999         primMask &= ~SIMD_T::movemask_ps(SIMD_T::castsi_ps(vYi));
1000
1001         // compute macro tile coordinates
1002         typename SIMD_T::Integer macroX = SIMD_T::template srai_epi32<KNOB_MACROTILE_X_DIM_FIXED_SHIFT>(vXi);
1003         typename SIMD_T::Integer macroY = SIMD_T::template srai_epi32<KNOB_MACROTILE_Y_DIM_FIXED_SHIFT>(vYi);
1004
1005         OSALIGNSIMD16(uint32_t) aMacroX[SIMD_WIDTH], aMacroY[SIMD_WIDTH];
1006
1007         SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMacroX), macroX);
1008         SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMacroY), macroY);
1009
1010         // compute raster tile coordinates
1011         typename SIMD_T::Integer rasterX = SIMD_T::template srai_epi32<KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT>(vXi);
1012         typename SIMD_T::Integer rasterY = SIMD_T::template srai_epi32<KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT>(vYi);
1013
1014         // compute raster tile relative x,y for coverage mask
1015         typename SIMD_T::Integer tileAlignedX = SIMD_T::template slli_epi32<KNOB_TILE_X_DIM_SHIFT>(rasterX);
1016         typename SIMD_T::Integer tileAlignedY = SIMD_T::template slli_epi32<KNOB_TILE_Y_DIM_SHIFT>(rasterY);
1017
1018         typename SIMD_T::Integer tileRelativeX = SIMD_T::sub_epi32(SIMD_T::template srai_epi32<FIXED_POINT_SHIFT>(vXi), tileAlignedX);
1019         typename SIMD_T::Integer tileRelativeY = SIMD_T::sub_epi32(SIMD_T::template srai_epi32<FIXED_POINT_SHIFT>(vYi), tileAlignedY);
1020
1021         OSALIGNSIMD16(uint32_t) aTileRelativeX[SIMD_WIDTH];
1022         OSALIGNSIMD16(uint32_t) aTileRelativeY[SIMD_WIDTH];
1023
1024         SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aTileRelativeX), tileRelativeX);
1025         SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aTileRelativeY), tileRelativeY);
1026
1027         OSALIGNSIMD16(uint32_t) aTileAlignedX[SIMD_WIDTH];
1028         OSALIGNSIMD16(uint32_t) aTileAlignedY[SIMD_WIDTH];
1029
1030         SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aTileAlignedX), tileAlignedX);
1031         SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aTileAlignedY), tileAlignedY);
1032
1033         OSALIGNSIMD16(float) aZ[SIMD_WIDTH];
1034         SIMD_T::store_ps(reinterpret_cast<float *>(aZ), primVerts.z);
1035
1036         // store render target array index
1037         OSALIGNSIMD16(uint32_t) aRTAI[SIMD_WIDTH];
1038         if (state.backendState.readRenderTargetArrayIndex)
1039         {
1040             typename SIMD_T::Vec4 vRtai;
1041             pa.Assemble(VERTEX_SGV_SLOT, &vRtai);
1042             typename SIMD_T::Integer vRtaii = SIMD_T::castps_si(vRtai[VERTEX_SGV_RTAI_COMP]);
1043             SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aRTAI), vRtaii);
1044         }
1045         else
1046         {
1047             SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aRTAI), SIMD_T::setzero_si());
1048         }
1049
1050         uint32_t *pPrimID = (uint32_t *)&primID;
1051         DWORD primIndex = 0;
1052
1053         const SWR_BACKEND_STATE& backendState = pDC->pState->state.backendState;
1054
1055         // scan remaining valid triangles and bin each separately
1056         while (_BitScanForward(&primIndex, primMask))
1057         {
1058             uint32_t linkageCount = backendState.numAttributes;
1059             uint32_t numScalarAttribs = linkageCount * 4;
1060
1061             BE_WORK work;
1062             work.type = DRAW;
1063
1064             TRIANGLE_WORK_DESC &desc = work.desc.tri;
1065
1066             // points are always front facing
1067             desc.triFlags.frontFacing = 1;
1068             desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];
1069             desc.triFlags.viewportIndex = pViewportIndex[primIndex];
1070
1071             work.pfnWork = RasterizeSimplePoint;
1072
1073             auto pArena = pDC->pArena;
1074             SWR_ASSERT(pArena != nullptr);
1075
1076             // store attributes
1077             float *pAttribs = (float*)pArena->AllocAligned(3 * numScalarAttribs * sizeof(float), 16);
1078             desc.pAttribs = pAttribs;
1079             desc.numAttribs = linkageCount;
1080
1081             pfnProcessAttribs(pDC, pa, primIndex, pPrimID[primIndex], pAttribs);
1082
1083             // store raster tile aligned x, y, perspective correct z
1084             float *pTriBuffer = (float*)pArena->AllocAligned(4 * sizeof(float), 16);
1085             desc.pTriBuffer = pTriBuffer;
1086             *(uint32_t*)pTriBuffer++ = aTileAlignedX[primIndex];
1087             *(uint32_t*)pTriBuffer++ = aTileAlignedY[primIndex];
1088             *pTriBuffer = aZ[primIndex];
1089
1090             uint32_t tX = aTileRelativeX[primIndex];
1091             uint32_t tY = aTileRelativeY[primIndex];
1092
1093             // pack the relative x,y into the coverageMask, the rasterizer will
1094             // generate the true coverage mask from it
1095             work.desc.tri.triFlags.coverageMask = tX | (tY << 4);
1096
1097             // bin it
1098             MacroTileMgr *pTileMgr = pDC->pTileMgr;
1099 #if KNOB_ENABLE_TOSS_POINTS
1100             if (!KNOB_TOSS_SETUP_TRIS)
1101 #endif
1102             {
1103                 pTileMgr->enqueue(aMacroX[primIndex], aMacroY[primIndex], &work);
1104             }
1105
1106             primMask &= ~(1 << primIndex);
1107         }
1108     }
1109     else
1110     {
1111         // non simple points need to be potentially binned to multiple macro tiles
1112         typename SIMD_T::Float vPointSize;
1113
1114         if (rastState.pointParam)
1115         {
1116             typename SIMD_T::Vec4 size[3];
1117             pa.Assemble(VERTEX_SGV_SLOT, size);
1118             vPointSize = size[0][VERTEX_SGV_POINT_SIZE_COMP];
1119         }
1120         else
1121         {
1122             vPointSize = SIMD_T::set1_ps(rastState.pointSize);
1123         }
1124
1125         // bloat point to bbox
1126         SIMDBBOX_T<SIMD_T> bbox;
1127
1128         bbox.xmin = bbox.xmax = vXi;
1129         bbox.ymin = bbox.ymax = vYi;
1130
1131         typename SIMD_T::Float vHalfWidth = SIMD_T::mul_ps(vPointSize, SIMD_T::set1_ps(0.5f));
1132         typename SIMD_T::Integer vHalfWidthi = fpToFixedPointVertical<SIMD_T>(vHalfWidth);
1133
1134         bbox.xmin = SIMD_T::sub_epi32(bbox.xmin, vHalfWidthi);
1135         bbox.xmax = SIMD_T::add_epi32(bbox.xmax, vHalfWidthi);
1136         bbox.ymin = SIMD_T::sub_epi32(bbox.ymin, vHalfWidthi);
1137         bbox.ymax = SIMD_T::add_epi32(bbox.ymax, vHalfWidthi);
1138
1139         // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
1140         // Gather the AOS effective scissor rects based on the per-prim VP index.
1141         /// @todo:  Look at speeding this up -- weigh against corresponding costs in rasterizer.
1142         {
1143             typename SIMD_T::Integer scisXmin, scisYmin, scisXmax, scisYmax;
1144
1145             if (state.backendState.readViewportArrayIndex)
1146             {
1147                 GatherScissors(&state.scissorsInFixedPoint[0], pViewportIndex, scisXmin, scisYmin, scisXmax, scisYmax);
1148             }
1149             else // broadcast fast path for non-VPAI case.
1150             {
1151                 scisXmin = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].xmin);
1152                 scisYmin = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].ymin);
1153                 scisXmax = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].xmax);
1154                 scisYmax = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].ymax);
1155             }
1156
1157             bbox.xmin = SIMD_T::max_epi32(bbox.xmin, scisXmin);
1158             bbox.ymin = SIMD_T::max_epi32(bbox.ymin, scisYmin);
1159             bbox.xmax = SIMD_T::min_epi32(SIMD_T::sub_epi32(bbox.xmax, SIMD_T::set1_epi32(1)), scisXmax);
1160             bbox.ymax = SIMD_T::min_epi32(SIMD_T::sub_epi32(bbox.ymax, SIMD_T::set1_epi32(1)), scisYmax);
1161         }
1162
1163         // Cull bloated points completely outside scissor
1164         typename SIMD_T::Integer maskOutsideScissorX = SIMD_T::cmpgt_epi32(bbox.xmin, bbox.xmax);
1165         typename SIMD_T::Integer maskOutsideScissorY = SIMD_T::cmpgt_epi32(bbox.ymin, bbox.ymax);
1166         typename SIMD_T::Integer maskOutsideScissorXY = SIMD_T::or_si(maskOutsideScissorX, maskOutsideScissorY);
1167         uint32_t maskOutsideScissor = SIMD_T::movemask_ps(SIMD_T::castsi_ps(maskOutsideScissorXY));
1168         primMask = primMask & ~maskOutsideScissor;
1169
1170         // Convert bbox to macrotile units.
1171         bbox.xmin = SIMD_T::template srai_epi32<KNOB_MACROTILE_X_DIM_FIXED_SHIFT>(bbox.xmin);
1172         bbox.ymin = SIMD_T::template srai_epi32<KNOB_MACROTILE_Y_DIM_FIXED_SHIFT>(bbox.ymin);
1173         bbox.xmax = SIMD_T::template srai_epi32<KNOB_MACROTILE_X_DIM_FIXED_SHIFT>(bbox.xmax);
1174         bbox.ymax = SIMD_T::template srai_epi32<KNOB_MACROTILE_Y_DIM_FIXED_SHIFT>(bbox.ymax);
1175
1176         OSALIGNSIMD16(uint32_t) aMTLeft[SIMD_WIDTH], aMTRight[SIMD_WIDTH], aMTTop[SIMD_WIDTH], aMTBottom[SIMD_WIDTH];
1177
1178         SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTLeft),   bbox.xmin);
1179         SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTRight),  bbox.xmax);
1180         SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTTop),    bbox.ymin);
1181         SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTBottom), bbox.ymax);
1182
1183         // store render target array index
1184         OSALIGNSIMD16(uint32_t) aRTAI[SIMD_WIDTH];
1185         if (state.backendState.readRenderTargetArrayIndex)
1186         {
1187             typename SIMD_T::Vec4 vRtai[2];
1188             pa.Assemble(VERTEX_SGV_SLOT, vRtai);
1189             typename SIMD_T::Integer vRtaii = SIMD_T::castps_si(vRtai[0][VERTEX_SGV_RTAI_COMP]);
1190             SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aRTAI), vRtaii);
1191         }
1192         else
1193         {
1194             SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aRTAI), SIMD_T::setzero_si());
1195         }
1196
1197         OSALIGNSIMD16(float) aPointSize[SIMD_WIDTH];
1198         _simd16_store_ps(reinterpret_cast<float *>(aPointSize), vPointSize);
1199
1200         uint32_t *pPrimID = (uint32_t *)&primID;
1201
1202         OSALIGNSIMD16(float) aPrimVertsX[SIMD_WIDTH];
1203         OSALIGNSIMD16(float) aPrimVertsY[SIMD_WIDTH];
1204         OSALIGNSIMD16(float) aPrimVertsZ[SIMD_WIDTH];
1205
1206         SIMD_T::store_ps(reinterpret_cast<float *>(aPrimVertsX), primVerts.x);
1207         SIMD_T::store_ps(reinterpret_cast<float *>(aPrimVertsY), primVerts.y);
1208         SIMD_T::store_ps(reinterpret_cast<float *>(aPrimVertsZ), primVerts.z);
1209
1210         // scan remaining valid prims and bin each separately
1211         const SWR_BACKEND_STATE& backendState = state.backendState;
1212         DWORD primIndex;
1213         while (_BitScanForward(&primIndex, primMask))
1214         {
1215             uint32_t linkageCount = backendState.numAttributes;
1216             uint32_t numScalarAttribs = linkageCount * 4;
1217
1218             BE_WORK work;
1219             work.type = DRAW;
1220
1221             TRIANGLE_WORK_DESC &desc = work.desc.tri;
1222
1223             desc.triFlags.frontFacing = 1;
1224             desc.triFlags.pointSize = aPointSize[primIndex];
1225             desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];
1226             desc.triFlags.viewportIndex = pViewportIndex[primIndex];
1227
1228             work.pfnWork = RasterizeTriPoint;
1229
1230             auto pArena = pDC->pArena;
1231             SWR_ASSERT(pArena != nullptr);
1232
1233             // store active attribs
1234             desc.pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
1235             desc.numAttribs = linkageCount;
1236             pfnProcessAttribs(pDC, pa, primIndex, pPrimID[primIndex], desc.pAttribs);
1237
1238             // store point vertex data
1239             float *pTriBuffer = (float*)pArena->AllocAligned(4 * sizeof(float), 16);
1240             desc.pTriBuffer = pTriBuffer;
1241             *pTriBuffer++ = aPrimVertsX[primIndex];
1242             *pTriBuffer++ = aPrimVertsY[primIndex];
1243             *pTriBuffer = aPrimVertsZ[primIndex];
1244
1245             // store user clip distances
1246             if (backendState.clipDistanceMask)
1247             {
1248                 uint32_t numClipDist = _mm_popcnt_u32(backendState.clipDistanceMask);
1249                 desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 3 * sizeof(float));
1250                 float dists[8];
1251                 float one = 1.0f;
1252                 ProcessUserClipDist<1>(backendState, pa, primIndex, &one, dists);
1253                 for (uint32_t i = 0; i < numClipDist; i++) {
1254                     desc.pUserClipBuffer[3 * i + 0] = 0.0f;
1255                     desc.pUserClipBuffer[3 * i + 1] = 0.0f;
1256                     desc.pUserClipBuffer[3 * i + 2] = dists[i];
1257                 }
1258             }
1259
1260             MacroTileMgr *pTileMgr = pDC->pTileMgr;
1261             for (uint32_t y = aMTTop[primIndex]; y <= aMTBottom[primIndex]; ++y)
1262             {
1263                 for (uint32_t x = aMTLeft[primIndex]; x <= aMTRight[primIndex]; ++x)
1264                 {
1265 #if KNOB_ENABLE_TOSS_POINTS
1266                     if (!KNOB_TOSS_SETUP_TRIS)
1267 #endif
1268                     {
1269                         pTileMgr->enqueue(x, y, &work);
1270                     }
1271                 }
1272             }
1273
1274             primMask &= ~(1 << primIndex);
1275         }
1276     }
1277
1278     AR_END(FEBinPoints, 1);
1279 }
1280
1281 //////////////////////////////////////////////////////////////////////////
1282 /// @brief Bin SIMD points to the backend.  Only supports point size of 1
1283 /// @param pDC - pointer to draw context.
1284 /// @param pa - The primitive assembly object.
1285 /// @param workerId - thread's worker id. Even thread has a unique id.
1286 /// @param tri - Contains point position data for SIMDs worth of points.
1287 /// @param primID - Primitive ID for each point.
1288 template <typename SIMD_T, uint32_t SIMD_WIDTH>
1289 void BinPointsImpl(
1290     DRAW_CONTEXT *pDC,
1291     PA_STATE &pa,
1292     uint32_t workerId,
1293     typename SIMD_T::Vec4 prim[3],
1294     uint32_t primMask,
1295     typename SIMD_T::Integer const &primID)
1296 {
1297     const API_STATE& state = GetApiState(pDC);
1298     const SWR_FRONTEND_STATE& feState = state.frontendState;
1299     const SWR_RASTSTATE& rastState = state.rastState;
1300
1301     // Read back viewport index if required
1302     typename SIMD_T::Integer viewportIdx = SIMD_T::set1_epi32(0);
1303     if (state.backendState.readViewportArrayIndex)
1304     {
1305         typename SIMD_T::Vec4 vpiAttrib[1];
1306         pa.Assemble(VERTEX_SGV_SLOT, vpiAttrib);
1307
1308         // OOB indices => forced to zero.
1309         typename SIMD_T::Integer vpai = SIMD_T::castps_si(vpiAttrib[0][VERTEX_SGV_VAI_COMP]);
1310         vpai = SIMD_T::max_epi32(SIMD_T::setzero_si(), vpai);
1311         typename SIMD_T::Integer vNumViewports = SIMD_T::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
1312         typename SIMD_T::Integer vClearMask = SIMD_T::cmplt_epi32(vpai, vNumViewports);
1313         viewportIdx = SIMD_T::and_si(vClearMask, vpai);
1314     }
1315
1316     if (!feState.vpTransformDisable)
1317     {
1318         // perspective divide
1319         typename SIMD_T::Float vRecipW0 = SIMD_T::div_ps(SIMD_T::set1_ps(1.0f), prim[0].w);
1320
1321         prim[0].x = SIMD_T::mul_ps(prim[0].x, vRecipW0);
1322         prim[0].y = SIMD_T::mul_ps(prim[0].y, vRecipW0);
1323         prim[0].z = SIMD_T::mul_ps(prim[0].z, vRecipW0);
1324
1325         // viewport transform to screen coords
1326         if (state.backendState.readViewportArrayIndex)
1327         {
1328             viewportTransform<1>(prim, state.vpMatrices, viewportIdx);
1329         }
1330         else
1331         {
1332             viewportTransform<1>(prim, state.vpMatrices);
1333         }
1334     }
1335
1336     typename SIMD_T::Float offset = SwrPixelOffsets<SIMD_T>::GetOffset(rastState.pixelLocation);
1337
1338     prim[0].x = SIMD_T::add_ps(prim[0].x, offset);
1339     prim[0].y = SIMD_T::add_ps(prim[0].y, offset);
1340
1341     BinPostSetupPointsImpl<SIMD_T, SIMD_WIDTH>(
1342         pDC,
1343         pa,
1344         workerId,
1345         prim,
1346         primMask,
1347         primID,
1348         viewportIdx);
1349 }
1350
1351 void BinPoints(
1352     DRAW_CONTEXT *pDC,
1353     PA_STATE &pa,
1354     uint32_t workerId,
1355     simdvector prim[3],
1356     uint32_t primMask,
1357     simdscalari const &primID)
1358 {
1359     BinPointsImpl<SIMD256, KNOB_SIMD_WIDTH>(
1360         pDC,
1361         pa,
1362         workerId,
1363         prim,
1364         primMask,
1365         primID);
1366 }
1367
1368 #if USE_SIMD16_FRONTEND
1369 void SIMDCALL BinPoints_simd16(
1370     DRAW_CONTEXT *pDC,
1371     PA_STATE &pa,
1372     uint32_t workerId,
1373     simd16vector prim[3],
1374     uint32_t primMask,
1375     simd16scalari const &primID)
1376 {
1377     BinPointsImpl<SIMD512, KNOB_SIMD16_WIDTH>(
1378         pDC,
1379         pa,
1380         workerId,
1381         prim,
1382         primMask,
1383         primID);
1384 }
1385
1386 #endif
1387 //////////////////////////////////////////////////////////////////////////
/// @brief Bin SIMD lines to the backend.
/// @param pDC - pointer to draw context.
/// @param pa - The primitive assembly object.
/// @param workerId - thread's worker id. Each thread has a unique id.
/// @param prim - Contains line position data for a SIMD's worth of lines.
/// @param primID - Primitive ID for each line.
/// @param viewportIdx - Viewport Array Index for each line.
1395 template <typename SIMD_T, uint32_t SIMD_WIDTH>
1396 void BinPostSetupLinesImpl(
1397     DRAW_CONTEXT *pDC,
1398     PA_STATE &pa,
1399     uint32_t workerId,
1400     typename SIMD_T::Vec4 prim[],
1401     typename SIMD_T::Float recipW[],
1402     uint32_t primMask,
1403     typename SIMD_T::Integer const &primID,
1404     typename SIMD_T::Integer const &viewportIdx)
1405 {
1406     SWR_CONTEXT *pContext = pDC->pContext;
1407
1408     AR_BEGIN(FEBinLines, pDC->drawId);
1409
1410     const API_STATE &state = GetApiState(pDC);
1411     const SWR_RASTSTATE &rastState = state.rastState;
1412
1413     // Select attribute processor
1414     PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(2,
1415         state.backendState.swizzleEnable, state.backendState.constantInterpolationMask);
1416
1417     typename SIMD_T::Float &vRecipW0 = recipW[0];
1418     typename SIMD_T::Float &vRecipW1 = recipW[1];
1419
1420     // convert to fixed point
1421     typename SIMD_T::Integer vXi[2], vYi[2];
1422
1423     vXi[0] = fpToFixedPointVertical<SIMD_T>(prim[0].x);
1424     vYi[0] = fpToFixedPointVertical<SIMD_T>(prim[0].y);
1425     vXi[1] = fpToFixedPointVertical<SIMD_T>(prim[1].x);
1426     vYi[1] = fpToFixedPointVertical<SIMD_T>(prim[1].y);
1427
1428     // compute x-major vs y-major mask
1429     typename SIMD_T::Integer xLength = SIMD_T::abs_epi32(SIMD_T::sub_epi32(vXi[0], vXi[1]));
1430     typename SIMD_T::Integer yLength = SIMD_T::abs_epi32(SIMD_T::sub_epi32(vYi[0], vYi[1]));
1431     typename SIMD_T::Float vYmajorMask = SIMD_T::castsi_ps(SIMD_T::cmpgt_epi32(yLength, xLength));
1432     uint32_t yMajorMask = SIMD_T::movemask_ps(vYmajorMask);
1433
1434     // cull zero-length lines
1435     typename SIMD_T::Integer vZeroLengthMask = SIMD_T::cmpeq_epi32(xLength, SIMD_T::setzero_si());
1436     vZeroLengthMask = SIMD_T::and_si(vZeroLengthMask, SIMD_T::cmpeq_epi32(yLength, SIMD_T::setzero_si()));
1437
1438     primMask &= ~SIMD_T::movemask_ps(SIMD_T::castsi_ps(vZeroLengthMask));
1439
1440     uint32_t *pPrimID = (uint32_t *)&primID;
1441     const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx;
1442
1443     // Calc bounding box of lines
1444     SIMDBBOX_T<SIMD_T> bbox;
1445     bbox.xmin = SIMD_T::min_epi32(vXi[0], vXi[1]);
1446     bbox.xmax = SIMD_T::max_epi32(vXi[0], vXi[1]);
1447     bbox.ymin = SIMD_T::min_epi32(vYi[0], vYi[1]);
1448     bbox.ymax = SIMD_T::max_epi32(vYi[0], vYi[1]);
1449
1450     // bloat bbox by line width along minor axis
1451     typename SIMD_T::Float vHalfWidth = SIMD_T::set1_ps(rastState.lineWidth / 2.0f);
1452     typename SIMD_T::Integer vHalfWidthi = fpToFixedPointVertical<SIMD_T>(vHalfWidth);
1453
1454     SIMDBBOX_T<SIMD_T> bloatBox;
1455
1456     bloatBox.xmin = SIMD_T::sub_epi32(bbox.xmin, vHalfWidthi);
1457     bloatBox.xmax = SIMD_T::add_epi32(bbox.xmax, vHalfWidthi);
1458     bloatBox.ymin = SIMD_T::sub_epi32(bbox.ymin, vHalfWidthi);
1459     bloatBox.ymax = SIMD_T::add_epi32(bbox.ymax, vHalfWidthi);
1460
1461     bbox.xmin = SIMD_T::blendv_epi32(bbox.xmin, bloatBox.xmin, vYmajorMask);
1462     bbox.xmax = SIMD_T::blendv_epi32(bbox.xmax, bloatBox.xmax, vYmajorMask);
1463     bbox.ymin = SIMD_T::blendv_epi32(bloatBox.ymin, bbox.ymin, vYmajorMask);
1464     bbox.ymax = SIMD_T::blendv_epi32(bloatBox.ymax, bbox.ymax, vYmajorMask);
1465
1466     // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
1467     {
1468         typename SIMD_T::Integer scisXmin, scisYmin, scisXmax, scisYmax;
1469
1470         if (state.backendState.readViewportArrayIndex)
1471         {
1472             GatherScissors(&state.scissorsInFixedPoint[0], pViewportIndex, scisXmin, scisYmin, scisXmax, scisYmax);
1473         }
1474         else // broadcast fast path for non-VPAI case.
1475         {
1476             scisXmin = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].xmin);
1477             scisYmin = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].ymin);
1478             scisXmax = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].xmax);
1479             scisYmax = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].ymax);
1480         }
1481
1482         bbox.xmin = SIMD_T::max_epi32(bbox.xmin, scisXmin);
1483         bbox.ymin = SIMD_T::max_epi32(bbox.ymin, scisYmin);
1484         bbox.xmax = SIMD_T::min_epi32(SIMD_T::sub_epi32(bbox.xmax, SIMD_T::set1_epi32(1)), scisXmax);
1485         bbox.ymax = SIMD_T::min_epi32(SIMD_T::sub_epi32(bbox.ymax, SIMD_T::set1_epi32(1)), scisYmax);
1486     }
1487
1488     // Cull prims completely outside scissor
1489     {
1490         typename SIMD_T::Integer maskOutsideScissorX = SIMD_T::cmpgt_epi32(bbox.xmin, bbox.xmax);
1491         typename SIMD_T::Integer maskOutsideScissorY = SIMD_T::cmpgt_epi32(bbox.ymin, bbox.ymax);
1492         typename SIMD_T::Integer maskOutsideScissorXY = SIMD_T::or_si(maskOutsideScissorX, maskOutsideScissorY);
1493         uint32_t maskOutsideScissor = SIMD_T::movemask_ps(SIMD_T::castsi_ps(maskOutsideScissorXY));
1494         primMask = primMask & ~maskOutsideScissor;
1495     }
1496
1497     // transpose verts needed for backend
1498     /// @todo modify BE to take non-transformed verts
1499     simd4scalar vHorizX[SIMD_WIDTH];
1500     simd4scalar vHorizY[SIMD_WIDTH];
1501     simd4scalar vHorizZ[SIMD_WIDTH];
1502     simd4scalar vHorizW[SIMD_WIDTH];
1503
1504     if (!primMask)
1505     {
1506         goto endBinLines;
1507     }
1508
1509     // Convert triangle bbox to macrotile units.
1510     bbox.xmin = SIMD_T::template srai_epi32<KNOB_MACROTILE_X_DIM_FIXED_SHIFT>(bbox.xmin);
1511     bbox.ymin = SIMD_T::template srai_epi32<KNOB_MACROTILE_Y_DIM_FIXED_SHIFT>(bbox.ymin);
1512     bbox.xmax = SIMD_T::template srai_epi32<KNOB_MACROTILE_X_DIM_FIXED_SHIFT>(bbox.xmax);
1513     bbox.ymax = SIMD_T::template srai_epi32<KNOB_MACROTILE_Y_DIM_FIXED_SHIFT>(bbox.ymax);
1514
1515     OSALIGNSIMD16(uint32_t) aMTLeft[SIMD_WIDTH], aMTRight[SIMD_WIDTH], aMTTop[SIMD_WIDTH], aMTBottom[SIMD_WIDTH];
1516
1517     SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTLeft),   bbox.xmin);
1518     SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTRight),  bbox.xmax);
1519     SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTTop),    bbox.ymin);
1520     SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTBottom), bbox.ymax);
1521
1522     TransposeVertices(vHorizX, prim[0].x, prim[1].x, SIMD_T::setzero_ps());
1523     TransposeVertices(vHorizY, prim[0].y, prim[1].y, SIMD_T::setzero_ps());
1524     TransposeVertices(vHorizZ, prim[0].z, prim[1].z, SIMD_T::setzero_ps());
1525     TransposeVertices(vHorizW, vRecipW0,  vRecipW1,  SIMD_T::setzero_ps());
1526
1527     // store render target array index
1528     OSALIGNSIMD16(uint32_t) aRTAI[SIMD_WIDTH];
1529     if (state.backendState.readRenderTargetArrayIndex)
1530     {
1531         typename SIMD_T::Vec4 vRtai[2];
1532         pa.Assemble(VERTEX_SGV_SLOT, vRtai);
1533         typename SIMD_T::Integer vRtaii = SIMD_T::castps_si(vRtai[0][VERTEX_SGV_RTAI_COMP]);
1534         SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aRTAI), vRtaii);
1535     }
1536     else
1537     {
1538         SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aRTAI), SIMD_T::setzero_si());
1539     }
1540
1541     // scan remaining valid prims and bin each separately
1542     DWORD primIndex;
1543     while (_BitScanForward(&primIndex, primMask))
1544     {
1545         uint32_t linkageCount = state.backendState.numAttributes;
1546         uint32_t numScalarAttribs = linkageCount * 4;
1547
1548         BE_WORK work;
1549         work.type = DRAW;
1550
1551         TRIANGLE_WORK_DESC &desc = work.desc.tri;
1552
1553         desc.triFlags.frontFacing = 1;
1554         desc.triFlags.yMajor = (yMajorMask >> primIndex) & 1;
1555         desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];
1556         desc.triFlags.viewportIndex = pViewportIndex[primIndex];
1557
1558         work.pfnWork = RasterizeLine;
1559
1560         auto pArena = pDC->pArena;
1561         SWR_ASSERT(pArena != nullptr);
1562
1563         // store active attribs
1564         desc.pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
1565         desc.numAttribs = linkageCount;
1566         pfnProcessAttribs(pDC, pa, primIndex, pPrimID[primIndex], desc.pAttribs);
1567
1568         // store line vertex data
1569         desc.pTriBuffer = (float*)pArena->AllocAligned(4 * 4 * sizeof(float), 16);
1570
1571         _mm_store_ps(&desc.pTriBuffer[0],  vHorizX[primIndex]);
1572         _mm_store_ps(&desc.pTriBuffer[4],  vHorizY[primIndex]);
1573         _mm_store_ps(&desc.pTriBuffer[8],  vHorizZ[primIndex]);
1574         _mm_store_ps(&desc.pTriBuffer[12], vHorizW[primIndex]);
1575
1576         // store user clip distances
1577         if (state.backendState.clipDistanceMask)
1578         {
1579             uint32_t numClipDist = _mm_popcnt_u32(state.backendState.clipDistanceMask);
1580             desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 2 * sizeof(float));
1581             ProcessUserClipDist<2>(state.backendState, pa, primIndex, &desc.pTriBuffer[12], desc.pUserClipBuffer);
1582         }
1583
1584         MacroTileMgr *pTileMgr = pDC->pTileMgr;
1585         for (uint32_t y = aMTTop[primIndex]; y <= aMTBottom[primIndex]; ++y)
1586         {
1587             for (uint32_t x = aMTLeft[primIndex]; x <= aMTRight[primIndex]; ++x)
1588             {
1589 #if KNOB_ENABLE_TOSS_POINTS
1590                 if (!KNOB_TOSS_SETUP_TRIS)
1591 #endif
1592                 {
1593                     pTileMgr->enqueue(x, y, &work);
1594                 }
1595             }
1596         }
1597
1598         primMask &= ~(1 << primIndex);
1599     }
1600
1601 endBinLines:
1602
1603     AR_END(FEBinLines, 1);
1604 }
1605
1606 //////////////////////////////////////////////////////////////////////////
1607 /// @brief Bin SIMD lines to the backend.
1608 /// @param pDC - pointer to draw context.
1609 /// @param pa - The primitive assembly object.
1610 /// @param workerId - thread's worker id. Even thread has a unique id.
1611 /// @param tri - Contains line position data for SIMDs worth of points.
1612 /// @param primID - Primitive ID for each line.
1613 /// @param viewportIdx - Viewport Array Index for each line.
1614 template <typename SIMD_T, uint32_t SIMD_WIDTH>
1615 void SIMDCALL BinLinesImpl(
1616     DRAW_CONTEXT *pDC,
1617     PA_STATE &pa,
1618     uint32_t workerId,
1619     typename SIMD_T::Vec4 prim[3],
1620     uint32_t primMask,
1621     typename SIMD_T::Integer const &primID)
1622 {
1623     const API_STATE& state = GetApiState(pDC);
1624     const SWR_RASTSTATE& rastState = state.rastState;
1625     const SWR_FRONTEND_STATE& feState = state.frontendState;
1626
1627     typename SIMD_T::Float vRecipW[2] = { SIMD_T::set1_ps(1.0f), SIMD_T::set1_ps(1.0f) };
1628
1629     typename SIMD_T::Integer viewportIdx = SIMD_T::set1_epi32(0);
1630     if (state.backendState.readViewportArrayIndex)
1631     {
1632         typename SIMD_T::Vec4 vpiAttrib[2];
1633         pa.Assemble(VERTEX_SGV_SLOT, vpiAttrib);
1634
1635         // OOB indices => forced to zero.
1636         typename SIMD_T::Integer vpai = SIMD_T::castps_si(vpiAttrib[0][VERTEX_SGV_VAI_COMP]);
1637         vpai = SIMD_T::max_epi32(SIMD_T::setzero_si(), vpai);
1638         typename SIMD_T::Integer vNumViewports = SIMD_T::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
1639         typename SIMD_T::Integer vClearMask = SIMD_T::cmplt_epi32(vpai, vNumViewports);
1640         viewportIdx = SIMD_T::and_si(vClearMask, vpai);
1641     }
1642
1643     if (!feState.vpTransformDisable)
1644     {
1645         // perspective divide
1646         vRecipW[0] = SIMD_T::div_ps(SIMD_T::set1_ps(1.0f), prim[0].w);
1647         vRecipW[1] = SIMD_T::div_ps(SIMD_T::set1_ps(1.0f), prim[1].w);
1648
1649         prim[0].v[0] = SIMD_T::mul_ps(prim[0].v[0], vRecipW[0]);
1650         prim[1].v[0] = SIMD_T::mul_ps(prim[1].v[0], vRecipW[1]);
1651
1652         prim[0].v[1] = SIMD_T::mul_ps(prim[0].v[1], vRecipW[0]);
1653         prim[1].v[1] = SIMD_T::mul_ps(prim[1].v[1], vRecipW[1]);
1654
1655         prim[0].v[2] = SIMD_T::mul_ps(prim[0].v[2], vRecipW[0]);
1656         prim[1].v[2] = SIMD_T::mul_ps(prim[1].v[2], vRecipW[1]);
1657
1658         // viewport transform to screen coords
1659         if (state.backendState.readViewportArrayIndex)
1660         {
1661             viewportTransform<2>(prim, state.vpMatrices, viewportIdx);
1662         }
1663         else
1664         {
1665             viewportTransform<2>(prim, state.vpMatrices);
1666         }
1667     }
1668
1669     // adjust for pixel center location
1670     typename SIMD_T::Float offset = SwrPixelOffsets<SIMD_T>::GetOffset(rastState.pixelLocation);
1671
1672     prim[0].x = SIMD_T::add_ps(prim[0].x, offset);
1673     prim[0].y = SIMD_T::add_ps(prim[0].y, offset);
1674
1675     prim[1].x = SIMD_T::add_ps(prim[1].x, offset);
1676     prim[1].y = SIMD_T::add_ps(prim[1].y, offset);
1677
1678     BinPostSetupLinesImpl<SIMD_T, SIMD_WIDTH>(
1679         pDC,
1680         pa,
1681         workerId,
1682         prim,
1683         vRecipW,
1684         primMask,
1685         primID,
1686         viewportIdx);
1687 }
1688
1689 void BinLines(
1690     DRAW_CONTEXT *pDC,
1691     PA_STATE &pa,
1692     uint32_t workerId,
1693     simdvector prim[],
1694     uint32_t primMask,
1695     simdscalari const &primID)
1696 {
1697     BinLinesImpl<SIMD256, KNOB_SIMD_WIDTH>(pDC, pa, workerId, prim, primMask, primID);
1698 }
1699
1700 #if USE_SIMD16_FRONTEND
1701 void SIMDCALL BinLines_simd16(
1702     DRAW_CONTEXT *pDC,
1703     PA_STATE &pa,
1704     uint32_t workerId,
1705     simd16vector prim[3],
1706     uint32_t primMask,
1707     simd16scalari const &primID)
1708 {
1709     BinLinesImpl<SIMD512, KNOB_SIMD16_WIDTH>(pDC, pa, workerId, prim, primMask, primID);
1710 }
1711
1712 #endif