src/gallium/drivers/swr/rasterizer/core/binner.cpp

   1 /****************************************************************************
   2 * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
   3 *
   4 * Permission is hereby granted, free of charge, to any person obtaining a
   5 * copy of this software and associated documentation files (the "Software"),
   6 * to deal in the Software without restriction, including without limitation
   7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8 * and/or sell copies of the Software, and to permit persons to whom the
   9 * Software is furnished to do so, subject to the following conditions:
  10 *
  11 * The above copyright notice and this permission notice (including the next
  12 * paragraph) shall be included in all copies or substantial portions of the
  13 * Software.
  14 *
  15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21 * IN THE SOFTWARE.
  22 *
  23 * @file binner.cpp
  24 *
  25 * @brief Implementation for the macrotile binner
  26 *
  27 ******************************************************************************/
  28
  29 #include "binner.h"
  30 #include "context.h"
  31 #include "frontend.h"
  32 #include "conservativeRast.h"
  33 #include "pa.h"
  34 #include "rasterizer.h"
  35 #include "rdtsc_core.h"
  36 #include "tilemgr.h"
  37
// Function Prototype

//////////////////////////////////////////////////////////////////////////
/// @brief Forward declaration of the post-setup line binner.  Only the
///        prototype is given here; the body is not part of this section
///        of the file.
/// @tparam SIMD_T - SIMD traits type providing the Vec4/Float/Integer types.
/// @tparam SIMD_WIDTH - SIMD lane count (presumably matches SIMD_T; confirm
///         at the definition).
/// @param pDC - pointer to draw context.
/// @param pa - the primitive assembly object.
/// @param workerId - thread's worker id.
/// @param prim - per-vertex position data (array length not visible here).
/// @param recipW - per-vertex Float values; reciprocal W by name -- confirm
///        at the definition.
/// @param primMask - bit mask of primitives to process (assumed one bit per
///        SIMD lane -- TODO confirm).
/// @param primID - primitive ID vector.
/// @param viewportIdx - viewport array index vector.
template <typename SIMD_T, uint32_t SIMD_WIDTH>
void BinPostSetupLinesImpl(
    DRAW_CONTEXT *pDC,
    PA_STATE &pa,
    uint32_t workerId,
    typename SIMD_T::Vec4 prim[],
    typename SIMD_T::Float recipW[],
    uint32_t primMask,
    typename SIMD_T::Integer const &primID,
    typename SIMD_T::Integer const &viewportIdx);
  49
//////////////////////////////////////////////////////////////////////////
/// @brief Forward declaration of the post-setup point binner.  Only the
///        prototype is given here; the body is not part of this section
///        of the file.
/// @tparam SIMD_T - SIMD traits type providing the Vec4/Integer types.
/// @tparam SIMD_WIDTH - SIMD lane count (presumably matches SIMD_T; confirm
///         at the definition).
/// @param pDC - pointer to draw context.
/// @param pa - the primitive assembly object.
/// @param workerId - thread's worker id.
/// @param prim - per-vertex position data (array length not visible here).
/// @param primMask - bit mask of primitives to process (assumed one bit per
///        SIMD lane -- TODO confirm).
/// @param primID - primitive ID vector.
/// @param viewportIdx - viewport array index vector.
template <typename SIMD_T, uint32_t SIMD_WIDTH>
void BinPostSetupPointsImpl(
    DRAW_CONTEXT *pDC,
    PA_STATE &pa,
    uint32_t workerId,
    typename SIMD_T::Vec4 prim[],
    uint32_t primMask,
    typename SIMD_T::Integer const &primID,
    typename SIMD_T::Integer const &viewportIdx);
  59
  60 //////////////////////////////////////////////////////////////////////////
  61 /// @brief Processes attributes for the backend based on linkage mask and
  62 ///        linkage map.  Essentially just doing an SOA->AOS conversion and pack.
  63 /// @param pDC - Draw context
  64 /// @param pa - Primitive Assembly state
  65 /// @param linkageMask - Specifies which VS outputs are routed to PS.
  66 /// @param pLinkageMap - maps VS attribute slot to PS slot
  67 /// @param triIndex - Triangle to process attributes for
  68 /// @param pBuffer - Output result
  69 template<typename NumVertsT, typename IsSwizzledT, typename HasConstantInterpT, typename IsDegenerate>
  70 INLINE void ProcessAttributes(
  71     DRAW_CONTEXT *pDC,
  72     PA_STATE&pa,
  73     uint32_t triIndex,
  74     uint32_t primId,
  75     float *pBuffer)
  76 {
  77     static_assert(NumVertsT::value > 0 && NumVertsT::value <= 3, "Invalid value for NumVertsT");
  78     const SWR_BACKEND_STATE& backendState = pDC->pState->state.backendState;
  79     // Conservative Rasterization requires degenerate tris to have constant attribute interpolation
  80     uint32_t constantInterpMask = IsDegenerate::value ? 0xFFFFFFFF : backendState.constantInterpolationMask;
  81     const uint32_t provokingVertex = pDC->pState->state.frontendState.topologyProvokingVertex;
  82     const PRIMITIVE_TOPOLOGY topo = pDC->pState->state.topology;
  83
  84     static const float constTable[3][4] = {
  85         { 0.0f, 0.0f, 0.0f, 0.0f },
  86         { 0.0f, 0.0f, 0.0f, 1.0f },
  87         { 1.0f, 1.0f, 1.0f, 1.0f }
  88     };
  89
  90     for (uint32_t i = 0; i < backendState.numAttributes; ++i)
  91     {
  92         uint32_t inputSlot;
  93         if (IsSwizzledT::value)
  94         {
  95             SWR_ATTRIB_SWIZZLE attribSwizzle = backendState.swizzleMap[i];
  96             inputSlot = backendState.vertexAttribOffset + attribSwizzle.sourceAttrib;
  97
  98         }
  99         else
 100         {
 101             inputSlot = backendState.vertexAttribOffset + i;
 102         }
 103
 104         simd4scalar attrib[3];    // triangle attribs (always 4 wide)
 105         float* pAttribStart = pBuffer;
 106
 107         if (HasConstantInterpT::value || IsDegenerate::value)
 108         {
 109             if (CheckBit(constantInterpMask, i))
 110             {
 111                 uint32_t vid;
 112                 uint32_t adjustedTriIndex;
 113                 static const uint32_t tristripProvokingVertex[] = { 0, 2, 1 };
 114                 static const int32_t quadProvokingTri[2][4] = { { 0, 0, 0, 1 },{ 0, -1, 0, 0 } };
 115                 static const uint32_t quadProvokingVertex[2][4] = { { 0, 1, 2, 2 },{ 0, 1, 1, 2 } };
 116                 static const int32_t qstripProvokingTri[2][4] = { { 0, 0, 0, 1 },{ -1, 0, 0, 0 } };
 117                 static const uint32_t qstripProvokingVertex[2][4] = { { 0, 1, 2, 1 },{ 0, 0, 2, 1 } };
 118
 119                 switch (topo) {
 120                 case TOP_QUAD_LIST:
 121                     adjustedTriIndex = triIndex + quadProvokingTri[triIndex & 1][provokingVertex];
 122                     vid = quadProvokingVertex[triIndex & 1][provokingVertex];
 123                     break;
 124                 case TOP_QUAD_STRIP:
 125                     adjustedTriIndex = triIndex + qstripProvokingTri[triIndex & 1][provokingVertex];
 126                     vid = qstripProvokingVertex[triIndex & 1][provokingVertex];
 127                     break;
 128                 case TOP_TRIANGLE_STRIP:
 129                     adjustedTriIndex = triIndex;
 130                     vid = (triIndex & 1)
 131                         ? tristripProvokingVertex[provokingVertex]
 132                         : provokingVertex;
 133                     break;
 134                 default:
 135                     adjustedTriIndex = triIndex;
 136                     vid = provokingVertex;
 137                     break;
 138                 }
 139
 140                 pa.AssembleSingle(inputSlot, adjustedTriIndex, attrib);
 141
 142                 for (uint32_t i = 0; i < NumVertsT::value; ++i)
 143                 {
 144                     SIMD128::store_ps(pBuffer, attrib[vid]);
 145                     pBuffer += 4;
 146                 }
 147             }
 148             else
 149             {
 150                 pa.AssembleSingle(inputSlot, triIndex, attrib);
 151
 152                 for (uint32_t i = 0; i < NumVertsT::value; ++i)
 153                 {
 154                     SIMD128::store_ps(pBuffer, attrib[i]);
 155                     pBuffer += 4;
 156                 }
 157             }
 158         }
 159         else
 160         {
 161             pa.AssembleSingle(inputSlot, triIndex, attrib);
 162
 163             for (uint32_t i = 0; i < NumVertsT::value; ++i)
 164             {
 165                 SIMD128::store_ps(pBuffer, attrib[i]);
 166                 pBuffer += 4;
 167             }
 168         }
 169
 170         // pad out the attrib buffer to 3 verts to ensure the triangle
 171         // interpolation code in the pixel shader works correctly for the
 172         // 3 topologies - point, line, tri.  This effectively zeros out the
 173         // effect of the missing vertices in the triangle interpolation.
 174         for (uint32_t v = NumVertsT::value; v < 3; ++v)
 175         {
 176             SIMD128::store_ps(pBuffer, attrib[NumVertsT::value - 1]);
 177             pBuffer += 4;
 178         }
 179
 180         // check for constant source overrides
 181         if (IsSwizzledT::value)
 182         {
 183             uint32_t mask = backendState.swizzleMap[i].componentOverrideMask;
 184             if (mask)
 185             {
 186                 DWORD comp;
 187                 while (_BitScanForward(&comp, mask))
 188                 {
 189                     mask &= ~(1 << comp);
 190
 191                     float constantValue = 0.0f;
 192                     switch ((SWR_CONSTANT_SOURCE)backendState.swizzleMap[i].constantSource)
 193                     {
 194                     case SWR_CONSTANT_SOURCE_CONST_0000:
 195                     case SWR_CONSTANT_SOURCE_CONST_0001_FLOAT:
 196                     case SWR_CONSTANT_SOURCE_CONST_1111_FLOAT:
 197                         constantValue = constTable[backendState.swizzleMap[i].constantSource][comp];
 198                         break;
 199                     case SWR_CONSTANT_SOURCE_PRIM_ID:
 200                         constantValue = *(float*)&primId;
 201                         break;
 202                     }
 203
 204                     // apply constant value to all 3 vertices
 205                     for (uint32_t v = 0; v < 3; ++v)
 206                     {
 207                         pAttribStart[comp + v * 4] = constantValue;
 208                     }
 209                 }
 210             }
 211         }
 212     }
 213 }
 214
 215 //////////////////////////////////////////////////////////////////////////
 216 /// @brief  Gather scissor rect data based on per-prim viewport indices.
 217 /// @param pScissorsInFixedPoint - array of scissor rects in 16.8 fixed point.
 218 /// @param pViewportIndex - array of per-primitive vewport indexes.
 219 /// @param scisXmin - output vector of per-prmitive scissor rect Xmin data.
 220 /// @param scisYmin - output vector of per-prmitive scissor rect Ymin data.
 221 /// @param scisXmax - output vector of per-prmitive scissor rect Xmax data.
 222 /// @param scisYmax - output vector of per-prmitive scissor rect Ymax data.
 223 //
 224 /// @todo:  Look at speeding this up -- weigh against corresponding costs in rasterizer.
 225 static void GatherScissors(const SWR_RECT *pScissorsInFixedPoint, const uint32_t *pViewportIndex,
 226     simdscalari &scisXmin, simdscalari &scisYmin, simdscalari &scisXmax, simdscalari &scisYmax)
 227 {
 228     scisXmin = _simd_set_epi32(
 229         pScissorsInFixedPoint[pViewportIndex[0]].xmin,
 230         pScissorsInFixedPoint[pViewportIndex[1]].xmin,
 231         pScissorsInFixedPoint[pViewportIndex[2]].xmin,
 232         pScissorsInFixedPoint[pViewportIndex[3]].xmin,
 233         pScissorsInFixedPoint[pViewportIndex[4]].xmin,
 234         pScissorsInFixedPoint[pViewportIndex[5]].xmin,
 235         pScissorsInFixedPoint[pViewportIndex[6]].xmin,
 236         pScissorsInFixedPoint[pViewportIndex[7]].xmin);
 237     scisYmin = _simd_set_epi32(
 238         pScissorsInFixedPoint[pViewportIndex[0]].ymin,
 239         pScissorsInFixedPoint[pViewportIndex[1]].ymin,
 240         pScissorsInFixedPoint[pViewportIndex[2]].ymin,
 241         pScissorsInFixedPoint[pViewportIndex[3]].ymin,
 242         pScissorsInFixedPoint[pViewportIndex[4]].ymin,
 243         pScissorsInFixedPoint[pViewportIndex[5]].ymin,
 244         pScissorsInFixedPoint[pViewportIndex[6]].ymin,
 245         pScissorsInFixedPoint[pViewportIndex[7]].ymin);
 246     scisXmax = _simd_set_epi32(
 247         pScissorsInFixedPoint[pViewportIndex[0]].xmax,
 248         pScissorsInFixedPoint[pViewportIndex[1]].xmax,
 249         pScissorsInFixedPoint[pViewportIndex[2]].xmax,
 250         pScissorsInFixedPoint[pViewportIndex[3]].xmax,
 251         pScissorsInFixedPoint[pViewportIndex[4]].xmax,
 252         pScissorsInFixedPoint[pViewportIndex[5]].xmax,
 253         pScissorsInFixedPoint[pViewportIndex[6]].xmax,
 254         pScissorsInFixedPoint[pViewportIndex[7]].xmax);
 255     scisYmax = _simd_set_epi32(
 256         pScissorsInFixedPoint[pViewportIndex[0]].ymax,
 257         pScissorsInFixedPoint[pViewportIndex[1]].ymax,
 258         pScissorsInFixedPoint[pViewportIndex[2]].ymax,
 259         pScissorsInFixedPoint[pViewportIndex[3]].ymax,
 260         pScissorsInFixedPoint[pViewportIndex[4]].ymax,
 261         pScissorsInFixedPoint[pViewportIndex[5]].ymax,
 262         pScissorsInFixedPoint[pViewportIndex[6]].ymax,
 263         pScissorsInFixedPoint[pViewportIndex[7]].ymax);
 264 }
 265
 266 static void GatherScissors(const SWR_RECT *pScissorsInFixedPoint, const uint32_t *pViewportIndex,
 267     simd16scalari &scisXmin, simd16scalari &scisYmin, simd16scalari &scisXmax, simd16scalari &scisYmax)
 268 {
 269     scisXmin = _simd16_set_epi32(
 270         pScissorsInFixedPoint[pViewportIndex[0]].xmin,
 271         pScissorsInFixedPoint[pViewportIndex[1]].xmin,
 272         pScissorsInFixedPoint[pViewportIndex[2]].xmin,
 273         pScissorsInFixedPoint[pViewportIndex[3]].xmin,
 274         pScissorsInFixedPoint[pViewportIndex[4]].xmin,
 275         pScissorsInFixedPoint[pViewportIndex[5]].xmin,
 276         pScissorsInFixedPoint[pViewportIndex[6]].xmin,
 277         pScissorsInFixedPoint[pViewportIndex[7]].xmin,
 278         pScissorsInFixedPoint[pViewportIndex[8]].xmin,
 279         pScissorsInFixedPoint[pViewportIndex[9]].xmin,
 280         pScissorsInFixedPoint[pViewportIndex[10]].xmin,
 281         pScissorsInFixedPoint[pViewportIndex[11]].xmin,
 282         pScissorsInFixedPoint[pViewportIndex[12]].xmin,
 283         pScissorsInFixedPoint[pViewportIndex[13]].xmin,
 284         pScissorsInFixedPoint[pViewportIndex[14]].xmin,
 285         pScissorsInFixedPoint[pViewportIndex[15]].xmin);
 286
 287     scisYmin = _simd16_set_epi32(
 288         pScissorsInFixedPoint[pViewportIndex[0]].ymin,
 289         pScissorsInFixedPoint[pViewportIndex[1]].ymin,
 290         pScissorsInFixedPoint[pViewportIndex[2]].ymin,
 291         pScissorsInFixedPoint[pViewportIndex[3]].ymin,
 292         pScissorsInFixedPoint[pViewportIndex[4]].ymin,
 293         pScissorsInFixedPoint[pViewportIndex[5]].ymin,
 294         pScissorsInFixedPoint[pViewportIndex[6]].ymin,
 295         pScissorsInFixedPoint[pViewportIndex[7]].ymin,
 296         pScissorsInFixedPoint[pViewportIndex[8]].ymin,
 297         pScissorsInFixedPoint[pViewportIndex[9]].ymin,
 298         pScissorsInFixedPoint[pViewportIndex[10]].ymin,
 299         pScissorsInFixedPoint[pViewportIndex[11]].ymin,
 300         pScissorsInFixedPoint[pViewportIndex[12]].ymin,
 301         pScissorsInFixedPoint[pViewportIndex[13]].ymin,
 302         pScissorsInFixedPoint[pViewportIndex[14]].ymin,
 303         pScissorsInFixedPoint[pViewportIndex[15]].ymin);
 304
 305     scisXmax = _simd16_set_epi32(
 306         pScissorsInFixedPoint[pViewportIndex[0]].xmax,
 307         pScissorsInFixedPoint[pViewportIndex[1]].xmax,
 308         pScissorsInFixedPoint[pViewportIndex[2]].xmax,
 309         pScissorsInFixedPoint[pViewportIndex[3]].xmax,
 310         pScissorsInFixedPoint[pViewportIndex[4]].xmax,
 311         pScissorsInFixedPoint[pViewportIndex[5]].xmax,
 312         pScissorsInFixedPoint[pViewportIndex[6]].xmax,
 313         pScissorsInFixedPoint[pViewportIndex[7]].xmax,
 314         pScissorsInFixedPoint[pViewportIndex[8]].xmax,
 315         pScissorsInFixedPoint[pViewportIndex[9]].xmax,
 316         pScissorsInFixedPoint[pViewportIndex[10]].xmax,
 317         pScissorsInFixedPoint[pViewportIndex[11]].xmax,
 318         pScissorsInFixedPoint[pViewportIndex[12]].xmax,
 319         pScissorsInFixedPoint[pViewportIndex[13]].xmax,
 320         pScissorsInFixedPoint[pViewportIndex[14]].xmax,
 321         pScissorsInFixedPoint[pViewportIndex[15]].xmax);
 322
 323     scisYmax = _simd16_set_epi32(
 324         pScissorsInFixedPoint[pViewportIndex[0]].ymax,
 325         pScissorsInFixedPoint[pViewportIndex[1]].ymax,
 326         pScissorsInFixedPoint[pViewportIndex[2]].ymax,
 327         pScissorsInFixedPoint[pViewportIndex[3]].ymax,
 328         pScissorsInFixedPoint[pViewportIndex[4]].ymax,
 329         pScissorsInFixedPoint[pViewportIndex[5]].ymax,
 330         pScissorsInFixedPoint[pViewportIndex[6]].ymax,
 331         pScissorsInFixedPoint[pViewportIndex[7]].ymax,
 332         pScissorsInFixedPoint[pViewportIndex[8]].ymax,
 333         pScissorsInFixedPoint[pViewportIndex[9]].ymax,
 334         pScissorsInFixedPoint[pViewportIndex[10]].ymax,
 335         pScissorsInFixedPoint[pViewportIndex[11]].ymax,
 336         pScissorsInFixedPoint[pViewportIndex[12]].ymax,
 337         pScissorsInFixedPoint[pViewportIndex[13]].ymax,
 338         pScissorsInFixedPoint[pViewportIndex[14]].ymax,
 339         pScissorsInFixedPoint[pViewportIndex[15]].ymax);
 340 }
 341
 342 typedef void(*PFN_PROCESS_ATTRIBUTES)(DRAW_CONTEXT*, PA_STATE&, uint32_t, uint32_t, float*);
 343
 344 struct ProcessAttributesChooser
 345 {
 346     typedef PFN_PROCESS_ATTRIBUTES FuncType;
 347
 348     template <typename... ArgsB>
 349     static FuncType GetFunc()
 350     {
 351         return ProcessAttributes<ArgsB...>;
 352     }
 353 };
 354
 355 PFN_PROCESS_ATTRIBUTES GetProcessAttributesFunc(uint32_t NumVerts, bool IsSwizzled, bool HasConstantInterp, bool IsDegenerate = false)
 356 {
 357     return TemplateArgUnroller<ProcessAttributesChooser>::GetFunc(IntArg<1, 3>{NumVerts}, IsSwizzled, HasConstantInterp, IsDegenerate);
 358 }
 359
 360 //////////////////////////////////////////////////////////////////////////
 361 /// @brief Processes enabled user clip distances. Loads the active clip
 362 ///        distances from the PA, sets up barycentric equations, and
 363 ///        stores the results to the output buffer
 364 /// @param pa - Primitive Assembly state
 365 /// @param primIndex - primitive index to process
 366 /// @param clipDistMask - mask of enabled clip distances
 367 /// @param pUserClipBuffer - buffer to store results
 368 template<uint32_t NumVerts>
 369 void ProcessUserClipDist(const SWR_BACKEND_STATE& state, PA_STATE& pa, uint32_t primIndex, float *pRecipW, float* pUserClipBuffer)
 370 {
 371     DWORD clipDist;
 372     uint32_t clipDistMask = state.clipDistanceMask;
 373     while (_BitScanForward(&clipDist, clipDistMask))
 374     {
 375         clipDistMask &= ~(1 << clipDist);
 376         uint32_t clipSlot = clipDist >> 2;
 377         uint32_t clipComp = clipDist & 0x3;
 378         uint32_t clipAttribSlot = clipSlot == 0 ?
 379             state.vertexClipCullOffset : state.vertexClipCullOffset + 1;
 380
 381         simd4scalar primClipDist[3];
 382         pa.AssembleSingle(clipAttribSlot, primIndex, primClipDist);
 383
 384         float vertClipDist[NumVerts];
 385         for (uint32_t e = 0; e < NumVerts; ++e)
 386         {
 387             OSALIGNSIMD(float) aVertClipDist[4];
 388             SIMD128::store_ps(aVertClipDist, primClipDist[e]);
 389             vertClipDist[e] = aVertClipDist[clipComp];
 390         };
 391
 392         // setup plane equations for barycentric interpolation in the backend
 393         float baryCoeff[NumVerts];
 394         float last = vertClipDist[NumVerts - 1] * pRecipW[NumVerts - 1];
 395         for (uint32_t e = 0; e < NumVerts - 1; ++e)
 396         {
 397             baryCoeff[e] = vertClipDist[e] * pRecipW[e] - last;
 398         }
 399         baryCoeff[NumVerts - 1] = last;
 400
 401         for (uint32_t e = 0; e < NumVerts; ++e)
 402         {
 403             *(pUserClipBuffer++) = baryCoeff[e];
 404         }
 405     }
 406 }
 407
/// @brief 8-wide overload: forwards the three source vectors to
///        vTranspose3x8, which fills the 8-entry simd4scalar array dst.
/// @param dst - output array of 8 simd4scalar values.
/// @param src0 - first source vector.
/// @param src1 - second source vector.
/// @param src2 - third source vector.
INLINE
void TransposeVertices(simd4scalar(&dst)[8], const simdscalar &src0, const simdscalar &src1, const simdscalar &src2)
{
    vTranspose3x8(dst, src0, src1, src2);
}
 413
/// @brief 16-wide overload: forwards to vTranspose4x16, passing a zero
///        vector as the fourth source.
/// @param dst - output array of 16 simd4scalar values.
/// @param src0 - first source vector.
/// @param src1 - second source vector.
/// @param src2 - third source vector.
INLINE
void TransposeVertices(simd4scalar(&dst)[16], const simd16scalar &src0, const simd16scalar &src1, const simd16scalar &src2)
{
    // dst is reinterpreted as an array of 4 simd16scalar for the callee
    vTranspose4x16(reinterpret_cast<simd16scalar(&)[4]>(dst), src0, src1, src2, _simd16_setzero_ps());
}
 419
 420 //////////////////////////////////////////////////////////////////////////
 421 /// @brief Bin triangle primitives to macro tiles. Performs setup, clipping
 422 ///        culling, viewport transform, etc.
 423 /// @param pDC - pointer to draw context.
 424 /// @param pa - The primitive assembly object.
/// @param workerId - thread's worker id. Every thread has a unique id.
 426 /// @param tri - Contains triangle position data for SIMDs worth of triangles.
 427 /// @param primID - Primitive ID for each triangle.
 428 /// @param viewportIdx - viewport array index for each triangle.
 429 /// @tparam CT - ConservativeRastFETraits
 430 template <typename SIMD_T, uint32_t SIMD_WIDTH, typename CT>
 431 void SIMDCALL BinTrianglesImpl(
 432     DRAW_CONTEXT *pDC,
 433     PA_STATE &pa,
 434     uint32_t workerId,
 435     typename SIMD_T::Vec4 tri[3],
 436     uint32_t triMask,
 437     typename SIMD_T::Integer const &primID)
 438 {
 439     SWR_CONTEXT *pContext = pDC->pContext;
 440
 441     AR_BEGIN(FEBinTriangles, pDC->drawId);
 442
 443     const API_STATE& state = GetApiState(pDC);
 444     const SWR_RASTSTATE& rastState = state.rastState;
 445     const SWR_FRONTEND_STATE& feState = state.frontendState;
 446
 447     MacroTileMgr *pTileMgr = pDC->pTileMgr;
 448
 449     typename SIMD_T::Float vRecipW0 = SIMD_T::set1_ps(1.0f);
 450     typename SIMD_T::Float vRecipW1 = SIMD_T::set1_ps(1.0f);
 451     typename SIMD_T::Float vRecipW2 = SIMD_T::set1_ps(1.0f);
 452
 453     typename SIMD_T::Integer viewportIdx = SIMD_T::setzero_si();
 454     typename SIMD_T::Vec4 vpiAttrib[3];
 455     typename SIMD_T::Integer vpai = SIMD_T::setzero_si();
 456
 457     if (state.backendState.readViewportArrayIndex)
 458     {
 459         pa.Assemble(VERTEX_SGV_SLOT, vpiAttrib);
 460
 461         vpai = SIMD_T::castps_si(vpiAttrib[0][VERTEX_SGV_VAI_COMP]);
 462     }
 463
 464
 465     if (state.backendState.readViewportArrayIndex) // VPAIOffsets are guaranteed 0-15 -- no OOB issues if they are offsets from 0
 466     {
 467         // OOB indices => forced to zero.
 468         vpai = SIMD_T::max_epi32(vpai, SIMD_T::setzero_si());
 469         typename SIMD_T::Integer vNumViewports = SIMD_T::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
 470         typename SIMD_T::Integer vClearMask = SIMD_T::cmplt_epi32(vpai, vNumViewports);
 471         viewportIdx = SIMD_T::and_si(vClearMask, vpai);
 472     }
 473     else
 474     {
 475         viewportIdx = vpai;
 476     }
 477
 478     if (feState.vpTransformDisable)
 479     {
 480         // RHW is passed in directly when VP transform is disabled
 481         vRecipW0 = tri[0].v[3];
 482         vRecipW1 = tri[1].v[3];
 483         vRecipW2 = tri[2].v[3];
 484     }
 485     else
 486     {
 487         // Perspective divide
 488         vRecipW0 = SIMD_T::div_ps(SIMD_T::set1_ps(1.0f), tri[0].w);
 489         vRecipW1 = SIMD_T::div_ps(SIMD_T::set1_ps(1.0f), tri[1].w);
 490         vRecipW2 = SIMD_T::div_ps(SIMD_T::set1_ps(1.0f), tri[2].w);
 491
 492         tri[0].v[0] = SIMD_T::mul_ps(tri[0].v[0], vRecipW0);
 493         tri[1].v[0] = SIMD_T::mul_ps(tri[1].v[0], vRecipW1);
 494         tri[2].v[0] = SIMD_T::mul_ps(tri[2].v[0], vRecipW2);
 495
 496         tri[0].v[1] = SIMD_T::mul_ps(tri[0].v[1], vRecipW0);
 497         tri[1].v[1] = SIMD_T::mul_ps(tri[1].v[1], vRecipW1);
 498         tri[2].v[1] = SIMD_T::mul_ps(tri[2].v[1], vRecipW2);
 499
 500         tri[0].v[2] = SIMD_T::mul_ps(tri[0].v[2], vRecipW0);
 501         tri[1].v[2] = SIMD_T::mul_ps(tri[1].v[2], vRecipW1);
 502         tri[2].v[2] = SIMD_T::mul_ps(tri[2].v[2], vRecipW2);
 503
 504         // Viewport transform to screen space coords
 505         if (state.backendState.readViewportArrayIndex)
 506         {
 507             viewportTransform<3>(tri, state.vpMatrices, viewportIdx);
 508         }
 509         else
 510         {
 511             viewportTransform<3>(tri, state.vpMatrices);
 512         }
 513     }
 514
 515     // Adjust for pixel center location
 516     typename SIMD_T::Float offset = SwrPixelOffsets<SIMD_T>::GetOffset(rastState.pixelLocation);
 517
 518     tri[0].x = SIMD_T::add_ps(tri[0].x, offset);
 519     tri[0].y = SIMD_T::add_ps(tri[0].y, offset);
 520
 521     tri[1].x = SIMD_T::add_ps(tri[1].x, offset);
 522     tri[1].y = SIMD_T::add_ps(tri[1].y, offset);
 523
 524     tri[2].x = SIMD_T::add_ps(tri[2].x, offset);
 525     tri[2].y = SIMD_T::add_ps(tri[2].y, offset);
 526
 527     // Set vXi, vYi to required fixed point precision
 528     typename SIMD_T::Integer vXi[3], vYi[3];
 529     FPToFixedPoint<SIMD_T>(tri, vXi, vYi);
 530
 531     // triangle setup
 532     typename SIMD_T::Integer vAi[3], vBi[3];
 533     triangleSetupABIntVertical(vXi, vYi, vAi, vBi);
 534
 535     // determinant
 536     typename SIMD_T::Integer vDet[2];
 537     calcDeterminantIntVertical(vAi, vBi, vDet);
 538
 539     // cull zero area
 540     uint32_t maskLo = SIMD_T::movemask_pd(SIMD_T::castsi_pd(SIMD_T::cmpeq_epi64(vDet[0], SIMD_T::setzero_si())));
 541     uint32_t maskHi = SIMD_T::movemask_pd(SIMD_T::castsi_pd(SIMD_T::cmpeq_epi64(vDet[1], SIMD_T::setzero_si())));
 542
 543     uint32_t cullZeroAreaMask = maskLo | (maskHi << (SIMD_WIDTH / 2));
 544
 545     // don't cull degenerate triangles if we're conservatively rasterizing
 546     uint32_t origTriMask = triMask;
 547     if (rastState.fillMode == SWR_FILLMODE_SOLID && !CT::IsConservativeT::value)
 548     {
 549         triMask &= ~cullZeroAreaMask;
 550     }
 551
 552     // determine front winding tris
 553     // CW  +det
 554     // CCW det < 0;
 555     // 0 area triangles are marked as backfacing regardless of winding order,
 556     // which is required behavior for conservative rast and wireframe rendering
 557     uint32_t frontWindingTris;
 558     if (rastState.frontWinding == SWR_FRONTWINDING_CW)
 559     {
 560         maskLo = SIMD_T::movemask_pd(SIMD_T::castsi_pd(SIMD_T::cmpgt_epi64(vDet[0], SIMD_T::setzero_si())));
 561         maskHi = SIMD_T::movemask_pd(SIMD_T::castsi_pd(SIMD_T::cmpgt_epi64(vDet[1], SIMD_T::setzero_si())));
 562     }
 563     else
 564     {
 565         maskLo = SIMD_T::movemask_pd(SIMD_T::castsi_pd(SIMD_T::cmpgt_epi64(SIMD_T::setzero_si(), vDet[0])));
 566         maskHi = SIMD_T::movemask_pd(SIMD_T::castsi_pd(SIMD_T::cmpgt_epi64(SIMD_T::setzero_si(), vDet[1])));
 567     }
 568     frontWindingTris = maskLo | (maskHi << (SIMD_WIDTH / 2));
 569
 570     // cull
 571     uint32_t cullTris;
 572     switch ((SWR_CULLMODE)rastState.cullMode)
 573     {
 574     case SWR_CULLMODE_BOTH:  cullTris = 0xffffffff; break;
 575     case SWR_CULLMODE_NONE:  cullTris = 0x0; break;
 576     case SWR_CULLMODE_FRONT: cullTris = frontWindingTris; break;
 577         // 0 area triangles are marked as backfacing, which is required behavior for conservative rast
 578     case SWR_CULLMODE_BACK:  cullTris = ~frontWindingTris; break;
 579     default: SWR_INVALID("Invalid cull mode: %d", rastState.cullMode); cullTris = 0x0; break;
 580     }
 581
 582     triMask &= ~cullTris;
 583
 584     if (origTriMask ^ triMask)
 585     {
 586         RDTSC_EVENT(FECullZeroAreaAndBackface, _mm_popcnt_u32(origTriMask ^ triMask), 0);
 587     }
 588
    /// Note: these variable initializations must stay above any 'goto endBinTriangles'
 590     // compute per tri backface
 591     uint32_t frontFaceMask = frontWindingTris;
 592     uint32_t *pPrimID = (uint32_t *)&primID;
 593     const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx;
 594     DWORD triIndex = 0;
 595
 596     uint32_t edgeEnable;
 597     PFN_WORK_FUNC pfnWork;
 598     if (CT::IsConservativeT::value)
 599     {
 600         // determine which edges of the degenerate tri, if any, are valid to rasterize.
 601         // used to call the appropriate templated rasterizer function
 602         if (cullZeroAreaMask > 0)
 603         {
 604             // e0 = v1-v0
 605             const typename SIMD_T::Integer x0x1Mask = SIMD_T::cmpeq_epi32(vXi[0], vXi[1]);
 606             const typename SIMD_T::Integer y0y1Mask = SIMD_T::cmpeq_epi32(vYi[0], vYi[1]);
 607
 608             uint32_t e0Mask = SIMD_T::movemask_ps(SIMD_T::castsi_ps(SIMD_T::and_si(x0x1Mask, y0y1Mask)));
 609
 610             // e1 = v2-v1
 611             const typename SIMD_T::Integer x1x2Mask = SIMD_T::cmpeq_epi32(vXi[1], vXi[2]);
 612             const typename SIMD_T::Integer y1y2Mask = SIMD_T::cmpeq_epi32(vYi[1], vYi[2]);
 613
 614             uint32_t e1Mask = SIMD_T::movemask_ps(SIMD_T::castsi_ps(SIMD_T::and_si(x1x2Mask, y1y2Mask)));
 615
 616             // e2 = v0-v2
 617             // if v0 == v1 & v1 == v2, v0 == v2
 618             uint32_t e2Mask = e0Mask & e1Mask;
 619             SWR_ASSERT(KNOB_SIMD_WIDTH == 8, "Need to update degenerate mask code for avx512");
 620
 621             // edge order: e0 = v0v1, e1 = v1v2, e2 = v0v2
 622             // 32 bit binary: 0000 0000 0010 0100 1001 0010 0100 1001
 623             e0Mask = pdep_u32(e0Mask, 0x00249249);
 624
 625             // 32 bit binary: 0000 0000 0100 1001 0010 0100 1001 0010
 626             e1Mask = pdep_u32(e1Mask, 0x00492492);
 627
 628             // 32 bit binary: 0000 0000 1001 0010 0100 1001 0010 0100
 629             e2Mask = pdep_u32(e2Mask, 0x00924924);
 630
 631             edgeEnable = (0x00FFFFFF & (~(e0Mask | e1Mask | e2Mask)));
 632         }
 633         else
 634         {
 635             edgeEnable = 0x00FFFFFF;
 636         }
 637     }
 638     else
 639     {
 640         // degenerate triangles won't be sent to rasterizer; just enable all edges
 641         pfnWork = GetRasterizerFunc(rastState.sampleCount, rastState.bIsCenterPattern, (rastState.conservativeRast > 0),
 642             (SWR_INPUT_COVERAGE)pDC->pState->state.psState.inputCoverage, EdgeValToEdgeState(ALL_EDGES_VALID), (state.scissorsTileAligned == false));
 643     }
 644
 645     SIMDBBOX_T<SIMD_T> bbox;
 646
 647     if (!triMask)
 648     {
 649         goto endBinTriangles;
 650     }
 651
 652     // Calc bounding box of triangles
 653     calcBoundingBoxIntVertical<SIMD_T, CT>(vXi, vYi, bbox);
 654
 655     // determine if triangle falls between pixel centers and discard
 656     // only discard for non-MSAA case and when conservative rast is disabled
 657     // (xmin + 127) & ~255
 658     // (xmax + 128) & ~255
 659     if ((rastState.sampleCount == SWR_MULTISAMPLE_1X || rastState.bIsCenterPattern) &&
 660         (!CT::IsConservativeT::value))
 661     {
 662         origTriMask = triMask;
 663
 664         int cullCenterMask;
 665
 666         {
 667             typename SIMD_T::Integer xmin = SIMD_T::add_epi32(bbox.xmin, SIMD_T::set1_epi32(127));
 668             xmin = SIMD_T::and_si(xmin, SIMD_T::set1_epi32(~255));
 669             typename SIMD_T::Integer xmax = SIMD_T::add_epi32(bbox.xmax, SIMD_T::set1_epi32(128));
 670             xmax = SIMD_T::and_si(xmax, SIMD_T::set1_epi32(~255));
 671
 672             typename SIMD_T::Integer vMaskH = SIMD_T::cmpeq_epi32(xmin, xmax);
 673
 674             typename SIMD_T::Integer ymin = SIMD_T::add_epi32(bbox.ymin, SIMD_T::set1_epi32(127));
 675             ymin = SIMD_T::and_si(ymin, SIMD_T::set1_epi32(~255));
 676             typename SIMD_T::Integer ymax = SIMD_T::add_epi32(bbox.ymax, SIMD_T::set1_epi32(128));
 677             ymax = SIMD_T::and_si(ymax, SIMD_T::set1_epi32(~255));
 678
 679             typename SIMD_T::Integer vMaskV = SIMD_T::cmpeq_epi32(ymin, ymax);
 680
 681             vMaskV = SIMD_T::or_si(vMaskH, vMaskV);
 682             cullCenterMask = SIMD_T::movemask_ps(SIMD_T::castsi_ps(vMaskV));
 683         }
 684
 685         triMask &= ~cullCenterMask;
 686
 687         if (origTriMask ^ triMask)
 688         {
 689             RDTSC_EVENT(FECullBetweenCenters, _mm_popcnt_u32(origTriMask ^ triMask), 0);
 690         }
 691     }
 692
 693     // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
 694     // Gather the AOS effective scissor rects based on the per-prim VP index.
 695     /// @todo:  Look at speeding this up -- weigh against corresponding costs in rasterizer.
 696     {
 697         typename SIMD_T::Integer scisXmin, scisYmin, scisXmax, scisYmax;
 698
 699         if (state.backendState.readViewportArrayIndex)
 700         {
 701             GatherScissors(&state.scissorsInFixedPoint[0], pViewportIndex, scisXmin, scisYmin, scisXmax, scisYmax);
 702         }
 703         else // broadcast fast path for non-VPAI case.
 704         {
 705             scisXmin = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].xmin);
 706             scisYmin = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].ymin);
 707             scisXmax = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].xmax);
 708             scisYmax = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].ymax);
 709         }
 710
 711         // Make triangle bbox inclusive
 712         bbox.xmax = SIMD_T::sub_epi32(bbox.xmax, SIMD_T::set1_epi32(1));
 713         bbox.ymax = SIMD_T::sub_epi32(bbox.ymax, SIMD_T::set1_epi32(1));
 714
 715         bbox.xmin = SIMD_T::max_epi32(bbox.xmin, scisXmin);
 716         bbox.ymin = SIMD_T::max_epi32(bbox.ymin, scisYmin);
 717         bbox.xmax = SIMD_T::min_epi32(bbox.xmax, scisXmax);
 718         bbox.ymax = SIMD_T::min_epi32(bbox.ymax, scisYmax);
 719     }
 720
 721     if (CT::IsConservativeT::value)
 722     {
 723         // in the case where a degenerate triangle is on a scissor edge, we need to make sure the primitive bbox has
 724         // some area. Bump the xmax/ymax edges out
 725
 726         typename SIMD_T::Integer topEqualsBottom = SIMD_T::cmpeq_epi32(bbox.ymin, bbox.ymax);
 727         bbox.ymax = SIMD_T::blendv_epi32(bbox.ymax, SIMD_T::add_epi32(bbox.ymax, SIMD_T::set1_epi32(1)), topEqualsBottom);
 728
 729         typename SIMD_T::Integer leftEqualsRight = SIMD_T::cmpeq_epi32(bbox.xmin, bbox.xmax);
 730         bbox.xmax = SIMD_T::blendv_epi32(bbox.xmax, SIMD_T::add_epi32(bbox.xmax, SIMD_T::set1_epi32(1)), leftEqualsRight);
 731     }
 732
 733     // Cull tris completely outside scissor
 734     {
 735         typename SIMD_T::Integer maskOutsideScissorX = SIMD_T::cmpgt_epi32(bbox.xmin, bbox.xmax);
 736         typename SIMD_T::Integer maskOutsideScissorY = SIMD_T::cmpgt_epi32(bbox.ymin, bbox.ymax);
 737         typename SIMD_T::Integer maskOutsideScissorXY = SIMD_T::or_si(maskOutsideScissorX, maskOutsideScissorY);
 738         uint32_t maskOutsideScissor = SIMD_T::movemask_ps(SIMD_T::castsi_ps(maskOutsideScissorXY));
 739         triMask = triMask & ~maskOutsideScissor;
 740     }
 741
 742 endBinTriangles:
 743
 744
 745     // Send surviving triangles to the line or point binner based on fill mode
 746     if (rastState.fillMode == SWR_FILLMODE_WIREFRAME)
 747     {
 748         // Simple non-conformant wireframe mode, useful for debugging
 749         // construct 3 SIMD lines out of the triangle and call the line binner for each SIMD
 750         typename SIMD_T::Vec4 line[2];
 751         typename SIMD_T::Float recipW[2];
 752
 753         line[0] = tri[0];
 754         line[1] = tri[1];
 755         recipW[0] = vRecipW0;
 756         recipW[1] = vRecipW1;
 757
 758         BinPostSetupLinesImpl<SIMD_T, SIMD_WIDTH>(pDC, pa, workerId, line, recipW, triMask, primID, viewportIdx);
 759
 760         line[0] = tri[1];
 761         line[1] = tri[2];
 762         recipW[0] = vRecipW1;
 763         recipW[1] = vRecipW2;
 764
 765         BinPostSetupLinesImpl<SIMD_T, SIMD_WIDTH>(pDC, pa, workerId, line, recipW, triMask, primID, viewportIdx);
 766
 767         line[0] = tri[2];
 768         line[1] = tri[0];
 769         recipW[0] = vRecipW2;
 770         recipW[1] = vRecipW0;
 771
 772         BinPostSetupLinesImpl<SIMD_T, SIMD_WIDTH>(pDC, pa, workerId, line, recipW, triMask, primID, viewportIdx);
 773
 774         AR_END(FEBinTriangles, 1);
 775         return;
 776     }
 777     else if (rastState.fillMode == SWR_FILLMODE_POINT)
 778     {
 779         // Bin 3 points
 780         BinPostSetupPointsImpl<SIMD_T, SIMD_WIDTH>(pDC, pa, workerId, &tri[0], triMask, primID, viewportIdx);
 781         BinPostSetupPointsImpl<SIMD_T, SIMD_WIDTH>(pDC, pa, workerId, &tri[1], triMask, primID, viewportIdx);
 782         BinPostSetupPointsImpl<SIMD_T, SIMD_WIDTH>(pDC, pa, workerId, &tri[2], triMask, primID, viewportIdx);
 783
 784         AR_END(FEBinTriangles, 1);
 785         return;
 786     }
 787
 788     // Convert triangle bbox to macrotile units.
 789     bbox.xmin = SIMD_T::template srai_epi32<KNOB_MACROTILE_X_DIM_FIXED_SHIFT>(bbox.xmin);
 790     bbox.ymin = SIMD_T::template srai_epi32<KNOB_MACROTILE_Y_DIM_FIXED_SHIFT>(bbox.ymin);
 791     bbox.xmax = SIMD_T::template srai_epi32<KNOB_MACROTILE_X_DIM_FIXED_SHIFT>(bbox.xmax);
 792     bbox.ymax = SIMD_T::template srai_epi32<KNOB_MACROTILE_Y_DIM_FIXED_SHIFT>(bbox.ymax);
 793
 794     OSALIGNSIMD16(uint32_t) aMTLeft[SIMD_WIDTH], aMTRight[SIMD_WIDTH], aMTTop[SIMD_WIDTH], aMTBottom[SIMD_WIDTH];
 795
 796     SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTLeft),   bbox.xmin);
 797     SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTRight),  bbox.xmax);
 798     SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTTop),    bbox.ymin);
 799     SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTBottom), bbox.ymax);
 800
 801     // transpose verts needed for backend
 802     /// @todo modify BE to take non-transformed verts
 803     OSALIGNSIMD16(simd4scalar) vHorizX[SIMD_WIDTH];
 804     OSALIGNSIMD16(simd4scalar) vHorizY[SIMD_WIDTH];
 805     OSALIGNSIMD16(simd4scalar) vHorizZ[SIMD_WIDTH];
 806     OSALIGNSIMD16(simd4scalar) vHorizW[SIMD_WIDTH];
 807
 808     TransposeVertices(vHorizX, tri[0].x, tri[1].x, tri[2].x);
 809     TransposeVertices(vHorizY, tri[0].y, tri[1].y, tri[2].y);
 810     TransposeVertices(vHorizZ, tri[0].z, tri[1].z, tri[2].z);
 811     TransposeVertices(vHorizW, vRecipW0, vRecipW1, vRecipW2);
 812
 813     // store render target array index
 814     OSALIGNSIMD16(uint32_t) aRTAI[SIMD_WIDTH];
 815     if (state.backendState.readRenderTargetArrayIndex)
 816     {
 817         typename SIMD_T::Vec4 vRtai[3];
 818         pa.Assemble(VERTEX_SGV_SLOT, vRtai);
 819         typename SIMD_T::Integer vRtaii;
 820         vRtaii = SIMD_T::castps_si(vRtai[0][VERTEX_SGV_RTAI_COMP]);
 821         SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aRTAI), vRtaii);
 822     }
 823     else
 824     {
 825         SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aRTAI), SIMD_T::setzero_si());
 826     }
 827
 828
 829     // scan remaining valid triangles and bin each separately
 830     while (_BitScanForward(&triIndex, triMask))
 831     {
 832         uint32_t linkageCount = state.backendState.numAttributes;
 833         uint32_t numScalarAttribs = linkageCount * 4;
 834
 835         BE_WORK work;
 836         work.type = DRAW;
 837
 838         bool isDegenerate;
 839         if (CT::IsConservativeT::value)
 840         {
 841             // only rasterize valid edges if we have a degenerate primitive
 842             int32_t triEdgeEnable = (edgeEnable >> (triIndex * 3)) & ALL_EDGES_VALID;
 843             work.pfnWork = GetRasterizerFunc(rastState.sampleCount, rastState.bIsCenterPattern, (rastState.conservativeRast > 0),
 844                 (SWR_INPUT_COVERAGE)pDC->pState->state.psState.inputCoverage, EdgeValToEdgeState(triEdgeEnable), (state.scissorsTileAligned == false));
 845
 846             // Degenerate triangles are required to be constant interpolated
 847             isDegenerate = (triEdgeEnable != ALL_EDGES_VALID) ? true : false;
 848         }
 849         else
 850         {
 851             isDegenerate = false;
 852             work.pfnWork = pfnWork;
 853         }
 854
 855         // Select attribute processor
 856         PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(3,
 857             state.backendState.swizzleEnable, state.backendState.constantInterpolationMask, isDegenerate);
 858
 859         TRIANGLE_WORK_DESC &desc = work.desc.tri;
 860
 861         desc.triFlags.frontFacing = state.forceFront ? 1 : ((frontFaceMask >> triIndex) & 1);
 862         desc.triFlags.renderTargetArrayIndex = aRTAI[triIndex];
 863         desc.triFlags.viewportIndex = pViewportIndex[triIndex];
 864
 865         auto pArena = pDC->pArena;
 866         SWR_ASSERT(pArena != nullptr);
 867
 868         // store active attribs
 869         float *pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
 870         desc.pAttribs = pAttribs;
 871         desc.numAttribs = linkageCount;
 872         pfnProcessAttribs(pDC, pa, triIndex, pPrimID[triIndex], desc.pAttribs);
 873
 874         // store triangle vertex data
 875         desc.pTriBuffer = (float*)pArena->AllocAligned(4 * 4 * sizeof(float), 16);
 876
 877         SIMD128::store_ps(&desc.pTriBuffer[0],  vHorizX[triIndex]);
 878         SIMD128::store_ps(&desc.pTriBuffer[4],  vHorizY[triIndex]);
 879         SIMD128::store_ps(&desc.pTriBuffer[8],  vHorizZ[triIndex]);
 880         SIMD128::store_ps(&desc.pTriBuffer[12], vHorizW[triIndex]);
 881
 882         // store user clip distances
 883         if (state.backendState.clipDistanceMask)
 884         {
 885             uint32_t numClipDist = _mm_popcnt_u32(state.backendState.clipDistanceMask);
 886             desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 3 * sizeof(float));
 887             ProcessUserClipDist<3>(state.backendState, pa, triIndex, &desc.pTriBuffer[12], desc.pUserClipBuffer);
 888         }
 889
 890         for (uint32_t y = aMTTop[triIndex]; y <= aMTBottom[triIndex]; ++y)
 891         {
 892             for (uint32_t x = aMTLeft[triIndex]; x <= aMTRight[triIndex]; ++x)
 893             {
 894 #if KNOB_ENABLE_TOSS_POINTS
 895                 if (!KNOB_TOSS_SETUP_TRIS)
 896 #endif
 897                 {
 898                     pTileMgr->enqueue(x, y, &work);
 899                 }
 900             }
 901         }
 902
 903                      triMask &= ~(1 << triIndex);
 904     }
 905
 906     AR_END(FEBinTriangles, 1);
 907 }
 908
 909 template <typename CT>
 910 void BinTriangles(
 911     DRAW_CONTEXT *pDC,
 912     PA_STATE &pa,
 913     uint32_t workerId,
 914     simdvector tri[3],
 915     uint32_t triMask,
 916     simdscalari const &primID)
 917 {
 918     BinTrianglesImpl<SIMD256, KNOB_SIMD_WIDTH, CT>(pDC, pa, workerId, tri, triMask, primID);
 919 }
 920
 921 #if USE_SIMD16_FRONTEND
 922 template <typename CT>
 923 void SIMDCALL BinTriangles_simd16(
 924     DRAW_CONTEXT *pDC,
 925     PA_STATE &pa,
 926     uint32_t workerId,
 927     simd16vector tri[3],
 928     uint32_t triMask,
 929     simd16scalari const &primID)
 930 {
 931     BinTrianglesImpl<SIMD512, KNOB_SIMD16_WIDTH, CT>(pDC, pa, workerId, tri, triMask, primID);
 932 }
 933
 934 #endif
 935 struct FEBinTrianglesChooser
 936 {
 937     typedef PFN_PROCESS_PRIMS FuncType;
 938
 939     template <typename... ArgsB>
 940     static FuncType GetFunc()
 941     {
 942         return BinTriangles<ConservativeRastFETraits<ArgsB...>>;
 943     }
 944 };
 945
 946 // Selector for correct templated BinTrinagles function
 947 PFN_PROCESS_PRIMS GetBinTrianglesFunc(bool IsConservative)
 948 {
 949     return TemplateArgUnroller<FEBinTrianglesChooser>::GetFunc(IsConservative);
 950 }
 951
 952 #if USE_SIMD16_FRONTEND
 953 struct FEBinTrianglesChooser_simd16
 954 {
 955     typedef PFN_PROCESS_PRIMS_SIMD16 FuncType;
 956
 957     template <typename... ArgsB>
 958     static FuncType GetFunc()
 959     {
 960         return BinTriangles_simd16<ConservativeRastFETraits<ArgsB...>>;
 961     }
 962 };
 963
 964 // Selector for correct templated BinTrinagles function
 965 PFN_PROCESS_PRIMS_SIMD16 GetBinTrianglesFunc_simd16(bool IsConservative)
 966 {
 967     return TemplateArgUnroller<FEBinTrianglesChooser_simd16>::GetFunc(IsConservative);
 968 }
 969
 970 #endif
 971
 972 template <typename SIMD_T, uint32_t SIMD_WIDTH>
 973 void BinPostSetupPointsImpl(
 974     DRAW_CONTEXT *pDC,
 975     PA_STATE &pa,
 976     uint32_t workerId,
 977     typename SIMD_T::Vec4 prim[],
 978     uint32_t primMask,
 979     typename SIMD_T::Integer const &primID,
 980     typename SIMD_T::Integer const &viewportIdx)
 981 {
 982     SWR_CONTEXT *pContext = pDC->pContext;
 983
 984     AR_BEGIN(FEBinPoints, pDC->drawId);
 985
 986     typename SIMD_T::Vec4 &primVerts = prim[0];
 987
 988     const API_STATE& state = GetApiState(pDC);
 989     const SWR_RASTSTATE& rastState = state.rastState;
 990     const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx;
 991
 992     // Select attribute processor
 993     PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(1,
 994         state.backendState.swizzleEnable, state.backendState.constantInterpolationMask);
 995
 996     // convert to fixed point
 997     typename SIMD_T::Integer vXi, vYi;
 998
 999     vXi = fpToFixedPointVertical<SIMD_T>(primVerts.x);
1000     vYi = fpToFixedPointVertical<SIMD_T>(primVerts.y);
1001
1002     if (CanUseSimplePoints(pDC))
1003     {
1004         // adjust for ymin-xmin rule
1005         vXi = SIMD_T::sub_epi32(vXi, SIMD_T::set1_epi32(1));
1006         vYi = SIMD_T::sub_epi32(vYi, SIMD_T::set1_epi32(1));
1007
1008         // cull points off the ymin-xmin edge of the viewport
1009         primMask &= ~SIMD_T::movemask_ps(SIMD_T::castsi_ps(vXi));
1010         primMask &= ~SIMD_T::movemask_ps(SIMD_T::castsi_ps(vYi));
1011
1012         // compute macro tile coordinates
1013         typename SIMD_T::Integer macroX = SIMD_T::template srai_epi32<KNOB_MACROTILE_X_DIM_FIXED_SHIFT>(vXi);
1014         typename SIMD_T::Integer macroY = SIMD_T::template srai_epi32<KNOB_MACROTILE_Y_DIM_FIXED_SHIFT>(vYi);
1015
1016         OSALIGNSIMD16(uint32_t) aMacroX[SIMD_WIDTH], aMacroY[SIMD_WIDTH];
1017
1018         SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMacroX), macroX);
1019         SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMacroY), macroY);
1020
1021         // compute raster tile coordinates
1022         typename SIMD_T::Integer rasterX = SIMD_T::template srai_epi32<KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT>(vXi);
1023         typename SIMD_T::Integer rasterY = SIMD_T::template srai_epi32<KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT>(vYi);
1024
1025         // compute raster tile relative x,y for coverage mask
1026         typename SIMD_T::Integer tileAlignedX = SIMD_T::template slli_epi32<KNOB_TILE_X_DIM_SHIFT>(rasterX);
1027         typename SIMD_T::Integer tileAlignedY = SIMD_T::template slli_epi32<KNOB_TILE_Y_DIM_SHIFT>(rasterY);
1028
1029         typename SIMD_T::Integer tileRelativeX = SIMD_T::sub_epi32(SIMD_T::template srai_epi32<FIXED_POINT_SHIFT>(vXi), tileAlignedX);
1030         typename SIMD_T::Integer tileRelativeY = SIMD_T::sub_epi32(SIMD_T::template srai_epi32<FIXED_POINT_SHIFT>(vYi), tileAlignedY);
1031
1032         OSALIGNSIMD16(uint32_t) aTileRelativeX[SIMD_WIDTH];
1033         OSALIGNSIMD16(uint32_t) aTileRelativeY[SIMD_WIDTH];
1034
1035         SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aTileRelativeX), tileRelativeX);
1036         SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aTileRelativeY), tileRelativeY);
1037
1038         OSALIGNSIMD16(uint32_t) aTileAlignedX[SIMD_WIDTH];
1039         OSALIGNSIMD16(uint32_t) aTileAlignedY[SIMD_WIDTH];
1040
1041         SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aTileAlignedX), tileAlignedX);
1042         SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aTileAlignedY), tileAlignedY);
1043
1044         OSALIGNSIMD16(float) aZ[SIMD_WIDTH];
1045         SIMD_T::store_ps(reinterpret_cast<float *>(aZ), primVerts.z);
1046
1047         // store render target array index
1048         OSALIGNSIMD16(uint32_t) aRTAI[SIMD_WIDTH];
1049         if (state.backendState.readRenderTargetArrayIndex)
1050         {
1051             typename SIMD_T::Vec4 vRtai;
1052             pa.Assemble(VERTEX_SGV_SLOT, &vRtai);
1053             typename SIMD_T::Integer vRtaii = SIMD_T::castps_si(vRtai[VERTEX_SGV_RTAI_COMP]);
1054             SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aRTAI), vRtaii);
1055         }
1056         else
1057         {
1058             SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aRTAI), SIMD_T::setzero_si());
1059         }
1060
1061         uint32_t *pPrimID = (uint32_t *)&primID;
1062         DWORD primIndex = 0;
1063
1064         const SWR_BACKEND_STATE& backendState = pDC->pState->state.backendState;
1065
1066         // scan remaining valid triangles and bin each separately
1067         while (_BitScanForward(&primIndex, primMask))
1068         {
1069             uint32_t linkageCount = backendState.numAttributes;
1070             uint32_t numScalarAttribs = linkageCount * 4;
1071
1072             BE_WORK work;
1073             work.type = DRAW;
1074
1075             TRIANGLE_WORK_DESC &desc = work.desc.tri;
1076
1077             // points are always front facing
1078             desc.triFlags.frontFacing = 1;
1079             desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];
1080             desc.triFlags.viewportIndex = pViewportIndex[primIndex];
1081
1082             work.pfnWork = RasterizeSimplePoint;
1083
1084             auto pArena = pDC->pArena;
1085             SWR_ASSERT(pArena != nullptr);
1086
1087             // store attributes
1088             float *pAttribs = (float*)pArena->AllocAligned(3 * numScalarAttribs * sizeof(float), 16);
1089             desc.pAttribs = pAttribs;
1090             desc.numAttribs = linkageCount;
1091
1092             pfnProcessAttribs(pDC, pa, primIndex, pPrimID[primIndex], pAttribs);
1093
1094             // store raster tile aligned x, y, perspective correct z
1095             float *pTriBuffer = (float*)pArena->AllocAligned(4 * sizeof(float), 16);
1096             desc.pTriBuffer = pTriBuffer;
1097             *(uint32_t*)pTriBuffer++ = aTileAlignedX[primIndex];
1098             *(uint32_t*)pTriBuffer++ = aTileAlignedY[primIndex];
1099             *pTriBuffer = aZ[primIndex];
1100
1101             uint32_t tX = aTileRelativeX[primIndex];
1102             uint32_t tY = aTileRelativeY[primIndex];
1103
1104             // pack the relative x,y into the coverageMask, the rasterizer will
1105             // generate the true coverage mask from it
1106             work.desc.tri.triFlags.coverageMask = tX | (tY << 4);
1107
1108             // bin it
1109             MacroTileMgr *pTileMgr = pDC->pTileMgr;
1110 #if KNOB_ENABLE_TOSS_POINTS
1111             if (!KNOB_TOSS_SETUP_TRIS)
1112 #endif
1113             {
1114                 pTileMgr->enqueue(aMacroX[primIndex], aMacroY[primIndex], &work);
1115             }
1116
1117             primMask &= ~(1 << primIndex);
1118         }
1119     }
1120     else
1121     {
1122         // non simple points need to be potentially binned to multiple macro tiles
1123         typename SIMD_T::Float vPointSize;
1124
1125         if (rastState.pointParam)
1126         {
1127             typename SIMD_T::Vec4 size[3];
1128             pa.Assemble(VERTEX_SGV_SLOT, size);
1129             vPointSize = size[0][VERTEX_SGV_POINT_SIZE_COMP];
1130         }
1131         else
1132         {
1133             vPointSize = SIMD_T::set1_ps(rastState.pointSize);
1134         }
1135
1136         // bloat point to bbox
1137         SIMDBBOX_T<SIMD_T> bbox;
1138
1139         bbox.xmin = bbox.xmax = vXi;
1140         bbox.ymin = bbox.ymax = vYi;
1141
1142         typename SIMD_T::Float vHalfWidth = SIMD_T::mul_ps(vPointSize, SIMD_T::set1_ps(0.5f));
1143         typename SIMD_T::Integer vHalfWidthi = fpToFixedPointVertical<SIMD_T>(vHalfWidth);
1144
1145         bbox.xmin = SIMD_T::sub_epi32(bbox.xmin, vHalfWidthi);
1146         bbox.xmax = SIMD_T::add_epi32(bbox.xmax, vHalfWidthi);
1147         bbox.ymin = SIMD_T::sub_epi32(bbox.ymin, vHalfWidthi);
1148         bbox.ymax = SIMD_T::add_epi32(bbox.ymax, vHalfWidthi);
1149
1150         // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
1151         // Gather the AOS effective scissor rects based on the per-prim VP index.
1152         /// @todo:  Look at speeding this up -- weigh against corresponding costs in rasterizer.
1153         {
1154             typename SIMD_T::Integer scisXmin, scisYmin, scisXmax, scisYmax;
1155
1156             if (state.backendState.readViewportArrayIndex)
1157             {
1158                 GatherScissors(&state.scissorsInFixedPoint[0], pViewportIndex, scisXmin, scisYmin, scisXmax, scisYmax);
1159             }
1160             else // broadcast fast path for non-VPAI case.
1161             {
1162                 scisXmin = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].xmin);
1163                 scisYmin = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].ymin);
1164                 scisXmax = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].xmax);
1165                 scisYmax = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].ymax);
1166             }
1167
1168             bbox.xmin = SIMD_T::max_epi32(bbox.xmin, scisXmin);
1169             bbox.ymin = SIMD_T::max_epi32(bbox.ymin, scisYmin);
1170             bbox.xmax = SIMD_T::min_epi32(SIMD_T::sub_epi32(bbox.xmax, SIMD_T::set1_epi32(1)), scisXmax);
1171             bbox.ymax = SIMD_T::min_epi32(SIMD_T::sub_epi32(bbox.ymax, SIMD_T::set1_epi32(1)), scisYmax);
1172         }
1173
1174         // Cull bloated points completely outside scissor
1175         typename SIMD_T::Integer maskOutsideScissorX = SIMD_T::cmpgt_epi32(bbox.xmin, bbox.xmax);
1176         typename SIMD_T::Integer maskOutsideScissorY = SIMD_T::cmpgt_epi32(bbox.ymin, bbox.ymax);
1177         typename SIMD_T::Integer maskOutsideScissorXY = SIMD_T::or_si(maskOutsideScissorX, maskOutsideScissorY);
1178         uint32_t maskOutsideScissor = SIMD_T::movemask_ps(SIMD_T::castsi_ps(maskOutsideScissorXY));
1179         primMask = primMask & ~maskOutsideScissor;
1180
1181         // Convert bbox to macrotile units.
1182         bbox.xmin = SIMD_T::template srai_epi32<KNOB_MACROTILE_X_DIM_FIXED_SHIFT>(bbox.xmin);
1183         bbox.ymin = SIMD_T::template srai_epi32<KNOB_MACROTILE_Y_DIM_FIXED_SHIFT>(bbox.ymin);
1184         bbox.xmax = SIMD_T::template srai_epi32<KNOB_MACROTILE_X_DIM_FIXED_SHIFT>(bbox.xmax);
1185         bbox.ymax = SIMD_T::template srai_epi32<KNOB_MACROTILE_Y_DIM_FIXED_SHIFT>(bbox.ymax);
1186
1187         OSALIGNSIMD16(uint32_t) aMTLeft[SIMD_WIDTH], aMTRight[SIMD_WIDTH], aMTTop[SIMD_WIDTH], aMTBottom[SIMD_WIDTH];
1188
1189         SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTLeft),   bbox.xmin);
1190         SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTRight),  bbox.xmax);
1191         SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTTop),    bbox.ymin);
1192         SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTBottom), bbox.ymax);
1193
1194         // store render target array index
1195         OSALIGNSIMD16(uint32_t) aRTAI[SIMD_WIDTH];
1196         if (state.backendState.readRenderTargetArrayIndex)
1197         {
1198             typename SIMD_T::Vec4 vRtai[2];
1199             pa.Assemble(VERTEX_SGV_SLOT, vRtai);
1200             typename SIMD_T::Integer vRtaii = SIMD_T::castps_si(vRtai[0][VERTEX_SGV_RTAI_COMP]);
1201             SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aRTAI), vRtaii);
1202         }
1203         else
1204         {
1205             SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aRTAI), SIMD_T::setzero_si());
1206         }
1207
1208         OSALIGNSIMD16(float) aPointSize[SIMD_WIDTH];
1209         SIMD_T::store_ps(reinterpret_cast<float *>(aPointSize), vPointSize);
1210
1211         uint32_t *pPrimID = (uint32_t *)&primID;
1212
1213         OSALIGNSIMD16(float) aPrimVertsX[SIMD_WIDTH];
1214         OSALIGNSIMD16(float) aPrimVertsY[SIMD_WIDTH];
1215         OSALIGNSIMD16(float) aPrimVertsZ[SIMD_WIDTH];
1216
1217         SIMD_T::store_ps(reinterpret_cast<float *>(aPrimVertsX), primVerts.x);
1218         SIMD_T::store_ps(reinterpret_cast<float *>(aPrimVertsY), primVerts.y);
1219         SIMD_T::store_ps(reinterpret_cast<float *>(aPrimVertsZ), primVerts.z);
1220
1221         // scan remaining valid prims and bin each separately
1222         const SWR_BACKEND_STATE& backendState = state.backendState;
1223         DWORD primIndex;
1224         while (_BitScanForward(&primIndex, primMask))
1225         {
1226             uint32_t linkageCount = backendState.numAttributes;
1227             uint32_t numScalarAttribs = linkageCount * 4;
1228
1229             BE_WORK work;
1230             work.type = DRAW;
1231
1232             TRIANGLE_WORK_DESC &desc = work.desc.tri;
1233
1234             desc.triFlags.frontFacing = 1;
1235             desc.triFlags.pointSize = aPointSize[primIndex];
1236             desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];
1237             desc.triFlags.viewportIndex = pViewportIndex[primIndex];
1238
1239             work.pfnWork = RasterizeTriPoint;
1240
1241             auto pArena = pDC->pArena;
1242             SWR_ASSERT(pArena != nullptr);
1243
1244             // store active attribs
1245             desc.pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
1246             desc.numAttribs = linkageCount;
1247             pfnProcessAttribs(pDC, pa, primIndex, pPrimID[primIndex], desc.pAttribs);
1248
1249             // store point vertex data
1250             float *pTriBuffer = (float*)pArena->AllocAligned(4 * sizeof(float), 16);
1251             desc.pTriBuffer = pTriBuffer;
1252             *pTriBuffer++ = aPrimVertsX[primIndex];
1253             *pTriBuffer++ = aPrimVertsY[primIndex];
1254             *pTriBuffer = aPrimVertsZ[primIndex];
1255
1256             // store user clip distances
1257             if (backendState.clipDistanceMask)
1258             {
1259                 uint32_t numClipDist = _mm_popcnt_u32(backendState.clipDistanceMask);
1260                 desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 3 * sizeof(float));
1261                 float dists[8];
1262                 float one = 1.0f;
1263                 ProcessUserClipDist<1>(backendState, pa, primIndex, &one, dists);
1264                 for (uint32_t i = 0; i < numClipDist; i++) {
1265                     desc.pUserClipBuffer[3 * i + 0] = 0.0f;
1266                     desc.pUserClipBuffer[3 * i + 1] = 0.0f;
1267                     desc.pUserClipBuffer[3 * i + 2] = dists[i];
1268                 }
1269             }
1270
1271             MacroTileMgr *pTileMgr = pDC->pTileMgr;
1272             for (uint32_t y = aMTTop[primIndex]; y <= aMTBottom[primIndex]; ++y)
1273             {
1274                 for (uint32_t x = aMTLeft[primIndex]; x <= aMTRight[primIndex]; ++x)
1275                 {
1276 #if KNOB_ENABLE_TOSS_POINTS
1277                     if (!KNOB_TOSS_SETUP_TRIS)
1278 #endif
1279                     {
1280                         pTileMgr->enqueue(x, y, &work);
1281                     }
1282                 }
1283             }
1284
1285             primMask &= ~(1 << primIndex);
1286         }
1287     }
1288
1289     AR_END(FEBinPoints, 1);
1290 }
1291
1292 //////////////////////////////////////////////////////////////////////////
1293 /// @brief Bin SIMD points to the backend.  Only supports point size of 1
1294 /// @param pDC - pointer to draw context.
1295 /// @param pa - The primitive assembly object.
1296 /// @param workerId - thread's worker id. Even thread has a unique id.
1297 /// @param tri - Contains point position data for SIMDs worth of points.
1298 /// @param primID - Primitive ID for each point.
1299 template <typename SIMD_T, uint32_t SIMD_WIDTH>
1300 void BinPointsImpl(
1301     DRAW_CONTEXT *pDC,
1302     PA_STATE &pa,
1303     uint32_t workerId,
1304     typename SIMD_T::Vec4 prim[3],
1305     uint32_t primMask,
1306     typename SIMD_T::Integer const &primID)
1307 {
1308     const API_STATE& state = GetApiState(pDC);
1309     const SWR_FRONTEND_STATE& feState = state.frontendState;
1310     const SWR_RASTSTATE& rastState = state.rastState;
1311
1312     // Read back viewport index if required
1313     typename SIMD_T::Integer viewportIdx = SIMD_T::setzero_si();
1314     typename SIMD_T::Vec4 vpiAttrib[1];
1315     typename SIMD_T::Integer vpai = SIMD_T::setzero_si();
1316
1317     if (state.backendState.readViewportArrayIndex)
1318     {
1319         pa.Assemble(VERTEX_SGV_SLOT, vpiAttrib);
1320
1321         vpai = SIMD_T::castps_si(vpiAttrib[0][VERTEX_SGV_VAI_COMP]);
1322     }
1323
1324
1325     if (state.backendState.readViewportArrayIndex) // VPAIOffsets are guaranteed 0-15 -- no OOB issues if they are offsets from 0
1326     {
1327         // OOB indices => forced to zero.
1328         vpai = SIMD_T::max_epi32(vpai, SIMD_T::setzero_si());
1329         typename SIMD_T::Integer vNumViewports = SIMD_T::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
1330         typename SIMD_T::Integer vClearMask = SIMD_T::cmplt_epi32(vpai, vNumViewports);
1331         viewportIdx = SIMD_T::and_si(vClearMask, vpai);
1332     }
1333     else
1334     {
1335         viewportIdx = vpai;
1336     }
1337
1338     if (!feState.vpTransformDisable)
1339     {
1340         // perspective divide
1341         typename SIMD_T::Float vRecipW0 = SIMD_T::div_ps(SIMD_T::set1_ps(1.0f), prim[0].w);
1342
1343         prim[0].x = SIMD_T::mul_ps(prim[0].x, vRecipW0);
1344         prim[0].y = SIMD_T::mul_ps(prim[0].y, vRecipW0);
1345         prim[0].z = SIMD_T::mul_ps(prim[0].z, vRecipW0);
1346
1347         // viewport transform to screen coords
1348         if (state.backendState.readViewportArrayIndex)
1349         {
1350             viewportTransform<1>(prim, state.vpMatrices, viewportIdx);
1351         }
1352         else
1353         {
1354             viewportTransform<1>(prim, state.vpMatrices);
1355         }
1356     }
1357
1358     typename SIMD_T::Float offset = SwrPixelOffsets<SIMD_T>::GetOffset(rastState.pixelLocation);
1359
1360     prim[0].x = SIMD_T::add_ps(prim[0].x, offset);
1361     prim[0].y = SIMD_T::add_ps(prim[0].y, offset);
1362
1363     BinPostSetupPointsImpl<SIMD_T, SIMD_WIDTH>(
1364         pDC,
1365         pa,
1366         workerId,
1367         prim,
1368         primMask,
1369         primID,
1370         viewportIdx);
1371 }
1372
1373 void BinPoints(
1374     DRAW_CONTEXT *pDC,
1375     PA_STATE &pa,
1376     uint32_t workerId,
1377     simdvector prim[3],
1378     uint32_t primMask,
1379     simdscalari const &primID)
1380 {
1381     BinPointsImpl<SIMD256, KNOB_SIMD_WIDTH>(
1382         pDC,
1383         pa,
1384         workerId,
1385         prim,
1386         primMask,
1387         primID);
1388 }
1389
1390 #if USE_SIMD16_FRONTEND
1391 void SIMDCALL BinPoints_simd16(
1392     DRAW_CONTEXT *pDC,
1393     PA_STATE &pa,
1394     uint32_t workerId,
1395     simd16vector prim[3],
1396     uint32_t primMask,
1397     simd16scalari const &primID)
1398 {
1399     BinPointsImpl<SIMD512, KNOB_SIMD16_WIDTH>(
1400         pDC,
1401         pa,
1402         workerId,
1403         prim,
1404         primMask,
1405         primID);
1406 }
1407
1408 #endif
//////////////////////////////////////////////////////////////////////////
/// @brief Bin SIMD lines to the backend.
/// @param pDC - pointer to draw context.
/// @param pa - The primitive assembly object.
/// @param workerId - thread's worker id. Each thread has a unique id.
/// @param prim - Contains line position data for a SIMD's worth of lines
///        (two vertices per line).
/// @param recipW - Per-vertex values for the two line vertices
///        (presumably 1/w, going by the name).
/// @param primMask - Bit mask of valid lines.
/// @param primID - Primitive ID for each line.
/// @param viewportIdx - Viewport Array Index for each line.
1417 template <typename SIMD_T, uint32_t SIMD_WIDTH>
1418 void BinPostSetupLinesImpl(
1419     DRAW_CONTEXT *pDC,
1420     PA_STATE &pa,
1421     uint32_t workerId,
1422     typename SIMD_T::Vec4 prim[],
1423     typename SIMD_T::Float recipW[],
1424     uint32_t primMask,
1425     typename SIMD_T::Integer const &primID,
1426     typename SIMD_T::Integer const &viewportIdx)
1427 {
1428     SWR_CONTEXT *pContext = pDC->pContext;
1429
1430     AR_BEGIN(FEBinLines, pDC->drawId);
1431
1432     const API_STATE &state = GetApiState(pDC);
1433     const SWR_RASTSTATE &rastState = state.rastState;
1434
1435     // Select attribute processor
1436     PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(2,
1437         state.backendState.swizzleEnable, state.backendState.constantInterpolationMask);
1438
1439     typename SIMD_T::Float &vRecipW0 = recipW[0];
1440     typename SIMD_T::Float &vRecipW1 = recipW[1];
1441
1442     // convert to fixed point
1443     typename SIMD_T::Integer vXi[2], vYi[2];
1444
1445     vXi[0] = fpToFixedPointVertical<SIMD_T>(prim[0].x);
1446     vYi[0] = fpToFixedPointVertical<SIMD_T>(prim[0].y);
1447     vXi[1] = fpToFixedPointVertical<SIMD_T>(prim[1].x);
1448     vYi[1] = fpToFixedPointVertical<SIMD_T>(prim[1].y);
1449
1450     // compute x-major vs y-major mask
1451     typename SIMD_T::Integer xLength = SIMD_T::abs_epi32(SIMD_T::sub_epi32(vXi[0], vXi[1]));
1452     typename SIMD_T::Integer yLength = SIMD_T::abs_epi32(SIMD_T::sub_epi32(vYi[0], vYi[1]));
1453     typename SIMD_T::Float vYmajorMask = SIMD_T::castsi_ps(SIMD_T::cmpgt_epi32(yLength, xLength));
1454     uint32_t yMajorMask = SIMD_T::movemask_ps(vYmajorMask);
1455
1456     // cull zero-length lines
1457     typename SIMD_T::Integer vZeroLengthMask = SIMD_T::cmpeq_epi32(xLength, SIMD_T::setzero_si());
1458     vZeroLengthMask = SIMD_T::and_si(vZeroLengthMask, SIMD_T::cmpeq_epi32(yLength, SIMD_T::setzero_si()));
1459
1460     primMask &= ~SIMD_T::movemask_ps(SIMD_T::castsi_ps(vZeroLengthMask));
1461
1462     uint32_t *pPrimID = (uint32_t *)&primID;
1463     const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx;
1464
1465     // Calc bounding box of lines
1466     SIMDBBOX_T<SIMD_T> bbox;
1467     bbox.xmin = SIMD_T::min_epi32(vXi[0], vXi[1]);
1468     bbox.xmax = SIMD_T::max_epi32(vXi[0], vXi[1]);
1469     bbox.ymin = SIMD_T::min_epi32(vYi[0], vYi[1]);
1470     bbox.ymax = SIMD_T::max_epi32(vYi[0], vYi[1]);
1471
1472     // bloat bbox by line width along minor axis
1473     typename SIMD_T::Float vHalfWidth = SIMD_T::set1_ps(rastState.lineWidth / 2.0f);
1474     typename SIMD_T::Integer vHalfWidthi = fpToFixedPointVertical<SIMD_T>(vHalfWidth);
1475
1476     SIMDBBOX_T<SIMD_T> bloatBox;
1477
1478     bloatBox.xmin = SIMD_T::sub_epi32(bbox.xmin, vHalfWidthi);
1479     bloatBox.xmax = SIMD_T::add_epi32(bbox.xmax, vHalfWidthi);
1480     bloatBox.ymin = SIMD_T::sub_epi32(bbox.ymin, vHalfWidthi);
1481     bloatBox.ymax = SIMD_T::add_epi32(bbox.ymax, vHalfWidthi);
1482
1483     bbox.xmin = SIMD_T::blendv_epi32(bbox.xmin, bloatBox.xmin, vYmajorMask);
1484     bbox.xmax = SIMD_T::blendv_epi32(bbox.xmax, bloatBox.xmax, vYmajorMask);
1485     bbox.ymin = SIMD_T::blendv_epi32(bloatBox.ymin, bbox.ymin, vYmajorMask);
1486     bbox.ymax = SIMD_T::blendv_epi32(bloatBox.ymax, bbox.ymax, vYmajorMask);
1487
1488     // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
1489     {
1490         typename SIMD_T::Integer scisXmin, scisYmin, scisXmax, scisYmax;
1491
1492         if (state.backendState.readViewportArrayIndex)
1493         {
1494             GatherScissors(&state.scissorsInFixedPoint[0], pViewportIndex, scisXmin, scisYmin, scisXmax, scisYmax);
1495         }
1496         else // broadcast fast path for non-VPAI case.
1497         {
1498             scisXmin = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].xmin);
1499             scisYmin = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].ymin);
1500             scisXmax = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].xmax);
1501             scisYmax = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].ymax);
1502         }
1503
1504         bbox.xmin = SIMD_T::max_epi32(bbox.xmin, scisXmin);
1505         bbox.ymin = SIMD_T::max_epi32(bbox.ymin, scisYmin);
1506         bbox.xmax = SIMD_T::min_epi32(SIMD_T::sub_epi32(bbox.xmax, SIMD_T::set1_epi32(1)), scisXmax);
1507         bbox.ymax = SIMD_T::min_epi32(SIMD_T::sub_epi32(bbox.ymax, SIMD_T::set1_epi32(1)), scisYmax);
1508     }
1509
1510     // Cull prims completely outside scissor
1511     {
1512         typename SIMD_T::Integer maskOutsideScissorX = SIMD_T::cmpgt_epi32(bbox.xmin, bbox.xmax);
1513         typename SIMD_T::Integer maskOutsideScissorY = SIMD_T::cmpgt_epi32(bbox.ymin, bbox.ymax);
1514         typename SIMD_T::Integer maskOutsideScissorXY = SIMD_T::or_si(maskOutsideScissorX, maskOutsideScissorY);
1515         uint32_t maskOutsideScissor = SIMD_T::movemask_ps(SIMD_T::castsi_ps(maskOutsideScissorXY));
1516         primMask = primMask & ~maskOutsideScissor;
1517     }
1518
1519     // transpose verts needed for backend
1520     /// @todo modify BE to take non-transformed verts
1521     OSALIGNSIMD16(simd4scalar) vHorizX[SIMD_WIDTH];
1522     OSALIGNSIMD16(simd4scalar) vHorizY[SIMD_WIDTH];
1523     OSALIGNSIMD16(simd4scalar) vHorizZ[SIMD_WIDTH];
1524     OSALIGNSIMD16(simd4scalar) vHorizW[SIMD_WIDTH];
1525
1526     if (!primMask)
1527     {
1528         goto endBinLines;
1529     }
1530
1531     // Convert triangle bbox to macrotile units.
1532     bbox.xmin = SIMD_T::template srai_epi32<KNOB_MACROTILE_X_DIM_FIXED_SHIFT>(bbox.xmin);
1533     bbox.ymin = SIMD_T::template srai_epi32<KNOB_MACROTILE_Y_DIM_FIXED_SHIFT>(bbox.ymin);
1534     bbox.xmax = SIMD_T::template srai_epi32<KNOB_MACROTILE_X_DIM_FIXED_SHIFT>(bbox.xmax);
1535     bbox.ymax = SIMD_T::template srai_epi32<KNOB_MACROTILE_Y_DIM_FIXED_SHIFT>(bbox.ymax);
1536
1537     OSALIGNSIMD16(uint32_t) aMTLeft[SIMD_WIDTH], aMTRight[SIMD_WIDTH], aMTTop[SIMD_WIDTH], aMTBottom[SIMD_WIDTH];
1538
1539     SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTLeft),   bbox.xmin);
1540     SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTRight),  bbox.xmax);
1541     SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTTop),    bbox.ymin);
1542     SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTBottom), bbox.ymax);
1543
1544     TransposeVertices(vHorizX, prim[0].x, prim[1].x, SIMD_T::setzero_ps());
1545     TransposeVertices(vHorizY, prim[0].y, prim[1].y, SIMD_T::setzero_ps());
1546     TransposeVertices(vHorizZ, prim[0].z, prim[1].z, SIMD_T::setzero_ps());
1547     TransposeVertices(vHorizW, vRecipW0,  vRecipW1,  SIMD_T::setzero_ps());
1548
1549     // store render target array index
1550     OSALIGNSIMD16(uint32_t) aRTAI[SIMD_WIDTH];
1551     if (state.backendState.readRenderTargetArrayIndex)
1552     {
1553         typename SIMD_T::Vec4 vRtai[2];
1554         pa.Assemble(VERTEX_SGV_SLOT, vRtai);
1555         typename SIMD_T::Integer vRtaii = SIMD_T::castps_si(vRtai[0][VERTEX_SGV_RTAI_COMP]);
1556         SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aRTAI), vRtaii);
1557     }
1558     else
1559     {
1560         SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aRTAI), SIMD_T::setzero_si());
1561     }
1562
1563     // scan remaining valid prims and bin each separately
1564     DWORD primIndex;
1565     while (_BitScanForward(&primIndex, primMask))
1566     {
1567         uint32_t linkageCount = state.backendState.numAttributes;
1568         uint32_t numScalarAttribs = linkageCount * 4;
1569
1570         BE_WORK work;
1571         work.type = DRAW;
1572
1573         TRIANGLE_WORK_DESC &desc = work.desc.tri;
1574
1575         desc.triFlags.frontFacing = 1;
1576         desc.triFlags.yMajor = (yMajorMask >> primIndex) & 1;
1577         desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];
1578         desc.triFlags.viewportIndex = pViewportIndex[primIndex];
1579
1580         work.pfnWork = RasterizeLine;
1581
1582         auto pArena = pDC->pArena;
1583         SWR_ASSERT(pArena != nullptr);
1584
1585         // store active attribs
1586         desc.pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
1587         desc.numAttribs = linkageCount;
1588         pfnProcessAttribs(pDC, pa, primIndex, pPrimID[primIndex], desc.pAttribs);
1589
1590         // store line vertex data
1591         desc.pTriBuffer = (float*)pArena->AllocAligned(4 * 4 * sizeof(float), 16);
1592
1593         _mm_store_ps(&desc.pTriBuffer[0],  vHorizX[primIndex]);
1594         _mm_store_ps(&desc.pTriBuffer[4],  vHorizY[primIndex]);
1595         _mm_store_ps(&desc.pTriBuffer[8],  vHorizZ[primIndex]);
1596         _mm_store_ps(&desc.pTriBuffer[12], vHorizW[primIndex]);
1597
1598         // store user clip distances
1599         if (state.backendState.clipDistanceMask)
1600         {
1601             uint32_t numClipDist = _mm_popcnt_u32(state.backendState.clipDistanceMask);
1602             desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 2 * sizeof(float));
1603             ProcessUserClipDist<2>(state.backendState, pa, primIndex, &desc.pTriBuffer[12], desc.pUserClipBuffer);
1604         }
1605
1606         MacroTileMgr *pTileMgr = pDC->pTileMgr;
1607         for (uint32_t y = aMTTop[primIndex]; y <= aMTBottom[primIndex]; ++y)
1608         {
1609             for (uint32_t x = aMTLeft[primIndex]; x <= aMTRight[primIndex]; ++x)
1610             {
1611 #if KNOB_ENABLE_TOSS_POINTS
1612                 if (!KNOB_TOSS_SETUP_TRIS)
1613 #endif
1614                 {
1615                     pTileMgr->enqueue(x, y, &work);
1616                 }
1617             }
1618         }
1619
1620         primMask &= ~(1 << primIndex);
1621     }
1622
1623 endBinLines:
1624
1625     AR_END(FEBinLines, 1);
1626 }
1627
1628 //////////////////////////////////////////////////////////////////////////
1629 /// @brief Bin SIMD lines to the backend.
1630 /// @param pDC - pointer to draw context.
1631 /// @param pa - The primitive assembly object.
1632 /// @param workerId - thread's worker id. Even thread has a unique id.
1633 /// @param tri - Contains line position data for SIMDs worth of points.
1634 /// @param primID - Primitive ID for each line.
1635 /// @param viewportIdx - Viewport Array Index for each line.
1636 template <typename SIMD_T, uint32_t SIMD_WIDTH>
1637 void SIMDCALL BinLinesImpl(
1638     DRAW_CONTEXT *pDC,
1639     PA_STATE &pa,
1640     uint32_t workerId,
1641     typename SIMD_T::Vec4 prim[3],
1642     uint32_t primMask,
1643     typename SIMD_T::Integer const &primID)
1644 {
1645     const API_STATE& state = GetApiState(pDC);
1646     const SWR_RASTSTATE& rastState = state.rastState;
1647     const SWR_FRONTEND_STATE& feState = state.frontendState;
1648
1649     typename SIMD_T::Float vRecipW[2] = { SIMD_T::set1_ps(1.0f), SIMD_T::set1_ps(1.0f) };
1650
1651     typename SIMD_T::Integer viewportIdx = SIMD_T::setzero_si();
1652     typename SIMD_T::Vec4 vpiAttrib[2];
1653     typename SIMD_T::Integer vpai = SIMD_T::setzero_si();
1654
1655     if (state.backendState.readViewportArrayIndex)
1656     {
1657         pa.Assemble(VERTEX_SGV_SLOT, vpiAttrib);
1658         vpai = SIMD_T::castps_si(vpiAttrib[0][VERTEX_SGV_VAI_COMP]);
1659     }
1660
1661
1662     if (state.backendState.readViewportArrayIndex) // VPAIOffsets are guaranteed 0-15 -- no OOB issues if they are offsets from 0
1663     {
1664         // OOB indices => forced to zero.
1665         vpai = SIMD_T::max_epi32(vpai, SIMD_T::setzero_si());
1666         typename SIMD_T::Integer vNumViewports = SIMD_T::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
1667         typename SIMD_T::Integer vClearMask = SIMD_T::cmplt_epi32(vpai, vNumViewports);
1668         viewportIdx = SIMD_T::and_si(vClearMask, vpai);
1669     }
1670
1671     if (!feState.vpTransformDisable)
1672     {
1673         // perspective divide
1674         vRecipW[0] = SIMD_T::div_ps(SIMD_T::set1_ps(1.0f), prim[0].w);
1675         vRecipW[1] = SIMD_T::div_ps(SIMD_T::set1_ps(1.0f), prim[1].w);
1676
1677         prim[0].v[0] = SIMD_T::mul_ps(prim[0].v[0], vRecipW[0]);
1678         prim[1].v[0] = SIMD_T::mul_ps(prim[1].v[0], vRecipW[1]);
1679
1680         prim[0].v[1] = SIMD_T::mul_ps(prim[0].v[1], vRecipW[0]);
1681         prim[1].v[1] = SIMD_T::mul_ps(prim[1].v[1], vRecipW[1]);
1682
1683         prim[0].v[2] = SIMD_T::mul_ps(prim[0].v[2], vRecipW[0]);
1684         prim[1].v[2] = SIMD_T::mul_ps(prim[1].v[2], vRecipW[1]);
1685
1686         // viewport transform to screen coords
1687         if (state.backendState.readViewportArrayIndex)
1688         {
1689             viewportTransform<2>(prim, state.vpMatrices, viewportIdx);
1690         }
1691         else
1692         {
1693             viewportTransform<2>(prim, state.vpMatrices);
1694         }
1695     }
1696
1697     // adjust for pixel center location
1698     typename SIMD_T::Float offset = SwrPixelOffsets<SIMD_T>::GetOffset(rastState.pixelLocation);
1699
1700     prim[0].x = SIMD_T::add_ps(prim[0].x, offset);
1701     prim[0].y = SIMD_T::add_ps(prim[0].y, offset);
1702
1703     prim[1].x = SIMD_T::add_ps(prim[1].x, offset);
1704     prim[1].y = SIMD_T::add_ps(prim[1].y, offset);
1705
1706     BinPostSetupLinesImpl<SIMD_T, SIMD_WIDTH>(
1707         pDC,
1708         pa,
1709         workerId,
1710         prim,
1711         vRecipW,
1712         primMask,
1713         primID,
1714         viewportIdx);
1715 }
1716
1717 void BinLines(
1718     DRAW_CONTEXT *pDC,
1719     PA_STATE &pa,
1720     uint32_t workerId,
1721     simdvector prim[],
1722     uint32_t primMask,
1723     simdscalari const &primID)
1724 {
1725     BinLinesImpl<SIMD256, KNOB_SIMD_WIDTH>(pDC, pa, workerId, prim, primMask, primID);
1726 }
1727
1728 #if USE_SIMD16_FRONTEND
1729 void SIMDCALL BinLines_simd16(
1730     DRAW_CONTEXT *pDC,
1731     PA_STATE &pa,
1732     uint32_t workerId,
1733     simd16vector prim[3],
1734     uint32_t primMask,
1735     simd16scalari const &primID)
1736 {
1737     BinLinesImpl<SIMD512, KNOB_SIMD16_WIDTH>(pDC, pa, workerId, prim, primMask, primID);
1738 }
1739
1740 #endif