src/gallium/drivers/swr/rasterizer/core/binner.cpp

   1 /****************************************************************************
   2 * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
   3 *
   4 * Permission is hereby granted, free of charge, to any person obtaining a
   5 * copy of this software and associated documentation files (the "Software"),
   6 * to deal in the Software without restriction, including without limitation
   7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8 * and/or sell copies of the Software, and to permit persons to whom the
   9 * Software is furnished to do so, subject to the following conditions:
  10 *
  11 * The above copyright notice and this permission notice (including the next
  12 * paragraph) shall be included in all copies or substantial portions of the
  13 * Software.
  14 *
  15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21 * IN THE SOFTWARE.
  22 *
  23 * @file binner.cpp
  24 *
  25 * @brief Implementation for the macrotile binner
  26 *
  27 ******************************************************************************/
  28
  29 #include "binner.h"
  30 #include "context.h"
  31 #include "frontend.h"
  32 #include "conservativeRast.h"
  33 #include "pa.h"
  34 #include "rasterizer.h"
  35 #include "rdtsc_core.h"
  36 #include "tilemgr.h"
  37
  38 // Function Prototype
  39 template <typename SIMD_T, uint32_t SIMD_WIDTH>
  40 void BinPostSetupLinesImpl(
  41     DRAW_CONTEXT *pDC,
  42     PA_STATE &pa,
  43     uint32_t workerId,
  44     typename SIMD_T::Vec4 prim[],
  45     typename SIMD_T::Float recipW[],
  46     uint32_t primMask,
  47     typename SIMD_T::Integer const &primID,
  48     typename SIMD_T::Integer const &viewportIdx,
  49     typename SIMD_T::Integer const &rtIdx);
  50
  51 template <typename SIMD_T, uint32_t SIMD_WIDTH>
  52 void BinPostSetupPointsImpl(
  53     DRAW_CONTEXT *pDC,
  54     PA_STATE &pa,
  55     uint32_t workerId,
  56     typename SIMD_T::Vec4 prim[],
  57     uint32_t primMask,
  58     typename SIMD_T::Integer const &primID,
  59     typename SIMD_T::Integer const &viewportIdx,
  60     typename SIMD_T::Integer const &rtIdx);
  61
  62 //////////////////////////////////////////////////////////////////////////
  63 /// @brief Processes attributes for the backend based on linkage mask and
  64 ///        linkage map.  Essentially just doing an SOA->AOS conversion and pack.
  65 /// @param pDC - Draw context
  66 /// @param pa - Primitive Assembly state
  67 /// @param linkageMask - Specifies which VS outputs are routed to PS.
  68 /// @param pLinkageMap - maps VS attribute slot to PS slot
  69 /// @param triIndex - Triangle to process attributes for
  70 /// @param pBuffer - Output result
  71 template<typename NumVertsT, typename IsSwizzledT, typename HasConstantInterpT, typename IsDegenerate>
  72 INLINE void ProcessAttributes(
  73     DRAW_CONTEXT *pDC,
  74     PA_STATE&pa,
  75     uint32_t triIndex,
  76     uint32_t primId,
  77     float *pBuffer)
  78 {
  79     static_assert(NumVertsT::value > 0 && NumVertsT::value <= 3, "Invalid value for NumVertsT");
  80     const SWR_BACKEND_STATE& backendState = pDC->pState->state.backendState;
  81     // Conservative Rasterization requires degenerate tris to have constant attribute interpolation
  82     uint32_t constantInterpMask = IsDegenerate::value ? 0xFFFFFFFF : backendState.constantInterpolationMask;
  83     const uint32_t provokingVertex = pDC->pState->state.frontendState.topologyProvokingVertex;
  84     const PRIMITIVE_TOPOLOGY topo = pDC->pState->state.topology;
  85
  86     static const float constTable[3][4] = {
  87         { 0.0f, 0.0f, 0.0f, 0.0f },
  88         { 0.0f, 0.0f, 0.0f, 1.0f },
  89         { 1.0f, 1.0f, 1.0f, 1.0f }
  90     };
  91
  92     for (uint32_t i = 0; i < backendState.numAttributes; ++i)
  93     {
  94         uint32_t inputSlot;
  95         if (IsSwizzledT::value)
  96         {
  97             SWR_ATTRIB_SWIZZLE attribSwizzle = backendState.swizzleMap[i];
  98             inputSlot = backendState.vertexAttribOffset + attribSwizzle.sourceAttrib;
  99
 100         }
 101         else
 102         {
 103             inputSlot = backendState.vertexAttribOffset + i;
 104         }
 105
 106         simd4scalar attrib[3];    // triangle attribs (always 4 wide)
 107         float* pAttribStart = pBuffer;
 108
 109         if (HasConstantInterpT::value || IsDegenerate::value)
 110         {
 111             if (CheckBit(constantInterpMask, i))
 112             {
 113                 uint32_t vid;
 114                 uint32_t adjustedTriIndex;
 115                 static const uint32_t tristripProvokingVertex[] = { 0, 2, 1 };
 116                 static const int32_t quadProvokingTri[2][4] = { { 0, 0, 0, 1 },{ 0, -1, 0, 0 } };
 117                 static const uint32_t quadProvokingVertex[2][4] = { { 0, 1, 2, 2 },{ 0, 1, 1, 2 } };
 118                 static const int32_t qstripProvokingTri[2][4] = { { 0, 0, 0, 1 },{ -1, 0, 0, 0 } };
 119                 static const uint32_t qstripProvokingVertex[2][4] = { { 0, 1, 2, 1 },{ 0, 0, 2, 1 } };
 120
 121                 switch (topo) {
 122                 case TOP_QUAD_LIST:
 123                     adjustedTriIndex = triIndex + quadProvokingTri[triIndex & 1][provokingVertex];
 124                     vid = quadProvokingVertex[triIndex & 1][provokingVertex];
 125                     break;
 126                 case TOP_QUAD_STRIP:
 127                     adjustedTriIndex = triIndex + qstripProvokingTri[triIndex & 1][provokingVertex];
 128                     vid = qstripProvokingVertex[triIndex & 1][provokingVertex];
 129                     break;
 130                 case TOP_TRIANGLE_STRIP:
 131                     adjustedTriIndex = triIndex;
 132                     vid = (triIndex & 1)
 133                         ? tristripProvokingVertex[provokingVertex]
 134                         : provokingVertex;
 135                     break;
 136                 default:
 137                     adjustedTriIndex = triIndex;
 138                     vid = provokingVertex;
 139                     break;
 140                 }
 141
 142                 pa.AssembleSingle(inputSlot, adjustedTriIndex, attrib);
 143
 144                 for (uint32_t i = 0; i < NumVertsT::value; ++i)
 145                 {
 146                     SIMD128::store_ps(pBuffer, attrib[vid]);
 147                     pBuffer += 4;
 148                 }
 149             }
 150             else
 151             {
 152                 pa.AssembleSingle(inputSlot, triIndex, attrib);
 153
 154                 for (uint32_t i = 0; i < NumVertsT::value; ++i)
 155                 {
 156                     SIMD128::store_ps(pBuffer, attrib[i]);
 157                     pBuffer += 4;
 158                 }
 159             }
 160         }
 161         else
 162         {
 163             pa.AssembleSingle(inputSlot, triIndex, attrib);
 164
 165             for (uint32_t i = 0; i < NumVertsT::value; ++i)
 166             {
 167                 SIMD128::store_ps(pBuffer, attrib[i]);
 168                 pBuffer += 4;
 169             }
 170         }
 171
 172         // pad out the attrib buffer to 3 verts to ensure the triangle
 173         // interpolation code in the pixel shader works correctly for the
 174         // 3 topologies - point, line, tri.  This effectively zeros out the
 175         // effect of the missing vertices in the triangle interpolation.
 176         for (uint32_t v = NumVertsT::value; v < 3; ++v)
 177         {
 178             SIMD128::store_ps(pBuffer, attrib[NumVertsT::value - 1]);
 179             pBuffer += 4;
 180         }
 181
 182         // check for constant source overrides
 183         if (IsSwizzledT::value)
 184         {
 185             uint32_t mask = backendState.swizzleMap[i].componentOverrideMask;
 186             if (mask)
 187             {
 188                 DWORD comp;
 189                 while (_BitScanForward(&comp, mask))
 190                 {
 191                     mask &= ~(1 << comp);
 192
 193                     float constantValue = 0.0f;
 194                     switch ((SWR_CONSTANT_SOURCE)backendState.swizzleMap[i].constantSource)
 195                     {
 196                     case SWR_CONSTANT_SOURCE_CONST_0000:
 197                     case SWR_CONSTANT_SOURCE_CONST_0001_FLOAT:
 198                     case SWR_CONSTANT_SOURCE_CONST_1111_FLOAT:
 199                         constantValue = constTable[backendState.swizzleMap[i].constantSource][comp];
 200                         break;
 201                     case SWR_CONSTANT_SOURCE_PRIM_ID:
 202                         constantValue = *(float*)&primId;
 203                         break;
 204                     }
 205
 206                     // apply constant value to all 3 vertices
 207                     for (uint32_t v = 0; v < 3; ++v)
 208                     {
 209                         pAttribStart[comp + v * 4] = constantValue;
 210                     }
 211                 }
 212             }
 213         }
 214     }
 215 }
 216
 217 typedef void(*PFN_PROCESS_ATTRIBUTES)(DRAW_CONTEXT*, PA_STATE&, uint32_t, uint32_t, float*);
 218
 219 struct ProcessAttributesChooser
 220 {
 221     typedef PFN_PROCESS_ATTRIBUTES FuncType;
 222
 223     template <typename... ArgsB>
 224     static FuncType GetFunc()
 225     {
 226         return ProcessAttributes<ArgsB...>;
 227     }
 228 };
 229
 230 PFN_PROCESS_ATTRIBUTES GetProcessAttributesFunc(uint32_t NumVerts, bool IsSwizzled, bool HasConstantInterp, bool IsDegenerate = false)
 231 {
 232     return TemplateArgUnroller<ProcessAttributesChooser>::GetFunc(IntArg<1, 3>{NumVerts}, IsSwizzled, HasConstantInterp, IsDegenerate);
 233 }
 234
 235 //////////////////////////////////////////////////////////////////////////
 236 /// @brief Processes enabled user clip distances. Loads the active clip
 237 ///        distances from the PA, sets up barycentric equations, and
 238 ///        stores the results to the output buffer
 239 /// @param pa - Primitive Assembly state
 240 /// @param primIndex - primitive index to process
 241 /// @param clipDistMask - mask of enabled clip distances
 242 /// @param pUserClipBuffer - buffer to store results
 243 template<uint32_t NumVerts>
 244 void ProcessUserClipDist(const SWR_BACKEND_STATE& state, PA_STATE& pa, uint32_t primIndex, float *pRecipW, float* pUserClipBuffer)
 245 {
 246     DWORD clipDist;
 247     uint32_t clipDistMask = state.clipDistanceMask;
 248     while (_BitScanForward(&clipDist, clipDistMask))
 249     {
 250         clipDistMask &= ~(1 << clipDist);
 251         uint32_t clipSlot = clipDist >> 2;
 252         uint32_t clipComp = clipDist & 0x3;
 253         uint32_t clipAttribSlot = clipSlot == 0 ?
 254             state.vertexClipCullOffset : state.vertexClipCullOffset + 1;
 255
 256         simd4scalar primClipDist[3];
 257         pa.AssembleSingle(clipAttribSlot, primIndex, primClipDist);
 258
 259         for (uint32_t e = 0; e < NumVerts; ++e)
 260         {
 261             OSALIGNSIMD(float) aVertClipDist[4];
 262             SIMD128::store_ps(aVertClipDist, primClipDist[e]);
 263             *(pUserClipBuffer++) = aVertClipDist[clipComp];
 264         };
 265     }
 266 }
 267
 268 INLINE
 269 void TransposeVertices(simd4scalar(&dst)[8], const simdscalar &src0, const simdscalar &src1, const simdscalar &src2)
 270 {
 271     vTranspose3x8(dst, src0, src1, src2);
 272 }
 273
 274 INLINE
 275 void TransposeVertices(simd4scalar(&dst)[16], const simd16scalar &src0, const simd16scalar &src1, const simd16scalar &src2)
 276 {
 277     vTranspose4x16(reinterpret_cast<simd16scalar(&)[4]>(dst), src0, src1, src2, _simd16_setzero_ps());
 278 }
 279
 280
 281 #if KNOB_ENABLE_EARLY_RAST
 282
 283 #define ER_SIMD_TILE_X_DIM (1 << ER_SIMD_TILE_X_SHIFT)
 284 #define ER_SIMD_TILE_Y_DIM (1 << ER_SIMD_TILE_Y_SHIFT)
 285
 286
 287 template<typename SIMD_T>
 288 struct EarlyRastHelper
 289 {
 290 };
 291
 292 template<>
 293 struct EarlyRastHelper<SIMD256>
 294 {
 295     static SIMD256::Integer InitShiftCntrl()
 296     {
 297         return SIMD256::set_epi32(24, 25, 26, 27, 28, 29, 30, 31);
 298     }
 299 };
 300
 301 #if USE_SIMD16_FRONTEND
 302 template<>
 303 struct EarlyRastHelper<SIMD512>
 304 {
 305     static SIMD512::Integer InitShiftCntrl()
 306     {
 307         return SIMD512::set_epi32(16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);
 308     }
 309 };
 310
 311 #endif
 312 //////////////////////////////////////////////////////////////////////////
 313 /// @brief Early Rasterizer (ER); triangles that fit small (e.g. 4x4) tile
 314 ///        (ER tile) can be rasterized as early as in binner to check if
 315 ///        they cover any  pixels. If not - the triangles can be
 316 ///        culled in binner.
 317 ///
 318 /// @param er_bbox - coordinates of ER tile for each triangle
 319 /// @param vAi - A coefficients of triangle edges
 320 /// @param vBi - B coefficients of triangle edges
 321 /// @param vXi - X coordinates of triangle vertices
 322 /// @param vYi - Y coordinates of triangle vertices
 323 /// @param frontWindingTris - mask indicating CCW/CW triangles
 324 /// @param triMask - mask for valid SIMD lanes (triangles)
 325 /// @param oneTileMask - defines triangles for ER to work on
 326 ///                      (tris that fit into ER tile)
 327 template <typename SIMD_T, uint32_t SIMD_WIDTH, typename CT>
 328 uint32_t SIMDCALL EarlyRasterizer(
 329         SIMDBBOX_T<SIMD_T> &er_bbox,
 330         typename SIMD_T::Integer (&vAi)[3],
 331         typename SIMD_T::Integer (&vBi)[3],
 332         typename SIMD_T::Integer (&vXi)[3],
 333         typename SIMD_T::Integer (&vYi)[3],
 334         uint32_t cwTrisMask,
 335         uint32_t triMask,
 336         uint32_t oneTileMask)
 337 {
 338     // step to pixel center of top-left pixel of the triangle bbox
 339     typename SIMD_T::Integer vTopLeftX = SIMD_T::template slli_epi32<ER_SIMD_TILE_X_SHIFT + FIXED_POINT_SHIFT>(er_bbox.xmin);
 340     vTopLeftX = SIMD_T::add_epi32(vTopLeftX, SIMD_T::set1_epi32(FIXED_POINT_SCALE / 2));
 341
 342     typename SIMD_T::Integer vTopLeftY = SIMD_T::template slli_epi32<ER_SIMD_TILE_Y_SHIFT + FIXED_POINT_SHIFT>(er_bbox.ymin);
 343     vTopLeftY = SIMD_T::add_epi32(vTopLeftY, SIMD_T::set1_epi32(FIXED_POINT_SCALE / 2));
 344
 345     // negate A and B for CW tris
 346     typename SIMD_T::Integer vNegA0 = SIMD_T::mullo_epi32(vAi[0], SIMD_T::set1_epi32(-1));
 347     typename SIMD_T::Integer vNegA1 = SIMD_T::mullo_epi32(vAi[1], SIMD_T::set1_epi32(-1));
 348     typename SIMD_T::Integer vNegA2 = SIMD_T::mullo_epi32(vAi[2], SIMD_T::set1_epi32(-1));
 349     typename SIMD_T::Integer vNegB0 = SIMD_T::mullo_epi32(vBi[0], SIMD_T::set1_epi32(-1));
 350     typename SIMD_T::Integer vNegB1 = SIMD_T::mullo_epi32(vBi[1], SIMD_T::set1_epi32(-1));
 351     typename SIMD_T::Integer vNegB2 = SIMD_T::mullo_epi32(vBi[2], SIMD_T::set1_epi32(-1));
 352
 353     RDTSC_EVENT(FEEarlyRastEnter, _mm_popcnt_u32(oneTileMask & triMask), 0);
 354
 355     typename SIMD_T::Integer vShiftCntrl = EarlyRastHelper <SIMD_T>::InitShiftCntrl();
 356     typename SIMD_T::Integer vCwTris = SIMD_T::set1_epi32(cwTrisMask);
 357     typename SIMD_T::Integer vMask = SIMD_T::sllv_epi32(vCwTris, vShiftCntrl);
 358
 359     vAi[0] = SIMD_T::castps_si(SIMD_T::blendv_ps(SIMD_T::castsi_ps(vAi[0]), SIMD_T::castsi_ps(vNegA0), SIMD_T::castsi_ps(vMask)));
 360     vAi[1] = SIMD_T::castps_si(SIMD_T::blendv_ps(SIMD_T::castsi_ps(vAi[1]), SIMD_T::castsi_ps(vNegA1), SIMD_T::castsi_ps(vMask)));
 361     vAi[2] = SIMD_T::castps_si(SIMD_T::blendv_ps(SIMD_T::castsi_ps(vAi[2]), SIMD_T::castsi_ps(vNegA2), SIMD_T::castsi_ps(vMask)));
 362     vBi[0] = SIMD_T::castps_si(SIMD_T::blendv_ps(SIMD_T::castsi_ps(vBi[0]), SIMD_T::castsi_ps(vNegB0), SIMD_T::castsi_ps(vMask)));
 363     vBi[1] = SIMD_T::castps_si(SIMD_T::blendv_ps(SIMD_T::castsi_ps(vBi[1]), SIMD_T::castsi_ps(vNegB1), SIMD_T::castsi_ps(vMask)));
 364     vBi[2] = SIMD_T::castps_si(SIMD_T::blendv_ps(SIMD_T::castsi_ps(vBi[2]), SIMD_T::castsi_ps(vNegB2), SIMD_T::castsi_ps(vMask)));
 365
 366     // evaluate edge equations at top-left pixel
 367     typename SIMD_T::Integer vDeltaX0 = SIMD_T::sub_epi32(vTopLeftX, vXi[0]);
 368     typename SIMD_T::Integer vDeltaX1 = SIMD_T::sub_epi32(vTopLeftX, vXi[1]);
 369     typename SIMD_T::Integer vDeltaX2 = SIMD_T::sub_epi32(vTopLeftX, vXi[2]);
 370
 371     typename SIMD_T::Integer vDeltaY0 = SIMD_T::sub_epi32(vTopLeftY, vYi[0]);
 372     typename SIMD_T::Integer vDeltaY1 = SIMD_T::sub_epi32(vTopLeftY, vYi[1]);
 373     typename SIMD_T::Integer vDeltaY2 = SIMD_T::sub_epi32(vTopLeftY, vYi[2]);
 374
 375     typename SIMD_T::Integer vAX0 = SIMD_T::mullo_epi32(vAi[0], vDeltaX0);
 376     typename SIMD_T::Integer vAX1 = SIMD_T::mullo_epi32(vAi[1], vDeltaX1);
 377     typename SIMD_T::Integer vAX2 = SIMD_T::mullo_epi32(vAi[2], vDeltaX2);
 378
 379     typename SIMD_T::Integer vBY0 = SIMD_T::mullo_epi32(vBi[0], vDeltaY0);
 380     typename SIMD_T::Integer vBY1 = SIMD_T::mullo_epi32(vBi[1], vDeltaY1);
 381     typename SIMD_T::Integer vBY2 = SIMD_T::mullo_epi32(vBi[2], vDeltaY2);
 382
 383     typename SIMD_T::Integer vEdge0 = SIMD_T::add_epi32(vAX0, vBY0);
 384     typename SIMD_T::Integer vEdge1 = SIMD_T::add_epi32(vAX1, vBY1);
 385     typename SIMD_T::Integer vEdge2 = SIMD_T::add_epi32(vAX2, vBY2);
 386
 387     vEdge0 = SIMD_T::template srai_epi32<FIXED_POINT_SHIFT>(vEdge0);
 388     vEdge1 = SIMD_T::template srai_epi32<FIXED_POINT_SHIFT>(vEdge1);
 389     vEdge2 = SIMD_T::template srai_epi32<FIXED_POINT_SHIFT>(vEdge2);
 390
 391     // top left rule
 392     typename SIMD_T::Integer vEdgeAdjust0 = SIMD_T::sub_epi32(vEdge0, SIMD_T::set1_epi32(1));
 393     typename SIMD_T::Integer vEdgeAdjust1 = SIMD_T::sub_epi32(vEdge1, SIMD_T::set1_epi32(1));
 394     typename SIMD_T::Integer vEdgeAdjust2 = SIMD_T::sub_epi32(vEdge2, SIMD_T::set1_epi32(1));
 395
 396     // vA < 0
 397     vEdge0 = SIMD_T::castps_si(SIMD_T::blendv_ps(SIMD_T::castsi_ps(vEdge0), SIMD_T::castsi_ps(vEdgeAdjust0), SIMD_T::castsi_ps(vAi[0])));
 398     vEdge1 = SIMD_T::castps_si(SIMD_T::blendv_ps(SIMD_T::castsi_ps(vEdge1), SIMD_T::castsi_ps(vEdgeAdjust1), SIMD_T::castsi_ps(vAi[1])));
 399     vEdge2 = SIMD_T::castps_si(SIMD_T::blendv_ps(SIMD_T::castsi_ps(vEdge2), SIMD_T::castsi_ps(vEdgeAdjust2), SIMD_T::castsi_ps(vAi[2])));
 400
 401     // vA == 0 && vB < 0
 402     typename SIMD_T::Integer vCmp0 = SIMD_T::cmpeq_epi32(vAi[0], SIMD_T::setzero_si());
 403     typename SIMD_T::Integer vCmp1 = SIMD_T::cmpeq_epi32(vAi[1], SIMD_T::setzero_si());
 404     typename SIMD_T::Integer vCmp2 = SIMD_T::cmpeq_epi32(vAi[2], SIMD_T::setzero_si());
 405
 406     vCmp0 = SIMD_T::and_si(vCmp0, vBi[0]);
 407     vCmp1 = SIMD_T::and_si(vCmp1, vBi[1]);
 408     vCmp2 = SIMD_T::and_si(vCmp2, vBi[2]);
 409
 410     vEdge0 = SIMD_T::castps_si(SIMD_T::blendv_ps(SIMD_T::castsi_ps(vEdge0), SIMD_T::castsi_ps(vEdgeAdjust0), SIMD_T::castsi_ps(vCmp0)));
 411     vEdge1 = SIMD_T::castps_si(SIMD_T::blendv_ps(SIMD_T::castsi_ps(vEdge1), SIMD_T::castsi_ps(vEdgeAdjust1), SIMD_T::castsi_ps(vCmp1)));
 412     vEdge2 = SIMD_T::castps_si(SIMD_T::blendv_ps(SIMD_T::castsi_ps(vEdge2), SIMD_T::castsi_ps(vEdgeAdjust2), SIMD_T::castsi_ps(vCmp2)));
 413
 414
 415 #if ER_SIMD_TILE_X_DIM == 4 && ER_SIMD_TILE_Y_DIM == 4
 416     // Go down
 417     // coverage pixel 0
 418     typename SIMD_T::Integer vMask0 = SIMD_T::and_si(vEdge0, vEdge1);
 419     vMask0 = SIMD_T::and_si(vMask0, vEdge2);
 420
 421     // coverage pixel 1
 422     typename SIMD_T::Integer vEdge0N = SIMD_T::add_epi32(vEdge0, vBi[0]);
 423     typename SIMD_T::Integer vEdge1N = SIMD_T::add_epi32(vEdge1, vBi[1]);
 424     typename SIMD_T::Integer vEdge2N = SIMD_T::add_epi32(vEdge2, vBi[2]);
 425     typename SIMD_T::Integer vMask1 = SIMD_T::and_si(vEdge0N, vEdge1N);
 426     vMask1 = SIMD_T::and_si(vMask1, vEdge2N);
 427
 428     // coverage pixel 2
 429     vEdge0N = SIMD_T::add_epi32(vEdge0N, vBi[0]);
 430     vEdge1N = SIMD_T::add_epi32(vEdge1N, vBi[1]);
 431     vEdge2N = SIMD_T::add_epi32(vEdge2N, vBi[2]);
 432     typename SIMD_T::Integer vMask2 = SIMD_T::and_si(vEdge0N, vEdge1N);
 433     vMask2 = SIMD_T::and_si(vMask2, vEdge2N);
 434
 435     // coverage pixel 3
 436     vEdge0N = SIMD_T::add_epi32(vEdge0N, vBi[0]);
 437     vEdge1N = SIMD_T::add_epi32(vEdge1N, vBi[1]);
 438     vEdge2N = SIMD_T::add_epi32(vEdge2N, vBi[2]);
 439     typename SIMD_T::Integer vMask3 = SIMD_T::and_si(vEdge0N, vEdge1N);
 440     vMask3 = SIMD_T::and_si(vMask3, vEdge2N);
 441
 442     // One step to the right and then up
 443
 444     // coverage pixel 4
 445     vEdge0N = SIMD_T::add_epi32(vEdge0N, vAi[0]);
 446     vEdge1N = SIMD_T::add_epi32(vEdge1N, vAi[1]);
 447     vEdge2N = SIMD_T::add_epi32(vEdge2N, vAi[2]);
 448     typename SIMD_T::Integer vMask4 = SIMD_T::and_si(vEdge0N, vEdge1N);
 449     vMask4 = SIMD_T::and_si(vMask4, vEdge2N);
 450
 451     // coverage pixel 5
 452     vEdge0N = SIMD_T::sub_epi32(vEdge0N, vBi[0]);
 453     vEdge1N = SIMD_T::sub_epi32(vEdge1N, vBi[1]);
 454     vEdge2N = SIMD_T::sub_epi32(vEdge2N, vBi[2]);
 455     typename SIMD_T::Integer vMask5 = SIMD_T::and_si(vEdge0N, vEdge1N);
 456     vMask5 = SIMD_T::and_si(vMask5, vEdge2N);
 457
 458     // coverage pixel 6
 459     vEdge0N = SIMD_T::sub_epi32(vEdge0N, vBi[0]);
 460     vEdge1N = SIMD_T::sub_epi32(vEdge1N, vBi[1]);
 461     vEdge2N = SIMD_T::sub_epi32(vEdge2N, vBi[2]);
 462     typename SIMD_T::Integer vMask6 = SIMD_T::and_si(vEdge0N, vEdge1N);
 463     vMask6 = SIMD_T::and_si(vMask6, vEdge2N);
 464
 465     // coverage pixel 7
 466     vEdge0N = SIMD_T::sub_epi32(vEdge0N, vBi[0]);
 467     vEdge1N = SIMD_T::sub_epi32(vEdge1N, vBi[1]);
 468     vEdge2N = SIMD_T::sub_epi32(vEdge2N, vBi[2]);
 469     typename SIMD_T::Integer vMask7 = SIMD_T::and_si(vEdge0N, vEdge1N);
 470     vMask7 = SIMD_T::and_si(vMask7, vEdge2N);
 471
 472     typename SIMD_T::Integer vLit1 = SIMD_T::or_si(vMask0, vMask1);
 473     vLit1 = SIMD_T::or_si(vLit1, vMask2);
 474     vLit1 = SIMD_T::or_si(vLit1, vMask3);
 475     vLit1 = SIMD_T::or_si(vLit1, vMask4);
 476     vLit1 = SIMD_T::or_si(vLit1, vMask5);
 477     vLit1 = SIMD_T::or_si(vLit1, vMask6);
 478     vLit1 = SIMD_T::or_si(vLit1, vMask7);
 479
 480     // Step to the right and go down again
 481
 482     // coverage pixel 0
 483     vEdge0N = SIMD_T::add_epi32(vEdge0N, vAi[0]);
 484     vEdge1N = SIMD_T::add_epi32(vEdge1N, vAi[1]);
 485     vEdge2N = SIMD_T::add_epi32(vEdge2N, vAi[2]);
 486     vMask0 = SIMD_T::and_si(vEdge0N, vEdge1N);
 487     vMask0 = SIMD_T::and_si(vMask0, vEdge2N);
 488
 489     // coverage pixel 1
 490     vEdge0N = SIMD_T::add_epi32(vEdge0N, vBi[0]);
 491     vEdge1N = SIMD_T::add_epi32(vEdge1N, vBi[1]);
 492     vEdge2N = SIMD_T::add_epi32(vEdge2N, vBi[2]);
 493     vMask1 = SIMD_T::and_si(vEdge0N, vEdge1N);
 494     vMask1 = SIMD_T::and_si(vMask1, vEdge2N);
 495
 496     // coverage pixel 2
 497     vEdge0N = SIMD_T::add_epi32(vEdge0N, vBi[0]);
 498     vEdge1N = SIMD_T::add_epi32(vEdge1N, vBi[1]);
 499     vEdge2N = SIMD_T::add_epi32(vEdge2N, vBi[2]);
 500     vMask2 = SIMD_T::and_si(vEdge0N, vEdge1N);
 501     vMask2 = SIMD_T::and_si(vMask2, vEdge2N);
 502
 503     // coverage pixel 3
 504     vEdge0N = SIMD_T::add_epi32(vEdge0N, vBi[0]);
 505     vEdge1N = SIMD_T::add_epi32(vEdge1N, vBi[1]);
 506     vEdge2N = SIMD_T::add_epi32(vEdge2N, vBi[2]);
 507     vMask3 = SIMD_T::and_si(vEdge0N, vEdge1N);
 508     vMask3 = SIMD_T::and_si(vMask3, vEdge2N);
 509
 510     // And for the last time - to the right and up
 511
 512     // coverage pixel 4
 513     vEdge0N = SIMD_T::add_epi32(vEdge0N, vAi[0]);
 514     vEdge1N = SIMD_T::add_epi32(vEdge1N, vAi[1]);
 515     vEdge2N = SIMD_T::add_epi32(vEdge2N, vAi[2]);
 516     vMask4 = SIMD_T::and_si(vEdge0N, vEdge1N);
 517     vMask4 = SIMD_T::and_si(vMask4, vEdge2N);
 518
 519     // coverage pixel 5
 520     vEdge0N = SIMD_T::sub_epi32(vEdge0N, vBi[0]);
 521     vEdge1N = SIMD_T::sub_epi32(vEdge1N, vBi[1]);
 522     vEdge2N = SIMD_T::sub_epi32(vEdge2N, vBi[2]);
 523     vMask5 = SIMD_T::and_si(vEdge0N, vEdge1N);
 524     vMask5 = SIMD_T::and_si(vMask5, vEdge2N);
 525
 526     // coverage pixel 6
 527     vEdge0N = SIMD_T::sub_epi32(vEdge0N, vBi[0]);
 528     vEdge1N = SIMD_T::sub_epi32(vEdge1N, vBi[1]);
 529     vEdge2N = SIMD_T::sub_epi32(vEdge2N, vBi[2]);
 530     vMask6 = SIMD_T::and_si(vEdge0N, vEdge1N);
 531     vMask6 = SIMD_T::and_si(vMask6, vEdge2N);
 532
 533     // coverage pixel 7
 534     vEdge0N = SIMD_T::sub_epi32(vEdge0N, vBi[0]);
 535     vEdge1N = SIMD_T::sub_epi32(vEdge1N, vBi[1]);
 536     vEdge2N = SIMD_T::sub_epi32(vEdge2N, vBi[2]);
 537     vMask7 = SIMD_T::and_si(vEdge0N, vEdge1N);
 538     vMask7 = SIMD_T::and_si(vMask7, vEdge2N);
 539
 540     typename SIMD_T::Integer vLit2 = SIMD_T::or_si(vMask0, vMask1);
 541     vLit2 = SIMD_T::or_si(vLit2, vMask2);
 542     vLit2 = SIMD_T::or_si(vLit2, vMask3);
 543     vLit2 = SIMD_T::or_si(vLit2, vMask4);
 544     vLit2 = SIMD_T::or_si(vLit2, vMask5);
 545     vLit2 = SIMD_T::or_si(vLit2, vMask6);
 546     vLit2 = SIMD_T::or_si(vLit2, vMask7);
 547
 548     typename SIMD_T::Integer vLit = SIMD_T::or_si(vLit1, vLit2);
 549
 550 #else
 551     // Generic algorithm sweeping in row by row order
 552     typename SIMD_T::Integer vRowMask[ER_SIMD_TILE_Y_DIM];
 553
 554     typename SIMD_T::Integer vEdge0N = vEdge0;
 555     typename SIMD_T::Integer vEdge1N = vEdge1;
 556     typename SIMD_T::Integer vEdge2N = vEdge2;
 557
 558     for (uint32_t row = 0; row < ER_SIMD_TILE_Y_DIM; row++)
 559     {
 560         // Store edge values at the beginning of the row
 561         typename SIMD_T::Integer vRowEdge0 = vEdge0N;
 562         typename SIMD_T::Integer vRowEdge1 = vEdge1N;
 563         typename SIMD_T::Integer vRowEdge2 = vEdge2N;
 564
 565         typename SIMD_T::Integer vColMask[ER_SIMD_TILE_X_DIM];
 566
 567         for (uint32_t col = 0; col < ER_SIMD_TILE_X_DIM; col++)
 568         {
 569             vColMask[col] = SIMD_T::and_si(vEdge0N, vEdge1N);
 570             vColMask[col] = SIMD_T::and_si(vColMask[col], vEdge2N);
 571
 572             vEdge0N = SIMD_T::add_epi32(vEdge0N, vAi[0]);
 573             vEdge1N = SIMD_T::add_epi32(vEdge1N, vAi[1]);
 574             vEdge2N = SIMD_T::add_epi32(vEdge2N, vAi[2]);
 575         }
 576         vRowMask[row] = vColMask[0];
 577         for (uint32_t col = 1; col < ER_SIMD_TILE_X_DIM; col++)
 578         {
 579             vRowMask[row] = SIMD_T::or_si(vRowMask[row], vColMask[col]);
 580         }
 581         // Restore values and go to the next row
 582         vEdge0N = vRowEdge0;
 583         vEdge1N = vRowEdge1;
 584         vEdge2N = vRowEdge2;
 585
 586         vEdge0N = SIMD_T::add_epi32(vEdge0N, vBi[0]);
 587         vEdge1N = SIMD_T::add_epi32(vEdge1N, vBi[1]);
 588         vEdge2N = SIMD_T::add_epi32(vEdge2N, vBi[2]);
 589     }
 590
 591     // compress all masks
 592     typename SIMD_T::Integer vLit = vRowMask[0];
 593     for (uint32_t row = 1; row < ER_SIMD_TILE_Y_DIM; row++)
 594     {
 595         vLit = SIMD_T::or_si(vLit, vRowMask[row]);
 596     }
 597
 598 #endif
 599     // Check which triangles has any pixel lit
 600     uint32_t maskLit = SIMD_T::movemask_ps(SIMD_T::castsi_ps(vLit));
 601     uint32_t maskUnlit = ~maskLit & oneTileMask;
 602
 603     uint32_t oldTriMask = triMask;
 604     triMask &= ~maskUnlit;
 605
 606     if (triMask ^ oldTriMask)
 607     {
 608         RDTSC_EVENT(FEEarlyRastExit, _mm_popcnt_u32(triMask & oneTileMask), 0);
 609     }
 610     return triMask;
 611 }
 612
 613 #endif // Early rasterizer
 614
 615 //////////////////////////////////////////////////////////////////////////
 616 /// @brief Bin triangle primitives to macro tiles. Performs setup, clipping
 617 ///        culling, viewport transform, etc.
 618 /// @param pDC - pointer to draw context.
 619 /// @param pa - The primitive assembly object.
  620 /// @param workerId - thread's worker id. Every thread has a unique id.
 621 /// @param tri - Contains triangle position data for SIMDs worth of triangles.
 622 /// @param primID - Primitive ID for each triangle.
 623 /// @param viewportIdx - viewport array index for each triangle.
 624 /// @tparam CT - ConservativeRastFETraits
 625 template <typename SIMD_T, uint32_t SIMD_WIDTH, typename CT>
 626 void SIMDCALL BinTrianglesImpl(
 627     DRAW_CONTEXT *pDC,
 628     PA_STATE &pa,
 629     uint32_t workerId,
 630     typename SIMD_T::Vec4 tri[3],
 631     uint32_t triMask,
 632     typename SIMD_T::Integer const &primID,
 633     typename SIMD_T::Integer const &viewportIdx,
 634     typename SIMD_T::Integer const &rtIdx)
 635 {
 636     const uint32_t *aRTAI = reinterpret_cast<const uint32_t *>(&rtIdx);
 637
 638     RDTSC_BEGIN(FEBinTriangles, pDC->drawId);
 639
 640     const API_STATE& state = GetApiState(pDC);
 641     const SWR_RASTSTATE& rastState = state.rastState;
 642     const SWR_FRONTEND_STATE& feState = state.frontendState;
 643
 644     MacroTileMgr *pTileMgr = pDC->pTileMgr;
 645
 646     typename SIMD_T::Float vRecipW0 = SIMD_T::set1_ps(1.0f);
 647     typename SIMD_T::Float vRecipW1 = SIMD_T::set1_ps(1.0f);
 648     typename SIMD_T::Float vRecipW2 = SIMD_T::set1_ps(1.0f);
 649
 650     if (feState.vpTransformDisable)
 651     {
 652         // RHW is passed in directly when VP transform is disabled
 653         vRecipW0 = tri[0].v[3];
 654         vRecipW1 = tri[1].v[3];
 655         vRecipW2 = tri[2].v[3];
 656     }
 657     else
 658     {
 659         // Perspective divide
 660         vRecipW0 = SIMD_T::div_ps(SIMD_T::set1_ps(1.0f), tri[0].w);
 661         vRecipW1 = SIMD_T::div_ps(SIMD_T::set1_ps(1.0f), tri[1].w);
 662         vRecipW2 = SIMD_T::div_ps(SIMD_T::set1_ps(1.0f), tri[2].w);
 663
 664         tri[0].v[0] = SIMD_T::mul_ps(tri[0].v[0], vRecipW0);
 665         tri[1].v[0] = SIMD_T::mul_ps(tri[1].v[0], vRecipW1);
 666         tri[2].v[0] = SIMD_T::mul_ps(tri[2].v[0], vRecipW2);
 667
 668         tri[0].v[1] = SIMD_T::mul_ps(tri[0].v[1], vRecipW0);
 669         tri[1].v[1] = SIMD_T::mul_ps(tri[1].v[1], vRecipW1);
 670         tri[2].v[1] = SIMD_T::mul_ps(tri[2].v[1], vRecipW2);
 671
 672         tri[0].v[2] = SIMD_T::mul_ps(tri[0].v[2], vRecipW0);
 673         tri[1].v[2] = SIMD_T::mul_ps(tri[1].v[2], vRecipW1);
 674         tri[2].v[2] = SIMD_T::mul_ps(tri[2].v[2], vRecipW2);
 675
 676         // Viewport transform to screen space coords
 677         if (pa.viewportArrayActive)
 678         {
 679             viewportTransform<3>(tri, state.vpMatrices, viewportIdx);
 680         }
 681         else
 682         {
 683             viewportTransform<3>(tri, state.vpMatrices);
 684         }
 685     }
 686
 687     // Adjust for pixel center location
 688     typename SIMD_T::Float offset = SwrPixelOffsets<SIMD_T>::GetOffset(rastState.pixelLocation);
 689
 690     tri[0].x = SIMD_T::add_ps(tri[0].x, offset);
 691     tri[0].y = SIMD_T::add_ps(tri[0].y, offset);
 692
 693     tri[1].x = SIMD_T::add_ps(tri[1].x, offset);
 694     tri[1].y = SIMD_T::add_ps(tri[1].y, offset);
 695
 696     tri[2].x = SIMD_T::add_ps(tri[2].x, offset);
 697     tri[2].y = SIMD_T::add_ps(tri[2].y, offset);
 698
 699     // Set vXi, vYi to required fixed point precision
 700     typename SIMD_T::Integer vXi[3], vYi[3];
 701     FPToFixedPoint<SIMD_T>(tri, vXi, vYi);
 702
 703     // triangle setup
 704     typename SIMD_T::Integer vAi[3], vBi[3];
 705     triangleSetupABIntVertical(vXi, vYi, vAi, vBi);
 706
 707     // determinant
 708     typename SIMD_T::Integer vDet[2];
 709     calcDeterminantIntVertical(vAi, vBi, vDet);
 710
 711     // cull zero area
 712     uint32_t maskLo = SIMD_T::movemask_pd(SIMD_T::castsi_pd(SIMD_T::cmpeq_epi64(vDet[0], SIMD_T::setzero_si())));
 713     uint32_t maskHi = SIMD_T::movemask_pd(SIMD_T::castsi_pd(SIMD_T::cmpeq_epi64(vDet[1], SIMD_T::setzero_si())));
 714
 715     uint32_t cullZeroAreaMask = maskLo | (maskHi << (SIMD_WIDTH / 2));
 716
 717     // don't cull degenerate triangles if we're conservatively rasterizing
 718     uint32_t origTriMask = triMask;
 719     if (rastState.fillMode == SWR_FILLMODE_SOLID && !CT::IsConservativeT::value)
 720     {
 721         triMask &= ~cullZeroAreaMask;
 722     }
 723
 724     // determine front winding tris
 725     // CW  +det
 726     // CCW det < 0;
 727     // 0 area triangles are marked as backfacing regardless of winding order,
 728     // which is required behavior for conservative rast and wireframe rendering
 729     uint32_t frontWindingTris;
 730     if (rastState.frontWinding == SWR_FRONTWINDING_CW)
 731     {
 732         maskLo = SIMD_T::movemask_pd(SIMD_T::castsi_pd(SIMD_T::cmpgt_epi64(vDet[0], SIMD_T::setzero_si())));
 733         maskHi = SIMD_T::movemask_pd(SIMD_T::castsi_pd(SIMD_T::cmpgt_epi64(vDet[1], SIMD_T::setzero_si())));
 734     }
 735     else
 736     {
 737         maskLo = SIMD_T::movemask_pd(SIMD_T::castsi_pd(SIMD_T::cmpgt_epi64(SIMD_T::setzero_si(), vDet[0])));
 738         maskHi = SIMD_T::movemask_pd(SIMD_T::castsi_pd(SIMD_T::cmpgt_epi64(SIMD_T::setzero_si(), vDet[1])));
 739     }
 740     frontWindingTris = maskLo | (maskHi << (SIMD_WIDTH / 2));
 741
 742     // cull
 743     uint32_t cullTris;
 744     switch ((SWR_CULLMODE)rastState.cullMode)
 745     {
 746     case SWR_CULLMODE_BOTH:  cullTris = 0xffffffff; break;
 747     case SWR_CULLMODE_NONE:  cullTris = 0x0; break;
 748     case SWR_CULLMODE_FRONT: cullTris = frontWindingTris; break;
 749         // 0 area triangles are marked as backfacing, which is required behavior for conservative rast
 750     case SWR_CULLMODE_BACK:  cullTris = ~frontWindingTris; break;
 751     default: SWR_INVALID("Invalid cull mode: %d", rastState.cullMode); cullTris = 0x0; break;
 752     }
 753
 754     triMask &= ~cullTris;
 755
 756     if (origTriMask ^ triMask)
 757     {
 758         RDTSC_EVENT(FECullZeroAreaAndBackface, _mm_popcnt_u32(origTriMask ^ triMask), 0);
 759     }
 760
 761     /// Note: these variable initializations must stay above any 'goto endBinTriangles'
 762     // compute per tri backface
 763     uint32_t frontFaceMask = frontWindingTris;
 764     uint32_t *pPrimID = (uint32_t *)&primID;
 765     const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx;
 766     DWORD triIndex = 0;
 767
 768     uint32_t edgeEnable;
 769     PFN_WORK_FUNC pfnWork;
 770     if (CT::IsConservativeT::value)
 771     {
 772         // determine which edges of the degenerate tri, if any, are valid to rasterize.
 773         // used to call the appropriate templated rasterizer function
 774         if (cullZeroAreaMask > 0)
 775         {
 776             // e0 = v1-v0
 777             const typename SIMD_T::Integer x0x1Mask = SIMD_T::cmpeq_epi32(vXi[0], vXi[1]);
 778             const typename SIMD_T::Integer y0y1Mask = SIMD_T::cmpeq_epi32(vYi[0], vYi[1]);
 779
 780             uint32_t e0Mask = SIMD_T::movemask_ps(SIMD_T::castsi_ps(SIMD_T::and_si(x0x1Mask, y0y1Mask)));
 781
 782             // e1 = v2-v1
 783             const typename SIMD_T::Integer x1x2Mask = SIMD_T::cmpeq_epi32(vXi[1], vXi[2]);
 784             const typename SIMD_T::Integer y1y2Mask = SIMD_T::cmpeq_epi32(vYi[1], vYi[2]);
 785
 786             uint32_t e1Mask = SIMD_T::movemask_ps(SIMD_T::castsi_ps(SIMD_T::and_si(x1x2Mask, y1y2Mask)));
 787
 788             // e2 = v0-v2
 789             // if v0 == v1 & v1 == v2, v0 == v2
 790             uint32_t e2Mask = e0Mask & e1Mask;
 791             SWR_ASSERT(KNOB_SIMD_WIDTH == 8, "Need to update degenerate mask code for avx512");
 792
 793             // edge order: e0 = v0v1, e1 = v1v2, e2 = v0v2
 794             // 32 bit binary: 0000 0000 0010 0100 1001 0010 0100 1001
 795             e0Mask = pdep_u32(e0Mask, 0x00249249);
 796
 797             // 32 bit binary: 0000 0000 0100 1001 0010 0100 1001 0010
 798             e1Mask = pdep_u32(e1Mask, 0x00492492);
 799
 800             // 32 bit binary: 0000 0000 1001 0010 0100 1001 0010 0100
 801             e2Mask = pdep_u32(e2Mask, 0x00924924);
 802
 803             edgeEnable = (0x00FFFFFF & (~(e0Mask | e1Mask | e2Mask)));
 804         }
 805         else
 806         {
 807             edgeEnable = 0x00FFFFFF;
 808         }
 809     }
 810     else
 811     {
 812         // degenerate triangles won't be sent to rasterizer; just enable all edges
 813         pfnWork = GetRasterizerFunc(rastState.sampleCount, rastState.bIsCenterPattern, (rastState.conservativeRast > 0),
 814             (SWR_INPUT_COVERAGE)pDC->pState->state.psState.inputCoverage, EdgeValToEdgeState(ALL_EDGES_VALID), (state.scissorsTileAligned == false));
 815     }
 816
 817     SIMDBBOX_T<SIMD_T> bbox;
 818
 819     if (!triMask)
 820     {
 821         goto endBinTriangles;
 822     }
 823
 824     // Calc bounding box of triangles
 825     calcBoundingBoxIntVertical<SIMD_T, CT>(vXi, vYi, bbox);
 826
 827     // determine if triangle falls between pixel centers and discard
 828     // only discard for non-MSAA case and when conservative rast is disabled
 829     // (xmin + 127) & ~255
 830     // (xmax + 128) & ~255
 831     if ((rastState.sampleCount == SWR_MULTISAMPLE_1X || rastState.bIsCenterPattern) &&
 832         (!CT::IsConservativeT::value))
 833     {
 834         origTriMask = triMask;
 835
 836         int cullCenterMask;
 837
 838         {
 839             typename SIMD_T::Integer xmin = SIMD_T::add_epi32(bbox.xmin, SIMD_T::set1_epi32(127));
 840             xmin = SIMD_T::and_si(xmin, SIMD_T::set1_epi32(~255));
 841             typename SIMD_T::Integer xmax = SIMD_T::add_epi32(bbox.xmax, SIMD_T::set1_epi32(128));
 842             xmax = SIMD_T::and_si(xmax, SIMD_T::set1_epi32(~255));
 843
 844             typename SIMD_T::Integer vMaskH = SIMD_T::cmpeq_epi32(xmin, xmax);
 845
 846             typename SIMD_T::Integer ymin = SIMD_T::add_epi32(bbox.ymin, SIMD_T::set1_epi32(127));
 847             ymin = SIMD_T::and_si(ymin, SIMD_T::set1_epi32(~255));
 848             typename SIMD_T::Integer ymax = SIMD_T::add_epi32(bbox.ymax, SIMD_T::set1_epi32(128));
 849             ymax = SIMD_T::and_si(ymax, SIMD_T::set1_epi32(~255));
 850
 851             typename SIMD_T::Integer vMaskV = SIMD_T::cmpeq_epi32(ymin, ymax);
 852
 853             vMaskV = SIMD_T::or_si(vMaskH, vMaskV);
 854             cullCenterMask = SIMD_T::movemask_ps(SIMD_T::castsi_ps(vMaskV));
 855         }
 856
 857         triMask &= ~cullCenterMask;
 858
 859         if (origTriMask ^ triMask)
 860         {
 861             RDTSC_EVENT(FECullBetweenCenters, _mm_popcnt_u32(origTriMask ^ triMask), 0);
 862         }
 863     }
 864
 865     // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
 866     // Gather the AOS effective scissor rects based on the per-prim VP index.
 867     /// @todo:  Look at speeding this up -- weigh against corresponding costs in rasterizer.
 868     {
 869         typename SIMD_T::Integer scisXmin, scisYmin, scisXmax, scisYmax;
 870         if (pa.viewportArrayActive)
 871
 872         {
 873             GatherScissors(&state.scissorsInFixedPoint[0], pViewportIndex, scisXmin, scisYmin, scisXmax, scisYmax);
 874         }
 875         else // broadcast fast path for non-VPAI case.
 876         {
 877             scisXmin = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].xmin);
 878             scisYmin = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].ymin);
 879             scisXmax = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].xmax);
 880             scisYmax = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].ymax);
 881         }
 882
 883         // Make triangle bbox inclusive
 884         bbox.xmax = SIMD_T::sub_epi32(bbox.xmax, SIMD_T::set1_epi32(1));
 885         bbox.ymax = SIMD_T::sub_epi32(bbox.ymax, SIMD_T::set1_epi32(1));
 886
 887         bbox.xmin = SIMD_T::max_epi32(bbox.xmin, scisXmin);
 888         bbox.ymin = SIMD_T::max_epi32(bbox.ymin, scisYmin);
 889         bbox.xmax = SIMD_T::min_epi32(bbox.xmax, scisXmax);
 890         bbox.ymax = SIMD_T::min_epi32(bbox.ymax, scisYmax);
 891     }
 892
 893     if (CT::IsConservativeT::value)
 894     {
 895         // in the case where a degenerate triangle is on a scissor edge, we need to make sure the primitive bbox has
 896         // some area. Bump the xmax/ymax edges out
 897
 898         typename SIMD_T::Integer topEqualsBottom = SIMD_T::cmpeq_epi32(bbox.ymin, bbox.ymax);
 899         bbox.ymax = SIMD_T::blendv_epi32(bbox.ymax, SIMD_T::add_epi32(bbox.ymax, SIMD_T::set1_epi32(1)), topEqualsBottom);
 900
 901         typename SIMD_T::Integer leftEqualsRight = SIMD_T::cmpeq_epi32(bbox.xmin, bbox.xmax);
 902         bbox.xmax = SIMD_T::blendv_epi32(bbox.xmax, SIMD_T::add_epi32(bbox.xmax, SIMD_T::set1_epi32(1)), leftEqualsRight);
 903     }
 904
 905     // Cull tris completely outside scissor
 906     {
 907         typename SIMD_T::Integer maskOutsideScissorX = SIMD_T::cmpgt_epi32(bbox.xmin, bbox.xmax);
 908         typename SIMD_T::Integer maskOutsideScissorY = SIMD_T::cmpgt_epi32(bbox.ymin, bbox.ymax);
 909         typename SIMD_T::Integer maskOutsideScissorXY = SIMD_T::or_si(maskOutsideScissorX, maskOutsideScissorY);
 910         uint32_t maskOutsideScissor = SIMD_T::movemask_ps(SIMD_T::castsi_ps(maskOutsideScissorXY));
 911         triMask = triMask & ~maskOutsideScissor;
 912     }
 913
 914 #if KNOB_ENABLE_EARLY_RAST
 915     if (rastState.sampleCount == SWR_MULTISAMPLE_1X && !CT::IsConservativeT::value)
 916     {
 917         // Try early rasterization - culling small triangles which do not cover any pixels
 918
 919         // convert to ER tiles
 920         SIMDBBOX_T<SIMD_T> er_bbox;
 921
 922         er_bbox.xmin = SIMD_T::template srai_epi32<ER_SIMD_TILE_X_SHIFT + FIXED_POINT_SHIFT>(bbox.xmin);
 923         er_bbox.xmax = SIMD_T::template srai_epi32<ER_SIMD_TILE_X_SHIFT + FIXED_POINT_SHIFT>(bbox.xmax);
 924         er_bbox.ymin = SIMD_T::template srai_epi32<ER_SIMD_TILE_Y_SHIFT + FIXED_POINT_SHIFT>(bbox.ymin);
 925         er_bbox.ymax = SIMD_T::template srai_epi32<ER_SIMD_TILE_Y_SHIFT + FIXED_POINT_SHIFT>(bbox.ymax);
 926
 927         typename SIMD_T::Integer vTileX = SIMD_T::cmpeq_epi32(er_bbox.xmin, er_bbox.xmax);
 928         typename SIMD_T::Integer vTileY = SIMD_T::cmpeq_epi32(er_bbox.ymin, er_bbox.ymax);
 929
 930         // Take only triangles that fit into ER tile
 931         uint32_t oneTileMask = triMask & SIMD_T::movemask_ps(SIMD_T::castsi_ps(SIMD_T::and_si(vTileX, vTileY)));
 932
 933         if (oneTileMask)
 934         {
 935             // determine CW tris (det > 0)
 936             uint32_t maskCwLo = SIMD_T::movemask_pd(SIMD_T::castsi_pd(SIMD_T::cmpgt_epi64(vDet[0], SIMD_T::setzero_si())));
 937             uint32_t maskCwHi = SIMD_T::movemask_pd(SIMD_T::castsi_pd(SIMD_T::cmpgt_epi64(vDet[1], SIMD_T::setzero_si())));
 938             uint32_t cwTrisMask = maskCwLo | (maskCwHi << (SIMD_WIDTH / 2));
 939
 940             // Try early rasterization
 941             triMask = EarlyRasterizer<SIMD_T, SIMD_WIDTH, CT>(er_bbox, vAi, vBi, vXi, vYi, cwTrisMask, triMask, oneTileMask);
 942
 943             if (!triMask)
 944             {
 945                 RDTSC_END(FEBinTriangles, 1);
 946                 return;
 947             }
 948         }
 949
 950     }
 951 #endif
 952
 953 endBinTriangles:
 954
 955
 956     // Send surviving triangles to the line or point binner based on fill mode
 957     if (rastState.fillMode == SWR_FILLMODE_WIREFRAME)
 958     {
 959         // Simple non-conformant wireframe mode, useful for debugging
 960         // construct 3 SIMD lines out of the triangle and call the line binner for each SIMD
 961         typename SIMD_T::Vec4 line[2];
 962         typename SIMD_T::Float recipW[2];
 963
 964         line[0] = tri[0];
 965         line[1] = tri[1];
 966         recipW[0] = vRecipW0;
 967         recipW[1] = vRecipW1;
 968
 969         BinPostSetupLinesImpl<SIMD_T, SIMD_WIDTH>(pDC, pa, workerId, line, recipW, triMask, primID, viewportIdx, rtIdx);
 970
 971         line[0] = tri[1];
 972         line[1] = tri[2];
 973         recipW[0] = vRecipW1;
 974         recipW[1] = vRecipW2;
 975
 976         BinPostSetupLinesImpl<SIMD_T, SIMD_WIDTH>(pDC, pa, workerId, line, recipW, triMask, primID, viewportIdx, rtIdx);
 977
 978         line[0] = tri[2];
 979         line[1] = tri[0];
 980         recipW[0] = vRecipW2;
 981         recipW[1] = vRecipW0;
 982
 983         BinPostSetupLinesImpl<SIMD_T, SIMD_WIDTH>(pDC, pa, workerId, line, recipW, triMask, primID, viewportIdx, rtIdx);
 984
 985         RDTSC_END(FEBinTriangles, 1);
 986         return;
 987     }
 988     else if (rastState.fillMode == SWR_FILLMODE_POINT)
 989     {
 990         // Bin 3 points
 991         BinPostSetupPointsImpl<SIMD_T, SIMD_WIDTH>(pDC, pa, workerId, &tri[0], triMask, primID, viewportIdx, rtIdx);
 992         BinPostSetupPointsImpl<SIMD_T, SIMD_WIDTH>(pDC, pa, workerId, &tri[1], triMask, primID, viewportIdx, rtIdx);
 993         BinPostSetupPointsImpl<SIMD_T, SIMD_WIDTH>(pDC, pa, workerId, &tri[2], triMask, primID, viewportIdx, rtIdx);
 994
 995         RDTSC_END(FEBinTriangles, 1);
 996         return;
 997     }
 998
 999     // Convert triangle bbox to macrotile units.
1000     bbox.xmin = SIMD_T::template srai_epi32<KNOB_MACROTILE_X_DIM_FIXED_SHIFT>(bbox.xmin);
1001     bbox.ymin = SIMD_T::template srai_epi32<KNOB_MACROTILE_Y_DIM_FIXED_SHIFT>(bbox.ymin);
1002     bbox.xmax = SIMD_T::template srai_epi32<KNOB_MACROTILE_X_DIM_FIXED_SHIFT>(bbox.xmax);
1003     bbox.ymax = SIMD_T::template srai_epi32<KNOB_MACROTILE_Y_DIM_FIXED_SHIFT>(bbox.ymax);
1004
1005     OSALIGNSIMD16(uint32_t) aMTLeft[SIMD_WIDTH], aMTRight[SIMD_WIDTH], aMTTop[SIMD_WIDTH], aMTBottom[SIMD_WIDTH];
1006
1007     SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTLeft),   bbox.xmin);
1008     SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTRight),  bbox.xmax);
1009     SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTTop),    bbox.ymin);
1010     SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTBottom), bbox.ymax);
1011
1012     // transpose verts needed for backend
1013     /// @todo modify BE to take non-transformed verts
1014     OSALIGNSIMD16(simd4scalar) vHorizX[SIMD_WIDTH];
1015     OSALIGNSIMD16(simd4scalar) vHorizY[SIMD_WIDTH];
1016     OSALIGNSIMD16(simd4scalar) vHorizZ[SIMD_WIDTH];
1017     OSALIGNSIMD16(simd4scalar) vHorizW[SIMD_WIDTH];
1018
1019     TransposeVertices(vHorizX, tri[0].x, tri[1].x, tri[2].x);
1020     TransposeVertices(vHorizY, tri[0].y, tri[1].y, tri[2].y);
1021     TransposeVertices(vHorizZ, tri[0].z, tri[1].z, tri[2].z);
1022     TransposeVertices(vHorizW, vRecipW0, vRecipW1, vRecipW2);
1023
1024     // scan remaining valid triangles and bin each separately
1025     while (_BitScanForward(&triIndex, triMask))
1026     {
1027         uint32_t linkageCount = state.backendState.numAttributes;
1028         uint32_t numScalarAttribs = linkageCount * 4;
1029
1030         BE_WORK work;
1031         work.type = DRAW;
1032
1033         bool isDegenerate;
1034         if (CT::IsConservativeT::value)
1035         {
1036             // only rasterize valid edges if we have a degenerate primitive
1037             int32_t triEdgeEnable = (edgeEnable >> (triIndex * 3)) & ALL_EDGES_VALID;
1038             work.pfnWork = GetRasterizerFunc(rastState.sampleCount, rastState.bIsCenterPattern, (rastState.conservativeRast > 0),
1039                 (SWR_INPUT_COVERAGE)pDC->pState->state.psState.inputCoverage, EdgeValToEdgeState(triEdgeEnable), (state.scissorsTileAligned == false));
1040
1041             // Degenerate triangles are required to be constant interpolated
1042             isDegenerate = (triEdgeEnable != ALL_EDGES_VALID) ? true : false;
1043         }
1044         else
1045         {
1046             isDegenerate = false;
1047             work.pfnWork = pfnWork;
1048         }
1049
1050         // Select attribute processor
1051         PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(3,
1052             state.backendState.swizzleEnable, state.backendState.constantInterpolationMask, isDegenerate);
1053
1054         TRIANGLE_WORK_DESC &desc = work.desc.tri;
1055
1056         desc.triFlags.frontFacing = state.forceFront ? 1 : ((frontFaceMask >> triIndex) & 1);
1057         desc.triFlags.renderTargetArrayIndex = aRTAI[triIndex];
1058         desc.triFlags.viewportIndex = pViewportIndex[triIndex];
1059
1060         auto pArena = pDC->pArena;
1061         SWR_ASSERT(pArena != nullptr);
1062
1063         // store active attribs
1064         float *pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
1065         desc.pAttribs = pAttribs;
1066         desc.numAttribs = linkageCount;
1067         pfnProcessAttribs(pDC, pa, triIndex, pPrimID[triIndex], desc.pAttribs);
1068
1069         // store triangle vertex data
1070         desc.pTriBuffer = (float*)pArena->AllocAligned(4 * 4 * sizeof(float), 16);
1071
1072         SIMD128::store_ps(&desc.pTriBuffer[0],  vHorizX[triIndex]);
1073         SIMD128::store_ps(&desc.pTriBuffer[4],  vHorizY[triIndex]);
1074         SIMD128::store_ps(&desc.pTriBuffer[8],  vHorizZ[triIndex]);
1075         SIMD128::store_ps(&desc.pTriBuffer[12], vHorizW[triIndex]);
1076
1077         // store user clip distances
1078         if (state.backendState.clipDistanceMask)
1079         {
1080             uint32_t numClipDist = _mm_popcnt_u32(state.backendState.clipDistanceMask);
1081             desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 3 * sizeof(float));
1082             ProcessUserClipDist<3>(state.backendState, pa, triIndex, &desc.pTriBuffer[12], desc.pUserClipBuffer);
1083         }
1084
1085         for (uint32_t y = aMTTop[triIndex]; y <= aMTBottom[triIndex]; ++y)
1086         {
1087             for (uint32_t x = aMTLeft[triIndex]; x <= aMTRight[triIndex]; ++x)
1088             {
1089 #if KNOB_ENABLE_TOSS_POINTS
1090                 if (!KNOB_TOSS_SETUP_TRIS)
1091 #endif
1092                 {
1093                     pTileMgr->enqueue(x, y, &work);
1094                 }
1095             }
1096         }
1097
1098                      triMask &= ~(1 << triIndex);
1099     }
1100
1101     RDTSC_END(FEBinTriangles, 1);
1102 }
1103
1104 template <typename CT>
1105 void BinTriangles(
1106     DRAW_CONTEXT *pDC,
1107     PA_STATE &pa,
1108     uint32_t workerId,
1109     simdvector tri[3],
1110     uint32_t triMask,
1111     simdscalari const &primID,
1112     simdscalari const &viewportIdx,
1113     simdscalari const &rtIdx)
1114 {
1115     BinTrianglesImpl<SIMD256, KNOB_SIMD_WIDTH, CT>(pDC, pa, workerId, tri, triMask, primID, viewportIdx, rtIdx);
1116 }
1117
1118 #if USE_SIMD16_FRONTEND
1119 template <typename CT>
1120 void SIMDCALL BinTriangles_simd16(
1121     DRAW_CONTEXT *pDC,
1122     PA_STATE &pa,
1123     uint32_t workerId,
1124     simd16vector tri[3],
1125     uint32_t triMask,
1126     simd16scalari const &primID,
1127     simd16scalari const &viewportIdx,
1128     simd16scalari const &rtIdx)
1129 {
1130     BinTrianglesImpl<SIMD512, KNOB_SIMD16_WIDTH, CT>(pDC, pa, workerId, tri, triMask, primID, viewportIdx, rtIdx);
1131 }
1132
1133 #endif
1134 struct FEBinTrianglesChooser
1135 {
1136     typedef PFN_PROCESS_PRIMS FuncType;
1137
1138     template <typename... ArgsB>
1139     static FuncType GetFunc()
1140     {
1141         return BinTriangles<ConservativeRastFETraits<ArgsB...>>;
1142     }
1143 };
1144
1145 // Selector for correct templated BinTrinagles function
1146 PFN_PROCESS_PRIMS GetBinTrianglesFunc(bool IsConservative)
1147 {
1148     return TemplateArgUnroller<FEBinTrianglesChooser>::GetFunc(IsConservative);
1149 }
1150
1151 #if USE_SIMD16_FRONTEND
1152 struct FEBinTrianglesChooser_simd16
1153 {
1154     typedef PFN_PROCESS_PRIMS_SIMD16 FuncType;
1155
1156     template <typename... ArgsB>
1157     static FuncType GetFunc()
1158     {
1159         return BinTriangles_simd16<ConservativeRastFETraits<ArgsB...>>;
1160     }
1161 };
1162
1163 // Selector for correct templated BinTrinagles function
1164 PFN_PROCESS_PRIMS_SIMD16 GetBinTrianglesFunc_simd16(bool IsConservative)
1165 {
1166     return TemplateArgUnroller<FEBinTrianglesChooser_simd16>::GetFunc(IsConservative);
1167 }
1168
1169 #endif
1170
1171 template <typename SIMD_T, uint32_t SIMD_WIDTH>
1172 void BinPostSetupPointsImpl(
1173     DRAW_CONTEXT *pDC,
1174     PA_STATE &pa,
1175     uint32_t workerId,
1176     typename SIMD_T::Vec4 prim[],
1177     uint32_t primMask,
1178     typename SIMD_T::Integer const &primID,
1179     typename SIMD_T::Integer const &viewportIdx,
1180     typename SIMD_T::Integer const &rtIdx)
1181 {
1182     RDTSC_BEGIN(FEBinPoints, pDC->drawId);
1183
1184     typename SIMD_T::Vec4 &primVerts = prim[0];
1185
1186     const API_STATE& state = GetApiState(pDC);
1187     const SWR_RASTSTATE& rastState = state.rastState;
1188     const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx;
1189
1190     // Select attribute processor
1191     PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(1,
1192         state.backendState.swizzleEnable, state.backendState.constantInterpolationMask);
1193
1194     // convert to fixed point
1195     typename SIMD_T::Integer vXi, vYi;
1196
1197     vXi = fpToFixedPointVertical<SIMD_T>(primVerts.x);
1198     vYi = fpToFixedPointVertical<SIMD_T>(primVerts.y);
1199
1200     if (CanUseSimplePoints(pDC))
1201     {
1202         // adjust for ymin-xmin rule
1203         vXi = SIMD_T::sub_epi32(vXi, SIMD_T::set1_epi32(1));
1204         vYi = SIMD_T::sub_epi32(vYi, SIMD_T::set1_epi32(1));
1205
1206         // cull points off the ymin-xmin edge of the viewport
1207         primMask &= ~SIMD_T::movemask_ps(SIMD_T::castsi_ps(vXi));
1208         primMask &= ~SIMD_T::movemask_ps(SIMD_T::castsi_ps(vYi));
1209
1210         // compute macro tile coordinates
1211         typename SIMD_T::Integer macroX = SIMD_T::template srai_epi32<KNOB_MACROTILE_X_DIM_FIXED_SHIFT>(vXi);
1212         typename SIMD_T::Integer macroY = SIMD_T::template srai_epi32<KNOB_MACROTILE_Y_DIM_FIXED_SHIFT>(vYi);
1213
1214         OSALIGNSIMD16(uint32_t) aMacroX[SIMD_WIDTH], aMacroY[SIMD_WIDTH];
1215
1216         SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMacroX), macroX);
1217         SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMacroY), macroY);
1218
1219         // compute raster tile coordinates
1220         typename SIMD_T::Integer rasterX = SIMD_T::template srai_epi32<KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT>(vXi);
1221         typename SIMD_T::Integer rasterY = SIMD_T::template srai_epi32<KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT>(vYi);
1222
1223         // compute raster tile relative x,y for coverage mask
1224         typename SIMD_T::Integer tileAlignedX = SIMD_T::template slli_epi32<KNOB_TILE_X_DIM_SHIFT>(rasterX);
1225         typename SIMD_T::Integer tileAlignedY = SIMD_T::template slli_epi32<KNOB_TILE_Y_DIM_SHIFT>(rasterY);
1226
1227         typename SIMD_T::Integer tileRelativeX = SIMD_T::sub_epi32(SIMD_T::template srai_epi32<FIXED_POINT_SHIFT>(vXi), tileAlignedX);
1228         typename SIMD_T::Integer tileRelativeY = SIMD_T::sub_epi32(SIMD_T::template srai_epi32<FIXED_POINT_SHIFT>(vYi), tileAlignedY);
1229
1230         OSALIGNSIMD16(uint32_t) aTileRelativeX[SIMD_WIDTH];
1231         OSALIGNSIMD16(uint32_t) aTileRelativeY[SIMD_WIDTH];
1232
1233         SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aTileRelativeX), tileRelativeX);
1234         SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aTileRelativeY), tileRelativeY);
1235
1236         OSALIGNSIMD16(uint32_t) aTileAlignedX[SIMD_WIDTH];
1237         OSALIGNSIMD16(uint32_t) aTileAlignedY[SIMD_WIDTH];
1238
1239         SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aTileAlignedX), tileAlignedX);
1240         SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aTileAlignedY), tileAlignedY);
1241
1242         OSALIGNSIMD16(float) aZ[SIMD_WIDTH];
1243         SIMD_T::store_ps(reinterpret_cast<float *>(aZ), primVerts.z);
1244
1245         // store render target array index
1246         const uint32_t *aRTAI = reinterpret_cast<const uint32_t *>(&rtIdx);
1247
1248         uint32_t *pPrimID = (uint32_t *)&primID;
1249         DWORD primIndex = 0;
1250
1251         const SWR_BACKEND_STATE& backendState = pDC->pState->state.backendState;
1252
1253         // scan remaining valid points and bin each separately
1254         while (_BitScanForward(&primIndex, primMask))
1255         {
1256             uint32_t linkageCount = backendState.numAttributes;
1257             uint32_t numScalarAttribs = linkageCount * 4;
1258
1259             BE_WORK work;
1260             work.type = DRAW;
1261
1262             TRIANGLE_WORK_DESC &desc = work.desc.tri;
1263
1264             // points are always front facing
1265             desc.triFlags.frontFacing = 1;
1266             desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];
1267             desc.triFlags.viewportIndex = pViewportIndex[primIndex];
1268
1269             work.pfnWork = RasterizeSimplePoint;
1270
1271             auto pArena = pDC->pArena;
1272             SWR_ASSERT(pArena != nullptr);
1273
1274             // store attributes
1275             float *pAttribs = (float*)pArena->AllocAligned(3 * numScalarAttribs * sizeof(float), 16);
1276             desc.pAttribs = pAttribs;
1277             desc.numAttribs = linkageCount;
1278
1279             pfnProcessAttribs(pDC, pa, primIndex, pPrimID[primIndex], pAttribs);
1280
1281             // store raster tile aligned x, y, perspective correct z
1282             float *pTriBuffer = (float*)pArena->AllocAligned(4 * sizeof(float), 16);
1283             desc.pTriBuffer = pTriBuffer;
1284             *(uint32_t*)pTriBuffer++ = aTileAlignedX[primIndex];
1285             *(uint32_t*)pTriBuffer++ = aTileAlignedY[primIndex];
1286             *pTriBuffer = aZ[primIndex];
1287
1288             uint32_t tX = aTileRelativeX[primIndex];
1289             uint32_t tY = aTileRelativeY[primIndex];
1290
1291             // pack the relative x,y into the coverageMask, the rasterizer will
1292             // generate the true coverage mask from it
1293             work.desc.tri.triFlags.coverageMask = tX | (tY << 4);
1294
1295             // bin it
1296             MacroTileMgr *pTileMgr = pDC->pTileMgr;
1297 #if KNOB_ENABLE_TOSS_POINTS
1298             if (!KNOB_TOSS_SETUP_TRIS)
1299 #endif
1300             {
1301                 pTileMgr->enqueue(aMacroX[primIndex], aMacroY[primIndex], &work);
1302             }
1303
1304             primMask &= ~(1 << primIndex);
1305         }
1306     }
1307     else
1308     {
1309         // non simple points need to be potentially binned to multiple macro tiles
1310         typename SIMD_T::Float vPointSize;
1311
1312         if (rastState.pointParam)
1313         {
1314             typename SIMD_T::Vec4 size[3];
1315             pa.Assemble(VERTEX_SGV_SLOT, size);
1316             vPointSize = size[0][VERTEX_SGV_POINT_SIZE_COMP];
1317         }
1318         else
1319         {
1320             vPointSize = SIMD_T::set1_ps(rastState.pointSize);
1321         }
1322
1323         // bloat point to bbox
1324         SIMDBBOX_T<SIMD_T> bbox;
1325
1326         bbox.xmin = bbox.xmax = vXi;
1327         bbox.ymin = bbox.ymax = vYi;
1328
1329         typename SIMD_T::Float vHalfWidth = SIMD_T::mul_ps(vPointSize, SIMD_T::set1_ps(0.5f));
1330         typename SIMD_T::Integer vHalfWidthi = fpToFixedPointVertical<SIMD_T>(vHalfWidth);
1331
1332         bbox.xmin = SIMD_T::sub_epi32(bbox.xmin, vHalfWidthi);
1333         bbox.xmax = SIMD_T::add_epi32(bbox.xmax, vHalfWidthi);
1334         bbox.ymin = SIMD_T::sub_epi32(bbox.ymin, vHalfWidthi);
1335         bbox.ymax = SIMD_T::add_epi32(bbox.ymax, vHalfWidthi);
1336
1337         // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
1338         // Gather the AOS effective scissor rects based on the per-prim VP index.
1339         /// @todo:  Look at speeding this up -- weigh against corresponding costs in rasterizer.
1340         {
1341             typename SIMD_T::Integer scisXmin, scisYmin, scisXmax, scisYmax;
1342
1343             if (pa.viewportArrayActive)
1344             {
1345                 GatherScissors(&state.scissorsInFixedPoint[0], pViewportIndex, scisXmin, scisYmin, scisXmax, scisYmax);
1346             }
1347             else // broadcast fast path for non-VPAI case.
1348             {
1349                 scisXmin = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].xmin);
1350                 scisYmin = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].ymin);
1351                 scisXmax = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].xmax);
1352                 scisYmax = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].ymax);
1353             }
1354
1355             bbox.xmin = SIMD_T::max_epi32(bbox.xmin, scisXmin);
1356             bbox.ymin = SIMD_T::max_epi32(bbox.ymin, scisYmin);
1357             bbox.xmax = SIMD_T::min_epi32(SIMD_T::sub_epi32(bbox.xmax, SIMD_T::set1_epi32(1)), scisXmax);
1358             bbox.ymax = SIMD_T::min_epi32(SIMD_T::sub_epi32(bbox.ymax, SIMD_T::set1_epi32(1)), scisYmax);
1359         }
1360
1361         // Cull bloated points completely outside scissor
1362         typename SIMD_T::Integer maskOutsideScissorX = SIMD_T::cmpgt_epi32(bbox.xmin, bbox.xmax);
1363         typename SIMD_T::Integer maskOutsideScissorY = SIMD_T::cmpgt_epi32(bbox.ymin, bbox.ymax);
1364         typename SIMD_T::Integer maskOutsideScissorXY = SIMD_T::or_si(maskOutsideScissorX, maskOutsideScissorY);
1365         uint32_t maskOutsideScissor = SIMD_T::movemask_ps(SIMD_T::castsi_ps(maskOutsideScissorXY));
1366         primMask = primMask & ~maskOutsideScissor;
1367
1368         // Convert bbox to macrotile units.
1369         bbox.xmin = SIMD_T::template srai_epi32<KNOB_MACROTILE_X_DIM_FIXED_SHIFT>(bbox.xmin);
1370         bbox.ymin = SIMD_T::template srai_epi32<KNOB_MACROTILE_Y_DIM_FIXED_SHIFT>(bbox.ymin);
1371         bbox.xmax = SIMD_T::template srai_epi32<KNOB_MACROTILE_X_DIM_FIXED_SHIFT>(bbox.xmax);
1372         bbox.ymax = SIMD_T::template srai_epi32<KNOB_MACROTILE_Y_DIM_FIXED_SHIFT>(bbox.ymax);
1373
1374         OSALIGNSIMD16(uint32_t) aMTLeft[SIMD_WIDTH], aMTRight[SIMD_WIDTH], aMTTop[SIMD_WIDTH], aMTBottom[SIMD_WIDTH];
1375
1376         SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTLeft),   bbox.xmin);
1377         SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTRight),  bbox.xmax);
1378         SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTTop),    bbox.ymin);
1379         SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTBottom), bbox.ymax);
1380
1381         // store render target array index
1382         const uint32_t *aRTAI = reinterpret_cast<const uint32_t *>(&rtIdx);
1383
1384         OSALIGNSIMD16(float) aPointSize[SIMD_WIDTH];
1385         SIMD_T::store_ps(reinterpret_cast<float *>(aPointSize), vPointSize);
1386
1387         uint32_t *pPrimID = (uint32_t *)&primID;
1388
1389         OSALIGNSIMD16(float) aPrimVertsX[SIMD_WIDTH];
1390         OSALIGNSIMD16(float) aPrimVertsY[SIMD_WIDTH];
1391         OSALIGNSIMD16(float) aPrimVertsZ[SIMD_WIDTH];
1392
1393         SIMD_T::store_ps(reinterpret_cast<float *>(aPrimVertsX), primVerts.x);
1394         SIMD_T::store_ps(reinterpret_cast<float *>(aPrimVertsY), primVerts.y);
1395         SIMD_T::store_ps(reinterpret_cast<float *>(aPrimVertsZ), primVerts.z);
1396
1397         // scan remaining valid prims and bin each separately
1398         const SWR_BACKEND_STATE& backendState = state.backendState;
1399         DWORD primIndex;
1400         while (_BitScanForward(&primIndex, primMask))
1401         {
1402             uint32_t linkageCount = backendState.numAttributes;
1403             uint32_t numScalarAttribs = linkageCount * 4;
1404
1405             BE_WORK work;
1406             work.type = DRAW;
1407
1408             TRIANGLE_WORK_DESC &desc = work.desc.tri;
1409
1410             desc.triFlags.frontFacing = 1;
1411             desc.triFlags.pointSize = aPointSize[primIndex];
1412             desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];
1413             desc.triFlags.viewportIndex = pViewportIndex[primIndex];
1414
1415             work.pfnWork = RasterizeTriPoint;
1416
1417             auto pArena = pDC->pArena;
1418             SWR_ASSERT(pArena != nullptr);
1419
1420             // store active attribs
1421             desc.pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
1422             desc.numAttribs = linkageCount;
1423             pfnProcessAttribs(pDC, pa, primIndex, pPrimID[primIndex], desc.pAttribs);
1424
1425             // store point vertex data
1426             float *pTriBuffer = (float*)pArena->AllocAligned(4 * sizeof(float), 16);
1427             desc.pTriBuffer = pTriBuffer;
1428             *pTriBuffer++ = aPrimVertsX[primIndex];
1429             *pTriBuffer++ = aPrimVertsY[primIndex];
1430             *pTriBuffer = aPrimVertsZ[primIndex];
1431
1432             // store user clip distances
1433             if (backendState.clipDistanceMask)
1434             {
1435                 uint32_t numClipDist = _mm_popcnt_u32(backendState.clipDistanceMask);
1436                 desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 3 * sizeof(float));
1437                 float dists[8];
1438                 float one = 1.0f;
1439                 ProcessUserClipDist<1>(backendState, pa, primIndex, &one, dists);
1440                 for (uint32_t i = 0; i < numClipDist; i++) {
1441                     desc.pUserClipBuffer[3 * i + 0] = 0.0f;
1442                     desc.pUserClipBuffer[3 * i + 1] = 0.0f;
1443                     desc.pUserClipBuffer[3 * i + 2] = dists[i];
1444                 }
1445             }
1446
1447             MacroTileMgr *pTileMgr = pDC->pTileMgr;
1448             for (uint32_t y = aMTTop[primIndex]; y <= aMTBottom[primIndex]; ++y)
1449             {
1450                 for (uint32_t x = aMTLeft[primIndex]; x <= aMTRight[primIndex]; ++x)
1451                 {
1452 #if KNOB_ENABLE_TOSS_POINTS
1453                     if (!KNOB_TOSS_SETUP_TRIS)
1454 #endif
1455                     {
1456                         pTileMgr->enqueue(x, y, &work);
1457                     }
1458                 }
1459             }
1460
1461             primMask &= ~(1 << primIndex);
1462         }
1463     }
1464
1465     RDTSC_END(FEBinPoints, 1);
1466 }
1467
1468 //////////////////////////////////////////////////////////////////////////
1469 /// @brief Bin SIMD points to the backend.  Only supports point size of 1
1470 /// @param pDC - pointer to draw context.
1471 /// @param pa - The primitive assembly object.
1472 /// @param workerId - thread's worker id. Even thread has a unique id.
1473 /// @param tri - Contains point position data for SIMDs worth of points.
1474 /// @param primID - Primitive ID for each point.
1475 template <typename SIMD_T, uint32_t SIMD_WIDTH>
1476 void BinPointsImpl(
1477     DRAW_CONTEXT *pDC,
1478     PA_STATE &pa,
1479     uint32_t workerId,
1480     typename SIMD_T::Vec4 prim[3],
1481     uint32_t primMask,
1482     typename SIMD_T::Integer const &primID,
1483     typename SIMD_T::Integer const &viewportIdx,
1484     typename SIMD_T::Integer const &rtIdx)
1485 {
1486     const API_STATE& state = GetApiState(pDC);
1487     const SWR_FRONTEND_STATE& feState = state.frontendState;
1488     const SWR_RASTSTATE& rastState = state.rastState;
1489
1490     if (!feState.vpTransformDisable)
1491     {
1492         // perspective divide
1493         typename SIMD_T::Float vRecipW0 = SIMD_T::div_ps(SIMD_T::set1_ps(1.0f), prim[0].w);
1494
1495         prim[0].x = SIMD_T::mul_ps(prim[0].x, vRecipW0);
1496         prim[0].y = SIMD_T::mul_ps(prim[0].y, vRecipW0);
1497         prim[0].z = SIMD_T::mul_ps(prim[0].z, vRecipW0);
1498
1499         // viewport transform to screen coords
1500         if (pa.viewportArrayActive)
1501         {
1502             viewportTransform<1>(prim, state.vpMatrices, viewportIdx);
1503         }
1504         else
1505         {
1506             viewportTransform<1>(prim, state.vpMatrices);
1507         }
1508     }
1509
1510     typename SIMD_T::Float offset = SwrPixelOffsets<SIMD_T>::GetOffset(rastState.pixelLocation);
1511
1512     prim[0].x = SIMD_T::add_ps(prim[0].x, offset);
1513     prim[0].y = SIMD_T::add_ps(prim[0].y, offset);
1514
1515     BinPostSetupPointsImpl<SIMD_T, SIMD_WIDTH>(
1516         pDC,
1517         pa,
1518         workerId,
1519         prim,
1520         primMask,
1521         primID,
1522         viewportIdx,
1523         rtIdx);
1524 }
1525
1526 void BinPoints(
1527     DRAW_CONTEXT *pDC,
1528     PA_STATE &pa,
1529     uint32_t workerId,
1530     simdvector prim[3],
1531     uint32_t primMask,
1532     simdscalari const &primID,
1533     simdscalari const &viewportIdx,
1534     simdscalari const &rtIdx)
1535 {
1536     BinPointsImpl<SIMD256, KNOB_SIMD_WIDTH>(
1537         pDC,
1538         pa,
1539         workerId,
1540         prim,
1541         primMask,
1542         primID,
1543         viewportIdx,
1544         rtIdx);
1545 }
1546
1547 #if USE_SIMD16_FRONTEND
1548 void SIMDCALL BinPoints_simd16(
1549     DRAW_CONTEXT *pDC,
1550     PA_STATE &pa,
1551     uint32_t workerId,
1552     simd16vector prim[3],
1553     uint32_t primMask,
1554     simd16scalari const &primID,
1555     simd16scalari const &viewportIdx,
1556     simd16scalari const & rtIdx)
1557 {
1558     BinPointsImpl<SIMD512, KNOB_SIMD16_WIDTH>(
1559         pDC,
1560         pa,
1561         workerId,
1562         prim,
1563         primMask,
1564         primID,
1565         viewportIdx,
1566         rtIdx);
1567 }
1568
1569 #endif
1570 //////////////////////////////////////////////////////////////////////////
1571 /// @brief Bin SIMD lines to the backend.
1572 /// @param pDC - pointer to draw context.
1573 /// @param pa - The primitive assembly object.
1574 /// @param workerId - thread's worker id. Even thread has a unique id.
1575 /// @param tri - Contains line position data for SIMDs worth of points.
1576 /// @param primID - Primitive ID for each line.
1577 /// @param viewportIdx - Viewport Array Index for each line.
1578 template <typename SIMD_T, uint32_t SIMD_WIDTH>
1579 void BinPostSetupLinesImpl(
1580     DRAW_CONTEXT *pDC,
1581     PA_STATE &pa,
1582     uint32_t workerId,
1583     typename SIMD_T::Vec4 prim[],
1584     typename SIMD_T::Float recipW[],
1585     uint32_t primMask,
1586     typename SIMD_T::Integer const &primID,
1587     typename SIMD_T::Integer const &viewportIdx,
1588     typename SIMD_T::Integer const &rtIdx)
1589 {
1590     const uint32_t *aRTAI = reinterpret_cast<const uint32_t *>(&rtIdx);
1591
1592     RDTSC_BEGIN(FEBinLines, pDC->drawId);
1593
1594     const API_STATE &state = GetApiState(pDC);
1595     const SWR_RASTSTATE &rastState = state.rastState;
1596
1597     // Select attribute processor
1598     PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(2,
1599         state.backendState.swizzleEnable, state.backendState.constantInterpolationMask);
1600
1601     typename SIMD_T::Float &vRecipW0 = recipW[0];
1602     typename SIMD_T::Float &vRecipW1 = recipW[1];
1603
1604     // convert to fixed point
1605     typename SIMD_T::Integer vXi[2], vYi[2];
1606
1607     vXi[0] = fpToFixedPointVertical<SIMD_T>(prim[0].x);
1608     vYi[0] = fpToFixedPointVertical<SIMD_T>(prim[0].y);
1609     vXi[1] = fpToFixedPointVertical<SIMD_T>(prim[1].x);
1610     vYi[1] = fpToFixedPointVertical<SIMD_T>(prim[1].y);
1611
1612     // compute x-major vs y-major mask
1613     typename SIMD_T::Integer xLength = SIMD_T::abs_epi32(SIMD_T::sub_epi32(vXi[0], vXi[1]));
1614     typename SIMD_T::Integer yLength = SIMD_T::abs_epi32(SIMD_T::sub_epi32(vYi[0], vYi[1]));
1615     typename SIMD_T::Float vYmajorMask = SIMD_T::castsi_ps(SIMD_T::cmpgt_epi32(yLength, xLength));
1616     uint32_t yMajorMask = SIMD_T::movemask_ps(vYmajorMask);
1617
1618     // cull zero-length lines
1619     typename SIMD_T::Integer vZeroLengthMask = SIMD_T::cmpeq_epi32(xLength, SIMD_T::setzero_si());
1620     vZeroLengthMask = SIMD_T::and_si(vZeroLengthMask, SIMD_T::cmpeq_epi32(yLength, SIMD_T::setzero_si()));
1621
1622     primMask &= ~SIMD_T::movemask_ps(SIMD_T::castsi_ps(vZeroLengthMask));
1623
1624     uint32_t *pPrimID = (uint32_t *)&primID;
1625     const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx;
1626
1627     // Calc bounding box of lines
1628     SIMDBBOX_T<SIMD_T> bbox;
1629     bbox.xmin = SIMD_T::min_epi32(vXi[0], vXi[1]);
1630     bbox.xmax = SIMD_T::max_epi32(vXi[0], vXi[1]);
1631     bbox.ymin = SIMD_T::min_epi32(vYi[0], vYi[1]);
1632     bbox.ymax = SIMD_T::max_epi32(vYi[0], vYi[1]);
1633
1634     // bloat bbox by line width along minor axis
1635     typename SIMD_T::Float vHalfWidth = SIMD_T::set1_ps(rastState.lineWidth / 2.0f);
1636     typename SIMD_T::Integer vHalfWidthi = fpToFixedPointVertical<SIMD_T>(vHalfWidth);
1637
1638     SIMDBBOX_T<SIMD_T> bloatBox;
1639
1640     bloatBox.xmin = SIMD_T::sub_epi32(bbox.xmin, vHalfWidthi);
1641     bloatBox.xmax = SIMD_T::add_epi32(bbox.xmax, vHalfWidthi);
1642     bloatBox.ymin = SIMD_T::sub_epi32(bbox.ymin, vHalfWidthi);
1643     bloatBox.ymax = SIMD_T::add_epi32(bbox.ymax, vHalfWidthi);
1644
1645     bbox.xmin = SIMD_T::blendv_epi32(bbox.xmin, bloatBox.xmin, vYmajorMask);
1646     bbox.xmax = SIMD_T::blendv_epi32(bbox.xmax, bloatBox.xmax, vYmajorMask);
1647     bbox.ymin = SIMD_T::blendv_epi32(bloatBox.ymin, bbox.ymin, vYmajorMask);
1648     bbox.ymax = SIMD_T::blendv_epi32(bloatBox.ymax, bbox.ymax, vYmajorMask);
1649
1650     // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
1651     {
1652         typename SIMD_T::Integer scisXmin, scisYmin, scisXmax, scisYmax;
1653
1654         if (pa.viewportArrayActive)
1655         {
1656             GatherScissors(&state.scissorsInFixedPoint[0], pViewportIndex, scisXmin, scisYmin, scisXmax, scisYmax);
1657         }
1658         else // broadcast fast path for non-VPAI case.
1659         {
1660             scisXmin = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].xmin);
1661             scisYmin = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].ymin);
1662             scisXmax = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].xmax);
1663             scisYmax = SIMD_T::set1_epi32(state.scissorsInFixedPoint[0].ymax);
1664         }
1665
1666         bbox.xmin = SIMD_T::max_epi32(bbox.xmin, scisXmin);
1667         bbox.ymin = SIMD_T::max_epi32(bbox.ymin, scisYmin);
1668         bbox.xmax = SIMD_T::min_epi32(SIMD_T::sub_epi32(bbox.xmax, SIMD_T::set1_epi32(1)), scisXmax);
1669         bbox.ymax = SIMD_T::min_epi32(SIMD_T::sub_epi32(bbox.ymax, SIMD_T::set1_epi32(1)), scisYmax);
1670     }
1671
1672     // Cull prims completely outside scissor
1673     {
1674         typename SIMD_T::Integer maskOutsideScissorX = SIMD_T::cmpgt_epi32(bbox.xmin, bbox.xmax);
1675         typename SIMD_T::Integer maskOutsideScissorY = SIMD_T::cmpgt_epi32(bbox.ymin, bbox.ymax);
1676         typename SIMD_T::Integer maskOutsideScissorXY = SIMD_T::or_si(maskOutsideScissorX, maskOutsideScissorY);
1677         uint32_t maskOutsideScissor = SIMD_T::movemask_ps(SIMD_T::castsi_ps(maskOutsideScissorXY));
1678         primMask = primMask & ~maskOutsideScissor;
1679     }
1680
1681     // transpose verts needed for backend
1682     /// @todo modify BE to take non-transformed verts
1683     OSALIGNSIMD16(simd4scalar) vHorizX[SIMD_WIDTH];
1684     OSALIGNSIMD16(simd4scalar) vHorizY[SIMD_WIDTH];
1685     OSALIGNSIMD16(simd4scalar) vHorizZ[SIMD_WIDTH];
1686     OSALIGNSIMD16(simd4scalar) vHorizW[SIMD_WIDTH];
1687
1688     if (!primMask)
1689     {
1690         goto endBinLines;
1691     }
1692
1693     // Convert triangle bbox to macrotile units.
1694     bbox.xmin = SIMD_T::template srai_epi32<KNOB_MACROTILE_X_DIM_FIXED_SHIFT>(bbox.xmin);
1695     bbox.ymin = SIMD_T::template srai_epi32<KNOB_MACROTILE_Y_DIM_FIXED_SHIFT>(bbox.ymin);
1696     bbox.xmax = SIMD_T::template srai_epi32<KNOB_MACROTILE_X_DIM_FIXED_SHIFT>(bbox.xmax);
1697     bbox.ymax = SIMD_T::template srai_epi32<KNOB_MACROTILE_Y_DIM_FIXED_SHIFT>(bbox.ymax);
1698
1699     OSALIGNSIMD16(uint32_t) aMTLeft[SIMD_WIDTH], aMTRight[SIMD_WIDTH], aMTTop[SIMD_WIDTH], aMTBottom[SIMD_WIDTH];
1700
1701     SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTLeft),   bbox.xmin);
1702     SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTRight),  bbox.xmax);
1703     SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTTop),    bbox.ymin);
1704     SIMD_T::store_si(reinterpret_cast<typename SIMD_T::Integer *>(aMTBottom), bbox.ymax);
1705
1706     TransposeVertices(vHorizX, prim[0].x, prim[1].x, SIMD_T::setzero_ps());
1707     TransposeVertices(vHorizY, prim[0].y, prim[1].y, SIMD_T::setzero_ps());
1708     TransposeVertices(vHorizZ, prim[0].z, prim[1].z, SIMD_T::setzero_ps());
1709     TransposeVertices(vHorizW, vRecipW0,  vRecipW1,  SIMD_T::setzero_ps());
1710
1711     // scan remaining valid prims and bin each separately
1712     DWORD primIndex;
1713     while (_BitScanForward(&primIndex, primMask))
1714     {
1715         uint32_t linkageCount = state.backendState.numAttributes;
1716         uint32_t numScalarAttribs = linkageCount * 4;
1717
1718         BE_WORK work;
1719         work.type = DRAW;
1720
1721         TRIANGLE_WORK_DESC &desc = work.desc.tri;
1722
1723         desc.triFlags.frontFacing = 1;
1724         desc.triFlags.yMajor = (yMajorMask >> primIndex) & 1;
1725         desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];
1726         desc.triFlags.viewportIndex = pViewportIndex[primIndex];
1727
1728         work.pfnWork = RasterizeLine;
1729
1730         auto pArena = pDC->pArena;
1731         SWR_ASSERT(pArena != nullptr);
1732
1733         // store active attribs
1734         desc.pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
1735         desc.numAttribs = linkageCount;
1736         pfnProcessAttribs(pDC, pa, primIndex, pPrimID[primIndex], desc.pAttribs);
1737
1738         // store line vertex data
1739         desc.pTriBuffer = (float*)pArena->AllocAligned(4 * 4 * sizeof(float), 16);
1740
1741         _mm_store_ps(&desc.pTriBuffer[0],  vHorizX[primIndex]);
1742         _mm_store_ps(&desc.pTriBuffer[4],  vHorizY[primIndex]);
1743         _mm_store_ps(&desc.pTriBuffer[8],  vHorizZ[primIndex]);
1744         _mm_store_ps(&desc.pTriBuffer[12], vHorizW[primIndex]);
1745
1746         // store user clip distances
1747         if (state.backendState.clipDistanceMask)
1748         {
1749             uint32_t numClipDist = _mm_popcnt_u32(state.backendState.clipDistanceMask);
1750             desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 2 * sizeof(float));
1751             ProcessUserClipDist<2>(state.backendState, pa, primIndex, &desc.pTriBuffer[12], desc.pUserClipBuffer);
1752         }
1753
1754         MacroTileMgr *pTileMgr = pDC->pTileMgr;
1755         for (uint32_t y = aMTTop[primIndex]; y <= aMTBottom[primIndex]; ++y)
1756         {
1757             for (uint32_t x = aMTLeft[primIndex]; x <= aMTRight[primIndex]; ++x)
1758             {
1759 #if KNOB_ENABLE_TOSS_POINTS
1760                 if (!KNOB_TOSS_SETUP_TRIS)
1761 #endif
1762                 {
1763                     pTileMgr->enqueue(x, y, &work);
1764                 }
1765             }
1766         }
1767
1768         primMask &= ~(1 << primIndex);
1769     }
1770
1771 endBinLines:
1772
1773     RDTSC_END(FEBinLines, 1);
1774 }
1775
1776 //////////////////////////////////////////////////////////////////////////
1777 /// @brief Bin SIMD lines to the backend.
1778 /// @param pDC - pointer to draw context.
1779 /// @param pa - The primitive assembly object.
1780 /// @param workerId - thread's worker id. Even thread has a unique id.
1781 /// @param tri - Contains line position data for SIMDs worth of points.
1782 /// @param primID - Primitive ID for each line.
1783 /// @param viewportIdx - Viewport Array Index for each line.
1784 template <typename SIMD_T, uint32_t SIMD_WIDTH>
1785 void SIMDCALL BinLinesImpl(
1786     DRAW_CONTEXT *pDC,
1787     PA_STATE &pa,
1788     uint32_t workerId,
1789     typename SIMD_T::Vec4 prim[3],
1790     uint32_t primMask,
1791     typename SIMD_T::Integer const &primID,
1792     typename SIMD_T::Integer const &viewportIdx,
1793     typename SIMD_T::Integer const & rtIdx)
1794 {
1795     const API_STATE& state = GetApiState(pDC);
1796     const SWR_RASTSTATE& rastState = state.rastState;
1797     const SWR_FRONTEND_STATE& feState = state.frontendState;
1798
1799     typename SIMD_T::Float vRecipW[2] = { SIMD_T::set1_ps(1.0f), SIMD_T::set1_ps(1.0f) };
1800
1801     if (!feState.vpTransformDisable)
1802     {
1803         // perspective divide
1804         vRecipW[0] = SIMD_T::div_ps(SIMD_T::set1_ps(1.0f), prim[0].w);
1805         vRecipW[1] = SIMD_T::div_ps(SIMD_T::set1_ps(1.0f), prim[1].w);
1806
1807         prim[0].v[0] = SIMD_T::mul_ps(prim[0].v[0], vRecipW[0]);
1808         prim[1].v[0] = SIMD_T::mul_ps(prim[1].v[0], vRecipW[1]);
1809
1810         prim[0].v[1] = SIMD_T::mul_ps(prim[0].v[1], vRecipW[0]);
1811         prim[1].v[1] = SIMD_T::mul_ps(prim[1].v[1], vRecipW[1]);
1812
1813         prim[0].v[2] = SIMD_T::mul_ps(prim[0].v[2], vRecipW[0]);
1814         prim[1].v[2] = SIMD_T::mul_ps(prim[1].v[2], vRecipW[1]);
1815
1816         // viewport transform to screen coords
1817         if (pa.viewportArrayActive)
1818         {
1819             viewportTransform<2>(prim, state.vpMatrices, viewportIdx);
1820         }
1821         else
1822         {
1823             viewportTransform<2>(prim, state.vpMatrices);
1824         }
1825     }
1826
1827     // adjust for pixel center location
1828     typename SIMD_T::Float offset = SwrPixelOffsets<SIMD_T>::GetOffset(rastState.pixelLocation);
1829
1830     prim[0].x = SIMD_T::add_ps(prim[0].x, offset);
1831     prim[0].y = SIMD_T::add_ps(prim[0].y, offset);
1832
1833     prim[1].x = SIMD_T::add_ps(prim[1].x, offset);
1834     prim[1].y = SIMD_T::add_ps(prim[1].y, offset);
1835
1836     BinPostSetupLinesImpl<SIMD_T, SIMD_WIDTH>(
1837         pDC,
1838         pa,
1839         workerId,
1840         prim,
1841         vRecipW,
1842         primMask,
1843         primID,
1844         viewportIdx,
1845         rtIdx);
1846 }
1847
1848 void BinLines(
1849     DRAW_CONTEXT *pDC,
1850     PA_STATE &pa,
1851     uint32_t workerId,
1852     simdvector prim[],
1853     uint32_t primMask,
1854     simdscalari const &primID,
1855     simdscalari const &viewportIdx,
1856     simdscalari const &rtIdx)
1857 {
1858     BinLinesImpl<SIMD256, KNOB_SIMD_WIDTH>(pDC, pa, workerId, prim, primMask, primID, viewportIdx, rtIdx);
1859 }
1860
1861 #if USE_SIMD16_FRONTEND
1862 void SIMDCALL BinLines_simd16(
1863     DRAW_CONTEXT *pDC,
1864     PA_STATE &pa,
1865     uint32_t workerId,
1866     simd16vector prim[3],
1867     uint32_t primMask,
1868     simd16scalari const &primID,
1869     simd16scalari const &viewportIdx,
1870     simd16scalari const &rtIdx)
1871 {
1872     BinLinesImpl<SIMD512, KNOB_SIMD16_WIDTH>(pDC, pa, workerId, prim, primMask, primID, viewportIdx, rtIdx);
1873 }
1874
1875 #endif