5d549873f363e88c43061f08003a75bc3a38276b
[mesa.git] / src / gallium / drivers / swr / rasterizer / core / frontend.cpp
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file frontend.cpp
24 *
25 * @brief Implementation for Frontend which handles vertex processing,
26 * primitive assembly, clipping, binning, etc.
27 *
28 ******************************************************************************/
29
30 #include "api.h"
31 #include "frontend.h"
32 #include "backend.h"
33 #include "context.h"
34 #include "rdtsc_core.h"
35 #include "rasterizer.h"
36 #include "conservativeRast.h"
37 #include "utils.h"
38 #include "threads.h"
39 #include "pa.h"
40 #include "clip.h"
41 #include "tilemgr.h"
42 #include "tessellator.h"
43 #include <limits>
44
45 //////////////////////////////////////////////////////////////////////////
46 /// @brief Helper macro to generate a bitmask
47 static INLINE uint32_t GenMask(uint32_t numBits)
48 {
49 SWR_ASSERT(numBits <= (sizeof(uint32_t) * 8), "Too many bits (%d) for %s", numBits, __FUNCTION__);
50 return ((1U << numBits) - 1);
51 }
52
//////////////////////////////////////////////////////////////////////////
/// @brief Offsets added to post-viewport vertex positions based on
///        raster state.
/// Indexed by SWR_PIXEL_LOCATION: CENTER applies no offset, UL applies a
/// half-pixel offset per lane.
static const simdscalar g_pixelOffsets[SWR_PIXEL_LOCATION_UL + 1] =
{
    _simd_set1_ps(0.0f), // SWR_PIXEL_LOCATION_CENTER
    _simd_set1_ps(0.5f), // SWR_PIXEL_LOCATION_UL
};
61
62 //////////////////////////////////////////////////////////////////////////
63 /// @brief FE handler for SwrSync.
64 /// @param pContext - pointer to SWR context.
65 /// @param pDC - pointer to draw context.
66 /// @param workerId - thread's worker id. Even thread has a unique id.
67 /// @param pUserData - Pointer to user data passed back to sync callback.
68 /// @todo This should go away when we switch this to use compute threading.
69 void ProcessSync(
70 SWR_CONTEXT *pContext,
71 DRAW_CONTEXT *pDC,
72 uint32_t workerId,
73 void *pUserData)
74 {
75 BE_WORK work;
76 work.type = SYNC;
77 work.pfnWork = ProcessSyncBE;
78
79 MacroTileMgr *pTileMgr = pDC->pTileMgr;
80 pTileMgr->enqueue(0, 0, &work);
81 }
82
83 //////////////////////////////////////////////////////////////////////////
84 /// @brief FE handler for SwrDestroyContext.
85 /// @param pContext - pointer to SWR context.
86 /// @param pDC - pointer to draw context.
87 /// @param workerId - thread's worker id. Even thread has a unique id.
88 /// @param pUserData - Pointer to user data passed back to sync callback.
89 void ProcessShutdown(
90 SWR_CONTEXT *pContext,
91 DRAW_CONTEXT *pDC,
92 uint32_t workerId,
93 void *pUserData)
94 {
95 BE_WORK work;
96 work.type = SHUTDOWN;
97 work.pfnWork = ProcessShutdownBE;
98
99 MacroTileMgr *pTileMgr = pDC->pTileMgr;
100 // Enqueue at least 1 work item for each worker thread
101 // account for number of numa nodes
102 uint32_t numNumaNodes = pContext->threadPool.numaMask + 1;
103
104 for (uint32_t i = 0; i < pContext->threadPool.numThreads; ++i)
105 {
106 for (uint32_t n = 0; n < numNumaNodes; ++n)
107 {
108 pTileMgr->enqueue(i, n, &work);
109 }
110 }
111 }
112
113 //////////////////////////////////////////////////////////////////////////
114 /// @brief FE handler for SwrClearRenderTarget.
115 /// @param pContext - pointer to SWR context.
116 /// @param pDC - pointer to draw context.
117 /// @param workerId - thread's worker id. Even thread has a unique id.
118 /// @param pUserData - Pointer to user data passed back to clear callback.
119 /// @todo This should go away when we switch this to use compute threading.
120 void ProcessClear(
121 SWR_CONTEXT *pContext,
122 DRAW_CONTEXT *pDC,
123 uint32_t workerId,
124 void *pUserData)
125 {
126 CLEAR_DESC *pDesc = (CLEAR_DESC*)pUserData;
127 MacroTileMgr *pTileMgr = pDC->pTileMgr;
128
129 // queue a clear to each macro tile
130 // compute macro tile bounds for the specified rect
131 uint32_t macroTileXMin = pDesc->rect.xmin / KNOB_MACROTILE_X_DIM;
132 uint32_t macroTileXMax = (pDesc->rect.xmax - 1) / KNOB_MACROTILE_X_DIM;
133 uint32_t macroTileYMin = pDesc->rect.ymin / KNOB_MACROTILE_Y_DIM;
134 uint32_t macroTileYMax = (pDesc->rect.ymax - 1) / KNOB_MACROTILE_Y_DIM;
135
136 BE_WORK work;
137 work.type = CLEAR;
138 work.pfnWork = ProcessClearBE;
139 work.desc.clear = *pDesc;
140
141 for (uint32_t y = macroTileYMin; y <= macroTileYMax; ++y)
142 {
143 for (uint32_t x = macroTileXMin; x <= macroTileXMax; ++x)
144 {
145 pTileMgr->enqueue(x, y, &work);
146 }
147 }
148 }
149
150 //////////////////////////////////////////////////////////////////////////
151 /// @brief FE handler for SwrStoreTiles.
152 /// @param pContext - pointer to SWR context.
153 /// @param pDC - pointer to draw context.
154 /// @param workerId - thread's worker id. Even thread has a unique id.
155 /// @param pUserData - Pointer to user data passed back to callback.
156 /// @todo This should go away when we switch this to use compute threading.
157 void ProcessStoreTiles(
158 SWR_CONTEXT *pContext,
159 DRAW_CONTEXT *pDC,
160 uint32_t workerId,
161 void *pUserData)
162 {
163 AR_BEGIN(FEProcessStoreTiles, pDC->drawId);
164 MacroTileMgr *pTileMgr = pDC->pTileMgr;
165 STORE_TILES_DESC* pDesc = (STORE_TILES_DESC*)pUserData;
166
167 // queue a store to each macro tile
168 // compute macro tile bounds for the specified rect
169 uint32_t macroTileXMin = pDesc->rect.xmin / KNOB_MACROTILE_X_DIM;
170 uint32_t macroTileXMax = (pDesc->rect.xmax - 1) / KNOB_MACROTILE_X_DIM;
171 uint32_t macroTileYMin = pDesc->rect.ymin / KNOB_MACROTILE_Y_DIM;
172 uint32_t macroTileYMax = (pDesc->rect.ymax - 1) / KNOB_MACROTILE_Y_DIM;
173
174 // store tiles
175 BE_WORK work;
176 work.type = STORETILES;
177 work.pfnWork = ProcessStoreTileBE;
178 work.desc.storeTiles = *pDesc;
179
180 for (uint32_t y = macroTileYMin; y <= macroTileYMax; ++y)
181 {
182 for (uint32_t x = macroTileXMin; x <= macroTileXMax; ++x)
183 {
184 pTileMgr->enqueue(x, y, &work);
185 }
186 }
187
188 AR_END(FEProcessStoreTiles, 0);
189 }
190
//////////////////////////////////////////////////////////////////////////
/// @brief FE handler for SwrInvalidateTiles.
/// @param pContext - pointer to SWR context.
/// @param pDC - pointer to draw context.
/// @param workerId - thread's worker id. Even thread has a unique id.
/// @param pUserData - Pointer to user data passed back to callback.
/// @todo This should go away when we switch this to use compute threading.
void ProcessDiscardInvalidateTiles(
    SWR_CONTEXT *pContext,
    DRAW_CONTEXT *pDC,
    uint32_t workerId,
    void *pUserData)
{
    AR_BEGIN(FEProcessInvalidateTiles, pDC->drawId);
    DISCARD_INVALIDATE_TILES_DESC *pDesc = (DISCARD_INVALIDATE_TILES_DESC*)pUserData;
    MacroTileMgr *pTileMgr = pDC->pTileMgr;

    // compute macro tile bounds for the specified rect
    // Default path: only macrotiles *fully* covered by the rect (round the
    // mins up, the maxes down).
    // NOTE(review): these are unsigned; if rect.xmax < KNOB_MACROTILE_X_DIM
    // the "- 1" below wraps to a huge value — presumably callers pass
    // tile-aligned, non-degenerate rects when fullTilesOnly is set; confirm.
    uint32_t macroTileXMin = (pDesc->rect.xmin + KNOB_MACROTILE_X_DIM - 1) / KNOB_MACROTILE_X_DIM;
    uint32_t macroTileXMax = (pDesc->rect.xmax / KNOB_MACROTILE_X_DIM) - 1;
    uint32_t macroTileYMin = (pDesc->rect.ymin + KNOB_MACROTILE_Y_DIM - 1) / KNOB_MACROTILE_Y_DIM;
    uint32_t macroTileYMax = (pDesc->rect.ymax / KNOB_MACROTILE_Y_DIM) - 1;

    if (pDesc->fullTilesOnly == false)
    {
        // include partial tiles
        macroTileXMin = pDesc->rect.xmin / KNOB_MACROTILE_X_DIM;
        macroTileXMax = (pDesc->rect.xmax - 1) / KNOB_MACROTILE_X_DIM;
        macroTileYMin = pDesc->rect.ymin / KNOB_MACROTILE_Y_DIM;
        macroTileYMax = (pDesc->rect.ymax - 1) / KNOB_MACROTILE_Y_DIM;
    }

    SWR_ASSERT(macroTileXMax <= KNOB_NUM_HOT_TILES_X);
    SWR_ASSERT(macroTileYMax <= KNOB_NUM_HOT_TILES_Y);

    // NOTE(review): clamps to KNOB_NUM_HOT_TILES_X/Y rather than (KNOB - 1),
    // matching the inclusive asserts above — verify against MacroTileMgr
    // whether index == KNOB_NUM_HOT_TILES_X is actually addressable.
    macroTileXMax = std::min<int32_t>(macroTileXMax, KNOB_NUM_HOT_TILES_X);
    macroTileYMax = std::min<int32_t>(macroTileYMax, KNOB_NUM_HOT_TILES_Y);

    // load tiles
    BE_WORK work;
    work.type = DISCARDINVALIDATETILES;
    work.pfnWork = ProcessDiscardInvalidateTilesBE;
    work.desc.discardInvalidateTiles = *pDesc;

    for (uint32_t x = macroTileXMin; x <= macroTileXMax; ++x)
    {
        for (uint32_t y = macroTileYMin; y <= macroTileYMax; ++y)
        {
            pTileMgr->enqueue(x, y, &work);
        }
    }

    AR_END(FEProcessInvalidateTiles, 0);
}
245
246 //////////////////////////////////////////////////////////////////////////
247 /// @brief Computes the number of primitives given the number of verts.
248 /// @param mode - primitive topology for draw operation.
249 /// @param numPrims - number of vertices or indices for draw.
250 /// @todo Frontend needs to be refactored. This will go in appropriate place then.
251 uint32_t GetNumPrims(
252 PRIMITIVE_TOPOLOGY mode,
253 uint32_t numPrims)
254 {
255 switch (mode)
256 {
257 case TOP_POINT_LIST: return numPrims;
258 case TOP_TRIANGLE_LIST: return numPrims / 3;
259 case TOP_TRIANGLE_STRIP: return numPrims < 3 ? 0 : numPrims - 2;
260 case TOP_TRIANGLE_FAN: return numPrims < 3 ? 0 : numPrims - 2;
261 case TOP_TRIANGLE_DISC: return numPrims < 2 ? 0 : numPrims - 1;
262 case TOP_QUAD_LIST: return numPrims / 4;
263 case TOP_QUAD_STRIP: return numPrims < 4 ? 0 : (numPrims - 2) / 2;
264 case TOP_LINE_STRIP: return numPrims < 2 ? 0 : numPrims - 1;
265 case TOP_LINE_LIST: return numPrims / 2;
266 case TOP_LINE_LOOP: return numPrims;
267 case TOP_RECT_LIST: return numPrims / 3;
268 case TOP_LINE_LIST_ADJ: return numPrims / 4;
269 case TOP_LISTSTRIP_ADJ: return numPrims < 3 ? 0 : numPrims - 3;
270 case TOP_TRI_LIST_ADJ: return numPrims / 6;
271 case TOP_TRI_STRIP_ADJ: return numPrims < 4 ? 0 : (numPrims / 2) - 2;
272
273 case TOP_PATCHLIST_1:
274 case TOP_PATCHLIST_2:
275 case TOP_PATCHLIST_3:
276 case TOP_PATCHLIST_4:
277 case TOP_PATCHLIST_5:
278 case TOP_PATCHLIST_6:
279 case TOP_PATCHLIST_7:
280 case TOP_PATCHLIST_8:
281 case TOP_PATCHLIST_9:
282 case TOP_PATCHLIST_10:
283 case TOP_PATCHLIST_11:
284 case TOP_PATCHLIST_12:
285 case TOP_PATCHLIST_13:
286 case TOP_PATCHLIST_14:
287 case TOP_PATCHLIST_15:
288 case TOP_PATCHLIST_16:
289 case TOP_PATCHLIST_17:
290 case TOP_PATCHLIST_18:
291 case TOP_PATCHLIST_19:
292 case TOP_PATCHLIST_20:
293 case TOP_PATCHLIST_21:
294 case TOP_PATCHLIST_22:
295 case TOP_PATCHLIST_23:
296 case TOP_PATCHLIST_24:
297 case TOP_PATCHLIST_25:
298 case TOP_PATCHLIST_26:
299 case TOP_PATCHLIST_27:
300 case TOP_PATCHLIST_28:
301 case TOP_PATCHLIST_29:
302 case TOP_PATCHLIST_30:
303 case TOP_PATCHLIST_31:
304 case TOP_PATCHLIST_32:
305 return numPrims / (mode - TOP_PATCHLIST_BASE);
306
307 case TOP_POLYGON:
308 case TOP_POINT_LIST_BF:
309 case TOP_LINE_STRIP_CONT:
310 case TOP_LINE_STRIP_BF:
311 case TOP_LINE_STRIP_CONT_BF:
312 case TOP_TRIANGLE_FAN_NOSTIPPLE:
313 case TOP_TRI_STRIP_REVERSE:
314 case TOP_PATCHLIST_BASE:
315 case TOP_UNKNOWN:
316 SWR_ASSERT(false, "Unsupported topology: %d", mode);
317 return 0;
318 }
319
320 return 0;
321 }
322
323 //////////////////////////////////////////////////////////////////////////
324 /// @brief Computes the number of verts given the number of primitives.
325 /// @param mode - primitive topology for draw operation.
326 /// @param numPrims - number of primitives for draw.
327 uint32_t GetNumVerts(
328 PRIMITIVE_TOPOLOGY mode,
329 uint32_t numPrims)
330 {
331 switch (mode)
332 {
333 case TOP_POINT_LIST: return numPrims;
334 case TOP_TRIANGLE_LIST: return numPrims * 3;
335 case TOP_TRIANGLE_STRIP: return numPrims ? numPrims + 2 : 0;
336 case TOP_TRIANGLE_FAN: return numPrims ? numPrims + 2 : 0;
337 case TOP_TRIANGLE_DISC: return numPrims ? numPrims + 1 : 0;
338 case TOP_QUAD_LIST: return numPrims * 4;
339 case TOP_QUAD_STRIP: return numPrims ? numPrims * 2 + 2 : 0;
340 case TOP_LINE_STRIP: return numPrims ? numPrims + 1 : 0;
341 case TOP_LINE_LIST: return numPrims * 2;
342 case TOP_LINE_LOOP: return numPrims;
343 case TOP_RECT_LIST: return numPrims * 3;
344 case TOP_LINE_LIST_ADJ: return numPrims * 4;
345 case TOP_LISTSTRIP_ADJ: return numPrims ? numPrims + 3 : 0;
346 case TOP_TRI_LIST_ADJ: return numPrims * 6;
347 case TOP_TRI_STRIP_ADJ: return numPrims ? (numPrims + 2) * 2 : 0;
348
349 case TOP_PATCHLIST_1:
350 case TOP_PATCHLIST_2:
351 case TOP_PATCHLIST_3:
352 case TOP_PATCHLIST_4:
353 case TOP_PATCHLIST_5:
354 case TOP_PATCHLIST_6:
355 case TOP_PATCHLIST_7:
356 case TOP_PATCHLIST_8:
357 case TOP_PATCHLIST_9:
358 case TOP_PATCHLIST_10:
359 case TOP_PATCHLIST_11:
360 case TOP_PATCHLIST_12:
361 case TOP_PATCHLIST_13:
362 case TOP_PATCHLIST_14:
363 case TOP_PATCHLIST_15:
364 case TOP_PATCHLIST_16:
365 case TOP_PATCHLIST_17:
366 case TOP_PATCHLIST_18:
367 case TOP_PATCHLIST_19:
368 case TOP_PATCHLIST_20:
369 case TOP_PATCHLIST_21:
370 case TOP_PATCHLIST_22:
371 case TOP_PATCHLIST_23:
372 case TOP_PATCHLIST_24:
373 case TOP_PATCHLIST_25:
374 case TOP_PATCHLIST_26:
375 case TOP_PATCHLIST_27:
376 case TOP_PATCHLIST_28:
377 case TOP_PATCHLIST_29:
378 case TOP_PATCHLIST_30:
379 case TOP_PATCHLIST_31:
380 case TOP_PATCHLIST_32:
381 return numPrims * (mode - TOP_PATCHLIST_BASE);
382
383 case TOP_POLYGON:
384 case TOP_POINT_LIST_BF:
385 case TOP_LINE_STRIP_CONT:
386 case TOP_LINE_STRIP_BF:
387 case TOP_LINE_STRIP_CONT_BF:
388 case TOP_TRIANGLE_FAN_NOSTIPPLE:
389 case TOP_TRI_STRIP_REVERSE:
390 case TOP_PATCHLIST_BASE:
391 case TOP_UNKNOWN:
392 SWR_ASSERT(false, "Unsupported topology: %d", mode);
393 return 0;
394 }
395
396 return 0;
397 }
398
399 //////////////////////////////////////////////////////////////////////////
400 /// @brief Return number of verts per primitive.
401 /// @param topology - topology
402 /// @param includeAdjVerts - include adjacent verts in primitive vertices
403 INLINE uint32_t NumVertsPerPrim(PRIMITIVE_TOPOLOGY topology, bool includeAdjVerts)
404 {
405 uint32_t numVerts = 0;
406 switch (topology)
407 {
408 case TOP_POINT_LIST:
409 case TOP_POINT_LIST_BF:
410 numVerts = 1;
411 break;
412 case TOP_LINE_LIST:
413 case TOP_LINE_STRIP:
414 case TOP_LINE_LIST_ADJ:
415 case TOP_LINE_LOOP:
416 case TOP_LINE_STRIP_CONT:
417 case TOP_LINE_STRIP_BF:
418 case TOP_LISTSTRIP_ADJ:
419 numVerts = 2;
420 break;
421 case TOP_TRIANGLE_LIST:
422 case TOP_TRIANGLE_STRIP:
423 case TOP_TRIANGLE_FAN:
424 case TOP_TRI_LIST_ADJ:
425 case TOP_TRI_STRIP_ADJ:
426 case TOP_TRI_STRIP_REVERSE:
427 case TOP_RECT_LIST:
428 numVerts = 3;
429 break;
430 case TOP_QUAD_LIST:
431 case TOP_QUAD_STRIP:
432 numVerts = 4;
433 break;
434 case TOP_PATCHLIST_1:
435 case TOP_PATCHLIST_2:
436 case TOP_PATCHLIST_3:
437 case TOP_PATCHLIST_4:
438 case TOP_PATCHLIST_5:
439 case TOP_PATCHLIST_6:
440 case TOP_PATCHLIST_7:
441 case TOP_PATCHLIST_8:
442 case TOP_PATCHLIST_9:
443 case TOP_PATCHLIST_10:
444 case TOP_PATCHLIST_11:
445 case TOP_PATCHLIST_12:
446 case TOP_PATCHLIST_13:
447 case TOP_PATCHLIST_14:
448 case TOP_PATCHLIST_15:
449 case TOP_PATCHLIST_16:
450 case TOP_PATCHLIST_17:
451 case TOP_PATCHLIST_18:
452 case TOP_PATCHLIST_19:
453 case TOP_PATCHLIST_20:
454 case TOP_PATCHLIST_21:
455 case TOP_PATCHLIST_22:
456 case TOP_PATCHLIST_23:
457 case TOP_PATCHLIST_24:
458 case TOP_PATCHLIST_25:
459 case TOP_PATCHLIST_26:
460 case TOP_PATCHLIST_27:
461 case TOP_PATCHLIST_28:
462 case TOP_PATCHLIST_29:
463 case TOP_PATCHLIST_30:
464 case TOP_PATCHLIST_31:
465 case TOP_PATCHLIST_32:
466 numVerts = topology - TOP_PATCHLIST_BASE;
467 break;
468 default:
469 SWR_ASSERT(false, "Unsupported topology: %d", topology);
470 break;
471 }
472
473 if (includeAdjVerts)
474 {
475 switch (topology)
476 {
477 case TOP_LISTSTRIP_ADJ:
478 case TOP_LINE_LIST_ADJ: numVerts = 4; break;
479 case TOP_TRI_STRIP_ADJ:
480 case TOP_TRI_LIST_ADJ: numVerts = 6; break;
481 default: break;
482 }
483 }
484
485 return numVerts;
486 }
487
488 //////////////////////////////////////////////////////////////////////////
489 /// @brief Generate mask from remaining work.
490 /// @param numWorkItems - Number of items being worked on by a SIMD.
491 static INLINE simdscalari GenerateMask(uint32_t numItemsRemaining)
492 {
493 uint32_t numActive = (numItemsRemaining >= KNOB_SIMD_WIDTH) ? KNOB_SIMD_WIDTH : numItemsRemaining;
494 uint32_t mask = (numActive > 0) ? ((1 << numActive) - 1) : 0;
495 return _simd_castps_si(vMask(mask));
496 }
497
498
//////////////////////////////////////////////////////////////////////////
/// @brief Gather scissor rect data based on per-prim viewport indices.
/// @param pScissorsInFixedPoint - array of scissor rects in 16.8 fixed point.
/// @param pViewportIndex - array of per-primitive viewport indexes.
/// @param scisXmin - output vector of per-primitive scissor rect Xmin data.
/// @param scisYmin - output vector of per-primitive scissor rect Ymin data.
/// @param scisXmax - output vector of per-primitive scissor rect Xmax data.
/// @param scisYmax - output vector of per-primitive scissor rect Ymax data.
//
/// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer.
// Primary template is intentionally unusable: only the explicit
// specializations below are valid; any other SIMD width trips the assert.
template<size_t SimdWidth>
struct GatherScissors
{
    static void Gather(const SWR_RECT* pScissorsInFixedPoint, const uint32_t* pViewportIndex,
        simdscalari &scisXmin, simdscalari &scisYmin,
        simdscalari &scisXmax, simdscalari &scisYmax)
    {
        SWR_ASSERT(0, "Unhandled Simd Width in Scissor Rect Gather");
    }
};
519
// 8-wide specialization: for each SIMD lane, fetch the edge of the scissor
// rect selected by that lane's viewport index. Note _simd_set_epi32 takes
// arguments highest-lane-first, so index [0] lands in lane 7's slot.
template<>
struct GatherScissors<8>
{
    static void Gather(const SWR_RECT* pScissorsInFixedPoint, const uint32_t* pViewportIndex,
        simdscalari &scisXmin, simdscalari &scisYmin,
        simdscalari &scisXmax, simdscalari &scisYmax)
    {
        scisXmin = _simd_set_epi32(pScissorsInFixedPoint[pViewportIndex[0]].xmin,
            pScissorsInFixedPoint[pViewportIndex[1]].xmin,
            pScissorsInFixedPoint[pViewportIndex[2]].xmin,
            pScissorsInFixedPoint[pViewportIndex[3]].xmin,
            pScissorsInFixedPoint[pViewportIndex[4]].xmin,
            pScissorsInFixedPoint[pViewportIndex[5]].xmin,
            pScissorsInFixedPoint[pViewportIndex[6]].xmin,
            pScissorsInFixedPoint[pViewportIndex[7]].xmin);
        scisYmin = _simd_set_epi32(pScissorsInFixedPoint[pViewportIndex[0]].ymin,
            pScissorsInFixedPoint[pViewportIndex[1]].ymin,
            pScissorsInFixedPoint[pViewportIndex[2]].ymin,
            pScissorsInFixedPoint[pViewportIndex[3]].ymin,
            pScissorsInFixedPoint[pViewportIndex[4]].ymin,
            pScissorsInFixedPoint[pViewportIndex[5]].ymin,
            pScissorsInFixedPoint[pViewportIndex[6]].ymin,
            pScissorsInFixedPoint[pViewportIndex[7]].ymin);
        scisXmax = _simd_set_epi32(pScissorsInFixedPoint[pViewportIndex[0]].xmax,
            pScissorsInFixedPoint[pViewportIndex[1]].xmax,
            pScissorsInFixedPoint[pViewportIndex[2]].xmax,
            pScissorsInFixedPoint[pViewportIndex[3]].xmax,
            pScissorsInFixedPoint[pViewportIndex[4]].xmax,
            pScissorsInFixedPoint[pViewportIndex[5]].xmax,
            pScissorsInFixedPoint[pViewportIndex[6]].xmax,
            pScissorsInFixedPoint[pViewportIndex[7]].xmax);
        scisYmax = _simd_set_epi32(pScissorsInFixedPoint[pViewportIndex[0]].ymax,
            pScissorsInFixedPoint[pViewportIndex[1]].ymax,
            pScissorsInFixedPoint[pViewportIndex[2]].ymax,
            pScissorsInFixedPoint[pViewportIndex[3]].ymax,
            pScissorsInFixedPoint[pViewportIndex[4]].ymax,
            pScissorsInFixedPoint[pViewportIndex[5]].ymax,
            pScissorsInFixedPoint[pViewportIndex[6]].ymax,
            pScissorsInFixedPoint[pViewportIndex[7]].ymax);
    }
};
561
//////////////////////////////////////////////////////////////////////////
/// @brief StreamOut - Streams vertex data out to SO buffers.
///        Generally, we are only streaming out a SIMDs worth of triangles.
/// @param pDC - pointer to draw context.
/// @param pa - primitive assembly object supplying the prims to stream out.
/// @param workerId - thread's worker id. Even thread has a unique id.
/// @param pPrimData - scratch buffer that assembled attributes are staged in
///        before the streamout jit function consumes them.
/// @param streamIndex - which SO stream to process.
static void StreamOut(
    DRAW_CONTEXT* pDC,
    PA_STATE& pa,
    uint32_t workerId,
    uint32_t* pPrimData,
    uint32_t streamIndex)
{
    SWR_CONTEXT *pContext = pDC->pContext;

    AR_BEGIN(FEStreamout, pDC->drawId);

    const API_STATE& state = GetApiState(pDC);
    const SWR_STREAMOUT_STATE &soState = state.soState;

    uint32_t soVertsPerPrim = NumVertsPerPrim(pa.binTopology, false);

    // The pPrimData buffer is sparse in that we allocate memory for all 32 attributes for each vertex.
    uint32_t primDataDwordVertexStride = (KNOB_NUM_ATTRIBUTES * sizeof(float) * 4) / sizeof(uint32_t);

    SWR_STREAMOUT_CONTEXT soContext = { 0 };

    // Setup buffer state pointers.
    for (uint32_t i = 0; i < 4; ++i)
    {
        soContext.pBuffer[i] = &state.soBuffer[i];
    }

    uint32_t numPrims = pa.NumPrims();
    for (uint32_t primIndex = 0; primIndex < numPrims; ++primIndex)
    {
        DWORD slot = 0;
        uint32_t soMask = soState.streamMasks[streamIndex];

        // Write all entries into primitive data buffer for SOS.
        // soMask has one bit per enabled SO attribute slot for this stream.
        while (_BitScanForward(&slot, soMask))
        {
            __m128 attrib[MAX_NUM_VERTS_PER_PRIM]; // prim attribs (always 4 wide)
            uint32_t paSlot = slot + VERTEX_ATTRIB_START_SLOT;
            pa.AssembleSingle(paSlot, primIndex, attrib);

            // Attribute offset is the dword offset of this attrib from the
            // start of a vertex within pPrimData.
            // NOTE(review): an earlier comment here claimed a (slot - 1)
            // adjustment, but the code offsets by 'slot' directly — the stale
            // text was removed. Confirm against the SOS jit's expected layout.
            uint32_t primDataAttribOffset = slot * sizeof(float) * 4 / sizeof(uint32_t);

            // Store each vertex's attrib at appropriate locations in pPrimData buffer.
            for (uint32_t v = 0; v < soVertsPerPrim; ++v)
            {
                uint32_t* pPrimDataAttrib = pPrimData + primDataAttribOffset + (v * primDataDwordVertexStride);

                _mm_store_ps((float*)pPrimDataAttrib, attrib[v]);
            }
            soMask &= ~(1 << slot);
        }

        // Update pPrimData pointer
        soContext.pPrimData = pPrimData;

        // Call SOS
        SWR_ASSERT(state.pfnSoFunc[streamIndex] != nullptr, "Trying to execute uninitialized streamout jit function.");
        state.pfnSoFunc[streamIndex](soContext);
    }

    // Update SO write offset. The driver provides memory for the update.
    for (uint32_t i = 0; i < 4; ++i)
    {
        if (state.soBuffer[i].pWriteOffset)
        {
            *state.soBuffer[i].pWriteOffset = soContext.pBuffer[i]->streamOffset * sizeof(uint32_t);
        }

        if (state.soBuffer[i].soWriteEnable)
        {
            pDC->dynState.SoWriteOffset[i] = soContext.pBuffer[i]->streamOffset * sizeof(uint32_t);
            pDC->dynState.SoWriteOffsetDirty[i] = true;
        }
    }

    UPDATE_STAT_FE(SoPrimStorageNeeded[streamIndex], soContext.numPrimStorageNeeded);
    UPDATE_STAT_FE(SoNumPrimsWritten[streamIndex], soContext.numPrimsWritten);

    AR_END(FEStreamout, 1);
}
652
653 //////////////////////////////////////////////////////////////////////////
654 /// @brief Computes number of invocations. The current index represents
655 /// the start of the SIMD. The max index represents how much work
656 /// items are remaining. If there is less then a SIMD's xmin of work
657 /// then return the remaining amount of work.
658 /// @param curIndex - The start index for the SIMD.
659 /// @param maxIndex - The last index for all work items.
660 static INLINE uint32_t GetNumInvocations(
661 uint32_t curIndex,
662 uint32_t maxIndex)
663 {
664 uint32_t remainder = (maxIndex - curIndex);
665 return (remainder >= KNOB_SIMD_WIDTH) ? KNOB_SIMD_WIDTH : remainder;
666 }
667
668 //////////////////////////////////////////////////////////////////////////
669 /// @brief Converts a streamId buffer to a cut buffer for the given stream id.
670 /// The geometry shader will loop over each active streamout buffer, assembling
671 /// primitives for the downstream stages. When multistream output is enabled,
672 /// the generated stream ID buffer from the GS needs to be converted to a cut
673 /// buffer for the primitive assembler.
674 /// @param stream - stream id to generate the cut buffer for
675 /// @param pStreamIdBase - pointer to the stream ID buffer
676 /// @param numEmittedVerts - Number of total verts emitted by the GS
677 /// @param pCutBuffer - output buffer to write cuts to
678 void ProcessStreamIdBuffer(uint32_t stream, uint8_t* pStreamIdBase, uint32_t numEmittedVerts, uint8_t *pCutBuffer)
679 {
680 SWR_ASSERT(stream < MAX_SO_STREAMS);
681
682 uint32_t numInputBytes = (numEmittedVerts * 2 + 7) / 8;
683 uint32_t numOutputBytes = std::max(numInputBytes / 2, 1U);
684
685 for (uint32_t b = 0; b < numOutputBytes; ++b)
686 {
687 uint8_t curInputByte = pStreamIdBase[2*b];
688 uint8_t outByte = 0;
689 for (uint32_t i = 0; i < 4; ++i)
690 {
691 if ((curInputByte & 0x3) != stream)
692 {
693 outByte |= (1 << i);
694 }
695 curInputByte >>= 2;
696 }
697
698 curInputByte = pStreamIdBase[2 * b + 1];
699 for (uint32_t i = 0; i < 4; ++i)
700 {
701 if ((curInputByte & 0x3) != stream)
702 {
703 outByte |= (1 << (i + 4));
704 }
705 curInputByte >>= 2;
706 }
707
708 *pCutBuffer++ = outByte;
709 }
710 }
711
// Per-worker-thread geometry shader context (THREAD is presumably the
// project's thread-local-storage qualifier — confirm in its definition).
THREAD SWR_GS_CONTEXT tlsGsContext;
713
//////////////////////////////////////////////////////////////////////////
/// @brief Implements GS stage.
/// @param pDC - pointer to draw context.
/// @param workerId - thread's worker id. Even thread has a unique id.
/// @param pa - The primitive assembly object.
/// @param pGsOut - output stream for GS
/// @param pCutBuffer - cut (or stream-id) buffer written by the GS.
/// @param pStreamCutBuffer - scratch cut buffer used when translating a
///        multi-stream stream-id buffer for one stream.
/// @param pSoPrimData - scratch buffer for streamout attribute staging.
/// @param primID - SIMD vector of input primitive IDs.
/// HasStreamOutT / HasRastT are compile-time flags enabling the streamout
/// and rasterization paths respectively.
template <
    typename HasStreamOutT,
    typename HasRastT>
static void GeometryShaderStage(
    DRAW_CONTEXT *pDC,
    uint32_t workerId,
    PA_STATE& pa,
    void* pGsOut,
    void* pCutBuffer,
    void* pStreamCutBuffer,
    uint32_t* pSoPrimData,
    simdscalari primID)
{
    SWR_CONTEXT *pContext = pDC->pContext;

    AR_BEGIN(FEGeometryShader, pDC->drawId);

    const API_STATE& state = GetApiState(pDC);
    const SWR_GS_STATE* pState = &state.gsState;

    SWR_ASSERT(pGsOut != nullptr, "GS output buffer should be initialized");
    SWR_ASSERT(pCutBuffer != nullptr, "GS output cut buffer should be initialized");

    tlsGsContext.pStream = (uint8_t*)pGsOut;
    tlsGsContext.pCutOrStreamIdBuffer = (uint8_t*)pCutBuffer;
    tlsGsContext.PrimitiveID = primID;

    uint32_t numVertsPerPrim = NumVertsPerPrim(pa.binTopology, true);
    simdvector attrib[MAX_ATTRIBUTES];

    // assemble all attributes for the input primitive
    for (uint32_t slot = 0; slot < pState->numInputAttribs; ++slot)
    {
        uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + slot;
        pa.Assemble(attribSlot, attrib);

        for (uint32_t i = 0; i < numVertsPerPrim; ++i)
        {
            tlsGsContext.vert[i].attrib[attribSlot] = attrib[i];
        }
    }

    // assemble position
    pa.Assemble(VERTEX_POSITION_SLOT, attrib);
    for (uint32_t i = 0; i < numVertsPerPrim; ++i)
    {
        tlsGsContext.vert[i].attrib[VERTEX_POSITION_SLOT] = attrib[i];
    }

    // Per-prim/per-instance strides into the GS output and cut buffers.
    const uint32_t vertexStride = sizeof(simdvertex);
    const uint32_t numSimdBatches = (state.gsState.maxNumVerts + KNOB_SIMD_WIDTH - 1) / KNOB_SIMD_WIDTH;
    const uint32_t inputPrimStride = numSimdBatches * vertexStride;
    const uint32_t instanceStride = inputPrimStride * KNOB_SIMD_WIDTH;
    uint32_t cutPrimStride;
    uint32_t cutInstanceStride;

    if (pState->isSingleStream)
    {
        // single stream: 1 cut bit per emitted vert
        cutPrimStride = (state.gsState.maxNumVerts + 7) / 8;
        cutInstanceStride = cutPrimStride * KNOB_SIMD_WIDTH;
    }
    else
    {
        // multi-stream: 2 stream-id bits per emitted vert, dword aligned
        cutPrimStride = AlignUp(state.gsState.maxNumVerts * 2 / 8, 4);
        cutInstanceStride = cutPrimStride * KNOB_SIMD_WIDTH;
    }

    // record valid prims from the frontend to avoid over binning the newly generated
    // prims from the GS
    uint32_t numInputPrims = pa.NumPrims();

    // Run the GS once per instance; each invocation appends to the next
    // instance-sized region of the output/cut buffers.
    for (uint32_t instance = 0; instance < pState->instanceCount; ++instance)
    {
        tlsGsContext.InstanceID = instance;
        tlsGsContext.mask = GenerateMask(numInputPrims);

        // execute the geometry shader
        state.pfnGsFunc(GetPrivateState(pDC), &tlsGsContext);

        tlsGsContext.pStream += instanceStride;
        tlsGsContext.pCutOrStreamIdBuffer += cutInstanceStride;
    }

    // set up new binner and state for the GS output topology
    PFN_PROCESS_PRIMS pfnClipFunc = nullptr;
    if (HasRastT::value)
    {
        switch (pState->outputTopology)
        {
        case TOP_TRIANGLE_STRIP: pfnClipFunc = ClipTriangles; break;
        case TOP_LINE_STRIP: pfnClipFunc = ClipLines; break;
        case TOP_POINT_LIST: pfnClipFunc = ClipPoints; break;
        default: SWR_ASSERT(false, "Unexpected GS output topology: %d", pState->outputTopology);
        }
    }

    // foreach input prim:
    // - setup a new PA based on the emitted verts for that prim
    // - loop over the new verts, calling PA to assemble each prim
    // Reinterpret the SIMD registers as per-lane uint32 arrays.
    uint32_t* pVertexCount = (uint32_t*)&tlsGsContext.vertexCount;
    uint32_t* pPrimitiveId = (uint32_t*)&primID;

    uint32_t totalPrimsGenerated = 0;
    for (uint32_t inputPrim = 0; inputPrim < numInputPrims; ++inputPrim)
    {
        uint8_t* pInstanceBase = (uint8_t*)pGsOut + inputPrim * inputPrimStride;
        uint8_t* pCutBufferBase = (uint8_t*)pCutBuffer + inputPrim * cutPrimStride;
        for (uint32_t instance = 0; instance < pState->instanceCount; ++instance)
        {
            uint32_t numEmittedVerts = pVertexCount[inputPrim];
            if (numEmittedVerts == 0)
            {
                continue;
            }

            uint8_t* pBase = pInstanceBase + instance * instanceStride;
            uint8_t* pCutBase = pCutBufferBase + instance * cutInstanceStride;

            uint32_t numAttribs = state.feNumAttributes;

            for (uint32_t stream = 0; stream < MAX_SO_STREAMS; ++stream)
            {
                bool processCutVerts = false;

                // NOTE: this local intentionally shadows the function
                // parameter 'pCutBuffer' for the remainder of the loop body.
                uint8_t* pCutBuffer = pCutBase;

                // assign default stream ID, only relevant when GS is outputting a single stream
                uint32_t streamID = 0;
                if (pState->isSingleStream)
                {
                    processCutVerts = true;
                    streamID = pState->singleStreamID;
                    if (streamID != stream) continue;
                }
                else
                {
                    // early exit if this stream is not enabled for streamout
                    if (HasStreamOutT::value && !state.soState.streamEnable[stream])
                    {
                        continue;
                    }

                    // multi-stream output, need to translate StreamID buffer to a cut buffer
                    ProcessStreamIdBuffer(stream, pCutBase, numEmittedVerts, (uint8_t*)pStreamCutBuffer);
                    pCutBuffer = (uint8_t*)pStreamCutBuffer;
                    processCutVerts = false;
                }

                PA_STATE_CUT gsPa(pDC, pBase, numEmittedVerts, pCutBuffer, numEmittedVerts, numAttribs, pState->outputTopology, processCutVerts);

                while (gsPa.GetNextStreamOutput())
                {
                    do
                    {
                        bool assemble = gsPa.Assemble(VERTEX_POSITION_SLOT, attrib);

                        if (assemble)
                        {
                            totalPrimsGenerated += gsPa.NumPrims();

                            if (HasStreamOutT::value)
                            {
                                StreamOut(pDC, gsPa, workerId, pSoPrimData, stream);
                            }

                            if (HasRastT::value && state.soState.streamToRasterizer == stream)
                            {
                                simdscalari vPrimId;
                                // pull primitiveID from the GS output if available
                                if (state.gsState.emitsPrimitiveID)
                                {
                                    simdvector primIdAttrib[3];
                                    gsPa.Assemble(VERTEX_PRIMID_SLOT, primIdAttrib);
                                    vPrimId = _simd_castps_si(primIdAttrib[0].x);
                                }
                                else
                                {
                                    // otherwise, replicate the input prim's id
                                    vPrimId = _simd_set1_epi32(pPrimitiveId[inputPrim]);
                                }

                                // use viewport array index if GS declares it as an output attribute. Otherwise use index 0.
                                simdscalari vViewPortIdx;
                                if (state.gsState.emitsViewportArrayIndex)
                                {
                                    simdvector vpiAttrib[3];
                                    gsPa.Assemble(VERTEX_VIEWPORT_ARRAY_INDEX_SLOT, vpiAttrib);

                                    // OOB indices => forced to zero.
                                    simdscalari vNumViewports = _simd_set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
                                    simdscalari vClearMask = _simd_cmplt_epi32(_simd_castps_si(vpiAttrib[0].x), vNumViewports);
                                    vpiAttrib[0].x = _simd_and_ps(_simd_castsi_ps(vClearMask), vpiAttrib[0].x);

                                    vViewPortIdx = _simd_castps_si(vpiAttrib[0].x);
                                }
                                else
                                {
                                    vViewPortIdx = _simd_set1_epi32(0);
                                }

                                pfnClipFunc(pDC, gsPa, workerId, attrib, GenMask(gsPa.NumPrims()), vPrimId, vViewPortIdx);
                            }
                        }
                    } while (gsPa.NextPrim());
                }
            }
        }
    }

    // update GS pipeline stats
    UPDATE_STAT_FE(GsInvocations, numInputPrims * pState->instanceCount);
    UPDATE_STAT_FE(GsPrimitives, totalPrimsGenerated);

    AR_END(FEGeometryShader, 1);
}
934
935 //////////////////////////////////////////////////////////////////////////
936 /// @brief Allocate GS buffers
937 /// @param pDC - pointer to draw context.
938 /// @param state - API state
939 /// @param ppGsOut - pointer to GS output buffer allocation
940 /// @param ppCutBuffer - pointer to GS output cut buffer allocation
941 static INLINE void AllocateGsBuffers(DRAW_CONTEXT* pDC, const API_STATE& state, void** ppGsOut, void** ppCutBuffer,
942 void **ppStreamCutBuffer)
943 {
944 auto pArena = pDC->pArena;
945 SWR_ASSERT(pArena != nullptr);
946 SWR_ASSERT(state.gsState.gsEnable);
947 // allocate arena space to hold GS output verts
948 // @todo pack attribs
949 // @todo support multiple streams
950 const uint32_t vertexStride = sizeof(simdvertex);
951 const uint32_t numSimdBatches = (state.gsState.maxNumVerts + KNOB_SIMD_WIDTH - 1) / KNOB_SIMD_WIDTH;
952 uint32_t size = state.gsState.instanceCount * numSimdBatches * vertexStride * KNOB_SIMD_WIDTH;
953 *ppGsOut = pArena->AllocAligned(size, KNOB_SIMD_WIDTH * sizeof(float));
954
955 const uint32_t cutPrimStride = (state.gsState.maxNumVerts + 7) / 8;
956 const uint32_t streamIdPrimStride = AlignUp(state.gsState.maxNumVerts * 2 / 8, 4);
957 const uint32_t cutBufferSize = cutPrimStride * state.gsState.instanceCount * KNOB_SIMD_WIDTH;
958 const uint32_t streamIdSize = streamIdPrimStride * state.gsState.instanceCount * KNOB_SIMD_WIDTH;
959
960 // allocate arena space to hold cut or streamid buffer, which is essentially a bitfield sized to the
961 // maximum vertex output as defined by the GS state, per SIMD lane, per GS instance
962
963 // allocate space for temporary per-stream cut buffer if multi-stream is enabled
964 if (state.gsState.isSingleStream)
965 {
966 *ppCutBuffer = pArena->AllocAligned(cutBufferSize, KNOB_SIMD_WIDTH * sizeof(float));
967 *ppStreamCutBuffer = nullptr;
968 }
969 else
970 {
971 *ppCutBuffer = pArena->AllocAligned(streamIdSize, KNOB_SIMD_WIDTH * sizeof(float));
972 *ppStreamCutBuffer = pArena->AllocAligned(cutBufferSize, KNOB_SIMD_WIDTH * sizeof(float));
973 }
974
975 }
976
977 //////////////////////////////////////////////////////////////////////////
978 /// @brief Contains all data generated by the HS and passed to the
979 /// tessellator and DS.
struct TessellationThreadLocalData
{
    SWR_HS_CONTEXT hsContext;               // HS input/output context, reused across patches
    ScalarPatch patchData[KNOB_SIMD_WIDTH]; // HS control-point output, one patch per SIMD lane
    void* pTxCtx;                           // tessellator context storage (lazily allocated)
    size_t tsCtxSize;                       // size of the pTxCtx allocation, in bytes

    simdscalar* pDSOutput;                  // DS output buffer, grown on demand
    size_t numDSOutputVectors;              // current capacity of pDSOutput, in simd vectors
};
990
// Per-thread tessellation scratch storage; allocated lazily by AllocateTessellationData.
THREAD TessellationThreadLocalData* gt_pTessellationThreadData = nullptr;
992
993 //////////////////////////////////////////////////////////////////////////
994 /// @brief Allocate tessellation data for this worker thread.
995 INLINE
996 static void AllocateTessellationData(SWR_CONTEXT* pContext)
997 {
998 /// @TODO - Don't use thread local storage. Use Worker local storage instead.
999 if (gt_pTessellationThreadData == nullptr)
1000 {
1001 gt_pTessellationThreadData = (TessellationThreadLocalData*)
1002 AlignedMalloc(sizeof(TessellationThreadLocalData), 64);
1003 memset(gt_pTessellationThreadData, 0, sizeof(*gt_pTessellationThreadData));
1004 }
1005 }
1006
1007 //////////////////////////////////////////////////////////////////////////
1008 /// @brief Implements Tessellation Stages.
1009 /// @param pDC - pointer to draw context.
1010 /// @param workerId - thread's worker id. Even thread has a unique id.
1011 /// @param pa - The primitive assembly object.
1012 /// @param pGsOut - output stream for GS
template <
    typename HasGeometryShaderT,
    typename HasStreamOutT,
    typename HasRastT>
static void TessellationStages(
    DRAW_CONTEXT *pDC,
    uint32_t workerId,
    PA_STATE& pa,
    void* pGsOut,
    void* pCutBuffer,
    void* pCutStreamBuffer,
    uint32_t* pSoPrimData,
    simdscalari primID)
{
    SWR_CONTEXT *pContext = pDC->pContext;
    const API_STATE& state = GetApiState(pDC);
    const SWR_TS_STATE& tsState = state.tsState;

    SWR_ASSERT(gt_pTessellationThreadData);

    // Initialize the tessellator context from thread-local storage.
    HANDLE tsCtx = TSInitCtx(
        tsState.domain,
        tsState.partitioning,
        tsState.tsOutputTopology,
        gt_pTessellationThreadData->pTxCtx,
        gt_pTessellationThreadData->tsCtxSize);
    if (tsCtx == nullptr)
    {
        // Init failed; presumably TSInitCtx reported the required storage size
        // through tsCtxSize (TODO confirm) — allocate it and retry once.
        gt_pTessellationThreadData->pTxCtx = AlignedMalloc(gt_pTessellationThreadData->tsCtxSize, 64);
        tsCtx = TSInitCtx(
            tsState.domain,
            tsState.partitioning,
            tsState.tsOutputTopology,
            gt_pTessellationThreadData->pTxCtx,
            gt_pTessellationThreadData->tsCtxSize);
    }
    SWR_ASSERT(tsCtx);

    // Select the clipper/binner matching the topology the DS produces.
    PFN_PROCESS_PRIMS pfnClipFunc = nullptr;
    if (HasRastT::value)
    {
        switch (tsState.postDSTopology)
        {
        case TOP_TRIANGLE_LIST: pfnClipFunc = ClipTriangles; break;
        case TOP_LINE_LIST:     pfnClipFunc = ClipLines; break;
        case TOP_POINT_LIST:    pfnClipFunc = ClipPoints; break;
        default: SWR_ASSERT(false, "Unexpected DS output topology: %d", tsState.postDSTopology);
        }
    }

    SWR_HS_CONTEXT& hsContext = gt_pTessellationThreadData->hsContext;
    hsContext.pCPout = gt_pTessellationThreadData->patchData;
    hsContext.PrimitiveID = primID;

    uint32_t numVertsPerPrim = NumVertsPerPrim(pa.binTopology, false);
    // Max storage for one attribute for an entire simdprimitive
    simdvector simdattrib[MAX_NUM_VERTS_PER_PRIM];

    // assemble all attributes for the input primitives
    for (uint32_t slot = 0; slot < tsState.numHsInputAttribs; ++slot)
    {
        uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + slot;
        pa.Assemble(attribSlot, simdattrib);

        for (uint32_t i = 0; i < numVertsPerPrim; ++i)
        {
            hsContext.vert[i].attrib[attribSlot] = simdattrib[i];
        }
    }

#if defined(_DEBUG)
    // debug-only: fill HS output with a recognizable pattern so stale reads stand out
    memset(hsContext.pCPout, 0x90, sizeof(ScalarPatch) * KNOB_SIMD_WIDTH);
#endif

    uint32_t numPrims = pa.NumPrims();
    hsContext.mask = GenerateMask(numPrims);

    // Run the HS
    AR_BEGIN(FEHullShader, pDC->drawId);
    state.pfnHsFunc(GetPrivateState(pDC), &hsContext);
    AR_END(FEHullShader, 0);

    UPDATE_STAT_FE(HsInvocations, numPrims);

    const uint32_t* pPrimId = (const uint32_t*)&primID;

    // Tessellate and domain-shade each patch (one per SIMD lane) individually.
    for (uint32_t p = 0; p < numPrims; ++p)
    {
        // Run Tessellator
        SWR_TS_TESSELLATED_DATA tsData = { 0 };
        AR_BEGIN(FETessellation, pDC->drawId);
        TSTessellate(tsCtx, hsContext.pCPout[p].tessFactors, tsData);
        AR_END(FETessellation, 0);

        if (tsData.NumPrimitives == 0)
        {
            // tessellator produced nothing for this patch
            continue;
        }
        SWR_ASSERT(tsData.NumDomainPoints);

        // Allocate DS Output memory, growing the thread-local buffer on demand.
        uint32_t requiredDSVectorInvocations = AlignUp(tsData.NumDomainPoints, KNOB_SIMD_WIDTH) / KNOB_SIMD_WIDTH;
        size_t requiredDSOutputVectors = requiredDSVectorInvocations * tsState.numDsOutputAttribs;
        size_t requiredAllocSize = sizeof(simdvector) * requiredDSOutputVectors;
        if (requiredDSOutputVectors > gt_pTessellationThreadData->numDSOutputVectors)
        {
            AlignedFree(gt_pTessellationThreadData->pDSOutput);
            gt_pTessellationThreadData->pDSOutput = (simdscalar*)AlignedMalloc(requiredAllocSize, 64);
            gt_pTessellationThreadData->numDSOutputVectors = requiredDSOutputVectors;
        }
        SWR_ASSERT(gt_pTessellationThreadData->pDSOutput);
        SWR_ASSERT(gt_pTessellationThreadData->numDSOutputVectors >= requiredDSOutputVectors);

#if defined(_DEBUG)
        // debug-only: fill DS output with a recognizable pattern so stale reads stand out
        memset(gt_pTessellationThreadData->pDSOutput, 0x90, requiredAllocSize);
#endif

        // Run Domain Shader
        SWR_DS_CONTEXT dsContext;
        dsContext.PrimitiveID = pPrimId[p];
        dsContext.pCpIn = &hsContext.pCPout[p];
        dsContext.pDomainU = (simdscalar*)tsData.pDomainPointsU;
        dsContext.pDomainV = (simdscalar*)tsData.pDomainPointsV;
        dsContext.pOutputData = gt_pTessellationThreadData->pDSOutput;
        dsContext.vectorStride = requiredDSVectorInvocations;

        uint32_t dsInvocations = 0;

        // Invoke the DS one SIMD-width of domain points at a time; the last
        // iteration's mask covers only the remaining points.
        for (dsContext.vectorOffset = 0; dsContext.vectorOffset < requiredDSVectorInvocations; ++dsContext.vectorOffset)
        {
            dsContext.mask = GenerateMask(tsData.NumDomainPoints - dsInvocations);

            AR_BEGIN(FEDomainShader, pDC->drawId);
            state.pfnDsFunc(GetPrivateState(pDC), &dsContext);
            AR_END(FEDomainShader, 0);

            dsInvocations += KNOB_SIMD_WIDTH;
        }
        UPDATE_STAT_FE(DsInvocations, tsData.NumDomainPoints);

        // Assemble tessellated primitives from the DS output and feed them
        // downstream (GS, streamout, and/or the clipper).
        PA_TESS tessPa(
            pDC,
            dsContext.pOutputData,
            dsContext.vectorStride,
            tsState.numDsOutputAttribs,
            tsData.ppIndices,
            tsData.NumPrimitives,
            tsState.postDSTopology);

        while (tessPa.HasWork())
        {
            if (HasGeometryShaderT::value)
            {
                // tessellated prims feed the GS stage, which handles SO/rast itself
                GeometryShaderStage<HasStreamOutT, HasRastT>(
                    pDC, workerId, tessPa, pGsOut, pCutBuffer, pCutStreamBuffer, pSoPrimData,
                    _simd_set1_epi32(dsContext.PrimitiveID));
            }
            else
            {
                if (HasStreamOutT::value)
                {
                    StreamOut(pDC, tessPa, workerId, pSoPrimData, 0);
                }

                if (HasRastT::value)
                {
                    simdvector prim[3]; // Only deal with triangles, lines, or points
                    AR_BEGIN(FEPAAssemble, pDC->drawId);
#if SWR_ENABLE_ASSERTS
                    bool assemble =
#endif
                        tessPa.Assemble(VERTEX_POSITION_SLOT, prim);
                    AR_END(FEPAAssemble, 1);
                    SWR_ASSERT(assemble);

                    SWR_ASSERT(pfnClipFunc);
                    pfnClipFunc(pDC, tessPa, workerId, prim,
                        GenMask(tessPa.NumPrims()), _simd_set1_epi32(dsContext.PrimitiveID), _simd_set1_epi32(0));
                }
            }

            tessPa.NextPrim();

        } // while (tessPa.HasWork())
    } // for (uint32_t p = 0; p < numPrims; ++p)

    TSDestroyCtx(tsCtx);
}
1201
1202 //////////////////////////////////////////////////////////////////////////
1203 /// @brief FE handler for SwrDraw.
1204 /// @tparam IsIndexedT - Is indexed drawing enabled
1205 /// @tparam HasTessellationT - Is tessellation enabled
1206 /// @tparam HasGeometryShaderT::value - Is the geometry shader stage enabled
1207 /// @tparam HasStreamOutT - Is stream-out enabled
1208 /// @tparam HasRastT - Is rasterization enabled
1209 /// @param pContext - pointer to SWR context.
1210 /// @param pDC - pointer to draw context.
1211 /// @param workerId - thread's worker id.
1212 /// @param pUserData - Pointer to DRAW_WORK
template <
    typename IsIndexedT,
    typename IsCutIndexEnabledT,
    typename HasTessellationT,
    typename HasGeometryShaderT,
    typename HasStreamOutT,
    typename HasRastT>
void ProcessDraw(
    SWR_CONTEXT *pContext,
    DRAW_CONTEXT *pDC,
    uint32_t workerId,
    void *pUserData)
{

#if KNOB_ENABLE_TOSS_POINTS
    if (KNOB_TOSS_QUEUE_FE)
    {
        return;
    }
#endif

    AR_BEGIN(FEProcessDraw, pDC->drawId);

    DRAW_WORK& work = *(DRAW_WORK*)pUserData;
    const API_STATE& state = GetApiState(pDC);
    // lane offsets 0..7, used to synthesize vertex indices for non-indexed draws
    __m256i vScale = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
    SWR_VS_CONTEXT vsContext;
    simdvertex vin;

    int indexSize = 0;
    uint32_t endVertex = work.numVerts;

    // For indexed draws, compute the address one past the last index we will
    // consume, so partial-SIMD fetches can be masked against it below.
    const int32_t* pLastRequestedIndex = nullptr;
    if (IsIndexedT::value)
    {
        switch (work.type)
        {
        case R32_UINT:
            indexSize = sizeof(uint32_t);
            pLastRequestedIndex = &(work.pIB[endVertex]);
            break;
        case R16_UINT:
            indexSize = sizeof(uint16_t);
            // nasty address offset to last index
            pLastRequestedIndex = (int32_t*)(&(((uint16_t*)work.pIB)[endVertex]));
            break;
        case R8_UINT:
            indexSize = sizeof(uint8_t);
            // nasty address offset to last index
            pLastRequestedIndex = (int32_t*)(&(((uint8_t*)work.pIB)[endVertex]));
            break;
        default:
            SWR_ASSERT(0);
        }
    }
    else
    {
        // No cuts, prune partial primitives.
        endVertex = GetNumVerts(state.topology, GetNumPrims(state.topology, work.numVerts));
    }

    SWR_FETCH_CONTEXT fetchInfo = { 0 };
    fetchInfo.pStreams = &state.vertexBuffers[0];
    fetchInfo.StartInstance = work.startInstance;
    fetchInfo.StartVertex = 0;

    vsContext.pVin = &vin;

    if (IsIndexedT::value)
    {
        fetchInfo.BaseVertex = work.baseVertex;

        // if the entire index buffer isn't being consumed, set the last index
        // so that fetches < a SIMD wide will be masked off
        fetchInfo.pLastIndex = (const int32_t*)(((uint8_t*)state.indexBuffer.pIndices) + state.indexBuffer.size);
        if (pLastRequestedIndex < fetchInfo.pLastIndex)
        {
            fetchInfo.pLastIndex = pLastRequestedIndex;
        }
    }
    else
    {
        fetchInfo.StartVertex = work.startVertex;
    }

#ifdef KNOB_ENABLE_RDTSC
    uint32_t numPrims = GetNumPrims(state.topology, work.numVerts);
#endif

    // GS scratch buffers (arena-backed, only when the GS stage is active)
    void* pGsOut = nullptr;
    void* pCutBuffer = nullptr;
    void* pStreamCutBuffer = nullptr;
    if (HasGeometryShaderT::value)
    {
        AllocateGsBuffers(pDC, state, &pGsOut, &pCutBuffer, &pStreamCutBuffer);
    }

    if (HasTessellationT::value)
    {
        SWR_ASSERT(state.tsState.tsEnable == true);
        SWR_ASSERT(state.pfnHsFunc != nullptr);
        SWR_ASSERT(state.pfnDsFunc != nullptr);

        AllocateTessellationData(pContext);
    }
    else
    {
        SWR_ASSERT(state.tsState.tsEnable == false);
        SWR_ASSERT(state.pfnHsFunc == nullptr);
        SWR_ASSERT(state.pfnDsFunc == nullptr);
    }

    // allocate space for streamout input prim data
    uint32_t* pSoPrimData = nullptr;
    if (HasStreamOutT::value)
    {
        pSoPrimData = (uint32_t*)pDC->pArena->AllocAligned(4096, 16);
    }

    // choose primitive assembler
    PA_FACTORY<IsIndexedT, IsCutIndexEnabledT> paFactory(pDC, state.topology, work.numVerts);
    PA_STATE& pa = paFactory.GetPA();

    /// @todo: temporarily move instance loop in the FE to ensure SO ordering
    for (uint32_t instanceNum = 0; instanceNum < work.numInstances; instanceNum++)
    {
        simdscalari vIndex;
        uint32_t i = 0;     // vertex cursor into this instance's vertex stream

        if (IsIndexedT::value)
        {
            fetchInfo.pIndices = work.pIB;
        }
        else
        {
            // non-indexed: synthesize sequential indices starting at startVertexID
            vIndex = _simd_add_epi32(_simd_set1_epi32(work.startVertexID), vScale);
            fetchInfo.pIndices = (const int32_t*)&vIndex;
        }

        fetchInfo.CurInstance = instanceNum;
        vsContext.InstanceID = instanceNum;

        while (pa.HasWork())
        {
            // PaGetNextVsOutput currently has the side effect of updating some PA state machine state.
            // So we need to keep this outside of (i < endVertex) check.
            simdmask* pvCutIndices = nullptr;
            if (IsIndexedT::value)
            {
                pvCutIndices = &pa.GetNextVsIndices();
            }

            simdvertex& vout = pa.GetNextVsOutput();
            vsContext.pVout = &vout;

            if (i < endVertex)
            {

                // 1. Execute FS/VS for a single SIMD.
                AR_BEGIN(FEFetchShader, pDC->drawId);
                state.pfnFetchFunc(fetchInfo, vin);
                AR_END(FEFetchShader, 0);

                // forward fetch generated vertex IDs to the vertex shader
                vsContext.VertexID = fetchInfo.VertexID;

                // Setup active mask for vertex shader.
                vsContext.mask = GenerateMask(endVertex - i);

                // forward cut mask to the PA
                if (IsIndexedT::value)
                {
                    *pvCutIndices = _simd_movemask_ps(_simd_castsi_ps(fetchInfo.CutMask));
                }

                UPDATE_STAT_FE(IaVertices, GetNumInvocations(i, endVertex));

#if KNOB_ENABLE_TOSS_POINTS
                if (!KNOB_TOSS_FETCH)
#endif
                {
                    AR_BEGIN(FEVertexShader, pDC->drawId);
                    state.pfnVertexFunc(GetPrivateState(pDC), &vsContext);
                    AR_END(FEVertexShader, 0);

                    UPDATE_STAT_FE(VsInvocations, GetNumInvocations(i, endVertex));
                }
            }

            // 2. Assemble primitives given the last two SIMD.
            do
            {
                simdvector prim[MAX_NUM_VERTS_PER_PRIM];
                // PaAssemble returns false if there is not enough verts to assemble.
                AR_BEGIN(FEPAAssemble, pDC->drawId);
                bool assemble = pa.Assemble(VERTEX_POSITION_SLOT, prim);
                AR_END(FEPAAssemble, 1);

#if KNOB_ENABLE_TOSS_POINTS
                if (!KNOB_TOSS_FETCH)
#endif
                {
#if KNOB_ENABLE_TOSS_POINTS
                    if (!KNOB_TOSS_VS)
#endif
                    {
                        if (assemble)
                        {
                            UPDATE_STAT_FE(IaPrimitives, pa.NumPrims());

                            // Dispatch to the first active downstream stage:
                            // tessellation, else GS, else streamout/rasterizer directly.
                            if (HasTessellationT::value)
                            {
                                TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>(
                                    pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, pa.GetPrimID(work.startPrimID));
                            }
                            else if (HasGeometryShaderT::value)
                            {
                                GeometryShaderStage<HasStreamOutT, HasRastT>(
                                    pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, pa.GetPrimID(work.startPrimID));
                            }
                            else
                            {
                                // If streamout is enabled then stream vertices out to memory.
                                if (HasStreamOutT::value)
                                {
                                    StreamOut(pDC, pa, workerId, pSoPrimData, 0);
                                }

                                if (HasRastT::value)
                                {
                                    SWR_ASSERT(pDC->pState->pfnProcessPrims);
                                    pDC->pState->pfnProcessPrims(pDC, pa, workerId, prim,
                                        GenMask(pa.NumPrims()), pa.GetPrimID(work.startPrimID), _simd_set1_epi32(0));
                                }
                            }
                        }
                    }
                }
            } while (pa.NextPrim());

            // advance the fetch cursor by one SIMD of vertices
            i += KNOB_SIMD_WIDTH;
            if (IsIndexedT::value)
            {
                fetchInfo.pIndices = (int*)((uint8_t*)fetchInfo.pIndices + KNOB_SIMD_WIDTH * indexSize);
            }
            else
            {
                vIndex = _simd_add_epi32(vIndex, _simd_set1_epi32(KNOB_SIMD_WIDTH));
            }
        }
        pa.Reset();
    }

    // NOTE(review): numPrims is only declared under KNOB_ENABLE_RDTSC above;
    // presumably AR_END discards its argument otherwise — confirm.
    AR_END(FEProcessDraw, numPrims * work.numInstances);
}
1468
// Functor for TemplateArgUnroller: maps a pack of boolean template arguments
// to the matching ProcessDraw instantiation.
struct FEDrawChooser
{
    typedef PFN_FE_WORK_FUNC FuncType;

    template <typename... ArgsB>
    static FuncType GetFunc()
    {
        // instantiate ProcessDraw with the chosen template arguments
        return ProcessDraw<ArgsB...>;
    }
};
1479
1480
1481 // Selector for correct templated Draw front-end function
// Maps the six runtime draw flags onto the matching ProcessDraw template
// instantiation via TemplateArgUnroller / FEDrawChooser.
PFN_FE_WORK_FUNC GetProcessDrawFunc(
    bool IsIndexed,
    bool IsCutIndexEnabled,
    bool HasTessellation,
    bool HasGeometryShader,
    bool HasStreamOut,
    bool HasRasterization)
{
    return TemplateArgUnroller<FEDrawChooser>::GetFunc(IsIndexed, IsCutIndexEnabled, HasTessellation, HasGeometryShader, HasStreamOut, HasRasterization);
}
1492
1493 //////////////////////////////////////////////////////////////////////////
1494 /// @brief Processes attributes for the backend based on linkage mask and
1495 /// linkage map. Essentially just doing an SOA->AOS conversion and pack.
1496 /// @param pDC - Draw context
1497 /// @param pa - Primitive Assembly state
1498 /// @param linkageMask - Specifies which VS outputs are routed to PS.
1499 /// @param pLinkageMap - maps VS attribute slot to PS slot
1500 /// @param triIndex - Triangle to process attributes for
1501 /// @param pBuffer - Output result
template<typename NumVertsT, typename IsSwizzledT, typename HasConstantInterpT, typename IsDegenerate>
INLINE void ProcessAttributes(
    DRAW_CONTEXT *pDC,
    PA_STATE&pa,
    uint32_t triIndex,
    uint32_t primId,
    float *pBuffer)
{
    static_assert(NumVertsT::value > 0 && NumVertsT::value <= 3, "Invalid value for NumVertsT");
    const SWR_BACKEND_STATE& backendState = pDC->pState->state.backendState;
    // Conservative Rasterization requires degenerate tris to have constant attribute interpolation
    LONG constantInterpMask = IsDegenerate::value ? 0xFFFFFFFF : backendState.constantInterpolationMask;
    const uint32_t provokingVertex = pDC->pState->state.frontendState.topologyProvokingVertex;
    const PRIMITIVE_TOPOLOGY topo = pDC->pState->state.topology;

    // constant values indexed by SWR_CONSTANT_SOURCE (0000 / 0001 / 1111)
    static const float constTable[3][4] = {
        {0.0f, 0.0f, 0.0f, 0.0f},
        {0.0f, 0.0f, 0.0f, 1.0f},
        {1.0f, 1.0f, 1.0f, 1.0f}
    };

    // one iteration per backend attribute; writes NumVertsT::value verts
    // (padded to 3) of 4 floats each into pBuffer
    for (uint32_t i = 0; i < backendState.numAttributes; ++i)
    {
        uint32_t inputSlot;
        if (IsSwizzledT::value)
        {
            // remap through the swizzle table to the source VS attribute
            SWR_ATTRIB_SWIZZLE attribSwizzle = backendState.swizzleMap[i];
            inputSlot = VERTEX_ATTRIB_START_SLOT + attribSwizzle.sourceAttrib;

        }
        else
        {
            inputSlot = VERTEX_ATTRIB_START_SLOT + i;
        }

        __m128 attrib[3];    // triangle attribs (always 4 wide)
        float* pAttribStart = pBuffer;  // remembered for component overrides below

        if (HasConstantInterpT::value || IsDegenerate::value)
        {
            if (_bittest(&constantInterpMask, i))
            {
                // Constant interpolation: replicate the provoking vertex's
                // attribute to all verts. The tables translate (triIndex,
                // provokingVertex) to the triangle/vertex that holds the
                // provoking vertex for quad/strip decompositions.
                uint32_t vid;
                uint32_t adjustedTriIndex;
                static const uint32_t tristripProvokingVertex[] = { 0, 2, 1 };
                static const int32_t quadProvokingTri[2][4] = { {0, 0, 0, 1}, {0, -1, 0, 0} };
                static const uint32_t quadProvokingVertex[2][4] = { {0, 1, 2, 2}, {0, 1, 1, 2} };
                static const int32_t qstripProvokingTri[2][4] = { {0, 0, 0, 1}, {-1, 0, 0, 0} };
                static const uint32_t qstripProvokingVertex[2][4] = { {0, 1, 2, 1}, {0, 0, 2, 1} };

                switch (topo) {
                case TOP_QUAD_LIST:
                    adjustedTriIndex = triIndex + quadProvokingTri[triIndex & 1][provokingVertex];
                    vid = quadProvokingVertex[triIndex & 1][provokingVertex];
                    break;
                case TOP_QUAD_STRIP:
                    adjustedTriIndex = triIndex + qstripProvokingTri[triIndex & 1][provokingVertex];
                    vid = qstripProvokingVertex[triIndex & 1][provokingVertex];
                    break;
                case TOP_TRIANGLE_STRIP:
                    adjustedTriIndex = triIndex;
                    vid = (triIndex & 1)
                        ? tristripProvokingVertex[provokingVertex]
                        : provokingVertex;
                    break;
                default:
                    adjustedTriIndex = triIndex;
                    vid = provokingVertex;
                    break;
                }

                pa.AssembleSingle(inputSlot, adjustedTriIndex, attrib);

                // NOTE(review): this inner `i` shadows the attribute-loop `i`
                // (same in the loops below); behavior is correct but a rename
                // would aid readability.
                for (uint32_t i = 0; i < NumVertsT::value; ++i)
                {
                    _mm_store_ps(pBuffer, attrib[vid]);
                    pBuffer += 4;
                }
            }
            else
            {
                // linear interpolation: copy each vertex's attribute
                pa.AssembleSingle(inputSlot, triIndex, attrib);

                for (uint32_t i = 0; i < NumVertsT::value; ++i)
                {
                    _mm_store_ps(pBuffer, attrib[i]);
                    pBuffer += 4;
                }
            }
        }
        else
        {
            // no constant interpolation possible: copy each vertex's attribute
            pa.AssembleSingle(inputSlot, triIndex, attrib);

            for (uint32_t i = 0; i < NumVertsT::value; ++i)
            {
                _mm_store_ps(pBuffer, attrib[i]);
                pBuffer += 4;
            }
        }

        // pad out the attrib buffer to 3 verts to ensure the triangle
        // interpolation code in the pixel shader works correctly for the
        // 3 topologies - point, line, tri. This effectively zeros out the
        // effect of the missing vertices in the triangle interpolation.
        for (uint32_t v = NumVertsT::value; v < 3; ++v)
        {
            _mm_store_ps(pBuffer, attrib[NumVertsT::value - 1]);
            pBuffer += 4;
        }

        // check for constant source overrides
        if (IsSwizzledT::value)
        {
            uint32_t mask = backendState.swizzleMap[i].componentOverrideMask;
            if (mask)
            {
                DWORD comp;
                while (_BitScanForward(&comp, mask))
                {
                    mask &= ~(1 << comp);

                    float constantValue = 0.0f;
                    switch ((SWR_CONSTANT_SOURCE)backendState.swizzleMap[i].constantSource)
                    {
                    case SWR_CONSTANT_SOURCE_CONST_0000:
                    case SWR_CONSTANT_SOURCE_CONST_0001_FLOAT:
                    case SWR_CONSTANT_SOURCE_CONST_1111_FLOAT:
                        constantValue = constTable[backendState.swizzleMap[i].constantSource][comp];
                        break;
                    case SWR_CONSTANT_SOURCE_PRIM_ID:
                        // reinterpret the primitive ID bits as a float
                        constantValue = *(float*)&primId;
                        break;
                    }

                    // apply constant value to all 3 vertices
                    for (uint32_t v = 0; v < 3; ++v)
                    {
                        pAttribStart[comp + v * 4] = constantValue;
                    }
                }
            }
        }
    }
}
1647
1648
// Signature shared by every ProcessAttributes instantiation.
typedef void(*PFN_PROCESS_ATTRIBUTES)(DRAW_CONTEXT*, PA_STATE&, uint32_t, uint32_t, float*);
1650
// Functor for TemplateArgUnroller: maps template arguments to the matching
// ProcessAttributes instantiation.
struct ProcessAttributesChooser
{
    typedef PFN_PROCESS_ATTRIBUTES FuncType;

    template <typename... ArgsB>
    static FuncType GetFunc()
    {
        // instantiate ProcessAttributes with the chosen template arguments
        return ProcessAttributes<ArgsB...>;
    }
};
1661
// Selects the ProcessAttributes instantiation matching the runtime flags.
// NumVerts must be in [1, 3] (enforced by IntArg<1, 3>).
PFN_PROCESS_ATTRIBUTES GetProcessAttributesFunc(uint32_t NumVerts, bool IsSwizzled, bool HasConstantInterp, bool IsDegenerate = false)
{
    return TemplateArgUnroller<ProcessAttributesChooser>::GetFunc(IntArg<1, 3>{NumVerts}, IsSwizzled, HasConstantInterp, IsDegenerate);
}
1666
1667 //////////////////////////////////////////////////////////////////////////
1668 /// @brief Processes enabled user clip distances. Loads the active clip
1669 /// distances from the PA, sets up barycentric equations, and
1670 /// stores the results to the output buffer
1671 /// @param pa - Primitive Assembly state
1672 /// @param primIndex - primitive index to process
1673 /// @param clipDistMask - mask of enabled clip distances
1674 /// @param pUserClipBuffer - buffer to store results
1675 template<uint32_t NumVerts>
1676 void ProcessUserClipDist(PA_STATE& pa, uint32_t primIndex, uint8_t clipDistMask, float* pUserClipBuffer)
1677 {
1678 DWORD clipDist;
1679 while (_BitScanForward(&clipDist, clipDistMask))
1680 {
1681 clipDistMask &= ~(1 << clipDist);
1682 uint32_t clipSlot = clipDist >> 2;
1683 uint32_t clipComp = clipDist & 0x3;
1684 uint32_t clipAttribSlot = clipSlot == 0 ?
1685 VERTEX_CLIPCULL_DIST_LO_SLOT : VERTEX_CLIPCULL_DIST_HI_SLOT;
1686
1687 __m128 primClipDist[3];
1688 pa.AssembleSingle(clipAttribSlot, primIndex, primClipDist);
1689
1690 float vertClipDist[NumVerts];
1691 for (uint32_t e = 0; e < NumVerts; ++e)
1692 {
1693 OSALIGNSIMD(float) aVertClipDist[4];
1694 _mm_store_ps(aVertClipDist, primClipDist[e]);
1695 vertClipDist[e] = aVertClipDist[clipComp];
1696 };
1697
1698 // setup plane equations for barycentric interpolation in the backend
1699 float baryCoeff[NumVerts];
1700 for (uint32_t e = 0; e < NumVerts - 1; ++e)
1701 {
1702 baryCoeff[e] = vertClipDist[e] - vertClipDist[NumVerts - 1];
1703 }
1704 baryCoeff[NumVerts - 1] = vertClipDist[NumVerts - 1];
1705
1706 for (uint32_t e = 0; e < NumVerts; ++e)
1707 {
1708 *(pUserClipBuffer++) = baryCoeff[e];
1709 }
1710 }
1711 }
1712
1713 //////////////////////////////////////////////////////////////////////////
1714 /// @brief Convert the X,Y coords of a triangle to the requested Fixed
1715 /// Point precision from FP32.
1716 template <typename PT = FixedPointTraits<Fixed_16_8>>
1717 INLINE simdscalari fpToFixedPointVertical(const simdscalar vIn)
1718 {
1719 simdscalar vFixed = _simd_mul_ps(vIn, _simd_set1_ps(PT::ScaleT::value));
1720 return _simd_cvtps_epi32(vFixed);
1721 }
1722
1723 //////////////////////////////////////////////////////////////////////////
1724 /// @brief Helper function to set the X,Y coords of a triangle to the
1725 /// requested Fixed Point precision from FP32.
1726 /// @param tri: simdvector[3] of FP triangle verts
1727 /// @param vXi: fixed point X coords of tri verts
1728 /// @param vYi: fixed point Y coords of tri verts
1729 INLINE static void FPToFixedPoint(const simdvector * const tri, simdscalari (&vXi)[3], simdscalari (&vYi)[3])
1730 {
1731 vXi[0] = fpToFixedPointVertical(tri[0].x);
1732 vYi[0] = fpToFixedPointVertical(tri[0].y);
1733 vXi[1] = fpToFixedPointVertical(tri[1].x);
1734 vYi[1] = fpToFixedPointVertical(tri[1].y);
1735 vXi[2] = fpToFixedPointVertical(tri[2].x);
1736 vYi[2] = fpToFixedPointVertical(tri[2].y);
1737 }
1738
1739 //////////////////////////////////////////////////////////////////////////
1740 /// @brief Calculate bounding box for current triangle
1741 /// @tparam CT: ConservativeRastFETraits type
1742 /// @param vX: fixed point X position for triangle verts
1743 /// @param vY: fixed point Y position for triangle verts
1744 /// @param bbox: fixed point bbox
1745 /// *Note*: expects vX, vY to be in the correct precision for the type
1746 /// of rasterization. This avoids unnecessary FP->fixed conversions.
1747 template <typename CT>
1748 INLINE void calcBoundingBoxIntVertical(const simdvector * const tri, simdscalari (&vX)[3], simdscalari (&vY)[3], simdBBox &bbox)
1749 {
1750 simdscalari vMinX = vX[0];
1751 vMinX = _simd_min_epi32(vMinX, vX[1]);
1752 vMinX = _simd_min_epi32(vMinX, vX[2]);
1753
1754 simdscalari vMaxX = vX[0];
1755 vMaxX = _simd_max_epi32(vMaxX, vX[1]);
1756 vMaxX = _simd_max_epi32(vMaxX, vX[2]);
1757
1758 simdscalari vMinY = vY[0];
1759 vMinY = _simd_min_epi32(vMinY, vY[1]);
1760 vMinY = _simd_min_epi32(vMinY, vY[2]);
1761
1762 simdscalari vMaxY = vY[0];
1763 vMaxY = _simd_max_epi32(vMaxY, vY[1]);
1764 vMaxY = _simd_max_epi32(vMaxY, vY[2]);
1765
1766 bbox.xmin = vMinX;
1767 bbox.xmax = vMaxX;
1768 bbox.ymin = vMinY;
1769 bbox.ymax = vMaxY;
1770 }
1771
1772 //////////////////////////////////////////////////////////////////////////
1773 /// @brief FEConservativeRastT specialization of calcBoundingBoxIntVertical
1774 /// Offsets BBox for conservative rast
1775 template <>
1776 INLINE void calcBoundingBoxIntVertical<FEConservativeRastT>(const simdvector * const tri, simdscalari (&vX)[3], simdscalari (&vY)[3], simdBBox &bbox)
1777 {
1778 // FE conservative rast traits
1779 typedef FEConservativeRastT CT;
1780
1781 simdscalari vMinX = vX[0];
1782 vMinX = _simd_min_epi32(vMinX, vX[1]);
1783 vMinX = _simd_min_epi32(vMinX, vX[2]);
1784
1785 simdscalari vMaxX = vX[0];
1786 vMaxX = _simd_max_epi32(vMaxX, vX[1]);
1787 vMaxX = _simd_max_epi32(vMaxX, vX[2]);
1788
1789 simdscalari vMinY = vY[0];
1790 vMinY = _simd_min_epi32(vMinY, vY[1]);
1791 vMinY = _simd_min_epi32(vMinY, vY[2]);
1792
1793 simdscalari vMaxY = vY[0];
1794 vMaxY = _simd_max_epi32(vMaxY, vY[1]);
1795 vMaxY = _simd_max_epi32(vMaxY, vY[2]);
1796
1797 /// Bounding box needs to be expanded by 1/512 before snapping to 16.8 for conservative rasterization
1798 /// expand bbox by 1/256; coverage will be correctly handled in the rasterizer.
1799 bbox.xmin = _simd_sub_epi32(vMinX, _simd_set1_epi32(CT::BoundingBoxOffsetT::value));
1800 bbox.xmax = _simd_add_epi32(vMaxX, _simd_set1_epi32(CT::BoundingBoxOffsetT::value));
1801 bbox.ymin = _simd_sub_epi32(vMinY, _simd_set1_epi32(CT::BoundingBoxOffsetT::value));
1802 bbox.ymax = _simd_add_epi32(vMaxY, _simd_set1_epi32(CT::BoundingBoxOffsetT::value));
1803 }
1804
//////////////////////////////////////////////////////////////////////////
/// @brief Bin triangle primitives to macro tiles. Performs setup, clipping
///        culling, viewport transform, etc.
/// @param pDC - pointer to draw context.
/// @param pa - The primitive assembly object.
/// @param workerId - thread's worker id. Even thread has a unique id.
/// @param tri - Contains triangle position data for SIMDs worth of triangles.
/// @param triMask - Bitmask of lanes in 'tri' that hold valid triangles.
/// @param primID - Primitive ID for each triangle.
/// @param viewportIdx - viewport array index for each triangle.
/// @tparam CT - ConservativeRastFETraits
template <typename CT>
void BinTriangles(
    DRAW_CONTEXT *pDC,
    PA_STATE& pa,
    uint32_t workerId,
    simdvector tri[3],
    uint32_t triMask,
    simdscalari primID,
    simdscalari viewportIdx)
{
    SWR_CONTEXT *pContext = pDC->pContext;

    AR_BEGIN(FEBinTriangles, pDC->drawId);

    const API_STATE& state = GetApiState(pDC);
    const SWR_RASTSTATE& rastState = state.rastState;
    const SWR_FRONTEND_STATE& feState = state.frontendState;
    const SWR_GS_STATE& gsState = state.gsState;
    MacroTileMgr *pTileMgr = pDC->pTileMgr;


    simdscalar vRecipW0 = _simd_set1_ps(1.0f);
    simdscalar vRecipW1 = _simd_set1_ps(1.0f);
    simdscalar vRecipW2 = _simd_set1_ps(1.0f);

    if (feState.vpTransformDisable)
    {
        // RHW is passed in directly when VP transform is disabled
        vRecipW0 = tri[0].v[3];
        vRecipW1 = tri[1].v[3];
        vRecipW2 = tri[2].v[3];
    }
    else
    {
        // Perspective divide: scale x/y/z of each vertex by 1/w
        vRecipW0 = _simd_div_ps(_simd_set1_ps(1.0f), tri[0].w);
        vRecipW1 = _simd_div_ps(_simd_set1_ps(1.0f), tri[1].w);
        vRecipW2 = _simd_div_ps(_simd_set1_ps(1.0f), tri[2].w);

        tri[0].v[0] = _simd_mul_ps(tri[0].v[0], vRecipW0);
        tri[1].v[0] = _simd_mul_ps(tri[1].v[0], vRecipW1);
        tri[2].v[0] = _simd_mul_ps(tri[2].v[0], vRecipW2);

        tri[0].v[1] = _simd_mul_ps(tri[0].v[1], vRecipW0);
        tri[1].v[1] = _simd_mul_ps(tri[1].v[1], vRecipW1);
        tri[2].v[1] = _simd_mul_ps(tri[2].v[1], vRecipW2);

        tri[0].v[2] = _simd_mul_ps(tri[0].v[2], vRecipW0);
        tri[1].v[2] = _simd_mul_ps(tri[1].v[2], vRecipW1);
        tri[2].v[2] = _simd_mul_ps(tri[2].v[2], vRecipW2);

        // Viewport transform to screen space coords
        if (state.gsState.emitsViewportArrayIndex)
        {
            viewportTransform<3>(tri, state.vpMatrices, viewportIdx);
        }
        else
        {
            viewportTransform<3>(tri, state.vpMatrices);
        }
    }

    // Adjust for pixel center location
    simdscalar offset = g_pixelOffsets[rastState.pixelLocation];
    tri[0].x = _simd_add_ps(tri[0].x, offset);
    tri[0].y = _simd_add_ps(tri[0].y, offset);

    tri[1].x = _simd_add_ps(tri[1].x, offset);
    tri[1].y = _simd_add_ps(tri[1].y, offset);

    tri[2].x = _simd_add_ps(tri[2].x, offset);
    tri[2].y = _simd_add_ps(tri[2].y, offset);

    simdscalari vXi[3], vYi[3];
    // Set vXi, vYi to required fixed point precision
    FPToFixedPoint(tri, vXi, vYi);

    // triangle setup: edge coefficients A/B from the fixed point verts
    simdscalari vAi[3], vBi[3];
    triangleSetupABIntVertical(vXi, vYi, vAi, vBi);

    // determinant (2x triangle area); 64-bit per lane, split into lo/hi SIMD halves
    simdscalari vDet[2];
    calcDeterminantIntVertical(vAi, vBi, vDet);

    // cull zero area: build a per-triangle bitmask of lanes whose determinant == 0.
    // Each 64-bit lane contributes one bit via movemask_pd; lo half fills the low
    // KNOB_SIMD_WIDTH/2 bits, hi half the upper bits.
    int maskLo = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpeq_epi64(vDet[0], _simd_setzero_si())));
    int maskHi = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpeq_epi64(vDet[1], _simd_setzero_si())));

    int cullZeroAreaMask = maskLo | (maskHi << (KNOB_SIMD_WIDTH / 2));

    uint32_t origTriMask = triMask;
    // don't cull degenerate triangles if we're conservatively rasterizing
    if(!CT::IsConservativeT::value)
    {
        triMask &= ~cullZeroAreaMask;
    }

    // determine front winding tris
    // CW  +det
    // CCW det <= 0; 0 area triangles are marked as backfacing, which is required behavior for conservative rast
    maskLo = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpgt_epi64(vDet[0], _simd_setzero_si())));
    maskHi = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpgt_epi64(vDet[1], _simd_setzero_si())));
    int cwTriMask = maskLo | (maskHi << (KNOB_SIMD_WIDTH /2) );

    uint32_t frontWindingTris;
    if (rastState.frontWinding == SWR_FRONTWINDING_CW)
    {
        frontWindingTris = cwTriMask;
    }
    else
    {
        frontWindingTris = ~cwTriMask;
    }

    // cull by winding, per the API cull mode
    uint32_t cullTris;
    switch ((SWR_CULLMODE)rastState.cullMode)
    {
    case SWR_CULLMODE_BOTH:  cullTris = 0xffffffff; break;
    case SWR_CULLMODE_NONE:  cullTris = 0x0; break;
    case SWR_CULLMODE_FRONT: cullTris = frontWindingTris; break;
    // 0 area triangles are marked as backfacing, which is required behavior for conservative rast
    case SWR_CULLMODE_BACK:  cullTris = ~frontWindingTris; break;
    default: SWR_ASSERT(false, "Invalid cull mode: %d", rastState.cullMode); cullTris = 0x0; break;
    }

    triMask &= ~cullTris;

    if (origTriMask ^ triMask)
    {
        RDTSC_EVENT(FECullZeroAreaAndBackface, _mm_popcnt_u32(origTriMask ^ triMask), 0);
    }

    /// Note: these variable initializations must stay above any 'goto endBinTriangles'
    // compute per tri backface
    uint32_t frontFaceMask = frontWindingTris;
    uint32_t *pPrimID = (uint32_t *)&primID;
    const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx;
    DWORD triIndex = 0;
    // for center sample pattern, all samples are at pixel center; calculate coverage
    // once at center and broadcast the results in the backend
    const SWR_MULTISAMPLE_COUNT sampleCount = (rastState.samplePattern == SWR_MSAA_STANDARD_PATTERN) ? rastState.sampleCount : SWR_MULTISAMPLE_1X;
    uint32_t edgeEnable;            // conservative rast only: 3 valid-edge bits per triangle
    PFN_WORK_FUNC pfnWork;          // non-conservative only: shared rasterizer function for all tris
    if(CT::IsConservativeT::value)
    {
        // determine which edges of the degenerate tri, if any, are valid to rasterize.
        // used to call the appropriate templated rasterizer function
        if(cullZeroAreaMask > 0)
        {
            // An edge degenerates when its two endpoints coincide in fixed point.
            // e0 = v1-v0
            simdscalari x0x1Mask = _simd_cmpeq_epi32(vXi[0], vXi[1]);
            simdscalari y0y1Mask = _simd_cmpeq_epi32(vYi[0], vYi[1]);
            uint32_t e0Mask = _simd_movemask_ps(_simd_castsi_ps(_simd_and_si(x0x1Mask, y0y1Mask)));

            // e1 = v2-v1
            simdscalari x1x2Mask = _simd_cmpeq_epi32(vXi[1], vXi[2]);
            simdscalari y1y2Mask = _simd_cmpeq_epi32(vYi[1], vYi[2]);
            uint32_t e1Mask = _simd_movemask_ps(_simd_castsi_ps(_simd_and_si(x1x2Mask, y1y2Mask)));

            // e2 = v0-v2
            // if v0 == v1 & v1 == v2, v0 == v2
            uint32_t e2Mask = e0Mask & e1Mask;
            SWR_ASSERT(KNOB_SIMD_WIDTH == 8, "Need to update degenerate mask code for avx512");

            // Scatter each per-triangle edge bit into a 3-bits-per-triangle layout
            // (pdep deposits the 8 mask bits into every 3rd position).
            // edge order: e0 = v0v1, e1 = v1v2, e2 = v0v2
            // 32 bit binary: 0000 0000 0010 0100 1001 0010 0100 1001
            e0Mask = pdep_u32(e0Mask, 0x00249249);
            // 32 bit binary: 0000 0000 0100 1001 0010 0100 1001 0010
            e1Mask = pdep_u32(e1Mask, 0x00492492);
            // 32 bit binary: 0000 0000 1001 0010 0100 1001 0010 0100
            e2Mask = pdep_u32(e2Mask, 0x00924924);

            // invert: a set bit now means "edge is valid to rasterize"
            edgeEnable = (0x00FFFFFF & (~(e0Mask | e1Mask | e2Mask)));
        }
        else
        {
            edgeEnable = 0x00FFFFFF;
        }
    }
    else
    {
        // degenerate triangles won't be sent to rasterizer; just enable all edges
        pfnWork = GetRasterizerFunc(sampleCount, (rastState.conservativeRast > 0),
                                    (SWR_INPUT_COVERAGE)pDC->pState->state.psState.inputCoverage, ALL_EDGES_VALID,
                                    (state.scissorsTileAligned == false));
    }

    if (!triMask)
    {
        goto endBinTriangles;
    }

    // Calc bounding box of triangles
    simdBBox bbox;
    calcBoundingBoxIntVertical<CT>(tri, vXi, vYi, bbox);

    // determine if triangle falls between pixel centers and discard
    // only discard for non-MSAA case and when conservative rast is disabled
    // (xmin + 127) & ~255
    // (xmax + 128) & ~255
    if(rastState.sampleCount == SWR_MULTISAMPLE_1X && (!CT::IsConservativeT::value))
    {
        origTriMask = triMask;

        int cullCenterMask;
        {
            // Round bbox edges to the nearest pixel center (16.8 fixed point,
            // 256 units per pixel); if min and max round to the same center the
            // triangle covers no sample and can be discarded.
            simdscalari xmin = _simd_add_epi32(bbox.xmin, _simd_set1_epi32(127));
            xmin = _simd_and_si(xmin, _simd_set1_epi32(~255));
            simdscalari xmax = _simd_add_epi32(bbox.xmax, _simd_set1_epi32(128));
            xmax = _simd_and_si(xmax, _simd_set1_epi32(~255));

            simdscalari vMaskH = _simd_cmpeq_epi32(xmin, xmax);

            simdscalari ymin = _simd_add_epi32(bbox.ymin, _simd_set1_epi32(127));
            ymin = _simd_and_si(ymin, _simd_set1_epi32(~255));
            simdscalari ymax = _simd_add_epi32(bbox.ymax, _simd_set1_epi32(128));
            ymax = _simd_and_si(ymax, _simd_set1_epi32(~255));

            simdscalari vMaskV = _simd_cmpeq_epi32(ymin, ymax);
            vMaskV = _simd_or_si(vMaskH, vMaskV);
            cullCenterMask = _simd_movemask_ps(_simd_castsi_ps(vMaskV));
        }

        triMask &= ~cullCenterMask;

        if(origTriMask ^ triMask)
        {
            RDTSC_EVENT(FECullBetweenCenters, _mm_popcnt_u32(origTriMask ^ triMask), 0);
        }
    }

    // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
    // Gather the AOS effective scissor rects based on the per-prim VP index.
    /// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer.
    simdscalari scisXmin, scisYmin, scisXmax, scisYmax;
    if (state.gsState.emitsViewportArrayIndex)
    {
        GatherScissors<KNOB_SIMD_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
            scisXmin, scisYmin, scisXmax, scisYmax);
    }
    else // broadcast fast path for non-VPAI case.
    {
        scisXmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmin);
        scisYmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymin);
        scisXmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmax);
        scisYmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymax);
    }

    bbox.xmin = _simd_max_epi32(bbox.xmin, scisXmin);
    bbox.ymin = _simd_max_epi32(bbox.ymin, scisYmin);
    bbox.xmax = _simd_min_epi32(_simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1)), scisXmax);
    bbox.ymax = _simd_min_epi32(_simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1)), scisYmax);

    if(CT::IsConservativeT::value)
    {
        // in the case where a degenerate triangle is on a scissor edge, we need to make sure the primitive bbox has
        // some area. Bump the xmax/ymax edges out
        simdscalari topEqualsBottom = _simd_cmpeq_epi32(bbox.ymin, bbox.ymax);
        bbox.ymax = _simd_blendv_epi32(bbox.ymax, _simd_add_epi32(bbox.ymax, _simd_set1_epi32(1)), topEqualsBottom);
        simdscalari leftEqualsRight = _simd_cmpeq_epi32(bbox.xmin, bbox.xmax);
        bbox.xmax = _simd_blendv_epi32(bbox.xmax, _simd_add_epi32(bbox.xmax, _simd_set1_epi32(1)), leftEqualsRight);
    }

    // Cull tris completely outside scissor
    {
        simdscalari maskOutsideScissorX = _simd_cmpgt_epi32(bbox.xmin, bbox.xmax);
        simdscalari maskOutsideScissorY = _simd_cmpgt_epi32(bbox.ymin, bbox.ymax);
        simdscalari maskOutsideScissorXY = _simd_or_si(maskOutsideScissorX, maskOutsideScissorY);
        uint32_t maskOutsideScissor = _simd_movemask_ps(_simd_castsi_ps(maskOutsideScissorXY));
        triMask = triMask & ~maskOutsideScissor;
    }

    if (!triMask)
    {
        goto endBinTriangles;
    }

    // Convert triangle bbox to macrotile units.
    bbox.xmin = _simd_srai_epi32(bbox.xmin, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
    bbox.ymin = _simd_srai_epi32(bbox.ymin, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
    bbox.xmax = _simd_srai_epi32(bbox.xmax, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
    bbox.ymax = _simd_srai_epi32(bbox.ymax, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);

    OSALIGNSIMD(uint32_t) aMTLeft[KNOB_SIMD_WIDTH], aMTRight[KNOB_SIMD_WIDTH], aMTTop[KNOB_SIMD_WIDTH], aMTBottom[KNOB_SIMD_WIDTH];
    _simd_store_si((simdscalari*)aMTLeft, bbox.xmin);
    _simd_store_si((simdscalari*)aMTRight, bbox.xmax);
    _simd_store_si((simdscalari*)aMTTop, bbox.ymin);
    _simd_store_si((simdscalari*)aMTBottom, bbox.ymax);

    // transpose verts needed for backend
    /// @todo modify BE to take non-transformed verts
    __m128 vHorizX[8], vHorizY[8], vHorizZ[8], vHorizW[8];
    vTranspose3x8(vHorizX, tri[0].x, tri[1].x, tri[2].x);
    vTranspose3x8(vHorizY, tri[0].y, tri[1].y, tri[2].y);
    vTranspose3x8(vHorizZ, tri[0].z, tri[1].z, tri[2].z);
    vTranspose3x8(vHorizW, vRecipW0, vRecipW1, vRecipW2);

    // store render target array index
    OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH];
    if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex)
    {
        simdvector vRtai[3];
        pa.Assemble(VERTEX_RTAI_SLOT, vRtai);
        simdscalari vRtaii;
        vRtaii = _simd_castps_si(vRtai[0].x);
        _simd_store_si((simdscalari*)aRTAI, vRtaii);
    }
    else
    {
        _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si());
    }

    // scan remaining valid triangles and bin each separately
    while (_BitScanForward(&triIndex, triMask))
    {
        uint32_t linkageCount = state.backendState.numAttributes;
        uint32_t numScalarAttribs = linkageCount * 4;

        BE_WORK work;
        work.type = DRAW;

        bool isDegenerate;
        if(CT::IsConservativeT::value)
        {
            // only rasterize valid edges if we have a degenerate primitive
            int32_t triEdgeEnable = (edgeEnable >> (triIndex * 3)) & ALL_EDGES_VALID;
            work.pfnWork = GetRasterizerFunc(sampleCount, (rastState.conservativeRast > 0),
                                        (SWR_INPUT_COVERAGE)pDC->pState->state.psState.inputCoverage, triEdgeEnable,
                                        (state.scissorsTileAligned == false));

            // Degenerate triangles are required to be constant interpolated
            isDegenerate = (triEdgeEnable != ALL_EDGES_VALID) ? true : false;
        }
        else
        {
            isDegenerate = false;
            work.pfnWork = pfnWork;
        }

        // Select attribute processor
        PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(3,
            state.backendState.swizzleEnable, state.backendState.constantInterpolationMask, isDegenerate);

        TRIANGLE_WORK_DESC &desc = work.desc.tri;

        desc.triFlags.frontFacing = state.forceFront ? 1 : ((frontFaceMask >> triIndex) & 1);
        desc.triFlags.primID = pPrimID[triIndex];
        desc.triFlags.renderTargetArrayIndex = aRTAI[triIndex];
        desc.triFlags.viewportIndex = pViewportIndex[triIndex];

        auto pArena = pDC->pArena;
        SWR_ASSERT(pArena != nullptr);

        // store active attribs
        float *pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
        desc.pAttribs = pAttribs;
        desc.numAttribs = linkageCount;
        pfnProcessAttribs(pDC, pa, triIndex, pPrimID[triIndex], desc.pAttribs);

        // store triangle vertex data (x, y, z, 1/w blocks of 4 floats each)
        desc.pTriBuffer = (float*)pArena->AllocAligned(4 * 4 * sizeof(float), 16);

        _mm_store_ps(&desc.pTriBuffer[0], vHorizX[triIndex]);
        _mm_store_ps(&desc.pTriBuffer[4], vHorizY[triIndex]);
        _mm_store_ps(&desc.pTriBuffer[8], vHorizZ[triIndex]);
        _mm_store_ps(&desc.pTriBuffer[12], vHorizW[triIndex]);

        // store user clip distances
        if (rastState.clipDistanceMask)
        {
            uint32_t numClipDist = _mm_popcnt_u32(rastState.clipDistanceMask);
            desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 3 * sizeof(float));
            ProcessUserClipDist<3>(pa, triIndex, rastState.clipDistanceMask, desc.pUserClipBuffer);
        }

        // enqueue the triangle into every macrotile its bbox touches
        for (uint32_t y = aMTTop[triIndex]; y <= aMTBottom[triIndex]; ++y)
        {
            for (uint32_t x = aMTLeft[triIndex]; x <= aMTRight[triIndex]; ++x)
            {
#if KNOB_ENABLE_TOSS_POINTS
                if (!KNOB_TOSS_SETUP_TRIS)
#endif
                {
                    pTileMgr->enqueue(x, y, &work);
                }
            }
        }
        triMask &= ~(1 << triIndex);
    }

endBinTriangles:
    AR_END(FEBinTriangles, 1);
}
2210
//////////////////////////////////////////////////////////////////////////
/// @brief Chooser used with TemplateArgUnroller to map runtime arguments
///        onto the matching BinTriangles template instantiation.
struct FEBinTrianglesChooser
{
    typedef PFN_PROCESS_PRIMS FuncType;

    /// Returns the BinTriangles specialization whose ConservativeRastFETraits
    /// are built from the unrolled template arguments.
    template <typename... ArgsB>
    static FuncType GetFunc()
    {
        return BinTriangles<ConservativeRastFETraits<ArgsB...>>;
    }
};
2221
//////////////////////////////////////////////////////////////////////////
/// @brief Selector for the correctly templated BinTriangles function.
/// @param IsConservative - true selects the conservative-rasterization
///        specialization; false selects the standard path.
PFN_PROCESS_PRIMS GetBinTrianglesFunc(bool IsConservative)
{
    return TemplateArgUnroller<FEBinTrianglesChooser>::GetFunc(IsConservative);
}
2227
//////////////////////////////////////////////////////////////////////////
/// @brief Bin SIMD points to the backend. Fast "simple point" path handles
///        single-pixel points; otherwise points are bloated by point size
///        and binned to every macrotile they touch.
/// @param pDC - pointer to draw context.
/// @param pa - The primitive assembly object.
/// @param workerId - thread's worker id. Even thread has a unique id.
/// @param prim - Contains point position data for SIMDs worth of points
///        (only prim[0] is used).
/// @param primMask - Bitmask of lanes in 'prim' that hold valid points.
/// @param primID - Primitive ID for each point.
/// @param viewportIdx - Viewport array index for each point.
void BinPoints(
    DRAW_CONTEXT *pDC,
    PA_STATE& pa,
    uint32_t workerId,
    simdvector prim[3],
    uint32_t primMask,
    simdscalari primID,
    simdscalari viewportIdx)
{
    SWR_CONTEXT *pContext = pDC->pContext;

    AR_BEGIN(FEBinPoints, pDC->drawId);

    simdvector& primVerts = prim[0];

    const API_STATE& state = GetApiState(pDC);
    const SWR_FRONTEND_STATE& feState = state.frontendState;
    const SWR_GS_STATE& gsState = state.gsState;
    const SWR_RASTSTATE& rastState = state.rastState;
    const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx;

    // Select attribute processor
    PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(1,
        state.backendState.swizzleEnable, state.backendState.constantInterpolationMask);

    if (!feState.vpTransformDisable)
    {
        // perspective divide
        simdscalar vRecipW0 = _simd_div_ps(_simd_set1_ps(1.0f), primVerts.w);
        primVerts.x = _simd_mul_ps(primVerts.x, vRecipW0);
        primVerts.y = _simd_mul_ps(primVerts.y, vRecipW0);
        primVerts.z = _simd_mul_ps(primVerts.z, vRecipW0);

        // viewport transform to screen coords
        if (state.gsState.emitsViewportArrayIndex)
        {
            viewportTransform<1>(&primVerts, state.vpMatrices, viewportIdx);
        }
        else
        {
            viewportTransform<1>(&primVerts, state.vpMatrices);
        }
    }

    // adjust for pixel center location
    simdscalar offset = g_pixelOffsets[rastState.pixelLocation];
    primVerts.x = _simd_add_ps(primVerts.x, offset);
    primVerts.y = _simd_add_ps(primVerts.y, offset);

    // convert to fixed point
    simdscalari vXi, vYi;
    vXi = fpToFixedPointVertical(primVerts.x);
    vYi = fpToFixedPointVertical(primVerts.y);

    if (CanUseSimplePoints(pDC))
    {
        // adjust for ymin-xmin rule
        vXi = _simd_sub_epi32(vXi, _simd_set1_epi32(1));
        vYi = _simd_sub_epi32(vYi, _simd_set1_epi32(1));

        // cull points off the ymin-xmin edge of the viewport
        // (movemask picks up the sign bit of each now-negative coordinate)
        primMask &= ~_simd_movemask_ps(_simd_castsi_ps(vXi));
        primMask &= ~_simd_movemask_ps(_simd_castsi_ps(vYi));

        // compute macro tile coordinates
        simdscalari macroX = _simd_srai_epi32(vXi, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
        simdscalari macroY = _simd_srai_epi32(vYi, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);

        OSALIGNSIMD(uint32_t) aMacroX[KNOB_SIMD_WIDTH], aMacroY[KNOB_SIMD_WIDTH];
        _simd_store_si((simdscalari*)aMacroX, macroX);
        _simd_store_si((simdscalari*)aMacroY, macroY);

        // compute raster tile coordinates
        simdscalari rasterX = _simd_srai_epi32(vXi, KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT);
        simdscalari rasterY = _simd_srai_epi32(vYi, KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT);

        // compute raster tile relative x,y for coverage mask
        simdscalari tileAlignedX = _simd_slli_epi32(rasterX, KNOB_TILE_X_DIM_SHIFT);
        simdscalari tileAlignedY = _simd_slli_epi32(rasterY, KNOB_TILE_Y_DIM_SHIFT);

        simdscalari tileRelativeX = _simd_sub_epi32(_simd_srai_epi32(vXi, FIXED_POINT_SHIFT), tileAlignedX);
        simdscalari tileRelativeY = _simd_sub_epi32(_simd_srai_epi32(vYi, FIXED_POINT_SHIFT), tileAlignedY);

        OSALIGNSIMD(uint32_t) aTileRelativeX[KNOB_SIMD_WIDTH];
        OSALIGNSIMD(uint32_t) aTileRelativeY[KNOB_SIMD_WIDTH];
        _simd_store_si((simdscalari*)aTileRelativeX, tileRelativeX);
        _simd_store_si((simdscalari*)aTileRelativeY, tileRelativeY);

        OSALIGNSIMD(uint32_t) aTileAlignedX[KNOB_SIMD_WIDTH];
        OSALIGNSIMD(uint32_t) aTileAlignedY[KNOB_SIMD_WIDTH];
        _simd_store_si((simdscalari*)aTileAlignedX, tileAlignedX);
        _simd_store_si((simdscalari*)aTileAlignedY, tileAlignedY);

        OSALIGNSIMD(float) aZ[KNOB_SIMD_WIDTH];
        _simd_store_ps((float*)aZ, primVerts.z);

        // store render target array index
        OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH];
        if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex)
        {
            simdvector vRtai;
            pa.Assemble(VERTEX_RTAI_SLOT, &vRtai);
            simdscalari vRtaii = _simd_castps_si(vRtai.x);
            _simd_store_si((simdscalari*)aRTAI, vRtaii);
        }
        else
        {
            _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si());
        }

        uint32_t *pPrimID = (uint32_t *)&primID;
        DWORD primIndex = 0;

        const SWR_BACKEND_STATE& backendState = pDC->pState->state.backendState;

        // scan remaining valid points and bin each separately
        while (_BitScanForward(&primIndex, primMask))
        {
            uint32_t linkageCount = backendState.numAttributes;
            uint32_t numScalarAttribs = linkageCount * 4;

            BE_WORK work;
            work.type = DRAW;

            TRIANGLE_WORK_DESC &desc = work.desc.tri;

            // points are always front facing
            desc.triFlags.frontFacing = 1;
            desc.triFlags.primID = pPrimID[primIndex];
            desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];
            desc.triFlags.viewportIndex = pViewportIndex[primIndex];

            work.pfnWork = RasterizeSimplePoint;

            auto pArena = pDC->pArena;
            SWR_ASSERT(pArena != nullptr);

            // store attributes
            float *pAttribs = (float*)pArena->AllocAligned(3 * numScalarAttribs * sizeof(float), 16);
            desc.pAttribs = pAttribs;
            desc.numAttribs = linkageCount;

            pfnProcessAttribs(pDC, pa, primIndex, pPrimID[primIndex], pAttribs);

            // store raster tile aligned x, y, perspective correct z
            float *pTriBuffer = (float*)pArena->AllocAligned(4 * sizeof(float), 16);
            desc.pTriBuffer = pTriBuffer;
            *(uint32_t*)pTriBuffer++ = aTileAlignedX[primIndex];
            *(uint32_t*)pTriBuffer++ = aTileAlignedY[primIndex];
            *pTriBuffer = aZ[primIndex];

            uint32_t tX = aTileRelativeX[primIndex];
            uint32_t tY = aTileRelativeY[primIndex];

            // pack the relative x,y into the coverageMask, the rasterizer will
            // generate the true coverage mask from it
            work.desc.tri.triFlags.coverageMask = tX | (tY << 4);

            // bin it: a simple point lands in exactly one macrotile
            MacroTileMgr *pTileMgr = pDC->pTileMgr;
#if KNOB_ENABLE_TOSS_POINTS
            if (!KNOB_TOSS_SETUP_TRIS)
#endif
            {
                pTileMgr->enqueue(aMacroX[primIndex], aMacroY[primIndex], &work);
            }
            primMask &= ~(1 << primIndex);
        }
    }
    else
    {
        // non simple points need to be potentially binned to multiple macro tiles
        simdscalar vPointSize;
        if (rastState.pointParam)
        {
            // per-vertex point size supplied by the shader
            simdvector size[3];
            pa.Assemble(VERTEX_POINT_SIZE_SLOT, size);
            vPointSize = size[0].x;
        }
        else
        {
            // constant point size from raster state
            vPointSize = _simd_set1_ps(rastState.pointSize);
        }

        // bloat point to bbox
        simdBBox bbox;
        bbox.xmin = bbox.xmax = vXi;
        bbox.ymin = bbox.ymax = vYi;

        simdscalar vHalfWidth = _simd_mul_ps(vPointSize, _simd_set1_ps(0.5f));
        simdscalari vHalfWidthi = fpToFixedPointVertical(vHalfWidth);
        bbox.xmin = _simd_sub_epi32(bbox.xmin, vHalfWidthi);
        bbox.xmax = _simd_add_epi32(bbox.xmax, vHalfWidthi);
        bbox.ymin = _simd_sub_epi32(bbox.ymin, vHalfWidthi);
        bbox.ymax = _simd_add_epi32(bbox.ymax, vHalfWidthi);

        // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
        // Gather the AOS effective scissor rects based on the per-prim VP index.
        /// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer.
        simdscalari scisXmin, scisYmin, scisXmax, scisYmax;
        if (state.gsState.emitsViewportArrayIndex)
        {
            GatherScissors<KNOB_SIMD_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
                scisXmin, scisYmin, scisXmax, scisYmax);
        }
        else // broadcast fast path for non-VPAI case.
        {
            scisXmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmin);
            scisYmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymin);
            scisXmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmax);
            scisYmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymax);
        }

        bbox.xmin = _simd_max_epi32(bbox.xmin, scisXmin);
        bbox.ymin = _simd_max_epi32(bbox.ymin, scisYmin);
        bbox.xmax = _simd_min_epi32(_simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1)), scisXmax);
        bbox.ymax = _simd_min_epi32(_simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1)), scisYmax);

        // Cull bloated points completely outside scissor
        simdscalari maskOutsideScissorX = _simd_cmpgt_epi32(bbox.xmin, bbox.xmax);
        simdscalari maskOutsideScissorY = _simd_cmpgt_epi32(bbox.ymin, bbox.ymax);
        simdscalari maskOutsideScissorXY = _simd_or_si(maskOutsideScissorX, maskOutsideScissorY);
        uint32_t maskOutsideScissor = _simd_movemask_ps(_simd_castsi_ps(maskOutsideScissorXY));
        primMask = primMask & ~maskOutsideScissor;

        // Convert bbox to macrotile units.
        bbox.xmin = _simd_srai_epi32(bbox.xmin, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
        bbox.ymin = _simd_srai_epi32(bbox.ymin, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
        bbox.xmax = _simd_srai_epi32(bbox.xmax, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
        bbox.ymax = _simd_srai_epi32(bbox.ymax, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);

        OSALIGNSIMD(uint32_t) aMTLeft[KNOB_SIMD_WIDTH], aMTRight[KNOB_SIMD_WIDTH], aMTTop[KNOB_SIMD_WIDTH], aMTBottom[KNOB_SIMD_WIDTH];
        _simd_store_si((simdscalari*)aMTLeft, bbox.xmin);
        _simd_store_si((simdscalari*)aMTRight, bbox.xmax);
        _simd_store_si((simdscalari*)aMTTop, bbox.ymin);
        _simd_store_si((simdscalari*)aMTBottom, bbox.ymax);

        // store render target array index
        OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH];
        if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex)
        {
            simdvector vRtai[2];
            pa.Assemble(VERTEX_RTAI_SLOT, vRtai);
            simdscalari vRtaii = _simd_castps_si(vRtai[0].x);
            _simd_store_si((simdscalari*)aRTAI, vRtaii);
        }
        else
        {
            _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si());
        }

        OSALIGNSIMD(float) aPointSize[KNOB_SIMD_WIDTH];
        _simd_store_ps((float*)aPointSize, vPointSize);

        uint32_t *pPrimID = (uint32_t *)&primID;

        OSALIGNSIMD(float) aPrimVertsX[KNOB_SIMD_WIDTH];
        OSALIGNSIMD(float) aPrimVertsY[KNOB_SIMD_WIDTH];
        OSALIGNSIMD(float) aPrimVertsZ[KNOB_SIMD_WIDTH];

        _simd_store_ps((float*)aPrimVertsX, primVerts.x);
        _simd_store_ps((float*)aPrimVertsY, primVerts.y);
        _simd_store_ps((float*)aPrimVertsZ, primVerts.z);

        // scan remaining valid prims and bin each separately
        const SWR_BACKEND_STATE& backendState = state.backendState;
        DWORD primIndex;
        while (_BitScanForward(&primIndex, primMask))
        {
            uint32_t linkageCount = backendState.numAttributes;
            uint32_t numScalarAttribs = linkageCount * 4;

            BE_WORK work;
            work.type = DRAW;

            TRIANGLE_WORK_DESC &desc = work.desc.tri;

            // points are always front facing
            desc.triFlags.frontFacing = 1;
            desc.triFlags.primID = pPrimID[primIndex];
            desc.triFlags.pointSize = aPointSize[primIndex];
            desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];
            desc.triFlags.viewportIndex = pViewportIndex[primIndex];

            work.pfnWork = RasterizeTriPoint;

            auto pArena = pDC->pArena;
            SWR_ASSERT(pArena != nullptr);

            // store active attribs
            desc.pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
            desc.numAttribs = linkageCount;
            pfnProcessAttribs(pDC, pa, primIndex, pPrimID[primIndex], desc.pAttribs);

            // store point vertex data
            float *pTriBuffer = (float*)pArena->AllocAligned(4 * sizeof(float), 16);
            desc.pTriBuffer = pTriBuffer;
            *pTriBuffer++ = aPrimVertsX[primIndex];
            *pTriBuffer++ = aPrimVertsY[primIndex];
            *pTriBuffer = aPrimVertsZ[primIndex];

            // store user clip distances
            if (rastState.clipDistanceMask)
            {
                uint32_t numClipDist = _mm_popcnt_u32(rastState.clipDistanceMask);
                desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 2 * sizeof(float));
                ProcessUserClipDist<2>(pa, primIndex, rastState.clipDistanceMask, desc.pUserClipBuffer);
            }

            // enqueue the bloated point into every macrotile its bbox touches
            MacroTileMgr *pTileMgr = pDC->pTileMgr;
            for (uint32_t y = aMTTop[primIndex]; y <= aMTBottom[primIndex]; ++y)
            {
                for (uint32_t x = aMTLeft[primIndex]; x <= aMTRight[primIndex]; ++x)
                {
#if KNOB_ENABLE_TOSS_POINTS
                    if (!KNOB_TOSS_SETUP_TRIS)
#endif
                    {
                        pTileMgr->enqueue(x, y, &work);
                    }
                }
            }

            primMask &= ~(1 << primIndex);
        }
    }

    AR_END(FEBinPoints, 1);
}
2563
2564 //////////////////////////////////////////////////////////////////////////
2565 /// @brief Bin SIMD lines to the backend.
2566 /// @param pDC - pointer to draw context.
2567 /// @param pa - The primitive assembly object.
2568 /// @param workerId - thread's worker id. Even thread has a unique id.
2569 /// @param tri - Contains line position data for SIMDs worth of points.
2570 /// @param primID - Primitive ID for each line.
2571 /// @param viewportIdx - Viewport Array Index for each line.
2572 void BinLines(
2573 DRAW_CONTEXT *pDC,
2574 PA_STATE& pa,
2575 uint32_t workerId,
2576 simdvector prim[],
2577 uint32_t primMask,
2578 simdscalari primID,
2579 simdscalari viewportIdx)
2580 {
2581 SWR_CONTEXT *pContext = pDC->pContext;
2582
2583 AR_BEGIN(FEBinLines, pDC->drawId);
2584
2585 const API_STATE& state = GetApiState(pDC);
2586 const SWR_RASTSTATE& rastState = state.rastState;
2587 const SWR_FRONTEND_STATE& feState = state.frontendState;
2588 const SWR_GS_STATE& gsState = state.gsState;
2589
2590 // Select attribute processor
2591 PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(2,
2592 state.backendState.swizzleEnable, state.backendState.constantInterpolationMask);
2593
2594 simdscalar vRecipW0 = _simd_set1_ps(1.0f);
2595 simdscalar vRecipW1 = _simd_set1_ps(1.0f);
2596
2597 if (!feState.vpTransformDisable)
2598 {
2599 // perspective divide
2600 vRecipW0 = _simd_div_ps(_simd_set1_ps(1.0f), prim[0].w);
2601 vRecipW1 = _simd_div_ps(_simd_set1_ps(1.0f), prim[1].w);
2602
2603 prim[0].v[0] = _simd_mul_ps(prim[0].v[0], vRecipW0);
2604 prim[1].v[0] = _simd_mul_ps(prim[1].v[0], vRecipW1);
2605
2606 prim[0].v[1] = _simd_mul_ps(prim[0].v[1], vRecipW0);
2607 prim[1].v[1] = _simd_mul_ps(prim[1].v[1], vRecipW1);
2608
2609 prim[0].v[2] = _simd_mul_ps(prim[0].v[2], vRecipW0);
2610 prim[1].v[2] = _simd_mul_ps(prim[1].v[2], vRecipW1);
2611
2612 // viewport transform to screen coords
2613 if (state.gsState.emitsViewportArrayIndex)
2614 {
2615 viewportTransform<2>(prim, state.vpMatrices, viewportIdx);
2616 }
2617 else
2618 {
2619 viewportTransform<2>(prim, state.vpMatrices);
2620 }
2621 }
2622
2623 // adjust for pixel center location
2624 simdscalar offset = g_pixelOffsets[rastState.pixelLocation];
2625 prim[0].x = _simd_add_ps(prim[0].x, offset);
2626 prim[0].y = _simd_add_ps(prim[0].y, offset);
2627
2628 prim[1].x = _simd_add_ps(prim[1].x, offset);
2629 prim[1].y = _simd_add_ps(prim[1].y, offset);
2630
2631 // convert to fixed point
2632 simdscalari vXi[2], vYi[2];
2633 vXi[0] = fpToFixedPointVertical(prim[0].x);
2634 vYi[0] = fpToFixedPointVertical(prim[0].y);
2635 vXi[1] = fpToFixedPointVertical(prim[1].x);
2636 vYi[1] = fpToFixedPointVertical(prim[1].y);
2637
2638 // compute x-major vs y-major mask
2639 simdscalari xLength = _simd_abs_epi32(_simd_sub_epi32(vXi[0], vXi[1]));
2640 simdscalari yLength = _simd_abs_epi32(_simd_sub_epi32(vYi[0], vYi[1]));
2641 simdscalar vYmajorMask = _simd_castsi_ps(_simd_cmpgt_epi32(yLength, xLength));
2642 uint32_t yMajorMask = _simd_movemask_ps(vYmajorMask);
2643
2644 // cull zero-length lines
2645 simdscalari vZeroLengthMask = _simd_cmpeq_epi32(xLength, _simd_setzero_si());
2646 vZeroLengthMask = _simd_and_si(vZeroLengthMask, _simd_cmpeq_epi32(yLength, _simd_setzero_si()));
2647
2648 primMask &= ~_simd_movemask_ps(_simd_castsi_ps(vZeroLengthMask));
2649
2650 uint32_t *pPrimID = (uint32_t *)&primID;
2651 const uint32_t *pViewportIndex = (uint32_t *)&viewportIdx;
2652
2653 simdscalar vUnused = _simd_setzero_ps();
2654
2655 // Calc bounding box of lines
2656 simdBBox bbox;
2657 bbox.xmin = _simd_min_epi32(vXi[0], vXi[1]);
2658 bbox.xmax = _simd_max_epi32(vXi[0], vXi[1]);
2659 bbox.ymin = _simd_min_epi32(vYi[0], vYi[1]);
2660 bbox.ymax = _simd_max_epi32(vYi[0], vYi[1]);
2661
2662 // bloat bbox by line width along minor axis
2663 simdscalar vHalfWidth = _simd_set1_ps(rastState.lineWidth / 2.0f);
2664 simdscalari vHalfWidthi = fpToFixedPointVertical(vHalfWidth);
2665 simdBBox bloatBox;
2666 bloatBox.xmin = _simd_sub_epi32(bbox.xmin, vHalfWidthi);
2667 bloatBox.xmax = _simd_add_epi32(bbox.xmax, vHalfWidthi);
2668 bloatBox.ymin = _simd_sub_epi32(bbox.ymin, vHalfWidthi);
2669 bloatBox.ymax = _simd_add_epi32(bbox.ymax, vHalfWidthi);
2670
2671 bbox.xmin = _simd_blendv_epi32(bbox.xmin, bloatBox.xmin, vYmajorMask);
2672 bbox.xmax = _simd_blendv_epi32(bbox.xmax, bloatBox.xmax, vYmajorMask);
2673 bbox.ymin = _simd_blendv_epi32(bloatBox.ymin, bbox.ymin, vYmajorMask);
2674 bbox.ymax = _simd_blendv_epi32(bloatBox.ymax, bbox.ymax, vYmajorMask);
2675
2676 // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
2677 simdscalari scisXmin, scisYmin, scisXmax, scisYmax;
2678 if (state.gsState.emitsViewportArrayIndex)
2679 {
2680 GatherScissors<KNOB_SIMD_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
2681 scisXmin, scisYmin, scisXmax, scisYmax);
2682 }
2683 else // broadcast fast path for non-VPAI case.
2684 {
2685 scisXmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmin);
2686 scisYmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymin);
2687 scisXmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmax);
2688 scisYmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymax);
2689 }
2690
2691 bbox.xmin = _simd_max_epi32(bbox.xmin, scisXmin);
2692 bbox.ymin = _simd_max_epi32(bbox.ymin, scisYmin);
2693 bbox.xmax = _simd_min_epi32(_simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1)), scisXmax);
2694 bbox.ymax = _simd_min_epi32(_simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1)), scisYmax);
2695
2696 // Cull prims completely outside scissor
2697 {
2698 simdscalari maskOutsideScissorX = _simd_cmpgt_epi32(bbox.xmin, bbox.xmax);
2699 simdscalari maskOutsideScissorY = _simd_cmpgt_epi32(bbox.ymin, bbox.ymax);
2700 simdscalari maskOutsideScissorXY = _simd_or_si(maskOutsideScissorX, maskOutsideScissorY);
2701 uint32_t maskOutsideScissor = _simd_movemask_ps(_simd_castsi_ps(maskOutsideScissorXY));
2702 primMask = primMask & ~maskOutsideScissor;
2703 }
2704
2705 if (!primMask)
2706 {
2707 goto endBinLines;
2708 }
2709
2710    // Convert line bbox to macrotile units.
2711 bbox.xmin = _simd_srai_epi32(bbox.xmin, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
2712 bbox.ymin = _simd_srai_epi32(bbox.ymin, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
2713 bbox.xmax = _simd_srai_epi32(bbox.xmax, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
2714 bbox.ymax = _simd_srai_epi32(bbox.ymax, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
2715
2716 OSALIGNSIMD(uint32_t) aMTLeft[KNOB_SIMD_WIDTH], aMTRight[KNOB_SIMD_WIDTH], aMTTop[KNOB_SIMD_WIDTH], aMTBottom[KNOB_SIMD_WIDTH];
2717 _simd_store_si((simdscalari*)aMTLeft, bbox.xmin);
2718 _simd_store_si((simdscalari*)aMTRight, bbox.xmax);
2719 _simd_store_si((simdscalari*)aMTTop, bbox.ymin);
2720 _simd_store_si((simdscalari*)aMTBottom, bbox.ymax);
2721
2722 // transpose verts needed for backend
2723 /// @todo modify BE to take non-transformed verts
2724 __m128 vHorizX[8], vHorizY[8], vHorizZ[8], vHorizW[8];
2725 vTranspose3x8(vHorizX, prim[0].x, prim[1].x, vUnused);
2726 vTranspose3x8(vHorizY, prim[0].y, prim[1].y, vUnused);
2727 vTranspose3x8(vHorizZ, prim[0].z, prim[1].z, vUnused);
2728 vTranspose3x8(vHorizW, vRecipW0, vRecipW1, vUnused);
2729
2730 // store render target array index
2731 OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH];
2732 if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex)
2733 {
2734 simdvector vRtai[2];
2735 pa.Assemble(VERTEX_RTAI_SLOT, vRtai);
2736 simdscalari vRtaii = _simd_castps_si(vRtai[0].x);
2737 _simd_store_si((simdscalari*)aRTAI, vRtaii);
2738 }
2739 else
2740 {
2741 _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si());
2742 }
2743
2744 // scan remaining valid prims and bin each separately
2745 DWORD primIndex;
2746 while (_BitScanForward(&primIndex, primMask))
2747 {
2748 uint32_t linkageCount = state.backendState.numAttributes;
2749 uint32_t numScalarAttribs = linkageCount * 4;
2750
2751 BE_WORK work;
2752 work.type = DRAW;
2753
2754 TRIANGLE_WORK_DESC &desc = work.desc.tri;
2755
2756 desc.triFlags.frontFacing = 1;
2757 desc.triFlags.primID = pPrimID[primIndex];
2758 desc.triFlags.yMajor = (yMajorMask >> primIndex) & 1;
2759 desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];
2760 desc.triFlags.viewportIndex = pViewportIndex[primIndex];
2761
2762 work.pfnWork = RasterizeLine;
2763
2764 auto pArena = pDC->pArena;
2765 SWR_ASSERT(pArena != nullptr);
2766
2767 // store active attribs
2768 desc.pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
2769 desc.numAttribs = linkageCount;
2770 pfnProcessAttribs(pDC, pa, primIndex, pPrimID[primIndex], desc.pAttribs);
2771
2772 // store line vertex data
2773 desc.pTriBuffer = (float*)pArena->AllocAligned(4 * 4 * sizeof(float), 16);
2774 _mm_store_ps(&desc.pTriBuffer[0], vHorizX[primIndex]);
2775 _mm_store_ps(&desc.pTriBuffer[4], vHorizY[primIndex]);
2776 _mm_store_ps(&desc.pTriBuffer[8], vHorizZ[primIndex]);
2777 _mm_store_ps(&desc.pTriBuffer[12], vHorizW[primIndex]);
2778
2779 // store user clip distances
2780 if (rastState.clipDistanceMask)
2781 {
2782 uint32_t numClipDist = _mm_popcnt_u32(rastState.clipDistanceMask);
2783 desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 2 * sizeof(float));
2784 ProcessUserClipDist<2>(pa, primIndex, rastState.clipDistanceMask, desc.pUserClipBuffer);
2785 }
2786
2787 MacroTileMgr *pTileMgr = pDC->pTileMgr;
2788 for (uint32_t y = aMTTop[primIndex]; y <= aMTBottom[primIndex]; ++y)
2789 {
2790 for (uint32_t x = aMTLeft[primIndex]; x <= aMTRight[primIndex]; ++x)
2791 {
2792 #if KNOB_ENABLE_TOSS_POINTS
2793 if (!KNOB_TOSS_SETUP_TRIS)
2794 #endif
2795 {
2796 pTileMgr->enqueue(x, y, &work);
2797 }
2798 }
2799 }
2800
2801 primMask &= ~(1 << primIndex);
2802 }
2803
2804 endBinLines:
2805
2806 AR_END(FEBinLines, 1);
2807 }