swr: [rasterizer core] improve implementation for SoWriteOffset
[mesa.git] / src / gallium / drivers / swr / rasterizer / core / frontend.cpp
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file frontend.cpp
24 *
25 * @brief Implementation for Frontend which handles vertex processing,
26 * primitive assembly, clipping, binning, etc.
27 *
28 ******************************************************************************/
29
30 #include "api.h"
31 #include "frontend.h"
32 #include "backend.h"
33 #include "context.h"
34 #include "rdtsc_core.h"
35 #include "rasterizer.h"
36 #include "conservativeRast.h"
37 #include "utils.h"
38 #include "threads.h"
39 #include "pa.h"
40 #include "clip.h"
41 #include "tilemgr.h"
42 #include "tessellator.h"
43 #include <limits>
44
45 //////////////////////////////////////////////////////////////////////////
46 /// @brief Helper macro to generate a bitmask
47 static INLINE uint32_t GenMask(uint32_t numBits)
48 {
49 SWR_ASSERT(numBits <= (sizeof(uint32_t) * 8), "Too many bits (%d) for %s", numBits, __FUNCTION__);
50 return ((1U << numBits) - 1);
51 }
52
53 //////////////////////////////////////////////////////////////////////////
54 /// @brief Offsets added to post-viewport vertex positions based on
55 /// raster state.
// Lookup table indexed by SWR_PIXEL_LOCATION: offset 0.0 for pixel-center
// positions, 0.5 when positions are specified at the pixel's upper-left corner.
static const simdscalar g_pixelOffsets[SWR_PIXEL_LOCATION_UL + 1] =
{
    _simd_set1_ps(0.0f), // SWR_PIXEL_LOCATION_CENTER
    _simd_set1_ps(0.5f), // SWR_PIXEL_LOCATION_UL
};
61
62 //////////////////////////////////////////////////////////////////////////
63 /// @brief FE handler for SwrSync.
64 /// @param pContext - pointer to SWR context.
65 /// @param pDC - pointer to draw context.
66 /// @param workerId - thread's worker id. Even thread has a unique id.
67 /// @param pUserData - Pointer to user data passed back to sync callback.
68 /// @todo This should go away when we switch this to use compute threading.
69 void ProcessSync(
70 SWR_CONTEXT *pContext,
71 DRAW_CONTEXT *pDC,
72 uint32_t workerId,
73 void *pUserData)
74 {
75 BE_WORK work;
76 work.type = SYNC;
77 work.pfnWork = ProcessSyncBE;
78
79 MacroTileMgr *pTileMgr = pDC->pTileMgr;
80 pTileMgr->enqueue(0, 0, &work);
81 }
82
83 //////////////////////////////////////////////////////////////////////////
84 /// @brief FE handler for SwrGetStats.
85 /// @param pContext - pointer to SWR context.
86 /// @param pDC - pointer to draw context.
87 /// @param workerId - thread's worker id. Even thread has a unique id.
88 /// @param pUserData - Pointer to user data passed back to stats callback.
89 /// @todo This should go away when we switch this to use compute threading.
90 void ProcessQueryStats(
91 SWR_CONTEXT *pContext,
92 DRAW_CONTEXT *pDC,
93 uint32_t workerId,
94 void *pUserData)
95 {
96 QUERY_DESC *pQueryStats = (QUERY_DESC*)pUserData;
97 BE_WORK work;
98 work.type = QUERYSTATS;
99 work.pfnWork = ProcessQueryStatsBE;
100 work.desc.queryStats = *pQueryStats;
101
102 MacroTileMgr *pTileMgr = pDC->pTileMgr;
103 pTileMgr->enqueue(0, 0, &work);
104 }
105
106 //////////////////////////////////////////////////////////////////////////
107 /// @brief FE handler for SwrClearRenderTarget.
108 /// @param pContext - pointer to SWR context.
109 /// @param pDC - pointer to draw context.
110 /// @param workerId - thread's worker id. Even thread has a unique id.
111 /// @param pUserData - Pointer to user data passed back to clear callback.
112 /// @todo This should go away when we switch this to use compute threading.
113 void ProcessClear(
114 SWR_CONTEXT *pContext,
115 DRAW_CONTEXT *pDC,
116 uint32_t workerId,
117 void *pUserData)
118 {
119 CLEAR_DESC *pClear = (CLEAR_DESC*)pUserData;
120 MacroTileMgr *pTileMgr = pDC->pTileMgr;
121
122 const API_STATE& state = GetApiState(pDC);
123
124 // queue a clear to each macro tile
125 // compute macro tile bounds for the current scissor/viewport
126 uint32_t macroTileLeft = state.scissorInFixedPoint.left / KNOB_MACROTILE_X_DIM_FIXED;
127 uint32_t macroTileRight = state.scissorInFixedPoint.right / KNOB_MACROTILE_X_DIM_FIXED;
128 uint32_t macroTileTop = state.scissorInFixedPoint.top / KNOB_MACROTILE_Y_DIM_FIXED;
129 uint32_t macroTileBottom = state.scissorInFixedPoint.bottom / KNOB_MACROTILE_Y_DIM_FIXED;
130
131 BE_WORK work;
132 work.type = CLEAR;
133 work.pfnWork = ProcessClearBE;
134 work.desc.clear = *pClear;
135
136 for (uint32_t y = macroTileTop; y <= macroTileBottom; ++y)
137 {
138 for (uint32_t x = macroTileLeft; x <= macroTileRight; ++x)
139 {
140 pTileMgr->enqueue(x, y, &work);
141 }
142 }
143 }
144
145 //////////////////////////////////////////////////////////////////////////
146 /// @brief FE handler for SwrStoreTiles.
147 /// @param pContext - pointer to SWR context.
148 /// @param pDC - pointer to draw context.
149 /// @param workerId - thread's worker id. Even thread has a unique id.
150 /// @param pUserData - Pointer to user data passed back to callback.
151 /// @todo This should go away when we switch this to use compute threading.
152 void ProcessStoreTiles(
153 SWR_CONTEXT *pContext,
154 DRAW_CONTEXT *pDC,
155 uint32_t workerId,
156 void *pUserData)
157 {
158 RDTSC_START(FEProcessStoreTiles);
159 STORE_TILES_DESC *pStore = (STORE_TILES_DESC*)pUserData;
160 MacroTileMgr *pTileMgr = pDC->pTileMgr;
161
162 const API_STATE& state = GetApiState(pDC);
163
164 // queue a store to each macro tile
165 // compute macro tile bounds for the current render target
166 const uint32_t macroWidth = KNOB_MACROTILE_X_DIM;
167 const uint32_t macroHeight = KNOB_MACROTILE_Y_DIM;
168
169 uint32_t numMacroTilesX = ((uint32_t)state.vp[0].width + (uint32_t)state.vp[0].x + (macroWidth - 1)) / macroWidth;
170 uint32_t numMacroTilesY = ((uint32_t)state.vp[0].height + (uint32_t)state.vp[0].y + (macroHeight - 1)) / macroHeight;
171
172 // store tiles
173 BE_WORK work;
174 work.type = STORETILES;
175 work.pfnWork = ProcessStoreTileBE;
176 work.desc.storeTiles = *pStore;
177
178 for (uint32_t x = 0; x < numMacroTilesX; ++x)
179 {
180 for (uint32_t y = 0; y < numMacroTilesY; ++y)
181 {
182 pTileMgr->enqueue(x, y, &work);
183 }
184 }
185
186 RDTSC_STOP(FEProcessStoreTiles, 0, pDC->drawId);
187 }
188
189 //////////////////////////////////////////////////////////////////////////
190 /// @brief FE handler for SwrInvalidateTiles.
191 /// @param pContext - pointer to SWR context.
192 /// @param pDC - pointer to draw context.
193 /// @param workerId - thread's worker id. Even thread has a unique id.
194 /// @param pUserData - Pointer to user data passed back to callback.
195 /// @todo This should go away when we switch this to use compute threading.
void ProcessDiscardInvalidateTiles(
    SWR_CONTEXT *pContext,
    DRAW_CONTEXT *pDC,
    uint32_t workerId,
    void *pUserData)
{
    RDTSC_START(FEProcessInvalidateTiles);
    DISCARD_INVALIDATE_TILES_DESC *pInv = (DISCARD_INVALIDATE_TILES_DESC*)pUserData;
    MacroTileMgr *pTileMgr = pDC->pTileMgr;

    SWR_RECT rect;

    // An all-zero rect means "unspecified"; any nonzero edge selects the
    // caller-provided rect, otherwise fall back to viewport 0.
    if (pInv->rect.top | pInv->rect.bottom | pInv->rect.right | pInv->rect.left)
    {
        // Valid rect
        rect = pInv->rect;
    }
    else
    {
        // Use viewport dimensions
        const API_STATE& state = GetApiState(pDC);

        rect.left = (uint32_t)state.vp[0].x;
        rect.right = (uint32_t)(state.vp[0].x + state.vp[0].width);
        rect.top = (uint32_t)state.vp[0].y;
        rect.bottom = (uint32_t)(state.vp[0].y + state.vp[0].height);
    }

    // queue a store to each macro tile
    // compute macro tile bounds for the current render target
    uint32_t macroWidth = KNOB_MACROTILE_X_DIM;
    uint32_t macroHeight = KNOB_MACROTILE_Y_DIM;

    // Setup region assuming full tiles:
    // start is rounded up and end rounded down so only fully-covered
    // macrotiles fall inside [start, end).
    uint32_t macroTileStartX = (rect.left + (macroWidth - 1)) / macroWidth;
    uint32_t macroTileStartY = (rect.top + (macroHeight - 1)) / macroHeight;

    uint32_t macroTileEndX = rect.right / macroWidth;
    uint32_t macroTileEndY = rect.bottom / macroHeight;

    if (pInv->fullTilesOnly == false)
    {
        // include partial tiles: round start down and end up instead.
        macroTileStartX = rect.left / macroWidth;
        macroTileStartY = rect.top / macroHeight;

        macroTileEndX = (rect.right + macroWidth - 1) / macroWidth;
        macroTileEndY = (rect.bottom + macroHeight - 1) / macroHeight;
    }

    SWR_ASSERT(macroTileEndX <= KNOB_NUM_HOT_TILES_X);
    SWR_ASSERT(macroTileEndY <= KNOB_NUM_HOT_TILES_Y);

    // Clamp to the hot-tile array bounds so release builds stay in range too.
    macroTileEndX = std::min<uint32_t>(macroTileEndX, KNOB_NUM_HOT_TILES_X);
    macroTileEndY = std::min<uint32_t>(macroTileEndY, KNOB_NUM_HOT_TILES_Y);

    // load tiles
    BE_WORK work;
    work.type = DISCARDINVALIDATETILES;
    work.pfnWork = ProcessDiscardInvalidateTilesBE;
    work.desc.discardInvalidateTiles = *pInv;

    // Note: end bounds are exclusive here (x < end), unlike ProcessClear's
    // inclusive macrotile loop.
    for (uint32_t x = macroTileStartX; x < macroTileEndX; ++x)
    {
        for (uint32_t y = macroTileStartY; y < macroTileEndY; ++y)
        {
            pTileMgr->enqueue(x, y, &work);
        }
    }

    RDTSC_STOP(FEProcessInvalidateTiles, 0, pDC->drawId);
}
268
269 //////////////////////////////////////////////////////////////////////////
270 /// @brief Computes the number of primitives given the number of verts.
271 /// @param mode - primitive topology for draw operation.
272 /// @param numPrims - number of vertices or indices for draw.
273 /// @todo Frontend needs to be refactored. This will go in appropriate place then.
274 uint32_t GetNumPrims(
275 PRIMITIVE_TOPOLOGY mode,
276 uint32_t numPrims)
277 {
278 switch (mode)
279 {
280 case TOP_POINT_LIST: return numPrims;
281 case TOP_TRIANGLE_LIST: return numPrims / 3;
282 case TOP_TRIANGLE_STRIP: return numPrims < 3 ? 0 : numPrims - 2;
283 case TOP_TRIANGLE_FAN: return numPrims < 3 ? 0 : numPrims - 2;
284 case TOP_TRIANGLE_DISC: return numPrims < 2 ? 0 : numPrims - 1;
285 case TOP_QUAD_LIST: return numPrims / 4;
286 case TOP_QUAD_STRIP: return numPrims < 4 ? 0 : (numPrims - 2) / 2;
287 case TOP_LINE_STRIP: return numPrims < 2 ? 0 : numPrims - 1;
288 case TOP_LINE_LIST: return numPrims / 2;
289 case TOP_LINE_LOOP: return numPrims;
290 case TOP_RECT_LIST: return numPrims / 3;
291 case TOP_LINE_LIST_ADJ: return numPrims / 4;
292 case TOP_LISTSTRIP_ADJ: return numPrims < 3 ? 0 : numPrims - 3;
293 case TOP_TRI_LIST_ADJ: return numPrims / 6;
294 case TOP_TRI_STRIP_ADJ: return numPrims < 4 ? 0 : (numPrims / 2) - 2;
295
296 case TOP_PATCHLIST_1:
297 case TOP_PATCHLIST_2:
298 case TOP_PATCHLIST_3:
299 case TOP_PATCHLIST_4:
300 case TOP_PATCHLIST_5:
301 case TOP_PATCHLIST_6:
302 case TOP_PATCHLIST_7:
303 case TOP_PATCHLIST_8:
304 case TOP_PATCHLIST_9:
305 case TOP_PATCHLIST_10:
306 case TOP_PATCHLIST_11:
307 case TOP_PATCHLIST_12:
308 case TOP_PATCHLIST_13:
309 case TOP_PATCHLIST_14:
310 case TOP_PATCHLIST_15:
311 case TOP_PATCHLIST_16:
312 case TOP_PATCHLIST_17:
313 case TOP_PATCHLIST_18:
314 case TOP_PATCHLIST_19:
315 case TOP_PATCHLIST_20:
316 case TOP_PATCHLIST_21:
317 case TOP_PATCHLIST_22:
318 case TOP_PATCHLIST_23:
319 case TOP_PATCHLIST_24:
320 case TOP_PATCHLIST_25:
321 case TOP_PATCHLIST_26:
322 case TOP_PATCHLIST_27:
323 case TOP_PATCHLIST_28:
324 case TOP_PATCHLIST_29:
325 case TOP_PATCHLIST_30:
326 case TOP_PATCHLIST_31:
327 case TOP_PATCHLIST_32:
328 return numPrims / (mode - TOP_PATCHLIST_BASE);
329
330 case TOP_POLYGON:
331 case TOP_POINT_LIST_BF:
332 case TOP_LINE_STRIP_CONT:
333 case TOP_LINE_STRIP_BF:
334 case TOP_LINE_STRIP_CONT_BF:
335 case TOP_TRIANGLE_FAN_NOSTIPPLE:
336 case TOP_TRI_STRIP_REVERSE:
337 case TOP_PATCHLIST_BASE:
338 case TOP_UNKNOWN:
339 SWR_ASSERT(false, "Unsupported topology: %d", mode);
340 return 0;
341 }
342
343 return 0;
344 }
345
346 //////////////////////////////////////////////////////////////////////////
347 /// @brief Computes the number of verts given the number of primitives.
348 /// @param mode - primitive topology for draw operation.
349 /// @param numPrims - number of primitives for draw.
350 uint32_t GetNumVerts(
351 PRIMITIVE_TOPOLOGY mode,
352 uint32_t numPrims)
353 {
354 switch (mode)
355 {
356 case TOP_POINT_LIST: return numPrims;
357 case TOP_TRIANGLE_LIST: return numPrims * 3;
358 case TOP_TRIANGLE_STRIP: return numPrims ? numPrims + 2 : 0;
359 case TOP_TRIANGLE_FAN: return numPrims ? numPrims + 2 : 0;
360 case TOP_TRIANGLE_DISC: return numPrims ? numPrims + 1 : 0;
361 case TOP_QUAD_LIST: return numPrims * 4;
362 case TOP_QUAD_STRIP: return numPrims ? numPrims * 2 + 2 : 0;
363 case TOP_LINE_STRIP: return numPrims ? numPrims + 1 : 0;
364 case TOP_LINE_LIST: return numPrims * 2;
365 case TOP_LINE_LOOP: return numPrims;
366 case TOP_RECT_LIST: return numPrims * 3;
367 case TOP_LINE_LIST_ADJ: return numPrims * 4;
368 case TOP_LISTSTRIP_ADJ: return numPrims ? numPrims + 3 : 0;
369 case TOP_TRI_LIST_ADJ: return numPrims * 6;
370 case TOP_TRI_STRIP_ADJ: return numPrims ? (numPrims + 2) * 2 : 0;
371
372 case TOP_PATCHLIST_1:
373 case TOP_PATCHLIST_2:
374 case TOP_PATCHLIST_3:
375 case TOP_PATCHLIST_4:
376 case TOP_PATCHLIST_5:
377 case TOP_PATCHLIST_6:
378 case TOP_PATCHLIST_7:
379 case TOP_PATCHLIST_8:
380 case TOP_PATCHLIST_9:
381 case TOP_PATCHLIST_10:
382 case TOP_PATCHLIST_11:
383 case TOP_PATCHLIST_12:
384 case TOP_PATCHLIST_13:
385 case TOP_PATCHLIST_14:
386 case TOP_PATCHLIST_15:
387 case TOP_PATCHLIST_16:
388 case TOP_PATCHLIST_17:
389 case TOP_PATCHLIST_18:
390 case TOP_PATCHLIST_19:
391 case TOP_PATCHLIST_20:
392 case TOP_PATCHLIST_21:
393 case TOP_PATCHLIST_22:
394 case TOP_PATCHLIST_23:
395 case TOP_PATCHLIST_24:
396 case TOP_PATCHLIST_25:
397 case TOP_PATCHLIST_26:
398 case TOP_PATCHLIST_27:
399 case TOP_PATCHLIST_28:
400 case TOP_PATCHLIST_29:
401 case TOP_PATCHLIST_30:
402 case TOP_PATCHLIST_31:
403 case TOP_PATCHLIST_32:
404 return numPrims * (mode - TOP_PATCHLIST_BASE);
405
406 case TOP_POLYGON:
407 case TOP_POINT_LIST_BF:
408 case TOP_LINE_STRIP_CONT:
409 case TOP_LINE_STRIP_BF:
410 case TOP_LINE_STRIP_CONT_BF:
411 case TOP_TRIANGLE_FAN_NOSTIPPLE:
412 case TOP_TRI_STRIP_REVERSE:
413 case TOP_PATCHLIST_BASE:
414 case TOP_UNKNOWN:
415 SWR_ASSERT(false, "Unsupported topology: %d", mode);
416 return 0;
417 }
418
419 return 0;
420 }
421
422 //////////////////////////////////////////////////////////////////////////
423 /// @brief Return number of verts per primitive.
424 /// @param topology - topology
425 /// @param includeAdjVerts - include adjacent verts in primitive vertices
426 INLINE uint32_t NumVertsPerPrim(PRIMITIVE_TOPOLOGY topology, bool includeAdjVerts)
427 {
428 uint32_t numVerts = 0;
429 switch (topology)
430 {
431 case TOP_POINT_LIST:
432 case TOP_POINT_LIST_BF:
433 numVerts = 1;
434 break;
435 case TOP_LINE_LIST:
436 case TOP_LINE_STRIP:
437 case TOP_LINE_LIST_ADJ:
438 case TOP_LINE_LOOP:
439 case TOP_LINE_STRIP_CONT:
440 case TOP_LINE_STRIP_BF:
441 case TOP_LISTSTRIP_ADJ:
442 numVerts = 2;
443 break;
444 case TOP_TRIANGLE_LIST:
445 case TOP_TRIANGLE_STRIP:
446 case TOP_TRIANGLE_FAN:
447 case TOP_TRI_LIST_ADJ:
448 case TOP_TRI_STRIP_ADJ:
449 case TOP_TRI_STRIP_REVERSE:
450 case TOP_RECT_LIST:
451 numVerts = 3;
452 break;
453 case TOP_QUAD_LIST:
454 case TOP_QUAD_STRIP:
455 numVerts = 4;
456 break;
457 case TOP_PATCHLIST_1:
458 case TOP_PATCHLIST_2:
459 case TOP_PATCHLIST_3:
460 case TOP_PATCHLIST_4:
461 case TOP_PATCHLIST_5:
462 case TOP_PATCHLIST_6:
463 case TOP_PATCHLIST_7:
464 case TOP_PATCHLIST_8:
465 case TOP_PATCHLIST_9:
466 case TOP_PATCHLIST_10:
467 case TOP_PATCHLIST_11:
468 case TOP_PATCHLIST_12:
469 case TOP_PATCHLIST_13:
470 case TOP_PATCHLIST_14:
471 case TOP_PATCHLIST_15:
472 case TOP_PATCHLIST_16:
473 case TOP_PATCHLIST_17:
474 case TOP_PATCHLIST_18:
475 case TOP_PATCHLIST_19:
476 case TOP_PATCHLIST_20:
477 case TOP_PATCHLIST_21:
478 case TOP_PATCHLIST_22:
479 case TOP_PATCHLIST_23:
480 case TOP_PATCHLIST_24:
481 case TOP_PATCHLIST_25:
482 case TOP_PATCHLIST_26:
483 case TOP_PATCHLIST_27:
484 case TOP_PATCHLIST_28:
485 case TOP_PATCHLIST_29:
486 case TOP_PATCHLIST_30:
487 case TOP_PATCHLIST_31:
488 case TOP_PATCHLIST_32:
489 numVerts = topology - TOP_PATCHLIST_BASE;
490 break;
491 default:
492 SWR_ASSERT(false, "Unsupported topology: %d", topology);
493 break;
494 }
495
496 if (includeAdjVerts)
497 {
498 switch (topology)
499 {
500 case TOP_LISTSTRIP_ADJ:
501 case TOP_LINE_LIST_ADJ: numVerts = 4; break;
502 case TOP_TRI_STRIP_ADJ:
503 case TOP_TRI_LIST_ADJ: numVerts = 6; break;
504 default: break;
505 }
506 }
507
508 return numVerts;
509 }
510
511 //////////////////////////////////////////////////////////////////////////
512 /// @brief Generate mask from remaining work.
513 /// @param numWorkItems - Number of items being worked on by a SIMD.
514 static INLINE simdscalari GenerateMask(uint32_t numItemsRemaining)
515 {
516 uint32_t numActive = (numItemsRemaining >= KNOB_SIMD_WIDTH) ? KNOB_SIMD_WIDTH : numItemsRemaining;
517 uint32_t mask = (numActive > 0) ? ((1 << numActive) - 1) : 0;
518 return _simd_castps_si(vMask(mask));
519 }
520
521 //////////////////////////////////////////////////////////////////////////
522 /// @brief StreamOut - Streams vertex data out to SO buffers.
523 /// Generally, we are only streaming out a SIMDs worth of triangles.
524 /// @param pDC - pointer to draw context.
525 /// @param workerId - thread's worker id. Even thread has a unique id.
526 /// @param numPrims - Number of prims to streamout (e.g. points, lines, tris)
static void StreamOut(
    DRAW_CONTEXT* pDC,
    PA_STATE& pa,
    uint32_t workerId,
    uint32_t* pPrimData,
    uint32_t streamIndex)
{
    RDTSC_START(FEStreamout);

    SWR_CONTEXT* pContext = pDC->pContext;

    const API_STATE& state = GetApiState(pDC);
    const SWR_STREAMOUT_STATE &soState = state.soState;

    // Verts per assembled primitive, excluding adjacency verts.
    uint32_t soVertsPerPrim = NumVertsPerPrim(pa.binTopology, false);

    // The pPrimData buffer is sparse in that we allocate memory for all 32 attributes for each vertex.
    uint32_t primDataDwordVertexStride = (KNOB_NUM_ATTRIBUTES * sizeof(float) * 4) / sizeof(uint32_t);

    SWR_STREAMOUT_CONTEXT soContext = { 0 };

    // Setup buffer state pointers.
    for (uint32_t i = 0; i < 4; ++i)
    {
        soContext.pBuffer[i] = &state.soBuffer[i];
    }

    // For each assembled primitive, gather the enabled attributes into
    // pPrimData and invoke the jitted streamout shader (SOS) once per prim.
    uint32_t numPrims = pa.NumPrims();
    for (uint32_t primIndex = 0; primIndex < numPrims; ++primIndex)
    {
        DWORD slot = 0;
        // Per-stream bitmask of attribute slots to stream out.
        uint32_t soMask = soState.streamMasks[streamIndex];

        // Write all entries into primitive data buffer for SOS.
        while (_BitScanForward(&slot, soMask))
        {
            __m128 attrib[MAX_NUM_VERTS_PER_PRIM]; // prim attribs (always 4 wide)
            uint32_t paSlot = slot + VERTEX_ATTRIB_START_SLOT;
            pa.AssembleSingle(paSlot, primIndex, attrib);

            // Attribute offset is relative offset from start of vertex.
            // Note that attributes start at slot 1 in the PA buffer. We need to write this
            // to prim data starting at slot 0. Which is why we do (slot - 1).
            // Also note: GL works slightly differently, and needs slot 0
            uint32_t primDataAttribOffset = slot * sizeof(float) * 4 / sizeof(uint32_t);

            // Store each vertex's attrib at appropriate locations in pPrimData buffer.
            for (uint32_t v = 0; v < soVertsPerPrim; ++v)
            {
                uint32_t* pPrimDataAttrib = pPrimData + primDataAttribOffset + (v * primDataDwordVertexStride);

                _mm_store_ps((float*)pPrimDataAttrib, attrib[v]);
            }
            // Clear the slot we just handled and move to the next set bit.
            soMask &= ~(1 << slot);
        }

        // Update pPrimData pointer
        soContext.pPrimData = pPrimData;

        // Call SOS
        SWR_ASSERT(state.pfnSoFunc[streamIndex] != nullptr, "Trying to execute uninitialized streamout jit function.");
        state.pfnSoFunc[streamIndex](soContext);
    }

    // Update SO write offset. The driver provides memory for the update.
    for (uint32_t i = 0; i < 4; ++i)
    {
        if (state.soBuffer[i].pWriteOffset)
        {
            // streamOffset is in dwords; convert to bytes for the driver.
            *state.soBuffer[i].pWriteOffset = soContext.pBuffer[i]->streamOffset * sizeof(uint32_t);
        }

        if (state.soBuffer[i].soWriteEnable)
        {
            // Record the updated offset (and mark it dirty) in the draw's
            // dynamic state so later consumers pick up the new position.
            pDC->dynState.SoWriteOffset[i] = soContext.pBuffer[i]->streamOffset * sizeof(uint32_t);
            pDC->dynState.SoWriteOffsetDirty[i] = true;
        }
    }

    UPDATE_STAT(SoPrimStorageNeeded[streamIndex], soContext.numPrimStorageNeeded);
    UPDATE_STAT(SoNumPrimsWritten[streamIndex], soContext.numPrimsWritten);

    RDTSC_STOP(FEStreamout, 1, 0);
}
611
612 //////////////////////////////////////////////////////////////////////////
613 /// @brief Computes number of invocations. The current index represents
614 /// the start of the SIMD. The max index represents how much work
615 /// items are remaining. If there is less then a SIMD's left of work
616 /// then return the remaining amount of work.
617 /// @param curIndex - The start index for the SIMD.
618 /// @param maxIndex - The last index for all work items.
619 static INLINE uint32_t GetNumInvocations(
620 uint32_t curIndex,
621 uint32_t maxIndex)
622 {
623 uint32_t remainder = (maxIndex - curIndex);
624 return (remainder >= KNOB_SIMD_WIDTH) ? KNOB_SIMD_WIDTH : remainder;
625 }
626
627 //////////////////////////////////////////////////////////////////////////
628 /// @brief Converts a streamId buffer to a cut buffer for the given stream id.
629 /// The geometry shader will loop over each active streamout buffer, assembling
630 /// primitives for the downstream stages. When multistream output is enabled,
631 /// the generated stream ID buffer from the GS needs to be converted to a cut
632 /// buffer for the primitive assembler.
633 /// @param stream - stream id to generate the cut buffer for
634 /// @param pStreamIdBase - pointer to the stream ID buffer
635 /// @param numEmittedVerts - Number of total verts emitted by the GS
636 /// @param pCutBuffer - output buffer to write cuts to
637 void ProcessStreamIdBuffer(uint32_t stream, uint8_t* pStreamIdBase, uint32_t numEmittedVerts, uint8_t *pCutBuffer)
638 {
639 SWR_ASSERT(stream < MAX_SO_STREAMS);
640
641 uint32_t numInputBytes = (numEmittedVerts * 2 + 7) / 8;
642 uint32_t numOutputBytes = std::max(numInputBytes / 2, 1U);
643
644 for (uint32_t b = 0; b < numOutputBytes; ++b)
645 {
646 uint8_t curInputByte = pStreamIdBase[2*b];
647 uint8_t outByte = 0;
648 for (uint32_t i = 0; i < 4; ++i)
649 {
650 if ((curInputByte & 0x3) != stream)
651 {
652 outByte |= (1 << i);
653 }
654 curInputByte >>= 2;
655 }
656
657 curInputByte = pStreamIdBase[2 * b + 1];
658 for (uint32_t i = 0; i < 4; ++i)
659 {
660 if ((curInputByte & 0x3) != stream)
661 {
662 outByte |= (1 << (i + 4));
663 }
664 curInputByte >>= 2;
665 }
666
667 *pCutBuffer++ = outByte;
668 }
669 }
670
671 THREAD SWR_GS_CONTEXT tlsGsContext;
672
673 //////////////////////////////////////////////////////////////////////////
674 /// @brief Implements GS stage.
675 /// @param pDC - pointer to draw context.
676 /// @param workerId - thread's worker id. Even thread has a unique id.
677 /// @param pa - The primitive assembly object.
678 /// @param pGsOut - output stream for GS
template <
    typename HasStreamOutT,
    typename HasRastT>
static void GeometryShaderStage(
    DRAW_CONTEXT *pDC,
    uint32_t workerId,
    PA_STATE& pa,
    void* pGsOut,
    void* pCutBuffer,
    void* pStreamCutBuffer,
    uint32_t* pSoPrimData,
    simdscalari primID)
{
    RDTSC_START(FEGeometryShader);

    SWR_CONTEXT* pContext = pDC->pContext;

    const API_STATE& state = GetApiState(pDC);
    const SWR_GS_STATE* pState = &state.gsState;

    SWR_ASSERT(pGsOut != nullptr, "GS output buffer should be initialized");
    SWR_ASSERT(pCutBuffer != nullptr, "GS output cut buffer should be initialized");

    // Point the thread-local GS context at the caller-allocated output buffers.
    tlsGsContext.pStream = (uint8_t*)pGsOut;
    tlsGsContext.pCutOrStreamIdBuffer = (uint8_t*)pCutBuffer;
    tlsGsContext.PrimitiveID = primID;

    uint32_t numVertsPerPrim = NumVertsPerPrim(pa.binTopology, true);
    simdvector attrib[MAX_ATTRIBUTES];

    // assemble all attributes for the input primitive
    for (uint32_t slot = 0; slot < pState->numInputAttribs; ++slot)
    {
        uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + slot;
        pa.Assemble(attribSlot, attrib);

        for (uint32_t i = 0; i < numVertsPerPrim; ++i)
        {
            tlsGsContext.vert[i].attrib[attribSlot] = attrib[i];
        }
    }

    // assemble position
    pa.Assemble(VERTEX_POSITION_SLOT, attrib);
    for (uint32_t i = 0; i < numVertsPerPrim; ++i)
    {
        tlsGsContext.vert[i].attrib[VERTEX_POSITION_SLOT] = attrib[i];
    }

    // Buffer layout: one simdvertex batch per SIMD_WIDTH emitted verts, per
    // input prim, per GS instance.
    const uint32_t vertexStride = sizeof(simdvertex);
    const uint32_t numSimdBatches = (state.gsState.maxNumVerts + KNOB_SIMD_WIDTH - 1) / KNOB_SIMD_WIDTH;
    const uint32_t inputPrimStride = numSimdBatches * vertexStride;
    const uint32_t instanceStride = inputPrimStride * KNOB_SIMD_WIDTH;
    uint32_t cutPrimStride;
    uint32_t cutInstanceStride;

    if (pState->isSingleStream)
    {
        // Single stream: 1 cut bit per emitted vertex.
        cutPrimStride = (state.gsState.maxNumVerts + 7) / 8;
        cutInstanceStride = cutPrimStride * KNOB_SIMD_WIDTH;
    }
    else
    {
        // Multi-stream: 2 stream-ID bits per emitted vertex, dword-aligned.
        cutPrimStride = AlignUp(state.gsState.maxNumVerts * 2 / 8, 4);
        cutInstanceStride = cutPrimStride * KNOB_SIMD_WIDTH;
    }

    // record valid prims from the frontend to avoid over binning the newly generated
    // prims from the GS
    uint32_t numInputPrims = pa.NumPrims();

    // Run the GS once per declared instance; each invocation writes to its
    // own slice of the output and cut/stream-id buffers.
    for (uint32_t instance = 0; instance < pState->instanceCount; ++instance)
    {
        tlsGsContext.InstanceID = instance;
        tlsGsContext.mask = GenerateMask(numInputPrims);

        // execute the geometry shader
        state.pfnGsFunc(GetPrivateState(pDC), &tlsGsContext);

        tlsGsContext.pStream += instanceStride;
        tlsGsContext.pCutOrStreamIdBuffer += cutInstanceStride;
    }

    // set up new binner and state for the GS output topology
    PFN_PROCESS_PRIMS pfnClipFunc = nullptr;
    if (HasRastT::value)
    {
        switch (pState->outputTopology)
        {
        case TOP_TRIANGLE_STRIP: pfnClipFunc = ClipTriangles; break;
        case TOP_LINE_STRIP: pfnClipFunc = ClipLines; break;
        case TOP_POINT_LIST: pfnClipFunc = ClipPoints; break;
        default: SWR_ASSERT(false, "Unexpected GS output topology: %d", pState->outputTopology);
        }
    }

    // foreach input prim:
    // - setup a new PA based on the emitted verts for that prim
    // - loop over the new verts, calling PA to assemble each prim
    uint32_t* pVertexCount = (uint32_t*)&tlsGsContext.vertexCount;
    uint32_t* pPrimitiveId = (uint32_t*)&primID;

    uint32_t totalPrimsGenerated = 0;
    for (uint32_t inputPrim = 0; inputPrim < numInputPrims; ++inputPrim)
    {
        uint8_t* pInstanceBase = (uint8_t*)pGsOut + inputPrim * inputPrimStride;
        uint8_t* pCutBufferBase = (uint8_t*)pCutBuffer + inputPrim * cutPrimStride;
        for (uint32_t instance = 0; instance < pState->instanceCount; ++instance)
        {
            // NOTE(review): emitted-vert count is indexed by inputPrim only,
            // not per GS instance — confirm this matches how the GS reports
            // vertexCount for instanced GS.
            uint32_t numEmittedVerts = pVertexCount[inputPrim];
            if (numEmittedVerts == 0)
            {
                continue;
            }

            uint8_t* pBase = pInstanceBase + instance * instanceStride;
            uint8_t* pCutBase = pCutBufferBase + instance * cutInstanceStride;

            uint32_t numAttribs = state.feNumAttributes;

            for (uint32_t stream = 0; stream < MAX_SO_STREAMS; ++stream)
            {
                bool processCutVerts = false;

                // Shadows the function parameter intentionally: selects either
                // the GS-written cut buffer or the translated per-stream one.
                uint8_t* pCutBuffer = pCutBase;

                // assign default stream ID, only relevant when GS is outputting a single stream
                uint32_t streamID = 0;
                if (pState->isSingleStream)
                {
                    processCutVerts = true;
                    streamID = pState->singleStreamID;
                    if (streamID != stream) continue;
                }
                else
                {
                    // early exit if this stream is not enabled for streamout
                    if (HasStreamOutT::value && !state.soState.streamEnable[stream])
                    {
                        continue;
                    }

                    // multi-stream output, need to translate StreamID buffer to a cut buffer
                    ProcessStreamIdBuffer(stream, pCutBase, numEmittedVerts, (uint8_t*)pStreamCutBuffer);
                    pCutBuffer = (uint8_t*)pStreamCutBuffer;
                    processCutVerts = false;
                }

                PA_STATE_CUT gsPa(pDC, pBase, numEmittedVerts, pCutBuffer, numEmittedVerts, numAttribs, pState->outputTopology, processCutVerts);

                while (gsPa.GetNextStreamOutput())
                {
                    do
                    {
                        bool assemble = gsPa.Assemble(VERTEX_POSITION_SLOT, attrib);

                        if (assemble)
                        {
                            totalPrimsGenerated += gsPa.NumPrims();

                            if (HasStreamOutT::value)
                            {
                                StreamOut(pDC, gsPa, workerId, pSoPrimData, stream);
                            }

                            // Only the stream selected for rasterization is clipped/binned.
                            if (HasRastT::value && state.soState.streamToRasterizer == stream)
                            {
                                simdscalari vPrimId;
                                // pull primitiveID from the GS output if available
                                if (state.gsState.emitsPrimitiveID)
                                {
                                    simdvector primIdAttrib[3];
                                    gsPa.Assemble(VERTEX_PRIMID_SLOT, primIdAttrib);
                                    vPrimId = _simd_castps_si(primIdAttrib[0].x);
                                }
                                else
                                {
                                    // Otherwise replicate the input prim's ID across lanes.
                                    vPrimId = _simd_set1_epi32(pPrimitiveId[inputPrim]);
                                }

                                pfnClipFunc(pDC, gsPa, workerId, attrib, GenMask(gsPa.NumPrims()), vPrimId);
                            }
                        }
                    } while (gsPa.NextPrim());
                }
            }
        }
    }

    // update GS pipeline stats
    UPDATE_STAT(GsInvocations, numInputPrims * pState->instanceCount);
    UPDATE_STAT(GsPrimitives, totalPrimsGenerated);

    RDTSC_STOP(FEGeometryShader, 1, 0);
}
874
875 //////////////////////////////////////////////////////////////////////////
876 /// @brief Allocate GS buffers
877 /// @param pDC - pointer to draw context.
878 /// @param state - API state
879 /// @param ppGsOut - pointer to GS output buffer allocation
880 /// @param ppCutBuffer - pointer to GS output cut buffer allocation
881 static INLINE void AllocateGsBuffers(DRAW_CONTEXT* pDC, const API_STATE& state, void** ppGsOut, void** ppCutBuffer,
882 void **ppStreamCutBuffer)
883 {
884 auto pArena = pDC->pArena;
885 SWR_ASSERT(pArena != nullptr);
886 SWR_ASSERT(state.gsState.gsEnable);
887 // allocate arena space to hold GS output verts
888 // @todo pack attribs
889 // @todo support multiple streams
890 const uint32_t vertexStride = sizeof(simdvertex);
891 const uint32_t numSimdBatches = (state.gsState.maxNumVerts + KNOB_SIMD_WIDTH - 1) / KNOB_SIMD_WIDTH;
892 uint32_t size = state.gsState.instanceCount * numSimdBatches * vertexStride * KNOB_SIMD_WIDTH;
893 *ppGsOut = pArena->AllocAligned(size, KNOB_SIMD_WIDTH * sizeof(float));
894
895 const uint32_t cutPrimStride = (state.gsState.maxNumVerts + 7) / 8;
896 const uint32_t streamIdPrimStride = AlignUp(state.gsState.maxNumVerts * 2 / 8, 4);
897 const uint32_t cutBufferSize = cutPrimStride * state.gsState.instanceCount * KNOB_SIMD_WIDTH;
898 const uint32_t streamIdSize = streamIdPrimStride * state.gsState.instanceCount * KNOB_SIMD_WIDTH;
899
900 // allocate arena space to hold cut or streamid buffer, which is essentially a bitfield sized to the
901 // maximum vertex output as defined by the GS state, per SIMD lane, per GS instance
902
903 // allocate space for temporary per-stream cut buffer if multi-stream is enabled
904 if (state.gsState.isSingleStream)
905 {
906 *ppCutBuffer = pArena->AllocAligned(cutBufferSize, KNOB_SIMD_WIDTH * sizeof(float));
907 *ppStreamCutBuffer = nullptr;
908 }
909 else
910 {
911 *ppCutBuffer = pArena->AllocAligned(streamIdSize, KNOB_SIMD_WIDTH * sizeof(float));
912 *ppStreamCutBuffer = pArena->AllocAligned(cutBufferSize, KNOB_SIMD_WIDTH * sizeof(float));
913 }
914
915 }
916
//////////////////////////////////////////////////////////////////////////
/// @brief Contains all data generated by the HS and passed to the
/// tessellator and DS.
struct TessellationThreadLocalData
{
    SWR_HS_CONTEXT hsContext;               // input/output context for the hull shader
    ScalarPatch patchData[KNOB_SIMD_WIDTH]; // HS control point output, one patch per SIMD lane
    void* pTxCtx;                           // tessellator context storage, lazily allocated on first use
    size_t tsCtxSize;                       // size in bytes of the tessellator context storage

    simdscalar* pDSOutput;                  // domain shader output buffer, grown on demand
    size_t numDSOutputVectors;              // current capacity of pDSOutput, in simd vectors
};

// Per-worker-thread tessellation scratch data; allocated on first use by
// AllocateTessellationData.
THREAD TessellationThreadLocalData* gt_pTessellationThreadData = nullptr;
932
933 //////////////////////////////////////////////////////////////////////////
934 /// @brief Allocate tessellation data for this worker thread.
935 INLINE
936 static void AllocateTessellationData(SWR_CONTEXT* pContext)
937 {
938 /// @TODO - Don't use thread local storage. Use Worker local storage instead.
939 if (gt_pTessellationThreadData == nullptr)
940 {
941 gt_pTessellationThreadData = (TessellationThreadLocalData*)
942 AlignedMalloc(sizeof(TessellationThreadLocalData), 64);
943 memset(gt_pTessellationThreadData, 0, sizeof(*gt_pTessellationThreadData));
944 }
945 }
946
//////////////////////////////////////////////////////////////////////////
/// @brief Implements Tessellation Stages.
/// @param pDC - pointer to draw context.
/// @param workerId - thread's worker id. Even thread has a unique id.
/// @param pa - The primitive assembly object.
/// @param pGsOut - output stream for GS
/// @param pCutBuffer - cut buffer for the (optional) downstream GS stage
/// @param pCutStreamBuffer - per-stream cut buffer for multi-stream GS
/// @param pSoPrimData - scratch space for stream-out primitive data
/// @param primID - SIMD vector of primitive IDs for the input patches
template <
    typename HasGeometryShaderT,
    typename HasStreamOutT,
    typename HasRastT>
static void TessellationStages(
    DRAW_CONTEXT *pDC,
    uint32_t workerId,
    PA_STATE& pa,
    void* pGsOut,
    void* pCutBuffer,
    void* pCutStreamBuffer,
    uint32_t* pSoPrimData,
    simdscalari primID)
{
    const API_STATE& state = GetApiState(pDC);
    const SWR_TS_STATE& tsState = state.tsState;
    SWR_CONTEXT *pContext = pDC->pContext; // Needed for UPDATE_STATS macro

    SWR_ASSERT(gt_pTessellationThreadData);

    HANDLE tsCtx = TSInitCtx(
        tsState.domain,
        tsState.partitioning,
        tsState.tsOutputTopology,
        gt_pTessellationThreadData->pTxCtx,
        gt_pTessellationThreadData->tsCtxSize);
    if (tsCtx == nullptr)
    {
        // TSInitCtx failed with the current thread-local storage (presumably
        // too small or not yet allocated); allocate tsCtxSize bytes and retry
        gt_pTessellationThreadData->pTxCtx = AlignedMalloc(gt_pTessellationThreadData->tsCtxSize, 64);
        tsCtx = TSInitCtx(
            tsState.domain,
            tsState.partitioning,
            tsState.tsOutputTopology,
            gt_pTessellationThreadData->pTxCtx,
            gt_pTessellationThreadData->tsCtxSize);
    }
    SWR_ASSERT(tsCtx);

    // pick the binner/clipper for the post-DS topology (only needed when rasterizing)
    PFN_PROCESS_PRIMS pfnClipFunc = nullptr;
    if (HasRastT::value)
    {
        switch (tsState.postDSTopology)
        {
        case TOP_TRIANGLE_LIST: pfnClipFunc = ClipTriangles; break;
        case TOP_LINE_LIST:     pfnClipFunc = ClipLines; break;
        case TOP_POINT_LIST:    pfnClipFunc = ClipPoints; break;
        default: SWR_ASSERT(false, "Unexpected DS output topology: %d", tsState.postDSTopology);
        }
    }

    SWR_HS_CONTEXT& hsContext = gt_pTessellationThreadData->hsContext;
    hsContext.pCPout = gt_pTessellationThreadData->patchData;
    hsContext.PrimitiveID = primID;

    uint32_t numVertsPerPrim = NumVertsPerPrim(pa.binTopology, false);
    // Max storage for one attribute for an entire simdprimitive
    simdvector simdattrib[MAX_NUM_VERTS_PER_PRIM];

    // assemble all attributes for the input primitives
    for (uint32_t slot = 0; slot < tsState.numHsInputAttribs; ++slot)
    {
        uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + slot;
        pa.Assemble(attribSlot, simdattrib);

        for (uint32_t i = 0; i < numVertsPerPrim; ++i)
        {
            hsContext.vert[i].attrib[attribSlot] = simdattrib[i];
        }
    }

#if defined(_DEBUG)
    // poison the HS output so stale reads are obvious in debug builds
    memset(hsContext.pCPout, 0x90, sizeof(ScalarPatch) * KNOB_SIMD_WIDTH);
#endif

    uint32_t numPrims = pa.NumPrims();
    hsContext.mask = GenerateMask(numPrims);

    // Run the HS
    RDTSC_START(FEHullShader);
    state.pfnHsFunc(GetPrivateState(pDC), &hsContext);
    RDTSC_STOP(FEHullShader, 0, 0);

    UPDATE_STAT(HsInvocations, numPrims);

    const uint32_t* pPrimId = (const uint32_t*)&primID;

    // tessellate and domain-shade each patch (one per active SIMD lane)
    for (uint32_t p = 0; p < numPrims; ++p)
    {
        // Run Tessellator
        SWR_TS_TESSELLATED_DATA tsData = { 0 };
        RDTSC_START(FETessellation);
        TSTessellate(tsCtx, hsContext.pCPout[p].tessFactors, tsData);
        RDTSC_STOP(FETessellation, 0, 0);

        if (tsData.NumPrimitives == 0)
        {
            // patch fully culled by the tessellator (e.g. zero tess factors)
            continue;
        }
        SWR_ASSERT(tsData.NumDomainPoints);

        // Allocate DS Output memory
        uint32_t requiredDSVectorInvocations = AlignUp(tsData.NumDomainPoints, KNOB_SIMD_WIDTH) / KNOB_SIMD_WIDTH;
        size_t requiredDSOutputVectors = requiredDSVectorInvocations * tsState.numDsOutputAttribs;
        size_t requiredAllocSize = sizeof(simdvector) * requiredDSOutputVectors;
        if (requiredDSOutputVectors > gt_pTessellationThreadData->numDSOutputVectors)
        {
            // grow-only buffer reused across patches/draws on this thread
            AlignedFree(gt_pTessellationThreadData->pDSOutput);
            gt_pTessellationThreadData->pDSOutput = (simdscalar*)AlignedMalloc(requiredAllocSize, 64);
            gt_pTessellationThreadData->numDSOutputVectors = requiredDSOutputVectors;
        }
        SWR_ASSERT(gt_pTessellationThreadData->pDSOutput);
        SWR_ASSERT(gt_pTessellationThreadData->numDSOutputVectors >= requiredDSOutputVectors);

#if defined(_DEBUG)
        memset(gt_pTessellationThreadData->pDSOutput, 0x90, requiredAllocSize);
#endif

        // Run Domain Shader
        SWR_DS_CONTEXT dsContext;
        dsContext.PrimitiveID = pPrimId[p];
        dsContext.pCpIn = &hsContext.pCPout[p];
        dsContext.pDomainU = (simdscalar*)tsData.pDomainPointsU;
        dsContext.pDomainV = (simdscalar*)tsData.pDomainPointsV;
        dsContext.pOutputData = gt_pTessellationThreadData->pDSOutput;
        dsContext.vectorStride = requiredDSVectorInvocations;

        uint32_t dsInvocations = 0;

        for (dsContext.vectorOffset = 0; dsContext.vectorOffset < requiredDSVectorInvocations; ++dsContext.vectorOffset)
        {
            // mask off lanes past the last domain point in the final iteration
            dsContext.mask = GenerateMask(tsData.NumDomainPoints - dsInvocations);

            RDTSC_START(FEDomainShader);
            state.pfnDsFunc(GetPrivateState(pDC), &dsContext);
            RDTSC_STOP(FEDomainShader, 0, 0);

            dsInvocations += KNOB_SIMD_WIDTH;
        }
        UPDATE_STAT(DsInvocations, tsData.NumDomainPoints);

        // assemble tessellator output into primitives and feed the rest of the FE
        PA_TESS tessPa(
            pDC,
            dsContext.pOutputData,
            dsContext.vectorStride,
            tsState.numDsOutputAttribs,
            tsData.ppIndices,
            tsData.NumPrimitives,
            tsState.postDSTopology);

        while (tessPa.HasWork())
        {
            if (HasGeometryShaderT::value)
            {
                GeometryShaderStage<HasStreamOutT, HasRastT>(
                    pDC, workerId, tessPa, pGsOut, pCutBuffer, pCutStreamBuffer, pSoPrimData,
                    _simd_set1_epi32(dsContext.PrimitiveID));
            }
            else
            {
                if (HasStreamOutT::value)
                {
                    StreamOut(pDC, tessPa, workerId, pSoPrimData, 0);
                }

                if (HasRastT::value)
                {
                    simdvector prim[3]; // Only deal with triangles, lines, or points
                    RDTSC_START(FEPAAssemble);
#if SWR_ENABLE_ASSERTS
                    bool assemble =
#endif
                        tessPa.Assemble(VERTEX_POSITION_SLOT, prim);
                    RDTSC_STOP(FEPAAssemble, 1, 0);
                    SWR_ASSERT(assemble);

                    SWR_ASSERT(pfnClipFunc);
                    pfnClipFunc(pDC, tessPa, workerId, prim,
                        GenMask(tessPa.NumPrims()), _simd_set1_epi32(dsContext.PrimitiveID));
                }
            }

            tessPa.NextPrim();

        } // while (tessPa.HasWork())
    } // for (uint32_t p = 0; p < numPrims; ++p)

    TSDestroyCtx(tsCtx);
}
1141
//////////////////////////////////////////////////////////////////////////
/// @brief FE handler for SwrDraw.
/// @tparam IsIndexedT - Is indexed drawing enabled
/// @tparam IsCutIndexEnabledT - Is the cut index (primitive restart) enabled
/// @tparam HasTessellationT - Is tessellation enabled
/// @tparam HasGeometryShaderT - Is the geometry shader stage enabled
/// @tparam HasStreamOutT - Is stream-out enabled
/// @tparam HasRastT - Is rasterization enabled
/// @param pContext - pointer to SWR context.
/// @param pDC - pointer to draw context.
/// @param workerId - thread's worker id.
/// @param pUserData - Pointer to DRAW_WORK
template <
    typename IsIndexedT,
    typename IsCutIndexEnabledT,
    typename HasTessellationT,
    typename HasGeometryShaderT,
    typename HasStreamOutT,
    typename HasRastT>
void ProcessDraw(
    SWR_CONTEXT *pContext,
    DRAW_CONTEXT *pDC,
    uint32_t workerId,
    void *pUserData)
{

#if KNOB_ENABLE_TOSS_POINTS
    if (KNOB_TOSS_QUEUE_FE)
    {
        return;
    }
#endif

    RDTSC_START(FEProcessDraw);

    DRAW_WORK& work = *(DRAW_WORK*)pUserData;
    const API_STATE& state = GetApiState(pDC);
    __m256i vScale = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); // per-lane vertex index offsets 0..7
    SWR_VS_CONTEXT vsContext;
    simdvertex vin;

    int indexSize = 0;
    uint32_t endVertex = work.numVerts;

    // for indexed draws, compute the address just past the last index requested
    // (used below to clamp fetchInfo.pLastIndex)
    const int32_t* pLastRequestedIndex = nullptr;
    if (IsIndexedT::value)
    {
        switch (work.type)
        {
        case R32_UINT:
            indexSize = sizeof(uint32_t);
            pLastRequestedIndex = &(work.pIB[endVertex]);
            break;
        case R16_UINT:
            indexSize = sizeof(uint16_t);
            // nasty address offset to last index
            pLastRequestedIndex = (int32_t*)(&(((uint16_t*)work.pIB)[endVertex]));
            break;
        case R8_UINT:
            indexSize = sizeof(uint8_t);
            // nasty address offset to last index
            pLastRequestedIndex = (int32_t*)(&(((uint8_t*)work.pIB)[endVertex]));
            break;
        default:
            SWR_ASSERT(0);
        }
    }
    else
    {
        // No cuts, prune partial primitives.
        endVertex = GetNumVerts(state.topology, GetNumPrims(state.topology, work.numVerts));
    }

    SWR_FETCH_CONTEXT fetchInfo = { 0 };
    fetchInfo.pStreams = &state.vertexBuffers[0];
    fetchInfo.StartInstance = work.startInstance;
    fetchInfo.StartVertex = 0;

    vsContext.pVin = &vin;

    if (IsIndexedT::value)
    {
        fetchInfo.BaseVertex = work.baseVertex;

        // if the entire index buffer isn't being consumed, set the last index
        // so that fetches < a SIMD wide will be masked off
        fetchInfo.pLastIndex = (const int32_t*)(((uint8_t*)state.indexBuffer.pIndices) + state.indexBuffer.size);
        if (pLastRequestedIndex < fetchInfo.pLastIndex)
        {
            fetchInfo.pLastIndex = pLastRequestedIndex;
        }
    }
    else
    {
        fetchInfo.StartVertex = work.startVertex;
    }

#ifdef KNOB_ENABLE_RDTSC
    uint32_t numPrims = GetNumPrims(state.topology, work.numVerts);
#endif

    void* pGsOut = nullptr;
    void* pCutBuffer = nullptr;
    void* pStreamCutBuffer = nullptr;
    if (HasGeometryShaderT::value)
    {
        AllocateGsBuffers(pDC, state, &pGsOut, &pCutBuffer, &pStreamCutBuffer);
    }

    if (HasTessellationT::value)
    {
        SWR_ASSERT(state.tsState.tsEnable == true);
        SWR_ASSERT(state.pfnHsFunc != nullptr);
        SWR_ASSERT(state.pfnDsFunc != nullptr);

        AllocateTessellationData(pContext);
    }
    else
    {
        SWR_ASSERT(state.tsState.tsEnable == false);
        SWR_ASSERT(state.pfnHsFunc == nullptr);
        SWR_ASSERT(state.pfnDsFunc == nullptr);
    }

    // allocate space for streamout input prim data
    uint32_t* pSoPrimData = nullptr;
    if (HasStreamOutT::value)
    {
        pSoPrimData = (uint32_t*)pDC->pArena->AllocAligned(4096, 16);
    }

    // choose primitive assembler
    PA_FACTORY<IsIndexedT, IsCutIndexEnabledT> paFactory(pDC, state.topology, work.numVerts);
    PA_STATE& pa = paFactory.GetPA();

    /// @todo: temporarily move instance loop in the FE to ensure SO ordering
    for (uint32_t instanceNum = 0; instanceNum < work.numInstances; instanceNum++)
    {
        simdscalari vIndex;
        uint32_t i = 0; // vertices processed so far this instance

        if (IsIndexedT::value)
        {
            fetchInfo.pIndices = work.pIB;
        }
        else
        {
            // non-indexed: synthesize sequential indices starting at startVertexID
            vIndex = _simd_add_epi32(_simd_set1_epi32(work.startVertexID), vScale);
            fetchInfo.pIndices = (const int32_t*)&vIndex;
        }

        fetchInfo.CurInstance = instanceNum;
        vsContext.InstanceID = instanceNum;

        while (pa.HasWork())
        {
            // PaGetNextVsOutput currently has the side effect of updating some PA state machine state.
            // So we need to keep this outside of (i < endVertex) check.
            simdmask* pvCutIndices = nullptr;
            if (IsIndexedT::value)
            {
                pvCutIndices = &pa.GetNextVsIndices();
            }

            simdvertex& vout = pa.GetNextVsOutput();
            vsContext.pVout = &vout;

            if (i < endVertex)
            {

                // 1. Execute FS/VS for a single SIMD.
                RDTSC_START(FEFetchShader);
                state.pfnFetchFunc(fetchInfo, vin);
                RDTSC_STOP(FEFetchShader, 0, 0);

                // forward fetch generated vertex IDs to the vertex shader
                vsContext.VertexID = fetchInfo.VertexID;

                // Setup active mask for vertex shader.
                vsContext.mask = GenerateMask(endVertex - i);

                // forward cut mask to the PA
                if (IsIndexedT::value)
                {
                    *pvCutIndices = _simd_movemask_ps(_simd_castsi_ps(fetchInfo.CutMask));
                }

                UPDATE_STAT(IaVertices, GetNumInvocations(i, endVertex));

#if KNOB_ENABLE_TOSS_POINTS
                if (!KNOB_TOSS_FETCH)
#endif
                {
                    RDTSC_START(FEVertexShader);
                    state.pfnVertexFunc(GetPrivateState(pDC), &vsContext);
                    RDTSC_STOP(FEVertexShader, 0, 0);

                    UPDATE_STAT(VsInvocations, GetNumInvocations(i, endVertex));
                }
            }

            // 2. Assemble primitives given the last two SIMD.
            do
            {
                simdvector prim[MAX_NUM_VERTS_PER_PRIM];
                // PaAssemble returns false if there is not enough verts to assemble.
                RDTSC_START(FEPAAssemble);
                bool assemble = pa.Assemble(VERTEX_POSITION_SLOT, prim);
                RDTSC_STOP(FEPAAssemble, 1, 0);

#if KNOB_ENABLE_TOSS_POINTS
                if (!KNOB_TOSS_FETCH)
#endif
                {
#if KNOB_ENABLE_TOSS_POINTS
                    if (!KNOB_TOSS_VS)
#endif
                    {
                        if (assemble)
                        {
                            UPDATE_STAT(IaPrimitives, pa.NumPrims());

                            if (HasTessellationT::value)
                            {
                                TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>(
                                    pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, pa.GetPrimID(work.startPrimID));
                            }
                            else if (HasGeometryShaderT::value)
                            {
                                GeometryShaderStage<HasStreamOutT, HasRastT>(
                                    pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, pa.GetPrimID(work.startPrimID));
                            }
                            else
                            {
                                // If streamout is enabled then stream vertices out to memory.
                                if (HasStreamOutT::value)
                                {
                                    StreamOut(pDC, pa, workerId, pSoPrimData, 0);
                                }

                                if (HasRastT::value)
                                {
                                    SWR_ASSERT(pDC->pState->pfnProcessPrims);
                                    pDC->pState->pfnProcessPrims(pDC, pa, workerId, prim,
                                        GenMask(pa.NumPrims()), pa.GetPrimID(work.startPrimID));
                                }
                            }
                        }
                    }
                }
            } while (pa.NextPrim());

            // advance to the next SIMD's worth of vertices
            i += KNOB_SIMD_WIDTH;
            if (IsIndexedT::value)
            {
                fetchInfo.pIndices = (int*)((uint8_t*)fetchInfo.pIndices + KNOB_SIMD_WIDTH * indexSize);
            }
            else
            {
                vIndex = _simd_add_epi32(vIndex, _simd_set1_epi32(KNOB_SIMD_WIDTH));
            }
        }
        pa.Reset();
    }

    RDTSC_STOP(FEProcessDraw, numPrims * work.numInstances, pDC->drawId);
}
1408
//////////////////////////////////////////////////////////////////////////
/// @brief Functor used with TemplateArgUnroller to resolve runtime boolean
///        draw options into a concrete ProcessDraw template instantiation.
struct FEDrawChooser
{
    typedef PFN_FE_WORK_FUNC FuncType;

    template <typename... ArgsB>
    static FuncType GetFunc()
    {
        return ProcessDraw<ArgsB...>;
    }
};
1419
1420
// Selector for correct templated Draw front-end function; maps the runtime
// booleans onto the matching ProcessDraw instantiation via FEDrawChooser.
PFN_FE_WORK_FUNC GetProcessDrawFunc(
    bool IsIndexed,
    bool IsCutIndexEnabled,
    bool HasTessellation,
    bool HasGeometryShader,
    bool HasStreamOut,
    bool HasRasterization)
{
    return TemplateArgUnroller<FEDrawChooser>::GetFunc(IsIndexed, IsCutIndexEnabled, HasTessellation, HasGeometryShader, HasStreamOut, HasRasterization);
}
1432
1433 //////////////////////////////////////////////////////////////////////////
1434 /// @brief Processes attributes for the backend based on linkage mask and
1435 /// linkage map. Essentially just doing an SOA->AOS conversion and pack.
1436 /// @param pDC - Draw context
1437 /// @param pa - Primitive Assembly state
1438 /// @param linkageMask - Specifies which VS outputs are routed to PS.
1439 /// @param pLinkageMap - maps VS attribute slot to PS slot
1440 /// @param triIndex - Triangle to process attributes for
1441 /// @param pBuffer - Output result
1442 template<typename NumVertsT, typename IsSwizzledT, typename HasConstantInterpT, typename IsDegenerate>
1443 INLINE void ProcessAttributes(
1444 DRAW_CONTEXT *pDC,
1445 PA_STATE&pa,
1446 uint32_t triIndex,
1447 uint32_t primId,
1448 float *pBuffer)
1449 {
1450 static_assert(NumVertsT::value > 0 && NumVertsT::value <= 3, "Invalid value for NumVertsT");
1451 const SWR_BACKEND_STATE& backendState = pDC->pState->state.backendState;
1452 // Conservative Rasterization requires degenerate tris to have constant attribute interpolation
1453 LONG constantInterpMask = IsDegenerate::value ? 0xFFFFFFFF : backendState.constantInterpolationMask;
1454 const uint32_t provokingVertex = pDC->pState->state.frontendState.topologyProvokingVertex;
1455 const PRIMITIVE_TOPOLOGY topo = pDC->pState->state.topology;
1456
1457 static const float constTable[3][4] = {
1458 {0.0f, 0.0f, 0.0f, 0.0f},
1459 {0.0f, 0.0f, 0.0f, 1.0f},
1460 {1.0f, 1.0f, 1.0f, 1.0f}
1461 };
1462
1463 for (uint32_t i = 0; i < backendState.numAttributes; ++i)
1464 {
1465 uint32_t inputSlot;
1466 if (IsSwizzledT::value)
1467 {
1468 SWR_ATTRIB_SWIZZLE attribSwizzle = backendState.swizzleMap[i];
1469 inputSlot = VERTEX_ATTRIB_START_SLOT + attribSwizzle.sourceAttrib;
1470
1471 }
1472 else
1473 {
1474 inputSlot = VERTEX_ATTRIB_START_SLOT + i;
1475 }
1476
1477 __m128 attrib[3]; // triangle attribs (always 4 wide)
1478 float* pAttribStart = pBuffer;
1479
1480 if (HasConstantInterpT::value || IsDegenerate::value)
1481 {
1482 if (_bittest(&constantInterpMask, i))
1483 {
1484 uint32_t vid;
1485 uint32_t adjustedTriIndex;
1486 static const uint32_t tristripProvokingVertex[] = { 0, 2, 1 };
1487 static const int32_t quadProvokingTri[2][4] = { {0, 0, 0, 1}, {0, -1, 0, 0} };
1488 static const uint32_t quadProvokingVertex[2][4] = { {0, 1, 2, 2}, {0, 1, 1, 2} };
1489 static const int32_t qstripProvokingTri[2][4] = { {0, 0, 0, 1}, {-1, 0, 0, 0} };
1490 static const uint32_t qstripProvokingVertex[2][4] = { {0, 1, 2, 1}, {0, 0, 2, 1} };
1491
1492 switch (topo) {
1493 case TOP_QUAD_LIST:
1494 adjustedTriIndex = triIndex + quadProvokingTri[triIndex & 1][provokingVertex];
1495 vid = quadProvokingVertex[triIndex & 1][provokingVertex];
1496 break;
1497 case TOP_QUAD_STRIP:
1498 adjustedTriIndex = triIndex + qstripProvokingTri[triIndex & 1][provokingVertex];
1499 vid = qstripProvokingVertex[triIndex & 1][provokingVertex];
1500 break;
1501 case TOP_TRIANGLE_STRIP:
1502 adjustedTriIndex = triIndex;
1503 vid = (triIndex & 1)
1504 ? tristripProvokingVertex[provokingVertex]
1505 : provokingVertex;
1506 break;
1507 default:
1508 adjustedTriIndex = triIndex;
1509 vid = provokingVertex;
1510 break;
1511 }
1512
1513 pa.AssembleSingle(inputSlot, adjustedTriIndex, attrib);
1514
1515 for (uint32_t i = 0; i < NumVertsT::value; ++i)
1516 {
1517 _mm_store_ps(pBuffer, attrib[vid]);
1518 pBuffer += 4;
1519 }
1520 }
1521 else
1522 {
1523 pa.AssembleSingle(inputSlot, triIndex, attrib);
1524
1525 for (uint32_t i = 0; i < NumVertsT::value; ++i)
1526 {
1527 _mm_store_ps(pBuffer, attrib[i]);
1528 pBuffer += 4;
1529 }
1530 }
1531 }
1532 else
1533 {
1534 pa.AssembleSingle(inputSlot, triIndex, attrib);
1535
1536 for (uint32_t i = 0; i < NumVertsT::value; ++i)
1537 {
1538 _mm_store_ps(pBuffer, attrib[i]);
1539 pBuffer += 4;
1540 }
1541 }
1542
1543 // pad out the attrib buffer to 3 verts to ensure the triangle
1544 // interpolation code in the pixel shader works correctly for the
1545 // 3 topologies - point, line, tri. This effectively zeros out the
1546 // effect of the missing vertices in the triangle interpolation.
1547 for (uint32_t v = NumVertsT::value; v < 3; ++v)
1548 {
1549 _mm_store_ps(pBuffer, attrib[NumVertsT::value - 1]);
1550 pBuffer += 4;
1551 }
1552
1553 // check for constant source overrides
1554 if (IsSwizzledT::value)
1555 {
1556 uint32_t mask = backendState.swizzleMap[i].componentOverrideMask;
1557 if (mask)
1558 {
1559 DWORD comp;
1560 while (_BitScanForward(&comp, mask))
1561 {
1562 mask &= ~(1 << comp);
1563
1564 float constantValue = 0.0f;
1565 switch ((SWR_CONSTANT_SOURCE)backendState.swizzleMap[i].constantSource)
1566 {
1567 case SWR_CONSTANT_SOURCE_CONST_0000:
1568 case SWR_CONSTANT_SOURCE_CONST_0001_FLOAT:
1569 case SWR_CONSTANT_SOURCE_CONST_1111_FLOAT:
1570 constantValue = constTable[backendState.swizzleMap[i].constantSource][comp];
1571 break;
1572 case SWR_CONSTANT_SOURCE_PRIM_ID:
1573 constantValue = *(float*)&primId;
1574 break;
1575 }
1576
1577 // apply constant value to all 3 vertices
1578 for (uint32_t v = 0; v < 3; ++v)
1579 {
1580 pAttribStart[comp + v * 4] = constantValue;
1581 }
1582 }
1583 }
1584 }
1585 }
1586 }
1587
1588
// Function pointer type shared by all ProcessAttributes instantiations.
typedef void(*PFN_PROCESS_ATTRIBUTES)(DRAW_CONTEXT*, PA_STATE&, uint32_t, uint32_t, float*);

//////////////////////////////////////////////////////////////////////////
/// @brief Functor used with TemplateArgUnroller to resolve runtime options
///        into a concrete ProcessAttributes template instantiation.
struct ProcessAttributesChooser
{
    typedef PFN_PROCESS_ATTRIBUTES FuncType;

    template <typename... ArgsB>
    static FuncType GetFunc()
    {
        return ProcessAttributes<ArgsB...>;
    }
};
1601
// Selects the ProcessAttributes instantiation matching the given runtime
// options; NumVerts must be in [1, 3] (enforced by IntArg<1, 3>).
PFN_PROCESS_ATTRIBUTES GetProcessAttributesFunc(uint32_t NumVerts, bool IsSwizzled, bool HasConstantInterp, bool IsDegenerate = false)
{
    return TemplateArgUnroller<ProcessAttributesChooser>::GetFunc(IntArg<1, 3>{NumVerts}, IsSwizzled, HasConstantInterp, IsDegenerate);
}
1606
1607 //////////////////////////////////////////////////////////////////////////
1608 /// @brief Processes enabled user clip distances. Loads the active clip
1609 /// distances from the PA, sets up barycentric equations, and
1610 /// stores the results to the output buffer
1611 /// @param pa - Primitive Assembly state
1612 /// @param primIndex - primitive index to process
1613 /// @param clipDistMask - mask of enabled clip distances
1614 /// @param pUserClipBuffer - buffer to store results
1615 template<uint32_t NumVerts>
1616 void ProcessUserClipDist(PA_STATE& pa, uint32_t primIndex, uint8_t clipDistMask, float* pUserClipBuffer)
1617 {
1618 DWORD clipDist;
1619 while (_BitScanForward(&clipDist, clipDistMask))
1620 {
1621 clipDistMask &= ~(1 << clipDist);
1622 uint32_t clipSlot = clipDist >> 2;
1623 uint32_t clipComp = clipDist & 0x3;
1624 uint32_t clipAttribSlot = clipSlot == 0 ?
1625 VERTEX_CLIPCULL_DIST_LO_SLOT : VERTEX_CLIPCULL_DIST_HI_SLOT;
1626
1627 __m128 primClipDist[3];
1628 pa.AssembleSingle(clipAttribSlot, primIndex, primClipDist);
1629
1630 float vertClipDist[NumVerts];
1631 for (uint32_t e = 0; e < NumVerts; ++e)
1632 {
1633 OSALIGNSIMD(float) aVertClipDist[4];
1634 _mm_store_ps(aVertClipDist, primClipDist[e]);
1635 vertClipDist[e] = aVertClipDist[clipComp];
1636 };
1637
1638 // setup plane equations for barycentric interpolation in the backend
1639 float baryCoeff[NumVerts];
1640 for (uint32_t e = 0; e < NumVerts - 1; ++e)
1641 {
1642 baryCoeff[e] = vertClipDist[e] - vertClipDist[NumVerts - 1];
1643 }
1644 baryCoeff[NumVerts - 1] = vertClipDist[NumVerts - 1];
1645
1646 for (uint32_t e = 0; e < NumVerts; ++e)
1647 {
1648 *(pUserClipBuffer++) = baryCoeff[e];
1649 }
1650 }
1651 }
1652
1653 //////////////////////////////////////////////////////////////////////////
1654 /// @brief Convert the X,Y coords of a triangle to the requested Fixed
1655 /// Point precision from FP32.
1656 template <typename PT = FixedPointTraits<Fixed_16_8>>
1657 INLINE simdscalari fpToFixedPointVertical(const simdscalar vIn)
1658 {
1659 simdscalar vFixed = _simd_mul_ps(vIn, _simd_set1_ps(PT::ScaleT::value));
1660 return _simd_cvtps_epi32(vFixed);
1661 }
1662
1663 //////////////////////////////////////////////////////////////////////////
1664 /// @brief Helper function to set the X,Y coords of a triangle to the
1665 /// requested Fixed Point precision from FP32.
1666 /// @param tri: simdvector[3] of FP triangle verts
1667 /// @param vXi: fixed point X coords of tri verts
1668 /// @param vYi: fixed point Y coords of tri verts
1669 INLINE static void FPToFixedPoint(const simdvector * const tri, simdscalari (&vXi)[3], simdscalari (&vYi)[3])
1670 {
1671 vXi[0] = fpToFixedPointVertical(tri[0].x);
1672 vYi[0] = fpToFixedPointVertical(tri[0].y);
1673 vXi[1] = fpToFixedPointVertical(tri[1].x);
1674 vYi[1] = fpToFixedPointVertical(tri[1].y);
1675 vXi[2] = fpToFixedPointVertical(tri[2].x);
1676 vYi[2] = fpToFixedPointVertical(tri[2].y);
1677 }
1678
1679 //////////////////////////////////////////////////////////////////////////
1680 /// @brief Calculate bounding box for current triangle
1681 /// @tparam CT: ConservativeRastFETraits type
1682 /// @param vX: fixed point X position for triangle verts
1683 /// @param vY: fixed point Y position for triangle verts
1684 /// @param bbox: fixed point bbox
1685 /// *Note*: expects vX, vY to be in the correct precision for the type
1686 /// of rasterization. This avoids unnecessary FP->fixed conversions.
1687 template <typename CT>
1688 INLINE void calcBoundingBoxIntVertical(const simdvector * const tri, simdscalari (&vX)[3], simdscalari (&vY)[3], simdBBox &bbox)
1689 {
1690 simdscalari vMinX = vX[0];
1691 vMinX = _simd_min_epi32(vMinX, vX[1]);
1692 vMinX = _simd_min_epi32(vMinX, vX[2]);
1693
1694 simdscalari vMaxX = vX[0];
1695 vMaxX = _simd_max_epi32(vMaxX, vX[1]);
1696 vMaxX = _simd_max_epi32(vMaxX, vX[2]);
1697
1698 simdscalari vMinY = vY[0];
1699 vMinY = _simd_min_epi32(vMinY, vY[1]);
1700 vMinY = _simd_min_epi32(vMinY, vY[2]);
1701
1702 simdscalari vMaxY = vY[0];
1703 vMaxY = _simd_max_epi32(vMaxY, vY[1]);
1704 vMaxY = _simd_max_epi32(vMaxY, vY[2]);
1705
1706 bbox.left = vMinX;
1707 bbox.right = vMaxX;
1708 bbox.top = vMinY;
1709 bbox.bottom = vMaxY;
1710 }
1711
1712 //////////////////////////////////////////////////////////////////////////
1713 /// @brief FEConservativeRastT specialization of calcBoundingBoxIntVertical
1714 /// Offsets BBox for conservative rast
1715 template <>
1716 INLINE void calcBoundingBoxIntVertical<FEConservativeRastT>(const simdvector * const tri, simdscalari (&vX)[3], simdscalari (&vY)[3], simdBBox &bbox)
1717 {
1718 // FE conservative rast traits
1719 typedef FEConservativeRastT CT;
1720
1721 simdscalari vMinX = vX[0];
1722 vMinX = _simd_min_epi32(vMinX, vX[1]);
1723 vMinX = _simd_min_epi32(vMinX, vX[2]);
1724
1725 simdscalari vMaxX = vX[0];
1726 vMaxX = _simd_max_epi32(vMaxX, vX[1]);
1727 vMaxX = _simd_max_epi32(vMaxX, vX[2]);
1728
1729 simdscalari vMinY = vY[0];
1730 vMinY = _simd_min_epi32(vMinY, vY[1]);
1731 vMinY = _simd_min_epi32(vMinY, vY[2]);
1732
1733 simdscalari vMaxY = vY[0];
1734 vMaxY = _simd_max_epi32(vMaxY, vY[1]);
1735 vMaxY = _simd_max_epi32(vMaxY, vY[2]);
1736
1737 /// Bounding box needs to be expanded by 1/512 before snapping to 16.8 for conservative rasterization
1738 /// expand bbox by 1/256; coverage will be correctly handled in the rasterizer.
1739 bbox.left = _simd_sub_epi32(vMinX, _simd_set1_epi32(CT::BoundingBoxOffsetT::value));
1740 bbox.right = _simd_add_epi32(vMaxX, _simd_set1_epi32(CT::BoundingBoxOffsetT::value));
1741 bbox.top = _simd_sub_epi32(vMinY, _simd_set1_epi32(CT::BoundingBoxOffsetT::value));
1742 bbox.bottom = _simd_add_epi32(vMaxY, _simd_set1_epi32(CT::BoundingBoxOffsetT::value));
1743 }
1744
1745 //////////////////////////////////////////////////////////////////////////
1746 /// @brief Bin triangle primitives to macro tiles. Performs setup, clipping
1747 /// culling, viewport transform, etc.
1748 /// @param pDC - pointer to draw context.
1749 /// @param pa - The primitive assembly object.
1750 /// @param workerId - thread's worker id. Even thread has a unique id.
1751 /// @param tri - Contains triangle position data for SIMDs worth of triangles.
1752 /// @param primID - Primitive ID for each triangle.
1753 /// @tparam CT - ConservativeRastFETraits
1754 template <typename CT>
1755 void BinTriangles(
1756 DRAW_CONTEXT *pDC,
1757 PA_STATE& pa,
1758 uint32_t workerId,
1759 simdvector tri[3],
1760 uint32_t triMask,
1761 simdscalari primID)
1762 {
1763 RDTSC_START(FEBinTriangles);
1764
1765 const API_STATE& state = GetApiState(pDC);
1766 const SWR_RASTSTATE& rastState = state.rastState;
1767 const SWR_FRONTEND_STATE& feState = state.frontendState;
1768 const SWR_GS_STATE& gsState = state.gsState;
1769 MacroTileMgr *pTileMgr = pDC->pTileMgr;
1770
1771
1772 simdscalar vRecipW0 = _simd_set1_ps(1.0f);
1773 simdscalar vRecipW1 = _simd_set1_ps(1.0f);
1774 simdscalar vRecipW2 = _simd_set1_ps(1.0f);
1775
1776 if (!feState.vpTransformDisable)
1777 {
1778 // perspective divide
1779 vRecipW0 = _simd_div_ps(_simd_set1_ps(1.0f), tri[0].w);
1780 vRecipW1 = _simd_div_ps(_simd_set1_ps(1.0f), tri[1].w);
1781 vRecipW2 = _simd_div_ps(_simd_set1_ps(1.0f), tri[2].w);
1782
1783 tri[0].v[0] = _simd_mul_ps(tri[0].v[0], vRecipW0);
1784 tri[1].v[0] = _simd_mul_ps(tri[1].v[0], vRecipW1);
1785 tri[2].v[0] = _simd_mul_ps(tri[2].v[0], vRecipW2);
1786
1787 tri[0].v[1] = _simd_mul_ps(tri[0].v[1], vRecipW0);
1788 tri[1].v[1] = _simd_mul_ps(tri[1].v[1], vRecipW1);
1789 tri[2].v[1] = _simd_mul_ps(tri[2].v[1], vRecipW2);
1790
1791 tri[0].v[2] = _simd_mul_ps(tri[0].v[2], vRecipW0);
1792 tri[1].v[2] = _simd_mul_ps(tri[1].v[2], vRecipW1);
1793 tri[2].v[2] = _simd_mul_ps(tri[2].v[2], vRecipW2);
1794
1795 // viewport transform to screen coords
1796 viewportTransform<3>(tri, state.vpMatrix[0]);
1797 }
1798
1799 // adjust for pixel center location
1800 simdscalar offset = g_pixelOffsets[rastState.pixelLocation];
1801 tri[0].x = _simd_add_ps(tri[0].x, offset);
1802 tri[0].y = _simd_add_ps(tri[0].y, offset);
1803
1804 tri[1].x = _simd_add_ps(tri[1].x, offset);
1805 tri[1].y = _simd_add_ps(tri[1].y, offset);
1806
1807 tri[2].x = _simd_add_ps(tri[2].x, offset);
1808 tri[2].y = _simd_add_ps(tri[2].y, offset);
1809
1810 simdscalari vXi[3], vYi[3];
1811 // Set vXi, vYi to required fixed point precision
1812 FPToFixedPoint(tri, vXi, vYi);
1813
1814 // triangle setup
1815 simdscalari vAi[3], vBi[3];
1816 triangleSetupABIntVertical(vXi, vYi, vAi, vBi);
1817
1818 // determinant
1819 simdscalari vDet[2];
1820 calcDeterminantIntVertical(vAi, vBi, vDet);
1821
1822 // cull zero area
1823 int maskLo = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpeq_epi64(vDet[0], _simd_setzero_si())));
1824 int maskHi = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpeq_epi64(vDet[1], _simd_setzero_si())));
1825
1826 int cullZeroAreaMask = maskLo | (maskHi << (KNOB_SIMD_WIDTH / 2));
1827
1828 uint32_t origTriMask = triMask;
1829 // don't cull degenerate triangles if we're conservatively rasterizing
1830 if(!CT::IsConservativeT::value)
1831 {
1832 triMask &= ~cullZeroAreaMask;
1833 }
1834
1835 // determine front winding tris
1836 // CW +det
1837 // CCW det <= 0; 0 area triangles are marked as backfacing, which is required behavior for conservative rast
1838 maskLo = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpgt_epi64(vDet[0], _simd_setzero_si())));
1839 maskHi = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpgt_epi64(vDet[1], _simd_setzero_si())));
1840 int cwTriMask = maskLo | (maskHi << (KNOB_SIMD_WIDTH /2) );
1841
1842 uint32_t frontWindingTris;
1843 if (rastState.frontWinding == SWR_FRONTWINDING_CW)
1844 {
1845 frontWindingTris = cwTriMask;
1846 }
1847 else
1848 {
1849 frontWindingTris = ~cwTriMask;
1850 }
1851
1852 // cull
1853 uint32_t cullTris;
1854 switch ((SWR_CULLMODE)rastState.cullMode)
1855 {
1856 case SWR_CULLMODE_BOTH: cullTris = 0xffffffff; break;
1857 case SWR_CULLMODE_NONE: cullTris = 0x0; break;
1858 case SWR_CULLMODE_FRONT: cullTris = frontWindingTris; break;
1859 // 0 area triangles are marked as backfacing, which is required behavior for conservative rast
1860 case SWR_CULLMODE_BACK: cullTris = ~frontWindingTris; break;
1861 default: SWR_ASSERT(false, "Invalid cull mode: %d", rastState.cullMode); cullTris = 0x0; break;
1862 }
1863
1864 triMask &= ~cullTris;
1865
1866 if (origTriMask ^ triMask)
1867 {
1868 RDTSC_EVENT(FECullZeroAreaAndBackface, _mm_popcnt_u32(origTriMask ^ triMask), 0);
1869 }
1870
    /// Note: these variable initializations must stay above any 'goto endBinTriangles'
1872 // compute per tri backface
1873 uint32_t frontFaceMask = frontWindingTris;
1874 uint32_t *pPrimID = (uint32_t *)&primID;
1875 DWORD triIndex = 0;
1876 // for center sample pattern, all samples are at pixel center; calculate coverage
1877 // once at center and broadcast the results in the backend
1878 const SWR_MULTISAMPLE_COUNT sampleCount = (rastState.samplePattern == SWR_MSAA_STANDARD_PATTERN) ? rastState.sampleCount : SWR_MULTISAMPLE_1X;
1879 uint32_t edgeEnable;
1880 PFN_WORK_FUNC pfnWork;
1881 if(CT::IsConservativeT::value)
1882 {
1883 // determine which edges of the degenerate tri, if any, are valid to rasterize.
1884 // used to call the appropriate templated rasterizer function
1885 if(cullZeroAreaMask > 0)
1886 {
1887 // e0 = v1-v0
1888 simdscalari x0x1Mask = _simd_cmpeq_epi32(vXi[0], vXi[1]);
1889 simdscalari y0y1Mask = _simd_cmpeq_epi32(vYi[0], vYi[1]);
1890 uint32_t e0Mask = _simd_movemask_ps(_simd_castsi_ps(_simd_and_si(x0x1Mask, y0y1Mask)));
1891
1892 // e1 = v2-v1
1893 simdscalari x1x2Mask = _simd_cmpeq_epi32(vXi[1], vXi[2]);
1894 simdscalari y1y2Mask = _simd_cmpeq_epi32(vYi[1], vYi[2]);
1895 uint32_t e1Mask = _simd_movemask_ps(_simd_castsi_ps(_simd_and_si(x1x2Mask, y1y2Mask)));
1896
1897 // e2 = v0-v2
1898 // if v0 == v1 & v1 == v2, v0 == v2
1899 uint32_t e2Mask = e0Mask & e1Mask;
1900 SWR_ASSERT(KNOB_SIMD_WIDTH == 8, "Need to update degenerate mask code for avx512");
1901
1902 // edge order: e0 = v0v1, e1 = v1v2, e2 = v0v2
1903 // 32 bit binary: 0000 0000 0010 0100 1001 0010 0100 1001
1904 e0Mask = pdep_u32(e0Mask, 0x00249249);
1905 // 32 bit binary: 0000 0000 0100 1001 0010 0100 1001 0010
1906 e1Mask = pdep_u32(e1Mask, 0x00492492);
1907 // 32 bit binary: 0000 0000 1001 0010 0100 1001 0010 0100
1908 e2Mask = pdep_u32(e2Mask, 0x00924924);
1909
1910 edgeEnable = (0x00FFFFFF & (~(e0Mask | e1Mask | e2Mask)));
1911 }
1912 else
1913 {
1914 edgeEnable = 0x00FFFFFF;
1915 }
1916 }
1917 else
1918 {
1919 // degenerate triangles won't be sent to rasterizer; just enable all edges
1920 pfnWork = GetRasterizerFunc(sampleCount, (rastState.conservativeRast > 0),
1921 (SWR_INPUT_COVERAGE)pDC->pState->state.psState.inputCoverage, ALL_EDGES_VALID,
1922 (rastState.scissorEnable > 0));
1923 }
1924
1925 if (!triMask)
1926 {
1927 goto endBinTriangles;
1928 }
1929
1930 // Calc bounding box of triangles
1931 simdBBox bbox;
1932 calcBoundingBoxIntVertical<CT>(tri, vXi, vYi, bbox);
1933
1934 // determine if triangle falls between pixel centers and discard
1935 // only discard for non-MSAA case and when conservative rast is disabled
1936 // (left + 127) & ~255
1937 // (right + 128) & ~255
1938 if(rastState.sampleCount == SWR_MULTISAMPLE_1X && (!CT::IsConservativeT::value))
1939 {
1940 origTriMask = triMask;
1941
1942 int cullCenterMask;
1943 {
1944 simdscalari left = _simd_add_epi32(bbox.left, _simd_set1_epi32(127));
1945 left = _simd_and_si(left, _simd_set1_epi32(~255));
1946 simdscalari right = _simd_add_epi32(bbox.right, _simd_set1_epi32(128));
1947 right = _simd_and_si(right, _simd_set1_epi32(~255));
1948
1949 simdscalari vMaskH = _simd_cmpeq_epi32(left, right);
1950
1951 simdscalari top = _simd_add_epi32(bbox.top, _simd_set1_epi32(127));
1952 top = _simd_and_si(top, _simd_set1_epi32(~255));
1953 simdscalari bottom = _simd_add_epi32(bbox.bottom, _simd_set1_epi32(128));
1954 bottom = _simd_and_si(bottom, _simd_set1_epi32(~255));
1955
1956 simdscalari vMaskV = _simd_cmpeq_epi32(top, bottom);
1957 vMaskV = _simd_or_si(vMaskH, vMaskV);
1958 cullCenterMask = _simd_movemask_ps(_simd_castsi_ps(vMaskV));
1959 }
1960
1961 triMask &= ~cullCenterMask;
1962
1963 if(origTriMask ^ triMask)
1964 {
1965 RDTSC_EVENT(FECullBetweenCenters, _mm_popcnt_u32(origTriMask ^ triMask), 0);
1966 }
1967 }
1968
1969 // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since right/bottom edge is exclusive.
1970 bbox.left = _simd_max_epi32(bbox.left, _simd_set1_epi32(state.scissorInFixedPoint.left));
1971 bbox.top = _simd_max_epi32(bbox.top, _simd_set1_epi32(state.scissorInFixedPoint.top));
1972 bbox.right = _simd_min_epi32(_simd_sub_epi32(bbox.right, _simd_set1_epi32(1)), _simd_set1_epi32(state.scissorInFixedPoint.right));
1973 bbox.bottom = _simd_min_epi32(_simd_sub_epi32(bbox.bottom, _simd_set1_epi32(1)), _simd_set1_epi32(state.scissorInFixedPoint.bottom));
1974
1975 if(CT::IsConservativeT::value)
1976 {
1977 // in the case where a degenerate triangle is on a scissor edge, we need to make sure the primitive bbox has
1978 // some area. Bump the right/bottom edges out
1979 simdscalari topEqualsBottom = _simd_cmpeq_epi32(bbox.top, bbox.bottom);
1980 bbox.bottom = _simd_blendv_epi32(bbox.bottom, _simd_add_epi32(bbox.bottom, _simd_set1_epi32(1)), topEqualsBottom);
1981 simdscalari leftEqualsRight = _simd_cmpeq_epi32(bbox.left, bbox.right);
1982 bbox.right = _simd_blendv_epi32(bbox.right, _simd_add_epi32(bbox.right, _simd_set1_epi32(1)), leftEqualsRight);
1983 }
1984
1985 // Cull tris completely outside scissor
1986 {
1987 simdscalari maskOutsideScissorX = _simd_cmpgt_epi32(bbox.left, bbox.right);
1988 simdscalari maskOutsideScissorY = _simd_cmpgt_epi32(bbox.top, bbox.bottom);
1989 simdscalari maskOutsideScissorXY = _simd_or_si(maskOutsideScissorX, maskOutsideScissorY);
1990 uint32_t maskOutsideScissor = _simd_movemask_ps(_simd_castsi_ps(maskOutsideScissorXY));
1991 triMask = triMask & ~maskOutsideScissor;
1992 }
1993
1994 if (!triMask)
1995 {
1996 goto endBinTriangles;
1997 }
1998
1999 // Convert triangle bbox to macrotile units.
2000 bbox.left = _simd_srai_epi32(bbox.left, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
2001 bbox.top = _simd_srai_epi32(bbox.top, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
2002 bbox.right = _simd_srai_epi32(bbox.right, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
2003 bbox.bottom = _simd_srai_epi32(bbox.bottom, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
2004
2005 OSALIGNSIMD(uint32_t) aMTLeft[KNOB_SIMD_WIDTH], aMTRight[KNOB_SIMD_WIDTH], aMTTop[KNOB_SIMD_WIDTH], aMTBottom[KNOB_SIMD_WIDTH];
2006 _simd_store_si((simdscalari*)aMTLeft, bbox.left);
2007 _simd_store_si((simdscalari*)aMTRight, bbox.right);
2008 _simd_store_si((simdscalari*)aMTTop, bbox.top);
2009 _simd_store_si((simdscalari*)aMTBottom, bbox.bottom);
2010
2011 // transpose verts needed for backend
2012 /// @todo modify BE to take non-transformed verts
2013 __m128 vHorizX[8], vHorizY[8], vHorizZ[8], vHorizW[8];
2014 vTranspose3x8(vHorizX, tri[0].x, tri[1].x, tri[2].x);
2015 vTranspose3x8(vHorizY, tri[0].y, tri[1].y, tri[2].y);
2016 vTranspose3x8(vHorizZ, tri[0].z, tri[1].z, tri[2].z);
2017 vTranspose3x8(vHorizW, vRecipW0, vRecipW1, vRecipW2);
2018
2019 // store render target array index
2020 OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH];
2021 if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex)
2022 {
2023 simdvector vRtai[3];
2024 pa.Assemble(VERTEX_RTAI_SLOT, vRtai);
2025 simdscalari vRtaii;
2026 vRtaii = _simd_castps_si(vRtai[0].x);
2027 _simd_store_si((simdscalari*)aRTAI, vRtaii);
2028 }
2029 else
2030 {
2031 _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si());
2032 }
2033
2034 // scan remaining valid triangles and bin each separately
2035 while (_BitScanForward(&triIndex, triMask))
2036 {
2037 uint32_t linkageCount = state.backendState.numAttributes;
2038 uint32_t numScalarAttribs = linkageCount * 4;
2039
2040 BE_WORK work;
2041 work.type = DRAW;
2042
2043 bool isDegenerate;
2044 if(CT::IsConservativeT::value)
2045 {
2046 // only rasterize valid edges if we have a degenerate primitive
2047 int32_t triEdgeEnable = (edgeEnable >> (triIndex * 3)) & ALL_EDGES_VALID;
2048 work.pfnWork = GetRasterizerFunc(sampleCount, (rastState.conservativeRast > 0),
2049 (SWR_INPUT_COVERAGE)pDC->pState->state.psState.inputCoverage, triEdgeEnable,
2050 (rastState.scissorEnable > 0));
2051
2052 // Degenerate triangles are required to be constant interpolated
2053 isDegenerate = (triEdgeEnable != ALL_EDGES_VALID) ? true : false;
2054 }
2055 else
2056 {
2057 isDegenerate = false;
2058 work.pfnWork = pfnWork;
2059 }
2060
2061 // Select attribute processor
2062 PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(3,
2063 state.backendState.swizzleEnable, state.backendState.constantInterpolationMask, isDegenerate);
2064
2065 TRIANGLE_WORK_DESC &desc = work.desc.tri;
2066
2067 desc.triFlags.frontFacing = state.forceFront ? 1 : ((frontFaceMask >> triIndex) & 1);
2068 desc.triFlags.primID = pPrimID[triIndex];
2069 desc.triFlags.renderTargetArrayIndex = aRTAI[triIndex];
2070
2071 auto pArena = pDC->pArena;
2072 SWR_ASSERT(pArena != nullptr);
2073
2074 // store active attribs
2075 float *pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
2076 desc.pAttribs = pAttribs;
2077 desc.numAttribs = linkageCount;
2078 pfnProcessAttribs(pDC, pa, triIndex, pPrimID[triIndex], desc.pAttribs);
2079
2080 // store triangle vertex data
2081 desc.pTriBuffer = (float*)pArena->AllocAligned(4 * 4 * sizeof(float), 16);
2082
2083 _mm_store_ps(&desc.pTriBuffer[0], vHorizX[triIndex]);
2084 _mm_store_ps(&desc.pTriBuffer[4], vHorizY[triIndex]);
2085 _mm_store_ps(&desc.pTriBuffer[8], vHorizZ[triIndex]);
2086 _mm_store_ps(&desc.pTriBuffer[12], vHorizW[triIndex]);
2087
2088 // store user clip distances
2089 if (rastState.clipDistanceMask)
2090 {
2091 uint32_t numClipDist = _mm_popcnt_u32(rastState.clipDistanceMask);
2092 desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 3 * sizeof(float));
2093 ProcessUserClipDist<3>(pa, triIndex, rastState.clipDistanceMask, desc.pUserClipBuffer);
2094 }
2095
2096 for (uint32_t y = aMTTop[triIndex]; y <= aMTBottom[triIndex]; ++y)
2097 {
2098 for (uint32_t x = aMTLeft[triIndex]; x <= aMTRight[triIndex]; ++x)
2099 {
2100 #if KNOB_ENABLE_TOSS_POINTS
2101 if (!KNOB_TOSS_SETUP_TRIS)
2102 #endif
2103 {
2104 pTileMgr->enqueue(x, y, &work);
2105 }
2106 }
2107 }
2108 triMask &= ~(1 << triIndex);
2109 }
2110
2111 endBinTriangles:
2112 RDTSC_STOP(FEBinTriangles, 1, 0);
2113 }
2114
2115 struct FEBinTrianglesChooser
2116 {
2117 typedef PFN_PROCESS_PRIMS FuncType;
2118
2119 template <typename... ArgsB>
2120 static FuncType GetFunc()
2121 {
2122 return BinTriangles<ConservativeRastFETraits<ArgsB...>>;
2123 }
2124 };
2125
2126 // Selector for correct templated BinTrinagles function
2127 PFN_PROCESS_PRIMS GetBinTrianglesFunc(bool IsConservative)
2128 {
2129 return TemplateArgUnroller<FEBinTrianglesChooser>::GetFunc(IsConservative);
2130 }
2131
//////////////////////////////////////////////////////////////////////////
/// @brief Bin SIMD points to the backend. Only supports point size of 1
/// @param pDC - pointer to draw context.
/// @param pa - The primitive assembly object.
/// @param workerId - thread's worker id. Even thread has a unique id.
/// @param prim - Contains point position data for SIMDs worth of points.
/// @param primMask - Bitmask of valid SIMD lanes; one bit per point.
/// @param primID - Primitive ID for each point.
void BinPoints(
    DRAW_CONTEXT *pDC,
    PA_STATE& pa,
    uint32_t workerId,
    simdvector prim[3],
    uint32_t primMask,
    simdscalari primID)
{
    RDTSC_START(FEBinPoints);

    // Points only use the first vertex slot of the primitive.
    simdvector& primVerts = prim[0];

    const API_STATE& state = GetApiState(pDC);
    const SWR_FRONTEND_STATE& feState = state.frontendState;
    const SWR_GS_STATE& gsState = state.gsState;
    const SWR_RASTSTATE& rastState = state.rastState;

    // Select attribute processor (points have 1 vertex per primitive)
    PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(1,
        state.backendState.swizzleEnable, state.backendState.constantInterpolationMask);

    if (!feState.vpTransformDisable)
    {
        // perspective divide
        simdscalar vRecipW0 = _simd_div_ps(_simd_set1_ps(1.0f), primVerts.w);
        primVerts.x = _simd_mul_ps(primVerts.x, vRecipW0);
        primVerts.y = _simd_mul_ps(primVerts.y, vRecipW0);
        primVerts.z = _simd_mul_ps(primVerts.z, vRecipW0);

        // viewport transform to screen coords
        viewportTransform<1>(&primVerts, state.vpMatrix[0]);
    }

    // adjust for pixel center location selected by the API
    simdscalar offset = g_pixelOffsets[rastState.pixelLocation];
    primVerts.x = _simd_add_ps(primVerts.x, offset);
    primVerts.y = _simd_add_ps(primVerts.y, offset);

    // convert screen coords to fixed point
    simdscalari vXi, vYi;
    vXi = fpToFixedPointVertical(primVerts.x);
    vYi = fpToFixedPointVertical(primVerts.y);

    if (CanUseSimplePoints(pDC))
    {
        // Fast path: each point covers a single pixel, so it lands in exactly
        // one raster tile of one macrotile and can be rasterized directly.

        // adjust for top-left rule
        vXi = _simd_sub_epi32(vXi, _simd_set1_epi32(1));
        vYi = _simd_sub_epi32(vYi, _simd_set1_epi32(1));

        // cull points off the top-left edge of the viewport; after the
        // adjustment above such points have a negative coordinate, and
        // movemask harvests the sign bits into a lane mask.
        primMask &= ~_simd_movemask_ps(_simd_castsi_ps(vXi));
        primMask &= ~_simd_movemask_ps(_simd_castsi_ps(vYi));

        // compute macro tile coordinates
        simdscalari macroX = _simd_srai_epi32(vXi, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
        simdscalari macroY = _simd_srai_epi32(vYi, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);

        OSALIGNSIMD(uint32_t) aMacroX[KNOB_SIMD_WIDTH], aMacroY[KNOB_SIMD_WIDTH];
        _simd_store_si((simdscalari*)aMacroX, macroX);
        _simd_store_si((simdscalari*)aMacroY, macroY);

        // compute raster tile coordinates
        simdscalari rasterX = _simd_srai_epi32(vXi, KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT);
        simdscalari rasterY = _simd_srai_epi32(vYi, KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT);

        // compute raster tile relative x,y for coverage mask
        simdscalari tileAlignedX = _simd_slli_epi32(rasterX, KNOB_TILE_X_DIM_SHIFT);
        simdscalari tileAlignedY = _simd_slli_epi32(rasterY, KNOB_TILE_Y_DIM_SHIFT);

        // pixel position relative to its raster tile origin
        simdscalari tileRelativeX = _simd_sub_epi32(_simd_srai_epi32(vXi, FIXED_POINT_SHIFT), tileAlignedX);
        simdscalari tileRelativeY = _simd_sub_epi32(_simd_srai_epi32(vYi, FIXED_POINT_SHIFT), tileAlignedY);

        OSALIGNSIMD(uint32_t) aTileRelativeX[KNOB_SIMD_WIDTH];
        OSALIGNSIMD(uint32_t) aTileRelativeY[KNOB_SIMD_WIDTH];
        _simd_store_si((simdscalari*)aTileRelativeX, tileRelativeX);
        _simd_store_si((simdscalari*)aTileRelativeY, tileRelativeY);

        OSALIGNSIMD(uint32_t) aTileAlignedX[KNOB_SIMD_WIDTH];
        OSALIGNSIMD(uint32_t) aTileAlignedY[KNOB_SIMD_WIDTH];
        _simd_store_si((simdscalari*)aTileAlignedX, tileAlignedX);
        _simd_store_si((simdscalari*)aTileAlignedY, tileAlignedY);

        // perspective-correct z per point, spilled for scalar access below
        OSALIGNSIMD(float) aZ[KNOB_SIMD_WIDTH];
        _simd_store_ps((float*)aZ, primVerts.z);

        // store render target array index
        OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH];
        if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex)
        {
            simdvector vRtai;
            pa.Assemble(VERTEX_RTAI_SLOT, &vRtai);
            simdscalari vRtaii = _simd_castps_si(vRtai.x);
            _simd_store_si((simdscalari*)aRTAI, vRtaii);
        }
        else
        {
            _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si());
        }

        uint32_t *pPrimID = (uint32_t *)&primID;
        DWORD primIndex = 0;

        const SWR_BACKEND_STATE& backendState = pDC->pState->state.backendState;

        // scan remaining valid points and bin each separately
        while (_BitScanForward(&primIndex, primMask))
        {
            uint32_t linkageCount = backendState.numAttributes;
            uint32_t numScalarAttribs = linkageCount * 4;

            BE_WORK work;
            work.type = DRAW;

            // point work is described with the triangle work descriptor
            TRIANGLE_WORK_DESC &desc = work.desc.tri;

            // points are always front facing
            desc.triFlags.frontFacing = 1;
            desc.triFlags.primID = pPrimID[primIndex];
            desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];

            work.pfnWork = RasterizeSimplePoint;

            auto pArena = pDC->pArena;
            SWR_ASSERT(pArena != nullptr);

            // store attributes
            float *pAttribs = (float*)pArena->AllocAligned(3 * numScalarAttribs * sizeof(float), 16);
            desc.pAttribs = pAttribs;
            desc.numAttribs = linkageCount;

            pfnProcessAttribs(pDC, pa, primIndex, pPrimID[primIndex], pAttribs);

            // store raster tile aligned x, y, perspective correct z
            // note: x,y are written as raw uint32 bits into the float buffer
            float *pTriBuffer = (float*)pArena->AllocAligned(4 * sizeof(float), 16);
            desc.pTriBuffer = pTriBuffer;
            *(uint32_t*)pTriBuffer++ = aTileAlignedX[primIndex];
            *(uint32_t*)pTriBuffer++ = aTileAlignedY[primIndex];
            *pTriBuffer = aZ[primIndex];

            uint32_t tX = aTileRelativeX[primIndex];
            uint32_t tY = aTileRelativeY[primIndex];

            // pack the relative x,y into the coverageMask, the rasterizer will
            // generate the true coverage mask from it
            work.desc.tri.triFlags.coverageMask = tX | (tY << 4);

            // bin it
            MacroTileMgr *pTileMgr = pDC->pTileMgr;
#if KNOB_ENABLE_TOSS_POINTS
            if (!KNOB_TOSS_SETUP_TRIS)
#endif
            {
                pTileMgr->enqueue(aMacroX[primIndex], aMacroY[primIndex], &work);
            }
            primMask &= ~(1 << primIndex);
        }
    }
    else
    {
        // non simple points need to be potentially binned to multiple macro tiles
        simdscalar vPointSize;
        if (rastState.pointParam)
        {
            // per-vertex point size from the point size attribute slot
            simdvector size[3];
            pa.Assemble(VERTEX_POINT_SIZE_SLOT, size);
            vPointSize = size[0].x;
        }
        else
        {
            // constant point size from raster state
            vPointSize = _simd_set1_ps(rastState.pointSize);
        }

        // bloat point to bbox: start from the degenerate point bbox and grow
        // it by half the point size in each direction
        simdBBox bbox;
        bbox.left = bbox.right = vXi;
        bbox.top = bbox.bottom = vYi;

        simdscalar vHalfWidth = _simd_mul_ps(vPointSize, _simd_set1_ps(0.5f));
        simdscalari vHalfWidthi = fpToFixedPointVertical(vHalfWidth);
        bbox.left = _simd_sub_epi32(bbox.left, vHalfWidthi);
        bbox.right = _simd_add_epi32(bbox.right, vHalfWidthi);
        bbox.top = _simd_sub_epi32(bbox.top, vHalfWidthi);
        bbox.bottom = _simd_add_epi32(bbox.bottom, vHalfWidthi);

        // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since right/bottom edge is exclusive.
        bbox.left = _simd_max_epi32(bbox.left, _simd_set1_epi32(state.scissorInFixedPoint.left));
        bbox.top = _simd_max_epi32(bbox.top, _simd_set1_epi32(state.scissorInFixedPoint.top));
        bbox.right = _simd_min_epi32(_simd_sub_epi32(bbox.right, _simd_set1_epi32(1)), _simd_set1_epi32(state.scissorInFixedPoint.right));
        bbox.bottom = _simd_min_epi32(_simd_sub_epi32(bbox.bottom, _simd_set1_epi32(1)), _simd_set1_epi32(state.scissorInFixedPoint.bottom));

        // Cull bloated points completely outside scissor
        simdscalari maskOutsideScissorX = _simd_cmpgt_epi32(bbox.left, bbox.right);
        simdscalari maskOutsideScissorY = _simd_cmpgt_epi32(bbox.top, bbox.bottom);
        simdscalari maskOutsideScissorXY = _simd_or_si(maskOutsideScissorX, maskOutsideScissorY);
        uint32_t maskOutsideScissor = _simd_movemask_ps(_simd_castsi_ps(maskOutsideScissorXY));
        primMask = primMask & ~maskOutsideScissor;

        // Convert bbox to macrotile units.
        bbox.left = _simd_srai_epi32(bbox.left, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
        bbox.top = _simd_srai_epi32(bbox.top, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
        bbox.right = _simd_srai_epi32(bbox.right, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
        bbox.bottom = _simd_srai_epi32(bbox.bottom, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);

        OSALIGNSIMD(uint32_t) aMTLeft[KNOB_SIMD_WIDTH], aMTRight[KNOB_SIMD_WIDTH], aMTTop[KNOB_SIMD_WIDTH], aMTBottom[KNOB_SIMD_WIDTH];
        _simd_store_si((simdscalari*)aMTLeft, bbox.left);
        _simd_store_si((simdscalari*)aMTRight, bbox.right);
        _simd_store_si((simdscalari*)aMTTop, bbox.top);
        _simd_store_si((simdscalari*)aMTBottom, bbox.bottom);

        // store render target array index
        OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH];
        if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex)
        {
            simdvector vRtai[2];
            pa.Assemble(VERTEX_RTAI_SLOT, vRtai);
            simdscalari vRtaii = _simd_castps_si(vRtai[0].x);
            _simd_store_si((simdscalari*)aRTAI, vRtaii);
        }
        else
        {
            _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si());
        }

        OSALIGNSIMD(float) aPointSize[KNOB_SIMD_WIDTH];
        _simd_store_ps((float*)aPointSize, vPointSize);

        uint32_t *pPrimID = (uint32_t *)&primID;

        OSALIGNSIMD(float) aPrimVertsX[KNOB_SIMD_WIDTH];
        OSALIGNSIMD(float) aPrimVertsY[KNOB_SIMD_WIDTH];
        OSALIGNSIMD(float) aPrimVertsZ[KNOB_SIMD_WIDTH];

        _simd_store_ps((float*)aPrimVertsX, primVerts.x);
        _simd_store_ps((float*)aPrimVertsY, primVerts.y);
        _simd_store_ps((float*)aPrimVertsZ, primVerts.z);

        // scan remaining valid prims and bin each separately
        const SWR_BACKEND_STATE& backendState = state.backendState;
        DWORD primIndex;
        while (_BitScanForward(&primIndex, primMask))
        {
            uint32_t linkageCount = backendState.numAttributes;
            uint32_t numScalarAttribs = linkageCount * 4;

            BE_WORK work;
            work.type = DRAW;

            TRIANGLE_WORK_DESC &desc = work.desc.tri;

            // points are always front facing
            desc.triFlags.frontFacing = 1;
            desc.triFlags.primID = pPrimID[primIndex];
            // pointSize travels in the tri flags for the point rasterizer
            desc.triFlags.pointSize = aPointSize[primIndex];
            desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];

            work.pfnWork = RasterizeTriPoint;

            auto pArena = pDC->pArena;
            SWR_ASSERT(pArena != nullptr);

            // store active attribs
            desc.pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
            desc.numAttribs = linkageCount;
            pfnProcessAttribs(pDC, pa, primIndex, pPrimID[primIndex], desc.pAttribs);

            // store point vertex data (screen-space x, y and z)
            float *pTriBuffer = (float*)pArena->AllocAligned(4 * sizeof(float), 16);
            desc.pTriBuffer = pTriBuffer;
            *pTriBuffer++ = aPrimVertsX[primIndex];
            *pTriBuffer++ = aPrimVertsY[primIndex];
            *pTriBuffer = aPrimVertsZ[primIndex];

            // store user clip distances
            if (rastState.clipDistanceMask)
            {
                uint32_t numClipDist = _mm_popcnt_u32(rastState.clipDistanceMask);
                desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 2 * sizeof(float));
                ProcessUserClipDist<2>(pa, primIndex, rastState.clipDistanceMask, desc.pUserClipBuffer);
            }

            // enqueue the work to every macrotile the bloated point touches
            MacroTileMgr *pTileMgr = pDC->pTileMgr;
            for (uint32_t y = aMTTop[primIndex]; y <= aMTBottom[primIndex]; ++y)
            {
                for (uint32_t x = aMTLeft[primIndex]; x <= aMTRight[primIndex]; ++x)
                {
#if KNOB_ENABLE_TOSS_POINTS
                    if (!KNOB_TOSS_SETUP_TRIS)
#endif
                    {
                        pTileMgr->enqueue(x, y, &work);
                    }
                }
            }

            primMask &= ~(1 << primIndex);
        }
    }

    RDTSC_STOP(FEBinPoints, 1, 0);
}
2441
//////////////////////////////////////////////////////////////////////////
/// @brief Bin SIMD lines to the backend.
/// @param pDC - pointer to draw context.
/// @param pa - The primitive assembly object.
/// @param workerId - thread's worker id. Even thread has a unique id.
/// @param prim - Contains line position data for SIMDs worth of lines.
/// @param primMask - Bitmask of valid SIMD lanes; one bit per line.
/// @param primID - Primitive ID for each line.
void BinLines(
    DRAW_CONTEXT *pDC,
    PA_STATE& pa,
    uint32_t workerId,
    simdvector prim[],
    uint32_t primMask,
    simdscalari primID)
{
    RDTSC_START(FEBinLines);

    const API_STATE& state = GetApiState(pDC);
    const SWR_RASTSTATE& rastState = state.rastState;
    const SWR_FRONTEND_STATE& feState = state.frontendState;
    const SWR_GS_STATE& gsState = state.gsState;

    // Select attribute processor (lines have 2 vertices per primitive)
    PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(2,
        state.backendState.swizzleEnable, state.backendState.constantInterpolationMask);

    // default 1/w when viewport transform is disabled (no perspective divide)
    simdscalar vRecipW0 = _simd_set1_ps(1.0f);
    simdscalar vRecipW1 = _simd_set1_ps(1.0f);

    if (!feState.vpTransformDisable)
    {
        // perspective divide
        vRecipW0 = _simd_div_ps(_simd_set1_ps(1.0f), prim[0].w);
        vRecipW1 = _simd_div_ps(_simd_set1_ps(1.0f), prim[1].w);

        prim[0].v[0] = _simd_mul_ps(prim[0].v[0], vRecipW0);
        prim[1].v[0] = _simd_mul_ps(prim[1].v[0], vRecipW1);

        prim[0].v[1] = _simd_mul_ps(prim[0].v[1], vRecipW0);
        prim[1].v[1] = _simd_mul_ps(prim[1].v[1], vRecipW1);

        prim[0].v[2] = _simd_mul_ps(prim[0].v[2], vRecipW0);
        prim[1].v[2] = _simd_mul_ps(prim[1].v[2], vRecipW1);

        // viewport transform to screen coords
        viewportTransform<2>(prim, state.vpMatrix[0]);
    }

    // adjust for pixel center location selected by the API
    simdscalar offset = g_pixelOffsets[rastState.pixelLocation];
    prim[0].x = _simd_add_ps(prim[0].x, offset);
    prim[0].y = _simd_add_ps(prim[0].y, offset);

    prim[1].x = _simd_add_ps(prim[1].x, offset);
    prim[1].y = _simd_add_ps(prim[1].y, offset);

    // convert to fixed point
    simdscalari vXi[2], vYi[2];
    vXi[0] = fpToFixedPointVertical(prim[0].x);
    vYi[0] = fpToFixedPointVertical(prim[0].y);
    vXi[1] = fpToFixedPointVertical(prim[1].x);
    vYi[1] = fpToFixedPointVertical(prim[1].y);

    // compute x-major vs y-major mask; a line is y-major when its vertical
    // extent exceeds its horizontal extent
    simdscalari xLength = _simd_abs_epi32(_simd_sub_epi32(vXi[0], vXi[1]));
    simdscalari yLength = _simd_abs_epi32(_simd_sub_epi32(vYi[0], vYi[1]));
    simdscalar vYmajorMask = _simd_castsi_ps(_simd_cmpgt_epi32(yLength, xLength));
    uint32_t yMajorMask = _simd_movemask_ps(vYmajorMask);

    // cull zero-length lines (zero extent on both axes)
    simdscalari vZeroLengthMask = _simd_cmpeq_epi32(xLength, _simd_setzero_si());
    vZeroLengthMask = _simd_and_si(vZeroLengthMask, _simd_cmpeq_epi32(yLength, _simd_setzero_si()));

    primMask &= ~_simd_movemask_ps(_simd_castsi_ps(vZeroLengthMask));

    uint32_t *pPrimID = (uint32_t *)&primID;

    // filler value used as the 3rd "vertex" in the backend transpose below
    simdscalar vUnused = _simd_setzero_ps();

    // Calc bounding box of lines
    simdBBox bbox;
    bbox.left = _simd_min_epi32(vXi[0], vXi[1]);
    bbox.right = _simd_max_epi32(vXi[0], vXi[1]);
    bbox.top = _simd_min_epi32(vYi[0], vYi[1]);
    bbox.bottom = _simd_max_epi32(vYi[0], vYi[1]);

    // bloat bbox by line width along minor axis
    simdscalar vHalfWidth = _simd_set1_ps(rastState.lineWidth / 2.0f);
    simdscalari vHalfWidthi = fpToFixedPointVertical(vHalfWidth);
    simdBBox bloatBox;
    bloatBox.left = _simd_sub_epi32(bbox.left, vHalfWidthi);
    bloatBox.right = _simd_add_epi32(bbox.right, vHalfWidthi);
    bloatBox.top = _simd_sub_epi32(bbox.top, vHalfWidthi);
    bloatBox.bottom = _simd_add_epi32(bbox.bottom, vHalfWidthi);

    // take the bloated horizontal extent for y-major lines and the bloated
    // vertical extent for x-major lines
    bbox.left = _simd_blendv_epi32(bbox.left, bloatBox.left, vYmajorMask);
    bbox.right = _simd_blendv_epi32(bbox.right, bloatBox.right, vYmajorMask);
    bbox.top = _simd_blendv_epi32(bloatBox.top, bbox.top, vYmajorMask);
    bbox.bottom = _simd_blendv_epi32(bloatBox.bottom, bbox.bottom, vYmajorMask);

    // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since right/bottom edge is exclusive.
    bbox.left = _simd_max_epi32(bbox.left, _simd_set1_epi32(state.scissorInFixedPoint.left));
    bbox.top = _simd_max_epi32(bbox.top, _simd_set1_epi32(state.scissorInFixedPoint.top));
    bbox.right = _simd_min_epi32(_simd_sub_epi32(bbox.right, _simd_set1_epi32(1)), _simd_set1_epi32(state.scissorInFixedPoint.right));
    bbox.bottom = _simd_min_epi32(_simd_sub_epi32(bbox.bottom, _simd_set1_epi32(1)), _simd_set1_epi32(state.scissorInFixedPoint.bottom));

    // Cull prims completely outside scissor
    {
        simdscalari maskOutsideScissorX = _simd_cmpgt_epi32(bbox.left, bbox.right);
        simdscalari maskOutsideScissorY = _simd_cmpgt_epi32(bbox.top, bbox.bottom);
        simdscalari maskOutsideScissorXY = _simd_or_si(maskOutsideScissorX, maskOutsideScissorY);
        uint32_t maskOutsideScissor = _simd_movemask_ps(_simd_castsi_ps(maskOutsideScissorXY));
        primMask = primMask & ~maskOutsideScissor;
    }

    if (!primMask)
    {
        goto endBinLines;
    }

    // Convert triangle bbox to macrotile units.
    bbox.left = _simd_srai_epi32(bbox.left, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
    bbox.top = _simd_srai_epi32(bbox.top, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
    bbox.right = _simd_srai_epi32(bbox.right, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
    bbox.bottom = _simd_srai_epi32(bbox.bottom, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);

    OSALIGNSIMD(uint32_t) aMTLeft[KNOB_SIMD_WIDTH], aMTRight[KNOB_SIMD_WIDTH], aMTTop[KNOB_SIMD_WIDTH], aMTBottom[KNOB_SIMD_WIDTH];
    _simd_store_si((simdscalari*)aMTLeft, bbox.left);
    _simd_store_si((simdscalari*)aMTRight, bbox.right);
    _simd_store_si((simdscalari*)aMTTop, bbox.top);
    _simd_store_si((simdscalari*)aMTBottom, bbox.bottom);

    // transpose verts needed for backend
    /// @todo modify BE to take non-transformed verts
    __m128 vHorizX[8], vHorizY[8], vHorizZ[8], vHorizW[8];
    vTranspose3x8(vHorizX, prim[0].x, prim[1].x, vUnused);
    vTranspose3x8(vHorizY, prim[0].y, prim[1].y, vUnused);
    vTranspose3x8(vHorizZ, prim[0].z, prim[1].z, vUnused);
    vTranspose3x8(vHorizW, vRecipW0, vRecipW1, vUnused);

    // store render target array index
    OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH];
    if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex)
    {
        simdvector vRtai[2];
        pa.Assemble(VERTEX_RTAI_SLOT, vRtai);
        simdscalari vRtaii = _simd_castps_si(vRtai[0].x);
        _simd_store_si((simdscalari*)aRTAI, vRtaii);
    }
    else
    {
        _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si());
    }

    // scan remaining valid prims and bin each separately
    DWORD primIndex;
    while (_BitScanForward(&primIndex, primMask))
    {
        uint32_t linkageCount = state.backendState.numAttributes;
        uint32_t numScalarAttribs = linkageCount * 4;

        BE_WORK work;
        work.type = DRAW;

        // line work is described with the triangle work descriptor
        TRIANGLE_WORK_DESC &desc = work.desc.tri;

        desc.triFlags.frontFacing = 1;
        desc.triFlags.primID = pPrimID[primIndex];
        desc.triFlags.yMajor = (yMajorMask >> primIndex) & 1;
        desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];

        work.pfnWork = RasterizeLine;

        auto pArena = pDC->pArena;
        SWR_ASSERT(pArena != nullptr);

        // store active attribs
        desc.pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
        desc.numAttribs = linkageCount;
        pfnProcessAttribs(pDC, pa, primIndex, pPrimID[primIndex], desc.pAttribs);

        // store line vertex data (transposed x, y, z, 1/w per vertex)
        desc.pTriBuffer = (float*)pArena->AllocAligned(4 * 4 * sizeof(float), 16);
        _mm_store_ps(&desc.pTriBuffer[0], vHorizX[primIndex]);
        _mm_store_ps(&desc.pTriBuffer[4], vHorizY[primIndex]);
        _mm_store_ps(&desc.pTriBuffer[8], vHorizZ[primIndex]);
        _mm_store_ps(&desc.pTriBuffer[12], vHorizW[primIndex]);

        // store user clip distances
        if (rastState.clipDistanceMask)
        {
            uint32_t numClipDist = _mm_popcnt_u32(rastState.clipDistanceMask);
            desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 2 * sizeof(float));
            ProcessUserClipDist<2>(pa, primIndex, rastState.clipDistanceMask, desc.pUserClipBuffer);
        }

        // enqueue the work to every macrotile the line's bbox touches
        MacroTileMgr *pTileMgr = pDC->pTileMgr;
        for (uint32_t y = aMTTop[primIndex]; y <= aMTBottom[primIndex]; ++y)
        {
            for (uint32_t x = aMTLeft[primIndex]; x <= aMTRight[primIndex]; ++x)
            {
#if KNOB_ENABLE_TOSS_POINTS
                if (!KNOB_TOSS_SETUP_TRIS)
#endif
                {
                    pTileMgr->enqueue(x, y, &work);
                }
            }
        }

        primMask &= ~(1 << primIndex);
    }

endBinLines:

    RDTSC_STOP(FEBinLines, 1, 0);
}