src/gallium/drivers/swr/rasterizer/core/frontend.cpp

   1 /****************************************************************************
   2 * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
   3 *
   4 * Permission is hereby granted, free of charge, to any person obtaining a
   5 * copy of this software and associated documentation files (the "Software"),
   6 * to deal in the Software without restriction, including without limitation
   7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8 * and/or sell copies of the Software, and to permit persons to whom the
   9 * Software is furnished to do so, subject to the following conditions:
  10 *
  11 * The above copyright notice and this permission notice (including the next
  12 * paragraph) shall be included in all copies or substantial portions of the
  13 * Software.
  14 *
  15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21 * IN THE SOFTWARE.
  22 *
  23 * @file frontend.cpp
  24 *
  25 * @brief Implementation for Frontend which handles vertex processing,
  26 *        primitive assembly, clipping, binning, etc.
  27 *
  28 ******************************************************************************/
  29
  30 #include "api.h"
  31 #include "frontend.h"
  32 #include "backend.h"
  33 #include "context.h"
  34 #include "rdtsc_core.h"
  35 #include "rasterizer.h"
  36 #include "conservativeRast.h"
  37 #include "utils.h"
  38 #include "threads.h"
  39 #include "pa.h"
  40 #include "clip.h"
  41 #include "tilemgr.h"
  42 #include "tessellator.h"
  43 #include <limits>
  44
  45 //////////////////////////////////////////////////////////////////////////
  46 /// @brief Helper macro to generate a bitmask
  47 static INLINE uint32_t GenMask(uint32_t numBits)
  48 {
  49     SWR_ASSERT(numBits <= (sizeof(uint32_t) * 8), "Too many bits (%d) for %s", numBits, __FUNCTION__);
  50     return ((1U << numBits) - 1);
  51 }
  52
  53 //////////////////////////////////////////////////////////////////////////
  54 /// @brief Offsets added to post-viewport vertex positions based on
  55 /// raster state.
  56 static const simdscalar g_pixelOffsets[SWR_PIXEL_LOCATION_UL + 1] =
  57 {
  58     _simd_set1_ps(0.0f), // SWR_PIXEL_LOCATION_CENTER
  59     _simd_set1_ps(0.5f), // SWR_PIXEL_LOCATION_UL
  60 };
  61
  62 //////////////////////////////////////////////////////////////////////////
  63 /// @brief FE handler for SwrSync.
  64 /// @param pContext - pointer to SWR context.
  65 /// @param pDC - pointer to draw context.
  66 /// @param workerId - thread's worker id. Even thread has a unique id.
  67 /// @param pUserData - Pointer to user data passed back to sync callback.
  68 /// @todo This should go away when we switch this to use compute threading.
  69 void ProcessSync(
  70     SWR_CONTEXT *pContext,
  71     DRAW_CONTEXT *pDC,
  72     uint32_t workerId,
  73     void *pUserData)
  74 {
  75     SYNC_DESC *pSync = (SYNC_DESC*)pUserData;
  76     BE_WORK work;
  77     work.type = SYNC;
  78     work.pfnWork = ProcessSyncBE;
  79     work.desc.sync = *pSync;
  80
  81     MacroTileMgr *pTileMgr = pDC->pTileMgr;
  82     pTileMgr->enqueue(0, 0, &work);
  83 }
  84
  85 //////////////////////////////////////////////////////////////////////////
  86 /// @brief FE handler for SwrGetStats.
  87 /// @param pContext - pointer to SWR context.
  88 /// @param pDC - pointer to draw context.
  89 /// @param workerId - thread's worker id. Even thread has a unique id.
  90 /// @param pUserData - Pointer to user data passed back to stats callback.
  91 /// @todo This should go away when we switch this to use compute threading.
  92 void ProcessQueryStats(
  93     SWR_CONTEXT *pContext,
  94     DRAW_CONTEXT *pDC,
  95     uint32_t workerId,
  96     void *pUserData)
  97 {
  98     QUERY_DESC *pQueryStats = (QUERY_DESC*)pUserData;
  99     BE_WORK work;
 100     work.type = QUERYSTATS;
 101     work.pfnWork = ProcessQueryStatsBE;
 102     work.desc.queryStats = *pQueryStats;
 103
 104     MacroTileMgr *pTileMgr = pDC->pTileMgr;
 105     pTileMgr->enqueue(0, 0, &work);
 106 }
 107
 108 //////////////////////////////////////////////////////////////////////////
 109 /// @brief FE handler for SwrClearRenderTarget.
 110 /// @param pContext - pointer to SWR context.
 111 /// @param pDC - pointer to draw context.
 112 /// @param workerId - thread's worker id. Even thread has a unique id.
 113 /// @param pUserData - Pointer to user data passed back to clear callback.
 114 /// @todo This should go away when we switch this to use compute threading.
 115 void ProcessClear(
 116     SWR_CONTEXT *pContext,
 117     DRAW_CONTEXT *pDC,
 118     uint32_t workerId,
 119     void *pUserData)
 120 {
 121     CLEAR_DESC *pClear = (CLEAR_DESC*)pUserData;
 122     MacroTileMgr *pTileMgr = pDC->pTileMgr;
 123
 124     const API_STATE& state = GetApiState(pDC);
 125
 126     // queue a clear to each macro tile
 127     // compute macro tile bounds for the current scissor/viewport
 128     uint32_t macroTileLeft = state.scissorInFixedPoint.left / KNOB_MACROTILE_X_DIM_FIXED;
 129     uint32_t macroTileRight = state.scissorInFixedPoint.right / KNOB_MACROTILE_X_DIM_FIXED;
 130     uint32_t macroTileTop = state.scissorInFixedPoint.top / KNOB_MACROTILE_Y_DIM_FIXED;
 131     uint32_t macroTileBottom = state.scissorInFixedPoint.bottom / KNOB_MACROTILE_Y_DIM_FIXED;
 132
 133     BE_WORK work;
 134     work.type = CLEAR;
 135     work.pfnWork = ProcessClearBE;
 136     work.desc.clear = *pClear;
 137
 138     for (uint32_t y = macroTileTop; y <= macroTileBottom; ++y)
 139     {
 140         for (uint32_t x = macroTileLeft; x <= macroTileRight; ++x)
 141         {
 142             pTileMgr->enqueue(x, y, &work);
 143         }
 144     }
 145 }
 146
 147 //////////////////////////////////////////////////////////////////////////
 148 /// @brief FE handler for SwrStoreTiles.
 149 /// @param pContext - pointer to SWR context.
 150 /// @param pDC - pointer to draw context.
 151 /// @param workerId - thread's worker id. Even thread has a unique id.
 152 /// @param pUserData - Pointer to user data passed back to callback.
 153 /// @todo This should go away when we switch this to use compute threading.
 154 void ProcessStoreTiles(
 155     SWR_CONTEXT *pContext,
 156     DRAW_CONTEXT *pDC,
 157     uint32_t workerId,
 158     void *pUserData)
 159 {
 160     RDTSC_START(FEProcessStoreTiles);
 161     STORE_TILES_DESC *pStore = (STORE_TILES_DESC*)pUserData;
 162     MacroTileMgr *pTileMgr = pDC->pTileMgr;
 163
 164     const API_STATE& state = GetApiState(pDC);
 165
 166     // queue a store to each macro tile
 167     // compute macro tile bounds for the current render target
 168     const uint32_t macroWidth = KNOB_MACROTILE_X_DIM;
 169     const uint32_t macroHeight = KNOB_MACROTILE_Y_DIM;
 170
 171     uint32_t numMacroTilesX = ((uint32_t)state.vp[0].width + (uint32_t)state.vp[0].x + (macroWidth - 1)) / macroWidth;
 172     uint32_t numMacroTilesY = ((uint32_t)state.vp[0].height + (uint32_t)state.vp[0].y + (macroHeight - 1)) / macroHeight;
 173
 174     // store tiles
 175     BE_WORK work;
 176     work.type = STORETILES;
 177     work.pfnWork = ProcessStoreTileBE;
 178     work.desc.storeTiles = *pStore;
 179
 180     for (uint32_t x = 0; x < numMacroTilesX; ++x)
 181     {
 182         for (uint32_t y = 0; y < numMacroTilesY; ++y)
 183         {
 184             pTileMgr->enqueue(x, y, &work);
 185         }
 186     }
 187
 188     RDTSC_STOP(FEProcessStoreTiles, 0, pDC->drawId);
 189 }
 190
 191 //////////////////////////////////////////////////////////////////////////
 192 /// @brief FE handler for SwrInvalidateTiles.
 193 /// @param pContext - pointer to SWR context.
 194 /// @param pDC - pointer to draw context.
 195 /// @param workerId - thread's worker id. Even thread has a unique id.
 196 /// @param pUserData - Pointer to user data passed back to callback.
 197 /// @todo This should go away when we switch this to use compute threading.
 198 void ProcessDiscardInvalidateTiles(
 199     SWR_CONTEXT *pContext,
 200     DRAW_CONTEXT *pDC,
 201     uint32_t workerId,
 202     void *pUserData)
 203 {
 204     RDTSC_START(FEProcessInvalidateTiles);
 205     DISCARD_INVALIDATE_TILES_DESC *pInv = (DISCARD_INVALIDATE_TILES_DESC*)pUserData;
 206     MacroTileMgr *pTileMgr = pDC->pTileMgr;
 207
 208     SWR_RECT rect;
 209
 210     if (pInv->rect.top | pInv->rect.bottom | pInv->rect.right | pInv->rect.left)
 211     {
 212         // Valid rect
 213         rect = pInv->rect;
 214     }
 215     else
 216     {
 217         // Use viewport dimensions
 218         const API_STATE& state = GetApiState(pDC);
 219
 220         rect.left   = (uint32_t)state.vp[0].x;
 221         rect.right  = (uint32_t)(state.vp[0].x + state.vp[0].width);
 222         rect.top    = (uint32_t)state.vp[0].y;
 223         rect.bottom = (uint32_t)(state.vp[0].y + state.vp[0].height);
 224     }
 225
 226     // queue a store to each macro tile
 227     // compute macro tile bounds for the current render target
 228     uint32_t macroWidth = KNOB_MACROTILE_X_DIM;
 229     uint32_t macroHeight = KNOB_MACROTILE_Y_DIM;
 230
 231     // Setup region assuming full tiles
 232     uint32_t macroTileStartX = (rect.left + (macroWidth - 1)) / macroWidth;
 233     uint32_t macroTileStartY = (rect.top + (macroHeight - 1)) / macroHeight;
 234
 235     uint32_t macroTileEndX = rect.right / macroWidth;
 236     uint32_t macroTileEndY = rect.bottom / macroHeight;
 237
 238     if (pInv->fullTilesOnly == false)
 239     {
 240         // include partial tiles
 241         macroTileStartX = rect.left / macroWidth;
 242         macroTileStartY = rect.top / macroHeight;
 243
 244         macroTileEndX = (rect.right + macroWidth - 1) / macroWidth;
 245         macroTileEndY = (rect.bottom + macroHeight - 1) / macroHeight;
 246     }
 247
 248     SWR_ASSERT(macroTileEndX <= KNOB_NUM_HOT_TILES_X);
 249     SWR_ASSERT(macroTileEndY <= KNOB_NUM_HOT_TILES_Y);
 250
 251     macroTileEndX = std::min<uint32_t>(macroTileEndX, KNOB_NUM_HOT_TILES_X);
 252     macroTileEndY = std::min<uint32_t>(macroTileEndY, KNOB_NUM_HOT_TILES_Y);
 253
 254     // load tiles
 255     BE_WORK work;
 256     work.type = DISCARDINVALIDATETILES;
 257     work.pfnWork = ProcessDiscardInvalidateTilesBE;
 258     work.desc.discardInvalidateTiles = *pInv;
 259
 260     for (uint32_t x = macroTileStartX; x < macroTileEndX; ++x)
 261     {
 262         for (uint32_t y = macroTileStartY; y < macroTileEndY; ++y)
 263         {
 264             pTileMgr->enqueue(x, y, &work);
 265         }
 266     }
 267
 268     RDTSC_STOP(FEProcessInvalidateTiles, 0, pDC->drawId);
 269 }
 270
 271 //////////////////////////////////////////////////////////////////////////
 272 /// @brief Computes the number of primitives given the number of verts.
 273 /// @param mode - primitive topology for draw operation.
 274 /// @param numPrims - number of vertices or indices for draw.
 275 /// @todo Frontend needs to be refactored. This will go in appropriate place then.
 276 uint32_t GetNumPrims(
 277     PRIMITIVE_TOPOLOGY mode,
 278     uint32_t numPrims)
 279 {
 280     switch (mode)
 281     {
 282     case TOP_POINT_LIST: return numPrims;
 283     case TOP_TRIANGLE_LIST: return numPrims / 3;
 284     case TOP_TRIANGLE_STRIP: return numPrims < 3 ? 0 : numPrims - 2;
 285     case TOP_TRIANGLE_FAN: return numPrims < 3 ? 0 : numPrims - 2;
 286     case TOP_TRIANGLE_DISC: return numPrims < 2 ? 0 : numPrims - 1;
 287     case TOP_QUAD_LIST: return numPrims / 4;
 288     case TOP_QUAD_STRIP: return numPrims < 4 ? 0 : (numPrims - 2) / 2;
 289     case TOP_LINE_STRIP: return numPrims < 2 ? 0 : numPrims - 1;
 290     case TOP_LINE_LIST: return numPrims / 2;
 291     case TOP_LINE_LOOP: return numPrims;
 292     case TOP_RECT_LIST: return numPrims / 3;
 293     case TOP_LINE_LIST_ADJ: return numPrims / 4;
 294     case TOP_LISTSTRIP_ADJ: return numPrims < 3 ? 0 : numPrims - 3;
 295     case TOP_TRI_LIST_ADJ: return numPrims / 6;
 296     case TOP_TRI_STRIP_ADJ: return numPrims < 4 ? 0 : (numPrims / 2) - 2;
 297
 298     case TOP_PATCHLIST_1:
 299     case TOP_PATCHLIST_2:
 300     case TOP_PATCHLIST_3:
 301     case TOP_PATCHLIST_4:
 302     case TOP_PATCHLIST_5:
 303     case TOP_PATCHLIST_6:
 304     case TOP_PATCHLIST_7:
 305     case TOP_PATCHLIST_8:
 306     case TOP_PATCHLIST_9:
 307     case TOP_PATCHLIST_10:
 308     case TOP_PATCHLIST_11:
 309     case TOP_PATCHLIST_12:
 310     case TOP_PATCHLIST_13:
 311     case TOP_PATCHLIST_14:
 312     case TOP_PATCHLIST_15:
 313     case TOP_PATCHLIST_16:
 314     case TOP_PATCHLIST_17:
 315     case TOP_PATCHLIST_18:
 316     case TOP_PATCHLIST_19:
 317     case TOP_PATCHLIST_20:
 318     case TOP_PATCHLIST_21:
 319     case TOP_PATCHLIST_22:
 320     case TOP_PATCHLIST_23:
 321     case TOP_PATCHLIST_24:
 322     case TOP_PATCHLIST_25:
 323     case TOP_PATCHLIST_26:
 324     case TOP_PATCHLIST_27:
 325     case TOP_PATCHLIST_28:
 326     case TOP_PATCHLIST_29:
 327     case TOP_PATCHLIST_30:
 328     case TOP_PATCHLIST_31:
 329     case TOP_PATCHLIST_32:
 330         return numPrims / (mode - TOP_PATCHLIST_BASE);
 331
 332     case TOP_POLYGON:
 333     case TOP_POINT_LIST_BF:
 334     case TOP_LINE_STRIP_CONT:
 335     case TOP_LINE_STRIP_BF:
 336     case TOP_LINE_STRIP_CONT_BF:
 337     case TOP_TRIANGLE_FAN_NOSTIPPLE:
 338     case TOP_TRI_STRIP_REVERSE:
 339     case TOP_PATCHLIST_BASE:
 340     case TOP_UNKNOWN:
 341         SWR_ASSERT(false, "Unsupported topology: %d", mode);
 342         return 0;
 343     }
 344
 345     return 0;
 346 }
 347
 348 //////////////////////////////////////////////////////////////////////////
 349 /// @brief Computes the number of verts given the number of primitives.
 350 /// @param mode - primitive topology for draw operation.
 351 /// @param numPrims - number of primitives for draw.
 352 uint32_t GetNumVerts(
 353     PRIMITIVE_TOPOLOGY mode,
 354     uint32_t numPrims)
 355 {
 356     switch (mode)
 357     {
 358     case TOP_POINT_LIST: return numPrims;
 359     case TOP_TRIANGLE_LIST: return numPrims * 3;
 360     case TOP_TRIANGLE_STRIP: return numPrims ? numPrims + 2 : 0;
 361     case TOP_TRIANGLE_FAN: return numPrims ? numPrims + 2 : 0;
 362     case TOP_TRIANGLE_DISC: return numPrims ? numPrims + 1 : 0;
 363     case TOP_QUAD_LIST: return numPrims * 4;
 364     case TOP_QUAD_STRIP: return numPrims ? numPrims * 2 + 2 : 0;
 365     case TOP_LINE_STRIP: return numPrims ? numPrims + 1 : 0;
 366     case TOP_LINE_LIST: return numPrims * 2;
 367     case TOP_LINE_LOOP: return numPrims;
 368     case TOP_RECT_LIST: return numPrims * 3;
 369     case TOP_LINE_LIST_ADJ: return numPrims * 4;
 370     case TOP_LISTSTRIP_ADJ: return numPrims ? numPrims + 3 : 0;
 371     case TOP_TRI_LIST_ADJ: return numPrims * 6;
 372     case TOP_TRI_STRIP_ADJ: return numPrims ? (numPrims + 2) * 2 : 0;
 373
 374     case TOP_PATCHLIST_1:
 375     case TOP_PATCHLIST_2:
 376     case TOP_PATCHLIST_3:
 377     case TOP_PATCHLIST_4:
 378     case TOP_PATCHLIST_5:
 379     case TOP_PATCHLIST_6:
 380     case TOP_PATCHLIST_7:
 381     case TOP_PATCHLIST_8:
 382     case TOP_PATCHLIST_9:
 383     case TOP_PATCHLIST_10:
 384     case TOP_PATCHLIST_11:
 385     case TOP_PATCHLIST_12:
 386     case TOP_PATCHLIST_13:
 387     case TOP_PATCHLIST_14:
 388     case TOP_PATCHLIST_15:
 389     case TOP_PATCHLIST_16:
 390     case TOP_PATCHLIST_17:
 391     case TOP_PATCHLIST_18:
 392     case TOP_PATCHLIST_19:
 393     case TOP_PATCHLIST_20:
 394     case TOP_PATCHLIST_21:
 395     case TOP_PATCHLIST_22:
 396     case TOP_PATCHLIST_23:
 397     case TOP_PATCHLIST_24:
 398     case TOP_PATCHLIST_25:
 399     case TOP_PATCHLIST_26:
 400     case TOP_PATCHLIST_27:
 401     case TOP_PATCHLIST_28:
 402     case TOP_PATCHLIST_29:
 403     case TOP_PATCHLIST_30:
 404     case TOP_PATCHLIST_31:
 405     case TOP_PATCHLIST_32:
 406         return numPrims * (mode - TOP_PATCHLIST_BASE);
 407
 408     case TOP_POLYGON:
 409     case TOP_POINT_LIST_BF:
 410     case TOP_LINE_STRIP_CONT:
 411     case TOP_LINE_STRIP_BF:
 412     case TOP_LINE_STRIP_CONT_BF:
 413     case TOP_TRIANGLE_FAN_NOSTIPPLE:
 414     case TOP_TRI_STRIP_REVERSE:
 415     case TOP_PATCHLIST_BASE:
 416     case TOP_UNKNOWN:
 417         SWR_ASSERT(false, "Unsupported topology: %d", mode);
 418         return 0;
 419     }
 420
 421     return 0;
 422 }
 423
 424 //////////////////////////////////////////////////////////////////////////
 425 /// @brief Return number of verts per primitive.
 426 /// @param topology - topology
 427 /// @param includeAdjVerts - include adjacent verts in primitive vertices
 428 INLINE uint32_t NumVertsPerPrim(PRIMITIVE_TOPOLOGY topology, bool includeAdjVerts)
 429 {
 430     uint32_t numVerts = 0;
 431     switch (topology)
 432     {
 433     case TOP_POINT_LIST:
 434     case TOP_POINT_LIST_BF:
 435         numVerts = 1;
 436         break;
 437     case TOP_LINE_LIST:
 438     case TOP_LINE_STRIP:
 439     case TOP_LINE_LIST_ADJ:
 440     case TOP_LINE_LOOP:
 441     case TOP_LINE_STRIP_CONT:
 442     case TOP_LINE_STRIP_BF:
 443     case TOP_LISTSTRIP_ADJ:
 444         numVerts = 2;
 445         break;
 446     case TOP_TRIANGLE_LIST:
 447     case TOP_TRIANGLE_STRIP:
 448     case TOP_TRIANGLE_FAN:
 449     case TOP_TRI_LIST_ADJ:
 450     case TOP_TRI_STRIP_ADJ:
 451     case TOP_TRI_STRIP_REVERSE:
 452     case TOP_RECT_LIST:
 453         numVerts = 3;
 454         break;
 455     case TOP_QUAD_LIST:
 456     case TOP_QUAD_STRIP:
 457         numVerts = 4;
 458         break;
 459     case TOP_PATCHLIST_1:
 460     case TOP_PATCHLIST_2:
 461     case TOP_PATCHLIST_3:
 462     case TOP_PATCHLIST_4:
 463     case TOP_PATCHLIST_5:
 464     case TOP_PATCHLIST_6:
 465     case TOP_PATCHLIST_7:
 466     case TOP_PATCHLIST_8:
 467     case TOP_PATCHLIST_9:
 468     case TOP_PATCHLIST_10:
 469     case TOP_PATCHLIST_11:
 470     case TOP_PATCHLIST_12:
 471     case TOP_PATCHLIST_13:
 472     case TOP_PATCHLIST_14:
 473     case TOP_PATCHLIST_15:
 474     case TOP_PATCHLIST_16:
 475     case TOP_PATCHLIST_17:
 476     case TOP_PATCHLIST_18:
 477     case TOP_PATCHLIST_19:
 478     case TOP_PATCHLIST_20:
 479     case TOP_PATCHLIST_21:
 480     case TOP_PATCHLIST_22:
 481     case TOP_PATCHLIST_23:
 482     case TOP_PATCHLIST_24:
 483     case TOP_PATCHLIST_25:
 484     case TOP_PATCHLIST_26:
 485     case TOP_PATCHLIST_27:
 486     case TOP_PATCHLIST_28:
 487     case TOP_PATCHLIST_29:
 488     case TOP_PATCHLIST_30:
 489     case TOP_PATCHLIST_31:
 490     case TOP_PATCHLIST_32:
 491         numVerts = topology - TOP_PATCHLIST_BASE;
 492         break;
 493     default:
 494         SWR_ASSERT(false, "Unsupported topology: %d", topology);
 495         break;
 496     }
 497
 498     if (includeAdjVerts)
 499     {
 500         switch (topology)
 501         {
 502         case TOP_LISTSTRIP_ADJ:
 503         case TOP_LINE_LIST_ADJ: numVerts = 4; break;
 504         case TOP_TRI_STRIP_ADJ:
 505         case TOP_TRI_LIST_ADJ: numVerts = 6; break;
 506         default: break;
 507         }
 508     }
 509
 510     return numVerts;
 511 }
 512
 513 //////////////////////////////////////////////////////////////////////////
 514 /// @brief Generate mask from remaining work.
 515 /// @param numWorkItems - Number of items being worked on by a SIMD.
 516 static INLINE simdscalari GenerateMask(uint32_t numItemsRemaining)
 517 {
 518     uint32_t numActive = (numItemsRemaining >= KNOB_SIMD_WIDTH) ? KNOB_SIMD_WIDTH : numItemsRemaining;
 519     uint32_t mask = (numActive > 0) ? ((1 << numActive) - 1) : 0;
 520     return _simd_castps_si(vMask(mask));
 521 }
 522
 523 //////////////////////////////////////////////////////////////////////////
 524 /// @brief StreamOut - Streams vertex data out to SO buffers.
 525 ///        Generally, we are only streaming out a SIMDs worth of triangles.
 526 /// @param pDC - pointer to draw context.
 527 /// @param workerId - thread's worker id. Even thread has a unique id.
 528 /// @param numPrims - Number of prims to streamout (e.g. points, lines, tris)
 529 static void StreamOut(
 530     DRAW_CONTEXT* pDC,
 531     PA_STATE& pa,
 532     uint32_t workerId,
 533     uint32_t* pPrimData,
 534     uint32_t streamIndex)
 535 {
 536     RDTSC_START(FEStreamout);
 537
 538     SWR_CONTEXT* pContext = pDC->pContext;
 539
 540     const API_STATE& state = GetApiState(pDC);
 541     const SWR_STREAMOUT_STATE &soState = state.soState;
 542
 543     uint32_t soVertsPerPrim = NumVertsPerPrim(pa.binTopology, false);
 544
 545     // The pPrimData buffer is sparse in that we allocate memory for all 32 attributes for each vertex.
 546     uint32_t primDataDwordVertexStride = (KNOB_NUM_ATTRIBUTES * sizeof(float) * 4) / sizeof(uint32_t);
 547
 548     SWR_STREAMOUT_CONTEXT soContext = { 0 };
 549
 550     // Setup buffer state pointers.
 551     for (uint32_t i = 0; i < 4; ++i)
 552     {
 553         soContext.pBuffer[i] = &state.soBuffer[i];
 554     }
 555
 556     uint32_t numPrims = pa.NumPrims();
 557     for (uint32_t primIndex = 0; primIndex < numPrims; ++primIndex)
 558     {
 559         DWORD slot = 0;
 560         uint32_t soMask = soState.streamMasks[streamIndex];
 561
 562         // Write all entries into primitive data buffer for SOS.
 563         while (_BitScanForward(&slot, soMask))
 564         {
 565             __m128 attrib[MAX_NUM_VERTS_PER_PRIM];    // prim attribs (always 4 wide)
 566             uint32_t paSlot = slot + VERTEX_ATTRIB_START_SLOT;
 567             pa.AssembleSingle(paSlot, primIndex, attrib);
 568
 569             // Attribute offset is relative offset from start of vertex.
 570             // Note that attributes start at slot 1 in the PA buffer. We need to write this
 571             // to prim data starting at slot 0. Which is why we do (slot - 1).
 572             // Also note: GL works slightly differently, and needs slot 0
 573             uint32_t primDataAttribOffset = slot * sizeof(float) * 4 / sizeof(uint32_t);
 574
 575             // Store each vertex's attrib at appropriate locations in pPrimData buffer.
 576             for (uint32_t v = 0; v < soVertsPerPrim; ++v)
 577             {
 578                 uint32_t* pPrimDataAttrib = pPrimData + primDataAttribOffset + (v * primDataDwordVertexStride);
 579
 580                 _mm_store_ps((float*)pPrimDataAttrib, attrib[v]);
 581             }
 582             soMask &= ~(1 << slot);
 583         }
 584
 585         // Update pPrimData pointer
 586         soContext.pPrimData = pPrimData;
 587
 588         // Call SOS
 589         SWR_ASSERT(state.pfnSoFunc[streamIndex] != nullptr, "Trying to execute uninitialized streamout jit function.");
 590         state.pfnSoFunc[streamIndex](soContext);
 591     }
 592
 593     // Update SO write offset. The driver provides memory for the update.
 594     for (uint32_t i = 0; i < 4; ++i)
 595     {
 596         if (state.soBuffer[i].pWriteOffset)
 597         {
 598             *state.soBuffer[i].pWriteOffset = soContext.pBuffer[i]->streamOffset * sizeof(uint32_t);
 599
 600             // The SOS increments the existing write offset. So we don't want to increment
 601             // the SoWriteOffset stat using an absolute offset instead of relative.
 602             SET_STAT(SoWriteOffset[i], soContext.pBuffer[i]->streamOffset);
 603         }
 604     }
 605
 606     UPDATE_STAT(SoPrimStorageNeeded[streamIndex], soContext.numPrimStorageNeeded);
 607     UPDATE_STAT(SoNumPrimsWritten[streamIndex], soContext.numPrimsWritten);
 608
 609     RDTSC_STOP(FEStreamout, 1, 0);
 610 }
 611
 612 //////////////////////////////////////////////////////////////////////////
 613 /// @brief Computes number of invocations. The current index represents
 614 ///        the start of the SIMD. The max index represents how much work
 615 ///        items are remaining. If there is less then a SIMD's left of work
 616 ///        then return the remaining amount of work.
 617 /// @param curIndex - The start index for the SIMD.
 618 /// @param maxIndex - The last index for all work items.
 619 static INLINE uint32_t GetNumInvocations(
 620     uint32_t curIndex,
 621     uint32_t maxIndex)
 622 {
 623     uint32_t remainder = (maxIndex - curIndex);
 624     return (remainder >= KNOB_SIMD_WIDTH) ? KNOB_SIMD_WIDTH : remainder;
 625 }
 626
 627 //////////////////////////////////////////////////////////////////////////
 628 /// @brief Converts a streamId buffer to a cut buffer for the given stream id.
 629 ///        The geometry shader will loop over each active streamout buffer, assembling
 630 ///        primitives for the downstream stages. When multistream output is enabled,
 631 ///        the generated stream ID buffer from the GS needs to be converted to a cut
 632 ///        buffer for the primitive assembler.
 633 /// @param stream - stream id to generate the cut buffer for
 634 /// @param pStreamIdBase - pointer to the stream ID buffer
 635 /// @param numEmittedVerts - Number of total verts emitted by the GS
 636 /// @param pCutBuffer - output buffer to write cuts to
 637 void ProcessStreamIdBuffer(uint32_t stream, uint8_t* pStreamIdBase, uint32_t numEmittedVerts, uint8_t *pCutBuffer)
 638 {
 639     SWR_ASSERT(stream < MAX_SO_STREAMS);
 640
 641     uint32_t numInputBytes = (numEmittedVerts * 2  + 7) / 8;
 642     uint32_t numOutputBytes = std::max(numInputBytes / 2, 1U);
 643
 644     for (uint32_t b = 0; b < numOutputBytes; ++b)
 645     {
 646         uint8_t curInputByte = pStreamIdBase[2*b];
 647         uint8_t outByte = 0;
 648         for (uint32_t i = 0; i < 4; ++i)
 649         {
 650             if ((curInputByte & 0x3) != stream)
 651             {
 652                 outByte |= (1 << i);
 653             }
 654             curInputByte >>= 2;
 655         }
 656
 657         curInputByte = pStreamIdBase[2 * b + 1];
 658         for (uint32_t i = 0; i < 4; ++i)
 659         {
 660             if ((curInputByte & 0x3) != stream)
 661             {
 662                 outByte |= (1 << (i + 4));
 663             }
 664             curInputByte >>= 2;
 665         }
 666
 667         *pCutBuffer++ = outByte;
 668     }
 669 }
 670
 671 THREAD SWR_GS_CONTEXT tlsGsContext;
 672
 673 //////////////////////////////////////////////////////////////////////////
 674 /// @brief Implements GS stage.
 675 /// @param pDC - pointer to draw context.
 676 /// @param workerId - thread's worker id. Even thread has a unique id.
 677 /// @param pa - The primitive assembly object.
 678 /// @param pGsOut - output stream for GS
 679 template <
 680     typename HasStreamOutT,
 681     typename HasRastT>
 682 static void GeometryShaderStage(
 683     DRAW_CONTEXT *pDC,
 684     uint32_t workerId,
 685     PA_STATE& pa,
 686     void* pGsOut,
 687     void* pCutBuffer,
 688     void* pStreamCutBuffer,
 689     uint32_t* pSoPrimData,
 690     simdscalari primID)
 691 {
 692     RDTSC_START(FEGeometryShader);
 693
 694     SWR_CONTEXT* pContext = pDC->pContext;
 695
 696     const API_STATE& state = GetApiState(pDC);
 697     const SWR_GS_STATE* pState = &state.gsState;
 698
 699     SWR_ASSERT(pGsOut != nullptr, "GS output buffer should be initialized");
 700     SWR_ASSERT(pCutBuffer != nullptr, "GS output cut buffer should be initialized");
 701
 702     tlsGsContext.pStream = (uint8_t*)pGsOut;
 703     tlsGsContext.pCutOrStreamIdBuffer = (uint8_t*)pCutBuffer;
 704     tlsGsContext.PrimitiveID = primID;
 705
 706     uint32_t numVertsPerPrim = NumVertsPerPrim(pa.binTopology, true);
 707     simdvector attrib[MAX_ATTRIBUTES];
 708
 709     // assemble all attributes for the input primitive
 710     for (uint32_t slot = 0; slot < pState->numInputAttribs; ++slot)
 711     {
 712         uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + slot;
 713         pa.Assemble(attribSlot, attrib);
 714
 715         for (uint32_t i = 0; i < numVertsPerPrim; ++i)
 716         {
 717             tlsGsContext.vert[i].attrib[attribSlot] = attrib[i];
 718         }
 719     }
 720
 721     // assemble position
 722     pa.Assemble(VERTEX_POSITION_SLOT, attrib);
 723     for (uint32_t i = 0; i < numVertsPerPrim; ++i)
 724     {
 725         tlsGsContext.vert[i].attrib[VERTEX_POSITION_SLOT] = attrib[i];
 726     }
 727
 728     const uint32_t vertexStride = sizeof(simdvertex);
 729     const uint32_t numSimdBatches = (state.gsState.maxNumVerts + KNOB_SIMD_WIDTH - 1) / KNOB_SIMD_WIDTH;
 730     const uint32_t inputPrimStride = numSimdBatches * vertexStride;
 731     const uint32_t instanceStride = inputPrimStride * KNOB_SIMD_WIDTH;
 732     uint32_t cutPrimStride;
 733     uint32_t cutInstanceStride;
 734
 735     if (pState->isSingleStream)
 736     {
 737         cutPrimStride = (state.gsState.maxNumVerts + 7) / 8;
 738         cutInstanceStride = cutPrimStride * KNOB_SIMD_WIDTH;
 739     }
 740     else
 741     {
 742         cutPrimStride = AlignUp(state.gsState.maxNumVerts * 2 / 8, 4);
 743         cutInstanceStride = cutPrimStride * KNOB_SIMD_WIDTH;
 744     }
 745
 746     // record valid prims from the frontend to avoid over binning the newly generated
 747     // prims from the GS
 748     uint32_t numInputPrims = pa.NumPrims();
 749
 750     for (uint32_t instance = 0; instance < pState->instanceCount; ++instance)
 751     {
 752         tlsGsContext.InstanceID = instance;
 753         tlsGsContext.mask = GenerateMask(numInputPrims);
 754
 755         // execute the geometry shader
 756         state.pfnGsFunc(GetPrivateState(pDC), &tlsGsContext);
 757
 758         tlsGsContext.pStream += instanceStride;
 759         tlsGsContext.pCutOrStreamIdBuffer += cutInstanceStride;
 760     }
 761
 762     // set up new binner and state for the GS output topology
 763     PFN_PROCESS_PRIMS pfnClipFunc = nullptr;
 764     if (HasRastT::value)
 765     {
 766         switch (pState->outputTopology)
 767         {
 768         case TOP_TRIANGLE_STRIP:    pfnClipFunc = ClipTriangles; break;
 769         case TOP_LINE_STRIP:        pfnClipFunc = ClipLines; break;
 770         case TOP_POINT_LIST:        pfnClipFunc = ClipPoints; break;
 771         default: SWR_ASSERT(false, "Unexpected GS output topology: %d", pState->outputTopology);
 772         }
 773     }
 774
 775     // foreach input prim:
 776     // - setup a new PA based on the emitted verts for that prim
 777     // - loop over the new verts, calling PA to assemble each prim
 778     uint32_t* pVertexCount = (uint32_t*)&tlsGsContext.vertexCount;
 779     uint32_t* pPrimitiveId = (uint32_t*)&primID;
 780
 781     uint32_t totalPrimsGenerated = 0;
 782     for (uint32_t inputPrim = 0; inputPrim < numInputPrims; ++inputPrim)
 783     {
 784         uint8_t* pInstanceBase = (uint8_t*)pGsOut + inputPrim * inputPrimStride;
 785         uint8_t* pCutBufferBase = (uint8_t*)pCutBuffer + inputPrim * cutPrimStride;
 786         for (uint32_t instance = 0; instance < pState->instanceCount; ++instance)
 787         {
 788             uint32_t numEmittedVerts = pVertexCount[inputPrim];
 789             if (numEmittedVerts == 0)
 790             {
 791                 continue;
 792             }
 793
 794             uint8_t* pBase = pInstanceBase + instance * instanceStride;
 795             uint8_t* pCutBase = pCutBufferBase + instance * cutInstanceStride;
 796
 797             DWORD numAttribs;
 798             if (_BitScanReverse(&numAttribs, state.feAttribMask))
 799             {
 800                 numAttribs++;
 801             }
 802             else
 803             {
 804                 numAttribs = 0;
 805             }
 806
 807             for (uint32_t stream = 0; stream < MAX_SO_STREAMS; ++stream)
 808             {
 809                 bool processCutVerts = false;
 810
 811                 uint8_t* pCutBuffer = pCutBase;
 812
 813                 // assign default stream ID, only relevant when GS is outputting a single stream
 814                 uint32_t streamID = 0;
 815                 if (pState->isSingleStream)
 816                 {
 817                     processCutVerts = true;
 818                     streamID = pState->singleStreamID;
 819                     if (streamID != stream) continue;
 820                 }
 821                 else
 822                 {
 823                     // early exit if this stream is not enabled for streamout
 824                     if (HasStreamOutT::value && !state.soState.streamEnable[stream])
 825                     {
 826                         continue;
 827                     }
 828
 829                     // multi-stream output, need to translate StreamID buffer to a cut buffer
 830                     ProcessStreamIdBuffer(stream, pCutBase, numEmittedVerts, (uint8_t*)pStreamCutBuffer);
 831                     pCutBuffer = (uint8_t*)pStreamCutBuffer;
 832                     processCutVerts = false;
 833                 }
 834
 835                 PA_STATE_CUT gsPa(pDC, pBase, numEmittedVerts, pCutBuffer, numEmittedVerts, numAttribs, pState->outputTopology, processCutVerts);
 836
 837                 while (gsPa.GetNextStreamOutput())
 838                 {
 839                     do
 840                     {
 841                         bool assemble = gsPa.Assemble(VERTEX_POSITION_SLOT, attrib);
 842
 843                         if (assemble)
 844                         {
 845                             totalPrimsGenerated += gsPa.NumPrims();
 846
 847                             if (HasStreamOutT::value)
 848                             {
 849                                 StreamOut(pDC, gsPa, workerId, pSoPrimData, stream);
 850                             }
 851
 852                             if (HasRastT::value && state.soState.streamToRasterizer == stream)
 853                             {
 854                                 simdscalari vPrimId;
 855                                 // pull primitiveID from the GS output if available
 856                                 if (state.gsState.emitsPrimitiveID)
 857                                 {
 858                                     simdvector primIdAttrib[3];
 859                                     gsPa.Assemble(VERTEX_PRIMID_SLOT, primIdAttrib);
 860                                     vPrimId = _simd_castps_si(primIdAttrib[0].x);
 861                                 }
 862                                 else
 863                                 {
 864                                     vPrimId = _simd_set1_epi32(pPrimitiveId[inputPrim]);
 865                                 }
 866
 867                                 pfnClipFunc(pDC, gsPa, workerId, attrib, GenMask(gsPa.NumPrims()), vPrimId);
 868                             }
 869                         }
 870                     } while (gsPa.NextPrim());
 871                 }
 872             }
 873         }
 874     }
 875
 876     // update GS pipeline stats
 877     UPDATE_STAT(GsInvocations, numInputPrims * pState->instanceCount);
 878     UPDATE_STAT(GsPrimitives, totalPrimsGenerated);
 879
 880     RDTSC_STOP(FEGeometryShader, 1, 0);
 881 }
 882
 883 //////////////////////////////////////////////////////////////////////////
 884 /// @brief Reserve the geometry shader working buffers from the draw's arena.
 885 /// @param pDC - pointer to draw context; its arena provides the storage.
 886 /// @param state - API state; only state.gsState is read.
 887 /// @param ppGsOut - receives the GS output vertex buffer allocation
 888 /// @param ppCutBuffer - receives the cut buffer (single stream) or the stream ID buffer (multi-stream)
 889 /// @param ppStreamCutBuffer - receives the temporary per-stream cut buffer, or nullptr for a single stream GS
 890 static INLINE void AllocateGsBuffers(DRAW_CONTEXT* pDC, const API_STATE& state, void** ppGsOut, void** ppCutBuffer,
 891     void **ppStreamCutBuffer)
 892 {
 893     const auto& gs = state.gsState;
 894     auto pArena = pDC->pArena;
 895     SWR_ASSERT(pArena != nullptr);
 896     SWR_ASSERT(gs.gsEnable);
 897     const auto bufferAlign = KNOB_SIMD_WIDTH * sizeof(float);  // alignment shared by every GS buffer
 898
 899     // GS output verts: whole simdvertex batches per input prim, for every SIMD lane and GS instance
 900     // @todo pack attribs
 901     // @todo support multiple streams
 902     const uint32_t simdVertexBytes = sizeof(simdvertex);
 903     const uint32_t batchesPerPrim = (gs.maxNumVerts + KNOB_SIMD_WIDTH - 1) / KNOB_SIMD_WIDTH;
 904     const uint32_t gsOutBytes = gs.instanceCount * batchesPerPrim * simdVertexBytes * KNOB_SIMD_WIDTH;
 905     *ppGsOut = pArena->AllocAligned(gsOutBytes, bufferAlign);
 906
 907     // cut bitfield: one bit per possible output vertex, per SIMD lane, per GS instance
 908     const uint32_t cutBytesPerPrim = (gs.maxNumVerts + 7) / 8;
 909     const uint32_t cutBytes = cutBytesPerPrim * gs.instanceCount * KNOB_SIMD_WIDTH;
 910
 911     if (gs.isSingleStream)
 912     {
 913         *ppCutBuffer = pArena->AllocAligned(cutBytes, bufferAlign);
 914         *ppStreamCutBuffer = nullptr;
 915         return;
 916     }
 917
 918     // multi-stream: the main buffer holds stream IDs (two bits per vertex); the cut-sized one is the temporary per-stream cut buffer
 919     const uint32_t streamIdBytesPerPrim = AlignUp(gs.maxNumVerts * 2 / 8, 4);
 920     const uint32_t streamIdBytes = streamIdBytesPerPrim * gs.instanceCount * KNOB_SIMD_WIDTH;
 921     *ppCutBuffer = pArena->AllocAligned(streamIdBytes, bufferAlign);
 922     *ppStreamCutBuffer = pArena->AllocAligned(cutBytes, bufferAlign);
 923 }
 924
 925 //////////////////////////////////////////////////////////////////////////
 926 /// @brief Contains all data generated by the HS and passed to the
 927 /// tessellator and DS, plus the tessellator context and DS output memory that TessellationStages keeps between calls.
 928 struct TessellationThreadLocalData
 929 {
 930     SWR_HS_CONTEXT hsContext;                // hull shader context, filled in by TessellationStages before the HS runs
 931     ScalarPatch patchData[KNOB_SIMD_WIDTH];  // HS output patches, one per primitive of a SIMD (hsContext.pCPout points here)
 932     void* pTxCtx;                            // memory handed to TSInitCtx for the tessellator context
 933     size_t tsCtxSize;                        // size passed to TSInitCtx and used when allocating pTxCtx
 934
 935     simdscalar* pDSOutput;                   // domain shader output buffer, regrown on demand by TessellationStages
 936     size_t numDSOutputVectors;               // current capacity of pDSOutput, counted in simdvectors
 937 };
 938
 939 THREAD TessellationThreadLocalData* gt_pTessellationThreadData = nullptr;  // per-thread tessellation data; stays null until AllocateTessellationData runs on the thread
 940
 941 //////////////////////////////////////////////////////////////////////////
 942 /// @brief Allocate and zero the tessellation data for this worker thread on first use.
 943 /// @param pContext - pointer to SWR context (not used by this function).
 944 INLINE static void AllocateTessellationData(SWR_CONTEXT* pContext)
 945 {
 946     /// @TODO - Don't use thread local storage.  Use Worker local storage instead.
 947     if (gt_pTessellationThreadData != nullptr) { return; }  // already allocated for this thread
 948     void* pData = AlignedMalloc(sizeof(TessellationThreadLocalData), 64);
 949     SWR_ASSERT(pData != nullptr);
 950     if (pData == nullptr) { return; }  // never memset a failed allocation; the thread pointer stays null
 951     memset(pData, 0, sizeof(TessellationThreadLocalData));
 952     gt_pTessellationThreadData = (TessellationThreadLocalData*)pData;
 953 }
 954
 955 //////////////////////////////////////////////////////////////////////////
 956 /// @brief Implements Tessellation Stages: runs the hull shader (HS) on the assembled patches, then for each patch runs the tessellator and the domain shader (DS) and hands the resulting primitives to the GS, stream-out and/or the clipper.
 957 /// @param pDC - pointer to draw context.
 958 /// @param workerId - thread's worker id. Every thread has a unique id.
 959 /// @param pa - The primitive assembly object holding the assembled input patches.
 960 /// @param pGsOut - output stream for GS (only forwarded to GeometryShaderStage)
 961 template <
 962     typename HasGeometryShaderT,  // ::value - route the tessellated prims through GeometryShaderStage
 963     typename HasStreamOutT,       // ::value - stream the tessellated prims out; also forwarded to GeometryShaderStage
 964     typename HasRastT>            // ::value - clip the tessellated prims for rasterization; also forwarded to GeometryShaderStage
 965 static void TessellationStages(
 966     DRAW_CONTEXT *pDC,
 967     uint32_t workerId,
 968     PA_STATE& pa,
 969     void* pGsOut,
 970     void* pCutBuffer,             // GS cut buffer, only forwarded to GeometryShaderStage
 971     void* pCutStreamBuffer,       // GS per-stream cut buffer, only forwarded to GeometryShaderStage
 972     uint32_t* pSoPrimData,        // stream-out primitive data, forwarded to StreamOut / GeometryShaderStage
 973     simdscalari primID)           // primitive IDs of the input patches, one 32-bit value per SIMD lane
 974 {
 975     const API_STATE& state = GetApiState(pDC);
 976     const SWR_TS_STATE& tsState = state.tsState;
 977     SWR_CONTEXT *pContext = pDC->pContext; // Needed for UPDATE_STAT macro
 978
 979     SWR_ASSERT(gt_pTessellationThreadData);  // must already be allocated for this thread (ProcessDraw calls AllocateTessellationData)
 980
 981     HANDLE tsCtx = TSInitCtx(
 982         tsState.domain,
 983         tsState.partitioning,
 984         tsState.tsOutputTopology,
 985         gt_pTessellationThreadData->pTxCtx,
 986         gt_pTessellationThreadData->tsCtxSize);
 987     if (tsCtx == nullptr)  // first attempt failed: allocate tsCtxSize bytes and retry. NOTE(review): presumably TSInitCtx reports the required size through tsCtxSize - confirm; a previously held pTxCtx is not freed here
 988     {
 989         gt_pTessellationThreadData->pTxCtx = AlignedMalloc(gt_pTessellationThreadData->tsCtxSize, 64);
 990         tsCtx = TSInitCtx(
 991             tsState.domain,
 992             tsState.partitioning,
 993             tsState.tsOutputTopology,
 994             gt_pTessellationThreadData->pTxCtx,
 995             gt_pTessellationThreadData->tsCtxSize);
 996     }
 997     SWR_ASSERT(tsCtx);
 998
 999     PFN_PROCESS_PRIMS pfnClipFunc = nullptr;  // clipper matching the post-DS topology; stays null when not rasterizing
1000     if (HasRastT::value)
1001     {
1002         switch (tsState.postDSTopology)
1003         {
1004         case TOP_TRIANGLE_LIST: pfnClipFunc = ClipTriangles; break;
1005         case TOP_LINE_LIST:     pfnClipFunc = ClipLines; break;
1006         case TOP_POINT_LIST:    pfnClipFunc = ClipPoints; break;
1007         default: SWR_ASSERT(false, "Unexpected DS output topology: %d", tsState.postDSTopology);
1008         }
1009     }
1010
1011     SWR_HS_CONTEXT& hsContext = gt_pTessellationThreadData->hsContext;
1012     hsContext.pCPout = gt_pTessellationThreadData->patchData;  // the HS writes its output patches into the thread's patch storage
1013     hsContext.PrimitiveID = primID;
1014
1015     uint32_t numVertsPerPrim = NumVertsPerPrim(pa.binTopology, false);
1016     // Max storage for one attribute for an entire simdprimitive
1017     simdvector simdattrib[MAX_NUM_VERTS_PER_PRIM];
1018
1019     // assemble all attributes for the input primitives and copy them into the HS input vertices
1020     for (uint32_t slot = 0; slot < tsState.numHsInputAttribs; ++slot)
1021     {
1022         uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + slot;
1023         pa.Assemble(attribSlot, simdattrib);
1024
1025         for (uint32_t i = 0; i < numVertsPerPrim; ++i)
1026         {
1027             hsContext.vert[i].attrib[attribSlot] = simdattrib[i];
1028         }
1029     }
1030
1031 #if defined(_DEBUG)
1032     memset(hsContext.pCPout, 0x90, sizeof(ScalarPatch) * KNOB_SIMD_WIDTH);  // debug builds: fill the HS output with a recognizable byte pattern
1033 #endif
1034
1035     uint32_t numPrims = pa.NumPrims();
1036     hsContext.mask = GenerateMask(numPrims);  // presumably enables one lane per assembled patch - confirm against GenerateMask
1037
1038     // Run the HS
1039     RDTSC_START(FEHullShader);
1040     state.pfnHsFunc(GetPrivateState(pDC), &hsContext);
1041     RDTSC_STOP(FEHullShader, 0, 0);
1042
1043     UPDATE_STAT(HsInvocations, numPrims);
1044
1045     const uint32_t* pPrimId = (const uint32_t*)&primID;  // view the SIMD primID value as an array of 32-bit IDs
1046
1047     for (uint32_t p = 0; p < numPrims; ++p)  // tessellate, domain-shade and emit one patch at a time
1048     {
1049         // Run Tessellator
1050         SWR_TS_TESSELLATED_DATA tsData = { 0 };
1051         RDTSC_START(FETessellation);
1052         TSTessellate(tsCtx, hsContext.pCPout[p].tessFactors, tsData);
1053         RDTSC_STOP(FETessellation, 0, 0);
1054
1055         if (tsData.NumPrimitives == 0)  // the tessellator produced nothing for this patch
1056         {
1057             continue;
1058         }
1059         SWR_ASSERT(tsData.NumDomainPoints);
1060
1061         // Allocate DS Output memory
1062         uint32_t requiredDSVectorInvocations = AlignUp(tsData.NumDomainPoints, KNOB_SIMD_WIDTH) / KNOB_SIMD_WIDTH;  // SIMD-wide DS invocations needed to cover all domain points
1063         size_t requiredDSOutputVectors = requiredDSVectorInvocations * tsState.numDsOutputAttribs;
1064         size_t requiredAllocSize = sizeof(simdvector) * requiredDSOutputVectors;
1065         if (requiredDSOutputVectors > gt_pTessellationThreadData->numDSOutputVectors)  // grow the thread's DS output buffer; old contents are discarded
1066         {
1067             AlignedFree(gt_pTessellationThreadData->pDSOutput);
1068             gt_pTessellationThreadData->pDSOutput = (simdscalar*)AlignedMalloc(requiredAllocSize, 64);
1069             gt_pTessellationThreadData->numDSOutputVectors = requiredDSOutputVectors;
1070         }
1071         SWR_ASSERT(gt_pTessellationThreadData->pDSOutput);
1072         SWR_ASSERT(gt_pTessellationThreadData->numDSOutputVectors >= requiredDSOutputVectors);
1073
1074 #if defined(_DEBUG)
1075         memset(gt_pTessellationThreadData->pDSOutput, 0x90, requiredAllocSize);
1076 #endif
1077
1078         // Run Domain Shader
1079         SWR_DS_CONTEXT dsContext;
1080         dsContext.PrimitiveID = pPrimId[p];
1081         dsContext.pCpIn = &hsContext.pCPout[p];
1082         dsContext.pDomainU = (simdscalar*)tsData.pDomainPointsU;
1083         dsContext.pDomainV = (simdscalar*)tsData.pDomainPointsV;
1084         dsContext.pOutputData = gt_pTessellationThreadData->pDSOutput;
1085         dsContext.vectorStride = requiredDSVectorInvocations;
1086
1087         uint32_t dsInvocations = 0;  // domain points covered by the DS invocations issued so far
1088
1089         for (dsContext.vectorOffset = 0; dsContext.vectorOffset < requiredDSVectorInvocations; ++dsContext.vectorOffset)
1090         {
1091             dsContext.mask = GenerateMask(tsData.NumDomainPoints - dsInvocations);  // mask built from the number of domain points still to shade
1092
1093             RDTSC_START(FEDomainShader);
1094             state.pfnDsFunc(GetPrivateState(pDC), &dsContext);
1095             RDTSC_STOP(FEDomainShader, 0, 0);
1096
1097             dsInvocations += KNOB_SIMD_WIDTH;
1098         }
1099         UPDATE_STAT(DsInvocations, tsData.NumDomainPoints);
1100
1101         PA_TESS tessPa(  // primitive assembler over the DS output and the tessellator's index lists
1102             pDC,
1103             dsContext.pOutputData,
1104             dsContext.vectorStride,
1105             tsState.numDsOutputAttribs,
1106             tsData.ppIndices,
1107             tsData.NumPrimitives,
1108             tsState.postDSTopology);
1109
1110         while (tessPa.HasWork())
1111         {
1112             if (HasGeometryShaderT::value)
1113             {
1114                 GeometryShaderStage<HasStreamOutT, HasRastT>(
1115                     pDC, workerId, tessPa, pGsOut, pCutBuffer, pCutStreamBuffer, pSoPrimData,
1116                     _simd_set1_epi32(dsContext.PrimitiveID));
1117             }
1118             else
1119             {
1120                 if (HasStreamOutT::value)
1121                 {
1122                     StreamOut(pDC, tessPa, workerId, pSoPrimData, 0);
1123                 }
1124
1125                 if (HasRastT::value)
1126                 {
1127                     simdvector prim[3]; // Only deal with triangles, lines, or points
1128                     RDTSC_START(FEPAAssemble);
1129 #if SWR_ENABLE_ASSERTS
1130                     bool assemble =
1131 #endif
1132                         tessPa.Assemble(VERTEX_POSITION_SLOT, prim);
1133                     RDTSC_STOP(FEPAAssemble, 1, 0);
1134                     SWR_ASSERT(assemble);
1135
1136                     SWR_ASSERT(pfnClipFunc);
1137                     pfnClipFunc(pDC, tessPa, workerId, prim,
1138                         GenMask(tessPa.NumPrims()), _simd_set1_epi32(dsContext.PrimitiveID));  // every prim generated from this patch carries the patch's primitive ID
1139                 }
1140             }
1141
1142             tessPa.NextPrim();
1143
1144         } // while (tessPa.HasWork())
1145     } // for (uint32_t p = 0; p < numPrims; ++p)
1146
1147     TSDestroyCtx(tsCtx);
1148 }
1149
1150 //////////////////////////////////////////////////////////////////////////
1151 /// @brief FE handler for SwrDraw. For each instance: fetches and vertex-shades one SIMD of vertices at a time, assembles primitives and passes them to tessellation, the GS, stream-out and/or the primitive processing function.
1152 /// @tparam IsIndexedT - Is indexed drawing enabled
1153 /// @tparam HasTessellationT - Is tessellation enabled
1154 /// @tparam HasGeometryShaderT - Is the geometry shader stage enabled
1155 /// @tparam HasStreamOutT - Is stream-out enabled
1156 /// @tparam HasRastT - Is rasterization enabled
1157 /// @param pContext - pointer to SWR context.
1158 /// @param pDC - pointer to draw context.
1159 /// @param workerId - thread's worker id.
1160 /// @param pUserData - Pointer to DRAW_WORK
1161 template <
1162     typename IsIndexedT,
1163     typename IsCutIndexEnabledT,  // only forwarded to PA_FACTORY when choosing the primitive assembler
1164     typename HasTessellationT,
1165     typename HasGeometryShaderT,
1166     typename HasStreamOutT,
1167     typename HasRastT>
1168 void ProcessDraw(
1169     SWR_CONTEXT *pContext,
1170     DRAW_CONTEXT *pDC,
1171     uint32_t workerId,
1172     void *pUserData)
1173 {
1174
1175 #if KNOB_ENABLE_TOSS_POINTS
1176     if (KNOB_TOSS_QUEUE_FE)  // toss point: skip all front-end work for this draw
1177     {
1178         return;
1179     }
1180 #endif
1181
1182     RDTSC_START(FEProcessDraw);
1183
1184     DRAW_WORK&          work = *(DRAW_WORK*)pUserData;
1185     const API_STATE&    state = GetApiState(pDC);
1186     __m256i             vScale = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);  // per-lane offsets 0..7 (lane 0 holds 0). NOTE(review): hard-coded to 8 lanes - confirm KNOB_SIMD_WIDTH is always 8 here
1187     SWR_VS_CONTEXT      vsContext;
1188     simdvertex          vin;
1189
1190     int indexSize = 0;  // size in bytes of one index; stays 0 for non-indexed draws
1191     uint32_t endVertex = work.numVerts;  // vertices to fetch and shade per instance
1192
1193     const int32_t* pLastRequestedIndex = nullptr;  // address one past the last index requested by this draw
1194     if (IsIndexedT::value)
1195     {
1196         switch (work.type)
1197         {
1198         case R32_UINT:
1199             indexSize = sizeof(uint32_t);
1200             pLastRequestedIndex = &(work.pIB[endVertex]);
1201             break;
1202         case R16_UINT:
1203             indexSize = sizeof(uint16_t);
1204             // nasty address offset to last index
1205             pLastRequestedIndex = (int32_t*)(&(((uint16_t*)work.pIB)[endVertex]));
1206             break;
1207         case R8_UINT:
1208             indexSize = sizeof(uint8_t);
1209             // nasty address offset to last index
1210             pLastRequestedIndex = (int32_t*)(&(((uint8_t*)work.pIB)[endVertex]));
1211             break;
1212         default:
1213             SWR_ASSERT(0);
1214         }
1215     }
1216     else
1217     {
1218         // No cuts, prune partial primitives.
1219         endVertex = GetNumVerts(state.topology, GetNumPrims(state.topology, work.numVerts));
1220     }
1221
1222     SWR_FETCH_CONTEXT fetchInfo = { 0 };
1223     fetchInfo.pStreams = &state.vertexBuffers[0];
1224     fetchInfo.StartInstance = work.startInstance;
1225     fetchInfo.StartVertex = 0;  // overridden below for non-indexed draws
1226
1227     vsContext.pVin = &vin;  // the vertex shader reads what the fetch shader wrote into vin
1228
1229     if (IsIndexedT::value)
1230     {
1231         fetchInfo.BaseVertex = work.baseVertex;
1232
1233         // if the entire index buffer isn't being consumed, set the last index
1234         // so that fetches < a SIMD wide will be masked off
1235         fetchInfo.pLastIndex = (const int32_t*)(((uint8_t*)state.indexBuffer.pIndices) + state.indexBuffer.size);
1236         if (pLastRequestedIndex < fetchInfo.pLastIndex)
1237         {
1238             fetchInfo.pLastIndex = pLastRequestedIndex;
1239         }
1240     }
1241     else
1242     {
1243         fetchInfo.StartVertex = work.startVertex;
1244     }
1245
1246 #ifdef KNOB_ENABLE_RDTSC
1247     uint32_t numPrims = GetNumPrims(state.topology, work.numVerts);  // only referenced by the RDTSC_STOP at the end of this function
1248 #endif
1249
1250     void* pGsOut = nullptr;
1251     void* pCutBuffer = nullptr;
1252     void* pStreamCutBuffer = nullptr;
1253     if (HasGeometryShaderT::value)
1254     {
1255         AllocateGsBuffers(pDC, state, &pGsOut, &pCutBuffer, &pStreamCutBuffer);
1256     }
1257
1258     if (HasTessellationT::value)
1259     {
1260         SWR_ASSERT(state.tsState.tsEnable == true);
1261         SWR_ASSERT(state.pfnHsFunc != nullptr);
1262         SWR_ASSERT(state.pfnDsFunc != nullptr);
1263
1264         AllocateTessellationData(pContext);
1265     }
1266     else
1267     {
1268         SWR_ASSERT(state.tsState.tsEnable == false);
1269         SWR_ASSERT(state.pfnHsFunc == nullptr);
1270         SWR_ASSERT(state.pfnDsFunc == nullptr);
1271     }
1272
1273     // allocate space for streamout input prim data
1274     uint32_t* pSoPrimData = nullptr;
1275     if (HasStreamOutT::value)
1276     {
1277         pSoPrimData = (uint32_t*)pDC->pArena->AllocAligned(4096, 16);
1278
1279         // set the streamout write offset stats from the stream offsets of the SO buffers
1280         for (uint32_t i = 0; i < 4; ++i)
1281         {
1282             SET_STAT(SoWriteOffset[i], state.soBuffer[i].streamOffset);
1283         }
1284
1285     }
1286
1287     // choose primitive assembler
1288     PA_FACTORY<IsIndexedT, IsCutIndexEnabledT> paFactory(pDC, state.topology, work.numVerts);
1289     PA_STATE& pa = paFactory.GetPA();
1290
1291     /// @todo: temporarily move instance loop in the FE to ensure SO ordering
1292     for (uint32_t instanceNum = 0; instanceNum < work.numInstances; instanceNum++)
1293     {
1294         simdscalari vIndex;  // non-indexed draws only: the vertex IDs of the current SIMD
1295         uint32_t  i = 0;  // vertices consumed so far for this instance, advanced one SIMD at a time
1296
1297         if (IsIndexedT::value)
1298         {
1299             fetchInfo.pIndices = work.pIB;  // restart at the beginning of the index data for every instance
1300         }
1301         else
1302         {
1303             vIndex = _simd_add_epi32(_simd_set1_epi32(work.startVertexID), vScale);  // startVertexID + lane offset
1304             fetchInfo.pIndices = (const int32_t*)&vIndex;
1305         }
1306
1307         fetchInfo.CurInstance = instanceNum;
1308         vsContext.InstanceID = instanceNum;
1309
1310         while (pa.HasWork())
1311         {
1312             // PaGetNextVsOutput currently has the side effect of updating some PA state machine state.
1313             // So we need to keep this outside of (i < endVertex) check.
1314             simdmask* pvCutIndices = nullptr;
1315             if (IsIndexedT::value)
1316             {
1317                 pvCutIndices = &pa.GetNextVsIndices();
1318             }
1319
1320             simdvertex& vout = pa.GetNextVsOutput();
1321             vsContext.pVout = &vout;  // the vertex shader writes straight into the PA's next output slot
1322
1323             if (i < endVertex)  // vertices left to fetch; otherwise this iteration only assembles what the PA already holds
1324             {
1325
1326                 // 1. Execute FS/VS (fetch shader / vertex shader) for a single SIMD.
1327                 RDTSC_START(FEFetchShader);
1328                 state.pfnFetchFunc(fetchInfo, vin);
1329                 RDTSC_STOP(FEFetchShader, 0, 0);
1330
1331                 // forward fetch generated vertex IDs to the vertex shader
1332                 vsContext.VertexID = fetchInfo.VertexID;
1333
1334                 // Setup active mask for vertex shader.
1335                 vsContext.mask = GenerateMask(endVertex - i);
1336
1337                 // forward cut mask to the PA
1338                 if (IsIndexedT::value)
1339                 {
1340                     *pvCutIndices = _simd_movemask_ps(_simd_castsi_ps(fetchInfo.CutMask));
1341                 }
1342
1343                 UPDATE_STAT(IaVertices, GetNumInvocations(i, endVertex));
1344
1345 #if KNOB_ENABLE_TOSS_POINTS
1346                 if (!KNOB_TOSS_FETCH)
1347 #endif
1348                 {
1349                     RDTSC_START(FEVertexShader);
1350                     state.pfnVertexFunc(GetPrivateState(pDC), &vsContext);
1351                     RDTSC_STOP(FEVertexShader, 0, 0);
1352
1353                     UPDATE_STAT(VsInvocations, GetNumInvocations(i, endVertex));
1354                 }
1355             }
1356
1357             // 2. Assemble primitives given the last two SIMD.
1358             do
1359             {
1360                 simdvector prim[MAX_NUM_VERTS_PER_PRIM];
1361                 // PaAssemble returns false if there is not enough verts to assemble.
1362                 RDTSC_START(FEPAAssemble);
1363                 bool assemble = pa.Assemble(VERTEX_POSITION_SLOT, prim);
1364                 RDTSC_STOP(FEPAAssemble, 1, 0);
1365
1366 #if KNOB_ENABLE_TOSS_POINTS
1367                 if (!KNOB_TOSS_FETCH)
1368 #endif
1369                 {
1370 #if KNOB_ENABLE_TOSS_POINTS
1371                     if (!KNOB_TOSS_VS)
1372 #endif
1373                     {
1374                         if (assemble)
1375                         {
1376                             UPDATE_STAT(IaPrimitives, pa.NumPrims());
1377
1378                             if (HasTessellationT::value)  // tessellation takes over and handles the GS itself
1379                             {
1380                                 TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>(
1381                                     pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, pa.GetPrimID(work.startPrimID));
1382                             }
1383                             else if (HasGeometryShaderT::value)
1384                             {
1385                                 GeometryShaderStage<HasStreamOutT, HasRastT>(
1386                                     pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, pa.GetPrimID(work.startPrimID));
1387                             }
1388                             else
1389                             {
1390                                 // If streamout is enabled then stream vertices out to memory.
1391                                 if (HasStreamOutT::value)
1392                                 {
1393                                     StreamOut(pDC, pa, workerId, pSoPrimData, 0);
1394                                 }
1395
1396                                 if (HasRastT::value)
1397                                 {
1398                                     SWR_ASSERT(pDC->pState->pfnProcessPrims);
1399                                     pDC->pState->pfnProcessPrims(pDC, pa, workerId, prim,
1400                                         GenMask(pa.NumPrims()), pa.GetPrimID(work.startPrimID));
1401                                 }
1402                             }
1403                         }
1404                     }
1405                 }
1406             } while (pa.NextPrim());
1407
1408             i += KNOB_SIMD_WIDTH;  // advance to the next SIMD of vertices
1409             if (IsIndexedT::value)
1410             {
1411                 fetchInfo.pIndices = (int*)((uint8_t*)fetchInfo.pIndices + KNOB_SIMD_WIDTH * indexSize);  // step over one SIMD of indices, in bytes
1412             }
1413             else
1414             {
1415                 vIndex = _simd_add_epi32(vIndex, _simd_set1_epi32(KNOB_SIMD_WIDTH));
1416             }
1417         }
1418         pa.Reset();  // reset the PA before the next instance
1419     }
1420
1421     RDTSC_STOP(FEProcessDraw, numPrims * work.numInstances, pDC->drawId);
1422 }
1423
1424 /// @brief Chooser handed to TemplateArgUnroller: turns a list of template arguments into the matching ProcessDraw instantiation.
1425 struct FEDrawChooser
1426 {
1427     using FuncType = PFN_FE_WORK_FUNC;  // type of the function pointer returned by GetFunc
1428
1429     /// @tparam DrawTraitsT - template arguments forwarded verbatim to ProcessDraw
1430     template <typename... DrawTraitsT>
1431     static FuncType GetFunc()
1432     { FuncType pfnDraw = ProcessDraw<DrawTraitsT...>; return pfnDraw; }
1433 };
1434
1435
1436 /// @brief Selector for the templated Draw front-end function matching the given pipeline configuration.
1437 /// The flags are handed to TemplateArgUnroller in the order of ProcessDraw's template parameters.
1438 PFN_FE_WORK_FUNC GetProcessDrawFunc(
1439     bool IsIndexed, bool IsCutIndexEnabled, bool HasTessellation,
1440     bool HasGeometryShader, bool HasStreamOut, bool HasRasterization)
1441 {
1442     typedef TemplateArgUnroller<FEDrawChooser> DrawFuncSelector;
1443     return DrawFuncSelector::GetFunc(
1444         IsIndexed, IsCutIndexEnabled, HasTessellation,
1445         HasGeometryShader, HasStreamOut, HasRasterization);
1446 }
1447
1448
1449 //////////////////////////////////////////////////////////////////////////
1450 /// @brief Processes attributes for the backend based on linkage mask and
1451 ///        linkage map.  Essentially just doing an SOA->AOS conversion and pack.
1452 /// @param pDC - Draw context
1453 /// @param pa - Primitive Assembly state
1454 /// @param linkageMask - Specifies which VS outputs are routed to PS.
1455 /// @param pLinkageMap - maps VS attribute slot to PS slot
1456 /// @param triIndex - Triangle to process attributes for
1457 /// @param pBuffer - Output result; receives three 4-float vertices (12 floats) per bit set in linkageMask
1458 template<uint32_t NumVerts>  // vertices per primitive; the output is always padded up to 3 vertices
1459 INLINE void ProcessAttributes(
1460     DRAW_CONTEXT *pDC,
1461     PA_STATE&pa,
1462     uint32_t linkageMask,
1463     const uint8_t* pLinkageMap,
1464     uint32_t triIndex,
1465     float *pBuffer)
1466 {
1467     DWORD slot = 0;
1468     uint32_t mapIdx = 0;  // counts the linkageMask bits consumed so far; indexes pLinkageMap and the constant interpolation mask
1469     LONG constantInterpMask = pDC->pState->state.backendState.constantInterpolationMask;
1470     const uint32_t provokingVertex = pDC->pState->state.frontendState.topologyProvokingVertex;
1471     const PRIMITIVE_TOPOLOGY topo = pDC->pState->state.topology;
1472
1473     while (_BitScanForward(&slot, linkageMask))
1474     {
1475         linkageMask &= ~(1 << slot); // done with this bit.
1476
1477         // compute absolute slot in vertex attrib array
1478         uint32_t inputSlot = VERTEX_ATTRIB_START_SLOT + pLinkageMap[mapIdx];
1479
1480         __m128 attrib[3];    // triangle attribs (always 4 wide)
1481
1482         if (_bittest(&constantInterpMask, mapIdx))  // constant interpolation: every vertex gets the provoking vertex's value
1483         {
1484             uint32_t vid;  // which vertex of the assembled triangle supplies the constant value
1485             static const uint32_t tristripProvokingVertex[] = {0, 2, 1};  // used for odd triangles of a strip, indexed by provokingVertex
1486             static const int32_t quadProvokingTri[2][4] = {{0, 0, 0, 1}, {0, -1, 0, 0}};  // [triIndex & 1][provokingVertex] -> offset added to triIndex to pick the triangle to read
1487             static const uint32_t quadProvokingVertex[2][4] = {{0, 1, 2, 2}, {0, 1, 1, 2}};  // [triIndex & 1][provokingVertex] -> vertex to read within that triangle
1488             static const int32_t qstripProvokingTri[2][4] = {{0, 0, 0, 1}, {-1, 0, 0, 0}};  // same as quadProvokingTri, for quad strips
1489             static const uint32_t qstripProvokingVertex[2][4] = {{0, 1, 2, 1}, {0, 0, 2, 1}};  // same as quadProvokingVertex, for quad strips
1490
1491             switch (topo) {
1492             case TOP_QUAD_LIST:
1493                 pa.AssembleSingle(inputSlot,
1494                                   triIndex + quadProvokingTri[triIndex & 1][provokingVertex],
1495                                   attrib);
1496                 vid = quadProvokingVertex[triIndex & 1][provokingVertex];
1497                 break;
1498             case TOP_QUAD_STRIP:
1499                 pa.AssembleSingle(inputSlot,
1500                                   triIndex + qstripProvokingTri[triIndex & 1][provokingVertex],
1501                                   attrib);
1502                 vid = qstripProvokingVertex[triIndex & 1][provokingVertex];
1503                 break;
1504             case TOP_TRIANGLE_STRIP:
1505                pa.AssembleSingle(inputSlot, triIndex, attrib);
1506                vid = (triIndex & 1)
1507                    ? tristripProvokingVertex[provokingVertex]
1508                    : provokingVertex;
1509                break;
1510             default:
1511                 pa.AssembleSingle(inputSlot, triIndex, attrib);
1512                 vid = provokingVertex;
1513                 break;
1514             }
1515
1516             for (uint32_t i = 0; i < NumVerts; ++i)
1517             {
1518                 _mm_store_ps(pBuffer, attrib[vid]);  // pBuffer must be 16 byte aligned for _mm_store_ps
1519                 pBuffer += 4;
1520             }
1521         }
1522         else
1523         {
1524             pa.AssembleSingle(inputSlot, triIndex, attrib);
1525
1526             for (uint32_t i = 0; i < NumVerts; ++i)  // store each vertex's own value
1527             {
1528                 _mm_store_ps(pBuffer, attrib[i]);
1529                 pBuffer += 4;
1530             }
1531         }
1532
1533         // pad out the attrib buffer to 3 verts to ensure the triangle
1534         // interpolation code in the pixel shader works correctly for the
1535         // 3 topologies - point, line, tri.  This effectively zeros out the
1536         // effect of the missing vertices in the triangle interpolation.
1537         for (uint32_t i = NumVerts; i < 3; ++i)
1538         {
1539             _mm_store_ps(pBuffer, attrib[NumVerts - 1]);  // the padding repeats the last real vertex, also in the constant interpolation case
1540             pBuffer += 4;
1541         }
1542
1543         mapIdx++;
1544     }
1545 }
1546
1547 //////////////////////////////////////////////////////////////////////////
1548 /// @brief Writes the interpolation coefficients for each enabled user clip
1549 ///        distance of one primitive. For every bit set in clipDistMask,
1550 ///        NumVerts floats are appended to the output: (d[v] - d[last]) for
1551 ///        all but the last vertex, followed by d[last].
1552 /// @tparam NumVerts - number of vertices in the primitive
1553 /// @param pa - Primitive Assembly state the clip distances are read from
1554 /// @param primIndex - primitive index to process
1555 /// @param clipDistMask - mask of enabled clip distances
1556 /// @param pUserClipBuffer - buffer to store results
1557 template<uint32_t NumVerts>
1558 void ProcessUserClipDist(PA_STATE& pa, uint32_t primIndex, uint8_t clipDistMask, float* pUserClipBuffer)
1559 {
1560     DWORD distIndex;
1561     while (_BitScanForward(&distIndex, clipDistMask))
1562     {
1563         clipDistMask &= ~(1 << distIndex);  // consume this clip distance
1564
1565         // distances 0-3 are the components of the LO slot, 4-7 those of the HI slot
1566         const uint32_t component = distIndex & 0x3;
1567         const bool inLoSlot = (distIndex >> 2) == 0;
1568         const uint32_t attribSlot = inLoSlot ?
1569             VERTEX_CLIPCULL_DIST_LO_SLOT : VERTEX_CLIPCULL_DIST_HI_SLOT;
1570
1571         __m128 primDist[3];
1572         pa.AssembleSingle(attribSlot, primIndex, primDist);
1573
1574         // pull the selected component out of each vertex's distance vector
1575         float dist[NumVerts];
1576         for (uint32_t v = 0; v < NumVerts; ++v)
1577         {
1578             OSALIGNSIMD(float) lanes[4];
1579             _mm_store_ps(lanes, primDist[v]);
1580             dist[v] = lanes[component];
1581         }
1582
1583         // setup plane equations for barycentric interpolation in the backend
1584         const float lastDist = dist[NumVerts - 1];
1585         for (uint32_t v = 0; v + 1 < NumVerts; ++v)
1586         {
1587             *(pUserClipBuffer++) = dist[v] - lastDist;
1588         }
1589         *(pUserClipBuffer++) = lastDist;
1590     }
1591 }
1592
1593 //////////////////////////////////////////////////////////////////////////
1594 /// @brief Convert the X,Y coords of a triangle to the requested Fixed
1595 /// Point precision from FP32.
1596 template <typename PT = FixedPointTraits<Fixed_16_8>>
1597 INLINE simdscalari fpToFixedPointVertical(const simdscalar vIn)
1598 {
1599     simdscalar vFixed = _simd_mul_ps(vIn, _simd_set1_ps(PT::ScaleT::value));
1600     return _simd_cvtps_epi32(vFixed);
1601 }
1602
1603 //////////////////////////////////////////////////////////////////////////
1604 /// @brief Helper function to set the X,Y coords of a triangle to the
1605 /// requested Fixed Point precision from FP32. If the RequestedT
1606 /// FixedPointTraits precision is the same as the CurrentT, no extra
1607 /// conversions will be done. If they are different, convert from FP32
1608 /// to the Requested precision and set vXi, vYi
1609 /// @tparam RequestedT: requested FixedPointTraits type
1610 /// @tparam CurrentT: FixedPointTraits type of the last
1611 template<typename RequestedT, typename CurrentT = FixedPointTraits<Fixed_Uninit>>
1612 struct FPToFixedPoint
1613 {
1614     //////////////////////////////////////////////////////////////////////////
1615     /// @param tri: simdvector[3] of FP triangle verts
1616     /// @param vXi: fixed point X coords of tri verts
1617     /// @param vYi: fixed point Y coords of tri verts
1618     INLINE static void Set(const simdvector * const tri, simdscalari (&vXi)[3], simdscalari (&vYi)[3])
1619     {
1620         vXi[0] = fpToFixedPointVertical<RequestedT>(tri[0].x);
1621         vYi[0] = fpToFixedPointVertical<RequestedT>(tri[0].y);
1622         vXi[1] = fpToFixedPointVertical<RequestedT>(tri[1].x);
1623         vYi[1] = fpToFixedPointVertical<RequestedT>(tri[1].y);
1624         vXi[2] = fpToFixedPointVertical<RequestedT>(tri[2].x);
1625         vYi[2] = fpToFixedPointVertical<RequestedT>(tri[2].y);
1626     };
1627 };
1628
1629 //////////////////////////////////////////////////////////////////////////
1630 /// @brief In the case where the RequestedT and CurrentT fixed point
1631 /// precisions are the same, do nothing.
1632 template<typename RequestedT>
1633 struct FPToFixedPoint<RequestedT, RequestedT>
1634 {
1635     INLINE static void Set(const simdvector * const tri, simdscalari (&vXi)[3], simdscalari (&vYi)[3]){};
1636 };
1637
1638 //////////////////////////////////////////////////////////////////////////
1639 /// @brief Calculate bounding box for current triangle
1640 /// @tparam CT: ConservativeRastFETraits type
1641 /// @param vX: fixed point X position for triangle verts
1642 /// @param vY: fixed point Y position for triangle verts
1643 /// @param bbox: fixed point bbox
1644 /// *Note*: expects vX, vY to be in the correct precision for the type
1645 /// of rasterization. This avoids unnecessary FP->fixed conversions.
1646 template <typename CT>
1647 INLINE void calcBoundingBoxIntVertical(const simdvector * const tri, simdscalari (&vX)[3], simdscalari (&vY)[3], simdBBox &bbox){}
1648
1649 //////////////////////////////////////////////////////////////////////////
1650 /// @brief FEStandardRastT specialization of calcBoundingBoxIntVertical
1651 template <>
1652 INLINE void calcBoundingBoxIntVertical<FEStandardRastT>(const simdvector * const tri, simdscalari (&vX)[3], simdscalari (&vY)[3], simdBBox &bbox)
1653 {
1654     // FE conservative rast traits
1655     typedef FEStandardRastT CT;
1656
1657     static_assert(std::is_same<CT::BBoxPrecisionT, FixedPointTraits<Fixed_16_8>>::value, "Standard rast BBox calculation needs to be in 16.8 precision");
1658     // Update vXi, vYi fixed point precision for BBox calculation if necessary
1659     FPToFixedPoint<CT::BBoxPrecisionT, CT::ZeroAreaPrecisionT>::Set(tri, vX, vY);
1660
1661     simdscalari vMinX = vX[0];
1662     vMinX = _simd_min_epi32(vMinX, vX[1]);
1663     vMinX = _simd_min_epi32(vMinX, vX[2]);
1664
1665     simdscalari vMaxX = vX[0];
1666     vMaxX = _simd_max_epi32(vMaxX, vX[1]);
1667     vMaxX = _simd_max_epi32(vMaxX, vX[2]);
1668
1669     simdscalari vMinY = vY[0];
1670     vMinY = _simd_min_epi32(vMinY, vY[1]);
1671     vMinY = _simd_min_epi32(vMinY, vY[2]);
1672
1673     simdscalari vMaxY = vY[0];
1674     vMaxY = _simd_max_epi32(vMaxY, vY[1]);
1675     vMaxY = _simd_max_epi32(vMaxY, vY[2]);
1676
1677     bbox.left = vMinX;
1678     bbox.right = vMaxX;
1679     bbox.top = vMinY;
1680     bbox.bottom = vMaxY;
1681 }
1682
1683 //////////////////////////////////////////////////////////////////////////
1684 /// @brief FEConservativeRastT specialization of calcBoundingBoxIntVertical
1685 /// Offsets BBox for conservative rast
1686 template <>
1687 INLINE void calcBoundingBoxIntVertical<FEConservativeRastT>(const simdvector * const tri, simdscalari (&vX)[3], simdscalari (&vY)[3], simdBBox &bbox)
1688 {
1689     // FE conservative rast traits
1690     typedef FEConservativeRastT CT;
1691
1692     static_assert(std::is_same<CT::BBoxPrecisionT, FixedPointTraits<Fixed_16_9>>::value, "Conservative rast BBox calculation needs to be in 16.9 precision");
1693     // Update vXi, vYi fixed point precision for BBox calculation if necessary
1694     FPToFixedPoint<CT::BBoxPrecisionT, CT::ZeroAreaPrecisionT>::Set(tri, vX, vY);
1695
1696     simdscalari vMinX = vX[0];
1697     vMinX = _simd_min_epi32(vMinX, vX[1]);
1698     vMinX = _simd_min_epi32(vMinX, vX[2]);
1699
1700     simdscalari vMaxX = vX[0];
1701     vMaxX = _simd_max_epi32(vMaxX, vX[1]);
1702     vMaxX = _simd_max_epi32(vMaxX, vX[2]);
1703
1704     simdscalari vMinY = vY[0];
1705     vMinY = _simd_min_epi32(vMinY, vY[1]);
1706     vMinY = _simd_min_epi32(vMinY, vY[2]);
1707
1708     simdscalari vMaxY = vY[0];
1709     vMaxY = _simd_max_epi32(vMaxY, vY[1]);
1710     vMaxY = _simd_max_epi32(vMaxY, vY[2]);
1711
1712     /// Bounding box needs to be expanded by 1/512 before snapping to 16.8 for conservative rasterization
1713     bbox.left = _simd_srli_epi32(_simd_sub_epi32(vMinX, _simd_set1_epi32(CT::BoundingBoxOffsetT::value)), CT::BoundingBoxShiftT::value);
1714     bbox.right = _simd_srli_epi32(_simd_add_epi32(vMaxX, _simd_set1_epi32(CT::BoundingBoxOffsetT::value)), CT::BoundingBoxShiftT::value);
1715     bbox.top = _simd_srli_epi32(_simd_sub_epi32(vMinY, _simd_set1_epi32(CT::BoundingBoxOffsetT::value)), CT::BoundingBoxShiftT::value);
1716     bbox.bottom = _simd_srli_epi32(_simd_add_epi32(vMaxY, _simd_set1_epi32(CT::BoundingBoxOffsetT::value)), CT::BoundingBoxShiftT::value);
1717 }
1718
1719 //////////////////////////////////////////////////////////////////////////
1720 /// @brief Bin triangle primitives to macro tiles. Performs setup, clipping
1721 ///        culling, viewport transform, etc.
1722 /// @param pDC - pointer to draw context.
1723 /// @param pa - The primitive assembly object.
1724 /// @param workerId - thread's worker id. Even thread has a unique id.
1725 /// @param tri - Contains triangle position data for SIMDs worth of triangles.
1726 /// @param primID - Primitive ID for each triangle.
1727 /// @tparam CT - ConservativeRastFETraits
1728 template <typename CT>
1729 void BinTriangles(
1730     DRAW_CONTEXT *pDC,
1731     PA_STATE& pa,
1732     uint32_t workerId,
1733     simdvector tri[3],
1734     uint32_t triMask,
1735     simdscalari primID)
1736 {
1737     RDTSC_START(FEBinTriangles);
1738
1739     const API_STATE& state = GetApiState(pDC);
1740     const SWR_RASTSTATE& rastState = state.rastState;
1741     const SWR_FRONTEND_STATE& feState = state.frontendState;
1742     const SWR_GS_STATE& gsState = state.gsState;
1743     MacroTileMgr *pTileMgr = pDC->pTileMgr;
1744
1745
1746     simdscalar vRecipW0 = _simd_set1_ps(1.0f);
1747     simdscalar vRecipW1 = _simd_set1_ps(1.0f);
1748     simdscalar vRecipW2 = _simd_set1_ps(1.0f);
1749
1750     if (!feState.vpTransformDisable)
1751     {
1752         // perspective divide
1753         vRecipW0 = _simd_div_ps(_simd_set1_ps(1.0f), tri[0].w);
1754         vRecipW1 = _simd_div_ps(_simd_set1_ps(1.0f), tri[1].w);
1755         vRecipW2 = _simd_div_ps(_simd_set1_ps(1.0f), tri[2].w);
1756
1757         tri[0].v[0] = _simd_mul_ps(tri[0].v[0], vRecipW0);
1758         tri[1].v[0] = _simd_mul_ps(tri[1].v[0], vRecipW1);
1759         tri[2].v[0] = _simd_mul_ps(tri[2].v[0], vRecipW2);
1760
1761         tri[0].v[1] = _simd_mul_ps(tri[0].v[1], vRecipW0);
1762         tri[1].v[1] = _simd_mul_ps(tri[1].v[1], vRecipW1);
1763         tri[2].v[1] = _simd_mul_ps(tri[2].v[1], vRecipW2);
1764
1765         tri[0].v[2] = _simd_mul_ps(tri[0].v[2], vRecipW0);
1766         tri[1].v[2] = _simd_mul_ps(tri[1].v[2], vRecipW1);
1767         tri[2].v[2] = _simd_mul_ps(tri[2].v[2], vRecipW2);
1768
1769         // viewport transform to screen coords
1770         viewportTransform<3>(tri, state.vpMatrix[0]);
1771     }
1772
1773     // adjust for pixel center location
1774     simdscalar offset = g_pixelOffsets[rastState.pixelLocation];
1775     tri[0].x = _simd_add_ps(tri[0].x, offset);
1776     tri[0].y = _simd_add_ps(tri[0].y, offset);
1777
1778     tri[1].x = _simd_add_ps(tri[1].x, offset);
1779     tri[1].y = _simd_add_ps(tri[1].y, offset);
1780
1781     tri[2].x = _simd_add_ps(tri[2].x, offset);
1782     tri[2].y = _simd_add_ps(tri[2].y, offset);
1783
1784     simdscalari vXi[3], vYi[3];
1785     // Set vXi, vYi to fixed point precision required for degenerate triangle check
1786     FPToFixedPoint<typename CT::ZeroAreaPrecisionT>::Set(tri, vXi, vYi);
1787
1788     // triangle setup
1789     simdscalari vAi[3], vBi[3];
1790     triangleSetupABIntVertical(vXi, vYi, vAi, vBi);
1791
1792     // determinant
1793     simdscalari vDet[2];
1794     calcDeterminantIntVertical(vAi, vBi, vDet);
1795
1796     /// todo: handle degen tri's for Conservative Rast.
1797
1798     // cull zero area
1799     int maskLo = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpeq_epi64(vDet[0], _simd_setzero_si())));
1800     int maskHi = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpeq_epi64(vDet[1], _simd_setzero_si())));
1801
1802     int cullZeroAreaMask = maskLo | (maskHi << (KNOB_SIMD_WIDTH / 2));
1803
1804     uint32_t origTriMask = triMask;
1805     triMask &= ~cullZeroAreaMask;
1806
1807     // determine front winding tris
1808     // CW  +det
1809     // CCW -det
1810     maskLo = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpgt_epi64(vDet[0], _simd_setzero_si())));
1811     maskHi = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpgt_epi64(vDet[1], _simd_setzero_si())));
1812     int cwTriMask = maskLo | (maskHi << (KNOB_SIMD_WIDTH /2) );
1813
1814     uint32_t frontWindingTris;
1815     if (rastState.frontWinding == SWR_FRONTWINDING_CW)
1816     {
1817         frontWindingTris = cwTriMask;
1818     }
1819     else
1820     {
1821         frontWindingTris = ~cwTriMask;
1822     }
1823
1824     // cull
1825     uint32_t cullTris;
1826     switch ((SWR_CULLMODE)rastState.cullMode)
1827     {
1828     case SWR_CULLMODE_BOTH:  cullTris = 0xffffffff; break;
1829     case SWR_CULLMODE_NONE:  cullTris = 0x0; break;
1830     case SWR_CULLMODE_FRONT: cullTris = frontWindingTris; break;
1831     case SWR_CULLMODE_BACK:  cullTris = ~frontWindingTris; break;
1832     default: SWR_ASSERT(false, "Invalid cull mode: %d", rastState.cullMode); cullTris = 0x0; break;
1833     }
1834
1835     triMask &= ~cullTris;
1836
1837     if (origTriMask ^ triMask)
1838     {
1839         RDTSC_EVENT(FECullZeroAreaAndBackface, _mm_popcnt_u32(origTriMask ^ triMask), 0);
1840     }
1841
1842     /// Note: these variable initializations must stay above any 'goto endBenTriangles'
1843     // compute per tri backface
1844     uint32_t frontFaceMask = frontWindingTris;
1845     uint32_t *pPrimID = (uint32_t *)&primID;
1846     DWORD triIndex = 0;
1847     // for center sample pattern, all samples are at pixel center; calculate coverage
1848     // once at center and broadcast the results in the backend
1849     uint32_t sampleCount = (rastState.samplePattern == SWR_MSAA_STANDARD_PATTERN) ? rastState.sampleCount : SWR_MULTISAMPLE_1X;
1850     PFN_WORK_FUNC pfnWork = GetRasterizerFunc(sampleCount, (rastState.conservativeRast > 0),
1851                                               pDC->pState->state.psState.inputCoverage, (rastState.scissorEnable > 0));
1852     if (!triMask)
1853     {
1854         goto endBinTriangles;
1855     }
1856
1857     // Calc bounding box of triangles
1858     simdBBox bbox;
1859     calcBoundingBoxIntVertical<CT>(tri, vXi, vYi, bbox);
1860
1861     // determine if triangle falls between pixel centers and discard
1862     // only discard for non-MSAA case and when conservative rast is disabled
1863     // (left + 127) & ~255
1864     // (right + 128) & ~255
1865     if(rastState.sampleCount == SWR_MULTISAMPLE_1X && (!CT::IsConservativeT::value))
1866     {
1867         origTriMask = triMask;
1868
1869         int cullCenterMask;
1870         {
1871             simdscalari left = _simd_add_epi32(bbox.left, _simd_set1_epi32(127));
1872             left = _simd_and_si(left, _simd_set1_epi32(~255));
1873             simdscalari right = _simd_add_epi32(bbox.right, _simd_set1_epi32(128));
1874             right = _simd_and_si(right, _simd_set1_epi32(~255));
1875
1876             simdscalari vMaskH = _simd_cmpeq_epi32(left, right);
1877
1878             simdscalari top = _simd_add_epi32(bbox.top, _simd_set1_epi32(127));
1879             top = _simd_and_si(top, _simd_set1_epi32(~255));
1880             simdscalari bottom = _simd_add_epi32(bbox.bottom, _simd_set1_epi32(128));
1881             bottom = _simd_and_si(bottom, _simd_set1_epi32(~255));
1882
1883             simdscalari vMaskV = _simd_cmpeq_epi32(top, bottom);
1884             vMaskV = _simd_or_si(vMaskH, vMaskV);
1885             cullCenterMask = _simd_movemask_ps(_simd_castsi_ps(vMaskV));
1886         }
1887
1888         triMask &= ~cullCenterMask;
1889
1890         if(origTriMask ^ triMask)
1891         {
1892             RDTSC_EVENT(FECullBetweenCenters, _mm_popcnt_u32(origTriMask ^ triMask), 0);
1893         }
1894     }
1895
1896     // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since right/bottom edge is exclusive.
1897     bbox.left   = _simd_max_epi32(bbox.left, _simd_set1_epi32(state.scissorInFixedPoint.left));
1898     bbox.top    = _simd_max_epi32(bbox.top, _simd_set1_epi32(state.scissorInFixedPoint.top));
1899     bbox.right  = _simd_min_epi32(_simd_sub_epi32(bbox.right, _simd_set1_epi32(1)), _simd_set1_epi32(state.scissorInFixedPoint.right));
1900     bbox.bottom = _simd_min_epi32(_simd_sub_epi32(bbox.bottom, _simd_set1_epi32(1)), _simd_set1_epi32(state.scissorInFixedPoint.bottom));
1901
1902     // Cull tris completely outside scissor
1903     {
1904         simdscalari maskOutsideScissorX = _simd_cmpgt_epi32(bbox.left, bbox.right);
1905         simdscalari maskOutsideScissorY = _simd_cmpgt_epi32(bbox.top, bbox.bottom);
1906         simdscalari maskOutsideScissorXY = _simd_or_si(maskOutsideScissorX, maskOutsideScissorY);
1907         uint32_t maskOutsideScissor = _simd_movemask_ps(_simd_castsi_ps(maskOutsideScissorXY));
1908         triMask = triMask & ~maskOutsideScissor;
1909     }
1910
1911     if (!triMask)
1912     {
1913         goto endBinTriangles;
1914     }
1915
1916     // Convert triangle bbox to macrotile units.
1917     bbox.left = _simd_srai_epi32(bbox.left, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
1918     bbox.top = _simd_srai_epi32(bbox.top, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
1919     bbox.right = _simd_srai_epi32(bbox.right, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
1920     bbox.bottom = _simd_srai_epi32(bbox.bottom, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
1921
1922     OSALIGNSIMD(uint32_t) aMTLeft[KNOB_SIMD_WIDTH], aMTRight[KNOB_SIMD_WIDTH], aMTTop[KNOB_SIMD_WIDTH], aMTBottom[KNOB_SIMD_WIDTH];
1923     _simd_store_si((simdscalari*)aMTLeft, bbox.left);
1924     _simd_store_si((simdscalari*)aMTRight, bbox.right);
1925     _simd_store_si((simdscalari*)aMTTop, bbox.top);
1926     _simd_store_si((simdscalari*)aMTBottom, bbox.bottom);
1927
1928     // transpose verts needed for backend
1929     /// @todo modify BE to take non-transformed verts
1930     __m128 vHorizX[8], vHorizY[8], vHorizZ[8], vHorizW[8];
1931     vTranspose3x8(vHorizX, tri[0].x, tri[1].x, tri[2].x);
1932     vTranspose3x8(vHorizY, tri[0].y, tri[1].y, tri[2].y);
1933     vTranspose3x8(vHorizZ, tri[0].z, tri[1].z, tri[2].z);
1934     vTranspose3x8(vHorizW, vRecipW0, vRecipW1, vRecipW2);
1935
1936     // store render target array index
1937     OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH];
1938     if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex)
1939     {
1940         simdvector vRtai[3];
1941         pa.Assemble(VERTEX_RTAI_SLOT, vRtai);
1942         simdscalari vRtaii;
1943         vRtaii = _simd_castps_si(vRtai[0].x);
1944         _simd_store_si((simdscalari*)aRTAI, vRtaii);
1945     }
1946     else
1947     {
1948         _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si());
1949     }
1950
1951     // scan remaining valid triangles and bin each separately
1952     while (_BitScanForward(&triIndex, triMask))
1953     {
1954         uint32_t linkageCount = state.linkageCount;
1955         uint32_t linkageMask  = state.linkageMask;
1956         uint32_t numScalarAttribs = linkageCount * 4;
1957
1958         BE_WORK work;
1959         work.type = DRAW;
1960         work.pfnWork = pfnWork;
1961
1962         TRIANGLE_WORK_DESC &desc = work.desc.tri;
1963
1964         desc.triFlags.frontFacing = state.forceFront ? 1 : ((frontFaceMask >> triIndex) & 1);
1965         desc.triFlags.primID = pPrimID[triIndex];
1966         desc.triFlags.renderTargetArrayIndex = aRTAI[triIndex];
1967
1968         auto pArena = pDC->pArena;
1969         SWR_ASSERT(pArena != nullptr);
1970
1971         // store active attribs
1972         float *pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
1973         desc.pAttribs = pAttribs;
1974         desc.numAttribs = linkageCount;
1975         ProcessAttributes<3>(pDC, pa, linkageMask, state.linkageMap, triIndex, desc.pAttribs);
1976
1977         // store triangle vertex data
1978         desc.pTriBuffer = (float*)pArena->AllocAligned(4 * 4 * sizeof(float), 16);
1979
1980         _mm_store_ps(&desc.pTriBuffer[0], vHorizX[triIndex]);
1981         _mm_store_ps(&desc.pTriBuffer[4], vHorizY[triIndex]);
1982         _mm_store_ps(&desc.pTriBuffer[8], vHorizZ[triIndex]);
1983         _mm_store_ps(&desc.pTriBuffer[12], vHorizW[triIndex]);
1984
1985         // store user clip distances
1986         if (rastState.clipDistanceMask)
1987         {
1988             uint32_t numClipDist = _mm_popcnt_u32(rastState.clipDistanceMask);
1989             desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 3 * sizeof(float));
1990             ProcessUserClipDist<3>(pa, triIndex, rastState.clipDistanceMask, desc.pUserClipBuffer);
1991         }
1992
1993         for (uint32_t y = aMTTop[triIndex]; y <= aMTBottom[triIndex]; ++y)
1994         {
1995             for (uint32_t x = aMTLeft[triIndex]; x <= aMTRight[triIndex]; ++x)
1996             {
1997 #if KNOB_ENABLE_TOSS_POINTS
1998                 if (!KNOB_TOSS_SETUP_TRIS)
1999 #endif
2000                 {
2001                     pTileMgr->enqueue(x, y, &work);
2002                 }
2003             }
2004         }
2005         triMask &= ~(1 << triIndex);
2006     }
2007
2008 endBinTriangles:
2009     RDTSC_STOP(FEBinTriangles, 1, 0);
2010 }
2011
2012 struct FEBinTrianglesChooser
2013 {
2014     typedef PFN_PROCESS_PRIMS FuncType;
2015
2016     template <typename... ArgsB>
2017     static FuncType GetFunc()
2018     {
2019         return BinTriangles<ConservativeRastFETraits<ArgsB...>>;
2020     }
2021 };
2022
2023 // Selector for correct templated BinTrinagles function
2024 PFN_PROCESS_PRIMS GetBinTrianglesFunc(bool IsConservative)
2025 {
2026     return TemplateArgUnroller<FEBinTrianglesChooser>::GetFunc(IsConservative);
2027 }
2028
2029 //////////////////////////////////////////////////////////////////////////
2030 /// @brief Bin SIMD points to the backend.  Only supports point size of 1
2031 /// @param pDC - pointer to draw context.
2032 /// @param pa - The primitive assembly object.
2033 /// @param workerId - thread's worker id. Even thread has a unique id.
2034 /// @param tri - Contains point position data for SIMDs worth of points.
2035 /// @param primID - Primitive ID for each point.
2036 void BinPoints(
2037     DRAW_CONTEXT *pDC,
2038     PA_STATE& pa,
2039     uint32_t workerId,
2040     simdvector prim[3],
2041     uint32_t primMask,
2042     simdscalari primID)
2043 {
2044     RDTSC_START(FEBinPoints);
2045
2046     simdvector& primVerts = prim[0];
2047
2048     const API_STATE& state = GetApiState(pDC);
2049     const SWR_FRONTEND_STATE& feState = state.frontendState;
2050     const SWR_GS_STATE& gsState = state.gsState;
2051     const SWR_RASTSTATE& rastState = state.rastState;
2052
2053     if (!feState.vpTransformDisable)
2054     {
2055         // perspective divide
2056         simdscalar vRecipW0 = _simd_div_ps(_simd_set1_ps(1.0f), primVerts.w);
2057         primVerts.x = _simd_mul_ps(primVerts.x, vRecipW0);
2058         primVerts.y = _simd_mul_ps(primVerts.y, vRecipW0);
2059         primVerts.z = _simd_mul_ps(primVerts.z, vRecipW0);
2060
2061         // viewport transform to screen coords
2062         viewportTransform<1>(&primVerts, state.vpMatrix[0]);
2063     }
2064
2065     // adjust for pixel center location
2066     simdscalar offset = g_pixelOffsets[rastState.pixelLocation];
2067     primVerts.x = _simd_add_ps(primVerts.x, offset);
2068     primVerts.y = _simd_add_ps(primVerts.y, offset);
2069
2070     // convert to fixed point
2071     simdscalari vXi, vYi;
2072     vXi = fpToFixedPointVertical(primVerts.x);
2073     vYi = fpToFixedPointVertical(primVerts.y);
2074
2075     if (CanUseSimplePoints(pDC))
2076     {
2077         // adjust for top-left rule
2078         vXi = _simd_sub_epi32(vXi, _simd_set1_epi32(1));
2079         vYi = _simd_sub_epi32(vYi, _simd_set1_epi32(1));
2080
2081         // cull points off the top-left edge of the viewport
2082         primMask &= ~_simd_movemask_ps(_simd_castsi_ps(vXi));
2083         primMask &= ~_simd_movemask_ps(_simd_castsi_ps(vYi));
2084
2085         // compute macro tile coordinates
2086         simdscalari macroX = _simd_srai_epi32(vXi, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
2087         simdscalari macroY = _simd_srai_epi32(vYi, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
2088
2089         OSALIGNSIMD(uint32_t) aMacroX[KNOB_SIMD_WIDTH], aMacroY[KNOB_SIMD_WIDTH];
2090         _simd_store_si((simdscalari*)aMacroX, macroX);
2091         _simd_store_si((simdscalari*)aMacroY, macroY);
2092
2093         // compute raster tile coordinates
2094         simdscalari rasterX = _simd_srai_epi32(vXi, KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT);
2095         simdscalari rasterY = _simd_srai_epi32(vYi, KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT);
2096
2097         // compute raster tile relative x,y for coverage mask
2098         simdscalari tileAlignedX = _simd_slli_epi32(rasterX, KNOB_TILE_X_DIM_SHIFT);
2099         simdscalari tileAlignedY = _simd_slli_epi32(rasterY, KNOB_TILE_Y_DIM_SHIFT);
2100
2101         simdscalari tileRelativeX = _simd_sub_epi32(_simd_srai_epi32(vXi, FIXED_POINT_SHIFT), tileAlignedX);
2102         simdscalari tileRelativeY = _simd_sub_epi32(_simd_srai_epi32(vYi, FIXED_POINT_SHIFT), tileAlignedY);
2103
2104         OSALIGNSIMD(uint32_t) aTileRelativeX[KNOB_SIMD_WIDTH];
2105         OSALIGNSIMD(uint32_t) aTileRelativeY[KNOB_SIMD_WIDTH];
2106         _simd_store_si((simdscalari*)aTileRelativeX, tileRelativeX);
2107         _simd_store_si((simdscalari*)aTileRelativeY, tileRelativeY);
2108
2109         OSALIGNSIMD(uint32_t) aTileAlignedX[KNOB_SIMD_WIDTH];
2110         OSALIGNSIMD(uint32_t) aTileAlignedY[KNOB_SIMD_WIDTH];
2111         _simd_store_si((simdscalari*)aTileAlignedX, tileAlignedX);
2112         _simd_store_si((simdscalari*)aTileAlignedY, tileAlignedY);
2113
2114         OSALIGNSIMD(float) aZ[KNOB_SIMD_WIDTH];
2115         _simd_store_ps((float*)aZ, primVerts.z);
2116
2117         // store render target array index
2118         OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH];
2119         if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex)
2120         {
2121             simdvector vRtai;
2122             pa.Assemble(VERTEX_RTAI_SLOT, &vRtai);
2123             simdscalari vRtaii = _simd_castps_si(vRtai.x);
2124             _simd_store_si((simdscalari*)aRTAI, vRtaii);
2125         }
2126         else
2127         {
2128             _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si());
2129         }
2130
2131         uint32_t *pPrimID = (uint32_t *)&primID;
2132         DWORD primIndex = 0;
2133         // scan remaining valid triangles and bin each separately
2134         while (_BitScanForward(&primIndex, primMask))
2135         {
2136             uint32_t linkageCount = state.linkageCount;
2137             uint32_t linkageMask = state.linkageMask;
2138
2139             uint32_t numScalarAttribs = linkageCount * 4;
2140
2141             BE_WORK work;
2142             work.type = DRAW;
2143
2144             TRIANGLE_WORK_DESC &desc = work.desc.tri;
2145
2146             // points are always front facing
2147             desc.triFlags.frontFacing = 1;
2148             desc.triFlags.primID = pPrimID[primIndex];
2149             desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];
2150
2151             work.pfnWork = RasterizeSimplePoint;
2152
2153             auto pArena = pDC->pArena;
2154             SWR_ASSERT(pArena != nullptr);
2155
2156             // store attributes
2157             float *pAttribs = (float*)pArena->AllocAligned(3 * numScalarAttribs * sizeof(float), 16);
2158             desc.pAttribs = pAttribs;
2159             desc.numAttribs = linkageCount;
2160
2161             ProcessAttributes<1>(pDC, pa, linkageMask, state.linkageMap, primIndex, pAttribs);
2162
2163             // store raster tile aligned x, y, perspective correct z
2164             float *pTriBuffer = (float*)pArena->AllocAligned(4 * sizeof(float), 16);
2165             desc.pTriBuffer = pTriBuffer;
2166             *(uint32_t*)pTriBuffer++ = aTileAlignedX[primIndex];
2167             *(uint32_t*)pTriBuffer++ = aTileAlignedY[primIndex];
2168             *pTriBuffer = aZ[primIndex];
2169
2170             uint32_t tX = aTileRelativeX[primIndex];
2171             uint32_t tY = aTileRelativeY[primIndex];
2172
2173             // pack the relative x,y into the coverageMask, the rasterizer will
2174             // generate the true coverage mask from it
2175             work.desc.tri.triFlags.coverageMask = tX | (tY << 4);
2176
2177             // bin it
2178             MacroTileMgr *pTileMgr = pDC->pTileMgr;
2179 #if KNOB_ENABLE_TOSS_POINTS
2180             if (!KNOB_TOSS_SETUP_TRIS)
2181 #endif
2182             {
2183                 pTileMgr->enqueue(aMacroX[primIndex], aMacroY[primIndex], &work);
2184             }
2185             primMask &= ~(1 << primIndex);
2186         }
2187     }
2188     else
2189     {
2190         // non simple points need to be potentially binned to multiple macro tiles
2191         simdscalar vPointSize;
2192         if (rastState.pointParam)
2193         {
2194             simdvector size[3];
2195             pa.Assemble(VERTEX_POINT_SIZE_SLOT, size);
2196             vPointSize = size[0].x;
2197         }
2198         else
2199         {
2200             vPointSize = _simd_set1_ps(rastState.pointSize);
2201         }
2202
2203         // bloat point to bbox
2204         simdBBox bbox;
2205         bbox.left = bbox.right = vXi;
2206         bbox.top = bbox.bottom = vYi;
2207
2208         simdscalar vHalfWidth = _simd_mul_ps(vPointSize, _simd_set1_ps(0.5f));
2209         simdscalari vHalfWidthi = fpToFixedPointVertical(vHalfWidth);
2210         bbox.left = _simd_sub_epi32(bbox.left, vHalfWidthi);
2211         bbox.right = _simd_add_epi32(bbox.right, vHalfWidthi);
2212         bbox.top = _simd_sub_epi32(bbox.top, vHalfWidthi);
2213         bbox.bottom = _simd_add_epi32(bbox.bottom, vHalfWidthi);
2214
2215         // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since right/bottom edge is exclusive.
2216         bbox.left = _simd_max_epi32(bbox.left, _simd_set1_epi32(state.scissorInFixedPoint.left));
2217         bbox.top = _simd_max_epi32(bbox.top, _simd_set1_epi32(state.scissorInFixedPoint.top));
2218         bbox.right = _simd_min_epi32(_simd_sub_epi32(bbox.right, _simd_set1_epi32(1)), _simd_set1_epi32(state.scissorInFixedPoint.right));
2219         bbox.bottom = _simd_min_epi32(_simd_sub_epi32(bbox.bottom, _simd_set1_epi32(1)), _simd_set1_epi32(state.scissorInFixedPoint.bottom));
2220
2221         // Cull bloated points completely outside scissor
2222         simdscalari maskOutsideScissorX = _simd_cmpgt_epi32(bbox.left, bbox.right);
2223         simdscalari maskOutsideScissorY = _simd_cmpgt_epi32(bbox.top, bbox.bottom);
2224         simdscalari maskOutsideScissorXY = _simd_or_si(maskOutsideScissorX, maskOutsideScissorY);
2225         uint32_t maskOutsideScissor = _simd_movemask_ps(_simd_castsi_ps(maskOutsideScissorXY));
2226         primMask = primMask & ~maskOutsideScissor;
2227
2228         // Convert bbox to macrotile units.
2229         bbox.left = _simd_srai_epi32(bbox.left, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
2230         bbox.top = _simd_srai_epi32(bbox.top, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
2231         bbox.right = _simd_srai_epi32(bbox.right, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
2232         bbox.bottom = _simd_srai_epi32(bbox.bottom, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
2233
2234         OSALIGNSIMD(uint32_t) aMTLeft[KNOB_SIMD_WIDTH], aMTRight[KNOB_SIMD_WIDTH], aMTTop[KNOB_SIMD_WIDTH], aMTBottom[KNOB_SIMD_WIDTH];
2235         _simd_store_si((simdscalari*)aMTLeft, bbox.left);
2236         _simd_store_si((simdscalari*)aMTRight, bbox.right);
2237         _simd_store_si((simdscalari*)aMTTop, bbox.top);
2238         _simd_store_si((simdscalari*)aMTBottom, bbox.bottom);
2239
2240         // store render target array index
2241         OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH];
2242         if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex)
2243         {
2244             simdvector vRtai[2];
2245             pa.Assemble(VERTEX_RTAI_SLOT, vRtai);
2246             simdscalari vRtaii = _simd_castps_si(vRtai[0].x);
2247             _simd_store_si((simdscalari*)aRTAI, vRtaii);
2248         }
2249         else
2250         {
2251             _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si());
2252         }
2253
2254         OSALIGNSIMD(float) aPointSize[KNOB_SIMD_WIDTH];
2255         _simd_store_ps((float*)aPointSize, vPointSize);
2256
2257         uint32_t *pPrimID = (uint32_t *)&primID;
2258
2259         OSALIGNSIMD(float) aPrimVertsX[KNOB_SIMD_WIDTH];
2260         OSALIGNSIMD(float) aPrimVertsY[KNOB_SIMD_WIDTH];
2261         OSALIGNSIMD(float) aPrimVertsZ[KNOB_SIMD_WIDTH];
2262
2263         _simd_store_ps((float*)aPrimVertsX, primVerts.x);
2264         _simd_store_ps((float*)aPrimVertsY, primVerts.y);
2265         _simd_store_ps((float*)aPrimVertsZ, primVerts.z);
2266
2267         // scan remaining valid prims and bin each separately
2268         DWORD primIndex;
2269         while (_BitScanForward(&primIndex, primMask))
2270         {
2271             uint32_t linkageCount = state.linkageCount;
2272             uint32_t linkageMask = state.linkageMask;
2273             uint32_t numScalarAttribs = linkageCount * 4;
2274
2275             BE_WORK work;
2276             work.type = DRAW;
2277
2278             TRIANGLE_WORK_DESC &desc = work.desc.tri;
2279
2280             desc.triFlags.frontFacing = 1;
2281             desc.triFlags.primID = pPrimID[primIndex];
2282             desc.triFlags.pointSize = aPointSize[primIndex];
2283             desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];
2284
2285             work.pfnWork = RasterizeTriPoint;
2286
2287             auto pArena = pDC->pArena;
2288             SWR_ASSERT(pArena != nullptr);
2289
2290             // store active attribs
2291             desc.pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
2292             desc.numAttribs = linkageCount;
2293             ProcessAttributes<1>(pDC, pa, linkageMask, state.linkageMap, primIndex, desc.pAttribs);
2294
2295             // store point vertex data
2296             float *pTriBuffer = (float*)pArena->AllocAligned(4 * sizeof(float), 16);
2297             desc.pTriBuffer = pTriBuffer;
2298             *pTriBuffer++ = aPrimVertsX[primIndex];
2299             *pTriBuffer++ = aPrimVertsY[primIndex];
2300             *pTriBuffer = aPrimVertsZ[primIndex];
2301
2302             // store user clip distances
2303             if (rastState.clipDistanceMask)
2304             {
2305                 uint32_t numClipDist = _mm_popcnt_u32(rastState.clipDistanceMask);
2306                 desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 2 * sizeof(float));
2307                 ProcessUserClipDist<2>(pa, primIndex, rastState.clipDistanceMask, desc.pUserClipBuffer);
2308             }
2309
2310             MacroTileMgr *pTileMgr = pDC->pTileMgr;
2311             for (uint32_t y = aMTTop[primIndex]; y <= aMTBottom[primIndex]; ++y)
2312             {
2313                 for (uint32_t x = aMTLeft[primIndex]; x <= aMTRight[primIndex]; ++x)
2314                 {
2315 #if KNOB_ENABLE_TOSS_POINTS
2316                     if (!KNOB_TOSS_SETUP_TRIS)
2317 #endif
2318                     {
2319                         pTileMgr->enqueue(x, y, &work);
2320                     }
2321                 }
2322             }
2323
2324             primMask &= ~(1 << primIndex);
2325         }
2326     }
2327
2328
2329
2330
2331     RDTSC_STOP(FEBinPoints, 1, 0);
2332 }
2333
2334 //////////////////////////////////////////////////////////////////////////
2335 /// @brief Bin SIMD lines to the backend.
2336 /// @param pDC - pointer to draw context.
2337 /// @param pa - The primitive assembly object.
2338 /// @param workerId - thread's worker id. Even thread has a unique id.
2339 /// @param tri - Contains line position data for SIMDs worth of points.
2340 /// @param primID - Primitive ID for each line.
2341 void BinLines(
2342     DRAW_CONTEXT *pDC,
2343     PA_STATE& pa,
2344     uint32_t workerId,
2345     simdvector prim[],
2346     uint32_t primMask,
2347     simdscalari primID)
2348 {
2349     RDTSC_START(FEBinLines);
2350
2351     const API_STATE& state = GetApiState(pDC);
2352     const SWR_RASTSTATE& rastState = state.rastState;
2353     const SWR_FRONTEND_STATE& feState = state.frontendState;
2354     const SWR_GS_STATE& gsState = state.gsState;
2355
2356     simdscalar vRecipW0 = _simd_set1_ps(1.0f);
2357     simdscalar vRecipW1 = _simd_set1_ps(1.0f);
2358
2359     if (!feState.vpTransformDisable)
2360     {
2361         // perspective divide
2362         vRecipW0 = _simd_div_ps(_simd_set1_ps(1.0f), prim[0].w);
2363         vRecipW1 = _simd_div_ps(_simd_set1_ps(1.0f), prim[1].w);
2364
2365         prim[0].v[0] = _simd_mul_ps(prim[0].v[0], vRecipW0);
2366         prim[1].v[0] = _simd_mul_ps(prim[1].v[0], vRecipW1);
2367
2368         prim[0].v[1] = _simd_mul_ps(prim[0].v[1], vRecipW0);
2369         prim[1].v[1] = _simd_mul_ps(prim[1].v[1], vRecipW1);
2370
2371         prim[0].v[2] = _simd_mul_ps(prim[0].v[2], vRecipW0);
2372         prim[1].v[2] = _simd_mul_ps(prim[1].v[2], vRecipW1);
2373
2374         // viewport transform to screen coords
2375         viewportTransform<2>(prim, state.vpMatrix[0]);
2376     }
2377
2378     // adjust for pixel center location
2379     simdscalar offset = g_pixelOffsets[rastState.pixelLocation];
2380     prim[0].x = _simd_add_ps(prim[0].x, offset);
2381     prim[0].y = _simd_add_ps(prim[0].y, offset);
2382
2383     prim[1].x = _simd_add_ps(prim[1].x, offset);
2384     prim[1].y = _simd_add_ps(prim[1].y, offset);
2385
2386     // convert to fixed point
2387     simdscalari vXi[2], vYi[2];
2388     vXi[0] = fpToFixedPointVertical(prim[0].x);
2389     vYi[0] = fpToFixedPointVertical(prim[0].y);
2390     vXi[1] = fpToFixedPointVertical(prim[1].x);
2391     vYi[1] = fpToFixedPointVertical(prim[1].y);
2392
2393     // compute x-major vs y-major mask
2394     simdscalari xLength = _simd_abs_epi32(_simd_sub_epi32(vXi[0], vXi[1]));
2395     simdscalari yLength = _simd_abs_epi32(_simd_sub_epi32(vYi[0], vYi[1]));
2396     simdscalar vYmajorMask = _simd_castsi_ps(_simd_cmpgt_epi32(yLength, xLength));
2397     uint32_t yMajorMask = _simd_movemask_ps(vYmajorMask);
2398
2399     // cull zero-length lines
2400     simdscalari vZeroLengthMask = _simd_cmpeq_epi32(xLength, _simd_setzero_si());
2401     vZeroLengthMask = _simd_and_si(vZeroLengthMask, _simd_cmpeq_epi32(yLength, _simd_setzero_si()));
2402
2403     primMask &= ~_simd_movemask_ps(_simd_castsi_ps(vZeroLengthMask));
2404
2405     uint32_t *pPrimID = (uint32_t *)&primID;
2406
2407     simdscalar vUnused = _simd_setzero_ps();
2408
2409     // Calc bounding box of lines
2410     simdBBox bbox;
2411     bbox.left = _simd_min_epi32(vXi[0], vXi[1]);
2412     bbox.right = _simd_max_epi32(vXi[0], vXi[1]);
2413     bbox.top = _simd_min_epi32(vYi[0], vYi[1]);
2414     bbox.bottom = _simd_max_epi32(vYi[0], vYi[1]);
2415
2416     // bloat bbox by line width along minor axis
2417     simdscalar vHalfWidth = _simd_set1_ps(rastState.lineWidth / 2.0f);
2418     simdscalari vHalfWidthi = fpToFixedPointVertical(vHalfWidth);
2419     simdBBox bloatBox;
2420     bloatBox.left = _simd_sub_epi32(bbox.left, vHalfWidthi);
2421     bloatBox.right = _simd_add_epi32(bbox.right, vHalfWidthi);
2422     bloatBox.top = _simd_sub_epi32(bbox.top, vHalfWidthi);
2423     bloatBox.bottom = _simd_add_epi32(bbox.bottom, vHalfWidthi);
2424
2425     bbox.left = _simd_blendv_epi32(bbox.left, bloatBox.left, vYmajorMask);
2426     bbox.right = _simd_blendv_epi32(bbox.right, bloatBox.right, vYmajorMask);
2427     bbox.top = _simd_blendv_epi32(bloatBox.top, bbox.top, vYmajorMask);
2428     bbox.bottom = _simd_blendv_epi32(bloatBox.bottom, bbox.bottom, vYmajorMask);
2429
2430     // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since right/bottom edge is exclusive.
2431     bbox.left = _simd_max_epi32(bbox.left, _simd_set1_epi32(state.scissorInFixedPoint.left));
2432     bbox.top = _simd_max_epi32(bbox.top, _simd_set1_epi32(state.scissorInFixedPoint.top));
2433     bbox.right = _simd_min_epi32(_simd_sub_epi32(bbox.right, _simd_set1_epi32(1)), _simd_set1_epi32(state.scissorInFixedPoint.right));
2434     bbox.bottom = _simd_min_epi32(_simd_sub_epi32(bbox.bottom, _simd_set1_epi32(1)), _simd_set1_epi32(state.scissorInFixedPoint.bottom));
2435
2436     // Cull prims completely outside scissor
2437     {
2438         simdscalari maskOutsideScissorX = _simd_cmpgt_epi32(bbox.left, bbox.right);
2439         simdscalari maskOutsideScissorY = _simd_cmpgt_epi32(bbox.top, bbox.bottom);
2440         simdscalari maskOutsideScissorXY = _simd_or_si(maskOutsideScissorX, maskOutsideScissorY);
2441         uint32_t maskOutsideScissor = _simd_movemask_ps(_simd_castsi_ps(maskOutsideScissorXY));
2442         primMask = primMask & ~maskOutsideScissor;
2443     }
2444
2445     if (!primMask)
2446     {
2447         goto endBinLines;
2448     }
2449
2450     // Convert triangle bbox to macrotile units.
2451     bbox.left = _simd_srai_epi32(bbox.left, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
2452     bbox.top = _simd_srai_epi32(bbox.top, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
2453     bbox.right = _simd_srai_epi32(bbox.right, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
2454     bbox.bottom = _simd_srai_epi32(bbox.bottom, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
2455
2456     OSALIGNSIMD(uint32_t) aMTLeft[KNOB_SIMD_WIDTH], aMTRight[KNOB_SIMD_WIDTH], aMTTop[KNOB_SIMD_WIDTH], aMTBottom[KNOB_SIMD_WIDTH];
2457     _simd_store_si((simdscalari*)aMTLeft, bbox.left);
2458     _simd_store_si((simdscalari*)aMTRight, bbox.right);
2459     _simd_store_si((simdscalari*)aMTTop, bbox.top);
2460     _simd_store_si((simdscalari*)aMTBottom, bbox.bottom);
2461
2462     // transpose verts needed for backend
2463     /// @todo modify BE to take non-transformed verts
2464     __m128 vHorizX[8], vHorizY[8], vHorizZ[8], vHorizW[8];
2465     vTranspose3x8(vHorizX, prim[0].x, prim[1].x, vUnused);
2466     vTranspose3x8(vHorizY, prim[0].y, prim[1].y, vUnused);
2467     vTranspose3x8(vHorizZ, prim[0].z, prim[1].z, vUnused);
2468     vTranspose3x8(vHorizW, vRecipW0, vRecipW1, vUnused);
2469
2470     // store render target array index
2471     OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH];
2472     if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex)
2473     {
2474         simdvector vRtai[2];
2475         pa.Assemble(VERTEX_RTAI_SLOT, vRtai);
2476         simdscalari vRtaii = _simd_castps_si(vRtai[0].x);
2477         _simd_store_si((simdscalari*)aRTAI, vRtaii);
2478     }
2479     else
2480     {
2481         _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si());
2482     }
2483
2484     // scan remaining valid prims and bin each separately
2485     DWORD primIndex;
2486     while (_BitScanForward(&primIndex, primMask))
2487     {
2488         uint32_t linkageCount = state.linkageCount;
2489         uint32_t linkageMask = state.linkageMask;
2490         uint32_t numScalarAttribs = linkageCount * 4;
2491
2492         BE_WORK work;
2493         work.type = DRAW;
2494
2495         TRIANGLE_WORK_DESC &desc = work.desc.tri;
2496
2497         desc.triFlags.frontFacing = 1;
2498         desc.triFlags.primID = pPrimID[primIndex];
2499         desc.triFlags.yMajor = (yMajorMask >> primIndex) & 1;
2500         desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];
2501
2502         work.pfnWork = RasterizeLine;
2503
2504         auto pArena = pDC->pArena;
2505         SWR_ASSERT(pArena != nullptr);
2506
2507         // store active attribs
2508         desc.pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
2509         desc.numAttribs = linkageCount;
2510         ProcessAttributes<2>(pDC, pa, linkageMask, state.linkageMap, primIndex, desc.pAttribs);
2511
2512         // store line vertex data
2513         desc.pTriBuffer = (float*)pArena->AllocAligned(4 * 4 * sizeof(float), 16);
2514         _mm_store_ps(&desc.pTriBuffer[0], vHorizX[primIndex]);
2515         _mm_store_ps(&desc.pTriBuffer[4], vHorizY[primIndex]);
2516         _mm_store_ps(&desc.pTriBuffer[8], vHorizZ[primIndex]);
2517         _mm_store_ps(&desc.pTriBuffer[12], vHorizW[primIndex]);
2518
2519         // store user clip distances
2520         if (rastState.clipDistanceMask)
2521         {
2522             uint32_t numClipDist = _mm_popcnt_u32(rastState.clipDistanceMask);
2523             desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 2 * sizeof(float));
2524             ProcessUserClipDist<2>(pa, primIndex, rastState.clipDistanceMask, desc.pUserClipBuffer);
2525         }
2526
2527         MacroTileMgr *pTileMgr = pDC->pTileMgr;
2528         for (uint32_t y = aMTTop[primIndex]; y <= aMTBottom[primIndex]; ++y)
2529         {
2530             for (uint32_t x = aMTLeft[primIndex]; x <= aMTRight[primIndex]; ++x)
2531             {
2532 #if KNOB_ENABLE_TOSS_POINTS
2533                 if (!KNOB_TOSS_SETUP_TRIS)
2534 #endif
2535                 {
2536                     pTileMgr->enqueue(x, y, &work);
2537                 }
2538             }
2539         }
2540
2541         primMask &= ~(1 << primIndex);
2542     }
2543
2544 endBinLines:
2545
2546     RDTSC_STOP(FEBinLines, 1, 0);
2547 }