swr: [rasterizer core] routing of viewport indexes through frontend
[mesa.git] / src / gallium / drivers / swr / rasterizer / core / frontend.cpp
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file frontend.cpp
24 *
25 * @brief Implementation for Frontend which handles vertex processing,
26 * primitive assembly, clipping, binning, etc.
27 *
28 ******************************************************************************/
29
30 #include "api.h"
31 #include "frontend.h"
32 #include "backend.h"
33 #include "context.h"
34 #include "rdtsc_core.h"
35 #include "rasterizer.h"
36 #include "conservativeRast.h"
37 #include "utils.h"
38 #include "threads.h"
39 #include "pa.h"
40 #include "clip.h"
41 #include "tilemgr.h"
42 #include "tessellator.h"
43 #include <limits>
44
45 //////////////////////////////////////////////////////////////////////////
46 /// @brief Helper macro to generate a bitmask
47 static INLINE uint32_t GenMask(uint32_t numBits)
48 {
49 SWR_ASSERT(numBits <= (sizeof(uint32_t) * 8), "Too many bits (%d) for %s", numBits, __FUNCTION__);
50 return ((1U << numBits) - 1);
51 }
52
//////////////////////////////////////////////////////////////////////////
/// @brief Offsets added to post-viewport vertex positions based on
/// raster state. Indexed by the SWR_PIXEL_LOCATION_* enum value.
static const simdscalar g_pixelOffsets[SWR_PIXEL_LOCATION_UL + 1] =
{
    _simd_set1_ps(0.0f), // SWR_PIXEL_LOCATION_CENTER: no offset applied
    _simd_set1_ps(0.5f), // SWR_PIXEL_LOCATION_UL: half-pixel offset
};
61
62 //////////////////////////////////////////////////////////////////////////
63 /// @brief FE handler for SwrSync.
64 /// @param pContext - pointer to SWR context.
65 /// @param pDC - pointer to draw context.
66 /// @param workerId - thread's worker id. Even thread has a unique id.
67 /// @param pUserData - Pointer to user data passed back to sync callback.
68 /// @todo This should go away when we switch this to use compute threading.
69 void ProcessSync(
70 SWR_CONTEXT *pContext,
71 DRAW_CONTEXT *pDC,
72 uint32_t workerId,
73 void *pUserData)
74 {
75 BE_WORK work;
76 work.type = SYNC;
77 work.pfnWork = ProcessSyncBE;
78
79 MacroTileMgr *pTileMgr = pDC->pTileMgr;
80 pTileMgr->enqueue(0, 0, &work);
81 }
82
83 //////////////////////////////////////////////////////////////////////////
84 /// @brief FE handler for SwrClearRenderTarget.
85 /// @param pContext - pointer to SWR context.
86 /// @param pDC - pointer to draw context.
87 /// @param workerId - thread's worker id. Even thread has a unique id.
88 /// @param pUserData - Pointer to user data passed back to clear callback.
89 /// @todo This should go away when we switch this to use compute threading.
90 void ProcessClear(
91 SWR_CONTEXT *pContext,
92 DRAW_CONTEXT *pDC,
93 uint32_t workerId,
94 void *pUserData)
95 {
96 CLEAR_DESC *pClear = (CLEAR_DESC*)pUserData;
97 MacroTileMgr *pTileMgr = pDC->pTileMgr;
98
99 const API_STATE& state = GetApiState(pDC);
100
101 // queue a clear to each macro tile
102 // compute macro tile bounds for the current scissor/viewport
103 uint32_t macroTileLeft = state.scissorInFixedPoint.left / KNOB_MACROTILE_X_DIM_FIXED;
104 uint32_t macroTileRight = state.scissorInFixedPoint.right / KNOB_MACROTILE_X_DIM_FIXED;
105 uint32_t macroTileTop = state.scissorInFixedPoint.top / KNOB_MACROTILE_Y_DIM_FIXED;
106 uint32_t macroTileBottom = state.scissorInFixedPoint.bottom / KNOB_MACROTILE_Y_DIM_FIXED;
107
108 BE_WORK work;
109 work.type = CLEAR;
110 work.pfnWork = ProcessClearBE;
111 work.desc.clear = *pClear;
112
113 for (uint32_t y = macroTileTop; y <= macroTileBottom; ++y)
114 {
115 for (uint32_t x = macroTileLeft; x <= macroTileRight; ++x)
116 {
117 pTileMgr->enqueue(x, y, &work);
118 }
119 }
120 }
121
122 //////////////////////////////////////////////////////////////////////////
123 /// @brief FE handler for SwrStoreTiles.
124 /// @param pContext - pointer to SWR context.
125 /// @param pDC - pointer to draw context.
126 /// @param workerId - thread's worker id. Even thread has a unique id.
127 /// @param pUserData - Pointer to user data passed back to callback.
128 /// @todo This should go away when we switch this to use compute threading.
129 void ProcessStoreTiles(
130 SWR_CONTEXT *pContext,
131 DRAW_CONTEXT *pDC,
132 uint32_t workerId,
133 void *pUserData)
134 {
135 RDTSC_START(FEProcessStoreTiles);
136 STORE_TILES_DESC *pStore = (STORE_TILES_DESC*)pUserData;
137 MacroTileMgr *pTileMgr = pDC->pTileMgr;
138
139 const API_STATE& state = GetApiState(pDC);
140
141 // queue a store to each macro tile
142 // compute macro tile bounds for the current render target
143 const uint32_t macroWidth = KNOB_MACROTILE_X_DIM;
144 const uint32_t macroHeight = KNOB_MACROTILE_Y_DIM;
145
146 uint32_t numMacroTilesX = ((uint32_t)state.vp[0].width + (uint32_t)state.vp[0].x + (macroWidth - 1)) / macroWidth;
147 uint32_t numMacroTilesY = ((uint32_t)state.vp[0].height + (uint32_t)state.vp[0].y + (macroHeight - 1)) / macroHeight;
148
149 // store tiles
150 BE_WORK work;
151 work.type = STORETILES;
152 work.pfnWork = ProcessStoreTileBE;
153 work.desc.storeTiles = *pStore;
154
155 for (uint32_t x = 0; x < numMacroTilesX; ++x)
156 {
157 for (uint32_t y = 0; y < numMacroTilesY; ++y)
158 {
159 pTileMgr->enqueue(x, y, &work);
160 }
161 }
162
163 RDTSC_STOP(FEProcessStoreTiles, 0, pDC->drawId);
164 }
165
//////////////////////////////////////////////////////////////////////////
/// @brief FE handler for SwrInvalidateTiles.
/// @param pContext - pointer to SWR context.
/// @param pDC - pointer to draw context.
/// @param workerId - thread's worker id. Even thread has a unique id.
/// @param pUserData - DISCARD_INVALIDATE_TILES_DESC; its rect selects the
///        region (all-zero rect means "use viewport 0").
/// @todo This should go away when we switch this to use compute threading.
void ProcessDiscardInvalidateTiles(
    SWR_CONTEXT *pContext,
    DRAW_CONTEXT *pDC,
    uint32_t workerId,
    void *pUserData)
{
    RDTSC_START(FEProcessInvalidateTiles);
    DISCARD_INVALIDATE_TILES_DESC *pInv = (DISCARD_INVALIDATE_TILES_DESC*)pUserData;
    MacroTileMgr *pTileMgr = pDC->pTileMgr;

    SWR_RECT rect;

    // Any nonzero edge means the caller supplied an explicit region; a rect
    // of all zeros falls back to viewport 0's dimensions.
    if (pInv->rect.top | pInv->rect.bottom | pInv->rect.right | pInv->rect.left)
    {
        // Valid rect
        rect = pInv->rect;
    }
    else
    {
        // Use viewport dimensions
        const API_STATE& state = GetApiState(pDC);

        rect.left = (uint32_t)state.vp[0].x;
        rect.right = (uint32_t)(state.vp[0].x + state.vp[0].width);
        rect.top = (uint32_t)state.vp[0].y;
        rect.bottom = (uint32_t)(state.vp[0].y + state.vp[0].height);
    }

    // queue a store to each macro tile
    // compute macro tile bounds for the current render target
    uint32_t macroWidth = KNOB_MACROTILE_X_DIM;
    uint32_t macroHeight = KNOB_MACROTILE_Y_DIM;

    // Setup region assuming full tiles: start rounds up and end rounds down,
    // so partially covered edge tiles are excluded.
    uint32_t macroTileStartX = (rect.left + (macroWidth - 1)) / macroWidth;
    uint32_t macroTileStartY = (rect.top + (macroHeight - 1)) / macroHeight;

    uint32_t macroTileEndX = rect.right / macroWidth;
    uint32_t macroTileEndY = rect.bottom / macroHeight;

    if (pInv->fullTilesOnly == false)
    {
        // include partial tiles: start rounds down, end rounds up instead.
        macroTileStartX = rect.left / macroWidth;
        macroTileStartY = rect.top / macroHeight;

        macroTileEndX = (rect.right + macroWidth - 1) / macroWidth;
        macroTileEndY = (rect.bottom + macroHeight - 1) / macroHeight;
    }

    SWR_ASSERT(macroTileEndX <= KNOB_NUM_HOT_TILES_X);
    SWR_ASSERT(macroTileEndY <= KNOB_NUM_HOT_TILES_Y);

    // Clamp in release builds too, so an oversized rect cannot enqueue tiles
    // past the hot-tile grid.
    macroTileEndX = std::min<uint32_t>(macroTileEndX, KNOB_NUM_HOT_TILES_X);
    macroTileEndY = std::min<uint32_t>(macroTileEndY, KNOB_NUM_HOT_TILES_Y);

    // load tiles
    BE_WORK work;
    work.type = DISCARDINVALIDATETILES;
    work.pfnWork = ProcessDiscardInvalidateTilesBE;
    work.desc.discardInvalidateTiles = *pInv;

    for (uint32_t x = macroTileStartX; x < macroTileEndX; ++x)
    {
        for (uint32_t y = macroTileStartY; y < macroTileEndY; ++y)
        {
            pTileMgr->enqueue(x, y, &work);
        }
    }

    RDTSC_STOP(FEProcessInvalidateTiles, 0, pDC->drawId);
}
245
246 //////////////////////////////////////////////////////////////////////////
247 /// @brief Computes the number of primitives given the number of verts.
248 /// @param mode - primitive topology for draw operation.
249 /// @param numPrims - number of vertices or indices for draw.
250 /// @todo Frontend needs to be refactored. This will go in appropriate place then.
251 uint32_t GetNumPrims(
252 PRIMITIVE_TOPOLOGY mode,
253 uint32_t numPrims)
254 {
255 switch (mode)
256 {
257 case TOP_POINT_LIST: return numPrims;
258 case TOP_TRIANGLE_LIST: return numPrims / 3;
259 case TOP_TRIANGLE_STRIP: return numPrims < 3 ? 0 : numPrims - 2;
260 case TOP_TRIANGLE_FAN: return numPrims < 3 ? 0 : numPrims - 2;
261 case TOP_TRIANGLE_DISC: return numPrims < 2 ? 0 : numPrims - 1;
262 case TOP_QUAD_LIST: return numPrims / 4;
263 case TOP_QUAD_STRIP: return numPrims < 4 ? 0 : (numPrims - 2) / 2;
264 case TOP_LINE_STRIP: return numPrims < 2 ? 0 : numPrims - 1;
265 case TOP_LINE_LIST: return numPrims / 2;
266 case TOP_LINE_LOOP: return numPrims;
267 case TOP_RECT_LIST: return numPrims / 3;
268 case TOP_LINE_LIST_ADJ: return numPrims / 4;
269 case TOP_LISTSTRIP_ADJ: return numPrims < 3 ? 0 : numPrims - 3;
270 case TOP_TRI_LIST_ADJ: return numPrims / 6;
271 case TOP_TRI_STRIP_ADJ: return numPrims < 4 ? 0 : (numPrims / 2) - 2;
272
273 case TOP_PATCHLIST_1:
274 case TOP_PATCHLIST_2:
275 case TOP_PATCHLIST_3:
276 case TOP_PATCHLIST_4:
277 case TOP_PATCHLIST_5:
278 case TOP_PATCHLIST_6:
279 case TOP_PATCHLIST_7:
280 case TOP_PATCHLIST_8:
281 case TOP_PATCHLIST_9:
282 case TOP_PATCHLIST_10:
283 case TOP_PATCHLIST_11:
284 case TOP_PATCHLIST_12:
285 case TOP_PATCHLIST_13:
286 case TOP_PATCHLIST_14:
287 case TOP_PATCHLIST_15:
288 case TOP_PATCHLIST_16:
289 case TOP_PATCHLIST_17:
290 case TOP_PATCHLIST_18:
291 case TOP_PATCHLIST_19:
292 case TOP_PATCHLIST_20:
293 case TOP_PATCHLIST_21:
294 case TOP_PATCHLIST_22:
295 case TOP_PATCHLIST_23:
296 case TOP_PATCHLIST_24:
297 case TOP_PATCHLIST_25:
298 case TOP_PATCHLIST_26:
299 case TOP_PATCHLIST_27:
300 case TOP_PATCHLIST_28:
301 case TOP_PATCHLIST_29:
302 case TOP_PATCHLIST_30:
303 case TOP_PATCHLIST_31:
304 case TOP_PATCHLIST_32:
305 return numPrims / (mode - TOP_PATCHLIST_BASE);
306
307 case TOP_POLYGON:
308 case TOP_POINT_LIST_BF:
309 case TOP_LINE_STRIP_CONT:
310 case TOP_LINE_STRIP_BF:
311 case TOP_LINE_STRIP_CONT_BF:
312 case TOP_TRIANGLE_FAN_NOSTIPPLE:
313 case TOP_TRI_STRIP_REVERSE:
314 case TOP_PATCHLIST_BASE:
315 case TOP_UNKNOWN:
316 SWR_ASSERT(false, "Unsupported topology: %d", mode);
317 return 0;
318 }
319
320 return 0;
321 }
322
323 //////////////////////////////////////////////////////////////////////////
324 /// @brief Computes the number of verts given the number of primitives.
325 /// @param mode - primitive topology for draw operation.
326 /// @param numPrims - number of primitives for draw.
327 uint32_t GetNumVerts(
328 PRIMITIVE_TOPOLOGY mode,
329 uint32_t numPrims)
330 {
331 switch (mode)
332 {
333 case TOP_POINT_LIST: return numPrims;
334 case TOP_TRIANGLE_LIST: return numPrims * 3;
335 case TOP_TRIANGLE_STRIP: return numPrims ? numPrims + 2 : 0;
336 case TOP_TRIANGLE_FAN: return numPrims ? numPrims + 2 : 0;
337 case TOP_TRIANGLE_DISC: return numPrims ? numPrims + 1 : 0;
338 case TOP_QUAD_LIST: return numPrims * 4;
339 case TOP_QUAD_STRIP: return numPrims ? numPrims * 2 + 2 : 0;
340 case TOP_LINE_STRIP: return numPrims ? numPrims + 1 : 0;
341 case TOP_LINE_LIST: return numPrims * 2;
342 case TOP_LINE_LOOP: return numPrims;
343 case TOP_RECT_LIST: return numPrims * 3;
344 case TOP_LINE_LIST_ADJ: return numPrims * 4;
345 case TOP_LISTSTRIP_ADJ: return numPrims ? numPrims + 3 : 0;
346 case TOP_TRI_LIST_ADJ: return numPrims * 6;
347 case TOP_TRI_STRIP_ADJ: return numPrims ? (numPrims + 2) * 2 : 0;
348
349 case TOP_PATCHLIST_1:
350 case TOP_PATCHLIST_2:
351 case TOP_PATCHLIST_3:
352 case TOP_PATCHLIST_4:
353 case TOP_PATCHLIST_5:
354 case TOP_PATCHLIST_6:
355 case TOP_PATCHLIST_7:
356 case TOP_PATCHLIST_8:
357 case TOP_PATCHLIST_9:
358 case TOP_PATCHLIST_10:
359 case TOP_PATCHLIST_11:
360 case TOP_PATCHLIST_12:
361 case TOP_PATCHLIST_13:
362 case TOP_PATCHLIST_14:
363 case TOP_PATCHLIST_15:
364 case TOP_PATCHLIST_16:
365 case TOP_PATCHLIST_17:
366 case TOP_PATCHLIST_18:
367 case TOP_PATCHLIST_19:
368 case TOP_PATCHLIST_20:
369 case TOP_PATCHLIST_21:
370 case TOP_PATCHLIST_22:
371 case TOP_PATCHLIST_23:
372 case TOP_PATCHLIST_24:
373 case TOP_PATCHLIST_25:
374 case TOP_PATCHLIST_26:
375 case TOP_PATCHLIST_27:
376 case TOP_PATCHLIST_28:
377 case TOP_PATCHLIST_29:
378 case TOP_PATCHLIST_30:
379 case TOP_PATCHLIST_31:
380 case TOP_PATCHLIST_32:
381 return numPrims * (mode - TOP_PATCHLIST_BASE);
382
383 case TOP_POLYGON:
384 case TOP_POINT_LIST_BF:
385 case TOP_LINE_STRIP_CONT:
386 case TOP_LINE_STRIP_BF:
387 case TOP_LINE_STRIP_CONT_BF:
388 case TOP_TRIANGLE_FAN_NOSTIPPLE:
389 case TOP_TRI_STRIP_REVERSE:
390 case TOP_PATCHLIST_BASE:
391 case TOP_UNKNOWN:
392 SWR_ASSERT(false, "Unsupported topology: %d", mode);
393 return 0;
394 }
395
396 return 0;
397 }
398
399 //////////////////////////////////////////////////////////////////////////
400 /// @brief Return number of verts per primitive.
401 /// @param topology - topology
402 /// @param includeAdjVerts - include adjacent verts in primitive vertices
403 INLINE uint32_t NumVertsPerPrim(PRIMITIVE_TOPOLOGY topology, bool includeAdjVerts)
404 {
405 uint32_t numVerts = 0;
406 switch (topology)
407 {
408 case TOP_POINT_LIST:
409 case TOP_POINT_LIST_BF:
410 numVerts = 1;
411 break;
412 case TOP_LINE_LIST:
413 case TOP_LINE_STRIP:
414 case TOP_LINE_LIST_ADJ:
415 case TOP_LINE_LOOP:
416 case TOP_LINE_STRIP_CONT:
417 case TOP_LINE_STRIP_BF:
418 case TOP_LISTSTRIP_ADJ:
419 numVerts = 2;
420 break;
421 case TOP_TRIANGLE_LIST:
422 case TOP_TRIANGLE_STRIP:
423 case TOP_TRIANGLE_FAN:
424 case TOP_TRI_LIST_ADJ:
425 case TOP_TRI_STRIP_ADJ:
426 case TOP_TRI_STRIP_REVERSE:
427 case TOP_RECT_LIST:
428 numVerts = 3;
429 break;
430 case TOP_QUAD_LIST:
431 case TOP_QUAD_STRIP:
432 numVerts = 4;
433 break;
434 case TOP_PATCHLIST_1:
435 case TOP_PATCHLIST_2:
436 case TOP_PATCHLIST_3:
437 case TOP_PATCHLIST_4:
438 case TOP_PATCHLIST_5:
439 case TOP_PATCHLIST_6:
440 case TOP_PATCHLIST_7:
441 case TOP_PATCHLIST_8:
442 case TOP_PATCHLIST_9:
443 case TOP_PATCHLIST_10:
444 case TOP_PATCHLIST_11:
445 case TOP_PATCHLIST_12:
446 case TOP_PATCHLIST_13:
447 case TOP_PATCHLIST_14:
448 case TOP_PATCHLIST_15:
449 case TOP_PATCHLIST_16:
450 case TOP_PATCHLIST_17:
451 case TOP_PATCHLIST_18:
452 case TOP_PATCHLIST_19:
453 case TOP_PATCHLIST_20:
454 case TOP_PATCHLIST_21:
455 case TOP_PATCHLIST_22:
456 case TOP_PATCHLIST_23:
457 case TOP_PATCHLIST_24:
458 case TOP_PATCHLIST_25:
459 case TOP_PATCHLIST_26:
460 case TOP_PATCHLIST_27:
461 case TOP_PATCHLIST_28:
462 case TOP_PATCHLIST_29:
463 case TOP_PATCHLIST_30:
464 case TOP_PATCHLIST_31:
465 case TOP_PATCHLIST_32:
466 numVerts = topology - TOP_PATCHLIST_BASE;
467 break;
468 default:
469 SWR_ASSERT(false, "Unsupported topology: %d", topology);
470 break;
471 }
472
473 if (includeAdjVerts)
474 {
475 switch (topology)
476 {
477 case TOP_LISTSTRIP_ADJ:
478 case TOP_LINE_LIST_ADJ: numVerts = 4; break;
479 case TOP_TRI_STRIP_ADJ:
480 case TOP_TRI_LIST_ADJ: numVerts = 6; break;
481 default: break;
482 }
483 }
484
485 return numVerts;
486 }
487
488 //////////////////////////////////////////////////////////////////////////
489 /// @brief Generate mask from remaining work.
490 /// @param numWorkItems - Number of items being worked on by a SIMD.
491 static INLINE simdscalari GenerateMask(uint32_t numItemsRemaining)
492 {
493 uint32_t numActive = (numItemsRemaining >= KNOB_SIMD_WIDTH) ? KNOB_SIMD_WIDTH : numItemsRemaining;
494 uint32_t mask = (numActive > 0) ? ((1 << numActive) - 1) : 0;
495 return _simd_castps_si(vMask(mask));
496 }
497
//////////////////////////////////////////////////////////////////////////
/// @brief StreamOut - Streams vertex data out to SO buffers.
///        Generally, we are only streaming out a SIMDs worth of triangles.
/// @param pDC - pointer to draw context.
/// @param pa - primitive assembly object supplying the prims to stream out.
/// @param workerId - thread's worker id. Even thread has a unique id.
/// @param pPrimData - scratch buffer where assembled attributes are staged
///        before the streamout jit function consumes them.
/// @param streamIndex - which streamout stream to write.
static void StreamOut(
    DRAW_CONTEXT* pDC,
    PA_STATE& pa,
    uint32_t workerId,
    uint32_t* pPrimData,
    uint32_t streamIndex)
{
    RDTSC_START(FEStreamout);

    SWR_CONTEXT* pContext = pDC->pContext;  // needed by UPDATE_STAT_FE below

    const API_STATE& state = GetApiState(pDC);
    const SWR_STREAMOUT_STATE &soState = state.soState;

    uint32_t soVertsPerPrim = NumVertsPerPrim(pa.binTopology, false);

    // The pPrimData buffer is sparse in that we allocate memory for all 32 attributes for each vertex.
    uint32_t primDataDwordVertexStride = (KNOB_NUM_ATTRIBUTES * sizeof(float) * 4) / sizeof(uint32_t);

    SWR_STREAMOUT_CONTEXT soContext = { 0 };

    // Setup buffer state pointers.
    for (uint32_t i = 0; i < 4; ++i)
    {
        soContext.pBuffer[i] = &state.soBuffer[i];
    }

    uint32_t numPrims = pa.NumPrims();
    for (uint32_t primIndex = 0; primIndex < numPrims; ++primIndex)
    {
        DWORD slot = 0;
        uint32_t soMask = soState.streamMasks[streamIndex];

        // Write all entries into primitive data buffer for SOS.
        // Each set bit in soMask names an attribute slot to stream out.
        while (_BitScanForward(&slot, soMask))
        {
            __m128 attrib[MAX_NUM_VERTS_PER_PRIM]; // prim attribs (always 4 wide)
            uint32_t paSlot = slot + VERTEX_ATTRIB_START_SLOT;
            pa.AssembleSingle(paSlot, primIndex, attrib);

            // Attribute offset is relative offset from start of vertex.
            // NOTE(review): an older scheme wrote attribs at (slot - 1); the
            // code below uses slot directly, so the pPrimData layout matches
            // the mask's slot numbering — confirm against the SOS jit.
            uint32_t primDataAttribOffset = slot * sizeof(float) * 4 / sizeof(uint32_t);

            // Store each vertex's attrib at appropriate locations in pPrimData buffer.
            for (uint32_t v = 0; v < soVertsPerPrim; ++v)
            {
                uint32_t* pPrimDataAttrib = pPrimData + primDataAttribOffset + (v * primDataDwordVertexStride);

                _mm_store_ps((float*)pPrimDataAttrib, attrib[v]);
            }
            soMask &= ~(1 << slot);
        }

        // Update pPrimData pointer
        soContext.pPrimData = pPrimData;

        // Call SOS
        SWR_ASSERT(state.pfnSoFunc[streamIndex] != nullptr, "Trying to execute uninitialized streamout jit function.");
        state.pfnSoFunc[streamIndex](soContext);
    }

    // Update SO write offset. The driver provides memory for the update.
    for (uint32_t i = 0; i < 4; ++i)
    {
        if (state.soBuffer[i].pWriteOffset)
        {
            *state.soBuffer[i].pWriteOffset = soContext.pBuffer[i]->streamOffset * sizeof(uint32_t);
        }

        if (state.soBuffer[i].soWriteEnable)
        {
            pDC->dynState.SoWriteOffset[i] = soContext.pBuffer[i]->streamOffset * sizeof(uint32_t);
            pDC->dynState.SoWriteOffsetDirty[i] = true;
        }
    }

    // update streamout pipeline stats for this stream
    UPDATE_STAT_FE(SoPrimStorageNeeded[streamIndex], soContext.numPrimStorageNeeded);
    UPDATE_STAT_FE(SoNumPrimsWritten[streamIndex], soContext.numPrimsWritten);

    RDTSC_STOP(FEStreamout, 1, 0);
}
588
589 //////////////////////////////////////////////////////////////////////////
590 /// @brief Computes number of invocations. The current index represents
591 /// the start of the SIMD. The max index represents how much work
592 /// items are remaining. If there is less then a SIMD's left of work
593 /// then return the remaining amount of work.
594 /// @param curIndex - The start index for the SIMD.
595 /// @param maxIndex - The last index for all work items.
596 static INLINE uint32_t GetNumInvocations(
597 uint32_t curIndex,
598 uint32_t maxIndex)
599 {
600 uint32_t remainder = (maxIndex - curIndex);
601 return (remainder >= KNOB_SIMD_WIDTH) ? KNOB_SIMD_WIDTH : remainder;
602 }
603
604 //////////////////////////////////////////////////////////////////////////
605 /// @brief Converts a streamId buffer to a cut buffer for the given stream id.
606 /// The geometry shader will loop over each active streamout buffer, assembling
607 /// primitives for the downstream stages. When multistream output is enabled,
608 /// the generated stream ID buffer from the GS needs to be converted to a cut
609 /// buffer for the primitive assembler.
610 /// @param stream - stream id to generate the cut buffer for
611 /// @param pStreamIdBase - pointer to the stream ID buffer
612 /// @param numEmittedVerts - Number of total verts emitted by the GS
613 /// @param pCutBuffer - output buffer to write cuts to
614 void ProcessStreamIdBuffer(uint32_t stream, uint8_t* pStreamIdBase, uint32_t numEmittedVerts, uint8_t *pCutBuffer)
615 {
616 SWR_ASSERT(stream < MAX_SO_STREAMS);
617
618 uint32_t numInputBytes = (numEmittedVerts * 2 + 7) / 8;
619 uint32_t numOutputBytes = std::max(numInputBytes / 2, 1U);
620
621 for (uint32_t b = 0; b < numOutputBytes; ++b)
622 {
623 uint8_t curInputByte = pStreamIdBase[2*b];
624 uint8_t outByte = 0;
625 for (uint32_t i = 0; i < 4; ++i)
626 {
627 if ((curInputByte & 0x3) != stream)
628 {
629 outByte |= (1 << i);
630 }
631 curInputByte >>= 2;
632 }
633
634 curInputByte = pStreamIdBase[2 * b + 1];
635 for (uint32_t i = 0; i < 4; ++i)
636 {
637 if ((curInputByte & 0x3) != stream)
638 {
639 outByte |= (1 << (i + 4));
640 }
641 curInputByte >>= 2;
642 }
643
644 *pCutBuffer++ = outByte;
645 }
646 }
647
648 THREAD SWR_GS_CONTEXT tlsGsContext;
649
//////////////////////////////////////////////////////////////////////////
/// @brief Implements GS stage: assembles input prim attributes, runs the
///        jitted GS per instance, then re-assembles the GS output through
///        a cut-aware PA, feeding streamout and/or the clipper/binner.
/// @param pDC - pointer to draw context.
/// @param workerId - thread's worker id. Even thread has a unique id.
/// @param pa - The primitive assembly object.
/// @param pGsOut - output stream for GS
/// @param pCutBuffer - cut (or streamID) buffer written by the GS
/// @param pStreamCutBuffer - scratch cut buffer for multi-stream GS output
/// @param pSoPrimData - scratch buffer used by streamout
/// @param primID - SIMD of primitive IDs for the input prims
template <
    typename HasStreamOutT,
    typename HasRastT>
static void GeometryShaderStage(
    DRAW_CONTEXT *pDC,
    uint32_t workerId,
    PA_STATE& pa,
    void* pGsOut,
    void* pCutBuffer,
    void* pStreamCutBuffer,
    uint32_t* pSoPrimData,
    simdscalari primID)
{
    RDTSC_START(FEGeometryShader);

    SWR_CONTEXT* pContext = pDC->pContext;  // needed by UPDATE_STAT_FE below

    const API_STATE& state = GetApiState(pDC);
    const SWR_GS_STATE* pState = &state.gsState;

    SWR_ASSERT(pGsOut != nullptr, "GS output buffer should be initialized");
    SWR_ASSERT(pCutBuffer != nullptr, "GS output cut buffer should be initialized");

    tlsGsContext.pStream = (uint8_t*)pGsOut;
    tlsGsContext.pCutOrStreamIdBuffer = (uint8_t*)pCutBuffer;
    tlsGsContext.PrimitiveID = primID;

    // adjacency verts are included so the GS sees the full input primitive
    uint32_t numVertsPerPrim = NumVertsPerPrim(pa.binTopology, true);
    simdvector attrib[MAX_ATTRIBUTES];

    // assemble all attributes for the input primitive
    for (uint32_t slot = 0; slot < pState->numInputAttribs; ++slot)
    {
        uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + slot;
        pa.Assemble(attribSlot, attrib);

        for (uint32_t i = 0; i < numVertsPerPrim; ++i)
        {
            tlsGsContext.vert[i].attrib[attribSlot] = attrib[i];
        }
    }

    // assemble position
    pa.Assemble(VERTEX_POSITION_SLOT, attrib);
    for (uint32_t i = 0; i < numVertsPerPrim; ++i)
    {
        tlsGsContext.vert[i].attrib[VERTEX_POSITION_SLOT] = attrib[i];
    }

    // Strides describing the layout of the GS output buffer: maxNumVerts
    // per lane, rounded up to whole SIMD batches, per input prim, per
    // GS instance. Must match the layout used in AllocateGsBuffers.
    const uint32_t vertexStride = sizeof(simdvertex);
    const uint32_t numSimdBatches = (state.gsState.maxNumVerts + KNOB_SIMD_WIDTH - 1) / KNOB_SIMD_WIDTH;
    const uint32_t inputPrimStride = numSimdBatches * vertexStride;
    const uint32_t instanceStride = inputPrimStride * KNOB_SIMD_WIDTH;
    uint32_t cutPrimStride;
    uint32_t cutInstanceStride;

    // single stream: 1 cut bit per vert; multi-stream: 2 streamID bits per
    // vert, rounded up to a 4-byte multiple
    if (pState->isSingleStream)
    {
        cutPrimStride = (state.gsState.maxNumVerts + 7) / 8;
        cutInstanceStride = cutPrimStride * KNOB_SIMD_WIDTH;
    }
    else
    {
        cutPrimStride = AlignUp(state.gsState.maxNumVerts * 2 / 8, 4);
        cutInstanceStride = cutPrimStride * KNOB_SIMD_WIDTH;
    }

    // record valid prims from the frontend to avoid over binning the newly generated
    // prims from the GS
    uint32_t numInputPrims = pa.NumPrims();

    // execute the GS once per declared instance, advancing the output and
    // cut buffer pointers each iteration
    for (uint32_t instance = 0; instance < pState->instanceCount; ++instance)
    {
        tlsGsContext.InstanceID = instance;
        tlsGsContext.mask = GenerateMask(numInputPrims);

        // execute the geometry shader
        state.pfnGsFunc(GetPrivateState(pDC), &tlsGsContext);

        tlsGsContext.pStream += instanceStride;
        tlsGsContext.pCutOrStreamIdBuffer += cutInstanceStride;
    }

    // set up new binner and state for the GS output topology
    PFN_PROCESS_PRIMS pfnClipFunc = nullptr;
    if (HasRastT::value)
    {
        switch (pState->outputTopology)
        {
        case TOP_TRIANGLE_STRIP:    pfnClipFunc = ClipTriangles; break;
        case TOP_LINE_STRIP:        pfnClipFunc = ClipLines; break;
        case TOP_POINT_LIST:        pfnClipFunc = ClipPoints; break;
        default: SWR_ASSERT(false, "Unexpected GS output topology: %d", pState->outputTopology);
        }
    }

    // foreach input prim:
    // - setup a new PA based on the emitted verts for that prim
    // - loop over the new verts, calling PA to assemble each prim
    uint32_t* pVertexCount = (uint32_t*)&tlsGsContext.vertexCount;
    uint32_t* pPrimitiveId = (uint32_t*)&primID;

    uint32_t totalPrimsGenerated = 0;
    for (uint32_t inputPrim = 0; inputPrim < numInputPrims; ++inputPrim)
    {
        uint8_t* pInstanceBase = (uint8_t*)pGsOut + inputPrim * inputPrimStride;
        uint8_t* pCutBufferBase = (uint8_t*)pCutBuffer + inputPrim * cutPrimStride;
        for (uint32_t instance = 0; instance < pState->instanceCount; ++instance)
        {
            uint32_t numEmittedVerts = pVertexCount[inputPrim];
            if (numEmittedVerts == 0)
            {
                // GS emitted nothing for this prim; skip it
                continue;
            }

            uint8_t* pBase = pInstanceBase + instance * instanceStride;
            uint8_t* pCutBase = pCutBufferBase + instance * cutInstanceStride;

            uint32_t numAttribs = state.feNumAttributes;

            for (uint32_t stream = 0; stream < MAX_SO_STREAMS; ++stream)
            {
                bool processCutVerts = false;

                // NOTE: intentionally shadows the pCutBuffer parameter; this
                // local selects which cut buffer the PA reads below
                uint8_t* pCutBuffer = pCutBase;

                // assign default stream ID, only relevant when GS is outputting a single stream
                uint32_t streamID = 0;
                if (pState->isSingleStream)
                {
                    processCutVerts = true;
                    streamID = pState->singleStreamID;
                    if (streamID != stream) continue;
                }
                else
                {
                    // early exit if this stream is not enabled for streamout
                    if (HasStreamOutT::value && !state.soState.streamEnable[stream])
                    {
                        continue;
                    }

                    // multi-stream output, need to translate StreamID buffer to a cut buffer
                    ProcessStreamIdBuffer(stream, pCutBase, numEmittedVerts, (uint8_t*)pStreamCutBuffer);
                    pCutBuffer = (uint8_t*)pStreamCutBuffer;
                    processCutVerts = false;
                }

                PA_STATE_CUT gsPa(pDC, pBase, numEmittedVerts, pCutBuffer, numEmittedVerts, numAttribs, pState->outputTopology, processCutVerts);

                while (gsPa.GetNextStreamOutput())
                {
                    do
                    {
                        bool assemble = gsPa.Assemble(VERTEX_POSITION_SLOT, attrib);

                        if (assemble)
                        {
                            totalPrimsGenerated += gsPa.NumPrims();

                            if (HasStreamOutT::value)
                            {
                                StreamOut(pDC, gsPa, workerId, pSoPrimData, stream);
                            }

                            // only the stream routed to the rasterizer is clipped/binned
                            if (HasRastT::value && state.soState.streamToRasterizer == stream)
                            {
                                simdscalari vPrimId;
                                // pull primitiveID from the GS output if available
                                if (state.gsState.emitsPrimitiveID)
                                {
                                    simdvector primIdAttrib[3];
                                    gsPa.Assemble(VERTEX_PRIMID_SLOT, primIdAttrib);
                                    vPrimId = _simd_castps_si(primIdAttrib[0].x);
                                }
                                else
                                {
                                    vPrimId = _simd_set1_epi32(pPrimitiveId[inputPrim]);
                                }

                                // use viewport array index if GS declares it as an output attribute. Otherwise use index 0.
                                simdscalari vViewPortIdx;
                                if (state.gsState.emitsViewportArrayIndex)
                                {
                                    simdvector vpiAttrib[3];
                                    gsPa.Assemble(VERTEX_VIEWPORT_ARRAY_INDEX_SLOT, vpiAttrib);

                                    // OOB indices => forced to zero.
                                    simdscalari vNumViewports = _simd_set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
                                    simdscalar vClearMask = _simd_cmplt_ps(vpiAttrib[0].x, _simd_castsi_ps(vNumViewports));
                                    vpiAttrib[0].x = _simd_and_ps(vClearMask, vpiAttrib[0].x);

                                    vViewPortIdx = _simd_castps_si(vpiAttrib[0].x);
                                }
                                else
                                {
                                    vViewPortIdx = _simd_set1_epi32(0);
                                }

                                pfnClipFunc(pDC, gsPa, workerId, attrib, GenMask(gsPa.NumPrims()), vPrimId, vViewPortIdx);
                            }
                        }
                    } while (gsPa.NextPrim());
                }
            }
        }
    }

    // update GS pipeline stats
    UPDATE_STAT_FE(GsInvocations, numInputPrims * pState->instanceCount);
    UPDATE_STAT_FE(GsPrimitives, totalPrimsGenerated);

    RDTSC_STOP(FEGeometryShader, 1, 0);
}
870
//////////////////////////////////////////////////////////////////////////
/// @brief Allocate GS buffers
/// @param pDC - pointer to draw context.
/// @param state - API state
/// @param ppGsOut - pointer to GS output buffer allocation
/// @param ppCutBuffer - pointer to GS output cut buffer allocation
/// @param ppStreamCutBuffer - pointer to temporary per-stream cut buffer;
///        only allocated for multi-stream GS output, else set to nullptr.
static INLINE void AllocateGsBuffers(DRAW_CONTEXT* pDC, const API_STATE& state, void** ppGsOut, void** ppCutBuffer,
    void **ppStreamCutBuffer)
{
    auto pArena = pDC->pArena;
    SWR_ASSERT(pArena != nullptr);
    SWR_ASSERT(state.gsState.gsEnable);
    // allocate arena space to hold GS output verts
    // @todo pack attribs
    // @todo support multiple streams
    const uint32_t vertexStride = sizeof(simdvertex);
    // maxNumVerts per lane, rounded up to whole SIMD batches, per instance
    const uint32_t numSimdBatches = (state.gsState.maxNumVerts + KNOB_SIMD_WIDTH - 1) / KNOB_SIMD_WIDTH;
    uint32_t size = state.gsState.instanceCount * numSimdBatches * vertexStride * KNOB_SIMD_WIDTH;
    *ppGsOut = pArena->AllocAligned(size, KNOB_SIMD_WIDTH * sizeof(float));

    // cut buffer: 1 bit per possible output vert; streamID buffer: 2 bits
    // per vert, rounded up to a 4-byte multiple
    const uint32_t cutPrimStride = (state.gsState.maxNumVerts + 7) / 8;
    const uint32_t streamIdPrimStride = AlignUp(state.gsState.maxNumVerts * 2 / 8, 4);
    const uint32_t cutBufferSize = cutPrimStride * state.gsState.instanceCount * KNOB_SIMD_WIDTH;
    const uint32_t streamIdSize = streamIdPrimStride * state.gsState.instanceCount * KNOB_SIMD_WIDTH;

    // allocate arena space to hold cut or streamid buffer, which is essentially a bitfield sized to the
    // maximum vertex output as defined by the GS state, per SIMD lane, per GS instance

    // allocate space for temporary per-stream cut buffer if multi-stream is enabled
    if (state.gsState.isSingleStream)
    {
        *ppCutBuffer = pArena->AllocAligned(cutBufferSize, KNOB_SIMD_WIDTH * sizeof(float));
        *ppStreamCutBuffer = nullptr;
    }
    else
    {
        *ppCutBuffer = pArena->AllocAligned(streamIdSize, KNOB_SIMD_WIDTH * sizeof(float));
        *ppStreamCutBuffer = pArena->AllocAligned(cutBufferSize, KNOB_SIMD_WIDTH * sizeof(float));
    }

}
912
//////////////////////////////////////////////////////////////////////////
/// @brief Contains all data generated by the HS and passed to the
/// tessellator and DS.
struct TessellationThreadLocalData
{
    SWR_HS_CONTEXT hsContext;              // hull shader input/output context, reused per patch batch
    ScalarPatch patchData[KNOB_SIMD_WIDTH]; // HS control-point output, one patch per SIMD lane

    // scratch buffer handed to TSInitCtx; grown on demand (see TessellationStages)
    void* pTxCtx;
    size_t tsCtxSize;                      // current size of pTxCtx in bytes

    // DS output storage, grown on demand to hold the largest patch seen so far
    simdscalar* pDSOutput;
    size_t numDSOutputVectors;             // capacity of pDSOutput in simdvector units
};

// Per-thread tessellation scratch; lazily allocated by AllocateTessellationData.
THREAD TessellationThreadLocalData* gt_pTessellationThreadData = nullptr;
928
929 //////////////////////////////////////////////////////////////////////////
930 /// @brief Allocate tessellation data for this worker thread.
931 INLINE
932 static void AllocateTessellationData(SWR_CONTEXT* pContext)
933 {
934 /// @TODO - Don't use thread local storage. Use Worker local storage instead.
935 if (gt_pTessellationThreadData == nullptr)
936 {
937 gt_pTessellationThreadData = (TessellationThreadLocalData*)
938 AlignedMalloc(sizeof(TessellationThreadLocalData), 64);
939 memset(gt_pTessellationThreadData, 0, sizeof(*gt_pTessellationThreadData));
940 }
941 }
942
//////////////////////////////////////////////////////////////////////////
/// @brief Implements Tessellation Stages.
/// Runs HS over the assembled patches, tessellates each patch, runs DS
/// over the generated domain points, then feeds the resulting prims to
/// GS / streamout / clipping as enabled by the template flags.
/// @param pDC - pointer to draw context.
/// @param workerId - thread's worker id. Even thread has a unique id.
/// @param pa - The primitive assembly object.
/// @param pGsOut - output stream for GS
/// @param pCutBuffer - GS cut buffer allocation (only used when GS enabled)
/// @param pCutStreamBuffer - GS per-stream cut buffer allocation
/// @param pSoPrimData - scratch buffer for streamout primitive data
/// @param primID - SIMD of primitive IDs for the input patches
template <
    typename HasGeometryShaderT,
    typename HasStreamOutT,
    typename HasRastT>
static void TessellationStages(
    DRAW_CONTEXT *pDC,
    uint32_t workerId,
    PA_STATE& pa,
    void* pGsOut,
    void* pCutBuffer,
    void* pCutStreamBuffer,
    uint32_t* pSoPrimData,
    simdscalari primID)
{
    const API_STATE& state = GetApiState(pDC);
    const SWR_TS_STATE& tsState = state.tsState;
    SWR_CONTEXT *pContext = pDC->pContext; // Needed for UPDATE_STATS macro

    SWR_ASSERT(gt_pTessellationThreadData);

    // Initialize the tessellator from the thread-local scratch buffer.
    // NOTE(review): the first call appears to return nullptr when the scratch
    // buffer is too small, with tsCtxSize updated to the required size; the
    // retry below allocates and tries again — confirm against TSInitCtx docs.
    HANDLE tsCtx = TSInitCtx(
        tsState.domain,
        tsState.partitioning,
        tsState.tsOutputTopology,
        gt_pTessellationThreadData->pTxCtx,
        gt_pTessellationThreadData->tsCtxSize);
    if (tsCtx == nullptr)
    {
        gt_pTessellationThreadData->pTxCtx = AlignedMalloc(gt_pTessellationThreadData->tsCtxSize, 64);
        tsCtx = TSInitCtx(
            tsState.domain,
            tsState.partitioning,
            tsState.tsOutputTopology,
            gt_pTessellationThreadData->pTxCtx,
            gt_pTessellationThreadData->tsCtxSize);
    }
    SWR_ASSERT(tsCtx);

    // Pick the clipper entry point for the post-DS topology; only needed
    // when rasterization is enabled.
    PFN_PROCESS_PRIMS pfnClipFunc = nullptr;
    if (HasRastT::value)
    {
        switch (tsState.postDSTopology)
        {
        case TOP_TRIANGLE_LIST: pfnClipFunc = ClipTriangles; break;
        case TOP_LINE_LIST: pfnClipFunc = ClipLines; break;
        case TOP_POINT_LIST: pfnClipFunc = ClipPoints; break;
        default: SWR_ASSERT(false, "Unexpected DS output topology: %d", tsState.postDSTopology);
        }
    }

    SWR_HS_CONTEXT& hsContext = gt_pTessellationThreadData->hsContext;
    hsContext.pCPout = gt_pTessellationThreadData->patchData;
    hsContext.PrimitiveID = primID;

    uint32_t numVertsPerPrim = NumVertsPerPrim(pa.binTopology, false);
    // Max storage for one attribute for an entire simdprimitive
    simdvector simdattrib[MAX_NUM_VERTS_PER_PRIM];

    // assemble all attributes for the input primitives
    for (uint32_t slot = 0; slot < tsState.numHsInputAttribs; ++slot)
    {
        uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + slot;
        pa.Assemble(attribSlot, simdattrib);

        for (uint32_t i = 0; i < numVertsPerPrim; ++i)
        {
            hsContext.vert[i].attrib[attribSlot] = simdattrib[i];
        }
    }

#if defined(_DEBUG)
    // poison the HS output so stale data is obvious in debug builds
    memset(hsContext.pCPout, 0x90, sizeof(ScalarPatch) * KNOB_SIMD_WIDTH);
#endif

    uint32_t numPrims = pa.NumPrims();
    hsContext.mask = GenerateMask(numPrims);

    // Run the HS
    RDTSC_START(FEHullShader);
    state.pfnHsFunc(GetPrivateState(pDC), &hsContext);
    RDTSC_STOP(FEHullShader, 0, 0);

    UPDATE_STAT_FE(HsInvocations, numPrims);

    const uint32_t* pPrimId = (const uint32_t*)&primID;

    // Tessellate and domain-shade each patch (SIMD lane) individually.
    for (uint32_t p = 0; p < numPrims; ++p)
    {
        // Run Tessellator
        SWR_TS_TESSELLATED_DATA tsData = { 0 };
        RDTSC_START(FETessellation);
        TSTessellate(tsCtx, hsContext.pCPout[p].tessFactors, tsData);
        RDTSC_STOP(FETessellation, 0, 0);

        // patch fully culled by the tess factors — nothing to shade
        if (tsData.NumPrimitives == 0)
        {
            continue;
        }
        SWR_ASSERT(tsData.NumDomainPoints);

        // Allocate DS Output memory, growing the thread-local buffer only
        // when this patch needs more than any previous one.
        uint32_t requiredDSVectorInvocations = AlignUp(tsData.NumDomainPoints, KNOB_SIMD_WIDTH) / KNOB_SIMD_WIDTH;
        size_t requiredDSOutputVectors = requiredDSVectorInvocations * tsState.numDsOutputAttribs;
        size_t requiredAllocSize = sizeof(simdvector) * requiredDSOutputVectors;
        if (requiredDSOutputVectors > gt_pTessellationThreadData->numDSOutputVectors)
        {
            AlignedFree(gt_pTessellationThreadData->pDSOutput);
            gt_pTessellationThreadData->pDSOutput = (simdscalar*)AlignedMalloc(requiredAllocSize, 64);
            gt_pTessellationThreadData->numDSOutputVectors = requiredDSOutputVectors;
        }
        SWR_ASSERT(gt_pTessellationThreadData->pDSOutput);
        SWR_ASSERT(gt_pTessellationThreadData->numDSOutputVectors >= requiredDSOutputVectors);

#if defined(_DEBUG)
        // poison DS output in debug builds
        memset(gt_pTessellationThreadData->pDSOutput, 0x90, requiredAllocSize);
#endif

        // Run Domain Shader over all domain points, one SIMD batch at a time.
        SWR_DS_CONTEXT dsContext;
        dsContext.PrimitiveID = pPrimId[p];
        dsContext.pCpIn = &hsContext.pCPout[p];
        dsContext.pDomainU = (simdscalar*)tsData.pDomainPointsU;
        dsContext.pDomainV = (simdscalar*)tsData.pDomainPointsV;
        dsContext.pOutputData = gt_pTessellationThreadData->pDSOutput;
        dsContext.vectorStride = requiredDSVectorInvocations;

        uint32_t dsInvocations = 0;

        for (dsContext.vectorOffset = 0; dsContext.vectorOffset < requiredDSVectorInvocations; ++dsContext.vectorOffset)
        {
            // mask off lanes past the last domain point in the final batch
            dsContext.mask = GenerateMask(tsData.NumDomainPoints - dsInvocations);

            RDTSC_START(FEDomainShader);
            state.pfnDsFunc(GetPrivateState(pDC), &dsContext);
            RDTSC_STOP(FEDomainShader, 0, 0);

            dsInvocations += KNOB_SIMD_WIDTH;
        }
        UPDATE_STAT_FE(DsInvocations, tsData.NumDomainPoints);

        // Assemble the tessellator-generated prims from the DS output and
        // push them down the remainder of the front end.
        PA_TESS tessPa(
            pDC,
            dsContext.pOutputData,
            dsContext.vectorStride,
            tsState.numDsOutputAttribs,
            tsData.ppIndices,
            tsData.NumPrimitives,
            tsState.postDSTopology);

        while (tessPa.HasWork())
        {
            if (HasGeometryShaderT::value)
            {
                GeometryShaderStage<HasStreamOutT, HasRastT>(
                    pDC, workerId, tessPa, pGsOut, pCutBuffer, pCutStreamBuffer, pSoPrimData,
                    _simd_set1_epi32(dsContext.PrimitiveID));
            }
            else
            {
                if (HasStreamOutT::value)
                {
                    StreamOut(pDC, tessPa, workerId, pSoPrimData, 0);
                }

                if (HasRastT::value)
                {
                    simdvector prim[3]; // Only deal with triangles, lines, or points
                    RDTSC_START(FEPAAssemble);
#if SWR_ENABLE_ASSERTS
                    bool assemble =
#endif
                        tessPa.Assemble(VERTEX_POSITION_SLOT, prim);
                    RDTSC_STOP(FEPAAssemble, 1, 0);
                    SWR_ASSERT(assemble);

                    SWR_ASSERT(pfnClipFunc);
                    pfnClipFunc(pDC, tessPa, workerId, prim,
                        GenMask(tessPa.NumPrims()), _simd_set1_epi32(dsContext.PrimitiveID), _simd_set1_epi32(0));
                }
            }

            tessPa.NextPrim();

        } // while (tessPa.HasWork())
    } // for (uint32_t p = 0; p < numPrims; ++p)

    TSDestroyCtx(tsCtx);
}
1137
//////////////////////////////////////////////////////////////////////////
/// @brief FE handler for SwrDraw.
/// Fetches vertices, runs the VS, and drives primitive assembly through
/// tessellation / GS / streamout / binning per the template flags.
/// @tparam IsIndexedT - Is indexed drawing enabled
/// @tparam IsCutIndexEnabledT - Is the primitive-restart cut index enabled
/// @tparam HasTessellationT - Is tessellation enabled
/// @tparam HasGeometryShaderT::value - Is the geometry shader stage enabled
/// @tparam HasStreamOutT - Is stream-out enabled
/// @tparam HasRastT - Is rasterization enabled
/// @param pContext - pointer to SWR context.
/// @param pDC - pointer to draw context.
/// @param workerId - thread's worker id.
/// @param pUserData - Pointer to DRAW_WORK
template <
    typename IsIndexedT,
    typename IsCutIndexEnabledT,
    typename HasTessellationT,
    typename HasGeometryShaderT,
    typename HasStreamOutT,
    typename HasRastT>
void ProcessDraw(
    SWR_CONTEXT *pContext,
    DRAW_CONTEXT *pDC,
    uint32_t workerId,
    void *pUserData)
{

#if KNOB_ENABLE_TOSS_POINTS
    if (KNOB_TOSS_QUEUE_FE)
    {
        return;
    }
#endif

    RDTSC_START(FEProcessDraw);

    DRAW_WORK& work = *(DRAW_WORK*)pUserData;
    const API_STATE& state = GetApiState(pDC);
    // lane offsets 0..7 used to synthesize sequential vertex IDs for
    // non-indexed draws
    __m256i vScale = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
    SWR_VS_CONTEXT vsContext;
    simdvertex vin;

    int indexSize = 0;
    uint32_t endVertex = work.numVerts;

    // For indexed draws, compute the address one past the last index we will
    // consume so partial-SIMD fetches at the end can be masked off.
    const int32_t* pLastRequestedIndex = nullptr;
    if (IsIndexedT::value)
    {
        switch (work.type)
        {
        case R32_UINT:
            indexSize = sizeof(uint32_t);
            pLastRequestedIndex = &(work.pIB[endVertex]);
            break;
        case R16_UINT:
            indexSize = sizeof(uint16_t);
            // nasty address offset to last index
            pLastRequestedIndex = (int32_t*)(&(((uint16_t*)work.pIB)[endVertex]));
            break;
        case R8_UINT:
            indexSize = sizeof(uint8_t);
            // nasty address offset to last index
            pLastRequestedIndex = (int32_t*)(&(((uint8_t*)work.pIB)[endVertex]));
            break;
        default:
            SWR_ASSERT(0);
        }
    }
    else
    {
        // No cuts, prune partial primitives.
        endVertex = GetNumVerts(state.topology, GetNumPrims(state.topology, work.numVerts));
    }

    SWR_FETCH_CONTEXT fetchInfo = { 0 };
    fetchInfo.pStreams = &state.vertexBuffers[0];
    fetchInfo.StartInstance = work.startInstance;
    fetchInfo.StartVertex = 0;

    vsContext.pVin = &vin;

    if (IsIndexedT::value)
    {
        fetchInfo.BaseVertex = work.baseVertex;

        // if the entire index buffer isn't being consumed, set the last index
        // so that fetches < a SIMD wide will be masked off
        fetchInfo.pLastIndex = (const int32_t*)(((uint8_t*)state.indexBuffer.pIndices) + state.indexBuffer.size);
        if (pLastRequestedIndex < fetchInfo.pLastIndex)
        {
            fetchInfo.pLastIndex = pLastRequestedIndex;
        }
    }
    else
    {
        fetchInfo.StartVertex = work.startVertex;
    }

#ifdef KNOB_ENABLE_RDTSC
    uint32_t numPrims = GetNumPrims(state.topology, work.numVerts);
#endif

    // Per-draw scratch allocations for the optional GS stage.
    void* pGsOut = nullptr;
    void* pCutBuffer = nullptr;
    void* pStreamCutBuffer = nullptr;
    if (HasGeometryShaderT::value)
    {
        AllocateGsBuffers(pDC, state, &pGsOut, &pCutBuffer, &pStreamCutBuffer);
    }

    if (HasTessellationT::value)
    {
        SWR_ASSERT(state.tsState.tsEnable == true);
        SWR_ASSERT(state.pfnHsFunc != nullptr);
        SWR_ASSERT(state.pfnDsFunc != nullptr);

        AllocateTessellationData(pContext);
    }
    else
    {
        SWR_ASSERT(state.tsState.tsEnable == false);
        SWR_ASSERT(state.pfnHsFunc == nullptr);
        SWR_ASSERT(state.pfnDsFunc == nullptr);
    }

    // allocate space for streamout input prim data
    uint32_t* pSoPrimData = nullptr;
    if (HasStreamOutT::value)
    {
        pSoPrimData = (uint32_t*)pDC->pArena->AllocAligned(4096, 16);
    }

    // choose primitive assembler
    PA_FACTORY<IsIndexedT, IsCutIndexEnabledT> paFactory(pDC, state.topology, work.numVerts);
    PA_STATE& pa = paFactory.GetPA();

    /// @todo: temporarily move instance loop in the FE to ensure SO ordering
    for (uint32_t instanceNum = 0; instanceNum < work.numInstances; instanceNum++)
    {
        simdscalari vIndex;
        uint32_t i = 0; // first vertex of the current SIMD batch

        if (IsIndexedT::value)
        {
            fetchInfo.pIndices = work.pIB;
        }
        else
        {
            // synthesize sequential indices starting at startVertexID
            vIndex = _simd_add_epi32(_simd_set1_epi32(work.startVertexID), vScale);
            fetchInfo.pIndices = (const int32_t*)&vIndex;
        }

        fetchInfo.CurInstance = instanceNum;
        vsContext.InstanceID = instanceNum;

        while (pa.HasWork())
        {
            // PaGetNextVsOutput currently has the side effect of updating some PA state machine state.
            // So we need to keep this outside of (i < endVertex) check.
            simdmask* pvCutIndices = nullptr;
            if (IsIndexedT::value)
            {
                pvCutIndices = &pa.GetNextVsIndices();
            }

            simdvertex& vout = pa.GetNextVsOutput();
            vsContext.pVout = &vout;

            // Fetch + VS only while there are vertices left; the PA may
            // still have assembly work after the last fetch.
            if (i < endVertex)
            {

                // 1. Execute FS/VS for a single SIMD.
                RDTSC_START(FEFetchShader);
                state.pfnFetchFunc(fetchInfo, vin);
                RDTSC_STOP(FEFetchShader, 0, 0);

                // forward fetch generated vertex IDs to the vertex shader
                vsContext.VertexID = fetchInfo.VertexID;

                // Setup active mask for vertex shader.
                vsContext.mask = GenerateMask(endVertex - i);

                // forward cut mask to the PA
                if (IsIndexedT::value)
                {
                    *pvCutIndices = _simd_movemask_ps(_simd_castsi_ps(fetchInfo.CutMask));
                }

                UPDATE_STAT_FE(IaVertices, GetNumInvocations(i, endVertex));

#if KNOB_ENABLE_TOSS_POINTS
                if (!KNOB_TOSS_FETCH)
#endif
                {
                    RDTSC_START(FEVertexShader);
                    state.pfnVertexFunc(GetPrivateState(pDC), &vsContext);
                    RDTSC_STOP(FEVertexShader, 0, 0);

                    UPDATE_STAT_FE(VsInvocations, GetNumInvocations(i, endVertex));
                }
            }

            // 2. Assemble primitives given the last two SIMD.
            do
            {
                simdvector prim[MAX_NUM_VERTS_PER_PRIM];
                // PaAssemble returns false if there is not enough verts to assemble.
                RDTSC_START(FEPAAssemble);
                bool assemble = pa.Assemble(VERTEX_POSITION_SLOT, prim);
                RDTSC_STOP(FEPAAssemble, 1, 0);

#if KNOB_ENABLE_TOSS_POINTS
                if (!KNOB_TOSS_FETCH)
#endif
                {
#if KNOB_ENABLE_TOSS_POINTS
                    if (!KNOB_TOSS_VS)
#endif
                    {
                        if (assemble)
                        {
                            UPDATE_STAT_FE(IaPrimitives, pa.NumPrims());

                            // Dispatch the assembled prims to the first
                            // enabled downstream stage.
                            if (HasTessellationT::value)
                            {
                                TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>(
                                    pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, pa.GetPrimID(work.startPrimID));
                            }
                            else if (HasGeometryShaderT::value)
                            {
                                GeometryShaderStage<HasStreamOutT, HasRastT>(
                                    pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, pa.GetPrimID(work.startPrimID));
                            }
                            else
                            {
                                // If streamout is enabled then stream vertices out to memory.
                                if (HasStreamOutT::value)
                                {
                                    StreamOut(pDC, pa, workerId, pSoPrimData, 0);
                                }

                                if (HasRastT::value)
                                {
                                    SWR_ASSERT(pDC->pState->pfnProcessPrims);
                                    pDC->pState->pfnProcessPrims(pDC, pa, workerId, prim,
                                        GenMask(pa.NumPrims()), pa.GetPrimID(work.startPrimID), _simd_set1_epi32(0));
                                }
                            }
                        }
                    }
                }
            } while (pa.NextPrim());

            // advance to the next SIMD batch of vertices/indices
            i += KNOB_SIMD_WIDTH;
            if (IsIndexedT::value)
            {
                fetchInfo.pIndices = (int*)((uint8_t*)fetchInfo.pIndices + KNOB_SIMD_WIDTH * indexSize);
            }
            else
            {
                vIndex = _simd_add_epi32(vIndex, _simd_set1_epi32(KNOB_SIMD_WIDTH));
            }
        }
        pa.Reset();
    }

    // NOTE(review): numPrims only exists under KNOB_ENABLE_RDTSC; presumably
    // RDTSC_STOP compiles to nothing otherwise — confirm macro definition.
    RDTSC_STOP(FEProcessDraw, numPrims * work.numInstances, pDC->drawId);
}
1404
// Adapter used with TemplateArgUnroller to select a ProcessDraw
// instantiation from runtime booleans (see GetProcessDrawFunc).
struct FEDrawChooser
{
    // function-pointer type produced by the unroller
    typedef PFN_FE_WORK_FUNC FuncType;

    // Returns the ProcessDraw instantiation for the given template args.
    template <typename... ArgsB>
    static FuncType GetFunc()
    {
        return ProcessDraw<ArgsB...>;
    }
};
1415
1416
// Selector for correct templated Draw front-end function
// Maps the six runtime feature flags onto the matching ProcessDraw
// template instantiation (one instantiation per flag combination).
PFN_FE_WORK_FUNC GetProcessDrawFunc(
    bool IsIndexed,
    bool IsCutIndexEnabled,
    bool HasTessellation,
    bool HasGeometryShader,
    bool HasStreamOut,
    bool HasRasterization)
{
    return TemplateArgUnroller<FEDrawChooser>::GetFunc(IsIndexed, IsCutIndexEnabled, HasTessellation, HasGeometryShader, HasStreamOut, HasRasterization);
}
1428
1429 //////////////////////////////////////////////////////////////////////////
1430 /// @brief Processes attributes for the backend based on linkage mask and
1431 /// linkage map. Essentially just doing an SOA->AOS conversion and pack.
1432 /// @param pDC - Draw context
1433 /// @param pa - Primitive Assembly state
1434 /// @param linkageMask - Specifies which VS outputs are routed to PS.
1435 /// @param pLinkageMap - maps VS attribute slot to PS slot
1436 /// @param triIndex - Triangle to process attributes for
1437 /// @param pBuffer - Output result
1438 template<typename NumVertsT, typename IsSwizzledT, typename HasConstantInterpT, typename IsDegenerate>
1439 INLINE void ProcessAttributes(
1440 DRAW_CONTEXT *pDC,
1441 PA_STATE&pa,
1442 uint32_t triIndex,
1443 uint32_t primId,
1444 float *pBuffer)
1445 {
1446 static_assert(NumVertsT::value > 0 && NumVertsT::value <= 3, "Invalid value for NumVertsT");
1447 const SWR_BACKEND_STATE& backendState = pDC->pState->state.backendState;
1448 // Conservative Rasterization requires degenerate tris to have constant attribute interpolation
1449 LONG constantInterpMask = IsDegenerate::value ? 0xFFFFFFFF : backendState.constantInterpolationMask;
1450 const uint32_t provokingVertex = pDC->pState->state.frontendState.topologyProvokingVertex;
1451 const PRIMITIVE_TOPOLOGY topo = pDC->pState->state.topology;
1452
1453 static const float constTable[3][4] = {
1454 {0.0f, 0.0f, 0.0f, 0.0f},
1455 {0.0f, 0.0f, 0.0f, 1.0f},
1456 {1.0f, 1.0f, 1.0f, 1.0f}
1457 };
1458
1459 for (uint32_t i = 0; i < backendState.numAttributes; ++i)
1460 {
1461 uint32_t inputSlot;
1462 if (IsSwizzledT::value)
1463 {
1464 SWR_ATTRIB_SWIZZLE attribSwizzle = backendState.swizzleMap[i];
1465 inputSlot = VERTEX_ATTRIB_START_SLOT + attribSwizzle.sourceAttrib;
1466
1467 }
1468 else
1469 {
1470 inputSlot = VERTEX_ATTRIB_START_SLOT + i;
1471 }
1472
1473 __m128 attrib[3]; // triangle attribs (always 4 wide)
1474 float* pAttribStart = pBuffer;
1475
1476 if (HasConstantInterpT::value || IsDegenerate::value)
1477 {
1478 if (_bittest(&constantInterpMask, i))
1479 {
1480 uint32_t vid;
1481 uint32_t adjustedTriIndex;
1482 static const uint32_t tristripProvokingVertex[] = { 0, 2, 1 };
1483 static const int32_t quadProvokingTri[2][4] = { {0, 0, 0, 1}, {0, -1, 0, 0} };
1484 static const uint32_t quadProvokingVertex[2][4] = { {0, 1, 2, 2}, {0, 1, 1, 2} };
1485 static const int32_t qstripProvokingTri[2][4] = { {0, 0, 0, 1}, {-1, 0, 0, 0} };
1486 static const uint32_t qstripProvokingVertex[2][4] = { {0, 1, 2, 1}, {0, 0, 2, 1} };
1487
1488 switch (topo) {
1489 case TOP_QUAD_LIST:
1490 adjustedTriIndex = triIndex + quadProvokingTri[triIndex & 1][provokingVertex];
1491 vid = quadProvokingVertex[triIndex & 1][provokingVertex];
1492 break;
1493 case TOP_QUAD_STRIP:
1494 adjustedTriIndex = triIndex + qstripProvokingTri[triIndex & 1][provokingVertex];
1495 vid = qstripProvokingVertex[triIndex & 1][provokingVertex];
1496 break;
1497 case TOP_TRIANGLE_STRIP:
1498 adjustedTriIndex = triIndex;
1499 vid = (triIndex & 1)
1500 ? tristripProvokingVertex[provokingVertex]
1501 : provokingVertex;
1502 break;
1503 default:
1504 adjustedTriIndex = triIndex;
1505 vid = provokingVertex;
1506 break;
1507 }
1508
1509 pa.AssembleSingle(inputSlot, adjustedTriIndex, attrib);
1510
1511 for (uint32_t i = 0; i < NumVertsT::value; ++i)
1512 {
1513 _mm_store_ps(pBuffer, attrib[vid]);
1514 pBuffer += 4;
1515 }
1516 }
1517 else
1518 {
1519 pa.AssembleSingle(inputSlot, triIndex, attrib);
1520
1521 for (uint32_t i = 0; i < NumVertsT::value; ++i)
1522 {
1523 _mm_store_ps(pBuffer, attrib[i]);
1524 pBuffer += 4;
1525 }
1526 }
1527 }
1528 else
1529 {
1530 pa.AssembleSingle(inputSlot, triIndex, attrib);
1531
1532 for (uint32_t i = 0; i < NumVertsT::value; ++i)
1533 {
1534 _mm_store_ps(pBuffer, attrib[i]);
1535 pBuffer += 4;
1536 }
1537 }
1538
1539 // pad out the attrib buffer to 3 verts to ensure the triangle
1540 // interpolation code in the pixel shader works correctly for the
1541 // 3 topologies - point, line, tri. This effectively zeros out the
1542 // effect of the missing vertices in the triangle interpolation.
1543 for (uint32_t v = NumVertsT::value; v < 3; ++v)
1544 {
1545 _mm_store_ps(pBuffer, attrib[NumVertsT::value - 1]);
1546 pBuffer += 4;
1547 }
1548
1549 // check for constant source overrides
1550 if (IsSwizzledT::value)
1551 {
1552 uint32_t mask = backendState.swizzleMap[i].componentOverrideMask;
1553 if (mask)
1554 {
1555 DWORD comp;
1556 while (_BitScanForward(&comp, mask))
1557 {
1558 mask &= ~(1 << comp);
1559
1560 float constantValue = 0.0f;
1561 switch ((SWR_CONSTANT_SOURCE)backendState.swizzleMap[i].constantSource)
1562 {
1563 case SWR_CONSTANT_SOURCE_CONST_0000:
1564 case SWR_CONSTANT_SOURCE_CONST_0001_FLOAT:
1565 case SWR_CONSTANT_SOURCE_CONST_1111_FLOAT:
1566 constantValue = constTable[backendState.swizzleMap[i].constantSource][comp];
1567 break;
1568 case SWR_CONSTANT_SOURCE_PRIM_ID:
1569 constantValue = *(float*)&primId;
1570 break;
1571 }
1572
1573 // apply constant value to all 3 vertices
1574 for (uint32_t v = 0; v < 3; ++v)
1575 {
1576 pAttribStart[comp + v * 4] = constantValue;
1577 }
1578 }
1579 }
1580 }
1581 }
1582 }
1583
1584
// Signature shared by all ProcessAttributes instantiations:
// (pDC, pa, triIndex, primId, pBuffer)
typedef void(*PFN_PROCESS_ATTRIBUTES)(DRAW_CONTEXT*, PA_STATE&, uint32_t, uint32_t, float*);

// Adapter used with TemplateArgUnroller to select a ProcessAttributes
// instantiation from runtime arguments (see GetProcessAttributesFunc).
struct ProcessAttributesChooser
{
    typedef PFN_PROCESS_ATTRIBUTES FuncType;

    // Returns the ProcessAttributes instantiation for the given template args.
    template <typename... ArgsB>
    static FuncType GetFunc()
    {
        return ProcessAttributes<ArgsB...>;
    }
};
1597
// Selects the ProcessAttributes instantiation matching the runtime arguments.
// NumVerts must be in [1, 3] (enforced by IntArg); IsDegenerate defaults to
// false for the normal (non-conservative) rasterization path.
PFN_PROCESS_ATTRIBUTES GetProcessAttributesFunc(uint32_t NumVerts, bool IsSwizzled, bool HasConstantInterp, bool IsDegenerate = false)
{
    return TemplateArgUnroller<ProcessAttributesChooser>::GetFunc(IntArg<1, 3>{NumVerts}, IsSwizzled, HasConstantInterp, IsDegenerate);
}
1602
1603 //////////////////////////////////////////////////////////////////////////
1604 /// @brief Processes enabled user clip distances. Loads the active clip
1605 /// distances from the PA, sets up barycentric equations, and
1606 /// stores the results to the output buffer
1607 /// @param pa - Primitive Assembly state
1608 /// @param primIndex - primitive index to process
1609 /// @param clipDistMask - mask of enabled clip distances
1610 /// @param pUserClipBuffer - buffer to store results
1611 template<uint32_t NumVerts>
1612 void ProcessUserClipDist(PA_STATE& pa, uint32_t primIndex, uint8_t clipDistMask, float* pUserClipBuffer)
1613 {
1614 DWORD clipDist;
1615 while (_BitScanForward(&clipDist, clipDistMask))
1616 {
1617 clipDistMask &= ~(1 << clipDist);
1618 uint32_t clipSlot = clipDist >> 2;
1619 uint32_t clipComp = clipDist & 0x3;
1620 uint32_t clipAttribSlot = clipSlot == 0 ?
1621 VERTEX_CLIPCULL_DIST_LO_SLOT : VERTEX_CLIPCULL_DIST_HI_SLOT;
1622
1623 __m128 primClipDist[3];
1624 pa.AssembleSingle(clipAttribSlot, primIndex, primClipDist);
1625
1626 float vertClipDist[NumVerts];
1627 for (uint32_t e = 0; e < NumVerts; ++e)
1628 {
1629 OSALIGNSIMD(float) aVertClipDist[4];
1630 _mm_store_ps(aVertClipDist, primClipDist[e]);
1631 vertClipDist[e] = aVertClipDist[clipComp];
1632 };
1633
1634 // setup plane equations for barycentric interpolation in the backend
1635 float baryCoeff[NumVerts];
1636 for (uint32_t e = 0; e < NumVerts - 1; ++e)
1637 {
1638 baryCoeff[e] = vertClipDist[e] - vertClipDist[NumVerts - 1];
1639 }
1640 baryCoeff[NumVerts - 1] = vertClipDist[NumVerts - 1];
1641
1642 for (uint32_t e = 0; e < NumVerts; ++e)
1643 {
1644 *(pUserClipBuffer++) = baryCoeff[e];
1645 }
1646 }
1647 }
1648
1649 //////////////////////////////////////////////////////////////////////////
1650 /// @brief Convert the X,Y coords of a triangle to the requested Fixed
1651 /// Point precision from FP32.
1652 template <typename PT = FixedPointTraits<Fixed_16_8>>
1653 INLINE simdscalari fpToFixedPointVertical(const simdscalar vIn)
1654 {
1655 simdscalar vFixed = _simd_mul_ps(vIn, _simd_set1_ps(PT::ScaleT::value));
1656 return _simd_cvtps_epi32(vFixed);
1657 }
1658
1659 //////////////////////////////////////////////////////////////////////////
1660 /// @brief Helper function to set the X,Y coords of a triangle to the
1661 /// requested Fixed Point precision from FP32.
1662 /// @param tri: simdvector[3] of FP triangle verts
1663 /// @param vXi: fixed point X coords of tri verts
1664 /// @param vYi: fixed point Y coords of tri verts
1665 INLINE static void FPToFixedPoint(const simdvector * const tri, simdscalari (&vXi)[3], simdscalari (&vYi)[3])
1666 {
1667 vXi[0] = fpToFixedPointVertical(tri[0].x);
1668 vYi[0] = fpToFixedPointVertical(tri[0].y);
1669 vXi[1] = fpToFixedPointVertical(tri[1].x);
1670 vYi[1] = fpToFixedPointVertical(tri[1].y);
1671 vXi[2] = fpToFixedPointVertical(tri[2].x);
1672 vYi[2] = fpToFixedPointVertical(tri[2].y);
1673 }
1674
1675 //////////////////////////////////////////////////////////////////////////
1676 /// @brief Calculate bounding box for current triangle
1677 /// @tparam CT: ConservativeRastFETraits type
1678 /// @param vX: fixed point X position for triangle verts
1679 /// @param vY: fixed point Y position for triangle verts
1680 /// @param bbox: fixed point bbox
1681 /// *Note*: expects vX, vY to be in the correct precision for the type
1682 /// of rasterization. This avoids unnecessary FP->fixed conversions.
1683 template <typename CT>
1684 INLINE void calcBoundingBoxIntVertical(const simdvector * const tri, simdscalari (&vX)[3], simdscalari (&vY)[3], simdBBox &bbox)
1685 {
1686 simdscalari vMinX = vX[0];
1687 vMinX = _simd_min_epi32(vMinX, vX[1]);
1688 vMinX = _simd_min_epi32(vMinX, vX[2]);
1689
1690 simdscalari vMaxX = vX[0];
1691 vMaxX = _simd_max_epi32(vMaxX, vX[1]);
1692 vMaxX = _simd_max_epi32(vMaxX, vX[2]);
1693
1694 simdscalari vMinY = vY[0];
1695 vMinY = _simd_min_epi32(vMinY, vY[1]);
1696 vMinY = _simd_min_epi32(vMinY, vY[2]);
1697
1698 simdscalari vMaxY = vY[0];
1699 vMaxY = _simd_max_epi32(vMaxY, vY[1]);
1700 vMaxY = _simd_max_epi32(vMaxY, vY[2]);
1701
1702 bbox.left = vMinX;
1703 bbox.right = vMaxX;
1704 bbox.top = vMinY;
1705 bbox.bottom = vMaxY;
1706 }
1707
1708 //////////////////////////////////////////////////////////////////////////
1709 /// @brief FEConservativeRastT specialization of calcBoundingBoxIntVertical
1710 /// Offsets BBox for conservative rast
1711 template <>
1712 INLINE void calcBoundingBoxIntVertical<FEConservativeRastT>(const simdvector * const tri, simdscalari (&vX)[3], simdscalari (&vY)[3], simdBBox &bbox)
1713 {
1714 // FE conservative rast traits
1715 typedef FEConservativeRastT CT;
1716
1717 simdscalari vMinX = vX[0];
1718 vMinX = _simd_min_epi32(vMinX, vX[1]);
1719 vMinX = _simd_min_epi32(vMinX, vX[2]);
1720
1721 simdscalari vMaxX = vX[0];
1722 vMaxX = _simd_max_epi32(vMaxX, vX[1]);
1723 vMaxX = _simd_max_epi32(vMaxX, vX[2]);
1724
1725 simdscalari vMinY = vY[0];
1726 vMinY = _simd_min_epi32(vMinY, vY[1]);
1727 vMinY = _simd_min_epi32(vMinY, vY[2]);
1728
1729 simdscalari vMaxY = vY[0];
1730 vMaxY = _simd_max_epi32(vMaxY, vY[1]);
1731 vMaxY = _simd_max_epi32(vMaxY, vY[2]);
1732
1733 /// Bounding box needs to be expanded by 1/512 before snapping to 16.8 for conservative rasterization
1734 /// expand bbox by 1/256; coverage will be correctly handled in the rasterizer.
1735 bbox.left = _simd_sub_epi32(vMinX, _simd_set1_epi32(CT::BoundingBoxOffsetT::value));
1736 bbox.right = _simd_add_epi32(vMaxX, _simd_set1_epi32(CT::BoundingBoxOffsetT::value));
1737 bbox.top = _simd_sub_epi32(vMinY, _simd_set1_epi32(CT::BoundingBoxOffsetT::value));
1738 bbox.bottom = _simd_add_epi32(vMaxY, _simd_set1_epi32(CT::BoundingBoxOffsetT::value));
1739 }
1740
1741 //////////////////////////////////////////////////////////////////////////
1742 /// @brief Bin triangle primitives to macro tiles. Performs setup, clipping
1743 /// culling, viewport transform, etc.
1744 /// @param pDC - pointer to draw context.
1745 /// @param pa - The primitive assembly object.
1746 /// @param workerId - thread's worker id. Even thread has a unique id.
1747 /// @param tri - Contains triangle position data for SIMDs worth of triangles.
1748 /// @param primID - Primitive ID for each triangle.
1749 /// @param viewportIdx - viewport array index for each triangle.
1750 /// @tparam CT - ConservativeRastFETraits
1751 template <typename CT>
1752 void BinTriangles(
1753 DRAW_CONTEXT *pDC,
1754 PA_STATE& pa,
1755 uint32_t workerId,
1756 simdvector tri[3],
1757 uint32_t triMask,
1758 simdscalari primID,
1759 simdscalari viewportIdx)
1760 {
1761 RDTSC_START(FEBinTriangles);
1762
1763 const API_STATE& state = GetApiState(pDC);
1764 const SWR_RASTSTATE& rastState = state.rastState;
1765 const SWR_FRONTEND_STATE& feState = state.frontendState;
1766 const SWR_GS_STATE& gsState = state.gsState;
1767 MacroTileMgr *pTileMgr = pDC->pTileMgr;
1768
1769
1770 simdscalar vRecipW0 = _simd_set1_ps(1.0f);
1771 simdscalar vRecipW1 = _simd_set1_ps(1.0f);
1772 simdscalar vRecipW2 = _simd_set1_ps(1.0f);
1773
1774 if (!feState.vpTransformDisable)
1775 {
1776 // perspective divide
1777 vRecipW0 = _simd_div_ps(_simd_set1_ps(1.0f), tri[0].w);
1778 vRecipW1 = _simd_div_ps(_simd_set1_ps(1.0f), tri[1].w);
1779 vRecipW2 = _simd_div_ps(_simd_set1_ps(1.0f), tri[2].w);
1780
1781 tri[0].v[0] = _simd_mul_ps(tri[0].v[0], vRecipW0);
1782 tri[1].v[0] = _simd_mul_ps(tri[1].v[0], vRecipW1);
1783 tri[2].v[0] = _simd_mul_ps(tri[2].v[0], vRecipW2);
1784
1785 tri[0].v[1] = _simd_mul_ps(tri[0].v[1], vRecipW0);
1786 tri[1].v[1] = _simd_mul_ps(tri[1].v[1], vRecipW1);
1787 tri[2].v[1] = _simd_mul_ps(tri[2].v[1], vRecipW2);
1788
1789 tri[0].v[2] = _simd_mul_ps(tri[0].v[2], vRecipW0);
1790 tri[1].v[2] = _simd_mul_ps(tri[1].v[2], vRecipW1);
1791 tri[2].v[2] = _simd_mul_ps(tri[2].v[2], vRecipW2);
1792
1793 // viewport transform to screen coords
1794 if (state.gsState.emitsViewportArrayIndex)
1795 {
1796 viewportTransform<3>(tri, state.vpMatrices, viewportIdx);
1797 }
1798 else
1799 {
1800 viewportTransform<3>(tri, state.vpMatrices);
1801 }
1802 }
1803
1804 // adjust for pixel center location
1805 simdscalar offset = g_pixelOffsets[rastState.pixelLocation];
1806 tri[0].x = _simd_add_ps(tri[0].x, offset);
1807 tri[0].y = _simd_add_ps(tri[0].y, offset);
1808
1809 tri[1].x = _simd_add_ps(tri[1].x, offset);
1810 tri[1].y = _simd_add_ps(tri[1].y, offset);
1811
1812 tri[2].x = _simd_add_ps(tri[2].x, offset);
1813 tri[2].y = _simd_add_ps(tri[2].y, offset);
1814
1815 simdscalari vXi[3], vYi[3];
1816 // Set vXi, vYi to required fixed point precision
1817 FPToFixedPoint(tri, vXi, vYi);
1818
1819 // triangle setup
1820 simdscalari vAi[3], vBi[3];
1821 triangleSetupABIntVertical(vXi, vYi, vAi, vBi);
1822
1823 // determinant
1824 simdscalari vDet[2];
1825 calcDeterminantIntVertical(vAi, vBi, vDet);
1826
1827 // cull zero area
1828 int maskLo = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpeq_epi64(vDet[0], _simd_setzero_si())));
1829 int maskHi = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpeq_epi64(vDet[1], _simd_setzero_si())));
1830
1831 int cullZeroAreaMask = maskLo | (maskHi << (KNOB_SIMD_WIDTH / 2));
1832
1833 uint32_t origTriMask = triMask;
1834 // don't cull degenerate triangles if we're conservatively rasterizing
1835 if(!CT::IsConservativeT::value)
1836 {
1837 triMask &= ~cullZeroAreaMask;
1838 }
1839
1840 // determine front winding tris
1841 // CW +det
1842 // CCW det <= 0; 0 area triangles are marked as backfacing, which is required behavior for conservative rast
1843 maskLo = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpgt_epi64(vDet[0], _simd_setzero_si())));
1844 maskHi = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpgt_epi64(vDet[1], _simd_setzero_si())));
1845 int cwTriMask = maskLo | (maskHi << (KNOB_SIMD_WIDTH /2) );
1846
1847 uint32_t frontWindingTris;
1848 if (rastState.frontWinding == SWR_FRONTWINDING_CW)
1849 {
1850 frontWindingTris = cwTriMask;
1851 }
1852 else
1853 {
1854 frontWindingTris = ~cwTriMask;
1855 }
1856
1857 // cull
1858 uint32_t cullTris;
1859 switch ((SWR_CULLMODE)rastState.cullMode)
1860 {
1861 case SWR_CULLMODE_BOTH: cullTris = 0xffffffff; break;
1862 case SWR_CULLMODE_NONE: cullTris = 0x0; break;
1863 case SWR_CULLMODE_FRONT: cullTris = frontWindingTris; break;
1864 // 0 area triangles are marked as backfacing, which is required behavior for conservative rast
1865 case SWR_CULLMODE_BACK: cullTris = ~frontWindingTris; break;
1866 default: SWR_ASSERT(false, "Invalid cull mode: %d", rastState.cullMode); cullTris = 0x0; break;
1867 }
1868
1869 triMask &= ~cullTris;
1870
1871 if (origTriMask ^ triMask)
1872 {
1873 RDTSC_EVENT(FECullZeroAreaAndBackface, _mm_popcnt_u32(origTriMask ^ triMask), 0);
1874 }
1875
1876 /// Note: these variable initializations must stay above any 'goto endBenTriangles'
1877 // compute per tri backface
1878 uint32_t frontFaceMask = frontWindingTris;
1879 uint32_t *pPrimID = (uint32_t *)&primID;
1880 DWORD triIndex = 0;
1881 // for center sample pattern, all samples are at pixel center; calculate coverage
1882 // once at center and broadcast the results in the backend
1883 const SWR_MULTISAMPLE_COUNT sampleCount = (rastState.samplePattern == SWR_MSAA_STANDARD_PATTERN) ? rastState.sampleCount : SWR_MULTISAMPLE_1X;
1884 uint32_t edgeEnable;
1885 PFN_WORK_FUNC pfnWork;
1886 if(CT::IsConservativeT::value)
1887 {
1888 // determine which edges of the degenerate tri, if any, are valid to rasterize.
1889 // used to call the appropriate templated rasterizer function
1890 if(cullZeroAreaMask > 0)
1891 {
1892 // e0 = v1-v0
1893 simdscalari x0x1Mask = _simd_cmpeq_epi32(vXi[0], vXi[1]);
1894 simdscalari y0y1Mask = _simd_cmpeq_epi32(vYi[0], vYi[1]);
1895 uint32_t e0Mask = _simd_movemask_ps(_simd_castsi_ps(_simd_and_si(x0x1Mask, y0y1Mask)));
1896
1897 // e1 = v2-v1
1898 simdscalari x1x2Mask = _simd_cmpeq_epi32(vXi[1], vXi[2]);
1899 simdscalari y1y2Mask = _simd_cmpeq_epi32(vYi[1], vYi[2]);
1900 uint32_t e1Mask = _simd_movemask_ps(_simd_castsi_ps(_simd_and_si(x1x2Mask, y1y2Mask)));
1901
1902 // e2 = v0-v2
1903 // if v0 == v1 & v1 == v2, v0 == v2
1904 uint32_t e2Mask = e0Mask & e1Mask;
1905 SWR_ASSERT(KNOB_SIMD_WIDTH == 8, "Need to update degenerate mask code for avx512");
1906
1907 // edge order: e0 = v0v1, e1 = v1v2, e2 = v0v2
1908 // 32 bit binary: 0000 0000 0010 0100 1001 0010 0100 1001
1909 e0Mask = pdep_u32(e0Mask, 0x00249249);
1910 // 32 bit binary: 0000 0000 0100 1001 0010 0100 1001 0010
1911 e1Mask = pdep_u32(e1Mask, 0x00492492);
1912 // 32 bit binary: 0000 0000 1001 0010 0100 1001 0010 0100
1913 e2Mask = pdep_u32(e2Mask, 0x00924924);
1914
1915 edgeEnable = (0x00FFFFFF & (~(e0Mask | e1Mask | e2Mask)));
1916 }
1917 else
1918 {
1919 edgeEnable = 0x00FFFFFF;
1920 }
1921 }
1922 else
1923 {
1924 // degenerate triangles won't be sent to rasterizer; just enable all edges
1925 pfnWork = GetRasterizerFunc(sampleCount, (rastState.conservativeRast > 0),
1926 (SWR_INPUT_COVERAGE)pDC->pState->state.psState.inputCoverage, ALL_EDGES_VALID,
1927 (rastState.scissorEnable > 0));
1928 }
1929
1930 if (!triMask)
1931 {
1932 goto endBinTriangles;
1933 }
1934
1935 // Calc bounding box of triangles
1936 simdBBox bbox;
1937 calcBoundingBoxIntVertical<CT>(tri, vXi, vYi, bbox);
1938
1939 // determine if triangle falls between pixel centers and discard
1940 // only discard for non-MSAA case and when conservative rast is disabled
1941 // (left + 127) & ~255
1942 // (right + 128) & ~255
1943 if(rastState.sampleCount == SWR_MULTISAMPLE_1X && (!CT::IsConservativeT::value))
1944 {
1945 origTriMask = triMask;
1946
1947 int cullCenterMask;
1948 {
1949 simdscalari left = _simd_add_epi32(bbox.left, _simd_set1_epi32(127));
1950 left = _simd_and_si(left, _simd_set1_epi32(~255));
1951 simdscalari right = _simd_add_epi32(bbox.right, _simd_set1_epi32(128));
1952 right = _simd_and_si(right, _simd_set1_epi32(~255));
1953
1954 simdscalari vMaskH = _simd_cmpeq_epi32(left, right);
1955
1956 simdscalari top = _simd_add_epi32(bbox.top, _simd_set1_epi32(127));
1957 top = _simd_and_si(top, _simd_set1_epi32(~255));
1958 simdscalari bottom = _simd_add_epi32(bbox.bottom, _simd_set1_epi32(128));
1959 bottom = _simd_and_si(bottom, _simd_set1_epi32(~255));
1960
1961 simdscalari vMaskV = _simd_cmpeq_epi32(top, bottom);
1962 vMaskV = _simd_or_si(vMaskH, vMaskV);
1963 cullCenterMask = _simd_movemask_ps(_simd_castsi_ps(vMaskV));
1964 }
1965
1966 triMask &= ~cullCenterMask;
1967
1968 if(origTriMask ^ triMask)
1969 {
1970 RDTSC_EVENT(FECullBetweenCenters, _mm_popcnt_u32(origTriMask ^ triMask), 0);
1971 }
1972 }
1973
1974 // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since right/bottom edge is exclusive.
1975 bbox.left = _simd_max_epi32(bbox.left, _simd_set1_epi32(state.scissorInFixedPoint.left));
1976 bbox.top = _simd_max_epi32(bbox.top, _simd_set1_epi32(state.scissorInFixedPoint.top));
1977 bbox.right = _simd_min_epi32(_simd_sub_epi32(bbox.right, _simd_set1_epi32(1)), _simd_set1_epi32(state.scissorInFixedPoint.right));
1978 bbox.bottom = _simd_min_epi32(_simd_sub_epi32(bbox.bottom, _simd_set1_epi32(1)), _simd_set1_epi32(state.scissorInFixedPoint.bottom));
1979
1980 if(CT::IsConservativeT::value)
1981 {
1982 // in the case where a degenerate triangle is on a scissor edge, we need to make sure the primitive bbox has
1983 // some area. Bump the right/bottom edges out
1984 simdscalari topEqualsBottom = _simd_cmpeq_epi32(bbox.top, bbox.bottom);
1985 bbox.bottom = _simd_blendv_epi32(bbox.bottom, _simd_add_epi32(bbox.bottom, _simd_set1_epi32(1)), topEqualsBottom);
1986 simdscalari leftEqualsRight = _simd_cmpeq_epi32(bbox.left, bbox.right);
1987 bbox.right = _simd_blendv_epi32(bbox.right, _simd_add_epi32(bbox.right, _simd_set1_epi32(1)), leftEqualsRight);
1988 }
1989
1990 // Cull tris completely outside scissor
1991 {
1992 simdscalari maskOutsideScissorX = _simd_cmpgt_epi32(bbox.left, bbox.right);
1993 simdscalari maskOutsideScissorY = _simd_cmpgt_epi32(bbox.top, bbox.bottom);
1994 simdscalari maskOutsideScissorXY = _simd_or_si(maskOutsideScissorX, maskOutsideScissorY);
1995 uint32_t maskOutsideScissor = _simd_movemask_ps(_simd_castsi_ps(maskOutsideScissorXY));
1996 triMask = triMask & ~maskOutsideScissor;
1997 }
1998
1999 if (!triMask)
2000 {
2001 goto endBinTriangles;
2002 }
2003
2004 // Convert triangle bbox to macrotile units.
2005 bbox.left = _simd_srai_epi32(bbox.left, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
2006 bbox.top = _simd_srai_epi32(bbox.top, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
2007 bbox.right = _simd_srai_epi32(bbox.right, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
2008 bbox.bottom = _simd_srai_epi32(bbox.bottom, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
2009
2010 OSALIGNSIMD(uint32_t) aMTLeft[KNOB_SIMD_WIDTH], aMTRight[KNOB_SIMD_WIDTH], aMTTop[KNOB_SIMD_WIDTH], aMTBottom[KNOB_SIMD_WIDTH];
2011 _simd_store_si((simdscalari*)aMTLeft, bbox.left);
2012 _simd_store_si((simdscalari*)aMTRight, bbox.right);
2013 _simd_store_si((simdscalari*)aMTTop, bbox.top);
2014 _simd_store_si((simdscalari*)aMTBottom, bbox.bottom);
2015
2016 // transpose verts needed for backend
2017 /// @todo modify BE to take non-transformed verts
2018 __m128 vHorizX[8], vHorizY[8], vHorizZ[8], vHorizW[8];
2019 vTranspose3x8(vHorizX, tri[0].x, tri[1].x, tri[2].x);
2020 vTranspose3x8(vHorizY, tri[0].y, tri[1].y, tri[2].y);
2021 vTranspose3x8(vHorizZ, tri[0].z, tri[1].z, tri[2].z);
2022 vTranspose3x8(vHorizW, vRecipW0, vRecipW1, vRecipW2);
2023
2024 // store render target array index
2025 OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH];
2026 if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex)
2027 {
2028 simdvector vRtai[3];
2029 pa.Assemble(VERTEX_RTAI_SLOT, vRtai);
2030 simdscalari vRtaii;
2031 vRtaii = _simd_castps_si(vRtai[0].x);
2032 _simd_store_si((simdscalari*)aRTAI, vRtaii);
2033 }
2034 else
2035 {
2036 _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si());
2037 }
2038
2039 // scan remaining valid triangles and bin each separately
2040 while (_BitScanForward(&triIndex, triMask))
2041 {
2042 uint32_t linkageCount = state.backendState.numAttributes;
2043 uint32_t numScalarAttribs = linkageCount * 4;
2044
2045 BE_WORK work;
2046 work.type = DRAW;
2047
2048 bool isDegenerate;
2049 if(CT::IsConservativeT::value)
2050 {
2051 // only rasterize valid edges if we have a degenerate primitive
2052 int32_t triEdgeEnable = (edgeEnable >> (triIndex * 3)) & ALL_EDGES_VALID;
2053 work.pfnWork = GetRasterizerFunc(sampleCount, (rastState.conservativeRast > 0),
2054 (SWR_INPUT_COVERAGE)pDC->pState->state.psState.inputCoverage, triEdgeEnable,
2055 (rastState.scissorEnable > 0));
2056
2057 // Degenerate triangles are required to be constant interpolated
2058 isDegenerate = (triEdgeEnable != ALL_EDGES_VALID) ? true : false;
2059 }
2060 else
2061 {
2062 isDegenerate = false;
2063 work.pfnWork = pfnWork;
2064 }
2065
2066 // Select attribute processor
2067 PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(3,
2068 state.backendState.swizzleEnable, state.backendState.constantInterpolationMask, isDegenerate);
2069
2070 TRIANGLE_WORK_DESC &desc = work.desc.tri;
2071
2072 desc.triFlags.frontFacing = state.forceFront ? 1 : ((frontFaceMask >> triIndex) & 1);
2073 desc.triFlags.primID = pPrimID[triIndex];
2074 desc.triFlags.renderTargetArrayIndex = aRTAI[triIndex];
2075
2076 auto pArena = pDC->pArena;
2077 SWR_ASSERT(pArena != nullptr);
2078
2079 // store active attribs
2080 float *pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
2081 desc.pAttribs = pAttribs;
2082 desc.numAttribs = linkageCount;
2083 pfnProcessAttribs(pDC, pa, triIndex, pPrimID[triIndex], desc.pAttribs);
2084
2085 // store triangle vertex data
2086 desc.pTriBuffer = (float*)pArena->AllocAligned(4 * 4 * sizeof(float), 16);
2087
2088 _mm_store_ps(&desc.pTriBuffer[0], vHorizX[triIndex]);
2089 _mm_store_ps(&desc.pTriBuffer[4], vHorizY[triIndex]);
2090 _mm_store_ps(&desc.pTriBuffer[8], vHorizZ[triIndex]);
2091 _mm_store_ps(&desc.pTriBuffer[12], vHorizW[triIndex]);
2092
2093 // store user clip distances
2094 if (rastState.clipDistanceMask)
2095 {
2096 uint32_t numClipDist = _mm_popcnt_u32(rastState.clipDistanceMask);
2097 desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 3 * sizeof(float));
2098 ProcessUserClipDist<3>(pa, triIndex, rastState.clipDistanceMask, desc.pUserClipBuffer);
2099 }
2100
2101 for (uint32_t y = aMTTop[triIndex]; y <= aMTBottom[triIndex]; ++y)
2102 {
2103 for (uint32_t x = aMTLeft[triIndex]; x <= aMTRight[triIndex]; ++x)
2104 {
2105 #if KNOB_ENABLE_TOSS_POINTS
2106 if (!KNOB_TOSS_SETUP_TRIS)
2107 #endif
2108 {
2109 pTileMgr->enqueue(x, y, &work);
2110 }
2111 }
2112 }
2113 triMask &= ~(1 << triIndex);
2114 }
2115
2116 endBinTriangles:
2117 RDTSC_STOP(FEBinTriangles, 1, 0);
2118 }
2119
2120 struct FEBinTrianglesChooser
2121 {
2122 typedef PFN_PROCESS_PRIMS FuncType;
2123
2124 template <typename... ArgsB>
2125 static FuncType GetFunc()
2126 {
2127 return BinTriangles<ConservativeRastFETraits<ArgsB...>>;
2128 }
2129 };
2130
2131 // Selector for correct templated BinTrinagles function
2132 PFN_PROCESS_PRIMS GetBinTrianglesFunc(bool IsConservative)
2133 {
2134 return TemplateArgUnroller<FEBinTrianglesChooser>::GetFunc(IsConservative);
2135 }
2136
2137 //////////////////////////////////////////////////////////////////////////
2138 /// @brief Bin SIMD points to the backend. Only supports point size of 1
2139 /// @param pDC - pointer to draw context.
2140 /// @param pa - The primitive assembly object.
2141 /// @param workerId - thread's worker id. Even thread has a unique id.
2142 /// @param tri - Contains point position data for SIMDs worth of points.
2143 /// @param primID - Primitive ID for each point.
2144 void BinPoints(
2145 DRAW_CONTEXT *pDC,
2146 PA_STATE& pa,
2147 uint32_t workerId,
2148 simdvector prim[3],
2149 uint32_t primMask,
2150 simdscalari primID,
2151 simdscalari viewportIdx)
2152 {
2153 RDTSC_START(FEBinPoints);
2154
2155 simdvector& primVerts = prim[0];
2156
2157 const API_STATE& state = GetApiState(pDC);
2158 const SWR_FRONTEND_STATE& feState = state.frontendState;
2159 const SWR_GS_STATE& gsState = state.gsState;
2160 const SWR_RASTSTATE& rastState = state.rastState;
2161
2162 // Select attribute processor
2163 PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(1,
2164 state.backendState.swizzleEnable, state.backendState.constantInterpolationMask);
2165
2166 if (!feState.vpTransformDisable)
2167 {
2168 // perspective divide
2169 simdscalar vRecipW0 = _simd_div_ps(_simd_set1_ps(1.0f), primVerts.w);
2170 primVerts.x = _simd_mul_ps(primVerts.x, vRecipW0);
2171 primVerts.y = _simd_mul_ps(primVerts.y, vRecipW0);
2172 primVerts.z = _simd_mul_ps(primVerts.z, vRecipW0);
2173
2174 // viewport transform to screen coords
2175 if (state.gsState.emitsViewportArrayIndex)
2176 {
2177 viewportTransform<1>(&primVerts, state.vpMatrices, viewportIdx);
2178 }
2179 else
2180 {
2181 viewportTransform<1>(&primVerts, state.vpMatrices);
2182 }
2183 }
2184
2185 // adjust for pixel center location
2186 simdscalar offset = g_pixelOffsets[rastState.pixelLocation];
2187 primVerts.x = _simd_add_ps(primVerts.x, offset);
2188 primVerts.y = _simd_add_ps(primVerts.y, offset);
2189
2190 // convert to fixed point
2191 simdscalari vXi, vYi;
2192 vXi = fpToFixedPointVertical(primVerts.x);
2193 vYi = fpToFixedPointVertical(primVerts.y);
2194
2195 if (CanUseSimplePoints(pDC))
2196 {
2197 // adjust for top-left rule
2198 vXi = _simd_sub_epi32(vXi, _simd_set1_epi32(1));
2199 vYi = _simd_sub_epi32(vYi, _simd_set1_epi32(1));
2200
2201 // cull points off the top-left edge of the viewport
2202 primMask &= ~_simd_movemask_ps(_simd_castsi_ps(vXi));
2203 primMask &= ~_simd_movemask_ps(_simd_castsi_ps(vYi));
2204
2205 // compute macro tile coordinates
2206 simdscalari macroX = _simd_srai_epi32(vXi, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
2207 simdscalari macroY = _simd_srai_epi32(vYi, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
2208
2209 OSALIGNSIMD(uint32_t) aMacroX[KNOB_SIMD_WIDTH], aMacroY[KNOB_SIMD_WIDTH];
2210 _simd_store_si((simdscalari*)aMacroX, macroX);
2211 _simd_store_si((simdscalari*)aMacroY, macroY);
2212
2213 // compute raster tile coordinates
2214 simdscalari rasterX = _simd_srai_epi32(vXi, KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT);
2215 simdscalari rasterY = _simd_srai_epi32(vYi, KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT);
2216
2217 // compute raster tile relative x,y for coverage mask
2218 simdscalari tileAlignedX = _simd_slli_epi32(rasterX, KNOB_TILE_X_DIM_SHIFT);
2219 simdscalari tileAlignedY = _simd_slli_epi32(rasterY, KNOB_TILE_Y_DIM_SHIFT);
2220
2221 simdscalari tileRelativeX = _simd_sub_epi32(_simd_srai_epi32(vXi, FIXED_POINT_SHIFT), tileAlignedX);
2222 simdscalari tileRelativeY = _simd_sub_epi32(_simd_srai_epi32(vYi, FIXED_POINT_SHIFT), tileAlignedY);
2223
2224 OSALIGNSIMD(uint32_t) aTileRelativeX[KNOB_SIMD_WIDTH];
2225 OSALIGNSIMD(uint32_t) aTileRelativeY[KNOB_SIMD_WIDTH];
2226 _simd_store_si((simdscalari*)aTileRelativeX, tileRelativeX);
2227 _simd_store_si((simdscalari*)aTileRelativeY, tileRelativeY);
2228
2229 OSALIGNSIMD(uint32_t) aTileAlignedX[KNOB_SIMD_WIDTH];
2230 OSALIGNSIMD(uint32_t) aTileAlignedY[KNOB_SIMD_WIDTH];
2231 _simd_store_si((simdscalari*)aTileAlignedX, tileAlignedX);
2232 _simd_store_si((simdscalari*)aTileAlignedY, tileAlignedY);
2233
2234 OSALIGNSIMD(float) aZ[KNOB_SIMD_WIDTH];
2235 _simd_store_ps((float*)aZ, primVerts.z);
2236
2237 // store render target array index
2238 OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH];
2239 if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex)
2240 {
2241 simdvector vRtai;
2242 pa.Assemble(VERTEX_RTAI_SLOT, &vRtai);
2243 simdscalari vRtaii = _simd_castps_si(vRtai.x);
2244 _simd_store_si((simdscalari*)aRTAI, vRtaii);
2245 }
2246 else
2247 {
2248 _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si());
2249 }
2250
2251 uint32_t *pPrimID = (uint32_t *)&primID;
2252 DWORD primIndex = 0;
2253
2254 const SWR_BACKEND_STATE& backendState = pDC->pState->state.backendState;
2255
2256 // scan remaining valid triangles and bin each separately
2257 while (_BitScanForward(&primIndex, primMask))
2258 {
2259 uint32_t linkageCount = backendState.numAttributes;
2260 uint32_t numScalarAttribs = linkageCount * 4;
2261
2262 BE_WORK work;
2263 work.type = DRAW;
2264
2265 TRIANGLE_WORK_DESC &desc = work.desc.tri;
2266
2267 // points are always front facing
2268 desc.triFlags.frontFacing = 1;
2269 desc.triFlags.primID = pPrimID[primIndex];
2270 desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];
2271
2272 work.pfnWork = RasterizeSimplePoint;
2273
2274 auto pArena = pDC->pArena;
2275 SWR_ASSERT(pArena != nullptr);
2276
2277 // store attributes
2278 float *pAttribs = (float*)pArena->AllocAligned(3 * numScalarAttribs * sizeof(float), 16);
2279 desc.pAttribs = pAttribs;
2280 desc.numAttribs = linkageCount;
2281
2282 pfnProcessAttribs(pDC, pa, primIndex, pPrimID[primIndex], pAttribs);
2283
2284 // store raster tile aligned x, y, perspective correct z
2285 float *pTriBuffer = (float*)pArena->AllocAligned(4 * sizeof(float), 16);
2286 desc.pTriBuffer = pTriBuffer;
2287 *(uint32_t*)pTriBuffer++ = aTileAlignedX[primIndex];
2288 *(uint32_t*)pTriBuffer++ = aTileAlignedY[primIndex];
2289 *pTriBuffer = aZ[primIndex];
2290
2291 uint32_t tX = aTileRelativeX[primIndex];
2292 uint32_t tY = aTileRelativeY[primIndex];
2293
2294 // pack the relative x,y into the coverageMask, the rasterizer will
2295 // generate the true coverage mask from it
2296 work.desc.tri.triFlags.coverageMask = tX | (tY << 4);
2297
2298 // bin it
2299 MacroTileMgr *pTileMgr = pDC->pTileMgr;
2300 #if KNOB_ENABLE_TOSS_POINTS
2301 if (!KNOB_TOSS_SETUP_TRIS)
2302 #endif
2303 {
2304 pTileMgr->enqueue(aMacroX[primIndex], aMacroY[primIndex], &work);
2305 }
2306 primMask &= ~(1 << primIndex);
2307 }
2308 }
2309 else
2310 {
2311 // non simple points need to be potentially binned to multiple macro tiles
2312 simdscalar vPointSize;
2313 if (rastState.pointParam)
2314 {
2315 simdvector size[3];
2316 pa.Assemble(VERTEX_POINT_SIZE_SLOT, size);
2317 vPointSize = size[0].x;
2318 }
2319 else
2320 {
2321 vPointSize = _simd_set1_ps(rastState.pointSize);
2322 }
2323
2324 // bloat point to bbox
2325 simdBBox bbox;
2326 bbox.left = bbox.right = vXi;
2327 bbox.top = bbox.bottom = vYi;
2328
2329 simdscalar vHalfWidth = _simd_mul_ps(vPointSize, _simd_set1_ps(0.5f));
2330 simdscalari vHalfWidthi = fpToFixedPointVertical(vHalfWidth);
2331 bbox.left = _simd_sub_epi32(bbox.left, vHalfWidthi);
2332 bbox.right = _simd_add_epi32(bbox.right, vHalfWidthi);
2333 bbox.top = _simd_sub_epi32(bbox.top, vHalfWidthi);
2334 bbox.bottom = _simd_add_epi32(bbox.bottom, vHalfWidthi);
2335
2336 // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since right/bottom edge is exclusive.
2337 bbox.left = _simd_max_epi32(bbox.left, _simd_set1_epi32(state.scissorInFixedPoint.left));
2338 bbox.top = _simd_max_epi32(bbox.top, _simd_set1_epi32(state.scissorInFixedPoint.top));
2339 bbox.right = _simd_min_epi32(_simd_sub_epi32(bbox.right, _simd_set1_epi32(1)), _simd_set1_epi32(state.scissorInFixedPoint.right));
2340 bbox.bottom = _simd_min_epi32(_simd_sub_epi32(bbox.bottom, _simd_set1_epi32(1)), _simd_set1_epi32(state.scissorInFixedPoint.bottom));
2341
2342 // Cull bloated points completely outside scissor
2343 simdscalari maskOutsideScissorX = _simd_cmpgt_epi32(bbox.left, bbox.right);
2344 simdscalari maskOutsideScissorY = _simd_cmpgt_epi32(bbox.top, bbox.bottom);
2345 simdscalari maskOutsideScissorXY = _simd_or_si(maskOutsideScissorX, maskOutsideScissorY);
2346 uint32_t maskOutsideScissor = _simd_movemask_ps(_simd_castsi_ps(maskOutsideScissorXY));
2347 primMask = primMask & ~maskOutsideScissor;
2348
2349 // Convert bbox to macrotile units.
2350 bbox.left = _simd_srai_epi32(bbox.left, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
2351 bbox.top = _simd_srai_epi32(bbox.top, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
2352 bbox.right = _simd_srai_epi32(bbox.right, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
2353 bbox.bottom = _simd_srai_epi32(bbox.bottom, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
2354
2355 OSALIGNSIMD(uint32_t) aMTLeft[KNOB_SIMD_WIDTH], aMTRight[KNOB_SIMD_WIDTH], aMTTop[KNOB_SIMD_WIDTH], aMTBottom[KNOB_SIMD_WIDTH];
2356 _simd_store_si((simdscalari*)aMTLeft, bbox.left);
2357 _simd_store_si((simdscalari*)aMTRight, bbox.right);
2358 _simd_store_si((simdscalari*)aMTTop, bbox.top);
2359 _simd_store_si((simdscalari*)aMTBottom, bbox.bottom);
2360
2361 // store render target array index
2362 OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH];
2363 if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex)
2364 {
2365 simdvector vRtai[2];
2366 pa.Assemble(VERTEX_RTAI_SLOT, vRtai);
2367 simdscalari vRtaii = _simd_castps_si(vRtai[0].x);
2368 _simd_store_si((simdscalari*)aRTAI, vRtaii);
2369 }
2370 else
2371 {
2372 _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si());
2373 }
2374
2375 OSALIGNSIMD(float) aPointSize[KNOB_SIMD_WIDTH];
2376 _simd_store_ps((float*)aPointSize, vPointSize);
2377
2378 uint32_t *pPrimID = (uint32_t *)&primID;
2379
2380 OSALIGNSIMD(float) aPrimVertsX[KNOB_SIMD_WIDTH];
2381 OSALIGNSIMD(float) aPrimVertsY[KNOB_SIMD_WIDTH];
2382 OSALIGNSIMD(float) aPrimVertsZ[KNOB_SIMD_WIDTH];
2383
2384 _simd_store_ps((float*)aPrimVertsX, primVerts.x);
2385 _simd_store_ps((float*)aPrimVertsY, primVerts.y);
2386 _simd_store_ps((float*)aPrimVertsZ, primVerts.z);
2387
2388 // scan remaining valid prims and bin each separately
2389 const SWR_BACKEND_STATE& backendState = state.backendState;
2390 DWORD primIndex;
2391 while (_BitScanForward(&primIndex, primMask))
2392 {
2393 uint32_t linkageCount = backendState.numAttributes;
2394 uint32_t numScalarAttribs = linkageCount * 4;
2395
2396 BE_WORK work;
2397 work.type = DRAW;
2398
2399 TRIANGLE_WORK_DESC &desc = work.desc.tri;
2400
2401 desc.triFlags.frontFacing = 1;
2402 desc.triFlags.primID = pPrimID[primIndex];
2403 desc.triFlags.pointSize = aPointSize[primIndex];
2404 desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];
2405
2406 work.pfnWork = RasterizeTriPoint;
2407
2408 auto pArena = pDC->pArena;
2409 SWR_ASSERT(pArena != nullptr);
2410
2411 // store active attribs
2412 desc.pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
2413 desc.numAttribs = linkageCount;
2414 pfnProcessAttribs(pDC, pa, primIndex, pPrimID[primIndex], desc.pAttribs);
2415
2416 // store point vertex data
2417 float *pTriBuffer = (float*)pArena->AllocAligned(4 * sizeof(float), 16);
2418 desc.pTriBuffer = pTriBuffer;
2419 *pTriBuffer++ = aPrimVertsX[primIndex];
2420 *pTriBuffer++ = aPrimVertsY[primIndex];
2421 *pTriBuffer = aPrimVertsZ[primIndex];
2422
2423 // store user clip distances
2424 if (rastState.clipDistanceMask)
2425 {
2426 uint32_t numClipDist = _mm_popcnt_u32(rastState.clipDistanceMask);
2427 desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 2 * sizeof(float));
2428 ProcessUserClipDist<2>(pa, primIndex, rastState.clipDistanceMask, desc.pUserClipBuffer);
2429 }
2430
2431 MacroTileMgr *pTileMgr = pDC->pTileMgr;
2432 for (uint32_t y = aMTTop[primIndex]; y <= aMTBottom[primIndex]; ++y)
2433 {
2434 for (uint32_t x = aMTLeft[primIndex]; x <= aMTRight[primIndex]; ++x)
2435 {
2436 #if KNOB_ENABLE_TOSS_POINTS
2437 if (!KNOB_TOSS_SETUP_TRIS)
2438 #endif
2439 {
2440 pTileMgr->enqueue(x, y, &work);
2441 }
2442 }
2443 }
2444
2445 primMask &= ~(1 << primIndex);
2446 }
2447 }
2448
2449
2450
2451
2452 RDTSC_STOP(FEBinPoints, 1, 0);
2453 }
2454
2455 //////////////////////////////////////////////////////////////////////////
2456 /// @brief Bin SIMD lines to the backend.
2457 /// @param pDC - pointer to draw context.
2458 /// @param pa - The primitive assembly object.
2459 /// @param workerId - thread's worker id. Even thread has a unique id.
2460 /// @param tri - Contains line position data for SIMDs worth of points.
2461 /// @param primID - Primitive ID for each line.
2462 void BinLines(
2463 DRAW_CONTEXT *pDC,
2464 PA_STATE& pa,
2465 uint32_t workerId,
2466 simdvector prim[],
2467 uint32_t primMask,
2468 simdscalari primID,
2469 simdscalari viewportIdx)
2470 {
2471 RDTSC_START(FEBinLines);
2472
2473 const API_STATE& state = GetApiState(pDC);
2474 const SWR_RASTSTATE& rastState = state.rastState;
2475 const SWR_FRONTEND_STATE& feState = state.frontendState;
2476 const SWR_GS_STATE& gsState = state.gsState;
2477
2478 // Select attribute processor
2479 PFN_PROCESS_ATTRIBUTES pfnProcessAttribs = GetProcessAttributesFunc(2,
2480 state.backendState.swizzleEnable, state.backendState.constantInterpolationMask);
2481
2482 simdscalar vRecipW0 = _simd_set1_ps(1.0f);
2483 simdscalar vRecipW1 = _simd_set1_ps(1.0f);
2484
2485 if (!feState.vpTransformDisable)
2486 {
2487 // perspective divide
2488 vRecipW0 = _simd_div_ps(_simd_set1_ps(1.0f), prim[0].w);
2489 vRecipW1 = _simd_div_ps(_simd_set1_ps(1.0f), prim[1].w);
2490
2491 prim[0].v[0] = _simd_mul_ps(prim[0].v[0], vRecipW0);
2492 prim[1].v[0] = _simd_mul_ps(prim[1].v[0], vRecipW1);
2493
2494 prim[0].v[1] = _simd_mul_ps(prim[0].v[1], vRecipW0);
2495 prim[1].v[1] = _simd_mul_ps(prim[1].v[1], vRecipW1);
2496
2497 prim[0].v[2] = _simd_mul_ps(prim[0].v[2], vRecipW0);
2498 prim[1].v[2] = _simd_mul_ps(prim[1].v[2], vRecipW1);
2499
2500 // viewport transform to screen coords
2501 if (state.gsState.emitsViewportArrayIndex)
2502 {
2503 viewportTransform<2>(prim, state.vpMatrices, viewportIdx);
2504 }
2505 else
2506 {
2507 viewportTransform<2>(prim, state.vpMatrices);
2508 }
2509 }
2510
2511 // adjust for pixel center location
2512 simdscalar offset = g_pixelOffsets[rastState.pixelLocation];
2513 prim[0].x = _simd_add_ps(prim[0].x, offset);
2514 prim[0].y = _simd_add_ps(prim[0].y, offset);
2515
2516 prim[1].x = _simd_add_ps(prim[1].x, offset);
2517 prim[1].y = _simd_add_ps(prim[1].y, offset);
2518
2519 // convert to fixed point
2520 simdscalari vXi[2], vYi[2];
2521 vXi[0] = fpToFixedPointVertical(prim[0].x);
2522 vYi[0] = fpToFixedPointVertical(prim[0].y);
2523 vXi[1] = fpToFixedPointVertical(prim[1].x);
2524 vYi[1] = fpToFixedPointVertical(prim[1].y);
2525
2526 // compute x-major vs y-major mask
2527 simdscalari xLength = _simd_abs_epi32(_simd_sub_epi32(vXi[0], vXi[1]));
2528 simdscalari yLength = _simd_abs_epi32(_simd_sub_epi32(vYi[0], vYi[1]));
2529 simdscalar vYmajorMask = _simd_castsi_ps(_simd_cmpgt_epi32(yLength, xLength));
2530 uint32_t yMajorMask = _simd_movemask_ps(vYmajorMask);
2531
2532 // cull zero-length lines
2533 simdscalari vZeroLengthMask = _simd_cmpeq_epi32(xLength, _simd_setzero_si());
2534 vZeroLengthMask = _simd_and_si(vZeroLengthMask, _simd_cmpeq_epi32(yLength, _simd_setzero_si()));
2535
2536 primMask &= ~_simd_movemask_ps(_simd_castsi_ps(vZeroLengthMask));
2537
2538 uint32_t *pPrimID = (uint32_t *)&primID;
2539
2540 simdscalar vUnused = _simd_setzero_ps();
2541
2542 // Calc bounding box of lines
2543 simdBBox bbox;
2544 bbox.left = _simd_min_epi32(vXi[0], vXi[1]);
2545 bbox.right = _simd_max_epi32(vXi[0], vXi[1]);
2546 bbox.top = _simd_min_epi32(vYi[0], vYi[1]);
2547 bbox.bottom = _simd_max_epi32(vYi[0], vYi[1]);
2548
2549 // bloat bbox by line width along minor axis
2550 simdscalar vHalfWidth = _simd_set1_ps(rastState.lineWidth / 2.0f);
2551 simdscalari vHalfWidthi = fpToFixedPointVertical(vHalfWidth);
2552 simdBBox bloatBox;
2553 bloatBox.left = _simd_sub_epi32(bbox.left, vHalfWidthi);
2554 bloatBox.right = _simd_add_epi32(bbox.right, vHalfWidthi);
2555 bloatBox.top = _simd_sub_epi32(bbox.top, vHalfWidthi);
2556 bloatBox.bottom = _simd_add_epi32(bbox.bottom, vHalfWidthi);
2557
2558 bbox.left = _simd_blendv_epi32(bbox.left, bloatBox.left, vYmajorMask);
2559 bbox.right = _simd_blendv_epi32(bbox.right, bloatBox.right, vYmajorMask);
2560 bbox.top = _simd_blendv_epi32(bloatBox.top, bbox.top, vYmajorMask);
2561 bbox.bottom = _simd_blendv_epi32(bloatBox.bottom, bbox.bottom, vYmajorMask);
2562
2563 // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since right/bottom edge is exclusive.
2564 bbox.left = _simd_max_epi32(bbox.left, _simd_set1_epi32(state.scissorInFixedPoint.left));
2565 bbox.top = _simd_max_epi32(bbox.top, _simd_set1_epi32(state.scissorInFixedPoint.top));
2566 bbox.right = _simd_min_epi32(_simd_sub_epi32(bbox.right, _simd_set1_epi32(1)), _simd_set1_epi32(state.scissorInFixedPoint.right));
2567 bbox.bottom = _simd_min_epi32(_simd_sub_epi32(bbox.bottom, _simd_set1_epi32(1)), _simd_set1_epi32(state.scissorInFixedPoint.bottom));
2568
2569 // Cull prims completely outside scissor
2570 {
2571 simdscalari maskOutsideScissorX = _simd_cmpgt_epi32(bbox.left, bbox.right);
2572 simdscalari maskOutsideScissorY = _simd_cmpgt_epi32(bbox.top, bbox.bottom);
2573 simdscalari maskOutsideScissorXY = _simd_or_si(maskOutsideScissorX, maskOutsideScissorY);
2574 uint32_t maskOutsideScissor = _simd_movemask_ps(_simd_castsi_ps(maskOutsideScissorXY));
2575 primMask = primMask & ~maskOutsideScissor;
2576 }
2577
2578 if (!primMask)
2579 {
2580 goto endBinLines;
2581 }
2582
2583 // Convert triangle bbox to macrotile units.
2584 bbox.left = _simd_srai_epi32(bbox.left, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
2585 bbox.top = _simd_srai_epi32(bbox.top, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
2586 bbox.right = _simd_srai_epi32(bbox.right, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
2587 bbox.bottom = _simd_srai_epi32(bbox.bottom, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
2588
2589 OSALIGNSIMD(uint32_t) aMTLeft[KNOB_SIMD_WIDTH], aMTRight[KNOB_SIMD_WIDTH], aMTTop[KNOB_SIMD_WIDTH], aMTBottom[KNOB_SIMD_WIDTH];
2590 _simd_store_si((simdscalari*)aMTLeft, bbox.left);
2591 _simd_store_si((simdscalari*)aMTRight, bbox.right);
2592 _simd_store_si((simdscalari*)aMTTop, bbox.top);
2593 _simd_store_si((simdscalari*)aMTBottom, bbox.bottom);
2594
2595 // transpose verts needed for backend
2596 /// @todo modify BE to take non-transformed verts
2597 __m128 vHorizX[8], vHorizY[8], vHorizZ[8], vHorizW[8];
2598 vTranspose3x8(vHorizX, prim[0].x, prim[1].x, vUnused);
2599 vTranspose3x8(vHorizY, prim[0].y, prim[1].y, vUnused);
2600 vTranspose3x8(vHorizZ, prim[0].z, prim[1].z, vUnused);
2601 vTranspose3x8(vHorizW, vRecipW0, vRecipW1, vUnused);
2602
2603 // store render target array index
2604 OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH];
2605 if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex)
2606 {
2607 simdvector vRtai[2];
2608 pa.Assemble(VERTEX_RTAI_SLOT, vRtai);
2609 simdscalari vRtaii = _simd_castps_si(vRtai[0].x);
2610 _simd_store_si((simdscalari*)aRTAI, vRtaii);
2611 }
2612 else
2613 {
2614 _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si());
2615 }
2616
2617 // scan remaining valid prims and bin each separately
2618 DWORD primIndex;
2619 while (_BitScanForward(&primIndex, primMask))
2620 {
2621 uint32_t linkageCount = state.backendState.numAttributes;
2622 uint32_t numScalarAttribs = linkageCount * 4;
2623
2624 BE_WORK work;
2625 work.type = DRAW;
2626
2627 TRIANGLE_WORK_DESC &desc = work.desc.tri;
2628
2629 desc.triFlags.frontFacing = 1;
2630 desc.triFlags.primID = pPrimID[primIndex];
2631 desc.triFlags.yMajor = (yMajorMask >> primIndex) & 1;
2632 desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];
2633
2634 work.pfnWork = RasterizeLine;
2635
2636 auto pArena = pDC->pArena;
2637 SWR_ASSERT(pArena != nullptr);
2638
2639 // store active attribs
2640 desc.pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
2641 desc.numAttribs = linkageCount;
2642 pfnProcessAttribs(pDC, pa, primIndex, pPrimID[primIndex], desc.pAttribs);
2643
2644 // store line vertex data
2645 desc.pTriBuffer = (float*)pArena->AllocAligned(4 * 4 * sizeof(float), 16);
2646 _mm_store_ps(&desc.pTriBuffer[0], vHorizX[primIndex]);
2647 _mm_store_ps(&desc.pTriBuffer[4], vHorizY[primIndex]);
2648 _mm_store_ps(&desc.pTriBuffer[8], vHorizZ[primIndex]);
2649 _mm_store_ps(&desc.pTriBuffer[12], vHorizW[primIndex]);
2650
2651 // store user clip distances
2652 if (rastState.clipDistanceMask)
2653 {
2654 uint32_t numClipDist = _mm_popcnt_u32(rastState.clipDistanceMask);
2655 desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 2 * sizeof(float));
2656 ProcessUserClipDist<2>(pa, primIndex, rastState.clipDistanceMask, desc.pUserClipBuffer);
2657 }
2658
2659 MacroTileMgr *pTileMgr = pDC->pTileMgr;
2660 for (uint32_t y = aMTTop[primIndex]; y <= aMTBottom[primIndex]; ++y)
2661 {
2662 for (uint32_t x = aMTLeft[primIndex]; x <= aMTRight[primIndex]; ++x)
2663 {
2664 #if KNOB_ENABLE_TOSS_POINTS
2665 if (!KNOB_TOSS_SETUP_TRIS)
2666 #endif
2667 {
2668 pTileMgr->enqueue(x, y, &work);
2669 }
2670 }
2671 }
2672
2673 primMask &= ~(1 << primIndex);
2674 }
2675
2676 endBinLines:
2677
2678 RDTSC_STOP(FEBinLines, 1, 0);
2679 }