swr: [rasterizer core] TemplateArgUnroller
[mesa.git] / src / gallium / drivers / swr / rasterizer / core / frontend.cpp
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file frontend.cpp
24 *
25 * @brief Implementation for Frontend which handles vertex processing,
26 * primitive assembly, clipping, binning, etc.
27 *
28 ******************************************************************************/
29
30 #include "api.h"
31 #include "frontend.h"
32 #include "backend.h"
33 #include "context.h"
34 #include "rdtsc_core.h"
35 #include "rasterizer.h"
36 #include "utils.h"
37 #include "threads.h"
38 #include "pa.h"
39 #include "clip.h"
40 #include "tilemgr.h"
41 #include "tessellator.h"
42
43 //////////////////////////////////////////////////////////////////////////
44 /// @brief Helper macro to generate a bitmask
45 static INLINE uint32_t GenMask(uint32_t numBits)
46 {
47 SWR_ASSERT(numBits <= (sizeof(uint32_t) * 8), "Too many bits (%d) for %s", numBits, __FUNCTION__);
48 return ((1U << numBits) - 1);
49 }
50
51 //////////////////////////////////////////////////////////////////////////
52 /// @brief Offsets added to post-viewport vertex positions based on
53 /// raster state.
// Lookup table indexed by SWR_PIXEL_LOCATION; each entry is the offset
// splatted across all SIMD lanes and added to post-viewport positions.
static const simdscalar g_pixelOffsets[SWR_PIXEL_LOCATION_UL + 1] =
{
    _simd_set1_ps(0.0f), // SWR_PIXEL_LOCATION_CENTER
    _simd_set1_ps(0.5f), // SWR_PIXEL_LOCATION_UL
};
59
60 //////////////////////////////////////////////////////////////////////////
61 /// @brief FE handler for SwrSync.
62 /// @param pContext - pointer to SWR context.
63 /// @param pDC - pointer to draw context.
64 /// @param workerId - thread's worker id. Even thread has a unique id.
65 /// @param pUserData - Pointer to user data passed back to sync callback.
66 /// @todo This should go away when we switch this to use compute threading.
67 void ProcessSync(
68 SWR_CONTEXT *pContext,
69 DRAW_CONTEXT *pDC,
70 uint32_t workerId,
71 void *pUserData)
72 {
73 SYNC_DESC *pSync = (SYNC_DESC*)pUserData;
74 BE_WORK work;
75 work.type = SYNC;
76 work.pfnWork = ProcessSyncBE;
77 work.desc.sync = *pSync;
78
79 MacroTileMgr *pTileMgr = pDC->pTileMgr;
80 pTileMgr->enqueue(0, 0, &work);
81 }
82
83 //////////////////////////////////////////////////////////////////////////
84 /// @brief FE handler for SwrGetStats.
85 /// @param pContext - pointer to SWR context.
86 /// @param pDC - pointer to draw context.
87 /// @param workerId - thread's worker id. Even thread has a unique id.
88 /// @param pUserData - Pointer to user data passed back to stats callback.
89 /// @todo This should go away when we switch this to use compute threading.
90 void ProcessQueryStats(
91 SWR_CONTEXT *pContext,
92 DRAW_CONTEXT *pDC,
93 uint32_t workerId,
94 void *pUserData)
95 {
96 QUERY_DESC *pQueryStats = (QUERY_DESC*)pUserData;
97 BE_WORK work;
98 work.type = QUERYSTATS;
99 work.pfnWork = ProcessQueryStatsBE;
100 work.desc.queryStats = *pQueryStats;
101
102 MacroTileMgr *pTileMgr = pDC->pTileMgr;
103 pTileMgr->enqueue(0, 0, &work);
104 }
105
106 //////////////////////////////////////////////////////////////////////////
107 /// @brief FE handler for SwrClearRenderTarget.
108 /// @param pContext - pointer to SWR context.
109 /// @param pDC - pointer to draw context.
110 /// @param workerId - thread's worker id. Even thread has a unique id.
111 /// @param pUserData - Pointer to user data passed back to clear callback.
112 /// @todo This should go away when we switch this to use compute threading.
113 void ProcessClear(
114 SWR_CONTEXT *pContext,
115 DRAW_CONTEXT *pDC,
116 uint32_t workerId,
117 void *pUserData)
118 {
119 CLEAR_DESC *pClear = (CLEAR_DESC*)pUserData;
120 MacroTileMgr *pTileMgr = pDC->pTileMgr;
121
122 const API_STATE& state = GetApiState(pDC);
123
124 // queue a clear to each macro tile
125 // compute macro tile bounds for the current scissor/viewport
126 uint32_t macroTileLeft = state.scissorInFixedPoint.left / KNOB_MACROTILE_X_DIM_FIXED;
127 uint32_t macroTileRight = state.scissorInFixedPoint.right / KNOB_MACROTILE_X_DIM_FIXED;
128 uint32_t macroTileTop = state.scissorInFixedPoint.top / KNOB_MACROTILE_Y_DIM_FIXED;
129 uint32_t macroTileBottom = state.scissorInFixedPoint.bottom / KNOB_MACROTILE_Y_DIM_FIXED;
130
131 BE_WORK work;
132 work.type = CLEAR;
133 work.pfnWork = ProcessClearBE;
134 work.desc.clear = *pClear;
135
136 for (uint32_t y = macroTileTop; y <= macroTileBottom; ++y)
137 {
138 for (uint32_t x = macroTileLeft; x <= macroTileRight; ++x)
139 {
140 pTileMgr->enqueue(x, y, &work);
141 }
142 }
143 }
144
145 //////////////////////////////////////////////////////////////////////////
146 /// @brief FE handler for SwrStoreTiles.
147 /// @param pContext - pointer to SWR context.
148 /// @param pDC - pointer to draw context.
149 /// @param workerId - thread's worker id. Even thread has a unique id.
150 /// @param pUserData - Pointer to user data passed back to callback.
151 /// @todo This should go away when we switch this to use compute threading.
152 void ProcessStoreTiles(
153 SWR_CONTEXT *pContext,
154 DRAW_CONTEXT *pDC,
155 uint32_t workerId,
156 void *pUserData)
157 {
158 RDTSC_START(FEProcessStoreTiles);
159 STORE_TILES_DESC *pStore = (STORE_TILES_DESC*)pUserData;
160 MacroTileMgr *pTileMgr = pDC->pTileMgr;
161
162 const API_STATE& state = GetApiState(pDC);
163
164 // queue a store to each macro tile
165 // compute macro tile bounds for the current render target
166 const uint32_t macroWidth = KNOB_MACROTILE_X_DIM;
167 const uint32_t macroHeight = KNOB_MACROTILE_Y_DIM;
168
169 uint32_t numMacroTilesX = ((uint32_t)state.vp[0].width + (uint32_t)state.vp[0].x + (macroWidth - 1)) / macroWidth;
170 uint32_t numMacroTilesY = ((uint32_t)state.vp[0].height + (uint32_t)state.vp[0].y + (macroHeight - 1)) / macroHeight;
171
172 // store tiles
173 BE_WORK work;
174 work.type = STORETILES;
175 work.pfnWork = ProcessStoreTileBE;
176 work.desc.storeTiles = *pStore;
177
178 for (uint32_t x = 0; x < numMacroTilesX; ++x)
179 {
180 for (uint32_t y = 0; y < numMacroTilesY; ++y)
181 {
182 pTileMgr->enqueue(x, y, &work);
183 }
184 }
185
186 RDTSC_STOP(FEProcessStoreTiles, 0, pDC->drawId);
187 }
188
189 //////////////////////////////////////////////////////////////////////////
190 /// @brief FE handler for SwrInvalidateTiles.
191 /// @param pContext - pointer to SWR context.
192 /// @param pDC - pointer to draw context.
193 /// @param workerId - thread's worker id. Even thread has a unique id.
194 /// @param pUserData - Pointer to user data passed back to callback.
195 /// @todo This should go away when we switch this to use compute threading.
void ProcessDiscardInvalidateTiles(
    SWR_CONTEXT *pContext,
    DRAW_CONTEXT *pDC,
    uint32_t workerId,
    void *pUserData)
{
    RDTSC_START(FEProcessInvalidateTiles);
    DISCARD_INVALIDATE_TILES_DESC *pInv = (DISCARD_INVALIDATE_TILES_DESC*)pUserData;
    MacroTileMgr *pTileMgr = pDC->pTileMgr;

    SWR_RECT rect;

    // An all-zero rect in the descriptor means "no rect provided";
    // any nonzero edge selects the caller-supplied rect.
    if (pInv->rect.top | pInv->rect.bottom | pInv->rect.right | pInv->rect.left)
    {
        // Valid rect
        rect = pInv->rect;
    }
    else
    {
        // Use viewport dimensions
        const API_STATE& state = GetApiState(pDC);

        rect.left = (uint32_t)state.vp[0].x;
        rect.right = (uint32_t)(state.vp[0].x + state.vp[0].width);
        rect.top = (uint32_t)state.vp[0].y;
        rect.bottom = (uint32_t)(state.vp[0].y + state.vp[0].height);
    }

    // queue a store to each macro tile
    // compute macro tile bounds for the current render target
    uint32_t macroWidth = KNOB_MACROTILE_X_DIM;
    uint32_t macroHeight = KNOB_MACROTILE_Y_DIM;

    // Setup region assuming full tiles only: round the start edges UP and
    // the end edges DOWN so partially-covered tiles are excluded.
    uint32_t macroTileStartX = (rect.left + (macroWidth - 1)) / macroWidth;
    uint32_t macroTileStartY = (rect.top + (macroHeight - 1)) / macroHeight;

    uint32_t macroTileEndX = rect.right / macroWidth;
    uint32_t macroTileEndY = rect.bottom / macroHeight;

    if (pInv->fullTilesOnly == false)
    {
        // include partial tiles: round start edges DOWN and end edges UP
        macroTileStartX = rect.left / macroWidth;
        macroTileStartY = rect.top / macroHeight;

        macroTileEndX = (rect.right + macroWidth - 1) / macroWidth;
        macroTileEndY = (rect.bottom + macroHeight - 1) / macroHeight;
    }

    // The end tile is clamped to the hot-tile array bounds; the asserts
    // flag out-of-range rects in debug builds, the clamp keeps release
    // builds from enqueueing to nonexistent tiles.
    SWR_ASSERT(macroTileEndX <= KNOB_NUM_HOT_TILES_X);
    SWR_ASSERT(macroTileEndY <= KNOB_NUM_HOT_TILES_Y);

    macroTileEndX = std::min<uint32_t>(macroTileEndX, KNOB_NUM_HOT_TILES_X);
    macroTileEndY = std::min<uint32_t>(macroTileEndY, KNOB_NUM_HOT_TILES_Y);

    // Queue one discard/invalidate work item per covered macrotile.
    // Note the end bounds are exclusive here, unlike ProcessClear.
    BE_WORK work;
    work.type = DISCARDINVALIDATETILES;
    work.pfnWork = ProcessDiscardInvalidateTilesBE;
    work.desc.discardInvalidateTiles = *pInv;

    for (uint32_t x = macroTileStartX; x < macroTileEndX; ++x)
    {
        for (uint32_t y = macroTileStartY; y < macroTileEndY; ++y)
        {
            pTileMgr->enqueue(x, y, &work);
        }
    }

    RDTSC_STOP(FEProcessInvalidateTiles, 0, pDC->drawId);
}
268
269 //////////////////////////////////////////////////////////////////////////
270 /// @brief Computes the number of primitives given the number of verts.
271 /// @param mode - primitive topology for draw operation.
272 /// @param numPrims - number of vertices or indices for draw.
273 /// @todo Frontend needs to be refactored. This will go in appropriate place then.
uint32_t GetNumPrims(
    PRIMITIVE_TOPOLOGY mode,
    uint32_t numPrims)
{
    // Note: despite the parameter name, numPrims here is a VERTEX (or index)
    // count; the return value is the resulting primitive count.
    switch (mode)
    {
    case TOP_POINT_LIST: return numPrims;
    case TOP_TRIANGLE_LIST: return numPrims / 3;
    case TOP_TRIANGLE_STRIP: return numPrims < 3 ? 0 : numPrims - 2;
    case TOP_TRIANGLE_FAN: return numPrims < 3 ? 0 : numPrims - 2;
    case TOP_TRIANGLE_DISC: return numPrims < 2 ? 0 : numPrims - 1;
    case TOP_QUAD_LIST: return numPrims / 4;
    case TOP_QUAD_STRIP: return numPrims < 4 ? 0 : (numPrims - 2) / 2;
    case TOP_LINE_STRIP: return numPrims < 2 ? 0 : numPrims - 1;
    case TOP_LINE_LIST: return numPrims / 2;
    case TOP_LINE_LOOP: return numPrims;
    case TOP_RECT_LIST: return numPrims / 3;  // each rect is specified with 3 verts
    case TOP_LINE_LIST_ADJ: return numPrims / 4;
    case TOP_LISTSTRIP_ADJ: return numPrims < 3 ? 0 : numPrims - 3;
    case TOP_TRI_LIST_ADJ: return numPrims / 6;
    case TOP_TRI_STRIP_ADJ: return numPrims < 4 ? 0 : (numPrims / 2) - 2;

    // Patch lists: control-point count per patch is encoded as the offset
    // from TOP_PATCHLIST_BASE.
    case TOP_PATCHLIST_1:
    case TOP_PATCHLIST_2:
    case TOP_PATCHLIST_3:
    case TOP_PATCHLIST_4:
    case TOP_PATCHLIST_5:
    case TOP_PATCHLIST_6:
    case TOP_PATCHLIST_7:
    case TOP_PATCHLIST_8:
    case TOP_PATCHLIST_9:
    case TOP_PATCHLIST_10:
    case TOP_PATCHLIST_11:
    case TOP_PATCHLIST_12:
    case TOP_PATCHLIST_13:
    case TOP_PATCHLIST_14:
    case TOP_PATCHLIST_15:
    case TOP_PATCHLIST_16:
    case TOP_PATCHLIST_17:
    case TOP_PATCHLIST_18:
    case TOP_PATCHLIST_19:
    case TOP_PATCHLIST_20:
    case TOP_PATCHLIST_21:
    case TOP_PATCHLIST_22:
    case TOP_PATCHLIST_23:
    case TOP_PATCHLIST_24:
    case TOP_PATCHLIST_25:
    case TOP_PATCHLIST_26:
    case TOP_PATCHLIST_27:
    case TOP_PATCHLIST_28:
    case TOP_PATCHLIST_29:
    case TOP_PATCHLIST_30:
    case TOP_PATCHLIST_31:
    case TOP_PATCHLIST_32:
        return numPrims / (mode - TOP_PATCHLIST_BASE);

    // Topologies the frontend does not support: assert in debug, 0 prims.
    case TOP_POLYGON:
    case TOP_POINT_LIST_BF:
    case TOP_LINE_STRIP_CONT:
    case TOP_LINE_STRIP_BF:
    case TOP_LINE_STRIP_CONT_BF:
    case TOP_TRIANGLE_FAN_NOSTIPPLE:
    case TOP_TRI_STRIP_REVERSE:
    case TOP_PATCHLIST_BASE:
    case TOP_UNKNOWN:
        SWR_ASSERT(false, "Unsupported topology: %d", mode);
        return 0;
    }

    return 0;
}
345
346 //////////////////////////////////////////////////////////////////////////
347 /// @brief Computes the number of verts given the number of primitives.
348 /// @param mode - primitive topology for draw operation.
349 /// @param numPrims - number of primitives for draw.
uint32_t GetNumVerts(
    PRIMITIVE_TOPOLOGY mode,
    uint32_t numPrims)
{
    // Inverse of GetNumPrims: given a primitive count, return the number of
    // vertices required to specify them for the given topology.
    switch (mode)
    {
    case TOP_POINT_LIST: return numPrims;
    case TOP_TRIANGLE_LIST: return numPrims * 3;
    case TOP_TRIANGLE_STRIP: return numPrims ? numPrims + 2 : 0;
    case TOP_TRIANGLE_FAN: return numPrims ? numPrims + 2 : 0;
    case TOP_TRIANGLE_DISC: return numPrims ? numPrims + 1 : 0;
    case TOP_QUAD_LIST: return numPrims * 4;
    case TOP_QUAD_STRIP: return numPrims ? numPrims * 2 + 2 : 0;
    case TOP_LINE_STRIP: return numPrims ? numPrims + 1 : 0;
    case TOP_LINE_LIST: return numPrims * 2;
    case TOP_LINE_LOOP: return numPrims;
    case TOP_RECT_LIST: return numPrims * 3;  // each rect is specified with 3 verts
    case TOP_LINE_LIST_ADJ: return numPrims * 4;
    case TOP_LISTSTRIP_ADJ: return numPrims ? numPrims + 3 : 0;
    case TOP_TRI_LIST_ADJ: return numPrims * 6;
    case TOP_TRI_STRIP_ADJ: return numPrims ? (numPrims + 2) * 2 : 0;

    // Patch lists: control-point count per patch is encoded as the offset
    // from TOP_PATCHLIST_BASE.
    case TOP_PATCHLIST_1:
    case TOP_PATCHLIST_2:
    case TOP_PATCHLIST_3:
    case TOP_PATCHLIST_4:
    case TOP_PATCHLIST_5:
    case TOP_PATCHLIST_6:
    case TOP_PATCHLIST_7:
    case TOP_PATCHLIST_8:
    case TOP_PATCHLIST_9:
    case TOP_PATCHLIST_10:
    case TOP_PATCHLIST_11:
    case TOP_PATCHLIST_12:
    case TOP_PATCHLIST_13:
    case TOP_PATCHLIST_14:
    case TOP_PATCHLIST_15:
    case TOP_PATCHLIST_16:
    case TOP_PATCHLIST_17:
    case TOP_PATCHLIST_18:
    case TOP_PATCHLIST_19:
    case TOP_PATCHLIST_20:
    case TOP_PATCHLIST_21:
    case TOP_PATCHLIST_22:
    case TOP_PATCHLIST_23:
    case TOP_PATCHLIST_24:
    case TOP_PATCHLIST_25:
    case TOP_PATCHLIST_26:
    case TOP_PATCHLIST_27:
    case TOP_PATCHLIST_28:
    case TOP_PATCHLIST_29:
    case TOP_PATCHLIST_30:
    case TOP_PATCHLIST_31:
    case TOP_PATCHLIST_32:
        return numPrims * (mode - TOP_PATCHLIST_BASE);

    // Topologies the frontend does not support: assert in debug, 0 verts.
    case TOP_POLYGON:
    case TOP_POINT_LIST_BF:
    case TOP_LINE_STRIP_CONT:
    case TOP_LINE_STRIP_BF:
    case TOP_LINE_STRIP_CONT_BF:
    case TOP_TRIANGLE_FAN_NOSTIPPLE:
    case TOP_TRI_STRIP_REVERSE:
    case TOP_PATCHLIST_BASE:
    case TOP_UNKNOWN:
        SWR_ASSERT(false, "Unsupported topology: %d", mode);
        return 0;
    }

    return 0;
}
421
422 //////////////////////////////////////////////////////////////////////////
423 /// @brief Return number of verts per primitive.
424 /// @param topology - topology
425 /// @param includeAdjVerts - include adjacent verts in primitive vertices
INLINE uint32_t NumVertsPerPrim(PRIMITIVE_TOPOLOGY topology, bool includeAdjVerts)
{
    // Number of vertices that make up one assembled primitive of the given
    // topology (e.g. 3 for anything triangle-based, 2 for lines).
    uint32_t numVerts = 0;
    switch (topology)
    {
    case TOP_POINT_LIST:
    case TOP_POINT_LIST_BF:
        numVerts = 1;
        break;
    case TOP_LINE_LIST:
    case TOP_LINE_STRIP:
    case TOP_LINE_LIST_ADJ:
    case TOP_LINE_LOOP:
    case TOP_LINE_STRIP_CONT:
    case TOP_LINE_STRIP_BF:
    case TOP_LISTSTRIP_ADJ:
        numVerts = 2;
        break;
    case TOP_TRIANGLE_LIST:
    case TOP_TRIANGLE_STRIP:
    case TOP_TRIANGLE_FAN:
    case TOP_TRI_LIST_ADJ:
    case TOP_TRI_STRIP_ADJ:
    case TOP_TRI_STRIP_REVERSE:
    case TOP_RECT_LIST:
        numVerts = 3;
        break;
    case TOP_QUAD_LIST:
    case TOP_QUAD_STRIP:
        numVerts = 4;
        break;
    // Patch lists: control-point count per patch is encoded as the offset
    // from TOP_PATCHLIST_BASE.
    case TOP_PATCHLIST_1:
    case TOP_PATCHLIST_2:
    case TOP_PATCHLIST_3:
    case TOP_PATCHLIST_4:
    case TOP_PATCHLIST_5:
    case TOP_PATCHLIST_6:
    case TOP_PATCHLIST_7:
    case TOP_PATCHLIST_8:
    case TOP_PATCHLIST_9:
    case TOP_PATCHLIST_10:
    case TOP_PATCHLIST_11:
    case TOP_PATCHLIST_12:
    case TOP_PATCHLIST_13:
    case TOP_PATCHLIST_14:
    case TOP_PATCHLIST_15:
    case TOP_PATCHLIST_16:
    case TOP_PATCHLIST_17:
    case TOP_PATCHLIST_18:
    case TOP_PATCHLIST_19:
    case TOP_PATCHLIST_20:
    case TOP_PATCHLIST_21:
    case TOP_PATCHLIST_22:
    case TOP_PATCHLIST_23:
    case TOP_PATCHLIST_24:
    case TOP_PATCHLIST_25:
    case TOP_PATCHLIST_26:
    case TOP_PATCHLIST_27:
    case TOP_PATCHLIST_28:
    case TOP_PATCHLIST_29:
    case TOP_PATCHLIST_30:
    case TOP_PATCHLIST_31:
    case TOP_PATCHLIST_32:
        numVerts = topology - TOP_PATCHLIST_BASE;
        break;
    default:
        SWR_ASSERT(false, "Unsupported topology: %d", topology);
        break;
    }

    // When requested, count the adjacent vertices carried alongside
    // the primitive (consumed by the geometry shader).
    if (includeAdjVerts)
    {
        switch (topology)
        {
        case TOP_LISTSTRIP_ADJ:
        case TOP_LINE_LIST_ADJ: numVerts = 4; break;
        case TOP_TRI_STRIP_ADJ:
        case TOP_TRI_LIST_ADJ: numVerts = 6; break;
        default: break;
        }
    }

    return numVerts;
}
510
511 //////////////////////////////////////////////////////////////////////////
512 /// @brief Generate mask from remaining work.
513 /// @param numWorkItems - Number of items being worked on by a SIMD.
514 static INLINE simdscalari GenerateMask(uint32_t numItemsRemaining)
515 {
516 uint32_t numActive = (numItemsRemaining >= KNOB_SIMD_WIDTH) ? KNOB_SIMD_WIDTH : numItemsRemaining;
517 uint32_t mask = (numActive > 0) ? ((1 << numActive) - 1) : 0;
518 return _simd_castps_si(vMask(mask));
519 }
520
521 //////////////////////////////////////////////////////////////////////////
522 /// @brief StreamOut - Streams vertex data out to SO buffers.
523 /// Generally, we are only streaming out a SIMDs worth of triangles.
524 /// @param pDC - pointer to draw context.
525 /// @param workerId - thread's worker id. Even thread has a unique id.
526 /// @param numPrims - Number of prims to streamout (e.g. points, lines, tris)
static void StreamOut(
    DRAW_CONTEXT* pDC,
    PA_STATE& pa,
    uint32_t workerId,
    uint32_t* pPrimData,
    uint32_t streamIndex)
{
    RDTSC_START(FEStreamout);

    SWR_CONTEXT* pContext = pDC->pContext;

    const API_STATE& state = GetApiState(pDC);
    const SWR_STREAMOUT_STATE &soState = state.soState;

    // SO writes whole primitives, without adjacency verts.
    uint32_t soVertsPerPrim = NumVertsPerPrim(pa.binTopology, false);

    // The pPrimData buffer is sparse in that we allocate memory for all 32 attributes for each vertex.
    uint32_t primDataDwordVertexStride = (KNOB_NUM_ATTRIBUTES * sizeof(float) * 4) / sizeof(uint32_t);

    SWR_STREAMOUT_CONTEXT soContext = { 0 };

    // Setup buffer state pointers.
    for (uint32_t i = 0; i < 4; ++i)
    {
        soContext.pBuffer[i] = &state.soBuffer[i];
    }

    // For each assembled primitive: gather the enabled attributes into the
    // sparse pPrimData staging buffer, then invoke the jitted SO shader.
    uint32_t numPrims = pa.NumPrims();
    for (uint32_t primIndex = 0; primIndex < numPrims; ++primIndex)
    {
        DWORD slot = 0;
        // Bitmask of attribute slots enabled for this stream.
        uint32_t soMask = soState.streamMasks[streamIndex];

        // Write all entries into primitive data buffer for SOS.
        while (_BitScanForward(&slot, soMask))
        {
            __m128 attrib[MAX_NUM_VERTS_PER_PRIM]; // prim attribs (always 4 wide)
            uint32_t paSlot = slot + VERTEX_ATTRIB_START_SLOT;
            pa.AssembleSingle(paSlot, primIndex, attrib);

            // Dword offset of this attribute from the start of a vertex in
            // pPrimData (4 floats per attribute slot).
            // NOTE(review): an older comment here described a (slot - 1)
            // adjustment, but the code indexes by slot directly — the staging
            // buffer is sparse over all attribute slots.
            uint32_t primDataAttribOffset = slot * sizeof(float) * 4 / sizeof(uint32_t);

            // Store each vertex's attrib at appropriate locations in pPrimData buffer.
            for (uint32_t v = 0; v < soVertsPerPrim; ++v)
            {
                uint32_t* pPrimDataAttrib = pPrimData + primDataAttribOffset + (v * primDataDwordVertexStride);

                _mm_store_ps((float*)pPrimDataAttrib, attrib[v]);
            }
            // Clear the processed slot so _BitScanForward advances.
            soMask &= ~(1 << slot);
        }

        // Update pPrimData pointer
        soContext.pPrimData = pPrimData;

        // Call SOS
        SWR_ASSERT(state.pfnSoFunc[streamIndex] != nullptr, "Trying to execute uninitialized streamout jit function.");
        state.pfnSoFunc[streamIndex](soContext);
    }

    // Update SO write offset. The driver provides memory for the update.
    for (uint32_t i = 0; i < 4; ++i)
    {
        if (state.soBuffer[i].pWriteOffset)
        {
            *state.soBuffer[i].pWriteOffset = soContext.pBuffer[i]->streamOffset * sizeof(uint32_t);

            // The SOS increments the existing write offset. So we don't want to increment
            // the SoWriteOffset stat using an absolute offset instead of relative.
            SET_STAT(SoWriteOffset[i], soContext.pBuffer[i]->streamOffset);
        }
    }

    UPDATE_STAT(SoPrimStorageNeeded[streamIndex], soContext.numPrimStorageNeeded);
    UPDATE_STAT(SoNumPrimsWritten[streamIndex], soContext.numPrimsWritten);

    RDTSC_STOP(FEStreamout, 1, 0);
}
609
610 //////////////////////////////////////////////////////////////////////////
611 /// @brief Computes number of invocations. The current index represents
612 /// the start of the SIMD. The max index represents how much work
613 /// items are remaining. If there is less then a SIMD's left of work
614 /// then return the remaining amount of work.
615 /// @param curIndex - The start index for the SIMD.
616 /// @param maxIndex - The last index for all work items.
617 static INLINE uint32_t GetNumInvocations(
618 uint32_t curIndex,
619 uint32_t maxIndex)
620 {
621 uint32_t remainder = (maxIndex - curIndex);
622 return (remainder >= KNOB_SIMD_WIDTH) ? KNOB_SIMD_WIDTH : remainder;
623 }
624
625 //////////////////////////////////////////////////////////////////////////
626 /// @brief Converts a streamId buffer to a cut buffer for the given stream id.
627 /// The geometry shader will loop over each active streamout buffer, assembling
628 /// primitives for the downstream stages. When multistream output is enabled,
629 /// the generated stream ID buffer from the GS needs to be converted to a cut
630 /// buffer for the primitive assembler.
631 /// @param stream - stream id to generate the cut buffer for
632 /// @param pStreamIdBase - pointer to the stream ID buffer
633 /// @param numEmittedVerts - Number of total verts emitted by the GS
634 /// @param pCutBuffer - output buffer to write cuts to
635 void ProcessStreamIdBuffer(uint32_t stream, uint8_t* pStreamIdBase, uint32_t numEmittedVerts, uint8_t *pCutBuffer)
636 {
637 SWR_ASSERT(stream < MAX_SO_STREAMS);
638
639 uint32_t numInputBytes = (numEmittedVerts * 2 + 7) / 8;
640 uint32_t numOutputBytes = std::max(numInputBytes / 2, 1U);
641
642 for (uint32_t b = 0; b < numOutputBytes; ++b)
643 {
644 uint8_t curInputByte = pStreamIdBase[2*b];
645 uint8_t outByte = 0;
646 for (uint32_t i = 0; i < 4; ++i)
647 {
648 if ((curInputByte & 0x3) != stream)
649 {
650 outByte |= (1 << i);
651 }
652 curInputByte >>= 2;
653 }
654
655 curInputByte = pStreamIdBase[2 * b + 1];
656 for (uint32_t i = 0; i < 4; ++i)
657 {
658 if ((curInputByte & 0x3) != stream)
659 {
660 outByte |= (1 << (i + 4));
661 }
662 curInputByte >>= 2;
663 }
664
665 *pCutBuffer++ = outByte;
666 }
667 }
668
669 THREAD SWR_GS_CONTEXT tlsGsContext;
670
671 //////////////////////////////////////////////////////////////////////////
672 /// @brief Implements GS stage.
673 /// @param pDC - pointer to draw context.
674 /// @param workerId - thread's worker id. Even thread has a unique id.
675 /// @param pa - The primitive assembly object.
676 /// @param pGsOut - output stream for GS
template <
    typename HasStreamOutT,
    typename HasRastT>
static void GeometryShaderStage(
    DRAW_CONTEXT *pDC,
    uint32_t workerId,
    PA_STATE& pa,
    void* pGsOut,
    void* pCutBuffer,
    void* pStreamCutBuffer,
    uint32_t* pSoPrimData,
    simdscalari primID)
{
    RDTSC_START(FEGeometryShader);

    SWR_CONTEXT* pContext = pDC->pContext;

    const API_STATE& state = GetApiState(pDC);
    const SWR_GS_STATE* pState = &state.gsState;

    SWR_ASSERT(pGsOut != nullptr, "GS output buffer should be initialized");
    SWR_ASSERT(pCutBuffer != nullptr, "GS output cut buffer should be initialized");

    // Point the thread-local GS context at the per-draw output buffers.
    tlsGsContext.pStream = (uint8_t*)pGsOut;
    tlsGsContext.pCutOrStreamIdBuffer = (uint8_t*)pCutBuffer;
    tlsGsContext.PrimitiveID = primID;

    // GS input includes adjacency verts where the topology carries them.
    uint32_t numVertsPerPrim = NumVertsPerPrim(pa.binTopology, true);
    simdvector attrib[MAX_ATTRIBUTES];

    // assemble all attributes for the input primitive
    for (uint32_t slot = 0; slot < pState->numInputAttribs; ++slot)
    {
        uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + slot;
        pa.Assemble(attribSlot, attrib);

        for (uint32_t i = 0; i < numVertsPerPrim; ++i)
        {
            tlsGsContext.vert[i].attrib[attribSlot] = attrib[i];
        }
    }

    // assemble position
    pa.Assemble(VERTEX_POSITION_SLOT, attrib);
    for (uint32_t i = 0; i < numVertsPerPrim; ++i)
    {
        tlsGsContext.vert[i].attrib[VERTEX_POSITION_SLOT] = attrib[i];
    }

    // Output buffer layout: per input prim, per GS instance. These strides
    // must match the allocation math in AllocateGsBuffers.
    const uint32_t vertexStride = sizeof(simdvertex);
    const uint32_t numSimdBatches = (state.gsState.maxNumVerts + KNOB_SIMD_WIDTH - 1) / KNOB_SIMD_WIDTH;
    const uint32_t inputPrimStride = numSimdBatches * vertexStride;
    const uint32_t instanceStride = inputPrimStride * KNOB_SIMD_WIDTH;
    uint32_t cutPrimStride;
    uint32_t cutInstanceStride;

    // Single stream: 1 cut bit per vertex. Multi-stream: 2 stream ID bits
    // per vertex, translated to cut bits later via ProcessStreamIdBuffer.
    if (pState->isSingleStream)
    {
        cutPrimStride = (state.gsState.maxNumVerts + 7) / 8;
        cutInstanceStride = cutPrimStride * KNOB_SIMD_WIDTH;
    }
    else
    {
        cutPrimStride = AlignUp(state.gsState.maxNumVerts * 2 / 8, 4);
        cutInstanceStride = cutPrimStride * KNOB_SIMD_WIDTH;
    }

    // record valid prims from the frontend to avoid over binning the newly generated
    // prims from the GS
    uint32_t numInputPrims = pa.NumPrims();

    // Execute the GS once per instance; each execution consumes the same
    // input prims and appends to its own slice of the output buffers.
    for (uint32_t instance = 0; instance < pState->instanceCount; ++instance)
    {
        tlsGsContext.InstanceID = instance;
        tlsGsContext.mask = GenerateMask(numInputPrims);

        // execute the geometry shader
        state.pfnGsFunc(GetPrivateState(pDC), &tlsGsContext);

        tlsGsContext.pStream += instanceStride;
        tlsGsContext.pCutOrStreamIdBuffer += cutInstanceStride;
    }

    // set up new binner and state for the GS output topology
    PFN_PROCESS_PRIMS pfnClipFunc = nullptr;
    if (HasRastT::value)
    {
        switch (pState->outputTopology)
        {
        case TOP_TRIANGLE_STRIP: pfnClipFunc = ClipTriangles; break;
        case TOP_LINE_STRIP: pfnClipFunc = ClipLines; break;
        case TOP_POINT_LIST: pfnClipFunc = ClipPoints; break;
        default: SWR_ASSERT(false, "Unexpected GS output topology: %d", pState->outputTopology);
        }
    }

    // foreach input prim:
    // - setup a new PA based on the emitted verts for that prim
    // - loop over the new verts, calling PA to assemble each prim
    // NOTE(review): tlsGsContext.vertexCount is whatever the LAST instance's
    // GS execution left behind, but it is read for every instance below —
    // this assumes all instances emit the same vertex count per prim; confirm.
    uint32_t* pVertexCount = (uint32_t*)&tlsGsContext.vertexCount;
    uint32_t* pPrimitiveId = (uint32_t*)&primID;

    uint32_t totalPrimsGenerated = 0;
    for (uint32_t inputPrim = 0; inputPrim < numInputPrims; ++inputPrim)
    {
        uint8_t* pInstanceBase = (uint8_t*)pGsOut + inputPrim * inputPrimStride;
        uint8_t* pCutBufferBase = (uint8_t*)pCutBuffer + inputPrim * cutPrimStride;
        for (uint32_t instance = 0; instance < pState->instanceCount; ++instance)
        {
            uint32_t numEmittedVerts = pVertexCount[inputPrim];
            if (numEmittedVerts == 0)
            {
                continue;
            }

            uint8_t* pBase = pInstanceBase + instance * instanceStride;
            uint8_t* pCutBase = pCutBufferBase + instance * cutInstanceStride;

            // Attribute count = index of the highest bit set in the FE
            // attribute mask, plus one; zero if no attributes are enabled.
            DWORD numAttribs;
            if (_BitScanReverse(&numAttribs, state.feAttribMask))
            {
                numAttribs++;
            }
            else
            {
                numAttribs = 0;
            }

            for (uint32_t stream = 0; stream < MAX_SO_STREAMS; ++stream)
            {
                bool processCutVerts = false;

                // NOTE: shadows the function parameter pCutBuffer for the
                // remainder of this stream iteration.
                uint8_t* pCutBuffer = pCutBase;

                // assign default stream ID, only relevant when GS is outputting a single stream
                uint32_t streamID = 0;
                if (pState->isSingleStream)
                {
                    processCutVerts = true;
                    streamID = pState->singleStreamID;
                    if (streamID != stream) continue;
                }
                else
                {
                    // early exit if this stream is not enabled for streamout
                    if (HasStreamOutT::value && !state.soState.streamEnable[stream])
                    {
                        continue;
                    }

                    // multi-stream output, need to translate StreamID buffer to a cut buffer
                    ProcessStreamIdBuffer(stream, pCutBase, numEmittedVerts, (uint8_t*)pStreamCutBuffer);
                    pCutBuffer = (uint8_t*)pStreamCutBuffer;
                    processCutVerts = false;
                }

                PA_STATE_CUT gsPa(pDC, pBase, numEmittedVerts, pCutBuffer, numEmittedVerts, numAttribs, pState->outputTopology, processCutVerts);

                while (gsPa.GetNextStreamOutput())
                {
                    do
                    {
                        bool assemble = gsPa.Assemble(VERTEX_POSITION_SLOT, attrib);

                        if (assemble)
                        {
                            totalPrimsGenerated += gsPa.NumPrims();

                            if (HasStreamOutT::value)
                            {
                                StreamOut(pDC, gsPa, workerId, pSoPrimData, stream);
                            }

                            // Only one stream feeds the rasterizer.
                            if (HasRastT::value && state.soState.streamToRasterizer == stream)
                            {
                                simdscalari vPrimId;
                                // pull primitiveID from the GS output if available
                                if (state.gsState.emitsPrimitiveID)
                                {
                                    simdvector primIdAttrib[3];
                                    gsPa.Assemble(VERTEX_PRIMID_SLOT, primIdAttrib);
                                    vPrimId = _simd_castps_si(primIdAttrib[0].x);
                                }
                                else
                                {
                                    // Fall back to the input prim's ID for
                                    // every generated prim.
                                    vPrimId = _simd_set1_epi32(pPrimitiveId[inputPrim]);
                                }

                                pfnClipFunc(pDC, gsPa, workerId, attrib, GenMask(gsPa.NumPrims()), vPrimId);
                            }
                        }
                    } while (gsPa.NextPrim());
                }
            }
        }
    }

    // update GS pipeline stats
    UPDATE_STAT(GsInvocations, numInputPrims * pState->instanceCount);
    UPDATE_STAT(GsPrimitives, totalPrimsGenerated);

    RDTSC_STOP(FEGeometryShader, 1, 0);
}
880
881 //////////////////////////////////////////////////////////////////////////
882 /// @brief Allocate GS buffers
883 /// @param pDC - pointer to draw context.
884 /// @param state - API state
885 /// @param ppGsOut - pointer to GS output buffer allocation
886 /// @param ppCutBuffer - pointer to GS output cut buffer allocation
static INLINE void AllocateGsBuffers(DRAW_CONTEXT* pDC, const API_STATE& state, void** ppGsOut, void** ppCutBuffer,
    void **ppStreamCutBuffer)
{
    auto pArena = pDC->pArena;
    SWR_ASSERT(pArena != nullptr);
    SWR_ASSERT(state.gsState.gsEnable);
    // allocate arena space to hold GS output verts
    // @todo pack attribs
    // @todo support multiple streams
    // These strides must match the layout computed in GeometryShaderStage:
    // [instance][input prim (SIMD lane)][simd batch of verts].
    const uint32_t vertexStride = sizeof(simdvertex);
    const uint32_t numSimdBatches = (state.gsState.maxNumVerts + KNOB_SIMD_WIDTH - 1) / KNOB_SIMD_WIDTH;
    uint32_t size = state.gsState.instanceCount * numSimdBatches * vertexStride * KNOB_SIMD_WIDTH;
    *ppGsOut = pArena->AllocAligned(size, KNOB_SIMD_WIDTH * sizeof(float));

    // Cut buffer: 1 bit per vertex. StreamID buffer: 2 bits per vertex,
    // aligned up to 4 bytes (mirrors cutPrimStride in GeometryShaderStage).
    const uint32_t cutPrimStride = (state.gsState.maxNumVerts + 7) / 8;
    const uint32_t streamIdPrimStride = AlignUp(state.gsState.maxNumVerts * 2 / 8, 4);
    const uint32_t cutBufferSize = cutPrimStride * state.gsState.instanceCount * KNOB_SIMD_WIDTH;
    const uint32_t streamIdSize = streamIdPrimStride * state.gsState.instanceCount * KNOB_SIMD_WIDTH;

    // allocate arena space to hold cut or streamid buffer, which is essentially a bitfield sized to the
    // maximum vertex output as defined by the GS state, per SIMD lane, per GS instance

    // allocate space for temporary per-stream cut buffer if multi-stream is enabled
    if (state.gsState.isSingleStream)
    {
        *ppCutBuffer = pArena->AllocAligned(cutBufferSize, KNOB_SIMD_WIDTH * sizeof(float));
        *ppStreamCutBuffer = nullptr;
    }
    else
    {
        // Multi-stream: GS writes stream IDs, and a scratch cut buffer is
        // needed for the per-stream translation (ProcessStreamIdBuffer).
        *ppCutBuffer = pArena->AllocAligned(streamIdSize, KNOB_SIMD_WIDTH * sizeof(float));
        *ppStreamCutBuffer = pArena->AllocAligned(cutBufferSize, KNOB_SIMD_WIDTH * sizeof(float));
    }

}
922
//////////////////////////////////////////////////////////////////////////
/// @brief Contains all data generated by the HS and passed to the
/// tessellator and DS.
struct TessellationThreadLocalData
{
    SWR_HS_CONTEXT hsContext;               // hull shader input/output context
    ScalarPatch patchData[KNOB_SIMD_WIDTH]; // HS control-point output, one patch per SIMD lane
    void* pTxCtx;                           // backing storage for the tessellator context (TSInitCtx)
    size_t tsCtxSize;                       // size in bytes of pTxCtx

    simdscalar* pDSOutput;                  // domain shader output buffer, grown on demand
    size_t numDSOutputVectors;              // current capacity of pDSOutput, in output vectors
};

// Per-worker-thread tessellation scratch data; lazily allocated by
// AllocateTessellationData and reused across draws.
THREAD TessellationThreadLocalData* gt_pTessellationThreadData = nullptr;
938
939 //////////////////////////////////////////////////////////////////////////
940 /// @brief Allocate tessellation data for this worker thread.
941 INLINE
942 static void AllocateTessellationData(SWR_CONTEXT* pContext)
943 {
944 /// @TODO - Don't use thread local storage. Use Worker local storage instead.
945 if (gt_pTessellationThreadData == nullptr)
946 {
947 gt_pTessellationThreadData = (TessellationThreadLocalData*)
948 _aligned_malloc(sizeof(TessellationThreadLocalData), 64);
949 memset(gt_pTessellationThreadData, 0, sizeof(*gt_pTessellationThreadData));
950 }
951 }
952
//////////////////////////////////////////////////////////////////////////
/// @brief Implements Tessellation Stages.
/// @param pDC - pointer to draw context.
/// @param workerId - thread's worker id. Even thread has a unique id.
/// @param pa - The primitive assembly object.
/// @param pGsOut - output stream for GS
/// @param pCutBuffer - GS cut buffer (single-stream) or streamID buffer
/// @param pCutStreamBuffer - temporary per-stream GS cut buffer (multi-stream)
/// @param pSoPrimData - scratch space for stream-out primitive data
/// @param primID - SIMD of input primitive IDs, one per patch lane
template <
    typename HasGeometryShaderT,
    typename HasStreamOutT,
    typename HasRastT>
static void TessellationStages(
    DRAW_CONTEXT *pDC,
    uint32_t workerId,
    PA_STATE& pa,
    void* pGsOut,
    void* pCutBuffer,
    void* pCutStreamBuffer,
    uint32_t* pSoPrimData,
    simdscalari primID)
{
    const API_STATE& state = GetApiState(pDC);
    const SWR_TS_STATE& tsState = state.tsState;
    SWR_CONTEXT *pContext = pDC->pContext; // Needed for UPDATE_STATS macro

    SWR_ASSERT(gt_pTessellationThreadData);

    // Initialize the tessellator context in the per-thread scratch buffer.
    // On the first call TSInitCtx returns nullptr (presumably reporting the
    // required size via tsCtxSize -- confirm against the tessellator API),
    // so allocate the backing storage and retry once.
    HANDLE tsCtx = TSInitCtx(
        tsState.domain,
        tsState.partitioning,
        tsState.tsOutputTopology,
        gt_pTessellationThreadData->pTxCtx,
        gt_pTessellationThreadData->tsCtxSize);
    if (tsCtx == nullptr)
    {
        gt_pTessellationThreadData->pTxCtx = _aligned_malloc(gt_pTessellationThreadData->tsCtxSize, 64);
        tsCtx = TSInitCtx(
            tsState.domain,
            tsState.partitioning,
            tsState.tsOutputTopology,
            gt_pTessellationThreadData->pTxCtx,
            gt_pTessellationThreadData->tsCtxSize);
    }
    SWR_ASSERT(tsCtx);

    // Select the binner matching the post-DS output topology, only needed
    // when rasterization is enabled for this pipeline configuration.
    PFN_PROCESS_PRIMS pfnClipFunc = nullptr;
    if (HasRastT::value)
    {
        switch (tsState.postDSTopology)
        {
        case TOP_TRIANGLE_LIST: pfnClipFunc = ClipTriangles; break;
        case TOP_LINE_LIST: pfnClipFunc = ClipLines; break;
        case TOP_POINT_LIST: pfnClipFunc = ClipPoints; break;
        default: SWR_ASSERT(false, "Unexpected DS output topology: %d", tsState.postDSTopology);
        }
    }

    SWR_HS_CONTEXT& hsContext = gt_pTessellationThreadData->hsContext;
    hsContext.pCPout = gt_pTessellationThreadData->patchData;
    hsContext.PrimitiveID = primID;

    uint32_t numVertsPerPrim = NumVertsPerPrim(pa.binTopology, false);
    // Max storage for one attribute for an entire simdprimitive
    simdvector simdattrib[MAX_NUM_VERTS_PER_PRIM];

    // assemble all attributes for the input primitives
    for (uint32_t slot = 0; slot < tsState.numHsInputAttribs; ++slot)
    {
        uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + slot;
        pa.Assemble(attribSlot, simdattrib);

        for (uint32_t i = 0; i < numVertsPerPrim; ++i)
        {
            hsContext.vert[i].attrib[attribSlot] = simdattrib[i];
        }
    }

#if defined(_DEBUG)
    // Poison the HS output so stale control-point reads stand out in debug.
    memset(hsContext.pCPout, 0x90, sizeof(ScalarPatch) * KNOB_SIMD_WIDTH);
#endif

    uint32_t numPrims = pa.NumPrims();
    hsContext.mask = GenerateMask(numPrims);

    // Run the HS
    RDTSC_START(FEHullShader);
    state.pfnHsFunc(GetPrivateState(pDC), &hsContext);
    RDTSC_STOP(FEHullShader, 0, 0);

    UPDATE_STAT(HsInvocations, numPrims);

    const uint32_t* pPrimId = (const uint32_t*)&primID;

    // Tessellate, domain-shade, and emit each patch (SIMD lane) separately.
    for (uint32_t p = 0; p < numPrims; ++p)
    {
        // Run Tessellator
        SWR_TS_TESSELLATED_DATA tsData = { 0 };
        RDTSC_START(FETessellation);
        TSTessellate(tsCtx, hsContext.pCPout[p].tessFactors, tsData);
        RDTSC_STOP(FETessellation, 0, 0);

        // Patch fully culled by the tessellator (e.g. zero tess factors).
        if (tsData.NumPrimitives == 0)
        {
            continue;
        }
        SWR_ASSERT(tsData.NumDomainPoints);

        // Allocate DS Output memory, growing the cached per-thread buffer
        // only when the current patch needs more than previously allocated.
        uint32_t requiredDSVectorInvocations = AlignUp(tsData.NumDomainPoints, KNOB_SIMD_WIDTH) / KNOB_SIMD_WIDTH;
        size_t requiredDSOutputVectors = requiredDSVectorInvocations * tsState.numDsOutputAttribs;
        // NOTE(review): allocation is sized in simdvector units while the
        // buffer is addressed as simdscalar* -- presumably each DS output
        // attrib spans a full 4-component simdvector; confirm vs. DS backend.
        size_t requiredAllocSize = sizeof(simdvector) * requiredDSOutputVectors;
        if (requiredDSOutputVectors > gt_pTessellationThreadData->numDSOutputVectors)
        {
            _aligned_free(gt_pTessellationThreadData->pDSOutput);
            gt_pTessellationThreadData->pDSOutput = (simdscalar*)_aligned_malloc(requiredAllocSize, 64);
            gt_pTessellationThreadData->numDSOutputVectors = requiredDSOutputVectors;
        }
        SWR_ASSERT(gt_pTessellationThreadData->pDSOutput);
        SWR_ASSERT(gt_pTessellationThreadData->numDSOutputVectors >= requiredDSOutputVectors);

#if defined(_DEBUG)
        memset(gt_pTessellationThreadData->pDSOutput, 0x90, requiredAllocSize);
#endif

        // Run Domain Shader over all domain points, one SIMD batch at a time.
        SWR_DS_CONTEXT dsContext;
        dsContext.PrimitiveID = pPrimId[p];
        dsContext.pCpIn = &hsContext.pCPout[p];
        dsContext.pDomainU = (simdscalar*)tsData.pDomainPointsU;
        dsContext.pDomainV = (simdscalar*)tsData.pDomainPointsV;
        dsContext.pOutputData = gt_pTessellationThreadData->pDSOutput;
        dsContext.vectorStride = requiredDSVectorInvocations;

        uint32_t dsInvocations = 0;

        for (dsContext.vectorOffset = 0; dsContext.vectorOffset < requiredDSVectorInvocations; ++dsContext.vectorOffset)
        {
            // Mask off lanes past the final domain point in the last batch.
            dsContext.mask = GenerateMask(tsData.NumDomainPoints - dsInvocations);

            RDTSC_START(FEDomainShader);
            state.pfnDsFunc(GetPrivateState(pDC), &dsContext);
            RDTSC_STOP(FEDomainShader, 0, 0);

            dsInvocations += KNOB_SIMD_WIDTH;
        }
        UPDATE_STAT(DsInvocations, tsData.NumDomainPoints);

        // Assemble the tessellator's output primitives from DS results and
        // feed them to the downstream GS / stream-out / binning stages.
        PA_TESS tessPa(
            pDC,
            dsContext.pOutputData,
            dsContext.vectorStride,
            tsState.numDsOutputAttribs,
            tsData.ppIndices,
            tsData.NumPrimitives,
            tsState.postDSTopology);

        while (tessPa.HasWork())
        {
            if (HasGeometryShaderT::value)
            {
                GeometryShaderStage<HasStreamOutT, HasRastT>(
                    pDC, workerId, tessPa, pGsOut, pCutBuffer, pCutStreamBuffer, pSoPrimData,
                    _simd_set1_epi32(dsContext.PrimitiveID));
            }
            else
            {
                if (HasStreamOutT::value)
                {
                    StreamOut(pDC, tessPa, workerId, pSoPrimData, 0);
                }

                if (HasRastT::value)
                {
                    simdvector prim[3]; // Only deal with triangles, lines, or points
                    RDTSC_START(FEPAAssemble);
#if SWR_ENABLE_ASSERTS
                    bool assemble =
#endif
                        tessPa.Assemble(VERTEX_POSITION_SLOT, prim);
                    RDTSC_STOP(FEPAAssemble, 1, 0);
                    SWR_ASSERT(assemble);

                    SWR_ASSERT(pfnClipFunc);
                    pfnClipFunc(pDC, tessPa, workerId, prim,
                        GenMask(tessPa.NumPrims()), _simd_set1_epi32(dsContext.PrimitiveID));
                }
            }

            tessPa.NextPrim();

        } // while (tessPa.HasWork())
    } // for (uint32_t p = 0; p < numPrims; ++p)

    TSDestroyCtx(tsCtx);
}
1147
//////////////////////////////////////////////////////////////////////////
/// @brief FE handler for SwrDraw.
/// @tparam IsIndexedT - Is indexed drawing enabled
/// @tparam HasTessellationT - Is tessellation enabled
/// @tparam HasGeometryShaderT::value - Is the geometry shader stage enabled
/// @tparam HasStreamOutT - Is stream-out enabled
/// @tparam HasRastT - Is rasterization enabled
/// @param pContext - pointer to SWR context.
/// @param pDC - pointer to draw context.
/// @param workerId - thread's worker id.
/// @param pUserData - Pointer to DRAW_WORK
template <
    typename IsIndexedT,
    typename HasTessellationT,
    typename HasGeometryShaderT,
    typename HasStreamOutT,
    typename HasRastT>
void ProcessDraw(
    SWR_CONTEXT *pContext,
    DRAW_CONTEXT *pDC,
    uint32_t workerId,
    void *pUserData)
{

#if KNOB_ENABLE_TOSS_POINTS
    if (KNOB_TOSS_QUEUE_FE)
    {
        return;
    }
#endif

    RDTSC_START(FEProcessDraw);

    DRAW_WORK& work = *(DRAW_WORK*)pUserData;
    const API_STATE& state = GetApiState(pDC);
    // Lane offsets 0..7 used to synthesize sequential vertex indices for
    // non-indexed draws.
    __m256i vScale = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
    SWR_VS_CONTEXT vsContext;
    simdvertex vin;

    int indexSize = 0;
    uint32_t endVertex = work.numVerts;

    // For indexed draws, compute the address one past the last index the
    // draw will consume, for each supported index format.
    const int32_t* pLastRequestedIndex = nullptr;
    if (IsIndexedT::value)
    {
        switch (work.type)
        {
        case R32_UINT:
            indexSize = sizeof(uint32_t);
            pLastRequestedIndex = &(work.pIB[endVertex]);
            break;
        case R16_UINT:
            indexSize = sizeof(uint16_t);
            // nasty address offset to last index
            pLastRequestedIndex = (int32_t*)(&(((uint16_t*)work.pIB)[endVertex]));
            break;
        case R8_UINT:
            indexSize = sizeof(uint8_t);
            // nasty address offset to last index
            pLastRequestedIndex = (int32_t*)(&(((uint8_t*)work.pIB)[endVertex]));
            break;
        default:
            SWR_ASSERT(0);
        }
    }
    else
    {
        // No cuts, prune partial primitives.
        endVertex = GetNumVerts(state.topology, GetNumPrims(state.topology, work.numVerts));
    }

    SWR_FETCH_CONTEXT fetchInfo = { 0 };
    fetchInfo.pStreams = &state.vertexBuffers[0];
    fetchInfo.StartInstance = work.startInstance;
    fetchInfo.StartVertex = 0;

    vsContext.pVin = &vin;

    if (IsIndexedT::value)
    {
        fetchInfo.BaseVertex = work.baseVertex;

        // if the entire index buffer isn't being consumed, set the last index
        // so that fetches < a SIMD wide will be masked off
        fetchInfo.pLastIndex = (const int32_t*)(((uint8_t*)state.indexBuffer.pIndices) + state.indexBuffer.size);
        if (pLastRequestedIndex < fetchInfo.pLastIndex)
        {
            fetchInfo.pLastIndex = pLastRequestedIndex;
        }
    }
    else
    {
        fetchInfo.StartVertex = work.startVertex;
    }

#ifdef KNOB_ENABLE_RDTSC
    // Only referenced by RDTSC_STOP below; presumably compiles out with
    // the rest of the RDTSC machinery when the knob is disabled.
    uint32_t numPrims = GetNumPrims(state.topology, work.numVerts);
#endif

    // Allocate GS scratch buffers up front when the GS stage is enabled.
    void* pGsOut = nullptr;
    void* pCutBuffer = nullptr;
    void* pStreamCutBuffer = nullptr;
    if (HasGeometryShaderT::value)
    {
        AllocateGsBuffers(pDC, state, &pGsOut, &pCutBuffer, &pStreamCutBuffer);
    }

    if (HasTessellationT::value)
    {
        SWR_ASSERT(state.tsState.tsEnable == true);
        SWR_ASSERT(state.pfnHsFunc != nullptr);
        SWR_ASSERT(state.pfnDsFunc != nullptr);

        AllocateTessellationData(pContext);
    }
    else
    {
        SWR_ASSERT(state.tsState.tsEnable == false);
        SWR_ASSERT(state.pfnHsFunc == nullptr);
        SWR_ASSERT(state.pfnDsFunc == nullptr);
    }

    // allocate space for streamout input prim data
    uint32_t* pSoPrimData = nullptr;
    if (HasStreamOutT::value)
    {
        pSoPrimData = (uint32_t*)pDC->pArena->AllocAligned(4096, 16);

        // update the SO write-offset stats from the current streamout
        // buffer offsets
        for (uint32_t i = 0; i < 4; ++i)
        {
            SET_STAT(SoWriteOffset[i], state.soBuffer[i].streamOffset);
        }

    }

    // choose primitive assembler
    PA_FACTORY<IsIndexedT> paFactory(pDC, state.topology, work.numVerts);
    PA_STATE& pa = paFactory.GetPA();

    /// @todo: temporarily move instance loop in the FE to ensure SO ordering
    for (uint32_t instanceNum = 0; instanceNum < work.numInstances; instanceNum++)
    {
        simdscalari vIndex;
        uint32_t i = 0;

        if (IsIndexedT::value)
        {
            fetchInfo.pIndices = work.pIB;
        }
        else
        {
            // Non-indexed: fabricate sequential indices starting at the
            // draw's start vertex ID.
            vIndex = _simd_add_epi32(_simd_set1_epi32(work.startVertexID), vScale);
            fetchInfo.pIndices = (const int32_t*)&vIndex;
        }

        fetchInfo.CurInstance = instanceNum;
        vsContext.InstanceID = instanceNum;

        while (pa.HasWork())
        {
            // PaGetNextVsOutput currently has the side effect of updating some PA state machine state.
            // So we need to keep this outside of (i < endVertex) check.
            simdmask* pvCutIndices = nullptr;
            if (IsIndexedT::value)
            {
                pvCutIndices = &pa.GetNextVsIndices();
            }

            simdvertex& vout = pa.GetNextVsOutput();
            vsContext.pVout = &vout;

            if (i < endVertex)
            {

                // 1. Execute FS/VS for a single SIMD.
                RDTSC_START(FEFetchShader);
                state.pfnFetchFunc(fetchInfo, vin);
                RDTSC_STOP(FEFetchShader, 0, 0);

                // forward fetch generated vertex IDs to the vertex shader
                vsContext.VertexID = fetchInfo.VertexID;

                // Setup active mask for vertex shader.
                vsContext.mask = GenerateMask(endVertex - i);

                // forward cut mask to the PA
                if (IsIndexedT::value)
                {
                    *pvCutIndices = _simd_movemask_ps(_simd_castsi_ps(fetchInfo.CutMask));
                }

                UPDATE_STAT(IaVertices, GetNumInvocations(i, endVertex));

#if KNOB_ENABLE_TOSS_POINTS
                if (!KNOB_TOSS_FETCH)
#endif
                {
                    RDTSC_START(FEVertexShader);
                    state.pfnVertexFunc(GetPrivateState(pDC), &vsContext);
                    RDTSC_STOP(FEVertexShader, 0, 0);

                    UPDATE_STAT(VsInvocations, GetNumInvocations(i, endVertex));
                }
            }

            // 2. Assemble primitives given the last two SIMD.
            do
            {
                simdvector prim[MAX_NUM_VERTS_PER_PRIM];
                // PaAssemble returns false if there is not enough verts to assemble.
                RDTSC_START(FEPAAssemble);
                bool assemble = pa.Assemble(VERTEX_POSITION_SLOT, prim);
                RDTSC_STOP(FEPAAssemble, 1, 0);

#if KNOB_ENABLE_TOSS_POINTS
                if (!KNOB_TOSS_FETCH)
#endif
                {
#if KNOB_ENABLE_TOSS_POINTS
                    if (!KNOB_TOSS_VS)
#endif
                    {
                        if (assemble)
                        {
                            UPDATE_STAT(IaPrimitives, pa.NumPrims());

                            // Dispatch to the first enabled downstream stage;
                            // each stage forwards to the ones after it.
                            if (HasTessellationT::value)
                            {
                                TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>(
                                    pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, pa.GetPrimID(work.startPrimID));
                            }
                            else if (HasGeometryShaderT::value)
                            {
                                GeometryShaderStage<HasStreamOutT, HasRastT>(
                                    pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, pa.GetPrimID(work.startPrimID));
                            }
                            else
                            {
                                // If streamout is enabled then stream vertices out to memory.
                                if (HasStreamOutT::value)
                                {
                                    StreamOut(pDC, pa, workerId, pSoPrimData, 0);
                                }

                                if (HasRastT::value)
                                {
                                    SWR_ASSERT(pDC->pState->pfnProcessPrims);
                                    pDC->pState->pfnProcessPrims(pDC, pa, workerId, prim,
                                        GenMask(pa.NumPrims()), pa.GetPrimID(work.startPrimID));
                                }
                            }
                        }
                    }
                }
            } while (pa.NextPrim());

            // Advance the fetch cursor by one SIMD's worth of vertices.
            i += KNOB_SIMD_WIDTH;
            if (IsIndexedT::value)
            {
                fetchInfo.pIndices = (int*)((uint8_t*)fetchInfo.pIndices + KNOB_SIMD_WIDTH * indexSize);
            }
            else
            {
                vIndex = _simd_add_epi32(vIndex, _simd_set1_epi32(KNOB_SIMD_WIDTH));
            }
        }
        pa.Reset();
    }

    RDTSC_STOP(FEProcessDraw, numPrims * work.numInstances, pDC->drawId);
}
1420
1421 struct FEDrawChooser
1422 {
1423 typedef PFN_FE_WORK_FUNC FuncType;
1424
1425 template <typename... ArgsB>
1426 static FuncType GetFunc()
1427 {
1428 return ProcessDraw<ArgsB...>;
1429 }
1430 };
1431
1432
// Selector for correct templated Draw front-end function: unrolls the
// runtime bools into the matching ProcessDraw template instantiation
// via TemplateArgUnroller / FEDrawChooser.
PFN_FE_WORK_FUNC GetProcessDrawFunc(
    bool IsIndexed,
    bool HasTessellation,
    bool HasGeometryShader,
    bool HasStreamOut,
    bool HasRasterization)
{
    return TemplateArgUnroller<FEDrawChooser>::GetFunc(IsIndexed, HasTessellation, HasGeometryShader, HasStreamOut, HasRasterization);
}
1443
1444
//////////////////////////////////////////////////////////////////////////
/// @brief Processes attributes for the backend based on linkage mask and
///        linkage map. Essentially just doing an SOA->AOS conversion and pack.
/// @param pDC - Draw context
/// @param pa - Primitive Assembly state
/// @param linkageMask - Specifies which VS outputs are routed to PS.
/// @param pLinkageMap - maps VS attribute slot to PS slot
/// @param triIndex - Triangle to process attributes for
/// @param pBuffer - Output result; receives 3 verts x 4 floats per linked
///        attribute, in linkage-mask bit order
template<uint32_t NumVerts>
INLINE void ProcessAttributes(
    DRAW_CONTEXT *pDC,
    PA_STATE&pa,
    uint32_t linkageMask,
    const uint8_t* pLinkageMap,
    uint32_t triIndex,
    float *pBuffer)
{
    DWORD slot = 0;
    uint32_t mapIdx = 0;
    LONG constantInterpMask = pDC->pState->state.backendState.constantInterpolationMask;
    const uint32_t provokingVertex = pDC->pState->state.frontendState.topologyProvokingVertex;

    // Walk the set bits of linkageMask, one linked attribute per bit.
    while (_BitScanForward(&slot, linkageMask))
    {
        linkageMask &= ~(1 << slot); // done with this bit.

        // compute absolute slot in vertex attrib array
        uint32_t inputSlot = VERTEX_ATTRIB_START_SLOT + pLinkageMap[mapIdx];

        __m128 attrib[3]; // triangle attribs (always 4 wide)
        pa.AssembleSingle(inputSlot, triIndex, attrib);

        if (_bittest(&constantInterpMask, mapIdx))
        {
            // Flat-shaded attribute: replicate the provoking vertex's value
            // to all verts.
            for (uint32_t i = 0; i < NumVerts; ++i)
            {
                _mm_store_ps(pBuffer, attrib[provokingVertex]);
                pBuffer += 4;
            }
        }
        else
        {
            for (uint32_t i = 0; i < NumVerts; ++i)
            {
                _mm_store_ps(pBuffer, attrib[i]);
                pBuffer += 4;
            }
        }

        // pad out the attrib buffer to 3 verts to ensure the triangle
        // interpolation code in the pixel shader works correctly for the
        // 3 topologies - point, line, tri. This effectively zeros out the
        // effect of the missing vertices in the triangle interpolation.
        for (uint32_t i = NumVerts; i < 3; ++i)
        {
            _mm_store_ps(pBuffer, attrib[NumVerts - 1]);
            pBuffer += 4;
        }

        mapIdx++;
    }
}
1508
1509 //////////////////////////////////////////////////////////////////////////
1510 /// @brief Processes enabled user clip distances. Loads the active clip
1511 /// distances from the PA, sets up barycentric equations, and
1512 /// stores the results to the output buffer
1513 /// @param pa - Primitive Assembly state
1514 /// @param primIndex - primitive index to process
1515 /// @param clipDistMask - mask of enabled clip distances
1516 /// @param pUserClipBuffer - buffer to store results
1517 template<uint32_t NumVerts>
1518 void ProcessUserClipDist(PA_STATE& pa, uint32_t primIndex, uint8_t clipDistMask, float* pUserClipBuffer)
1519 {
1520 DWORD clipDist;
1521 while (_BitScanForward(&clipDist, clipDistMask))
1522 {
1523 clipDistMask &= ~(1 << clipDist);
1524 uint32_t clipSlot = clipDist >> 2;
1525 uint32_t clipComp = clipDist & 0x3;
1526 uint32_t clipAttribSlot = clipSlot == 0 ?
1527 VERTEX_CLIPCULL_DIST_LO_SLOT : VERTEX_CLIPCULL_DIST_HI_SLOT;
1528
1529 __m128 primClipDist[3];
1530 pa.AssembleSingle(clipAttribSlot, primIndex, primClipDist);
1531
1532 float vertClipDist[NumVerts];
1533 for (uint32_t e = 0; e < NumVerts; ++e)
1534 {
1535 OSALIGNSIMD(float) aVertClipDist[4];
1536 _mm_store_ps(aVertClipDist, primClipDist[e]);
1537 vertClipDist[e] = aVertClipDist[clipComp];
1538 };
1539
1540 // setup plane equations for barycentric interpolation in the backend
1541 float baryCoeff[NumVerts];
1542 for (uint32_t e = 0; e < NumVerts - 1; ++e)
1543 {
1544 baryCoeff[e] = vertClipDist[e] - vertClipDist[NumVerts - 1];
1545 }
1546 baryCoeff[NumVerts - 1] = vertClipDist[NumVerts - 1];
1547
1548 for (uint32_t e = 0; e < NumVerts; ++e)
1549 {
1550 *(pUserClipBuffer++) = baryCoeff[e];
1551 }
1552 }
1553 }
1554
//////////////////////////////////////////////////////////////////////////
/// @brief Bin triangle primitives to macro tiles. Performs setup, clipping
///        culling, viewport transform, etc.
/// @param pDC - pointer to draw context.
/// @param pa - The primitive assembly object.
/// @param workerId - thread's worker id. Even thread has a unique id.
/// @param tri - Contains triangle position data for SIMDs worth of triangles.
/// @param triMask - mask of valid triangle lanes in tri/primID
/// @param primID - Primitive ID for each triangle.
void BinTriangles(
    DRAW_CONTEXT *pDC,
    PA_STATE& pa,
    uint32_t workerId,
    simdvector tri[3],
    uint32_t triMask,
    simdscalari primID)
{
    RDTSC_START(FEBinTriangles);

    const API_STATE& state = GetApiState(pDC);
    const SWR_RASTSTATE& rastState = state.rastState;
    const SWR_FRONTEND_STATE& feState = state.frontendState;
    const SWR_GS_STATE& gsState = state.gsState;

    // Simple wireframe mode for debugging purposes only

    simdscalar vRecipW0 = _simd_set1_ps(1.0f);
    simdscalar vRecipW1 = _simd_set1_ps(1.0f);
    simdscalar vRecipW2 = _simd_set1_ps(1.0f);

    if (!feState.vpTransformDisable)
    {
        // perspective divide
        vRecipW0 = _simd_div_ps(_simd_set1_ps(1.0f), tri[0].w);
        vRecipW1 = _simd_div_ps(_simd_set1_ps(1.0f), tri[1].w);
        vRecipW2 = _simd_div_ps(_simd_set1_ps(1.0f), tri[2].w);

        tri[0].v[0] = _simd_mul_ps(tri[0].v[0], vRecipW0);
        tri[1].v[0] = _simd_mul_ps(tri[1].v[0], vRecipW1);
        tri[2].v[0] = _simd_mul_ps(tri[2].v[0], vRecipW2);

        tri[0].v[1] = _simd_mul_ps(tri[0].v[1], vRecipW0);
        tri[1].v[1] = _simd_mul_ps(tri[1].v[1], vRecipW1);
        tri[2].v[1] = _simd_mul_ps(tri[2].v[1], vRecipW2);

        tri[0].v[2] = _simd_mul_ps(tri[0].v[2], vRecipW0);
        tri[1].v[2] = _simd_mul_ps(tri[1].v[2], vRecipW1);
        tri[2].v[2] = _simd_mul_ps(tri[2].v[2], vRecipW2);

        // viewport transform to screen coords
        viewportTransform<3>(tri, state.vpMatrix[0]);
    }

    // adjust for pixel center location
    simdscalar offset = g_pixelOffsets[rastState.pixelLocation];
    tri[0].x = _simd_add_ps(tri[0].x, offset);
    tri[0].y = _simd_add_ps(tri[0].y, offset);

    tri[1].x = _simd_add_ps(tri[1].x, offset);
    tri[1].y = _simd_add_ps(tri[1].y, offset);

    tri[2].x = _simd_add_ps(tri[2].x, offset);
    tri[2].y = _simd_add_ps(tri[2].y, offset);

    // convert to fixed point
    simdscalari vXi[3], vYi[3];
    vXi[0] = fpToFixedPointVertical(tri[0].x);
    vYi[0] = fpToFixedPointVertical(tri[0].y);
    vXi[1] = fpToFixedPointVertical(tri[1].x);
    vYi[1] = fpToFixedPointVertical(tri[1].y);
    vXi[2] = fpToFixedPointVertical(tri[2].x);
    vYi[2] = fpToFixedPointVertical(tri[2].y);

    // triangle setup
    simdscalari vAi[3], vBi[3];
    triangleSetupABIntVertical(vXi, vYi, vAi, vBi);

    // determinant
    simdscalari vDet[2];
    calcDeterminantIntVertical(vAi, vBi, vDet);

    // cull zero area: the 64-bit determinant compares produce one mask bit
    // per double lane, so low and high halves are combined below.
    int maskLo = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpeq_epi64(vDet[0], _simd_setzero_si())));
    int maskHi = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpeq_epi64(vDet[1], _simd_setzero_si())));

    int cullZeroAreaMask = maskLo | ((maskHi << KNOB_SIMD_WIDTH / 2));

    uint32_t origTriMask = triMask;
    triMask &= ~cullZeroAreaMask;

    // determine front winding tris
    // CW +det
    // CCW -det
    maskLo = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpgt_epi64(vDet[0], _simd_setzero_si())));
    maskHi = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpgt_epi64(vDet[1], _simd_setzero_si())));
    int cwTriMask = maskLo | (maskHi << (KNOB_SIMD_WIDTH /2) );

    uint32_t frontWindingTris;
    if (rastState.frontWinding == SWR_FRONTWINDING_CW)
    {
        frontWindingTris = cwTriMask;
    }
    else
    {
        frontWindingTris = ~cwTriMask;
    }

    // cull
    uint32_t cullTris;
    switch ((SWR_CULLMODE)rastState.cullMode)
    {
    case SWR_CULLMODE_BOTH: cullTris = 0xffffffff; break;
    case SWR_CULLMODE_NONE: cullTris = 0x0; break;
    case SWR_CULLMODE_FRONT: cullTris = frontWindingTris; break;
    case SWR_CULLMODE_BACK: cullTris = ~frontWindingTris; break;
    default: SWR_ASSERT(false, "Invalid cull mode: %d", rastState.cullMode); cullTris = 0x0; break;
    }

    triMask &= ~cullTris;

    if (origTriMask ^ triMask)
    {
        RDTSC_EVENT(FECullZeroAreaAndBackface, _mm_popcnt_u32(origTriMask ^ triMask), 0);
    }

    // compute per tri backface
    uint32_t frontFaceMask = frontWindingTris;

    uint32_t *pPrimID = (uint32_t *)&primID;
    DWORD triIndex = 0;

    // everything culled -- nothing left to bin
    if (!triMask)
    {
        goto endBinTriangles;
    }

    // Calc bounding box of triangles
    simdBBox bbox;
    calcBoundingBoxIntVertical(vXi, vYi, bbox);

    // determine if triangle falls between pixel centers and discard
    // only discard for non-MSAA case
    // (left + 127) & ~255
    // (right + 128) & ~255

    if(rastState.sampleCount == SWR_MULTISAMPLE_1X)
    {
        origTriMask = triMask;

        // A triangle whose bbox rounds to the same pixel-center column
        // (or row) on both sides covers no sample points -- cull it.
        int cullCenterMask;
        {
            simdscalari left = _simd_add_epi32(bbox.left, _simd_set1_epi32(127));
            left = _simd_and_si(left, _simd_set1_epi32(~255));
            simdscalari right = _simd_add_epi32(bbox.right, _simd_set1_epi32(128));
            right = _simd_and_si(right, _simd_set1_epi32(~255));

            simdscalari vMaskH = _simd_cmpeq_epi32(left, right);

            simdscalari top = _simd_add_epi32(bbox.top, _simd_set1_epi32(127));
            top = _simd_and_si(top, _simd_set1_epi32(~255));
            simdscalari bottom = _simd_add_epi32(bbox.bottom, _simd_set1_epi32(128));
            bottom = _simd_and_si(bottom, _simd_set1_epi32(~255));

            simdscalari vMaskV = _simd_cmpeq_epi32(top, bottom);
            vMaskV = _simd_or_si(vMaskH, vMaskV);
            cullCenterMask = _simd_movemask_ps(_simd_castsi_ps(vMaskV));
        }

        triMask &= ~cullCenterMask;

        if(origTriMask ^ triMask)
        {
            RDTSC_EVENT(FECullBetweenCenters, _mm_popcnt_u32(origTriMask ^ triMask), 0);
        }
    }

    // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since right/bottom edge is exclusive.
    bbox.left = _simd_max_epi32(bbox.left, _simd_set1_epi32(state.scissorInFixedPoint.left));
    bbox.top = _simd_max_epi32(bbox.top, _simd_set1_epi32(state.scissorInFixedPoint.top));
    bbox.right = _simd_min_epi32(_simd_sub_epi32(bbox.right, _simd_set1_epi32(1)), _simd_set1_epi32(state.scissorInFixedPoint.right));
    bbox.bottom = _simd_min_epi32(_simd_sub_epi32(bbox.bottom, _simd_set1_epi32(1)), _simd_set1_epi32(state.scissorInFixedPoint.bottom));

    // Cull tris completely outside scissor
    {
        simdscalari maskOutsideScissorX = _simd_cmpgt_epi32(bbox.left, bbox.right);
        simdscalari maskOutsideScissorY = _simd_cmpgt_epi32(bbox.top, bbox.bottom);
        simdscalari maskOutsideScissorXY = _simd_or_si(maskOutsideScissorX, maskOutsideScissorY);
        uint32_t maskOutsideScissor = _simd_movemask_ps(_simd_castsi_ps(maskOutsideScissorXY));
        triMask = triMask & ~maskOutsideScissor;
    }

    if (!triMask)
    {
        goto endBinTriangles;
    }

    // Convert triangle bbox to macrotile units.
    bbox.left = _simd_srai_epi32(bbox.left, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
    bbox.top = _simd_srai_epi32(bbox.top, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
    bbox.right = _simd_srai_epi32(bbox.right, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
    bbox.bottom = _simd_srai_epi32(bbox.bottom, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);

    // Spill per-triangle macrotile extents to scalar arrays for the
    // per-triangle binning loop below.
    OSALIGNSIMD(uint32_t) aMTLeft[KNOB_SIMD_WIDTH], aMTRight[KNOB_SIMD_WIDTH], aMTTop[KNOB_SIMD_WIDTH], aMTBottom[KNOB_SIMD_WIDTH];
    _simd_store_si((simdscalari*)aMTLeft, bbox.left);
    _simd_store_si((simdscalari*)aMTRight, bbox.right);
    _simd_store_si((simdscalari*)aMTTop, bbox.top);
    _simd_store_si((simdscalari*)aMTBottom, bbox.bottom);

    // transpose verts needed for backend
    /// @todo modify BE to take non-transformed verts
    __m128 vHorizX[8], vHorizY[8], vHorizZ[8], vHorizW[8];
    vTranspose3x8(vHorizX, tri[0].x, tri[1].x, tri[2].x);
    vTranspose3x8(vHorizY, tri[0].y, tri[1].y, tri[2].y);
    vTranspose3x8(vHorizZ, tri[0].z, tri[1].z, tri[2].z);
    vTranspose3x8(vHorizW, vRecipW0, vRecipW1, vRecipW2);

    // store render target array index
    OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH];
    if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex)
    {
        simdvector vRtai[3];
        pa.Assemble(VERTEX_RTAI_SLOT, vRtai);
        simdscalari vRtaii;
        vRtaii = _simd_castps_si(vRtai[0].x);
        _simd_store_si((simdscalari*)aRTAI, vRtaii);
    }
    else
    {
        _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si());
    }

    // scan remaining valid triangles and bin each separately
    while (_BitScanForward(&triIndex, triMask))
    {
        uint32_t linkageCount = state.linkageCount;
        uint32_t linkageMask = state.linkageMask;
        uint32_t numScalarAttribs = linkageCount * 4;

        BE_WORK work;
        work.type = DRAW;

        TRIANGLE_WORK_DESC &desc = work.desc.tri;

        desc.triFlags.frontFacing = state.forceFront ? 1 : ((frontFaceMask >> triIndex) & 1);
        desc.triFlags.primID = pPrimID[triIndex];
        desc.triFlags.renderTargetArrayIndex = aRTAI[triIndex];

        if(rastState.samplePattern == SWR_MSAA_STANDARD_PATTERN)
        {
            work.pfnWork = gRasterizerTable[rastState.scissorEnable][rastState.sampleCount];
        }
        else
        {
            // for center sample pattern, all samples are at pixel center; calculate coverage
            // once at center and broadcast the results in the backend
            work.pfnWork = gRasterizerTable[rastState.scissorEnable][SWR_MULTISAMPLE_1X];
        }

        auto pArena = pDC->pArena;
        SWR_ASSERT(pArena != nullptr);

        // store active attribs
        float *pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
        desc.pAttribs = pAttribs;
        desc.numAttribs = linkageCount;
        ProcessAttributes<3>(pDC, pa, linkageMask, state.linkageMap, triIndex, desc.pAttribs);

        // store triangle vertex data (x[3], y[3], z[3], 1/w[3] -- one
        // 4-float row per component from the transpose above)
        desc.pTriBuffer = (float*)pArena->AllocAligned(4 * 4 * sizeof(float), 16);

        _mm_store_ps(&desc.pTriBuffer[0], vHorizX[triIndex]);
        _mm_store_ps(&desc.pTriBuffer[4], vHorizY[triIndex]);
        _mm_store_ps(&desc.pTriBuffer[8], vHorizZ[triIndex]);
        _mm_store_ps(&desc.pTriBuffer[12], vHorizW[triIndex]);

        // store user clip distances
        if (rastState.clipDistanceMask)
        {
            uint32_t numClipDist = _mm_popcnt_u32(rastState.clipDistanceMask);
            desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 3 * sizeof(float));
            ProcessUserClipDist<3>(pa, triIndex, rastState.clipDistanceMask, desc.pUserClipBuffer);
        }

        // enqueue the triangle into every macrotile its bbox touches
        MacroTileMgr *pTileMgr = pDC->pTileMgr;
        for (uint32_t y = aMTTop[triIndex]; y <= aMTBottom[triIndex]; ++y)
        {
            for (uint32_t x = aMTLeft[triIndex]; x <= aMTRight[triIndex]; ++x)
            {
#if KNOB_ENABLE_TOSS_POINTS
                if (!KNOB_TOSS_SETUP_TRIS)
#endif
                {
                    pTileMgr->enqueue(x, y, &work);
                }
            }
        }

        triMask &= ~(1 << triIndex);
    }

endBinTriangles:
    RDTSC_STOP(FEBinTriangles, 1, 0);
}
1857
1858
1859
//////////////////////////////////////////////////////////////////////////
/// @brief Bin SIMD points to the backend.
///        Takes two paths: "simple" points (1-pixel, rasterized directly
///        from packed tile-relative coords) and general points (bloated
///        to a bbox by point size and binned to every overlapped macrotile).
/// @param pDC - pointer to draw context.
/// @param pa - The primitive assembly object.
/// @param workerId - thread's worker id. Even thread has a unique id.
/// @param prim - Contains point position data for SIMDs worth of points
///               (only prim[0] is used; points have a single vertex).
/// @param primMask - bitmask of valid SIMD lanes; cleared lanes are culled.
/// @param primID - Primitive ID for each point.
void BinPoints(
    DRAW_CONTEXT *pDC,
    PA_STATE& pa,
    uint32_t workerId,
    simdvector prim[3],
    uint32_t primMask,
    simdscalari primID)
{
    RDTSC_START(FEBinPoints);

    // points only have one vertex; alias it for readability
    simdvector& primVerts = prim[0];

    const API_STATE& state = GetApiState(pDC);
    const SWR_FRONTEND_STATE& feState = state.frontendState;
    const SWR_GS_STATE& gsState = state.gsState;
    const SWR_RASTSTATE& rastState = state.rastState;

    if (!feState.vpTransformDisable)
    {
        // perspective divide
        simdscalar vRecipW0 = _simd_div_ps(_simd_set1_ps(1.0f), primVerts.w);
        primVerts.x = _simd_mul_ps(primVerts.x, vRecipW0);
        primVerts.y = _simd_mul_ps(primVerts.y, vRecipW0);
        primVerts.z = _simd_mul_ps(primVerts.z, vRecipW0);

        // viewport transform to screen coords
        viewportTransform<1>(&primVerts, state.vpMatrix[0]);
    }

    // adjust for pixel center location (UL corner vs center sample position)
    simdscalar offset = g_pixelOffsets[rastState.pixelLocation];
    primVerts.x = _simd_add_ps(primVerts.x, offset);
    primVerts.y = _simd_add_ps(primVerts.y, offset);

    // convert screen-space positions to x.8 fixed point
    simdscalari vXi, vYi;
    vXi = fpToFixedPointVertical(primVerts.x);
    vYi = fpToFixedPointVertical(primVerts.y);

    if (CanUseSimplePoints(pDC))
    {
        // adjust for top-left rule
        vXi = _simd_sub_epi32(vXi, _simd_set1_epi32(1));
        vYi = _simd_sub_epi32(vYi, _simd_set1_epi32(1));

        // cull points off the top-left edge of the viewport: after the
        // adjustment above a negative coord has its sign bit set, which
        // movemask gathers per lane
        primMask &= ~_simd_movemask_ps(_simd_castsi_ps(vXi));
        primMask &= ~_simd_movemask_ps(_simd_castsi_ps(vYi));

        // compute macro tile coordinates
        simdscalari macroX = _simd_srai_epi32(vXi, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
        simdscalari macroY = _simd_srai_epi32(vYi, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);

        OSALIGNSIMD(uint32_t) aMacroX[KNOB_SIMD_WIDTH], aMacroY[KNOB_SIMD_WIDTH];
        _simd_store_si((simdscalari*)aMacroX, macroX);
        _simd_store_si((simdscalari*)aMacroY, macroY);

        // compute raster tile coordinates
        simdscalari rasterX = _simd_srai_epi32(vXi, KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT);
        simdscalari rasterY = _simd_srai_epi32(vYi, KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT);

        // compute raster tile relative x,y for coverage mask
        simdscalari tileAlignedX = _simd_slli_epi32(rasterX, KNOB_TILE_X_DIM_SHIFT);
        simdscalari tileAlignedY = _simd_slli_epi32(rasterY, KNOB_TILE_Y_DIM_SHIFT);

        // pixel coord minus tile origin = position within the raster tile
        simdscalari tileRelativeX = _simd_sub_epi32(_simd_srai_epi32(vXi, FIXED_POINT_SHIFT), tileAlignedX);
        simdscalari tileRelativeY = _simd_sub_epi32(_simd_srai_epi32(vYi, FIXED_POINT_SHIFT), tileAlignedY);

        OSALIGNSIMD(uint32_t) aTileRelativeX[KNOB_SIMD_WIDTH];
        OSALIGNSIMD(uint32_t) aTileRelativeY[KNOB_SIMD_WIDTH];
        _simd_store_si((simdscalari*)aTileRelativeX, tileRelativeX);
        _simd_store_si((simdscalari*)aTileRelativeY, tileRelativeY);

        OSALIGNSIMD(uint32_t) aTileAlignedX[KNOB_SIMD_WIDTH];
        OSALIGNSIMD(uint32_t) aTileAlignedY[KNOB_SIMD_WIDTH];
        _simd_store_si((simdscalari*)aTileAlignedX, tileAlignedX);
        _simd_store_si((simdscalari*)aTileAlignedY, tileAlignedY);

        OSALIGNSIMD(float) aZ[KNOB_SIMD_WIDTH];
        _simd_store_ps((float*)aZ, primVerts.z);

        // store render target array index
        OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH];
        if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex)
        {
            // NOTE(review): other call sites in this file pass a 2-element
            // simdvector array to Assemble(); a single element appears
            // sufficient here since points assemble one vertex — confirm
            // against PA_STATE::Assemble's contract.
            simdvector vRtai;
            pa.Assemble(VERTEX_RTAI_SLOT, &vRtai);
            simdscalari vRtaii = _simd_castps_si(vRtai.x);
            _simd_store_si((simdscalari*)aRTAI, vRtaii);
        }
        else
        {
            _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si());
        }

        uint32_t *pPrimID = (uint32_t *)&primID;
        DWORD primIndex = 0;
        // scan remaining valid points and bin each separately
        while (_BitScanForward(&primIndex, primMask))
        {
            uint32_t linkageCount = state.linkageCount;
            uint32_t linkageMask = state.linkageMask;

            // each linked attribute is a 4-component vector
            uint32_t numScalarAttribs = linkageCount * 4;

            BE_WORK work;
            work.type = DRAW;

            TRIANGLE_WORK_DESC &desc = work.desc.tri;

            // points are always front facing
            desc.triFlags.frontFacing = 1;
            desc.triFlags.primID = pPrimID[primIndex];
            desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];

            work.pfnWork = RasterizeSimplePoint;

            auto pArena = pDC->pArena;
            SWR_ASSERT(pArena != nullptr);

            // store attributes (3 verts worth; BE consumes triangle layout)
            float *pAttribs = (float*)pArena->AllocAligned(3 * numScalarAttribs * sizeof(float), 16);
            desc.pAttribs = pAttribs;
            desc.numAttribs = linkageCount;

            ProcessAttributes<1>(pDC, pa, linkageMask, state.linkageMap, primIndex, pAttribs);

            // store raster tile aligned x, y, perspective correct z
            float *pTriBuffer = (float*)pArena->AllocAligned(4 * sizeof(float), 16);
            desc.pTriBuffer = pTriBuffer;
            *(uint32_t*)pTriBuffer++ = aTileAlignedX[primIndex];
            *(uint32_t*)pTriBuffer++ = aTileAlignedY[primIndex];
            *pTriBuffer = aZ[primIndex];

            uint32_t tX = aTileRelativeX[primIndex];
            uint32_t tY = aTileRelativeY[primIndex];

            // pack the relative x,y into the coverageMask, the rasterizer will
            // generate the true coverage mask from it
            // (y in the upper nibble — assumes tile-relative coords fit in
            //  4 bits each; TODO confirm tile dims guarantee this)
            work.desc.tri.triFlags.coverageMask = tX | (tY << 4);

            // bin it: a 1-pixel point touches exactly one macrotile
            MacroTileMgr *pTileMgr = pDC->pTileMgr;
#if KNOB_ENABLE_TOSS_POINTS
            if (!KNOB_TOSS_SETUP_TRIS)
#endif
            {
                pTileMgr->enqueue(aMacroX[primIndex], aMacroY[primIndex], &work);
            }
            primMask &= ~(1 << primIndex);
        }
    }
    else
    {
        // non simple points need to be potentially binned to multiple macro tiles
        simdscalar vPointSize;
        if (rastState.pointParam)
        {
            // per-vertex point size from the point-size vertex slot
            simdvector size[3];
            pa.Assemble(VERTEX_POINT_SIZE_SLOT, size);
            vPointSize = size[0].x;
        }
        else
        {
            // constant point size from raster state
            vPointSize = _simd_set1_ps(rastState.pointSize);
        }

        // bloat point to bbox: start with a degenerate box at the point
        // center, then expand by half the point size in all directions
        simdBBox bbox;
        bbox.left = bbox.right = vXi;
        bbox.top = bbox.bottom = vYi;

        simdscalar vHalfWidth = _simd_mul_ps(vPointSize, _simd_set1_ps(0.5f));
        simdscalari vHalfWidthi = fpToFixedPointVertical(vHalfWidth);
        bbox.left = _simd_sub_epi32(bbox.left, vHalfWidthi);
        bbox.right = _simd_add_epi32(bbox.right, vHalfWidthi);
        bbox.top = _simd_sub_epi32(bbox.top, vHalfWidthi);
        bbox.bottom = _simd_add_epi32(bbox.bottom, vHalfWidthi);

        // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since right/bottom edge is exclusive.
        bbox.left = _simd_max_epi32(bbox.left, _simd_set1_epi32(state.scissorInFixedPoint.left));
        bbox.top = _simd_max_epi32(bbox.top, _simd_set1_epi32(state.scissorInFixedPoint.top));
        bbox.right = _simd_min_epi32(_simd_sub_epi32(bbox.right, _simd_set1_epi32(1)), _simd_set1_epi32(state.scissorInFixedPoint.right));
        bbox.bottom = _simd_min_epi32(_simd_sub_epi32(bbox.bottom, _simd_set1_epi32(1)), _simd_set1_epi32(state.scissorInFixedPoint.bottom));

        // Cull bloated points completely outside scissor
        // (box inverted by the clamp above => fully clipped away)
        simdscalari maskOutsideScissorX = _simd_cmpgt_epi32(bbox.left, bbox.right);
        simdscalari maskOutsideScissorY = _simd_cmpgt_epi32(bbox.top, bbox.bottom);
        simdscalari maskOutsideScissorXY = _simd_or_si(maskOutsideScissorX, maskOutsideScissorY);
        uint32_t maskOutsideScissor = _simd_movemask_ps(_simd_castsi_ps(maskOutsideScissorXY));
        primMask = primMask & ~maskOutsideScissor;

        // Convert bbox to macrotile units.
        bbox.left = _simd_srai_epi32(bbox.left, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
        bbox.top = _simd_srai_epi32(bbox.top, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
        bbox.right = _simd_srai_epi32(bbox.right, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
        bbox.bottom = _simd_srai_epi32(bbox.bottom, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);

        OSALIGNSIMD(uint32_t) aMTLeft[KNOB_SIMD_WIDTH], aMTRight[KNOB_SIMD_WIDTH], aMTTop[KNOB_SIMD_WIDTH], aMTBottom[KNOB_SIMD_WIDTH];
        _simd_store_si((simdscalari*)aMTLeft, bbox.left);
        _simd_store_si((simdscalari*)aMTRight, bbox.right);
        _simd_store_si((simdscalari*)aMTTop, bbox.top);
        _simd_store_si((simdscalari*)aMTBottom, bbox.bottom);

        // store render target array index
        OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH];
        if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex)
        {
            simdvector vRtai[2];
            pa.Assemble(VERTEX_RTAI_SLOT, vRtai);
            simdscalari vRtaii = _simd_castps_si(vRtai[0].x);
            _simd_store_si((simdscalari*)aRTAI, vRtaii);
        }
        else
        {
            _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si());
        }

        OSALIGNSIMD(float) aPointSize[KNOB_SIMD_WIDTH];
        _simd_store_ps((float*)aPointSize, vPointSize);

        uint32_t *pPrimID = (uint32_t *)&primID;

        // spill transformed positions to scalar arrays for per-prim access
        OSALIGNSIMD(float) aPrimVertsX[KNOB_SIMD_WIDTH];
        OSALIGNSIMD(float) aPrimVertsY[KNOB_SIMD_WIDTH];
        OSALIGNSIMD(float) aPrimVertsZ[KNOB_SIMD_WIDTH];

        _simd_store_ps((float*)aPrimVertsX, primVerts.x);
        _simd_store_ps((float*)aPrimVertsY, primVerts.y);
        _simd_store_ps((float*)aPrimVertsZ, primVerts.z);

        // scan remaining valid prims and bin each separately
        DWORD primIndex;
        while (_BitScanForward(&primIndex, primMask))
        {
            uint32_t linkageCount = state.linkageCount;
            uint32_t linkageMask = state.linkageMask;
            uint32_t numScalarAttribs = linkageCount * 4;

            BE_WORK work;
            work.type = DRAW;

            TRIANGLE_WORK_DESC &desc = work.desc.tri;

            // points are always front facing
            desc.triFlags.frontFacing = 1;
            desc.triFlags.primID = pPrimID[primIndex];
            desc.triFlags.pointSize = aPointSize[primIndex];
            desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];

            work.pfnWork = RasterizeTriPoint;

            auto pArena = pDC->pArena;
            SWR_ASSERT(pArena != nullptr);

            // store active attribs
            desc.pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
            desc.numAttribs = linkageCount;
            ProcessAttributes<1>(pDC, pa, linkageMask, state.linkageMap, primIndex, desc.pAttribs);

            // store point vertex data: center x, y, z
            float *pTriBuffer = (float*)pArena->AllocAligned(4 * sizeof(float), 16);
            desc.pTriBuffer = pTriBuffer;
            *pTriBuffer++ = aPrimVertsX[primIndex];
            *pTriBuffer++ = aPrimVertsY[primIndex];
            *pTriBuffer = aPrimVertsZ[primIndex];

            // store user clip distances
            if (rastState.clipDistanceMask)
            {
                uint32_t numClipDist = _mm_popcnt_u32(rastState.clipDistanceMask);
                desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 2 * sizeof(float));
                ProcessUserClipDist<2>(pa, primIndex, rastState.clipDistanceMask, desc.pUserClipBuffer);
            }

            // bin to every macrotile the bloated bbox overlaps
            MacroTileMgr *pTileMgr = pDC->pTileMgr;
            for (uint32_t y = aMTTop[primIndex]; y <= aMTBottom[primIndex]; ++y)
            {
                for (uint32_t x = aMTLeft[primIndex]; x <= aMTRight[primIndex]; ++x)
                {
#if KNOB_ENABLE_TOSS_POINTS
                    if (!KNOB_TOSS_SETUP_TRIS)
#endif
                    {
                        pTileMgr->enqueue(x, y, &work);
                    }
                }
            }

            primMask &= ~(1 << primIndex);
        }
    }




    RDTSC_STOP(FEBinPoints, 1, 0);
}
2164
//////////////////////////////////////////////////////////////////////////
/// @brief Bin SIMD lines to the backend.
///        Transforms both endpoints, computes a per-lane bbox bloated by
///        line width along the minor axis only, culls zero-length and
///        fully-scissored lines, then bins each surviving line to all
///        macrotiles its bbox overlaps.
/// @param pDC - pointer to draw context.
/// @param pa - The primitive assembly object.
/// @param workerId - thread's worker id. Even thread has a unique id.
/// @param prim - Contains line position data for SIMDs worth of lines
///               (prim[0] = start vertices, prim[1] = end vertices).
/// @param primMask - bitmask of valid SIMD lanes; cleared lanes are culled.
/// @param primID - Primitive ID for each line.
void BinLines(
    DRAW_CONTEXT *pDC,
    PA_STATE& pa,
    uint32_t workerId,
    simdvector prim[],
    uint32_t primMask,
    simdscalari primID)
{
    RDTSC_START(FEBinLines);

    const API_STATE& state = GetApiState(pDC);
    const SWR_RASTSTATE& rastState = state.rastState;
    const SWR_FRONTEND_STATE& feState = state.frontendState;
    const SWR_GS_STATE& gsState = state.gsState;

    // default 1/w when viewport transform is disabled (verts already in
    // screen space, no perspective divide needed)
    simdscalar vRecipW0 = _simd_set1_ps(1.0f);
    simdscalar vRecipW1 = _simd_set1_ps(1.0f);

    if (!feState.vpTransformDisable)
    {
        // perspective divide
        vRecipW0 = _simd_div_ps(_simd_set1_ps(1.0f), prim[0].w);
        vRecipW1 = _simd_div_ps(_simd_set1_ps(1.0f), prim[1].w);

        prim[0].v[0] = _simd_mul_ps(prim[0].v[0], vRecipW0);
        prim[1].v[0] = _simd_mul_ps(prim[1].v[0], vRecipW1);

        prim[0].v[1] = _simd_mul_ps(prim[0].v[1], vRecipW0);
        prim[1].v[1] = _simd_mul_ps(prim[1].v[1], vRecipW1);

        prim[0].v[2] = _simd_mul_ps(prim[0].v[2], vRecipW0);
        prim[1].v[2] = _simd_mul_ps(prim[1].v[2], vRecipW1);

        // viewport transform to screen coords
        viewportTransform<2>(prim, state.vpMatrix[0]);
    }

    // adjust for pixel center location (UL corner vs center sample position)
    simdscalar offset = g_pixelOffsets[rastState.pixelLocation];
    prim[0].x = _simd_add_ps(prim[0].x, offset);
    prim[0].y = _simd_add_ps(prim[0].y, offset);

    prim[1].x = _simd_add_ps(prim[1].x, offset);
    prim[1].y = _simd_add_ps(prim[1].y, offset);

    // convert both endpoints to x.8 fixed point
    simdscalari vXi[2], vYi[2];
    vXi[0] = fpToFixedPointVertical(prim[0].x);
    vYi[0] = fpToFixedPointVertical(prim[0].y);
    vXi[1] = fpToFixedPointVertical(prim[1].x);
    vYi[1] = fpToFixedPointVertical(prim[1].y);

    // compute x-major vs y-major mask (y-major when |dy| > |dx|)
    simdscalari xLength = _simd_abs_epi32(_simd_sub_epi32(vXi[0], vXi[1]));
    simdscalari yLength = _simd_abs_epi32(_simd_sub_epi32(vYi[0], vYi[1]));
    simdscalar vYmajorMask = _simd_castsi_ps(_simd_cmpgt_epi32(yLength, xLength));
    uint32_t yMajorMask = _simd_movemask_ps(vYmajorMask);

    // cull zero-length lines (both dx and dy are zero)
    simdscalari vZeroLengthMask = _simd_cmpeq_epi32(xLength, _simd_setzero_si());
    vZeroLengthMask = _simd_and_si(vZeroLengthMask, _simd_cmpeq_epi32(yLength, _simd_setzero_si()));

    primMask &= ~_simd_movemask_ps(_simd_castsi_ps(vZeroLengthMask));

    uint32_t *pPrimID = (uint32_t *)&primID;

    // placeholder third vertex for the backend's triangle-shaped layout
    simdscalar vUnused = _simd_setzero_ps();

    // Calc bounding box of lines
    simdBBox bbox;
    bbox.left = _simd_min_epi32(vXi[0], vXi[1]);
    bbox.right = _simd_max_epi32(vXi[0], vXi[1]);
    bbox.top = _simd_min_epi32(vYi[0], vYi[1]);
    bbox.bottom = _simd_max_epi32(vYi[0], vYi[1]);

    // bloat bbox by line width along minor axis
    simdscalar vHalfWidth = _simd_set1_ps(rastState.lineWidth / 2.0f);
    simdscalari vHalfWidthi = fpToFixedPointVertical(vHalfWidth);
    simdBBox bloatBox;
    bloatBox.left = _simd_sub_epi32(bbox.left, vHalfWidthi);
    bloatBox.right = _simd_add_epi32(bbox.right, vHalfWidthi);
    bloatBox.top = _simd_sub_epi32(bbox.top, vHalfWidthi);
    bloatBox.bottom = _simd_add_epi32(bbox.bottom, vHalfWidthi);

    // y-major lines widen in x (left/right); x-major lines widen in y (top/bottom)
    bbox.left = _simd_blendv_epi32(bbox.left, bloatBox.left, vYmajorMask);
    bbox.right = _simd_blendv_epi32(bbox.right, bloatBox.right, vYmajorMask);
    bbox.top = _simd_blendv_epi32(bloatBox.top, bbox.top, vYmajorMask);
    bbox.bottom = _simd_blendv_epi32(bloatBox.bottom, bbox.bottom, vYmajorMask);

    // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since right/bottom edge is exclusive.
    bbox.left = _simd_max_epi32(bbox.left, _simd_set1_epi32(state.scissorInFixedPoint.left));
    bbox.top = _simd_max_epi32(bbox.top, _simd_set1_epi32(state.scissorInFixedPoint.top));
    bbox.right = _simd_min_epi32(_simd_sub_epi32(bbox.right, _simd_set1_epi32(1)), _simd_set1_epi32(state.scissorInFixedPoint.right));
    bbox.bottom = _simd_min_epi32(_simd_sub_epi32(bbox.bottom, _simd_set1_epi32(1)), _simd_set1_epi32(state.scissorInFixedPoint.bottom));

    // Cull prims completely outside scissor
    // (box inverted by the clamp above => fully clipped away)
    {
        simdscalari maskOutsideScissorX = _simd_cmpgt_epi32(bbox.left, bbox.right);
        simdscalari maskOutsideScissorY = _simd_cmpgt_epi32(bbox.top, bbox.bottom);
        simdscalari maskOutsideScissorXY = _simd_or_si(maskOutsideScissorX, maskOutsideScissorY);
        uint32_t maskOutsideScissor = _simd_movemask_ps(_simd_castsi_ps(maskOutsideScissorXY));
        primMask = primMask & ~maskOutsideScissor;
    }

    if (!primMask)
    {
        // everything culled; still need to stop the RDTSC bucket
        goto endBinLines;
    }

    // Convert triangle bbox to macrotile units.
    bbox.left = _simd_srai_epi32(bbox.left, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
    bbox.top = _simd_srai_epi32(bbox.top, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
    bbox.right = _simd_srai_epi32(bbox.right, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
    bbox.bottom = _simd_srai_epi32(bbox.bottom, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);

    OSALIGNSIMD(uint32_t) aMTLeft[KNOB_SIMD_WIDTH], aMTRight[KNOB_SIMD_WIDTH], aMTTop[KNOB_SIMD_WIDTH], aMTBottom[KNOB_SIMD_WIDTH];
    _simd_store_si((simdscalari*)aMTLeft, bbox.left);
    _simd_store_si((simdscalari*)aMTRight, bbox.right);
    _simd_store_si((simdscalari*)aMTTop, bbox.top);
    _simd_store_si((simdscalari*)aMTBottom, bbox.bottom);

    // transpose verts needed for backend: the two endpoints plus an unused
    // third vertex are laid out like a triangle's vertices
    /// @todo modify BE to take non-transformed verts
    __m128 vHorizX[8], vHorizY[8], vHorizZ[8], vHorizW[8];
    vTranspose3x8(vHorizX, prim[0].x, prim[1].x, vUnused);
    vTranspose3x8(vHorizY, prim[0].y, prim[1].y, vUnused);
    vTranspose3x8(vHorizZ, prim[0].z, prim[1].z, vUnused);
    vTranspose3x8(vHorizW, vRecipW0, vRecipW1, vUnused);

    // store render target array index
    OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH];
    if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex)
    {
        simdvector vRtai[2];
        pa.Assemble(VERTEX_RTAI_SLOT, vRtai);
        simdscalari vRtaii = _simd_castps_si(vRtai[0].x);
        _simd_store_si((simdscalari*)aRTAI, vRtaii);
    }
    else
    {
        _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si());
    }

    // scan remaining valid prims and bin each separately
    DWORD primIndex;
    while (_BitScanForward(&primIndex, primMask))
    {
        uint32_t linkageCount = state.linkageCount;
        uint32_t linkageMask = state.linkageMask;
        // each linked attribute is a 4-component vector
        uint32_t numScalarAttribs = linkageCount * 4;

        BE_WORK work;
        work.type = DRAW;

        TRIANGLE_WORK_DESC &desc = work.desc.tri;

        // lines are always front facing
        desc.triFlags.frontFacing = 1;
        desc.triFlags.primID = pPrimID[primIndex];
        desc.triFlags.yMajor = (yMajorMask >> primIndex) & 1;
        desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];

        work.pfnWork = RasterizeLine;

        auto pArena = pDC->pArena;
        SWR_ASSERT(pArena != nullptr);

        // store active attribs (3 verts worth; BE consumes triangle layout)
        desc.pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
        desc.numAttribs = linkageCount;
        ProcessAttributes<2>(pDC, pa, linkageMask, state.linkageMap, primIndex, desc.pAttribs);

        // store line vertex data (4 verts x 4 floats, transposed layout)
        desc.pTriBuffer = (float*)pArena->AllocAligned(4 * 4 * sizeof(float), 16);
        _mm_store_ps(&desc.pTriBuffer[0], vHorizX[primIndex]);
        _mm_store_ps(&desc.pTriBuffer[4], vHorizY[primIndex]);
        _mm_store_ps(&desc.pTriBuffer[8], vHorizZ[primIndex]);
        _mm_store_ps(&desc.pTriBuffer[12], vHorizW[primIndex]);

        // store user clip distances
        if (rastState.clipDistanceMask)
        {
            uint32_t numClipDist = _mm_popcnt_u32(rastState.clipDistanceMask);
            desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 2 * sizeof(float));
            ProcessUserClipDist<2>(pa, primIndex, rastState.clipDistanceMask, desc.pUserClipBuffer);
        }

        // bin to every macrotile the bloated bbox overlaps
        MacroTileMgr *pTileMgr = pDC->pTileMgr;
        for (uint32_t y = aMTTop[primIndex]; y <= aMTBottom[primIndex]; ++y)
        {
            for (uint32_t x = aMTLeft[primIndex]; x <= aMTRight[primIndex]; ++x)
            {
#if KNOB_ENABLE_TOSS_POINTS
                if (!KNOB_TOSS_SETUP_TRIS)
#endif
                {
                    pTileMgr->enqueue(x, y, &work);
                }
            }
        }

        primMask &= ~(1 << primIndex);
    }

endBinLines:

    RDTSC_STOP(FEBinLines, 1, 0);
}