swr: [rasterizer] Small warning cleanup
[mesa.git] / src / gallium / drivers / swr / rasterizer / core / frontend.cpp
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file frontend.cpp
24 *
25 * @brief Implementation for Frontend which handles vertex processing,
26 * primitive assembly, clipping, binning, etc.
27 *
28 ******************************************************************************/
29
30 #include "api.h"
31 #include "frontend.h"
32 #include "backend.h"
33 #include "context.h"
34 #include "rdtsc_core.h"
35 #include "rasterizer.h"
36 #include "utils.h"
37 #include "threads.h"
38 #include "pa.h"
39 #include "clip.h"
40 #include "tilemgr.h"
41 #include "tessellator.h"
42 #include <limits>
43
44 //////////////////////////////////////////////////////////////////////////
45 /// @brief Helper macro to generate a bitmask
46 static INLINE uint32_t GenMask(uint32_t numBits)
47 {
48 SWR_ASSERT(numBits <= (sizeof(uint32_t) * 8), "Too many bits (%d) for %s", numBits, __FUNCTION__);
49 return ((1U << numBits) - 1);
50 }
51
//////////////////////////////////////////////////////////////////////////
/// @brief Offsets added to post-viewport vertex positions based on the
/// raster state's pixel-location setting; indexed by SWR_PIXEL_LOCATION.
static const simdscalar g_pixelOffsets[SWR_PIXEL_LOCATION_UL + 1] =
{
    _simd_set1_ps(0.0f), // SWR_PIXEL_LOCATION_CENTER
    _simd_set1_ps(0.5f), // SWR_PIXEL_LOCATION_UL
};
60
61 //////////////////////////////////////////////////////////////////////////
62 /// @brief FE handler for SwrSync.
63 /// @param pContext - pointer to SWR context.
64 /// @param pDC - pointer to draw context.
65 /// @param workerId - thread's worker id. Even thread has a unique id.
66 /// @param pUserData - Pointer to user data passed back to sync callback.
67 /// @todo This should go away when we switch this to use compute threading.
68 void ProcessSync(
69 SWR_CONTEXT *pContext,
70 DRAW_CONTEXT *pDC,
71 uint32_t workerId,
72 void *pUserData)
73 {
74 SYNC_DESC *pSync = (SYNC_DESC*)pUserData;
75 BE_WORK work;
76 work.type = SYNC;
77 work.pfnWork = ProcessSyncBE;
78 work.desc.sync = *pSync;
79
80 MacroTileMgr *pTileMgr = pDC->pTileMgr;
81 pTileMgr->enqueue(0, 0, &work);
82 }
83
84 //////////////////////////////////////////////////////////////////////////
85 /// @brief FE handler for SwrGetStats.
86 /// @param pContext - pointer to SWR context.
87 /// @param pDC - pointer to draw context.
88 /// @param workerId - thread's worker id. Even thread has a unique id.
89 /// @param pUserData - Pointer to user data passed back to stats callback.
90 /// @todo This should go away when we switch this to use compute threading.
91 void ProcessQueryStats(
92 SWR_CONTEXT *pContext,
93 DRAW_CONTEXT *pDC,
94 uint32_t workerId,
95 void *pUserData)
96 {
97 QUERY_DESC *pQueryStats = (QUERY_DESC*)pUserData;
98 BE_WORK work;
99 work.type = QUERYSTATS;
100 work.pfnWork = ProcessQueryStatsBE;
101 work.desc.queryStats = *pQueryStats;
102
103 MacroTileMgr *pTileMgr = pDC->pTileMgr;
104 pTileMgr->enqueue(0, 0, &work);
105 }
106
107 //////////////////////////////////////////////////////////////////////////
108 /// @brief FE handler for SwrClearRenderTarget.
109 /// @param pContext - pointer to SWR context.
110 /// @param pDC - pointer to draw context.
111 /// @param workerId - thread's worker id. Even thread has a unique id.
112 /// @param pUserData - Pointer to user data passed back to clear callback.
113 /// @todo This should go away when we switch this to use compute threading.
114 void ProcessClear(
115 SWR_CONTEXT *pContext,
116 DRAW_CONTEXT *pDC,
117 uint32_t workerId,
118 void *pUserData)
119 {
120 CLEAR_DESC *pClear = (CLEAR_DESC*)pUserData;
121 MacroTileMgr *pTileMgr = pDC->pTileMgr;
122
123 const API_STATE& state = GetApiState(pDC);
124
125 // queue a clear to each macro tile
126 // compute macro tile bounds for the current scissor/viewport
127 uint32_t macroTileLeft = state.scissorInFixedPoint.left / KNOB_MACROTILE_X_DIM_FIXED;
128 uint32_t macroTileRight = state.scissorInFixedPoint.right / KNOB_MACROTILE_X_DIM_FIXED;
129 uint32_t macroTileTop = state.scissorInFixedPoint.top / KNOB_MACROTILE_Y_DIM_FIXED;
130 uint32_t macroTileBottom = state.scissorInFixedPoint.bottom / KNOB_MACROTILE_Y_DIM_FIXED;
131
132 BE_WORK work;
133 work.type = CLEAR;
134 work.pfnWork = ProcessClearBE;
135 work.desc.clear = *pClear;
136
137 for (uint32_t y = macroTileTop; y <= macroTileBottom; ++y)
138 {
139 for (uint32_t x = macroTileLeft; x <= macroTileRight; ++x)
140 {
141 pTileMgr->enqueue(x, y, &work);
142 }
143 }
144 }
145
146 //////////////////////////////////////////////////////////////////////////
147 /// @brief FE handler for SwrStoreTiles.
148 /// @param pContext - pointer to SWR context.
149 /// @param pDC - pointer to draw context.
150 /// @param workerId - thread's worker id. Even thread has a unique id.
151 /// @param pUserData - Pointer to user data passed back to callback.
152 /// @todo This should go away when we switch this to use compute threading.
153 void ProcessStoreTiles(
154 SWR_CONTEXT *pContext,
155 DRAW_CONTEXT *pDC,
156 uint32_t workerId,
157 void *pUserData)
158 {
159 RDTSC_START(FEProcessStoreTiles);
160 STORE_TILES_DESC *pStore = (STORE_TILES_DESC*)pUserData;
161 MacroTileMgr *pTileMgr = pDC->pTileMgr;
162
163 const API_STATE& state = GetApiState(pDC);
164
165 // queue a store to each macro tile
166 // compute macro tile bounds for the current render target
167 const uint32_t macroWidth = KNOB_MACROTILE_X_DIM;
168 const uint32_t macroHeight = KNOB_MACROTILE_Y_DIM;
169
170 uint32_t numMacroTilesX = ((uint32_t)state.vp[0].width + (uint32_t)state.vp[0].x + (macroWidth - 1)) / macroWidth;
171 uint32_t numMacroTilesY = ((uint32_t)state.vp[0].height + (uint32_t)state.vp[0].y + (macroHeight - 1)) / macroHeight;
172
173 // store tiles
174 BE_WORK work;
175 work.type = STORETILES;
176 work.pfnWork = ProcessStoreTileBE;
177 work.desc.storeTiles = *pStore;
178
179 for (uint32_t x = 0; x < numMacroTilesX; ++x)
180 {
181 for (uint32_t y = 0; y < numMacroTilesY; ++y)
182 {
183 pTileMgr->enqueue(x, y, &work);
184 }
185 }
186
187 RDTSC_STOP(FEProcessStoreTiles, 0, pDC->drawId);
188 }
189
//////////////////////////////////////////////////////////////////////////
/// @brief FE handler for SwrInvalidateTiles.
/// @param pContext - pointer to SWR context.
/// @param pDC - pointer to draw context.
/// @param workerId - thread's worker id. Even thread has a unique id.
/// @param pUserData - Pointer to user data passed back to callback.
/// @todo This should go away when we switch this to use compute threading.
void ProcessDiscardInvalidateTiles(
    SWR_CONTEXT *pContext,
    DRAW_CONTEXT *pDC,
    uint32_t workerId,
    void *pUserData)
{
    RDTSC_START(FEProcessInvalidateTiles);
    DISCARD_INVALIDATE_TILES_DESC *pInv = (DISCARD_INVALIDATE_TILES_DESC*)pUserData;
    MacroTileMgr *pTileMgr = pDC->pTileMgr;

    SWR_RECT rect;

    // An all-zero rect is treated as "not provided"; bitwise OR is a cheap
    // way to test all four fields for zero at once.
    if (pInv->rect.top | pInv->rect.bottom | pInv->rect.right | pInv->rect.left)
    {
        // Valid rect
        rect = pInv->rect;
    }
    else
    {
        // Use viewport dimensions
        const API_STATE& state = GetApiState(pDC);

        rect.left = (uint32_t)state.vp[0].x;
        rect.right = (uint32_t)(state.vp[0].x + state.vp[0].width);
        rect.top = (uint32_t)state.vp[0].y;
        rect.bottom = (uint32_t)(state.vp[0].y + state.vp[0].height);
    }

    // queue a store to each macro tile
    // compute macro tile bounds for the current render target
    uint32_t macroWidth = KNOB_MACROTILE_X_DIM;
    uint32_t macroHeight = KNOB_MACROTILE_Y_DIM;

    // Setup region assuming full tiles only: round the start up and the end
    // down so partially-covered edge tiles are excluded.
    uint32_t macroTileStartX = (rect.left + (macroWidth - 1)) / macroWidth;
    uint32_t macroTileStartY = (rect.top + (macroHeight - 1)) / macroHeight;

    uint32_t macroTileEndX = rect.right / macroWidth;
    uint32_t macroTileEndY = rect.bottom / macroHeight;

    if (pInv->fullTilesOnly == false)
    {
        // include partial tiles: round the start down and the end up instead.
        macroTileStartX = rect.left / macroWidth;
        macroTileStartY = rect.top / macroHeight;

        macroTileEndX = (rect.right + macroWidth - 1) / macroWidth;
        macroTileEndY = (rect.bottom + macroHeight - 1) / macroHeight;
    }

    SWR_ASSERT(macroTileEndX <= KNOB_NUM_HOT_TILES_X);
    SWR_ASSERT(macroTileEndY <= KNOB_NUM_HOT_TILES_Y);

    // Clamp to the hot-tile array bounds even in release builds.
    macroTileEndX = std::min<uint32_t>(macroTileEndX, KNOB_NUM_HOT_TILES_X);
    macroTileEndY = std::min<uint32_t>(macroTileEndY, KNOB_NUM_HOT_TILES_Y);

    // Queue one discard/invalidate work item per covered macrotile.
    // Note: end bounds are exclusive here, unlike ProcessClear's inclusive loop.
    BE_WORK work;
    work.type = DISCARDINVALIDATETILES;
    work.pfnWork = ProcessDiscardInvalidateTilesBE;
    work.desc.discardInvalidateTiles = *pInv;

    for (uint32_t x = macroTileStartX; x < macroTileEndX; ++x)
    {
        for (uint32_t y = macroTileStartY; y < macroTileEndY; ++y)
        {
            pTileMgr->enqueue(x, y, &work);
        }
    }

    RDTSC_STOP(FEProcessInvalidateTiles, 0, pDC->drawId);
}
269
270 //////////////////////////////////////////////////////////////////////////
271 /// @brief Computes the number of primitives given the number of verts.
272 /// @param mode - primitive topology for draw operation.
273 /// @param numPrims - number of vertices or indices for draw.
274 /// @todo Frontend needs to be refactored. This will go in appropriate place then.
275 uint32_t GetNumPrims(
276 PRIMITIVE_TOPOLOGY mode,
277 uint32_t numPrims)
278 {
279 switch (mode)
280 {
281 case TOP_POINT_LIST: return numPrims;
282 case TOP_TRIANGLE_LIST: return numPrims / 3;
283 case TOP_TRIANGLE_STRIP: return numPrims < 3 ? 0 : numPrims - 2;
284 case TOP_TRIANGLE_FAN: return numPrims < 3 ? 0 : numPrims - 2;
285 case TOP_TRIANGLE_DISC: return numPrims < 2 ? 0 : numPrims - 1;
286 case TOP_QUAD_LIST: return numPrims / 4;
287 case TOP_QUAD_STRIP: return numPrims < 4 ? 0 : (numPrims - 2) / 2;
288 case TOP_LINE_STRIP: return numPrims < 2 ? 0 : numPrims - 1;
289 case TOP_LINE_LIST: return numPrims / 2;
290 case TOP_LINE_LOOP: return numPrims;
291 case TOP_RECT_LIST: return numPrims / 3;
292 case TOP_LINE_LIST_ADJ: return numPrims / 4;
293 case TOP_LISTSTRIP_ADJ: return numPrims < 3 ? 0 : numPrims - 3;
294 case TOP_TRI_LIST_ADJ: return numPrims / 6;
295 case TOP_TRI_STRIP_ADJ: return numPrims < 4 ? 0 : (numPrims / 2) - 2;
296
297 case TOP_PATCHLIST_1:
298 case TOP_PATCHLIST_2:
299 case TOP_PATCHLIST_3:
300 case TOP_PATCHLIST_4:
301 case TOP_PATCHLIST_5:
302 case TOP_PATCHLIST_6:
303 case TOP_PATCHLIST_7:
304 case TOP_PATCHLIST_8:
305 case TOP_PATCHLIST_9:
306 case TOP_PATCHLIST_10:
307 case TOP_PATCHLIST_11:
308 case TOP_PATCHLIST_12:
309 case TOP_PATCHLIST_13:
310 case TOP_PATCHLIST_14:
311 case TOP_PATCHLIST_15:
312 case TOP_PATCHLIST_16:
313 case TOP_PATCHLIST_17:
314 case TOP_PATCHLIST_18:
315 case TOP_PATCHLIST_19:
316 case TOP_PATCHLIST_20:
317 case TOP_PATCHLIST_21:
318 case TOP_PATCHLIST_22:
319 case TOP_PATCHLIST_23:
320 case TOP_PATCHLIST_24:
321 case TOP_PATCHLIST_25:
322 case TOP_PATCHLIST_26:
323 case TOP_PATCHLIST_27:
324 case TOP_PATCHLIST_28:
325 case TOP_PATCHLIST_29:
326 case TOP_PATCHLIST_30:
327 case TOP_PATCHLIST_31:
328 case TOP_PATCHLIST_32:
329 return numPrims / (mode - TOP_PATCHLIST_BASE);
330
331 case TOP_POLYGON:
332 case TOP_POINT_LIST_BF:
333 case TOP_LINE_STRIP_CONT:
334 case TOP_LINE_STRIP_BF:
335 case TOP_LINE_STRIP_CONT_BF:
336 case TOP_TRIANGLE_FAN_NOSTIPPLE:
337 case TOP_TRI_STRIP_REVERSE:
338 case TOP_PATCHLIST_BASE:
339 case TOP_UNKNOWN:
340 SWR_ASSERT(false, "Unsupported topology: %d", mode);
341 return 0;
342 }
343
344 return 0;
345 }
346
347 //////////////////////////////////////////////////////////////////////////
348 /// @brief Computes the number of verts given the number of primitives.
349 /// @param mode - primitive topology for draw operation.
350 /// @param numPrims - number of primitives for draw.
351 uint32_t GetNumVerts(
352 PRIMITIVE_TOPOLOGY mode,
353 uint32_t numPrims)
354 {
355 switch (mode)
356 {
357 case TOP_POINT_LIST: return numPrims;
358 case TOP_TRIANGLE_LIST: return numPrims * 3;
359 case TOP_TRIANGLE_STRIP: return numPrims ? numPrims + 2 : 0;
360 case TOP_TRIANGLE_FAN: return numPrims ? numPrims + 2 : 0;
361 case TOP_TRIANGLE_DISC: return numPrims ? numPrims + 1 : 0;
362 case TOP_QUAD_LIST: return numPrims * 4;
363 case TOP_QUAD_STRIP: return numPrims ? numPrims * 2 + 2 : 0;
364 case TOP_LINE_STRIP: return numPrims ? numPrims + 1 : 0;
365 case TOP_LINE_LIST: return numPrims * 2;
366 case TOP_LINE_LOOP: return numPrims;
367 case TOP_RECT_LIST: return numPrims * 3;
368 case TOP_LINE_LIST_ADJ: return numPrims * 4;
369 case TOP_LISTSTRIP_ADJ: return numPrims ? numPrims + 3 : 0;
370 case TOP_TRI_LIST_ADJ: return numPrims * 6;
371 case TOP_TRI_STRIP_ADJ: return numPrims ? (numPrims + 2) * 2 : 0;
372
373 case TOP_PATCHLIST_1:
374 case TOP_PATCHLIST_2:
375 case TOP_PATCHLIST_3:
376 case TOP_PATCHLIST_4:
377 case TOP_PATCHLIST_5:
378 case TOP_PATCHLIST_6:
379 case TOP_PATCHLIST_7:
380 case TOP_PATCHLIST_8:
381 case TOP_PATCHLIST_9:
382 case TOP_PATCHLIST_10:
383 case TOP_PATCHLIST_11:
384 case TOP_PATCHLIST_12:
385 case TOP_PATCHLIST_13:
386 case TOP_PATCHLIST_14:
387 case TOP_PATCHLIST_15:
388 case TOP_PATCHLIST_16:
389 case TOP_PATCHLIST_17:
390 case TOP_PATCHLIST_18:
391 case TOP_PATCHLIST_19:
392 case TOP_PATCHLIST_20:
393 case TOP_PATCHLIST_21:
394 case TOP_PATCHLIST_22:
395 case TOP_PATCHLIST_23:
396 case TOP_PATCHLIST_24:
397 case TOP_PATCHLIST_25:
398 case TOP_PATCHLIST_26:
399 case TOP_PATCHLIST_27:
400 case TOP_PATCHLIST_28:
401 case TOP_PATCHLIST_29:
402 case TOP_PATCHLIST_30:
403 case TOP_PATCHLIST_31:
404 case TOP_PATCHLIST_32:
405 return numPrims * (mode - TOP_PATCHLIST_BASE);
406
407 case TOP_POLYGON:
408 case TOP_POINT_LIST_BF:
409 case TOP_LINE_STRIP_CONT:
410 case TOP_LINE_STRIP_BF:
411 case TOP_LINE_STRIP_CONT_BF:
412 case TOP_TRIANGLE_FAN_NOSTIPPLE:
413 case TOP_TRI_STRIP_REVERSE:
414 case TOP_PATCHLIST_BASE:
415 case TOP_UNKNOWN:
416 SWR_ASSERT(false, "Unsupported topology: %d", mode);
417 return 0;
418 }
419
420 return 0;
421 }
422
423 //////////////////////////////////////////////////////////////////////////
424 /// @brief Return number of verts per primitive.
425 /// @param topology - topology
426 /// @param includeAdjVerts - include adjacent verts in primitive vertices
427 INLINE uint32_t NumVertsPerPrim(PRIMITIVE_TOPOLOGY topology, bool includeAdjVerts)
428 {
429 uint32_t numVerts = 0;
430 switch (topology)
431 {
432 case TOP_POINT_LIST:
433 case TOP_POINT_LIST_BF:
434 numVerts = 1;
435 break;
436 case TOP_LINE_LIST:
437 case TOP_LINE_STRIP:
438 case TOP_LINE_LIST_ADJ:
439 case TOP_LINE_LOOP:
440 case TOP_LINE_STRIP_CONT:
441 case TOP_LINE_STRIP_BF:
442 case TOP_LISTSTRIP_ADJ:
443 numVerts = 2;
444 break;
445 case TOP_TRIANGLE_LIST:
446 case TOP_TRIANGLE_STRIP:
447 case TOP_TRIANGLE_FAN:
448 case TOP_TRI_LIST_ADJ:
449 case TOP_TRI_STRIP_ADJ:
450 case TOP_TRI_STRIP_REVERSE:
451 case TOP_RECT_LIST:
452 numVerts = 3;
453 break;
454 case TOP_QUAD_LIST:
455 case TOP_QUAD_STRIP:
456 numVerts = 4;
457 break;
458 case TOP_PATCHLIST_1:
459 case TOP_PATCHLIST_2:
460 case TOP_PATCHLIST_3:
461 case TOP_PATCHLIST_4:
462 case TOP_PATCHLIST_5:
463 case TOP_PATCHLIST_6:
464 case TOP_PATCHLIST_7:
465 case TOP_PATCHLIST_8:
466 case TOP_PATCHLIST_9:
467 case TOP_PATCHLIST_10:
468 case TOP_PATCHLIST_11:
469 case TOP_PATCHLIST_12:
470 case TOP_PATCHLIST_13:
471 case TOP_PATCHLIST_14:
472 case TOP_PATCHLIST_15:
473 case TOP_PATCHLIST_16:
474 case TOP_PATCHLIST_17:
475 case TOP_PATCHLIST_18:
476 case TOP_PATCHLIST_19:
477 case TOP_PATCHLIST_20:
478 case TOP_PATCHLIST_21:
479 case TOP_PATCHLIST_22:
480 case TOP_PATCHLIST_23:
481 case TOP_PATCHLIST_24:
482 case TOP_PATCHLIST_25:
483 case TOP_PATCHLIST_26:
484 case TOP_PATCHLIST_27:
485 case TOP_PATCHLIST_28:
486 case TOP_PATCHLIST_29:
487 case TOP_PATCHLIST_30:
488 case TOP_PATCHLIST_31:
489 case TOP_PATCHLIST_32:
490 numVerts = topology - TOP_PATCHLIST_BASE;
491 break;
492 default:
493 SWR_ASSERT(false, "Unsupported topology: %d", topology);
494 break;
495 }
496
497 if (includeAdjVerts)
498 {
499 switch (topology)
500 {
501 case TOP_LISTSTRIP_ADJ:
502 case TOP_LINE_LIST_ADJ: numVerts = 4; break;
503 case TOP_TRI_STRIP_ADJ:
504 case TOP_TRI_LIST_ADJ: numVerts = 6; break;
505 default: break;
506 }
507 }
508
509 return numVerts;
510 }
511
512 //////////////////////////////////////////////////////////////////////////
513 /// @brief Generate mask from remaining work.
514 /// @param numWorkItems - Number of items being worked on by a SIMD.
515 static INLINE simdscalari GenerateMask(uint32_t numItemsRemaining)
516 {
517 uint32_t numActive = (numItemsRemaining >= KNOB_SIMD_WIDTH) ? KNOB_SIMD_WIDTH : numItemsRemaining;
518 uint32_t mask = (numActive > 0) ? ((1 << numActive) - 1) : 0;
519 return _simd_castps_si(vMask(mask));
520 }
521
//////////////////////////////////////////////////////////////////////////
/// @brief StreamOut - Streams vertex data out to SO buffers.
///        Generally, we are only streaming out a SIMDs worth of triangles.
/// @param pDC - pointer to draw context.
/// @param pa - primitive assembly object holding the assembled prims.
/// @param workerId - thread's worker id. Even thread has a unique id.
/// @param pPrimData - scratch buffer the per-prim attributes are staged into
///        before the streamout jit function consumes them.
/// @param streamIndex - which SO stream to write.
static void StreamOut(
    DRAW_CONTEXT* pDC,
    PA_STATE& pa,
    uint32_t workerId,
    uint32_t* pPrimData,
    uint32_t streamIndex)
{
    RDTSC_START(FEStreamout);

    SWR_CONTEXT* pContext = pDC->pContext;

    const API_STATE& state = GetApiState(pDC);
    const SWR_STREAMOUT_STATE &soState = state.soState;

    uint32_t soVertsPerPrim = NumVertsPerPrim(pa.binTopology, false);

    // The pPrimData buffer is sparse in that we allocate memory for all 32 attributes for each vertex.
    uint32_t primDataDwordVertexStride = (KNOB_NUM_ATTRIBUTES * sizeof(float) * 4) / sizeof(uint32_t);

    SWR_STREAMOUT_CONTEXT soContext = { 0 };

    // Setup buffer state pointers.
    for (uint32_t i = 0; i < 4; ++i)
    {
        soContext.pBuffer[i] = &state.soBuffer[i];
    }

    uint32_t numPrims = pa.NumPrims();
    for (uint32_t primIndex = 0; primIndex < numPrims; ++primIndex)
    {
        DWORD slot = 0;
        uint32_t soMask = soState.streamMasks[streamIndex];

        // Write all entries into primitive data buffer for SOS.
        // Each set bit in soMask is one attribute slot to stream out.
        while (_BitScanForward(&slot, soMask))
        {
            __m128 attrib[MAX_NUM_VERTS_PER_PRIM]; // prim attribs (always 4 wide)
            uint32_t paSlot = slot + VERTEX_ATTRIB_START_SLOT;
            pa.AssembleSingle(paSlot, primIndex, attrib);

            // Attribute offset is relative offset from start of vertex.
            // NOTE(review): this comment used to claim the offset is computed
            // as (slot - 1), but the code below uses slot directly — the PA
            // slot remap is handled via VERTEX_ATTRIB_START_SLOT above.
            uint32_t primDataAttribOffset = slot * sizeof(float) * 4 / sizeof(uint32_t);

            // Store each vertex's attrib at appropriate locations in pPrimData buffer.
            for (uint32_t v = 0; v < soVertsPerPrim; ++v)
            {
                uint32_t* pPrimDataAttrib = pPrimData + primDataAttribOffset + (v * primDataDwordVertexStride);

                _mm_store_ps((float*)pPrimDataAttrib, attrib[v]);
            }
            soMask &= ~(1 << slot);
        }

        // Update pPrimData pointer
        soContext.pPrimData = pPrimData;

        // Call SOS
        SWR_ASSERT(state.pfnSoFunc[streamIndex] != nullptr, "Trying to execute uninitialized streamout jit function.");
        state.pfnSoFunc[streamIndex](soContext);
    }

    // Update SO write offset. The driver provides memory for the update.
    for (uint32_t i = 0; i < 4; ++i)
    {
        if (state.soBuffer[i].pWriteOffset)
        {
            *state.soBuffer[i].pWriteOffset = soContext.pBuffer[i]->streamOffset * sizeof(uint32_t);

            // The SOS increments the existing write offset. So we don't want to increment
            // the SoWriteOffset stat using an absolute offset instead of relative.
            SET_STAT(SoWriteOffset[i], soContext.pBuffer[i]->streamOffset);
        }
    }

    UPDATE_STAT(SoPrimStorageNeeded[streamIndex], soContext.numPrimStorageNeeded);
    UPDATE_STAT(SoNumPrimsWritten[streamIndex], soContext.numPrimsWritten);

    RDTSC_STOP(FEStreamout, 1, 0);
}
610
611 //////////////////////////////////////////////////////////////////////////
612 /// @brief Computes number of invocations. The current index represents
613 /// the start of the SIMD. The max index represents how much work
614 /// items are remaining. If there is less then a SIMD's left of work
615 /// then return the remaining amount of work.
616 /// @param curIndex - The start index for the SIMD.
617 /// @param maxIndex - The last index for all work items.
618 static INLINE uint32_t GetNumInvocations(
619 uint32_t curIndex,
620 uint32_t maxIndex)
621 {
622 uint32_t remainder = (maxIndex - curIndex);
623 return (remainder >= KNOB_SIMD_WIDTH) ? KNOB_SIMD_WIDTH : remainder;
624 }
625
626 //////////////////////////////////////////////////////////////////////////
627 /// @brief Converts a streamId buffer to a cut buffer for the given stream id.
628 /// The geometry shader will loop over each active streamout buffer, assembling
629 /// primitives for the downstream stages. When multistream output is enabled,
630 /// the generated stream ID buffer from the GS needs to be converted to a cut
631 /// buffer for the primitive assembler.
632 /// @param stream - stream id to generate the cut buffer for
633 /// @param pStreamIdBase - pointer to the stream ID buffer
634 /// @param numEmittedVerts - Number of total verts emitted by the GS
635 /// @param pCutBuffer - output buffer to write cuts to
636 void ProcessStreamIdBuffer(uint32_t stream, uint8_t* pStreamIdBase, uint32_t numEmittedVerts, uint8_t *pCutBuffer)
637 {
638 SWR_ASSERT(stream < MAX_SO_STREAMS);
639
640 uint32_t numInputBytes = (numEmittedVerts * 2 + 7) / 8;
641 uint32_t numOutputBytes = std::max(numInputBytes / 2, 1U);
642
643 for (uint32_t b = 0; b < numOutputBytes; ++b)
644 {
645 uint8_t curInputByte = pStreamIdBase[2*b];
646 uint8_t outByte = 0;
647 for (uint32_t i = 0; i < 4; ++i)
648 {
649 if ((curInputByte & 0x3) != stream)
650 {
651 outByte |= (1 << i);
652 }
653 curInputByte >>= 2;
654 }
655
656 curInputByte = pStreamIdBase[2 * b + 1];
657 for (uint32_t i = 0; i < 4; ++i)
658 {
659 if ((curInputByte & 0x3) != stream)
660 {
661 outByte |= (1 << (i + 4));
662 }
663 curInputByte >>= 2;
664 }
665
666 *pCutBuffer++ = outByte;
667 }
668 }
669
// Per-worker-thread GS execution context, reused across GeometryShaderStage calls.
THREAD SWR_GS_CONTEXT tlsGsContext;
671
//////////////////////////////////////////////////////////////////////////
/// @brief Implements GS stage.
/// @param pDC - pointer to draw context.
/// @param workerId - thread's worker id. Even thread has a unique id.
/// @param pa - The primitive assembly object.
/// @param pGsOut - output stream for GS
/// @param pCutBuffer - cut (single-stream) or streamID (multi-stream) buffer
/// @param pStreamCutBuffer - scratch cut buffer used for multi-stream output
/// @param pSoPrimData - scratch buffer for streamout primitive data
/// @param primID - SIMD of input primitive IDs
template <
    typename HasStreamOutT,
    typename HasRastT>
static void GeometryShaderStage(
    DRAW_CONTEXT *pDC,
    uint32_t workerId,
    PA_STATE& pa,
    void* pGsOut,
    void* pCutBuffer,
    void* pStreamCutBuffer,
    uint32_t* pSoPrimData,
    simdscalari primID)
{
    RDTSC_START(FEGeometryShader);

    SWR_CONTEXT* pContext = pDC->pContext;

    const API_STATE& state = GetApiState(pDC);
    const SWR_GS_STATE* pState = &state.gsState;

    SWR_ASSERT(pGsOut != nullptr, "GS output buffer should be initialized");
    SWR_ASSERT(pCutBuffer != nullptr, "GS output cut buffer should be initialized");

    tlsGsContext.pStream = (uint8_t*)pGsOut;
    tlsGsContext.pCutOrStreamIdBuffer = (uint8_t*)pCutBuffer;
    tlsGsContext.PrimitiveID = primID;

    uint32_t numVertsPerPrim = NumVertsPerPrim(pa.binTopology, true);
    simdvector attrib[MAX_ATTRIBUTES];

    // assemble all attributes for the input primitive
    for (uint32_t slot = 0; slot < pState->numInputAttribs; ++slot)
    {
        uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + slot;
        pa.Assemble(attribSlot, attrib);

        for (uint32_t i = 0; i < numVertsPerPrim; ++i)
        {
            tlsGsContext.vert[i].attrib[attribSlot] = attrib[i];
        }
    }

    // assemble position
    pa.Assemble(VERTEX_POSITION_SLOT, attrib);
    for (uint32_t i = 0; i < numVertsPerPrim; ++i)
    {
        tlsGsContext.vert[i].attrib[VERTEX_POSITION_SLOT] = attrib[i];
    }

    // Buffer layout strides. These must agree with AllocateGsBuffers, which
    // sizes the allocations from the same gsState fields.
    const uint32_t vertexStride = sizeof(simdvertex);
    const uint32_t numSimdBatches = (state.gsState.maxNumVerts + KNOB_SIMD_WIDTH - 1) / KNOB_SIMD_WIDTH;
    const uint32_t inputPrimStride = numSimdBatches * vertexStride;
    const uint32_t instanceStride = inputPrimStride * KNOB_SIMD_WIDTH;
    uint32_t cutPrimStride;
    uint32_t cutInstanceStride;

    if (pState->isSingleStream)
    {
        // single stream: 1 cut bit per emitted vert
        cutPrimStride = (state.gsState.maxNumVerts + 7) / 8;
        cutInstanceStride = cutPrimStride * KNOB_SIMD_WIDTH;
    }
    else
    {
        // multi-stream: 2 stream-ID bits per emitted vert, dword aligned
        cutPrimStride = AlignUp(state.gsState.maxNumVerts * 2 / 8, 4);
        cutInstanceStride = cutPrimStride * KNOB_SIMD_WIDTH;
    }

    // record valid prims from the frontend to avoid over binning the newly generated
    // prims from the GS
    uint32_t numInputPrims = pa.NumPrims();

    // Run the GS once per instance over the whole SIMD of input prims.
    for (uint32_t instance = 0; instance < pState->instanceCount; ++instance)
    {
        tlsGsContext.InstanceID = instance;
        tlsGsContext.mask = GenerateMask(numInputPrims);

        // execute the geometry shader
        state.pfnGsFunc(GetPrivateState(pDC), &tlsGsContext);

        tlsGsContext.pStream += instanceStride;
        tlsGsContext.pCutOrStreamIdBuffer += cutInstanceStride;
    }

    // set up new binner and state for the GS output topology
    PFN_PROCESS_PRIMS pfnClipFunc = nullptr;
    if (HasRastT::value)
    {
        switch (pState->outputTopology)
        {
        case TOP_TRIANGLE_STRIP: pfnClipFunc = ClipTriangles; break;
        case TOP_LINE_STRIP: pfnClipFunc = ClipLines; break;
        case TOP_POINT_LIST: pfnClipFunc = ClipPoints; break;
        default: SWR_ASSERT(false, "Unexpected GS output topology: %d", pState->outputTopology);
        }
    }

    // foreach input prim:
    // - setup a new PA based on the emitted verts for that prim
    // - loop over the new verts, calling PA to assemble each prim
    uint32_t* pVertexCount = (uint32_t*)&tlsGsContext.vertexCount;
    uint32_t* pPrimitiveId = (uint32_t*)&primID;

    uint32_t totalPrimsGenerated = 0;
    for (uint32_t inputPrim = 0; inputPrim < numInputPrims; ++inputPrim)
    {
        uint8_t* pInstanceBase = (uint8_t*)pGsOut + inputPrim * inputPrimStride;
        uint8_t* pCutBufferBase = (uint8_t*)pCutBuffer + inputPrim * cutPrimStride;
        for (uint32_t instance = 0; instance < pState->instanceCount; ++instance)
        {
            uint32_t numEmittedVerts = pVertexCount[inputPrim];
            if (numEmittedVerts == 0)
            {
                continue;
            }

            uint8_t* pBase = pInstanceBase + instance * instanceStride;
            uint8_t* pCutBase = pCutBufferBase + instance * cutInstanceStride;

            // numAttribs = index of highest set bit in feAttribMask, plus one
            DWORD numAttribs;
            if (_BitScanReverse(&numAttribs, state.feAttribMask))
            {
                numAttribs++;
            }
            else
            {
                numAttribs = 0;
            }

            for (uint32_t stream = 0; stream < MAX_SO_STREAMS; ++stream)
            {
                bool processCutVerts = false;

                uint8_t* pCutBuffer = pCutBase;

                // assign default stream ID, only relevant when GS is outputting a single stream
                uint32_t streamID = 0;
                if (pState->isSingleStream)
                {
                    processCutVerts = true;
                    streamID = pState->singleStreamID;
                    if (streamID != stream) continue;
                }
                else
                {
                    // early exit if this stream is not enabled for streamout
                    if (HasStreamOutT::value && !state.soState.streamEnable[stream])
                    {
                        continue;
                    }

                    // multi-stream output, need to translate StreamID buffer to a cut buffer
                    ProcessStreamIdBuffer(stream, pCutBase, numEmittedVerts, (uint8_t*)pStreamCutBuffer);
                    pCutBuffer = (uint8_t*)pStreamCutBuffer;
                    processCutVerts = false;
                }

                PA_STATE_CUT gsPa(pDC, pBase, numEmittedVerts, pCutBuffer, numEmittedVerts, numAttribs, pState->outputTopology, processCutVerts);

                while (gsPa.GetNextStreamOutput())
                {
                    do
                    {
                        bool assemble = gsPa.Assemble(VERTEX_POSITION_SLOT, attrib);

                        if (assemble)
                        {
                            totalPrimsGenerated += gsPa.NumPrims();

                            if (HasStreamOutT::value)
                            {
                                StreamOut(pDC, gsPa, workerId, pSoPrimData, stream);
                            }

                            if (HasRastT::value && state.soState.streamToRasterizer == stream)
                            {
                                simdscalari vPrimId;
                                // pull primitiveID from the GS output if available
                                if (state.gsState.emitsPrimitiveID)
                                {
                                    simdvector primIdAttrib[3];
                                    gsPa.Assemble(VERTEX_PRIMID_SLOT, primIdAttrib);
                                    vPrimId = _simd_castps_si(primIdAttrib[0].x);
                                }
                                else
                                {
                                    vPrimId = _simd_set1_epi32(pPrimitiveId[inputPrim]);
                                }

                                pfnClipFunc(pDC, gsPa, workerId, attrib, GenMask(gsPa.NumPrims()), vPrimId);
                            }
                        }
                    } while (gsPa.NextPrim());
                }
            }
        }
    }

    // update GS pipeline stats
    UPDATE_STAT(GsInvocations, numInputPrims * pState->instanceCount);
    UPDATE_STAT(GsPrimitives, totalPrimsGenerated);

    RDTSC_STOP(FEGeometryShader, 1, 0);
}
881
//////////////////////////////////////////////////////////////////////////
/// @brief Allocate GS buffers
/// @param pDC - pointer to draw context.
/// @param state - API state
/// @param ppGsOut - pointer to GS output buffer allocation
/// @param ppCutBuffer - pointer to GS output cut buffer allocation
/// @param ppStreamCutBuffer - pointer to per-stream scratch cut buffer
///        allocation (only used when multi-stream output is enabled)
/// @note The strides computed here must match GeometryShaderStage, which
///       derives the same values from gsState.
static INLINE void AllocateGsBuffers(DRAW_CONTEXT* pDC, const API_STATE& state, void** ppGsOut, void** ppCutBuffer,
    void **ppStreamCutBuffer)
{
    auto pArena = pDC->pArena;
    SWR_ASSERT(pArena != nullptr);
    SWR_ASSERT(state.gsState.gsEnable);
    // allocate arena space to hold GS output verts
    // @todo pack attribs
    // @todo support multiple streams
    const uint32_t vertexStride = sizeof(simdvertex);
    const uint32_t numSimdBatches = (state.gsState.maxNumVerts + KNOB_SIMD_WIDTH - 1) / KNOB_SIMD_WIDTH;
    uint32_t size = state.gsState.instanceCount * numSimdBatches * vertexStride * KNOB_SIMD_WIDTH;
    *ppGsOut = pArena->AllocAligned(size, KNOB_SIMD_WIDTH * sizeof(float));

    // cut buffer: 1 bit per emitted vert; streamID buffer: 2 bits per vert, dword aligned
    const uint32_t cutPrimStride = (state.gsState.maxNumVerts + 7) / 8;
    const uint32_t streamIdPrimStride = AlignUp(state.gsState.maxNumVerts * 2 / 8, 4);
    const uint32_t cutBufferSize = cutPrimStride * state.gsState.instanceCount * KNOB_SIMD_WIDTH;
    const uint32_t streamIdSize = streamIdPrimStride * state.gsState.instanceCount * KNOB_SIMD_WIDTH;

    // allocate arena space to hold cut or streamid buffer, which is essentially a bitfield sized to the
    // maximum vertex output as defined by the GS state, per SIMD lane, per GS instance

    // allocate space for temporary per-stream cut buffer if multi-stream is enabled
    if (state.gsState.isSingleStream)
    {
        *ppCutBuffer = pArena->AllocAligned(cutBufferSize, KNOB_SIMD_WIDTH * sizeof(float));
        *ppStreamCutBuffer = nullptr;
    }
    else
    {
        *ppCutBuffer = pArena->AllocAligned(streamIdSize, KNOB_SIMD_WIDTH * sizeof(float));
        *ppStreamCutBuffer = pArena->AllocAligned(cutBufferSize, KNOB_SIMD_WIDTH * sizeof(float));
    }

}
923
924 //////////////////////////////////////////////////////////////////////////
925 /// @brief Contains all data generated by the HS and passed to the
926 /// tessellator and DS.
struct TessellationThreadLocalData
{
    SWR_HS_CONTEXT hsContext;               // per-thread hull shader context
    ScalarPatch patchData[KNOB_SIMD_WIDTH]; // HS control-point output; one patch per SIMD lane
    void* pTxCtx;                           // backing storage for the tessellator context
    size_t tsCtxSize;                       // size in bytes of pTxCtx; grown on demand in TessellationStages

    simdscalar* pDSOutput;                  // domain shader output buffer; reallocated when too small
    size_t numDSOutputVectors;              // current capacity of pDSOutput, in output vectors
};
937
938 THREAD TessellationThreadLocalData* gt_pTessellationThreadData = nullptr;
939
940 //////////////////////////////////////////////////////////////////////////
941 /// @brief Allocate tessellation data for this worker thread.
942 INLINE
943 static void AllocateTessellationData(SWR_CONTEXT* pContext)
944 {
945 /// @TODO - Don't use thread local storage. Use Worker local storage instead.
946 if (gt_pTessellationThreadData == nullptr)
947 {
948 gt_pTessellationThreadData = (TessellationThreadLocalData*)
949 _aligned_malloc(sizeof(TessellationThreadLocalData), 64);
950 memset(gt_pTessellationThreadData, 0, sizeof(*gt_pTessellationThreadData));
951 }
952 }
953
954 //////////////////////////////////////////////////////////////////////////
955 /// @brief Implements Tessellation Stages.
956 /// @param pDC - pointer to draw context.
957 /// @param workerId - thread's worker id. Even thread has a unique id.
958 /// @param pa - The primitive assembly object.
959 /// @param pGsOut - output stream for GS
template <
    typename HasGeometryShaderT,
    typename HasStreamOutT,
    typename HasRastT>
static void TessellationStages(
    DRAW_CONTEXT *pDC,
    uint32_t workerId,
    PA_STATE& pa,
    void* pGsOut,
    void* pCutBuffer,
    void* pCutStreamBuffer,
    uint32_t* pSoPrimData,
    simdscalari primID)
{
    const API_STATE& state = GetApiState(pDC);
    const SWR_TS_STATE& tsState = state.tsState;
    SWR_CONTEXT *pContext = pDC->pContext; // Needed for UPDATE_STATS macro

    SWR_ASSERT(gt_pTessellationThreadData);

    // Initialize the tessellator context in the thread-local backing store.
    // NOTE(review): presumably TSInitCtx reports the required size through
    // tsCtxSize and returns null until the store is large enough, hence the
    // realloc-and-retry below — confirm against the tessellator API.
    HANDLE tsCtx = TSInitCtx(
        tsState.domain,
        tsState.partitioning,
        tsState.tsOutputTopology,
        gt_pTessellationThreadData->pTxCtx,
        gt_pTessellationThreadData->tsCtxSize);
    if (tsCtx == nullptr)
    {
        gt_pTessellationThreadData->pTxCtx = _aligned_malloc(gt_pTessellationThreadData->tsCtxSize, 64);
        tsCtx = TSInitCtx(
            tsState.domain,
            tsState.partitioning,
            tsState.tsOutputTopology,
            gt_pTessellationThreadData->pTxCtx,
            gt_pTessellationThreadData->tsCtxSize);
    }
    SWR_ASSERT(tsCtx);

    // Pick the clipper/binner entry point matching the DS output topology;
    // stays null when rasterization is disabled.
    PFN_PROCESS_PRIMS pfnClipFunc = nullptr;
    if (HasRastT::value)
    {
        switch (tsState.postDSTopology)
        {
        case TOP_TRIANGLE_LIST: pfnClipFunc = ClipTriangles; break;
        case TOP_LINE_LIST: pfnClipFunc = ClipLines; break;
        case TOP_POINT_LIST: pfnClipFunc = ClipPoints; break;
        default: SWR_ASSERT(false, "Unexpected DS output topology: %d", tsState.postDSTopology);
        }
    }

    SWR_HS_CONTEXT& hsContext = gt_pTessellationThreadData->hsContext;
    hsContext.pCPout = gt_pTessellationThreadData->patchData;
    hsContext.PrimitiveID = primID;

    uint32_t numVertsPerPrim = NumVertsPerPrim(pa.binTopology, false);
    // Max storage for one attribute for an entire simdprimitive
    simdvector simdattrib[MAX_NUM_VERTS_PER_PRIM];

    // assemble all attributes for the input primitives
    for (uint32_t slot = 0; slot < tsState.numHsInputAttribs; ++slot)
    {
        uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + slot;
        pa.Assemble(attribSlot, simdattrib);

        for (uint32_t i = 0; i < numVertsPerPrim; ++i)
        {
            hsContext.vert[i].attrib[attribSlot] = simdattrib[i];
        }
    }

#if defined(_DEBUG)
    // Poison HS output in debug so stale reads are obvious.
    memset(hsContext.pCPout, 0x90, sizeof(ScalarPatch) * KNOB_SIMD_WIDTH);
#endif

    uint32_t numPrims = pa.NumPrims();
    hsContext.mask = GenerateMask(numPrims);

    // Run the HS
    RDTSC_START(FEHullShader);
    state.pfnHsFunc(GetPrivateState(pDC), &hsContext);
    RDTSC_STOP(FEHullShader, 0, 0);

    UPDATE_STAT(HsInvocations, numPrims);

    const uint32_t* pPrimId = (const uint32_t*)&primID;

    // Tessellate and domain-shade each patch (SIMD lane) independently.
    for (uint32_t p = 0; p < numPrims; ++p)
    {
        // Run Tessellator
        SWR_TS_TESSELLATED_DATA tsData = { 0 };
        RDTSC_START(FETessellation);
        TSTessellate(tsCtx, hsContext.pCPout[p].tessFactors, tsData);
        RDTSC_STOP(FETessellation, 0, 0);

        if (tsData.NumPrimitives == 0)
        {
            // Patch culled by the tessellator (e.g. zero tess factors).
            continue;
        }
        SWR_ASSERT(tsData.NumDomainPoints);

        // Allocate DS Output memory — grow-only cache in thread-local storage.
        uint32_t requiredDSVectorInvocations = AlignUp(tsData.NumDomainPoints, KNOB_SIMD_WIDTH) / KNOB_SIMD_WIDTH;
        size_t requiredDSOutputVectors = requiredDSVectorInvocations * tsState.numDsOutputAttribs;
        size_t requiredAllocSize = sizeof(simdvector) * requiredDSOutputVectors;
        if (requiredDSOutputVectors > gt_pTessellationThreadData->numDSOutputVectors)
        {
            _aligned_free(gt_pTessellationThreadData->pDSOutput);
            gt_pTessellationThreadData->pDSOutput = (simdscalar*)_aligned_malloc(requiredAllocSize, 64);
            gt_pTessellationThreadData->numDSOutputVectors = requiredDSOutputVectors;
        }
        SWR_ASSERT(gt_pTessellationThreadData->pDSOutput);
        SWR_ASSERT(gt_pTessellationThreadData->numDSOutputVectors >= requiredDSOutputVectors);

#if defined(_DEBUG)
        memset(gt_pTessellationThreadData->pDSOutput, 0x90, requiredAllocSize);
#endif

        // Run Domain Shader over all generated domain points, SIMD-wide.
        SWR_DS_CONTEXT dsContext;
        dsContext.PrimitiveID = pPrimId[p];
        dsContext.pCpIn = &hsContext.pCPout[p];
        dsContext.pDomainU = (simdscalar*)tsData.pDomainPointsU;
        dsContext.pDomainV = (simdscalar*)tsData.pDomainPointsV;
        dsContext.pOutputData = gt_pTessellationThreadData->pDSOutput;
        dsContext.vectorStride = requiredDSVectorInvocations;

        uint32_t dsInvocations = 0;

        for (dsContext.vectorOffset = 0; dsContext.vectorOffset < requiredDSVectorInvocations; ++dsContext.vectorOffset)
        {
            // Mask off lanes past NumDomainPoints in the final iteration.
            dsContext.mask = GenerateMask(tsData.NumDomainPoints - dsInvocations);

            RDTSC_START(FEDomainShader);
            state.pfnDsFunc(GetPrivateState(pDC), &dsContext);
            RDTSC_STOP(FEDomainShader, 0, 0);

            dsInvocations += KNOB_SIMD_WIDTH;
        }
        UPDATE_STAT(DsInvocations, tsData.NumDomainPoints);

        // Re-assemble the tessellated output into primitives and feed the
        // remaining pipeline (GS, or SO/raster directly).
        PA_TESS tessPa(
            pDC,
            dsContext.pOutputData,
            dsContext.vectorStride,
            tsState.numDsOutputAttribs,
            tsData.ppIndices,
            tsData.NumPrimitives,
            tsState.postDSTopology);

        while (tessPa.HasWork())
        {
            if (HasGeometryShaderT::value)
            {
                GeometryShaderStage<HasStreamOutT, HasRastT>(
                    pDC, workerId, tessPa, pGsOut, pCutBuffer, pCutStreamBuffer, pSoPrimData,
                    _simd_set1_epi32(dsContext.PrimitiveID));
            }
            else
            {
                if (HasStreamOutT::value)
                {
                    StreamOut(pDC, tessPa, workerId, pSoPrimData, 0);
                }

                if (HasRastT::value)
                {
                    simdvector prim[3]; // Only deal with triangles, lines, or points
                    RDTSC_START(FEPAAssemble);
#if SWR_ENABLE_ASSERTS
                    bool assemble =
#endif
                        tessPa.Assemble(VERTEX_POSITION_SLOT, prim);
                    RDTSC_STOP(FEPAAssemble, 1, 0);
                    SWR_ASSERT(assemble);

                    SWR_ASSERT(pfnClipFunc);
                    pfnClipFunc(pDC, tessPa, workerId, prim,
                        GenMask(tessPa.NumPrims()), _simd_set1_epi32(dsContext.PrimitiveID));
                }
            }

            tessPa.NextPrim();

        } // while (tessPa.HasWork())
    } // for (uint32_t p = 0; p < numPrims; ++p)

    TSDestroyCtx(tsCtx);
}
1148
1149 //////////////////////////////////////////////////////////////////////////
1150 /// @brief FE handler for SwrDraw.
1151 /// @tparam IsIndexedT - Is indexed drawing enabled
1152 /// @tparam HasTessellationT - Is tessellation enabled
1153 /// @tparam HasGeometryShaderT::value - Is the geometry shader stage enabled
1154 /// @tparam HasStreamOutT - Is stream-out enabled
1155 /// @tparam HasRastT - Is rasterization enabled
1156 /// @param pContext - pointer to SWR context.
1157 /// @param pDC - pointer to draw context.
1158 /// @param workerId - thread's worker id.
1159 /// @param pUserData - Pointer to DRAW_WORK
template <
    typename IsIndexedT,
    typename HasTessellationT,
    typename HasGeometryShaderT,
    typename HasStreamOutT,
    typename HasRastT>
void ProcessDraw(
    SWR_CONTEXT *pContext,
    DRAW_CONTEXT *pDC,
    uint32_t workerId,
    void *pUserData)
{

#if KNOB_ENABLE_TOSS_POINTS
    if (KNOB_TOSS_QUEUE_FE)
    {
        return;
    }
#endif

    RDTSC_START(FEProcessDraw);

    DRAW_WORK& work = *(DRAW_WORK*)pUserData;
    const API_STATE& state = GetApiState(pDC);
    __m256i vScale = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
    SWR_VS_CONTEXT vsContext;
    simdvertex vin;

    int indexSize = 0;
    uint32_t endVertex = work.numVerts;

    // For indexed draws, locate one-past-the-last requested index so partial
    // SIMD fetches at the end of the index buffer can be masked off.
    const int32_t* pLastRequestedIndex = nullptr;
    if (IsIndexedT::value)
    {
        switch (work.type)
        {
        case R32_UINT:
            indexSize = sizeof(uint32_t);
            pLastRequestedIndex = &(work.pIB[endVertex]);
            break;
        case R16_UINT:
            indexSize = sizeof(uint16_t);
            // nasty address offset to last index
            pLastRequestedIndex = (int32_t*)(&(((uint16_t*)work.pIB)[endVertex]));
            break;
        case R8_UINT:
            indexSize = sizeof(uint8_t);
            // nasty address offset to last index
            pLastRequestedIndex = (int32_t*)(&(((uint8_t*)work.pIB)[endVertex]));
            break;
        default:
            SWR_ASSERT(0);
        }
    }
    else
    {
        // No cuts, prune partial primitives.
        endVertex = GetNumVerts(state.topology, GetNumPrims(state.topology, work.numVerts));
    }

    SWR_FETCH_CONTEXT fetchInfo = { 0 };
    fetchInfo.pStreams = &state.vertexBuffers[0];
    fetchInfo.StartInstance = work.startInstance;
    fetchInfo.StartVertex = 0;

    vsContext.pVin = &vin;

    if (IsIndexedT::value)
    {
        fetchInfo.BaseVertex = work.baseVertex;

        // if the entire index buffer isn't being consumed, set the last index
        // so that fetches < a SIMD wide will be masked off
        fetchInfo.pLastIndex = (const int32_t*)(((uint8_t*)state.indexBuffer.pIndices) + state.indexBuffer.size);
        if (pLastRequestedIndex < fetchInfo.pLastIndex)
        {
            fetchInfo.pLastIndex = pLastRequestedIndex;
        }
    }
    else
    {
        fetchInfo.StartVertex = work.startVertex;
    }

#ifdef KNOB_ENABLE_RDTSC
    uint32_t numPrims = GetNumPrims(state.topology, work.numVerts);
#endif

    // Set up optional-stage scratch buffers up front, outside the hot loop.
    void* pGsOut = nullptr;
    void* pCutBuffer = nullptr;
    void* pStreamCutBuffer = nullptr;
    if (HasGeometryShaderT::value)
    {
        AllocateGsBuffers(pDC, state, &pGsOut, &pCutBuffer, &pStreamCutBuffer);
    }

    if (HasTessellationT::value)
    {
        SWR_ASSERT(state.tsState.tsEnable == true);
        SWR_ASSERT(state.pfnHsFunc != nullptr);
        SWR_ASSERT(state.pfnDsFunc != nullptr);

        AllocateTessellationData(pContext);
    }
    else
    {
        SWR_ASSERT(state.tsState.tsEnable == false);
        SWR_ASSERT(state.pfnHsFunc == nullptr);
        SWR_ASSERT(state.pfnDsFunc == nullptr);
    }

    // allocate space for streamout input prim data
    uint32_t* pSoPrimData = nullptr;
    if (HasStreamOutT::value)
    {
        pSoPrimData = (uint32_t*)pDC->pArena->AllocAligned(4096, 16);

        // publish the current stream-out write offsets to the stats counters
        for (uint32_t i = 0; i < 4; ++i)
        {
            SET_STAT(SoWriteOffset[i], state.soBuffer[i].streamOffset);
        }

    }

    // choose primitive assembler
    PA_FACTORY<IsIndexedT> paFactory(pDC, state.topology, work.numVerts);
    PA_STATE& pa = paFactory.GetPA();

    /// @todo: temporarily move instance loop in the FE to ensure SO ordering
    for (uint32_t instanceNum = 0; instanceNum < work.numInstances; instanceNum++)
    {
        simdscalari vIndex;
        uint32_t i = 0;

        if (IsIndexedT::value)
        {
            fetchInfo.pIndices = work.pIB;
        }
        else
        {
            // Non-indexed: synthesize a SIMD-wide run of sequential vertex IDs.
            vIndex = _simd_add_epi32(_simd_set1_epi32(work.startVertexID), vScale);
            fetchInfo.pIndices = (const int32_t*)&vIndex;
        }

        fetchInfo.CurInstance = instanceNum;
        vsContext.InstanceID = instanceNum;

        while (pa.HasWork())
        {
            // PaGetNextVsOutput currently has the side effect of updating some PA state machine state.
            // So we need to keep this outside of (i < endVertex) check.
            simdmask* pvCutIndices = nullptr;
            if (IsIndexedT::value)
            {
                pvCutIndices = &pa.GetNextVsIndices();
            }

            simdvertex& vout = pa.GetNextVsOutput();
            vsContext.pVout = &vout;

            if (i < endVertex)
            {

                // 1. Execute FS/VS for a single SIMD.
                RDTSC_START(FEFetchShader);
                state.pfnFetchFunc(fetchInfo, vin);
                RDTSC_STOP(FEFetchShader, 0, 0);

                // forward fetch generated vertex IDs to the vertex shader
                vsContext.VertexID = fetchInfo.VertexID;

                // Setup active mask for vertex shader.
                vsContext.mask = GenerateMask(endVertex - i);

                // forward cut mask to the PA
                if (IsIndexedT::value)
                {
                    *pvCutIndices = _simd_movemask_ps(_simd_castsi_ps(fetchInfo.CutMask));
                }

                UPDATE_STAT(IaVertices, GetNumInvocations(i, endVertex));

#if KNOB_ENABLE_TOSS_POINTS
                if (!KNOB_TOSS_FETCH)
#endif
                {
                    RDTSC_START(FEVertexShader);
                    state.pfnVertexFunc(GetPrivateState(pDC), &vsContext);
                    RDTSC_STOP(FEVertexShader, 0, 0);

                    UPDATE_STAT(VsInvocations, GetNumInvocations(i, endVertex));
                }
            }

            // 2. Assemble primitives given the last two SIMD.
            do
            {
                simdvector prim[MAX_NUM_VERTS_PER_PRIM];
                // PaAssemble returns false if there is not enough verts to assemble.
                RDTSC_START(FEPAAssemble);
                bool assemble = pa.Assemble(VERTEX_POSITION_SLOT, prim);
                RDTSC_STOP(FEPAAssemble, 1, 0);

#if KNOB_ENABLE_TOSS_POINTS
                if (!KNOB_TOSS_FETCH)
#endif
                {
#if KNOB_ENABLE_TOSS_POINTS
                    if (!KNOB_TOSS_VS)
#endif
                    {
                        if (assemble)
                        {
                            UPDATE_STAT(IaPrimitives, pa.NumPrims());

                            // Dispatch to the first enabled downstream stage;
                            // tessellation takes precedence over GS.
                            if (HasTessellationT::value)
                            {
                                TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>(
                                    pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, pa.GetPrimID(work.startPrimID));
                            }
                            else if (HasGeometryShaderT::value)
                            {
                                GeometryShaderStage<HasStreamOutT, HasRastT>(
                                    pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, pa.GetPrimID(work.startPrimID));
                            }
                            else
                            {
                                // If streamout is enabled then stream vertices out to memory.
                                if (HasStreamOutT::value)
                                {
                                    StreamOut(pDC, pa, workerId, pSoPrimData, 0);
                                }

                                if (HasRastT::value)
                                {
                                    SWR_ASSERT(pDC->pState->pfnProcessPrims);
                                    pDC->pState->pfnProcessPrims(pDC, pa, workerId, prim,
                                        GenMask(pa.NumPrims()), pa.GetPrimID(work.startPrimID));
                                }
                            }
                        }
                    }
                }
            } while (pa.NextPrim());

            // Advance to the next SIMD-wide batch of vertices.
            i += KNOB_SIMD_WIDTH;
            if (IsIndexedT::value)
            {
                fetchInfo.pIndices = (int*)((uint8_t*)fetchInfo.pIndices + KNOB_SIMD_WIDTH * indexSize);
            }
            else
            {
                vIndex = _simd_add_epi32(vIndex, _simd_set1_epi32(KNOB_SIMD_WIDTH));
            }
        }
        pa.Reset();
    }

    // NOTE(review): numPrims only exists under KNOB_ENABLE_RDTSC; presumably
    // RDTSC_STOP expands to nothing otherwise — confirm in rdtsc_core.h.
    RDTSC_STOP(FEProcessDraw, numPrims * work.numInstances, pDC->drawId);
}
1421
//////////////////////////////////////////////////////////////////////////
/// @brief Functor used with TemplateArgUnroller to map runtime bool flags
///        onto the matching ProcessDraw template instantiation.
struct FEDrawChooser
{
    typedef PFN_FE_WORK_FUNC FuncType;

    // Returns the ProcessDraw instantiation for the given pack of
    // compile-time flags (indexed / tess / GS / SO / raster).
    template <typename... ArgsB>
    static FuncType GetFunc()
    {
        return ProcessDraw<ArgsB...>;
    }
};
1432
1433
1434 // Selector for correct templated Draw front-end function
PFN_FE_WORK_FUNC GetProcessDrawFunc(
    bool IsIndexed,
    bool HasTessellation,
    bool HasGeometryShader,
    bool HasStreamOut,
    bool HasRasterization)
{
    // Translate the runtime flags into the matching compile-time
    // ProcessDraw instantiation via FEDrawChooser.
    return TemplateArgUnroller<FEDrawChooser>::GetFunc(IsIndexed, HasTessellation, HasGeometryShader, HasStreamOut, HasRasterization);
}
1444
1445
1446 //////////////////////////////////////////////////////////////////////////
1447 /// @brief Processes attributes for the backend based on linkage mask and
1448 /// linkage map. Essentially just doing an SOA->AOS conversion and pack.
1449 /// @param pDC - Draw context
1450 /// @param pa - Primitive Assembly state
1451 /// @param linkageMask - Specifies which VS outputs are routed to PS.
1452 /// @param pLinkageMap - maps VS attribute slot to PS slot
1453 /// @param triIndex - Triangle to process attributes for
1454 /// @param pBuffer - Output result
template<uint32_t NumVerts>
INLINE void ProcessAttributes(
    DRAW_CONTEXT *pDC,
    PA_STATE&pa,
    uint32_t linkageMask,
    const uint8_t* pLinkageMap,
    uint32_t triIndex,
    float *pBuffer)
{
    DWORD slot = 0;
    uint32_t mapIdx = 0;
    LONG constantInterpMask = pDC->pState->state.backendState.constantInterpolationMask;
    const uint32_t provokingVertex = pDC->pState->state.frontendState.topologyProvokingVertex;

    // Walk each set bit of linkageMask: one 4-wide attribute per PS input.
    while (_BitScanForward(&slot, linkageMask))
    {
        linkageMask &= ~(1 << slot); // done with this bit.

        // compute absolute slot in vertex attrib array
        uint32_t inputSlot = VERTEX_ATTRIB_START_SLOT + pLinkageMap[mapIdx];

        __m128 attrib[3]; // triangle attribs (always 4 wide)
        pa.AssembleSingle(inputSlot, triIndex, attrib);

        if (_bittest(&constantInterpMask, mapIdx))
        {
            // Flat-shaded attribute: replicate the provoking vertex's value
            // to every vertex slot.
            for (uint32_t i = 0; i < NumVerts; ++i)
            {
                _mm_store_ps(pBuffer, attrib[provokingVertex]);
                pBuffer += 4;
            }
        }
        else
        {
            // Interpolated attribute: store per-vertex values in order.
            for (uint32_t i = 0; i < NumVerts; ++i)
            {
                _mm_store_ps(pBuffer, attrib[i]);
                pBuffer += 4;
            }
        }

        // pad out the attrib buffer to 3 verts to ensure the triangle
        // interpolation code in the pixel shader works correctly for the
        // 3 topologies - point, line, tri. This effectively zeros out the
        // effect of the missing vertices in the triangle interpolation.
        for (uint32_t i = NumVerts; i < 3; ++i)
        {
            _mm_store_ps(pBuffer, attrib[NumVerts - 1]);
            pBuffer += 4;
        }

        mapIdx++;
    }
}
1509
1510 //////////////////////////////////////////////////////////////////////////
1511 /// @brief Processes enabled user clip distances. Loads the active clip
1512 /// distances from the PA, sets up barycentric equations, and
1513 /// stores the results to the output buffer
1514 /// @param pa - Primitive Assembly state
1515 /// @param primIndex - primitive index to process
1516 /// @param clipDistMask - mask of enabled clip distances
1517 /// @param pUserClipBuffer - buffer to store results
1518 template<uint32_t NumVerts>
1519 void ProcessUserClipDist(PA_STATE& pa, uint32_t primIndex, uint8_t clipDistMask, float* pUserClipBuffer)
1520 {
1521 DWORD clipDist;
1522 while (_BitScanForward(&clipDist, clipDistMask))
1523 {
1524 clipDistMask &= ~(1 << clipDist);
1525 uint32_t clipSlot = clipDist >> 2;
1526 uint32_t clipComp = clipDist & 0x3;
1527 uint32_t clipAttribSlot = clipSlot == 0 ?
1528 VERTEX_CLIPCULL_DIST_LO_SLOT : VERTEX_CLIPCULL_DIST_HI_SLOT;
1529
1530 __m128 primClipDist[3];
1531 pa.AssembleSingle(clipAttribSlot, primIndex, primClipDist);
1532
1533 float vertClipDist[NumVerts];
1534 for (uint32_t e = 0; e < NumVerts; ++e)
1535 {
1536 OSALIGNSIMD(float) aVertClipDist[4];
1537 _mm_store_ps(aVertClipDist, primClipDist[e]);
1538 vertClipDist[e] = aVertClipDist[clipComp];
1539 };
1540
1541 // setup plane equations for barycentric interpolation in the backend
1542 float baryCoeff[NumVerts];
1543 for (uint32_t e = 0; e < NumVerts - 1; ++e)
1544 {
1545 baryCoeff[e] = vertClipDist[e] - vertClipDist[NumVerts - 1];
1546 }
1547 baryCoeff[NumVerts - 1] = vertClipDist[NumVerts - 1];
1548
1549 for (uint32_t e = 0; e < NumVerts; ++e)
1550 {
1551 *(pUserClipBuffer++) = baryCoeff[e];
1552 }
1553 }
1554 }
1555
1556 //////////////////////////////////////////////////////////////////////////
1557 /// @brief Bin triangle primitives to macro tiles. Performs setup, clipping
1558 /// culling, viewport transform, etc.
1559 /// @param pDC - pointer to draw context.
1560 /// @param pa - The primitive assembly object.
1561 /// @param workerId - thread's worker id. Even thread has a unique id.
1562 /// @param tri - Contains triangle position data for SIMDs worth of triangles.
1563 /// @param primID - Primitive ID for each triangle.
void BinTriangles(
    DRAW_CONTEXT *pDC,
    PA_STATE& pa,
    uint32_t workerId,
    simdvector tri[3],
    uint32_t triMask,
    simdscalari primID)
{
    RDTSC_START(FEBinTriangles);

    const API_STATE& state = GetApiState(pDC);
    const SWR_RASTSTATE& rastState = state.rastState;
    const SWR_FRONTEND_STATE& feState = state.frontendState;
    const SWR_GS_STATE& gsState = state.gsState;

    simdscalar vRecipW0 = _simd_set1_ps(1.0f);
    simdscalar vRecipW1 = _simd_set1_ps(1.0f);
    simdscalar vRecipW2 = _simd_set1_ps(1.0f);

    if (!feState.vpTransformDisable)
    {
        // perspective divide
        vRecipW0 = _simd_div_ps(_simd_set1_ps(1.0f), tri[0].w);
        vRecipW1 = _simd_div_ps(_simd_set1_ps(1.0f), tri[1].w);
        vRecipW2 = _simd_div_ps(_simd_set1_ps(1.0f), tri[2].w);

        tri[0].v[0] = _simd_mul_ps(tri[0].v[0], vRecipW0);
        tri[1].v[0] = _simd_mul_ps(tri[1].v[0], vRecipW1);
        tri[2].v[0] = _simd_mul_ps(tri[2].v[0], vRecipW2);

        tri[0].v[1] = _simd_mul_ps(tri[0].v[1], vRecipW0);
        tri[1].v[1] = _simd_mul_ps(tri[1].v[1], vRecipW1);
        tri[2].v[1] = _simd_mul_ps(tri[2].v[1], vRecipW2);

        tri[0].v[2] = _simd_mul_ps(tri[0].v[2], vRecipW0);
        tri[1].v[2] = _simd_mul_ps(tri[1].v[2], vRecipW1);
        tri[2].v[2] = _simd_mul_ps(tri[2].v[2], vRecipW2);

        // viewport transform to screen coords
        viewportTransform<3>(tri, state.vpMatrix[0]);
    }

    // adjust for pixel center location
    simdscalar offset = g_pixelOffsets[rastState.pixelLocation];
    tri[0].x = _simd_add_ps(tri[0].x, offset);
    tri[0].y = _simd_add_ps(tri[0].y, offset);

    tri[1].x = _simd_add_ps(tri[1].x, offset);
    tri[1].y = _simd_add_ps(tri[1].y, offset);

    tri[2].x = _simd_add_ps(tri[2].x, offset);
    tri[2].y = _simd_add_ps(tri[2].y, offset);

    // convert to fixed point
    simdscalari vXi[3], vYi[3];
    vXi[0] = fpToFixedPointVertical(tri[0].x);
    vYi[0] = fpToFixedPointVertical(tri[0].y);
    vXi[1] = fpToFixedPointVertical(tri[1].x);
    vYi[1] = fpToFixedPointVertical(tri[1].y);
    vXi[2] = fpToFixedPointVertical(tri[2].x);
    vYi[2] = fpToFixedPointVertical(tri[2].y);

    // triangle setup
    simdscalari vAi[3], vBi[3];
    triangleSetupABIntVertical(vXi, vYi, vAi, vBi);

    // determinant
    simdscalari vDet[2];
    calcDeterminantIntVertical(vAi, vBi, vDet);

    // cull zero area
    int maskLo = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpeq_epi64(vDet[0], _simd_setzero_si())));
    int maskHi = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpeq_epi64(vDet[1], _simd_setzero_si())));

    int cullZeroAreaMask = maskLo | ((maskHi << KNOB_SIMD_WIDTH / 2));

    uint32_t origTriMask = triMask;
    triMask &= ~cullZeroAreaMask;

    // determine front winding tris
    // CW  +det
    // CCW -det
    maskLo = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpgt_epi64(vDet[0], _simd_setzero_si())));
    maskHi = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpgt_epi64(vDet[1], _simd_setzero_si())));
    int cwTriMask = maskLo | (maskHi << (KNOB_SIMD_WIDTH /2) );

    uint32_t frontWindingTris;
    if (rastState.frontWinding == SWR_FRONTWINDING_CW)
    {
        frontWindingTris = cwTriMask;
    }
    else
    {
        frontWindingTris = ~cwTriMask;
    }

    // cull
    uint32_t cullTris;
    switch ((SWR_CULLMODE)rastState.cullMode)
    {
    case SWR_CULLMODE_BOTH:  cullTris = 0xffffffff; break;
    case SWR_CULLMODE_NONE:  cullTris = 0x0; break;
    case SWR_CULLMODE_FRONT: cullTris = frontWindingTris; break;
    case SWR_CULLMODE_BACK:  cullTris = ~frontWindingTris; break;
    default: SWR_ASSERT(false, "Invalid cull mode: %d", rastState.cullMode); cullTris = 0x0; break;
    }

    triMask &= ~cullTris;

    if (origTriMask ^ triMask)
    {
        RDTSC_EVENT(FECullZeroAreaAndBackface, _mm_popcnt_u32(origTriMask ^ triMask), 0);
    }

    // compute per tri backface
    uint32_t frontFaceMask = frontWindingTris;

    uint32_t *pPrimID = (uint32_t *)&primID;
    DWORD triIndex = 0;

    if (!triMask)
    {
        // All triangles culled; skip straight to the timing stop.
        goto endBinTriangles;
    }

    // Calc bounding box of triangles
    simdBBox bbox;
    calcBoundingBoxIntVertical(vXi, vYi, bbox);

    // determine if triangle falls between pixel centers and discard
    // only discard for non-MSAA case
    // (left + 127) & ~255
    // (right + 128) & ~255
    // NOTE(review): the 127/128/~255 math implies x.8 fixed-point (256 units
    // per pixel) — consistent with the "1 ULP in x.8" comment below.
    if(rastState.sampleCount == SWR_MULTISAMPLE_1X)
    {
        origTriMask = triMask;

        int cullCenterMask;
        {
            simdscalari left = _simd_add_epi32(bbox.left, _simd_set1_epi32(127));
            left = _simd_and_si(left, _simd_set1_epi32(~255));
            simdscalari right = _simd_add_epi32(bbox.right, _simd_set1_epi32(128));
            right = _simd_and_si(right, _simd_set1_epi32(~255));

            simdscalari vMaskH = _simd_cmpeq_epi32(left, right);

            simdscalari top = _simd_add_epi32(bbox.top, _simd_set1_epi32(127));
            top = _simd_and_si(top, _simd_set1_epi32(~255));
            simdscalari bottom = _simd_add_epi32(bbox.bottom, _simd_set1_epi32(128));
            bottom = _simd_and_si(bottom, _simd_set1_epi32(~255));

            simdscalari vMaskV = _simd_cmpeq_epi32(top, bottom);
            vMaskV = _simd_or_si(vMaskH, vMaskV);
            cullCenterMask = _simd_movemask_ps(_simd_castsi_ps(vMaskV));
        }

        triMask &= ~cullCenterMask;

        if(origTriMask ^ triMask)
        {
            RDTSC_EVENT(FECullBetweenCenters, _mm_popcnt_u32(origTriMask ^ triMask), 0);
        }
    }

    // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since right/bottom edge is exclusive.
    bbox.left = _simd_max_epi32(bbox.left, _simd_set1_epi32(state.scissorInFixedPoint.left));
    bbox.top = _simd_max_epi32(bbox.top, _simd_set1_epi32(state.scissorInFixedPoint.top));
    bbox.right = _simd_min_epi32(_simd_sub_epi32(bbox.right, _simd_set1_epi32(1)), _simd_set1_epi32(state.scissorInFixedPoint.right));
    bbox.bottom = _simd_min_epi32(_simd_sub_epi32(bbox.bottom, _simd_set1_epi32(1)), _simd_set1_epi32(state.scissorInFixedPoint.bottom));

    // Cull tris completely outside scissor
    {
        simdscalari maskOutsideScissorX = _simd_cmpgt_epi32(bbox.left, bbox.right);
        simdscalari maskOutsideScissorY = _simd_cmpgt_epi32(bbox.top, bbox.bottom);
        simdscalari maskOutsideScissorXY = _simd_or_si(maskOutsideScissorX, maskOutsideScissorY);
        uint32_t maskOutsideScissor = _simd_movemask_ps(_simd_castsi_ps(maskOutsideScissorXY));
        triMask = triMask & ~maskOutsideScissor;
    }

    if (!triMask)
    {
        goto endBinTriangles;
    }

    // Convert triangle bbox to macrotile units.
    bbox.left = _simd_srai_epi32(bbox.left, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
    bbox.top = _simd_srai_epi32(bbox.top, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
    bbox.right = _simd_srai_epi32(bbox.right, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
    bbox.bottom = _simd_srai_epi32(bbox.bottom, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);

    // Spill per-triangle macrotile extents to scalar arrays for the bin loop.
    OSALIGNSIMD(uint32_t) aMTLeft[KNOB_SIMD_WIDTH], aMTRight[KNOB_SIMD_WIDTH], aMTTop[KNOB_SIMD_WIDTH], aMTBottom[KNOB_SIMD_WIDTH];
    _simd_store_si((simdscalari*)aMTLeft, bbox.left);
    _simd_store_si((simdscalari*)aMTRight, bbox.right);
    _simd_store_si((simdscalari*)aMTTop, bbox.top);
    _simd_store_si((simdscalari*)aMTBottom, bbox.bottom);

    // transpose verts needed for backend
    /// @todo modify BE to take non-transformed verts
    __m128 vHorizX[8], vHorizY[8], vHorizZ[8], vHorizW[8];
    vTranspose3x8(vHorizX, tri[0].x, tri[1].x, tri[2].x);
    vTranspose3x8(vHorizY, tri[0].y, tri[1].y, tri[2].y);
    vTranspose3x8(vHorizZ, tri[0].z, tri[1].z, tri[2].z);
    vTranspose3x8(vHorizW, vRecipW0, vRecipW1, vRecipW2);

    // store render target array index
    OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH];
    if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex)
    {
        simdvector vRtai[3];
        pa.Assemble(VERTEX_RTAI_SLOT, vRtai);
        simdscalari vRtaii;
        vRtaii = _simd_castps_si(vRtai[0].x);
        _simd_store_si((simdscalari*)aRTAI, vRtaii);
    }
    else
    {
        _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si());
    }

    // scan remaining valid triangles and bin each separately
    while (_BitScanForward(&triIndex, triMask))
    {
        uint32_t linkageCount = state.linkageCount;
        uint32_t linkageMask = state.linkageMask;
        uint32_t numScalarAttribs = linkageCount * 4;

        BE_WORK work;
        work.type = DRAW;

        TRIANGLE_WORK_DESC &desc = work.desc.tri;

        desc.triFlags.frontFacing = state.forceFront ? 1 : ((frontFaceMask >> triIndex) & 1);
        desc.triFlags.primID = pPrimID[triIndex];
        desc.triFlags.renderTargetArrayIndex = aRTAI[triIndex];

        if(rastState.samplePattern == SWR_MSAA_STANDARD_PATTERN)
        {
            work.pfnWork = gRasterizerTable[rastState.scissorEnable][rastState.sampleCount];
        }
        else
        {
            // for center sample pattern, all samples are at pixel center; calculate coverage
            // once at center and broadcast the results in the backend
            work.pfnWork = gRasterizerTable[rastState.scissorEnable][SWR_MULTISAMPLE_1X];
        }

        auto pArena = pDC->pArena;
        SWR_ASSERT(pArena != nullptr);

        // store active attribs
        float *pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
        desc.pAttribs = pAttribs;
        desc.numAttribs = linkageCount;
        ProcessAttributes<3>(pDC, pa, linkageMask, state.linkageMap, triIndex, desc.pAttribs);

        // store triangle vertex data
        desc.pTriBuffer = (float*)pArena->AllocAligned(4 * 4 * sizeof(float), 16);

        _mm_store_ps(&desc.pTriBuffer[0], vHorizX[triIndex]);
        _mm_store_ps(&desc.pTriBuffer[4], vHorizY[triIndex]);
        _mm_store_ps(&desc.pTriBuffer[8], vHorizZ[triIndex]);
        _mm_store_ps(&desc.pTriBuffer[12], vHorizW[triIndex]);

        // store user clip distances
        if (rastState.clipDistanceMask)
        {
            uint32_t numClipDist = _mm_popcnt_u32(rastState.clipDistanceMask);
            desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 3 * sizeof(float));
            ProcessUserClipDist<3>(pa, triIndex, rastState.clipDistanceMask, desc.pUserClipBuffer);
        }

        // Enqueue the triangle into every macrotile its bbox touches.
        MacroTileMgr *pTileMgr = pDC->pTileMgr;
        for (uint32_t y = aMTTop[triIndex]; y <= aMTBottom[triIndex]; ++y)
        {
            for (uint32_t x = aMTLeft[triIndex]; x <= aMTRight[triIndex]; ++x)
            {
#if KNOB_ENABLE_TOSS_POINTS
                if (!KNOB_TOSS_SETUP_TRIS)
#endif
                {
                    pTileMgr->enqueue(x, y, &work);
                }
            }
        }
        triMask &= ~(1 << triIndex);
    }

endBinTriangles:
    RDTSC_STOP(FEBinTriangles, 1, 0);
}
1857
1858
1859
//////////////////////////////////////////////////////////////////////////
/// @brief Bin SIMD points to the backend.
///        Two paths:
///        - "simple" points (1-pixel, see CanUseSimplePoints): binned to a
///          single macrotile and rasterized via RasterizeSimplePoint, with
///          the pixel's raster-tile-relative x,y packed into coverageMask.
///        - sized points: bounding box is bloated by the per-vertex or
///          state point size, possibly spanning multiple macrotiles, and
///          rasterized via RasterizeTriPoint.
/// @param pDC - pointer to draw context.
/// @param pa - The primitive assembly object.
/// @param workerId - thread's worker id. Even thread has a unique id.
/// @param prim - Contains point position data for SIMDs worth of points;
///               only prim[0] is used (points have a single vertex).
/// @param primMask - bit mask of valid SIMD lanes, one bit per point.
/// @param primID - Primitive ID for each point.
void BinPoints(
    DRAW_CONTEXT *pDC,
    PA_STATE& pa,
    uint32_t workerId,
    simdvector prim[3],
    uint32_t primMask,
    simdscalari primID)
{
    RDTSC_START(FEBinPoints);

    // points use only the first vertex of the primitive
    simdvector& primVerts = prim[0];

    const API_STATE& state = GetApiState(pDC);
    const SWR_FRONTEND_STATE& feState = state.frontendState;
    const SWR_GS_STATE& gsState = state.gsState;
    const SWR_RASTSTATE& rastState = state.rastState;

    if (!feState.vpTransformDisable)
    {
        // perspective divide
        simdscalar vRecipW0 = _simd_div_ps(_simd_set1_ps(1.0f), primVerts.w);
        primVerts.x = _simd_mul_ps(primVerts.x, vRecipW0);
        primVerts.y = _simd_mul_ps(primVerts.y, vRecipW0);
        primVerts.z = _simd_mul_ps(primVerts.z, vRecipW0);

        // viewport transform to screen coords
        viewportTransform<1>(&primVerts, state.vpMatrix[0]);
    }

    // adjust for pixel center location (upper-left vs center sample position)
    simdscalar offset = g_pixelOffsets[rastState.pixelLocation];
    primVerts.x = _simd_add_ps(primVerts.x, offset);
    primVerts.y = _simd_add_ps(primVerts.y, offset);

    // convert screen coords to fixed point
    simdscalari vXi, vYi;
    vXi = fpToFixedPointVertical(primVerts.x);
    vYi = fpToFixedPointVertical(primVerts.y);

    if (CanUseSimplePoints(pDC))
    {
        // adjust for top-left rule
        vXi = _simd_sub_epi32(vXi, _simd_set1_epi32(1));
        vYi = _simd_sub_epi32(vYi, _simd_set1_epi32(1));

        // cull points off the top-left edge of the viewport: after the
        // adjustment a negative coordinate sets the sign bit, which
        // movemask extracts per lane
        primMask &= ~_simd_movemask_ps(_simd_castsi_ps(vXi));
        primMask &= ~_simd_movemask_ps(_simd_castsi_ps(vYi));

        // compute macro tile coordinates
        simdscalari macroX = _simd_srai_epi32(vXi, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
        simdscalari macroY = _simd_srai_epi32(vYi, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);

        OSALIGNSIMD(uint32_t) aMacroX[KNOB_SIMD_WIDTH], aMacroY[KNOB_SIMD_WIDTH];
        _simd_store_si((simdscalari*)aMacroX, macroX);
        _simd_store_si((simdscalari*)aMacroY, macroY);

        // compute raster tile coordinates
        simdscalari rasterX = _simd_srai_epi32(vXi, KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT);
        simdscalari rasterY = _simd_srai_epi32(vYi, KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT);

        // compute raster tile relative x,y for coverage mask
        simdscalari tileAlignedX = _simd_slli_epi32(rasterX, KNOB_TILE_X_DIM_SHIFT);
        simdscalari tileAlignedY = _simd_slli_epi32(rasterY, KNOB_TILE_Y_DIM_SHIFT);

        simdscalari tileRelativeX = _simd_sub_epi32(_simd_srai_epi32(vXi, FIXED_POINT_SHIFT), tileAlignedX);
        simdscalari tileRelativeY = _simd_sub_epi32(_simd_srai_epi32(vYi, FIXED_POINT_SHIFT), tileAlignedY);

        // spill SIMD lanes to scalar arrays for the per-point binning loop
        OSALIGNSIMD(uint32_t) aTileRelativeX[KNOB_SIMD_WIDTH];
        OSALIGNSIMD(uint32_t) aTileRelativeY[KNOB_SIMD_WIDTH];
        _simd_store_si((simdscalari*)aTileRelativeX, tileRelativeX);
        _simd_store_si((simdscalari*)aTileRelativeY, tileRelativeY);

        OSALIGNSIMD(uint32_t) aTileAlignedX[KNOB_SIMD_WIDTH];
        OSALIGNSIMD(uint32_t) aTileAlignedY[KNOB_SIMD_WIDTH];
        _simd_store_si((simdscalari*)aTileAlignedX, tileAlignedX);
        _simd_store_si((simdscalari*)aTileAlignedY, tileAlignedY);

        OSALIGNSIMD(float) aZ[KNOB_SIMD_WIDTH];
        _simd_store_ps((float*)aZ, primVerts.z);

        // store render target array index; only meaningful when a GS emits it
        OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH];
        if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex)
        {
            simdvector vRtai;
            pa.Assemble(VERTEX_RTAI_SLOT, &vRtai);
            simdscalari vRtaii = _simd_castps_si(vRtai.x);
            _simd_store_si((simdscalari*)aRTAI, vRtaii);
        }
        else
        {
            _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si());
        }

        uint32_t *pPrimID = (uint32_t *)&primID;
        DWORD primIndex = 0;
        // scan remaining valid points and bin each separately
        while (_BitScanForward(&primIndex, primMask))
        {
            uint32_t linkageCount = state.linkageCount;
            uint32_t linkageMask = state.linkageMask;

            // 4 scalar components per linked attribute
            uint32_t numScalarAttribs = linkageCount * 4;

            BE_WORK work;
            work.type = DRAW;

            // points reuse the triangle work descriptor
            TRIANGLE_WORK_DESC &desc = work.desc.tri;

            // points are always front facing
            desc.triFlags.frontFacing = 1;
            desc.triFlags.primID = pPrimID[primIndex];
            desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];

            work.pfnWork = RasterizeSimplePoint;

            auto pArena = pDC->pArena;
            SWR_ASSERT(pArena != nullptr);

            // store attributes (3 verts worth, as expected by the backend)
            float *pAttribs = (float*)pArena->AllocAligned(3 * numScalarAttribs * sizeof(float), 16);
            desc.pAttribs = pAttribs;
            desc.numAttribs = linkageCount;

            ProcessAttributes<1>(pDC, pa, linkageMask, state.linkageMap, primIndex, pAttribs);

            // store raster tile aligned x, y, perspective correct z
            float *pTriBuffer = (float*)pArena->AllocAligned(4 * sizeof(float), 16);
            desc.pTriBuffer = pTriBuffer;
            // x and y are stored as raw integer bits in the float buffer
            *(uint32_t*)pTriBuffer++ = aTileAlignedX[primIndex];
            *(uint32_t*)pTriBuffer++ = aTileAlignedY[primIndex];
            *pTriBuffer = aZ[primIndex];

            uint32_t tX = aTileRelativeX[primIndex];
            uint32_t tY = aTileRelativeY[primIndex];

            // pack the relative x,y into the coverageMask, the rasterizer will
            // generate the true coverage mask from it
            work.desc.tri.triFlags.coverageMask = tX | (tY << 4);

            // bin it — a simple point always lands in exactly one macrotile
            MacroTileMgr *pTileMgr = pDC->pTileMgr;
#if KNOB_ENABLE_TOSS_POINTS
            if (!KNOB_TOSS_SETUP_TRIS)
#endif
            {
                pTileMgr->enqueue(aMacroX[primIndex], aMacroY[primIndex], &work);
            }
            primMask &= ~(1 << primIndex);
        }
    }
    else
    {
        // non simple points need to be potentially binned to multiple macro tiles
        simdscalar vPointSize;
        if (rastState.pointParam)
        {
            // per-vertex point size from the point-size vertex slot
            simdvector size[3];
            pa.Assemble(VERTEX_POINT_SIZE_SLOT, size);
            vPointSize = size[0].x;
        }
        else
        {
            // constant point size from rasterizer state
            vPointSize = _simd_set1_ps(rastState.pointSize);
        }

        // bloat point to bbox: start from a degenerate box at the point
        simdBBox bbox;
        bbox.left = bbox.right = vXi;
        bbox.top = bbox.bottom = vYi;

        // expand by half the point size in each direction (fixed point)
        simdscalar vHalfWidth = _simd_mul_ps(vPointSize, _simd_set1_ps(0.5f));
        simdscalari vHalfWidthi = fpToFixedPointVertical(vHalfWidth);
        bbox.left = _simd_sub_epi32(bbox.left, vHalfWidthi);
        bbox.right = _simd_add_epi32(bbox.right, vHalfWidthi);
        bbox.top = _simd_sub_epi32(bbox.top, vHalfWidthi);
        bbox.bottom = _simd_add_epi32(bbox.bottom, vHalfWidthi);

        // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since right/bottom edge is exclusive.
        bbox.left = _simd_max_epi32(bbox.left, _simd_set1_epi32(state.scissorInFixedPoint.left));
        bbox.top = _simd_max_epi32(bbox.top, _simd_set1_epi32(state.scissorInFixedPoint.top));
        bbox.right = _simd_min_epi32(_simd_sub_epi32(bbox.right, _simd_set1_epi32(1)), _simd_set1_epi32(state.scissorInFixedPoint.right));
        bbox.bottom = _simd_min_epi32(_simd_sub_epi32(bbox.bottom, _simd_set1_epi32(1)), _simd_set1_epi32(state.scissorInFixedPoint.bottom));

        // Cull bloated points completely outside scissor (box became inverted)
        simdscalari maskOutsideScissorX = _simd_cmpgt_epi32(bbox.left, bbox.right);
        simdscalari maskOutsideScissorY = _simd_cmpgt_epi32(bbox.top, bbox.bottom);
        simdscalari maskOutsideScissorXY = _simd_or_si(maskOutsideScissorX, maskOutsideScissorY);
        uint32_t maskOutsideScissor = _simd_movemask_ps(_simd_castsi_ps(maskOutsideScissorXY));
        primMask = primMask & ~maskOutsideScissor;

        // Convert bbox to macrotile units.
        bbox.left = _simd_srai_epi32(bbox.left, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
        bbox.top = _simd_srai_epi32(bbox.top, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
        bbox.right = _simd_srai_epi32(bbox.right, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
        bbox.bottom = _simd_srai_epi32(bbox.bottom, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);

        // spill macrotile extents to scalar arrays for the binning loop
        OSALIGNSIMD(uint32_t) aMTLeft[KNOB_SIMD_WIDTH], aMTRight[KNOB_SIMD_WIDTH], aMTTop[KNOB_SIMD_WIDTH], aMTBottom[KNOB_SIMD_WIDTH];
        _simd_store_si((simdscalari*)aMTLeft, bbox.left);
        _simd_store_si((simdscalari*)aMTRight, bbox.right);
        _simd_store_si((simdscalari*)aMTTop, bbox.top);
        _simd_store_si((simdscalari*)aMTBottom, bbox.bottom);

        // store render target array index; only meaningful when a GS emits it
        OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH];
        if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex)
        {
            simdvector vRtai[2];
            pa.Assemble(VERTEX_RTAI_SLOT, vRtai);
            simdscalari vRtaii = _simd_castps_si(vRtai[0].x);
            _simd_store_si((simdscalari*)aRTAI, vRtaii);
        }
        else
        {
            _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si());
        }

        OSALIGNSIMD(float) aPointSize[KNOB_SIMD_WIDTH];
        _simd_store_ps((float*)aPointSize, vPointSize);

        uint32_t *pPrimID = (uint32_t *)&primID;

        OSALIGNSIMD(float) aPrimVertsX[KNOB_SIMD_WIDTH];
        OSALIGNSIMD(float) aPrimVertsY[KNOB_SIMD_WIDTH];
        OSALIGNSIMD(float) aPrimVertsZ[KNOB_SIMD_WIDTH];

        _simd_store_ps((float*)aPrimVertsX, primVerts.x);
        _simd_store_ps((float*)aPrimVertsY, primVerts.y);
        _simd_store_ps((float*)aPrimVertsZ, primVerts.z);

        // scan remaining valid prims and bin each separately
        DWORD primIndex;
        while (_BitScanForward(&primIndex, primMask))
        {
            uint32_t linkageCount = state.linkageCount;
            uint32_t linkageMask = state.linkageMask;
            uint32_t numScalarAttribs = linkageCount * 4;

            BE_WORK work;
            work.type = DRAW;

            TRIANGLE_WORK_DESC &desc = work.desc.tri;

            // points are always front facing
            desc.triFlags.frontFacing = 1;
            desc.triFlags.primID = pPrimID[primIndex];
            desc.triFlags.pointSize = aPointSize[primIndex];
            desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];

            // sized points are rasterized as two triangles in the backend
            work.pfnWork = RasterizeTriPoint;

            auto pArena = pDC->pArena;
            SWR_ASSERT(pArena != nullptr);

            // store active attribs
            desc.pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
            desc.numAttribs = linkageCount;
            ProcessAttributes<1>(pDC, pa, linkageMask, state.linkageMap, primIndex, desc.pAttribs);

            // store point vertex data (screen-space x, y, z)
            float *pTriBuffer = (float*)pArena->AllocAligned(4 * sizeof(float), 16);
            desc.pTriBuffer = pTriBuffer;
            *pTriBuffer++ = aPrimVertsX[primIndex];
            *pTriBuffer++ = aPrimVertsY[primIndex];
            *pTriBuffer = aPrimVertsZ[primIndex];

            // store user clip distances
            if (rastState.clipDistanceMask)
            {
                uint32_t numClipDist = _mm_popcnt_u32(rastState.clipDistanceMask);
                desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 2 * sizeof(float));
                ProcessUserClipDist<2>(pa, primIndex, rastState.clipDistanceMask, desc.pUserClipBuffer);
            }

            // enqueue work for each macrotile the point's bbox overlaps
            MacroTileMgr *pTileMgr = pDC->pTileMgr;
            for (uint32_t y = aMTTop[primIndex]; y <= aMTBottom[primIndex]; ++y)
            {
                for (uint32_t x = aMTLeft[primIndex]; x <= aMTRight[primIndex]; ++x)
                {
#if KNOB_ENABLE_TOSS_POINTS
                    if (!KNOB_TOSS_SETUP_TRIS)
#endif
                    {
                        pTileMgr->enqueue(x, y, &work);
                    }
                }
            }

            primMask &= ~(1 << primIndex);
        }
    }




    RDTSC_STOP(FEBinPoints, 1, 0);
}
2164
//////////////////////////////////////////////////////////////////////////
/// @brief Bin SIMD lines to the backend. Computes each line's screen-space
///        bounding box (bloated by half the line width along the minor
///        axis), intersects it with the scissor, and enqueues RasterizeLine
///        work for every macrotile the box overlaps.
/// @param pDC - pointer to draw context.
/// @param pa - The primitive assembly object.
/// @param workerId - thread's worker id. Even thread has a unique id.
/// @param prim - Contains end-point position data for SIMDs worth of lines
///               (prim[0] and prim[1] are the two end points).
/// @param primMask - bit mask of valid SIMD lanes, one bit per line.
/// @param primID - Primitive ID for each line.
void BinLines(
    DRAW_CONTEXT *pDC,
    PA_STATE& pa,
    uint32_t workerId,
    simdvector prim[],
    uint32_t primMask,
    simdscalari primID)
{
    RDTSC_START(FEBinLines);

    const API_STATE& state = GetApiState(pDC);
    const SWR_RASTSTATE& rastState = state.rastState;
    const SWR_FRONTEND_STATE& feState = state.frontendState;
    const SWR_GS_STATE& gsState = state.gsState;

    // 1/w per end point; remains 1.0 when the viewport transform is disabled
    simdscalar vRecipW0 = _simd_set1_ps(1.0f);
    simdscalar vRecipW1 = _simd_set1_ps(1.0f);

    if (!feState.vpTransformDisable)
    {
        // perspective divide of x/y/z for both end points
        vRecipW0 = _simd_div_ps(_simd_set1_ps(1.0f), prim[0].w);
        vRecipW1 = _simd_div_ps(_simd_set1_ps(1.0f), prim[1].w);

        prim[0].v[0] = _simd_mul_ps(prim[0].v[0], vRecipW0);
        prim[1].v[0] = _simd_mul_ps(prim[1].v[0], vRecipW1);

        prim[0].v[1] = _simd_mul_ps(prim[0].v[1], vRecipW0);
        prim[1].v[1] = _simd_mul_ps(prim[1].v[1], vRecipW1);

        prim[0].v[2] = _simd_mul_ps(prim[0].v[2], vRecipW0);
        prim[1].v[2] = _simd_mul_ps(prim[1].v[2], vRecipW1);

        // viewport transform to screen coords
        viewportTransform<2>(prim, state.vpMatrix[0]);
    }

    // adjust for pixel center location (upper-left vs center sample position)
    simdscalar offset = g_pixelOffsets[rastState.pixelLocation];
    prim[0].x = _simd_add_ps(prim[0].x, offset);
    prim[0].y = _simd_add_ps(prim[0].y, offset);

    prim[1].x = _simd_add_ps(prim[1].x, offset);
    prim[1].y = _simd_add_ps(prim[1].y, offset);

    // convert screen coords to fixed point
    simdscalari vXi[2], vYi[2];
    vXi[0] = fpToFixedPointVertical(prim[0].x);
    vYi[0] = fpToFixedPointVertical(prim[0].y);
    vXi[1] = fpToFixedPointVertical(prim[1].x);
    vYi[1] = fpToFixedPointVertical(prim[1].y);

    // compute x-major vs y-major mask (y-major when |dy| > |dx|)
    simdscalari xLength = _simd_abs_epi32(_simd_sub_epi32(vXi[0], vXi[1]));
    simdscalari yLength = _simd_abs_epi32(_simd_sub_epi32(vYi[0], vYi[1]));
    simdscalar vYmajorMask = _simd_castsi_ps(_simd_cmpgt_epi32(yLength, xLength));
    uint32_t yMajorMask = _simd_movemask_ps(vYmajorMask);

    // cull zero-length lines (both end points at the same fixed-point coord)
    simdscalari vZeroLengthMask = _simd_cmpeq_epi32(xLength, _simd_setzero_si());
    vZeroLengthMask = _simd_and_si(vZeroLengthMask, _simd_cmpeq_epi32(yLength, _simd_setzero_si()));

    primMask &= ~_simd_movemask_ps(_simd_castsi_ps(vZeroLengthMask));

    uint32_t *pPrimID = (uint32_t *)&primID;

    // placeholder third "vertex" for the 3-wide transpose below
    simdscalar vUnused = _simd_setzero_ps();

    // Calc bounding box of lines
    simdBBox bbox;
    bbox.left = _simd_min_epi32(vXi[0], vXi[1]);
    bbox.right = _simd_max_epi32(vXi[0], vXi[1]);
    bbox.top = _simd_min_epi32(vYi[0], vYi[1]);
    bbox.bottom = _simd_max_epi32(vYi[0], vYi[1]);

    // bloat bbox by line width along minor axis
    simdscalar vHalfWidth = _simd_set1_ps(rastState.lineWidth / 2.0f);
    simdscalari vHalfWidthi = fpToFixedPointVertical(vHalfWidth);
    simdBBox bloatBox;
    bloatBox.left = _simd_sub_epi32(bbox.left, vHalfWidthi);
    bloatBox.right = _simd_add_epi32(bbox.right, vHalfWidthi);
    bloatBox.top = _simd_sub_epi32(bbox.top, vHalfWidthi);
    bloatBox.bottom = _simd_add_epi32(bbox.bottom, vHalfWidthi);

    // y-major lines take the bloated horizontal extent, x-major lines the
    // bloated vertical extent
    bbox.left = _simd_blendv_epi32(bbox.left, bloatBox.left, vYmajorMask);
    bbox.right = _simd_blendv_epi32(bbox.right, bloatBox.right, vYmajorMask);
    bbox.top = _simd_blendv_epi32(bloatBox.top, bbox.top, vYmajorMask);
    bbox.bottom = _simd_blendv_epi32(bloatBox.bottom, bbox.bottom, vYmajorMask);

    // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since right/bottom edge is exclusive.
    bbox.left = _simd_max_epi32(bbox.left, _simd_set1_epi32(state.scissorInFixedPoint.left));
    bbox.top = _simd_max_epi32(bbox.top, _simd_set1_epi32(state.scissorInFixedPoint.top));
    bbox.right = _simd_min_epi32(_simd_sub_epi32(bbox.right, _simd_set1_epi32(1)), _simd_set1_epi32(state.scissorInFixedPoint.right));
    bbox.bottom = _simd_min_epi32(_simd_sub_epi32(bbox.bottom, _simd_set1_epi32(1)), _simd_set1_epi32(state.scissorInFixedPoint.bottom));

    // Cull prims completely outside scissor (box became inverted)
    {
        simdscalari maskOutsideScissorX = _simd_cmpgt_epi32(bbox.left, bbox.right);
        simdscalari maskOutsideScissorY = _simd_cmpgt_epi32(bbox.top, bbox.bottom);
        simdscalari maskOutsideScissorXY = _simd_or_si(maskOutsideScissorX, maskOutsideScissorY);
        uint32_t maskOutsideScissor = _simd_movemask_ps(_simd_castsi_ps(maskOutsideScissorXY));
        primMask = primMask & ~maskOutsideScissor;
    }

    if (!primMask)
    {
        // everything culled; still need the RDTSC_STOP bookkeeping
        goto endBinLines;
    }

    // Convert line bbox to macrotile units.
    bbox.left = _simd_srai_epi32(bbox.left, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
    bbox.top = _simd_srai_epi32(bbox.top, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
    bbox.right = _simd_srai_epi32(bbox.right, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
    bbox.bottom = _simd_srai_epi32(bbox.bottom, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);

    // spill macrotile extents to scalar arrays for the binning loop
    OSALIGNSIMD(uint32_t) aMTLeft[KNOB_SIMD_WIDTH], aMTRight[KNOB_SIMD_WIDTH], aMTTop[KNOB_SIMD_WIDTH], aMTBottom[KNOB_SIMD_WIDTH];
    _simd_store_si((simdscalari*)aMTLeft, bbox.left);
    _simd_store_si((simdscalari*)aMTRight, bbox.right);
    _simd_store_si((simdscalari*)aMTTop, bbox.top);
    _simd_store_si((simdscalari*)aMTBottom, bbox.bottom);

    // transpose verts needed for backend
    /// @todo modify BE to take non-transformed verts
    __m128 vHorizX[8], vHorizY[8], vHorizZ[8], vHorizW[8];
    vTranspose3x8(vHorizX, prim[0].x, prim[1].x, vUnused);
    vTranspose3x8(vHorizY, prim[0].y, prim[1].y, vUnused);
    vTranspose3x8(vHorizZ, prim[0].z, prim[1].z, vUnused);
    vTranspose3x8(vHorizW, vRecipW0, vRecipW1, vUnused);

    // store render target array index; only meaningful when a GS emits it
    OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH];
    if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex)
    {
        simdvector vRtai[2];
        pa.Assemble(VERTEX_RTAI_SLOT, vRtai);
        simdscalari vRtaii = _simd_castps_si(vRtai[0].x);
        _simd_store_si((simdscalari*)aRTAI, vRtaii);
    }
    else
    {
        _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si());
    }

    // scan remaining valid prims and bin each separately
    DWORD primIndex;
    while (_BitScanForward(&primIndex, primMask))
    {
        uint32_t linkageCount = state.linkageCount;
        uint32_t linkageMask = state.linkageMask;
        // 4 scalar components per linked attribute
        uint32_t numScalarAttribs = linkageCount * 4;

        BE_WORK work;
        work.type = DRAW;

        // lines reuse the triangle work descriptor
        TRIANGLE_WORK_DESC &desc = work.desc.tri;

        // lines are always front facing
        desc.triFlags.frontFacing = 1;
        desc.triFlags.primID = pPrimID[primIndex];
        desc.triFlags.yMajor = (yMajorMask >> primIndex) & 1;
        desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];

        work.pfnWork = RasterizeLine;

        auto pArena = pDC->pArena;
        SWR_ASSERT(pArena != nullptr);

        // store active attribs
        desc.pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
        desc.numAttribs = linkageCount;
        ProcessAttributes<2>(pDC, pa, linkageMask, state.linkageMap, primIndex, desc.pAttribs);

        // store line vertex data (transposed x, y, z, 1/w for this lane)
        desc.pTriBuffer = (float*)pArena->AllocAligned(4 * 4 * sizeof(float), 16);
        _mm_store_ps(&desc.pTriBuffer[0], vHorizX[primIndex]);
        _mm_store_ps(&desc.pTriBuffer[4], vHorizY[primIndex]);
        _mm_store_ps(&desc.pTriBuffer[8], vHorizZ[primIndex]);
        _mm_store_ps(&desc.pTriBuffer[12], vHorizW[primIndex]);

        // store user clip distances
        if (rastState.clipDistanceMask)
        {
            uint32_t numClipDist = _mm_popcnt_u32(rastState.clipDistanceMask);
            desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 2 * sizeof(float));
            ProcessUserClipDist<2>(pa, primIndex, rastState.clipDistanceMask, desc.pUserClipBuffer);
        }

        // enqueue work for each macrotile the line's bbox overlaps
        MacroTileMgr *pTileMgr = pDC->pTileMgr;
        for (uint32_t y = aMTTop[primIndex]; y <= aMTBottom[primIndex]; ++y)
        {
            for (uint32_t x = aMTLeft[primIndex]; x <= aMTRight[primIndex]; ++x)
            {
#if KNOB_ENABLE_TOSS_POINTS
                if (!KNOB_TOSS_SETUP_TRIS)
#endif
                {
                    pTileMgr->enqueue(x, y, &work);
                }
            }
        }

        primMask &= ~(1 << primIndex);
    }

endBinLines:

    RDTSC_STOP(FEBinLines, 1, 0);
}