4bb3236a638550a771a2ba9d33b5d1cf17d3c866
[mesa.git] / src / gallium / drivers / swr / rasterizer / core / pa.h
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file pa.h
24 *
25 * @brief Definitions for primitive assembly.
26 * N primitives are assembled at a time, where N is the SIMD width.
27 * A state machine, that is specific for a given topology, drives the
28 * assembly of vertices into triangles.
29 *
30 ******************************************************************************/
31 #pragma once
32
33 #include "frontend.h"
34
// Abstract interface shared by the primitive assemblers in this file
// (the optimized PA_STATE_OPT and the cut-aware PA_STATE_CUT below).
// The SIMD typedefs select 16-wide or 8-wide types at compile time.
35 struct PA_STATE
36 {
37 #if USE_SIMD16_FRONTEND
38 enum
39 {
40 SIMD_WIDTH = KNOB_SIMD16_WIDTH,
41 SIMD_WIDTH_DIV2 = KNOB_SIMD16_WIDTH / 2,
42 SIMD_WIDTH_LOG2 = 4
43 };
44
45 typedef simd16mask SIMDMASK;
46
47 typedef simd16scalar SIMDSCALAR;
48 typedef simd16vector SIMDVECTOR;
49 typedef simd16vertex SIMDVERTEX;
50
51 typedef simd16scalari SIMDSCALARI;
52
53 #else
54 enum
55 {
56 SIMD_WIDTH = KNOB_SIMD_WIDTH,
57 SIMD_WIDTH_DIV2 = KNOB_SIMD_WIDTH / 2,
58 SIMD_WIDTH_LOG2 = 3
59 };
60
61 typedef simdmask SIMDMASK;
62
63 typedef simdscalar SIMDSCALAR;
64 typedef simdvector SIMDVECTOR;
65 typedef simdvertex SIMDVERTEX;
66
67 typedef simdscalari SIMDSCALARI;
68
69 #endif
70 DRAW_CONTEXT *pDC{ nullptr }; // draw context
71 uint8_t* pStreamBase{ nullptr }; // vertex stream
72 uint32_t streamSizeInVerts{ 0 }; // total size of the input stream in verts
73 uint32_t vertexStride{ 0 }; // stride of a vertex in simdvector units
74
75 // The topology the binner will use. In some cases the FE changes the topology from the api state.
76 PRIMITIVE_TOPOLOGY binTopology{ TOP_UNKNOWN };
77
78 #if ENABLE_AVX512_SIMD16
// When set, SIMD16-wide data is consumed as two SIMD8 halves; the
// alternate (upper) half is selected. See PA_STATE_CUT::Assemble/AssembleSingle.
79 bool useAlternateOffset{ false };
80
81 #endif
82 PA_STATE() {}
83 PA_STATE(DRAW_CONTEXT *in_pDC, uint8_t* in_pStreamBase, uint32_t in_streamSizeInVerts, uint32_t in_vertexStride) :
84 pDC(in_pDC), pStreamBase(in_pStreamBase), streamSizeInVerts(in_streamSizeInVerts), vertexStride(in_vertexStride) {}
85
// Pure-virtual interface implemented by each concrete assembler.
86 virtual bool HasWork() = 0;
87 virtual simdvector& GetSimdVector(uint32_t index, uint32_t slot) = 0;
88 #if ENABLE_AVX512_SIMD16
89 virtual simd16vector& GetSimdVector_simd16(uint32_t index, uint32_t slot) = 0;
90 #endif
91 virtual bool Assemble(uint32_t slot, simdvector verts[]) = 0;
92 #if ENABLE_AVX512_SIMD16
93 virtual bool Assemble_simd16(uint32_t slot, simd16vector verts[]) = 0;
94 #endif
95 virtual void AssembleSingle(uint32_t slot, uint32_t primIndex, simd4scalar verts[]) = 0;
96 virtual bool NextPrim() = 0;
97 virtual SIMDVERTEX& GetNextVsOutput() = 0;
98 virtual bool GetNextStreamOutput() = 0;
99 virtual SIMDMASK& GetNextVsIndices() = 0;
100 virtual uint32_t NumPrims() = 0;
101 virtual void Reset() = 0;
102 virtual SIMDSCALARI GetPrimID(uint32_t startID) = 0;
103 };
104
105 // The Optimized PA is a state machine that assembles triangles from vertex shader simd
106 // output. Here is the sequence
107 // 1. Execute FS/VS to generate a simd vertex (4 vertices for SSE simd and 8 for AVX simd).
108 // 2. Execute PA function to assemble and bin triangles.
109 // a. The PA function is a set of functions that collectively make up the
110 // state machine for a given topology.
111 // 1. We use a state index to track which PA function to call.
112 // b. Often the PA function needs to 2 simd vertices in order to assemble the next triangle.
113 // 1. We call this the current and previous simd vertex.
114 // 2. The SSE simd is 4-wide which is not a multiple of 3 needed for triangles. In
115 // order to assemble the second triangle, for a triangle list, we'll need the
116 // last vertex from the previous simd and the first 2 vertices from the current simd.
117 // 3. At times the PA can assemble multiple triangles from the 2 simd vertices.
118 //
119 // This optimized PA is not cut aware, so only should be used by non-indexed draws or draws without
120 // cuts
121 struct PA_STATE_OPT : public PA_STATE
122 {
123 uint32_t numPrims{ 0 }; // Total number of primitives for draw.
124 uint32_t numPrimsComplete{ 0 }; // Total number of complete primitives.
125
126 uint32_t numSimdPrims{ 0 }; // Number of prims in current simd.
127
128 uint32_t cur{ 0 }; // index to current VS output.
129 uint32_t prev{ 0 }; // index to prev VS output. Not really needed in the state.
130 const uint32_t first{ 0 }; // index to first VS output. Used for tri fan and line loop.
131
132 uint32_t counter{ 0 }; // state counter
133 bool reset{ false }; // reset state
134
135 uint32_t primIDIncr{ 0 }; // how much to increment for each vector (typically vector / {1, 2})
136 SIMDSCALARI primID;
137
// Function-pointer types for the per-topology state machine; the concrete
// functions are installed by the constructor / SetNextState.
138 typedef bool(*PFN_PA_FUNC)(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
139 #if ENABLE_AVX512_SIMD16
140 typedef bool(*PFN_PA_FUNC_SIMD16)(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
141 #endif
142 typedef void(*PFN_PA_SINGLE_FUNC)(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]);
143
144 PFN_PA_FUNC pfnPaFunc{ nullptr }; // PA state machine function for assembling 4 triangles.
145 #if ENABLE_AVX512_SIMD16
146 PFN_PA_FUNC_SIMD16 pfnPaFunc_simd16{ nullptr };
147 #endif
148 PFN_PA_SINGLE_FUNC pfnPaSingleFunc{ nullptr }; // PA state machine function for assembling single triangle.
149 PFN_PA_FUNC pfnPaFuncReset{ nullptr }; // initial state to set on reset
150 #if ENABLE_AVX512_SIMD16
151 PFN_PA_FUNC_SIMD16 pfnPaFuncReset_simd16{ nullptr };
152 #endif
153
154 // state used to advance the PA when Next is called
155 PFN_PA_FUNC pfnPaNextFunc{ nullptr };
156 #if ENABLE_AVX512_SIMD16
157 PFN_PA_FUNC_SIMD16 pfnPaNextFunc_simd16{ nullptr };
158 #endif
159 uint32_t nextNumSimdPrims{ 0 };
160 uint32_t nextNumPrimsIncrement{ 0 };
161 bool nextReset{ false };
162 bool isStreaming{ false };
163
164 SIMDMASK junkIndices { 0 }; // temporary index store for unused virtual function
165
166 PA_STATE_OPT() {}
167 PA_STATE_OPT(DRAW_CONTEXT* pDC, uint32_t numPrims, uint8_t* pStream, uint32_t streamSizeInVerts,
168 uint32_t vertexStride, bool in_isStreaming, PRIMITIVE_TOPOLOGY topo = TOP_UNKNOWN);
169
// True while completed prims are still owed for this draw.
170 bool HasWork()
171 {
172 return (this->numPrimsComplete < this->numPrims) ? true : false;
173 }
174
// Returns the attribute 'slot' of simd vertex 'index' in the vertex stream.
175 simdvector& GetSimdVector(uint32_t index, uint32_t slot)
176 {
177 SWR_ASSERT(slot < vertexStride);
178 uint32_t offset = index * vertexStride + slot;
179 simdvector& vertexSlot = ((simdvector*)pStreamBase)[offset];
180 return vertexSlot;
181 }
182
183 #if ENABLE_AVX512_SIMD16
184 simd16vector& GetSimdVector_simd16(uint32_t index, uint32_t slot)
185 {
186 SWR_ASSERT(slot < vertexStride);
187 uint32_t offset = index * vertexStride + slot;
188 simd16vector& vertexSlot = ((simd16vector*)pStreamBase)[offset];
189 return vertexSlot;
190 }
191
192 #endif
193 // Assembles 4 triangles. Each simdvector is a single vertex from 4
194 // triangles (xxxx yyyy zzzz wwww) and there are 3 verts per triangle.
195 bool Assemble(uint32_t slot, simdvector verts[])
196 {
197 return this->pfnPaFunc(*this, slot, verts);
198 }
199
200 #if ENABLE_AVX512_SIMD16
201 bool Assemble_simd16(uint32_t slot, simd16vector verts[])
202 {
203 return this->pfnPaFunc_simd16(*this, slot, verts);
204 }
205
206 #endif
207 // Assembles 1 primitive. Each simdscalar is a vertex (xyzw).
208 void AssembleSingle(uint32_t slot, uint32_t primIndex, simd4scalar verts[])
209 {
210 return this->pfnPaSingleFunc(*this, slot, primIndex, verts);
211 }
212
// Steps the state machine to the state staged by SetNextState.
// Returns true when more prims can be assembled from the current simd
// vertices (numSimdPrims pending) and the draw still has work.
213 bool NextPrim()
214 {
215 this->pfnPaFunc = this->pfnPaNextFunc;
216 #if ENABLE_AVX512_SIMD16
217 this->pfnPaFunc_simd16 = this->pfnPaNextFunc_simd16;
218 #endif
219 this->numSimdPrims = this->nextNumSimdPrims;
220 this->numPrimsComplete += this->nextNumPrimsIncrement;
221 this->reset = this->nextReset;
222
223 if (this->isStreaming)
224 {
225 this->reset = false;
226 }
227
228 bool morePrims = false;
229
230 if (this->numSimdPrims > 0)
231 {
232 morePrims = true;
233 this->numSimdPrims--;
234 }
235 else
236 {
237 this->counter = (this->reset) ? 0 : (this->counter + 1);
238 this->reset = false;
239 }
240
241 if (!HasWork())
242 {
243 morePrims = false; // no more to do
244 }
245
246 return morePrims;
247 }
248
// Returns the simd vertex slot the VS should write next, advancing
// cur/prev. Once the buffer is exhausted the last two slots are recycled.
249 SIMDVERTEX& GetNextVsOutput()
250 {
251 const uint32_t numSimdVerts = streamSizeInVerts / SIMD_WIDTH;
252
253 // increment cur and prev indices
254 if (counter < numSimdVerts)
255 {
256 // prev undefined for first state
257 prev = cur;
258 cur = counter;
259 }
260 else
261 {
262 // swap/recycle last two simd verts for prev and cur, leave other simd verts intact in the buffer
263 uint32_t temp = prev;
264
265 prev = cur;
266 cur = temp;
267 }
268
269 SWR_ASSERT(cur < numSimdVerts);
270 SIMDVECTOR* pVertex = &((SIMDVECTOR*)pStreamBase)[cur * vertexStride];
271
272 return *(SIMDVERTEX*)pVertex;
273 }
274
275 SIMDMASK& GetNextVsIndices()
276 {
277 // unused in optimized PA, pass tmp buffer back
278 return junkIndices;
279 }
280
281 bool GetNextStreamOutput()
282 {
283 this->prev = this->cur;
284 this->cur = this->counter;
285
286 return HasWork();
287 }
288
// Number of valid prims in the current simd batch; the final batch of a
// draw may be partial.
289 uint32_t NumPrims()
290 {
291 return (this->numPrimsComplete + this->nextNumPrimsIncrement > this->numPrims) ?
292 (SIMD_WIDTH - (this->numPrimsComplete + this->nextNumPrimsIncrement - this->numPrims)) : SIMD_WIDTH;
293 }
294
// Stages the state NextPrim() will switch to.
295 void SetNextState(PA_STATE_OPT::PFN_PA_FUNC pfnPaNextFunc,
296 PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc,
297 uint32_t numSimdPrims = 0,
298 uint32_t numPrimsIncrement = 0,
299 bool reset = false)
300 {
301 this->pfnPaNextFunc = pfnPaNextFunc;
302 this->nextNumSimdPrims = numSimdPrims;
303 this->nextNumPrimsIncrement = numPrimsIncrement;
304 this->nextReset = reset;
305
306 this->pfnPaSingleFunc = pfnPaNextSingleFunc;
307 }
308
309 #if ENABLE_AVX512_SIMD16
310 void SetNextState_simd16(PA_STATE_OPT::PFN_PA_FUNC_SIMD16 pfnPaNextFunc_simd16,
311 PA_STATE_OPT::PFN_PA_FUNC pfnPaNextFunc,
312 PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc,
313 uint32_t numSimdPrims = 0,
314 uint32_t numPrimsIncrement = 0,
315 bool reset = false)
316 {
317 this->pfnPaNextFunc_simd16 = pfnPaNextFunc_simd16;
318 this->pfnPaNextFunc = pfnPaNextFunc;
319 this->nextNumSimdPrims = numSimdPrims;
320 this->nextNumPrimsIncrement = numPrimsIncrement;
321 this->nextReset = reset;
322
323 this->pfnPaSingleFunc = pfnPaNextSingleFunc;
324 }
325
326 #endif
// Rewinds the state machine to its initial per-draw state.
327 void Reset()
328 {
329 #if ENABLE_AVX512_SIMD16
330 useAlternateOffset = false;
331
332 #endif
333 this->pfnPaFunc = this->pfnPaFuncReset;
334 #if ENABLE_AVX512_SIMD16
335 this->pfnPaFunc_simd16 = this->pfnPaFuncReset_simd16;
336 #endif
337 this->numPrimsComplete = 0;
338 this->numSimdPrims = 0;
339 this->cur = 0;
340 this->prev = 0;
341 this->counter = 0;
342 this->reset = false;
343 }
344
// Per-lane primitive IDs for the current batch, offset by startID.
345 SIMDSCALARI GetPrimID(uint32_t startID)
346 {
347 #if USE_SIMD16_FRONTEND
348 return _simd16_add_epi32(this->primID,
349 _simd16_set1_epi32(startID + this->primIDIncr * (this->numPrimsComplete / SIMD_WIDTH)));
350 #else
351 return _simd_add_epi32(this->primID,
352 _simd_set1_epi32(startID + this->primIDIncr * (this->numPrimsComplete / SIMD_WIDTH)));
353 #endif
354 }
355 };
356
357 // helper C wrappers to avoid having to rewrite all the PA topology state functions
358 INLINE void SetNextPaState(PA_STATE_OPT& pa, PA_STATE_OPT::PFN_PA_FUNC pfnPaNextFunc,
359 PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc,
360 uint32_t numSimdPrims = 0,
361 uint32_t numPrimsIncrement = 0,
362 bool reset = false)
363 {
364 return pa.SetNextState(pfnPaNextFunc, pfnPaNextSingleFunc, numSimdPrims, numPrimsIncrement, reset);
365 }
366
367 #if ENABLE_AVX512_SIMD16
368 INLINE void SetNextPaState_simd16(PA_STATE_OPT& pa, PA_STATE_OPT::PFN_PA_FUNC_SIMD16 pfnPaNextFunc_simd16,
369 PA_STATE_OPT::PFN_PA_FUNC pfnPaNextFunc,
370 PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc,
371 uint32_t numSimdPrims = 0,
372 uint32_t numPrimsIncrement = 0,
373 bool reset = false)
374 {
375 return pa.SetNextState_simd16(pfnPaNextFunc_simd16, pfnPaNextFunc, pfnPaNextSingleFunc, numSimdPrims, numPrimsIncrement, reset);
376 }
377
378 #endif
379 INLINE simdvector& PaGetSimdVector(PA_STATE& pa, uint32_t index, uint32_t slot)
380 {
381 return pa.GetSimdVector(index, slot);
382 }
383
384 #if ENABLE_AVX512_SIMD16
385 INLINE simd16vector& PaGetSimdVector_simd16(PA_STATE& pa, uint32_t index, uint32_t slot)
386 {
387 return pa.GetSimdVector_simd16(index, slot);
388 }
389
390 #endif
391 // Cut-aware primitive assembler.
392 struct PA_STATE_CUT : public PA_STATE
393 {
394 SIMDMASK* pCutIndices{ nullptr }; // cut indices buffer, 1 bit per vertex
395 uint32_t numVerts{ 0 }; // number of vertices available in buffer store
396 uint32_t numAttribs{ 0 }; // number of attributes
397 int32_t numRemainingVerts{ 0 }; // number of verts remaining to be assembled
398 uint32_t numVertsToAssemble{ 0 }; // total number of verts to assemble for the draw
399 #if ENABLE_AVX512_SIMD16
400 OSALIGNSIMD16(uint32_t) indices[MAX_NUM_VERTS_PER_PRIM][SIMD_WIDTH]; // current index buffer for gather
401 #else
402 OSALIGNSIMD(uint32_t) indices[MAX_NUM_VERTS_PER_PRIM][SIMD_WIDTH]; // current index buffer for gather
403 #endif
404 SIMDSCALARI vOffsets[MAX_NUM_VERTS_PER_PRIM]; // byte offsets for currently assembling simd
405 uint32_t numPrimsAssembled{ 0 }; // number of primitives that are fully assembled
406 uint32_t headVertex{ 0 }; // current unused vertex slot in vertex buffer store
407 uint32_t tailVertex{ 0 }; // beginning vertex currently assembling
408 uint32_t curVertex{ 0 }; // current unprocessed vertex
409 uint32_t startPrimId{ 0 }; // starting prim id
410 SIMDSCALARI vPrimId; // vector of prim ID
411 bool needOffsets{ false }; // need to compute gather offsets for current SIMD
412 uint32_t vertsPerPrim{ 0 };
413 bool processCutVerts{ false }; // vertex indices with cuts should be processed as normal, otherwise they
414 // are ignored. Fetch shader sends invalid verts on cuts that should be ignored
415 // while the GS sends valid verts for every index
416
417 simdvector junkVector; // junk simdvector for unimplemented API
418 #if ENABLE_AVX512_SIMD16
419 simd16vector junkVector_simd16; // junk simd16vector for unimplemented API
420 #endif
421
422 // Topology state tracking
423 uint32_t vert[MAX_NUM_VERTS_PER_PRIM];
424 uint32_t curIndex{ 0 };
425 bool reverseWinding{ false }; // indicates reverse winding for strips
426 int32_t adjExtraVert{ 0 }; // extra vert uses for tristrip w/ adj
427
428 typedef void(PA_STATE_CUT::* PFN_PA_FUNC)(uint32_t vert, bool finish);
429 PFN_PA_FUNC pfnPa{ nullptr }; // per-topology function that processes a single vert
430
431 PA_STATE_CUT() {}
// Sets up the cut-aware assembler for one draw: captures the vertex ring
// buffer, the per-vertex cut-bit stream, and installs the per-topology
// vertex-processing member function in pfnPa.
432 PA_STATE_CUT(DRAW_CONTEXT* pDC, uint8_t* in_pStream, uint32_t in_streamSizeInVerts, uint32_t in_vertexStride, SIMDMASK* in_pIndices, uint32_t in_numVerts,
433 uint32_t in_numAttribs, PRIMITIVE_TOPOLOGY topo, bool in_processCutVerts)
434 : PA_STATE(pDC, in_pStream, in_streamSizeInVerts, in_vertexStride)
435 {
436 numVerts = in_streamSizeInVerts;
437 numAttribs = in_numAttribs;
438 binTopology = topo;
439 needOffsets = false;
440 processCutVerts = in_processCutVerts;
441
442 numVertsToAssemble = numRemainingVerts = in_numVerts;
443 numPrimsAssembled = 0;
444 headVertex = tailVertex = curVertex = 0;
445
446 curIndex = 0;
447 pCutIndices = in_pIndices;
448 memset(indices, 0, sizeof(indices));
449 #if USE_SIMD16_FRONTEND
450 vPrimId = _simd16_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
451 #else
452 vPrimId = _simd_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
453 #endif
454 reverseWinding = false;
455 adjExtraVert = -1;
456
457 bool gsEnabled = pDC->pState->state.gsState.gsEnable;
458 vertsPerPrim = NumVertsPerPrim(topo, gsEnabled);
459
// Adjacency topologies pick a GS/no-GS variant: with no GS only the
// interior primitive is assembled, dropping the adjacency verts.
460 switch (topo)
461 {
462 case TOP_TRIANGLE_LIST: pfnPa = &PA_STATE_CUT::ProcessVertTriList; break;
463 case TOP_TRI_LIST_ADJ: pfnPa = gsEnabled ? &PA_STATE_CUT::ProcessVertTriListAdj : &PA_STATE_CUT::ProcessVertTriListAdjNoGs; break;
464 case TOP_TRIANGLE_STRIP: pfnPa = &PA_STATE_CUT::ProcessVertTriStrip; break;
465 case TOP_TRI_STRIP_ADJ: if (gsEnabled)
466 {
467 pfnPa = &PA_STATE_CUT::ProcessVertTriStripAdj < true > ;
468 }
469 else
470 {
471 pfnPa = &PA_STATE_CUT::ProcessVertTriStripAdj < false > ;
472 }
473 break;
474
475 case TOP_POINT_LIST: pfnPa = &PA_STATE_CUT::ProcessVertPointList; break;
476 case TOP_LINE_LIST: pfnPa = &PA_STATE_CUT::ProcessVertLineList; break;
477 case TOP_LINE_LIST_ADJ: pfnPa = gsEnabled ? &PA_STATE_CUT::ProcessVertLineListAdj : &PA_STATE_CUT::ProcessVertLineListAdjNoGs; break;
478 case TOP_LINE_STRIP: pfnPa = &PA_STATE_CUT::ProcessVertLineStrip; break;
479 case TOP_LISTSTRIP_ADJ: pfnPa = gsEnabled ? &PA_STATE_CUT::ProcessVertLineStripAdj : &PA_STATE_CUT::ProcessVertLineStripAdjNoGs; break;
480 default: assert(0 && "Unimplemented topology");
481 }
482 }
483
484 SIMDVERTEX& GetNextVsOutput()
485 {
486 uint32_t vertexIndex = this->headVertex / SIMD_WIDTH;
487 this->headVertex = (this->headVertex + SIMD_WIDTH) % this->numVerts;
488 this->needOffsets = true;
489 SIMDVECTOR* pVertex = &((SIMDVECTOR*)pStreamBase)[vertexIndex * vertexStride];
490
491 return *(SIMDVERTEX*)pVertex;
492 }
493
494 SIMDMASK& GetNextVsIndices()
495 {
496 uint32_t vertexIndex = this->headVertex / SIMD_WIDTH;
497 SIMDMASK* pCurCutIndex = this->pCutIndices + vertexIndex;
498 return *pCurCutIndex;
499 }
500
501 simdvector& GetSimdVector(uint32_t index, uint32_t slot)
502 {
503 // unused
504 SWR_ASSERT(0 && "Not implemented");
505 return junkVector;
506 }
507
508 #if ENABLE_AVX512_SIMD16
509 simd16vector& GetSimdVector_simd16(uint32_t index, uint32_t slot)
510 {
511 // unused
512 SWR_ASSERT(0 && "Not implemented");
513 return junkVector_simd16;
514 }
515
516 #endif
517 bool GetNextStreamOutput()
518 {
519 this->headVertex += SIMD_WIDTH;
520 this->needOffsets = true;
521 return HasWork();
522 }
523
// Per-lane primitive IDs for the current batch, offset by the draw's startID.
524 SIMDSCALARI GetPrimID(uint32_t startID)
525 {
526 #if USE_SIMD16_FRONTEND
527 return _simd16_add_epi32(_simd16_set1_epi32(startID), this->vPrimId);
528 #else
529 return _simd_add_epi32(_simd_set1_epi32(startID), this->vPrimId);
530 #endif
531 }
532
// Rewinds the assembler to its initial per-draw state (ring-buffer
// cursors, topology tracking, and the per-lane prim-ID vector).
533 void Reset()
534 {
535 #if ENABLE_AVX512_SIMD16
536 useAlternateOffset = false;
537
538 #endif
539 this->numRemainingVerts = this->numVertsToAssemble;
540 this->numPrimsAssembled = 0;
541 this->curIndex = 0;
542 this->curVertex = 0;
543 this->tailVertex = 0;
544 this->headVertex = 0;
545 this->reverseWinding = false;
546 this->adjExtraVert = -1;
547 #if USE_SIMD16_FRONTEND
548 this->vPrimId = _simd16_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
549 #else
550 this->vPrimId = _simd_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
551 #endif
552 }
553
554 bool HasWork()
555 {
556 return this->numRemainingVerts > 0 || this->adjExtraVert != -1;
557 }
558
559 bool IsVertexStoreFull()
560 {
561 return ((this->headVertex + SIMD_WIDTH) % this->numVerts) == this->tailVertex;
562 }
563
564 void RestartTopology()
565 {
566 this->curIndex = 0;
567 this->reverseWinding = false;
568 this->adjExtraVert = -1;
569 }
570
571 bool IsCutIndex(uint32_t vertex)
572 {
573 uint32_t vertexIndex = vertex / SIMD_WIDTH;
574 uint32_t vertexOffset = vertex & (SIMD_WIDTH - 1);
575 return _bittest((const LONG*)&this->pCutIndices[vertexIndex], vertexOffset) == 1;
576 }
577
578 // iterates across the unprocessed verts until we hit the end or we
579 // have assembled SIMD prims
580 void ProcessVerts()
581 {
582 while (this->numPrimsAssembled != SIMD_WIDTH &&
583 this->numRemainingVerts > 0 &&
584 this->curVertex != this->headVertex)
585 {
586 // if cut index, restart topology
587 if (IsCutIndex(this->curVertex))
588 {
// processCutVerts: the vert carries real data (GS path) and must
// still be fed to the topology function before the restart.
589 if (this->processCutVerts)
590 {
591 (this->*pfnPa)(this->curVertex, false);
592 }
593 // finish off tri strip w/ adj before restarting topo
594 if (this->adjExtraVert != -1)
595 {
596 (this->*pfnPa)(this->curVertex, true);
597 }
598 RestartTopology();
599 }
600 else
601 {
602 (this->*pfnPa)(this->curVertex, false);
603 }
604
// advance with wrap around the vertex ring buffer
605 this->curVertex++;
606 if (this->curVertex >= this->numVerts) {
607 this->curVertex = 0;
608 }
609 this->numRemainingVerts--;
610 }
611
612 // special case last primitive for tri strip w/ adj
613 if (this->numPrimsAssembled != SIMD_WIDTH && this->numRemainingVerts == 0 && this->adjExtraVert != -1)
614 {
615 (this->*pfnPa)(this->curVertex, true);
616 }
617 }
618
// Retires the current batch: frees ring-buffer space up to curVertex and
// bumps the per-lane prim IDs by one simd's worth.
619 void Advance()
620 {
621 // done with current batch
622 // advance tail to the current unsubmitted vertex
623 this->tailVertex = this->curVertex;
624 this->numPrimsAssembled = 0;
625 #if USE_SIMD16_FRONTEND
626 this->vPrimId = _simd16_add_epi32(vPrimId, _simd16_set1_epi32(SIMD_WIDTH));
627 #else
628 this->vPrimId = _simd_add_epi32(vPrimId, _simd_set1_epi32(SIMD_WIDTH));
629 #endif
630 }
631
632 bool NextPrim()
633 {
634 // if we've assembled enough prims, we can advance to the next set of verts
635 if (this->numPrimsAssembled == SIMD_WIDTH || this->numRemainingVerts <= 0)
636 {
637 Advance();
638 }
639 return false;
640 }
641
// Converts the gathered vertex indices of the current batch into per-lane
// byte offsets into the vertex stream: batch-base bytes plus the lane's
// float slot within the simd batch.
642 void ComputeOffsets()
643 {
644 for (uint32_t v = 0; v < this->vertsPerPrim; ++v)
645 {
646 uint32_t vertexStrideBytes = vertexStride * sizeof(SIMDVECTOR);
647 SIMDSCALARI vIndices = *(SIMDSCALARI*)&this->indices[v][0];
648
649 // step to simdvertex batch
650 const uint32_t simdShift = SIMD_WIDTH_LOG2;
651 #if USE_SIMD16_FRONTEND
652 SIMDSCALARI vVertexBatch = _simd16_srai_epi32(vIndices, simdShift);
653 this->vOffsets[v] = _simd16_mullo_epi32(vVertexBatch, _simd16_set1_epi32(vertexStrideBytes));
654 #else
655 SIMDSCALARI vVertexBatch = _simd_srai_epi32(vIndices, simdShift);
656 this->vOffsets[v] = _simd_mullo_epi32(vVertexBatch, _simd_set1_epi32(vertexStrideBytes));
657 #endif
658
659 // step to index
660 const uint32_t simdMask = SIMD_WIDTH - 1;
661 #if USE_SIMD16_FRONTEND
662 SIMDSCALARI vVertexIndex = _simd16_and_si(vIndices, _simd16_set1_epi32(simdMask));
663 this->vOffsets[v] = _simd16_add_epi32(this->vOffsets[v], _simd16_mullo_epi32(vVertexIndex, _simd16_set1_epi32(sizeof(float))));
664 #else
665 SIMDSCALARI vVertexIndex = _simd_and_si(vIndices, _simd_set1_epi32(simdMask));
666 this->vOffsets[v] = _simd_add_epi32(this->vOffsets[v], _simd_mullo_epi32(vVertexIndex, _simd_set1_epi32(sizeof(float))));
667 #endif
668 }
669 }
670
// Gathers attribute 'slot' for a full simd batch of assembled prims into
// verts[0..vertsPerPrim-1]. Returns false until a full batch (or the tail
// of the draw) is ready.
671 bool Assemble(uint32_t slot, simdvector *verts)
672 {
673 // process any outstanding verts
674 ProcessVerts();
675
676 // return false if we don't have enough prims assembled
677 if (this->numPrimsAssembled != SIMD_WIDTH && this->numRemainingVerts > 0)
678 {
679 return false;
680 }
681
682 // cache off gather offsets given the current SIMD set of indices the first time we get an assemble
683 if (this->needOffsets)
684 {
685 ComputeOffsets();
686 this->needOffsets = false;
687 }
688
689 for (uint32_t v = 0; v < this->vertsPerPrim; ++v)
690 {
691 SIMDSCALARI offsets = this->vOffsets[v];
692
693 // step to attribute
694 #if USE_SIMD16_FRONTEND
695 offsets = _simd16_add_epi32(offsets, _simd16_set1_epi32(slot * sizeof(SIMDVECTOR)));
696 #else
697 offsets = _simd_add_epi32(offsets, _simd_set1_epi32(slot * sizeof(SIMDVECTOR)));
698 #endif
699
700 float* pBase = (float*)this->pStreamBase;
701 for (uint32_t c = 0; c < 4; ++c)
702 {
703 #if USE_SIMD16_FRONTEND
704 simd16scalar temp = _simd16_i32gather_ps(pBase, offsets, 1);
705
706 // Assigning to a temporary first to avoid an MSVC 2017 compiler bug
// useAlternateOffset selects the upper or lower simd8 half of the
// simd16 gather result.
707 simdscalar t = useAlternateOffset ? _simd16_extract_ps(temp, 1) : _simd16_extract_ps(temp, 0);
708 verts[v].v[c] = t;
709 #else
710 verts[v].v[c] = _simd_i32gather_ps(pBase, offsets, 1);
711 #endif
712
713 // move base to next component
714 pBase += SIMD_WIDTH;
715 }
716 }
717
718 return true;
719 }
720
721 #if ENABLE_AVX512_SIMD16
722 bool Assemble_simd16(uint32_t slot, simd16vector verts[])
723 {
724 // process any outstanding verts
725 ProcessVerts();
726
727 // return false if we don't have enough prims assembled
728 if (this->numPrimsAssembled != SIMD_WIDTH && this->numRemainingVerts > 0)
729 {
730 return false;
731 }
732
733 // cache off gather offsets given the current SIMD set of indices the first time we get an assemble
734 if (this->needOffsets)
735 {
736 ComputeOffsets();
737 this->needOffsets = false;
738 }
739
740 for (uint32_t v = 0; v < this->vertsPerPrim; ++v)
741 {
742 SIMDSCALARI offsets = this->vOffsets[v];
743
744 // step to attribute
745 #if USE_SIMD16_FRONTEND
746 offsets = _simd16_add_epi32(offsets, _simd16_set1_epi32(slot * sizeof(SIMDVECTOR)));
747 #else
748 offsets = _simd_add_epi32(offsets, _simd_set1_epi32(slot * sizeof(simdvector)));
749 #endif
750
751 float* pBase = (float*)this->pStreamBase;
752 for (uint32_t c = 0; c < 4; ++c)
753 {
754 #if USE_SIMD16_FRONTEND
755 verts[v].v[c] = _simd16_i32gather_ps(pBase, offsets, 1);
756 #else
757 verts[v].v[c] = _simd16_insert_ps(_simd16_setzero_ps(), _simd_i32gather_ps(pBase, offsets, 1), 0);
758 #endif
759
760 // move base to next component
761 pBase += SIMD_WIDTH;
762 }
763 }
764
765 return true;
766 }
767
768 #endif
// Extracts one assembled primitive (lane 'triIndex' of the current batch)
// for attribute 'slot' as scalar xyzw verts.
769 void AssembleSingle(uint32_t slot, uint32_t triIndex, simd4scalar tri[3])
770 {
771 // move to slot
772 for (uint32_t v = 0; v < this->vertsPerPrim; ++v)
773 {
774 uint32_t* pOffset = (uint32_t*)&this->vOffsets[v];
775 #if USE_SIMD16_FRONTEND
// useAlternateOffset selects the upper simd8 half of the simd16 batch
776 uint32_t offset = useAlternateOffset ? pOffset[triIndex + SIMD_WIDTH_DIV2] : pOffset[triIndex];
777 #else
778 uint32_t offset = pOffset[triIndex];
779 #endif
780 offset += sizeof(SIMDVECTOR) * slot;
781 float* pVert = (float*)&tri[v];
782 for (uint32_t c = 0; c < 4; ++c)
783 {
784 float* pComponent = (float*)(this->pStreamBase + offset);
785 pVert[c] = *pComponent;
// components of one attribute are SIMD_WIDTH floats apart in the stream
786 offset += SIMD_WIDTH * sizeof(float);
787 }
788 }
789 }
790
791 uint32_t NumPrims()
792 {
793 return this->numPrimsAssembled;
794 }
795
796 // Per-topology functions
797 void ProcessVertTriStrip(uint32_t index, bool finish)
798 {
799 this->vert[this->curIndex] = index;
800 this->curIndex++;
801 if (this->curIndex == 3)
802 {
803 // assembled enough verts for prim, add to gather indices
804 this->indices[0][this->numPrimsAssembled] = this->vert[0];
805 if (reverseWinding)
806 {
807 this->indices[1][this->numPrimsAssembled] = this->vert[2];
808 this->indices[2][this->numPrimsAssembled] = this->vert[1];
809 }
810 else
811 {
812 this->indices[1][this->numPrimsAssembled] = this->vert[1];
813 this->indices[2][this->numPrimsAssembled] = this->vert[2];
814 }
815
816 // increment numPrimsAssembled
817 this->numPrimsAssembled++;
818
819 // set up next prim state
820 this->vert[0] = this->vert[1];
821 this->vert[1] = this->vert[2];
822 this->curIndex = 2;
823 this->reverseWinding ^= 1;
824 }
825 }
826
// Emits one tri-strip-with-adjacency primitive from the vert[] window.
// With a GS all 6 verts are forwarded; without one only the interior
// triangle (vert 0/1/2 after compaction) is emitted and the window is
// restored for the next primitive.
827 template<bool gsEnabled>
828 void AssembleTriStripAdj()
829 {
830 if (!gsEnabled)
831 {
// compact the interior triangle into slots 0/1/2 for emission
832 this->vert[1] = this->vert[2];
833 this->vert[2] = this->vert[4];
834
835 this->indices[0][this->numPrimsAssembled] = this->vert[0];
836 this->indices[1][this->numPrimsAssembled] = this->vert[1];
837 this->indices[2][this->numPrimsAssembled] = this->vert[2];
838
// undo the compaction so the strip window stays consistent
839 this->vert[4] = this->vert[2];
840 this->vert[2] = this->vert[1];
841 }
842 else
843 {
844 this->indices[0][this->numPrimsAssembled] = this->vert[0];
845 this->indices[1][this->numPrimsAssembled] = this->vert[1];
846 this->indices[2][this->numPrimsAssembled] = this->vert[2];
847 this->indices[3][this->numPrimsAssembled] = this->vert[3];
848 this->indices[4][this->numPrimsAssembled] = this->vert[4];
849 this->indices[5][this->numPrimsAssembled] = this->vert[5];
850 }
851 this->numPrimsAssembled++;
852 }
853
854
855 template<bool gsEnabled>
856 void ProcessVertTriStripAdj(uint32_t index, bool finish)
857 {
858 // handle last primitive of tristrip
859 if (finish && this->adjExtraVert != -1)
860 {
861 this->vert[3] = this->adjExtraVert;
862 AssembleTriStripAdj<gsEnabled>();
863 this->adjExtraVert = -1;
864 return;
865 }
866
867 switch (this->curIndex)
868 {
869 case 0:
870 case 1:
871 case 2:
872 case 4:
873 this->vert[this->curIndex] = index;
874 this->curIndex++;
875 break;
876 case 3:
877 this->vert[5] = index;
878 this->curIndex++;
879 break;
880 case 5:
881 if (this->adjExtraVert == -1)
882 {
883 this->adjExtraVert = index;
884 }
885 else
886 {
887 this->vert[3] = index;
888 if (!gsEnabled)
889 {
890 AssembleTriStripAdj<gsEnabled>();
891
892 uint32_t nextTri[6];
893 if (this->reverseWinding)
894 {
895 nextTri[0] = this->vert[4];
896 nextTri[1] = this->vert[0];
897 nextTri[2] = this->vert[2];
898 nextTri[4] = this->vert[3];
899 nextTri[5] = this->adjExtraVert;
900 }
901 else
902 {
903 nextTri[0] = this->vert[2];
904 nextTri[1] = this->adjExtraVert;
905 nextTri[2] = this->vert[3];
906 nextTri[4] = this->vert[4];
907 nextTri[5] = this->vert[0];
908 }
909 for (uint32_t i = 0; i < 6; ++i)
910 {
911 this->vert[i] = nextTri[i];
912 }
913
914 this->adjExtraVert = -1;
915 this->reverseWinding ^= 1;
916 }
917 else
918 {
919 this->curIndex++;
920 }
921 }
922 break;
923 case 6:
924 SWR_ASSERT(this->adjExtraVert != -1, "Algorith failure!");
925 AssembleTriStripAdj<gsEnabled>();
926
927 uint32_t nextTri[6];
928 if (this->reverseWinding)
929 {
930 nextTri[0] = this->vert[4];
931 nextTri[1] = this->vert[0];
932 nextTri[2] = this->vert[2];
933 nextTri[4] = this->vert[3];
934 nextTri[5] = this->adjExtraVert;
935 }
936 else
937 {
938 nextTri[0] = this->vert[2];
939 nextTri[1] = this->adjExtraVert;
940 nextTri[2] = this->vert[3];
941 nextTri[4] = this->vert[4];
942 nextTri[5] = this->vert[0];
943 }
944 for (uint32_t i = 0; i < 6; ++i)
945 {
946 this->vert[i] = nextTri[i];
947 }
948 this->reverseWinding ^= 1;
949 this->adjExtraVert = index;
950 this->curIndex--;
951 break;
952 }
953 }
954
955 void ProcessVertTriList(uint32_t index, bool finish)
956 {
957 this->vert[this->curIndex] = index;
958 this->curIndex++;
959 if (this->curIndex == 3)
960 {
961 // assembled enough verts for prim, add to gather indices
962 this->indices[0][this->numPrimsAssembled] = this->vert[0];
963 this->indices[1][this->numPrimsAssembled] = this->vert[1];
964 this->indices[2][this->numPrimsAssembled] = this->vert[2];
965
966 // increment numPrimsAssembled
967 this->numPrimsAssembled++;
968
969 // set up next prim state
970 this->curIndex = 0;
971 }
972 }
973
974 void ProcessVertTriListAdj(uint32_t index, bool finish)
975 {
976 this->vert[this->curIndex] = index;
977 this->curIndex++;
978 if (this->curIndex == 6)
979 {
980 // assembled enough verts for prim, add to gather indices
981 this->indices[0][this->numPrimsAssembled] = this->vert[0];
982 this->indices[1][this->numPrimsAssembled] = this->vert[1];
983 this->indices[2][this->numPrimsAssembled] = this->vert[2];
984 this->indices[3][this->numPrimsAssembled] = this->vert[3];
985 this->indices[4][this->numPrimsAssembled] = this->vert[4];
986 this->indices[5][this->numPrimsAssembled] = this->vert[5];
987
988 // increment numPrimsAssembled
989 this->numPrimsAssembled++;
990
991 // set up next prim state
992 this->curIndex = 0;
993 }
994 }
995
996 void ProcessVertTriListAdjNoGs(uint32_t index, bool finish)
997 {
998 this->vert[this->curIndex] = index;
999 this->curIndex++;
1000 if (this->curIndex == 6)
1001 {
1002 // assembled enough verts for prim, add to gather indices
1003 this->indices[0][this->numPrimsAssembled] = this->vert[0];
1004 this->indices[1][this->numPrimsAssembled] = this->vert[2];
1005 this->indices[2][this->numPrimsAssembled] = this->vert[4];
1006
1007 // increment numPrimsAssembled
1008 this->numPrimsAssembled++;
1009
1010 // set up next prim state
1011 this->curIndex = 0;
1012 }
1013 }
1014
1015
1016 void ProcessVertLineList(uint32_t index, bool finish)
1017 {
1018 this->vert[this->curIndex] = index;
1019 this->curIndex++;
1020 if (this->curIndex == 2)
1021 {
1022 this->indices[0][this->numPrimsAssembled] = this->vert[0];
1023 this->indices[1][this->numPrimsAssembled] = this->vert[1];
1024
1025 this->numPrimsAssembled++;
1026 this->curIndex = 0;
1027 }
1028 }
1029
1030 void ProcessVertLineStrip(uint32_t index, bool finish)
1031 {
1032 this->vert[this->curIndex] = index;
1033 this->curIndex++;
1034 if (this->curIndex == 2)
1035 {
1036 // assembled enough verts for prim, add to gather indices
1037 this->indices[0][this->numPrimsAssembled] = this->vert[0];
1038 this->indices[1][this->numPrimsAssembled] = this->vert[1];
1039
1040 // increment numPrimsAssembled
1041 this->numPrimsAssembled++;
1042
1043 // set up next prim state
1044 this->vert[0] = this->vert[1];
1045 this->curIndex = 1;
1046 }
1047 }
1048
1049 void ProcessVertLineStripAdj(uint32_t index, bool finish)
1050 {
1051 this->vert[this->curIndex] = index;
1052 this->curIndex++;
1053 if (this->curIndex == 4)
1054 {
1055 // assembled enough verts for prim, add to gather indices
1056 this->indices[0][this->numPrimsAssembled] = this->vert[0];
1057 this->indices[1][this->numPrimsAssembled] = this->vert[1];
1058 this->indices[2][this->numPrimsAssembled] = this->vert[2];
1059 this->indices[3][this->numPrimsAssembled] = this->vert[3];
1060
1061 // increment numPrimsAssembled
1062 this->numPrimsAssembled++;
1063
1064 // set up next prim state
1065 this->vert[0] = this->vert[1];
1066 this->vert[1] = this->vert[2];
1067 this->vert[2] = this->vert[3];
1068 this->curIndex = 3;
1069 }
1070 }
1071
1072 void ProcessVertLineStripAdjNoGs(uint32_t index, bool finish)
1073 {
1074 this->vert[this->curIndex] = index;
1075 this->curIndex++;
1076 if (this->curIndex == 4)
1077 {
1078 // assembled enough verts for prim, add to gather indices
1079 this->indices[0][this->numPrimsAssembled] = this->vert[1];
1080 this->indices[1][this->numPrimsAssembled] = this->vert[2];
1081
1082 // increment numPrimsAssembled
1083 this->numPrimsAssembled++;
1084
1085 // set up next prim state
1086 this->vert[0] = this->vert[1];
1087 this->vert[1] = this->vert[2];
1088 this->vert[2] = this->vert[3];
1089 this->curIndex = 3;
1090 }
1091 }
1092
1093 void ProcessVertLineListAdj(uint32_t index, bool finish)
1094 {
1095 this->vert[this->curIndex] = index;
1096 this->curIndex++;
1097 if (this->curIndex == 4)
1098 {
1099 this->indices[0][this->numPrimsAssembled] = this->vert[0];
1100 this->indices[1][this->numPrimsAssembled] = this->vert[1];
1101 this->indices[2][this->numPrimsAssembled] = this->vert[2];
1102 this->indices[3][this->numPrimsAssembled] = this->vert[3];
1103
1104 this->numPrimsAssembled++;
1105 this->curIndex = 0;
1106 }
1107 }
1108
1109 void ProcessVertLineListAdjNoGs(uint32_t index, bool finish)
1110 {
1111 this->vert[this->curIndex] = index;
1112 this->curIndex++;
1113 if (this->curIndex == 4)
1114 {
1115 this->indices[0][this->numPrimsAssembled] = this->vert[1];
1116 this->indices[1][this->numPrimsAssembled] = this->vert[2];
1117
1118 this->numPrimsAssembled++;
1119 this->curIndex = 0;
1120 }
1121 }
1122
1123 void ProcessVertPointList(uint32_t index, bool finish)
1124 {
1125 this->vert[this->curIndex] = index;
1126 this->curIndex++;
1127 if (this->curIndex == 1)
1128 {
1129 this->indices[0][this->numPrimsAssembled] = this->vert[0];
1130 this->numPrimsAssembled++;
1131 this->curIndex = 0;
1132 }
1133 }
1134 };
1135
1136 // Primitive Assembly for data output from the DomainShader.
struct PA_TESS : PA_STATE
{
    /// @brief Primitive assembler fed by pre-computed DS (domain shader) output.
    /// @param in_pVertData                 SIMD-swizzled vertex attribute data produced upstream.
    /// @param in_attributeStrideInVectors  stride between attributes, in SIMD vectors.
    /// @param in_vertexStride              per-vertex stride forwarded to the PA_STATE base.
    /// @param in_numAttributes             number of attributes per vertex.
    /// @param in_ppIndices                 three per-corner index streams (only the first
    ///                                     m_numVertsPerPrim streams are meaningful).
    /// @param in_numPrims                  total number of prims to assemble.
    /// @param in_binTopology               point/line/triangle list topology of the output.
    PA_TESS(
        DRAW_CONTEXT *in_pDC,
        const SIMDSCALAR* in_pVertData,
        uint32_t in_attributeStrideInVectors,
        uint32_t in_vertexStride,
        uint32_t in_numAttributes,
        uint32_t* (&in_ppIndices)[3],
        uint32_t in_numPrims,
        PRIMITIVE_TOPOLOGY in_binTopology) :

        PA_STATE(in_pDC, nullptr, 0, in_vertexStride),
        m_pVertexData(in_pVertData),
        m_attributeStrideInVectors(in_attributeStrideInVectors),
        m_numAttributes(in_numAttributes),
        m_numPrims(in_numPrims)
    {
#if USE_SIMD16_FRONTEND
        m_vPrimId = _simd16_setzero_si();
#else
        m_vPrimId = _simd_setzero_si();
#endif
        binTopology = in_binTopology;
        m_ppIndices[0] = in_ppIndices[0];
        m_ppIndices[1] = in_ppIndices[1];
        m_ppIndices[2] = in_ppIndices[2];

        // Tessellated output can only be point, line, or triangle lists.
        switch (binTopology)
        {
        case TOP_POINT_LIST:
            m_numVertsPerPrim = 1;
            break;

        case TOP_LINE_LIST:
            m_numVertsPerPrim = 2;
            break;

        case TOP_TRIANGLE_LIST:
            m_numVertsPerPrim = 3;
            break;

        default:
            SWR_INVALID("Invalid binTopology (%d) for %s", binTopology, __FUNCTION__);
            break;
        }
    }

    /// @return true while prims remain to be assembled.
    bool HasWork()
    {
        return m_numPrims != 0;
    }

    /// Not supported for tessellation PA; asserts and returns junk storage.
    simdvector& GetSimdVector(uint32_t index, uint32_t slot)
    {
        SWR_INVALID("%s NOT IMPLEMENTED", __FUNCTION__);
        return junkVector;
    }

#if ENABLE_AVX512_SIMD16
    /// Not supported for tessellation PA; asserts and returns junk storage.
    simd16vector& GetSimdVector_simd16(uint32_t index, uint32_t slot)
    {
        SWR_INVALID("%s NOT IMPLEMENTED", __FUNCTION__);
        return junkVector_simd16;
    }

#endif
    /// @brief Builds a per-lane mask with the low numPrims lanes enabled.
    /// Loading from &maskGen[SIMD_WIDTH - numPrims] yields numPrims leading
    /// -1 (all-bits) lanes followed by zero lanes.
    static SIMDSCALARI GenPrimMask(uint32_t numPrims)
    {
        SWR_ASSERT(numPrims <= SIMD_WIDTH);
#if USE_SIMD16_FRONTEND
        static const OSALIGNLINE(int32_t) maskGen[SIMD_WIDTH * 2] =
        {
            -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
             0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
        };

        return _simd16_loadu_si((const SIMDSCALARI*)&maskGen[SIMD_WIDTH - numPrims]);
#else
        static const OSALIGNLINE(int32_t) maskGen[SIMD_WIDTH * 2] =
        {
            -1, -1, -1, -1, -1, -1, -1, -1,
             0,  0,  0,  0,  0,  0,  0,  0
        };

        return _simd_loadu_si((const SIMDSCALARI*)&maskGen[SIMD_WIDTH - numPrims]);
#endif
    }

    /// @brief Gathers one attribute slot for all verts of up to SIMD_WIDTH prims.
    /// For each prim corner, the per-prim vertex indices are loaded and used as
    /// gather offsets into the swizzled vertex data; inactive lanes are masked off.
    /// @return false when no prims remain.
    bool Assemble(uint32_t slot, simdvector verts[])
    {
        SWR_ASSERT(slot < m_numAttributes);

        uint32_t numPrimsToAssemble = PA_TESS::NumPrims();
        if (0 == numPrimsToAssemble)
        {
            return false;
        }

        SIMDSCALARI mask = GenPrimMask(numPrimsToAssemble);

        const float* pBaseAttrib = (const float*)&m_pVertexData[slot * m_attributeStrideInVectors * 4];
        for (uint32_t i = 0; i < m_numVertsPerPrim; ++i)
        {
#if USE_SIMD16_FRONTEND
            SIMDSCALARI indices = _simd16_load_si((const SIMDSCALARI*)m_ppIndices[i]);
#else
            SIMDSCALARI indices = _simd_load_si((const SIMDSCALARI*)m_ppIndices[i]);
#endif

            const float* pBase = pBaseAttrib;
            // gather each of the 4 components separately (SoA layout)
            for (uint32_t c = 0; c < 4; ++c)
            {
#if USE_SIMD16_FRONTEND
                simd16scalar temp = _simd16_mask_i32gather_ps(
                    _simd16_setzero_ps(),
                    pBase,
                    indices,
                    _simd16_castsi_ps(mask),
                    4 /* gcc doesn't like sizeof(float) */);

                // caller consumes one simd-wide half of the simd16 gather
                verts[i].v[c] = useAlternateOffset ? _simd16_extract_ps(temp, 1) : _simd16_extract_ps(temp, 0);
#else
                verts[i].v[c] = _simd_mask_i32gather_ps(
                    _simd_setzero_ps(),
                    pBase,
                    indices,
                    _simd_castsi_ps(mask),
                    4); // gcc doesn't like sizeof(float)
#endif
                pBase += m_attributeStrideInVectors * SIMD_WIDTH;
            }
        }

        return true;
    }

#if ENABLE_AVX512_SIMD16
    /// @brief simd16 variant of Assemble; same gather scheme, full simd16 output.
    /// In the non-simd16-frontend build the simd-wide gather result is placed
    /// into the low half of the simd16 output.
    bool Assemble_simd16(uint32_t slot, simd16vector verts[])
    {
        SWR_ASSERT(slot < m_numAttributes);

        uint32_t numPrimsToAssemble = PA_TESS::NumPrims();
        if (0 == numPrimsToAssemble)
        {
            return false;
        }

        SIMDSCALARI mask = GenPrimMask(numPrimsToAssemble);

        const float* pBaseAttrib = (const float*)&m_pVertexData[slot * m_attributeStrideInVectors * 4];
        for (uint32_t i = 0; i < m_numVertsPerPrim; ++i)
        {
#if USE_SIMD16_FRONTEND
            SIMDSCALARI indices = _simd16_load_si((const SIMDSCALARI*)m_ppIndices[i]);
#else
            SIMDSCALARI indices = _simd_load_si((const SIMDSCALARI*)m_ppIndices[i]);
#endif

            const float* pBase = pBaseAttrib;
            // gather each of the 4 components separately (SoA layout)
            for (uint32_t c = 0; c < 4; ++c)
            {
#if USE_SIMD16_FRONTEND
                verts[i].v[c] = _simd16_mask_i32gather_ps(
                    _simd16_setzero_ps(),
                    pBase,
                    indices,
                    _simd16_castsi_ps(mask),
                    4 /* gcc doesn't like sizeof(float) */);
#else
                simdscalar temp = _simd_mask_i32gather_ps(
                    _simd_setzero_ps(),
                    pBase,
                    indices,
                    _simd_castsi_ps(mask),
                    4 /* gcc doesn't like sizeof(float) */);
                verts[i].v[c] = _simd16_insert_ps(_simd16_setzero_ps(), temp, 0);
#endif
                pBase += m_attributeStrideInVectors * SIMD_WIDTH;
            }
        }

        return true;
    }

#endif
    /// @brief Scalar assembly of a single prim's verts for one attribute slot.
    /// Reads one float per component per vertex straight from the swizzled data.
    void AssembleSingle(uint32_t slot, uint32_t primIndex, simd4scalar verts[])
    {
        SWR_ASSERT(slot < m_numAttributes);
        SWR_ASSERT(primIndex < PA_TESS::NumPrims());

        const float* pVertDataBase = (const float*)&m_pVertexData[slot * m_attributeStrideInVectors * 4];
        for (uint32_t i = 0; i < m_numVertsPerPrim; ++i)
        {
#if USE_SIMD16_FRONTEND
            // second simd-wide half of the simd16 index block when requested
            uint32_t index = useAlternateOffset ? m_ppIndices[i][primIndex + SIMD_WIDTH_DIV2] : m_ppIndices[i][primIndex];
#else
            uint32_t index = m_ppIndices[i][primIndex];
#endif
            const float* pVertData = pVertDataBase;
            float* pVert = (float*)&verts[i];

            for (uint32_t c = 0; c < 4; ++c)
            {
                pVert[c] = pVertData[index];
                pVertData += m_attributeStrideInVectors * SIMD_WIDTH;
            }
        }
    }

    /// @brief Consumes the current SIMD batch of prims and advances the index streams.
    /// @return true while more prims remain.
    bool NextPrim()
    {
        uint32_t numPrims = PA_TESS::NumPrims();
        m_numPrims -= numPrims;
        m_ppIndices[0] += numPrims;
        m_ppIndices[1] += numPrims;
        m_ppIndices[2] += numPrims;

        return HasWork();
    }

    /// Not supported for tessellation PA; asserts and returns junk storage.
    SIMDVERTEX& GetNextVsOutput()
    {
        SWR_NOT_IMPL;
        return junkVertex;
    }

    /// Not supported for tessellation PA.
    bool GetNextStreamOutput()
    {
        SWR_NOT_IMPL;
        return false;
    }

    /// Not supported for tessellation PA; asserts and returns junk storage.
    SIMDMASK& GetNextVsIndices()
    {
        SWR_NOT_IMPL;
        return junkIndices;
    }

    /// @return prim count of the current batch, capped at SIMD_WIDTH.
    uint32_t NumPrims()
    {
        return std::min<uint32_t>(m_numPrims, SIMD_WIDTH);
    }

    /// Not supported for tessellation PA.
    void Reset()
    {
        SWR_NOT_IMPL;
    }

    /// @brief Per-lane primitive IDs: startID + m_vPrimId (zeroed in the ctor).
    SIMDSCALARI GetPrimID(uint32_t startID)
    {
#if USE_SIMD16_FRONTEND
        return _simd16_add_epi32(_simd16_set1_epi32(startID), m_vPrimId);
#else
        return _simd_add_epi32(_simd_set1_epi32(startID), m_vPrimId);
#endif
    }

private:
    const SIMDSCALAR* m_pVertexData = nullptr;          // swizzled DS output attribute data
    uint32_t m_attributeStrideInVectors = 0;            // stride between attributes, in SIMD vectors
    uint32_t m_numAttributes = 0;                       // attributes per vertex
    uint32_t m_numPrims = 0;                            // prims left to assemble
    uint32_t* m_ppIndices[3];                           // per-corner index streams, advanced by NextPrim()

    uint32_t m_numVertsPerPrim = 0;                     // 1/2/3 depending on binTopology

    SIMDSCALARI m_vPrimId;                              // per-lane prim-id offsets (currently zero)

    simdvector junkVector;          // junk simdvector for unimplemented API
#if ENABLE_AVX512_SIMD16
    simd16vector junkVector_simd16; // junk simd16vector for unimplemented API
#endif
    SIMDVERTEX junkVertex;          // junk SIMDVERTEX for unimplemented API
    SIMDMASK junkIndices;           // temporary index store for unused virtual function
};
1413
1414 // Primitive Assembler factory class, responsible for creating and initializing the correct assembler
1415 // based on state.
template <typename IsIndexedT, typename IsCutIndexEnabledT>
struct PA_FACTORY
{
    /// @brief Chooses and constructs (via placement new) the appropriate
    /// assembler: the cut-aware PA_STATE_CUT for cut-index indexed draws and
    /// for adjacency topologies, otherwise the optimized PA_STATE_OPT.
    /// @param pVertexStore     backing storage for assembled SIMD vertices.
    /// @param vertexStoreSize  store size in units that are scaled by SIMD_WIDTH below.
    PA_FACTORY(DRAW_CONTEXT* pDC, PRIMITIVE_TOPOLOGY in_topo, uint32_t numVerts, PA_STATE::SIMDVERTEX *pVertexStore, uint32_t vertexStoreSize, uint32_t vertexStride) : topo(in_topo)
    {
#if KNOB_ENABLE_CUT_AWARE_PA == TRUE
        const API_STATE& state = GetApiState(pDC);
        if ((IsIndexedT::value && IsCutIndexEnabledT::value && (
            topo == TOP_TRIANGLE_STRIP || topo == TOP_POINT_LIST ||
            topo == TOP_LINE_LIST || topo == TOP_LINE_STRIP ||
            topo == TOP_TRIANGLE_LIST)) ||

            // non-indexed draws with adjacency topologies must use cut-aware PA until we add support
            // for them in the optimized PA
            (topo == TOP_LINE_LIST_ADJ || topo == TOP_LISTSTRIP_ADJ || topo == TOP_TRI_LIST_ADJ || topo == TOP_TRI_STRIP_ADJ))
        {
            memset(&indexStore, 0, sizeof(indexStore));
            uint32_t numAttribs = state.feNumAttributes;

            // construct the cut-aware assembler in place
            new (&this->paCut) PA_STATE_CUT(pDC, reinterpret_cast<uint8_t *>(pVertexStore), vertexStoreSize * PA_STATE::SIMD_WIDTH,
                vertexStride, &this->indexStore[0], numVerts, numAttribs, state.topology, false);
            cutPA = true;
        }
        else
#endif
        {
            // construct the optimized assembler in place
            uint32_t numPrims = GetNumPrims(in_topo, numVerts);
            new (&this->paOpt) PA_STATE_OPT(pDC, numPrims, reinterpret_cast<uint8_t *>(pVertexStore), vertexStoreSize * PA_STATE::SIMD_WIDTH, vertexStride, false);
            cutPA = false;
        }

    }

    /// @return the assembler selected by the constructor, as the PA_STATE base.
    PA_STATE& GetPA()
    {
#if KNOB_ENABLE_CUT_AWARE_PA == TRUE
        if (cutPA)
        {
            return this->paCut;
        }
        else
#endif
        {
            return this->paOpt;
        }
    }

    // NOTE(review): only one of paOpt/paCut is placement-new constructed, per
    // the cutPA flag; neither is destroyed here — presumably both are trivially
    // destructible or torn down elsewhere. Verify against PA_STATE_OPT/_CUT.
    PA_STATE_OPT paOpt;
    PA_STATE_CUT paCut;

    bool cutPA{ false };                                // true when paCut is the active assembler

    PRIMITIVE_TOPOLOGY topo{ TOP_UNKNOWN };             // topology this factory was built for

    PA_STATE::SIMDMASK indexStore[MAX_NUM_VERTS_PER_PRIM]; // cut-index storage handed to PA_STATE_CUT
};