/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file pa.h
*
* @brief Definitions for primitive assembly.
*        N primitives are assembled at a time, where N is the SIMD width.
*        A state machine, that is specific for a given topology, drives the
*        assembly of vertices into triangles.
*
******************************************************************************/
#pragma once

#include "frontend.h"

struct PA_STATE
{
#if USE_SIMD16_FRONTEND
    enum
    {
        SIMD_WIDTH      = KNOB_SIMD16_WIDTH,
        SIMD_WIDTH_DIV2 = KNOB_SIMD16_WIDTH / 2,
        SIMD_WIDTH_LOG2 = 4
    };

    typedef simd16mask   SIMDMASK;

    typedef simd16scalar SIMDSCALAR;
    typedef simd16vector SIMDVECTOR;
    typedef simd16vertex SIMDVERTEX;

    typedef simd16scalari SIMDSCALARI;

#else
    enum
    {
        SIMD_WIDTH      = KNOB_SIMD_WIDTH,
        SIMD_WIDTH_DIV2 = KNOB_SIMD_WIDTH / 2,
        SIMD_WIDTH_LOG2 = 3
    };

    typedef simdmask   SIMDMASK;

    typedef simdscalar SIMDSCALAR;
    typedef simdvector SIMDVECTOR;
    typedef simdvertex SIMDVERTEX;

    typedef simdscalari SIMDSCALARI;

#endif
    DRAW_CONTEXT *pDC{ nullptr };       // draw context
    uint8_t* pStreamBase{ nullptr };    // vertex stream
    uint32_t streamSizeInVerts{ 0 };    // total size of the input stream in verts
    uint32_t vertexStride{ 0 };         // stride of a vertex in simdvector units

    // The topology the binner will use. In some cases the FE changes the topology from the api state.
    PRIMITIVE_TOPOLOGY binTopology{ TOP_UNKNOWN };

#if ENABLE_AVX512_SIMD16
    bool useAlternateOffset{ false };

#endif
    PA_STATE() {}
    PA_STATE(DRAW_CONTEXT *in_pDC, uint8_t* in_pStreamBase, uint32_t in_streamSizeInVerts, uint32_t in_vertexStride) :
        pDC(in_pDC), pStreamBase(in_pStreamBase), streamSizeInVerts(in_streamSizeInVerts), vertexStride(in_vertexStride) {}

    virtual bool HasWork() = 0;
    virtual simdvector& GetSimdVector(uint32_t index, uint32_t slot) = 0;
#if ENABLE_AVX512_SIMD16
    virtual simd16vector& GetSimdVector_simd16(uint32_t index, uint32_t slot) = 0;
#endif
    virtual bool Assemble(uint32_t slot, simdvector verts[]) = 0;
#if ENABLE_AVX512_SIMD16
    virtual bool Assemble_simd16(uint32_t slot, simd16vector verts[]) = 0;
#endif
    virtual void AssembleSingle(uint32_t slot, uint32_t primIndex, simd4scalar verts[]) = 0;
    virtual bool NextPrim() = 0;
    virtual SIMDVERTEX& GetNextVsOutput() = 0;
    virtual bool GetNextStreamOutput() = 0;
    virtual SIMDMASK& GetNextVsIndices() = 0;
    virtual uint32_t NumPrims() = 0;
    virtual void Reset() = 0;
    virtual SIMDSCALARI GetPrimID(uint32_t startID) = 0;
};

// The Optimized PA is a state machine that assembles triangles from vertex shader simd
// output. Here is the sequence
//    1. Execute FS/VS to generate a simd vertex (4 vertices for SSE simd and 8 for AVX simd).
//    2. Execute PA function to assemble and bin triangles.
//        a. The PA function is a set of functions that collectively make up the
//           state machine for a given topology.
//               1. We use a state index to track which PA function to call.
//        b. Often the PA function needs 2 simd vertices in order to assemble the next triangle.
//               1. We call these the current and previous simd vertices.
//               2. The SSE simd is 4-wide, which is not a multiple of the 3 verts needed for
//                  triangles. In order to assemble the second triangle of a triangle list, we
//                  need the last vertex from the previous simd and the first 2 vertices from
//                  the current simd.
//        3. At times the PA can assemble multiple triangles from the 2 simd vertices.
//
// This optimized PA is not cut aware, so it should only be used by non-indexed draws or draws
// without cuts
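//
// For example (illustrative, assuming the 8-wide AVX path and a TRIANGLE_LIST draw):
// lanes 0-7 of the first simd vertex hold verts 0-7, so the PA can emit triangles
// (0,1,2) and (3,4,5) from the current simd alone, but triangle (6,7,8) also needs
// vert 8 from the next simd vertex -- which is why the state machine keeps both a
// previous and a current simd vertex and advances through per-topology PA functions
// between Assemble() calls.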
struct PA_STATE_OPT : public PA_STATE
{
    uint32_t numPrims{ 0 };             // Total number of primitives for draw.
    uint32_t numPrimsComplete{ 0 };     // Total number of complete primitives.

    uint32_t numSimdPrims{ 0 };         // Number of prims in current simd.

    uint32_t cur{ 0 };                  // index to current VS output.
    uint32_t prev{ 0 };                 // index to prev VS output. Not really needed in the state.
    const uint32_t first{ 0 };          // index to first VS output. Used for tri fan and line loop.

    uint32_t counter{ 0 };              // state counter
    bool reset{ false };                // reset state

    uint32_t primIDIncr{ 0 };           // how much to increment for each vector (typically vector / {1, 2})
    SIMDSCALARI primID;

    typedef bool(*PFN_PA_FUNC)(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
#if ENABLE_AVX512_SIMD16
    typedef bool(*PFN_PA_FUNC_SIMD16)(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
#endif
    typedef void(*PFN_PA_SINGLE_FUNC)(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]);

    PFN_PA_FUNC pfnPaFunc{ nullptr };               // PA state machine function for assembling 4 triangles.
#if ENABLE_AVX512_SIMD16
    PFN_PA_FUNC_SIMD16 pfnPaFunc_simd16{ nullptr };
#endif
    PFN_PA_SINGLE_FUNC pfnPaSingleFunc{ nullptr };  // PA state machine function for assembling single triangle.
    PFN_PA_FUNC pfnPaFuncReset{ nullptr };          // initial state to set on reset
#if ENABLE_AVX512_SIMD16
    PFN_PA_FUNC_SIMD16 pfnPaFuncReset_simd16{ nullptr };
#endif

    // state used to advance the PA when Next is called
    PFN_PA_FUNC pfnPaNextFunc{ nullptr };
#if ENABLE_AVX512_SIMD16
    PFN_PA_FUNC_SIMD16 pfnPaNextFunc_simd16{ nullptr };
#endif
    uint32_t nextNumSimdPrims{ 0 };
    uint32_t nextNumPrimsIncrement{ 0 };
    bool nextReset{ false };
    bool isStreaming{ false };

    SIMDMASK junkIndices{ 0 };          // temporary index store for unused virtual function

    PA_STATE_OPT() {}
    PA_STATE_OPT(DRAW_CONTEXT* pDC, uint32_t numPrims, uint8_t* pStream, uint32_t streamSizeInVerts,
        uint32_t vertexStride, bool in_isStreaming, PRIMITIVE_TOPOLOGY topo = TOP_UNKNOWN);

    bool HasWork()
    {
        return this->numPrimsComplete < this->numPrims;
    }

    simdvector& GetSimdVector(uint32_t index, uint32_t slot)
    {
        SWR_ASSERT(slot < vertexStride);
        uint32_t offset = index * vertexStride + slot;
        simdvector& vertexSlot = ((simdvector*)pStreamBase)[offset];
        return vertexSlot;
    }

#if ENABLE_AVX512_SIMD16
    simd16vector& GetSimdVector_simd16(uint32_t index, uint32_t slot)
    {
        SWR_ASSERT(slot < vertexStride);
        uint32_t offset = index * vertexStride + slot;
        simd16vector& vertexSlot = ((simd16vector*)pStreamBase)[offset];
        return vertexSlot;
    }

#endif
    // Assembles 4 triangles. Each simdvector is a single vertex from 4
    // triangles (xxxx yyyy zzzz wwww) and there are 3 verts per triangle.
    bool Assemble(uint32_t slot, simdvector verts[])
    {
        return this->pfnPaFunc(*this, slot, verts);
    }

#if ENABLE_AVX512_SIMD16
    bool Assemble_simd16(uint32_t slot, simd16vector verts[])
    {
        return this->pfnPaFunc_simd16(*this, slot, verts);
    }

#endif
    // Assembles 1 primitive. Each simdscalar is a vertex (xyzw).
    void AssembleSingle(uint32_t slot, uint32_t primIndex, simd4scalar verts[])
    {
        return this->pfnPaSingleFunc(*this, slot, primIndex, verts);
    }

    bool NextPrim()
    {
        this->pfnPaFunc = this->pfnPaNextFunc;
#if ENABLE_AVX512_SIMD16
        this->pfnPaFunc_simd16 = this->pfnPaNextFunc_simd16;
#endif
        this->numSimdPrims = this->nextNumSimdPrims;
        this->numPrimsComplete += this->nextNumPrimsIncrement;
        this->reset = this->nextReset;

        if (this->isStreaming)
        {
            this->reset = false;
        }

        bool morePrims = false;

        if (this->numSimdPrims > 0)
        {
            morePrims = true;
            this->numSimdPrims--;
        }
        else
        {
            this->counter = (this->reset) ? 0 : (this->counter + 1);
            this->reset = false;
        }

        if (!HasWork())
        {
            morePrims = false;  // no more to do
        }

        return morePrims;
    }

    SIMDVERTEX& GetNextVsOutput()
    {
        const uint32_t numSimdVerts = streamSizeInVerts / SIMD_WIDTH;
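        // Example (illustrative): with an 8-wide SIMD and streamSizeInVerts == 16,
        // numSimdVerts == 2; the first two calls hand out slots 0 and 1, after which
        // the two slots are recycled between cur and prev below.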

        // increment cur and prev indices
        if (counter < numSimdVerts)
        {
            // prev undefined for first state
            prev = cur;
            cur = counter;
        }
        else
        {
            // swap/recycle last two simd verts for prev and cur, leave other simd verts intact in the buffer
            uint32_t temp = prev;

            prev = cur;
            cur = temp;
        }

        SWR_ASSERT(cur < numSimdVerts);
        SIMDVECTOR* pVertex = &((SIMDVECTOR*)pStreamBase)[cur * vertexStride];

        return *(SIMDVERTEX*)pVertex;
    }

    SIMDMASK& GetNextVsIndices()
    {
        // unused in optimized PA, pass tmp buffer back
        return junkIndices;
    }

    bool GetNextStreamOutput()
    {
        this->prev = this->cur;
        this->cur = this->counter;

        return HasWork();
    }

    uint32_t NumPrims()
    {
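        // Worked example (illustrative): with SIMD_WIDTH == 8 and numPrims == 13, the
        // final batch sees numPrimsComplete + nextNumPrimsIncrement == 16 > 13, so it
        // returns 8 - (16 - 13) == 5 valid prims rather than a full SIMD's worth.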
        return (this->numPrimsComplete + this->nextNumPrimsIncrement > this->numPrims) ?
            (SIMD_WIDTH - (this->numPrimsComplete + this->nextNumPrimsIncrement - this->numPrims)) : SIMD_WIDTH;
    }

    void SetNextState(PA_STATE_OPT::PFN_PA_FUNC pfnPaNextFunc,
        PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc,
        uint32_t numSimdPrims = 0,
        uint32_t numPrimsIncrement = 0,
        bool reset = false)
    {
        this->pfnPaNextFunc = pfnPaNextFunc;
        this->nextNumSimdPrims = numSimdPrims;
        this->nextNumPrimsIncrement = numPrimsIncrement;
        this->nextReset = reset;

        this->pfnPaSingleFunc = pfnPaNextSingleFunc;
    }

#if ENABLE_AVX512_SIMD16
    void SetNextState_simd16(PA_STATE_OPT::PFN_PA_FUNC_SIMD16 pfnPaNextFunc_simd16,
        PA_STATE_OPT::PFN_PA_FUNC pfnPaNextFunc,
        PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc,
        uint32_t numSimdPrims = 0,
        uint32_t numPrimsIncrement = 0,
        bool reset = false)
    {
        this->pfnPaNextFunc_simd16 = pfnPaNextFunc_simd16;
        this->pfnPaNextFunc = pfnPaNextFunc;
        this->nextNumSimdPrims = numSimdPrims;
        this->nextNumPrimsIncrement = numPrimsIncrement;
        this->nextReset = reset;

        this->pfnPaSingleFunc = pfnPaNextSingleFunc;
    }

#endif
    void Reset()
    {
#if ENABLE_AVX512_SIMD16
        useAlternateOffset = false;

#endif
        this->pfnPaFunc = this->pfnPaFuncReset;
#if ENABLE_AVX512_SIMD16
        this->pfnPaFunc_simd16 = this->pfnPaFuncReset_simd16;
#endif
        this->numPrimsComplete = 0;
        this->numSimdPrims = 0;
        this->cur = 0;
        this->prev = 0;
        this->counter = 0;
        this->reset = false;
    }

    SIMDSCALARI GetPrimID(uint32_t startID)
    {
#if USE_SIMD16_FRONTEND
        return _simd16_add_epi32(this->primID,
            _simd16_set1_epi32(startID + this->primIDIncr * (this->numPrimsComplete / SIMD_WIDTH)));
#else
        return _simd_add_epi32(this->primID,
            _simd_set1_epi32(startID + this->primIDIncr * (this->numPrimsComplete / SIMD_WIDTH)));
#endif
    }
};

// helper C wrappers to avoid having to rewrite all the PA topology state functions
INLINE void SetNextPaState(PA_STATE_OPT& pa, PA_STATE_OPT::PFN_PA_FUNC pfnPaNextFunc,
    PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc,
    uint32_t numSimdPrims = 0,
    uint32_t numPrimsIncrement = 0,
    bool reset = false)
{
    return pa.SetNextState(pfnPaNextFunc, pfnPaNextSingleFunc, numSimdPrims, numPrimsIncrement, reset);
}

#if ENABLE_AVX512_SIMD16
INLINE void SetNextPaState_simd16(PA_STATE_OPT& pa, PA_STATE_OPT::PFN_PA_FUNC_SIMD16 pfnPaNextFunc_simd16,
    PA_STATE_OPT::PFN_PA_FUNC pfnPaNextFunc,
    PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc,
    uint32_t numSimdPrims = 0,
    uint32_t numPrimsIncrement = 0,
    bool reset = false)
{
    return pa.SetNextState_simd16(pfnPaNextFunc_simd16, pfnPaNextFunc, pfnPaNextSingleFunc, numSimdPrims, numPrimsIncrement, reset);
}

#endif
INLINE simdvector& PaGetSimdVector(PA_STATE& pa, uint32_t index, uint32_t slot)
{
    return pa.GetSimdVector(index, slot);
}

#if ENABLE_AVX512_SIMD16
INLINE simd16vector& PaGetSimdVector_simd16(PA_STATE& pa, uint32_t index, uint32_t slot)
{
    return pa.GetSimdVector_simd16(index, slot);
}

#endif
// Cut-aware primitive assembler.
struct PA_STATE_CUT : public PA_STATE
{
    SIMDMASK* pCutIndices{ nullptr };   // cut indices buffer, 1 bit per vertex
    uint32_t numVerts{ 0 };             // number of vertices available in buffer store
    uint32_t numAttribs{ 0 };           // number of attributes
    int32_t numRemainingVerts{ 0 };     // number of verts remaining to be assembled
    uint32_t numVertsToAssemble{ 0 };   // total number of verts to assemble for the draw
#if ENABLE_AVX512_SIMD16
    OSALIGNSIMD16(uint32_t) indices[MAX_NUM_VERTS_PER_PRIM][SIMD_WIDTH];    // current index buffer for gather
#else
    OSALIGNSIMD(uint32_t) indices[MAX_NUM_VERTS_PER_PRIM][SIMD_WIDTH];      // current index buffer for gather
#endif
    SIMDSCALARI vOffsets[MAX_NUM_VERTS_PER_PRIM];   // byte offsets for currently assembling simd
    uint32_t numPrimsAssembled{ 0 };    // number of primitives that are fully assembled
    uint32_t headVertex{ 0 };           // current unused vertex slot in vertex buffer store
    uint32_t tailVertex{ 0 };           // beginning vertex currently assembling
    uint32_t curVertex{ 0 };            // current unprocessed vertex
    uint32_t startPrimId{ 0 };          // starting prim id
    SIMDSCALARI vPrimId;                // vector of prim ID
    bool needOffsets{ false };          // need to compute gather offsets for current SIMD
    uint32_t vertsPerPrim{ 0 };
    bool processCutVerts{ false };      // vertex indices with cuts should be processed as normal, otherwise they
                                        // are ignored. Fetch shader sends invalid verts on cuts that should be ignored
                                        // while the GS sends valid verts for every index

    simdvector junkVector;              // junk simdvector for unimplemented API
#if ENABLE_AVX512_SIMD16
    simd16vector junkVector_simd16;     // junk simd16vector for unimplemented API
#endif

    // Topology state tracking
    uint32_t vert[MAX_NUM_VERTS_PER_PRIM];
    uint32_t curIndex{ 0 };
    bool reverseWinding{ false };       // indicates reverse winding for strips
    int32_t adjExtraVert{ 0 };          // extra vert used for tristrip w/ adj

    typedef void(PA_STATE_CUT::* PFN_PA_FUNC)(uint32_t vert, bool finish);
    PFN_PA_FUNC pfnPa{ nullptr };       // per-topology function that processes a single vert

    PA_STATE_CUT() {}
    PA_STATE_CUT(DRAW_CONTEXT* pDC, uint8_t* in_pStream, uint32_t in_streamSizeInVerts, uint32_t in_vertexStride, SIMDMASK* in_pIndices, uint32_t in_numVerts,
        uint32_t in_numAttribs, PRIMITIVE_TOPOLOGY topo, bool in_processCutVerts)
        : PA_STATE(pDC, in_pStream, in_streamSizeInVerts, in_vertexStride)
    {
        numVerts = in_streamSizeInVerts;
        numAttribs = in_numAttribs;
        binTopology = topo;
        needOffsets = false;
        processCutVerts = in_processCutVerts;

        numVertsToAssemble = numRemainingVerts = in_numVerts;
        numPrimsAssembled = 0;
        headVertex = tailVertex = curVertex = 0;

        curIndex = 0;
        pCutIndices = in_pIndices;
        memset(indices, 0, sizeof(indices));
#if USE_SIMD16_FRONTEND
        vPrimId = _simd16_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
#else
        vPrimId = _simd_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
#endif
        reverseWinding = false;
        adjExtraVert = -1;

        bool gsEnabled = pDC->pState->state.gsState.gsEnable;
        vertsPerPrim = NumVertsPerPrim(topo, gsEnabled);

        switch (topo)
        {
        case TOP_TRIANGLE_LIST:     pfnPa = &PA_STATE_CUT::ProcessVertTriList; break;
        case TOP_TRI_LIST_ADJ:      pfnPa = gsEnabled ? &PA_STATE_CUT::ProcessVertTriListAdj : &PA_STATE_CUT::ProcessVertTriListAdjNoGs; break;
        case TOP_TRIANGLE_STRIP:    pfnPa = &PA_STATE_CUT::ProcessVertTriStrip; break;
        case TOP_TRI_STRIP_ADJ:     if (gsEnabled)
                                    {
                                        pfnPa = &PA_STATE_CUT::ProcessVertTriStripAdj<true>;
                                    }
                                    else
                                    {
                                        pfnPa = &PA_STATE_CUT::ProcessVertTriStripAdj<false>;
                                    }
                                    break;

        case TOP_POINT_LIST:        pfnPa = &PA_STATE_CUT::ProcessVertPointList; break;
        case TOP_LINE_LIST:         pfnPa = &PA_STATE_CUT::ProcessVertLineList; break;
        case TOP_LINE_LIST_ADJ:     pfnPa = gsEnabled ? &PA_STATE_CUT::ProcessVertLineListAdj : &PA_STATE_CUT::ProcessVertLineListAdjNoGs; break;
        case TOP_LINE_STRIP:        pfnPa = &PA_STATE_CUT::ProcessVertLineStrip; break;
        case TOP_LISTSTRIP_ADJ:     pfnPa = gsEnabled ? &PA_STATE_CUT::ProcessVertLineStripAdj : &PA_STATE_CUT::ProcessVertLineStripAdjNoGs; break;
        default: assert(0 && "Unimplemented topology");
        }
    }

    SIMDVERTEX& GetNextVsOutput()
    {
        uint32_t vertexIndex = this->headVertex / SIMD_WIDTH;
        this->headVertex = (this->headVertex + SIMD_WIDTH) % this->numVerts;
        this->needOffsets = true;
        SIMDVECTOR* pVertex = &((SIMDVECTOR*)pStreamBase)[vertexIndex * vertexStride];

        return *(SIMDVERTEX*)pVertex;
    }

    SIMDMASK& GetNextVsIndices()
    {
        uint32_t vertexIndex = this->headVertex / SIMD_WIDTH;
        SIMDMASK* pCurCutIndex = this->pCutIndices + vertexIndex;
        return *pCurCutIndex;
    }

    simdvector& GetSimdVector(uint32_t index, uint32_t slot)
    {
        // unused
        SWR_ASSERT(0 && "Not implemented");
        return junkVector;
    }

#if ENABLE_AVX512_SIMD16
    simd16vector& GetSimdVector_simd16(uint32_t index, uint32_t slot)
    {
        // unused
        SWR_ASSERT(0 && "Not implemented");
        return junkVector_simd16;
    }

#endif
    bool GetNextStreamOutput()
    {
        this->headVertex += SIMD_WIDTH;
        this->needOffsets = true;
        return HasWork();
    }

    SIMDSCALARI GetPrimID(uint32_t startID)
    {
#if USE_SIMD16_FRONTEND
        return _simd16_add_epi32(_simd16_set1_epi32(startID), this->vPrimId);
#else
        return _simd_add_epi32(_simd_set1_epi32(startID), this->vPrimId);
#endif
    }

    void Reset()
    {
#if ENABLE_AVX512_SIMD16
        useAlternateOffset = false;

#endif
        this->numRemainingVerts = this->numVertsToAssemble;
        this->numPrimsAssembled = 0;
        this->curIndex = 0;
        this->curVertex = 0;
        this->tailVertex = 0;
        this->headVertex = 0;
        this->reverseWinding = false;
        this->adjExtraVert = -1;
#if USE_SIMD16_FRONTEND
        this->vPrimId = _simd16_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
#else
        this->vPrimId = _simd_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
#endif
    }

    bool HasWork()
    {
        return this->numRemainingVerts > 0 || this->adjExtraVert != -1;
    }

    bool IsVertexStoreFull()
    {
        return ((this->headVertex + SIMD_WIDTH) % this->numVerts) == this->tailVertex;
    }

    void RestartTopology()
    {
        this->curIndex = 0;
        this->reverseWinding = false;
        this->adjExtraVert = -1;
    }

    bool IsCutIndex(uint32_t vertex)
    {
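        // Example (illustrative, assuming SIMD_WIDTH == 8): vertex 19 maps to cut-index
        // word 19 / 8 == 2 and bit 19 & 7 == 3 within that SIMDMASK word.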
        uint32_t vertexIndex = vertex / SIMD_WIDTH;
        uint32_t vertexOffset = vertex & (SIMD_WIDTH - 1);
        return _bittest((const LONG*)&this->pCutIndices[vertexIndex], vertexOffset) == 1;
    }

    // iterates across the unprocessed verts until we hit the end or we
    // have assembled SIMD prims
    void ProcessVerts()
    {
        while (this->numPrimsAssembled != SIMD_WIDTH &&
            this->numRemainingVerts > 0 &&
            this->curVertex != this->headVertex)
        {
            // if cut index, restart topology
            if (IsCutIndex(this->curVertex))
            {
                if (this->processCutVerts)
                {
                    (this->*pfnPa)(this->curVertex, false);
                }
                // finish off tri strip w/ adj before restarting topo
                if (this->adjExtraVert != -1)
                {
                    (this->*pfnPa)(this->curVertex, true);
                }
                RestartTopology();
            }
            else
            {
                (this->*pfnPa)(this->curVertex, false);
            }

            this->curVertex++;
            if (this->curVertex >= this->numVerts) {
                this->curVertex = 0;
            }
            this->numRemainingVerts--;
        }

        // special case last primitive for tri strip w/ adj
        if (this->numPrimsAssembled != SIMD_WIDTH && this->numRemainingVerts == 0 && this->adjExtraVert != -1)
        {
            (this->*pfnPa)(this->curVertex, true);
        }
    }

    void Advance()
    {
        // done with current batch
        // advance tail to the current unsubmitted vertex
        this->tailVertex = this->curVertex;
        this->numPrimsAssembled = 0;
#if USE_SIMD16_FRONTEND
        this->vPrimId = _simd16_add_epi32(vPrimId, _simd16_set1_epi32(SIMD_WIDTH));
#else
        this->vPrimId = _simd_add_epi32(vPrimId, _simd_set1_epi32(SIMD_WIDTH));
#endif
    }

    bool NextPrim()
    {
        // if we've assembled enough prims, we can advance to the next set of verts
        if (this->numPrimsAssembled == SIMD_WIDTH || this->numRemainingVerts <= 0)
        {
            Advance();
        }
        return false;
    }

    void ComputeOffsets()
    {
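        // Offset derivation (illustrative): a vertex index i lives in simd-vertex batch
        // i >> SIMD_WIDTH_LOG2, lane i & (SIMD_WIDTH - 1), so its base byte offset is
        //     (i >> SIMD_WIDTH_LOG2) * vertexStride * sizeof(SIMDVECTOR)
        //         + (i & (SIMD_WIDTH - 1)) * sizeof(float)
        // Assemble() later adds slot * sizeof(SIMDVECTOR) to step to the attribute and
        // walks SIMD_WIDTH floats per component.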
        for (uint32_t v = 0; v < this->vertsPerPrim; ++v)
        {
            uint32_t vertexStrideBytes = vertexStride * sizeof(SIMDVECTOR);
            SIMDSCALARI vIndices = *(SIMDSCALARI*)&this->indices[v][0];

            // step to simdvertex batch
            const uint32_t simdShift = SIMD_WIDTH_LOG2;
#if USE_SIMD16_FRONTEND
            SIMDSCALARI vVertexBatch = _simd16_srai_epi32(vIndices, simdShift);
            this->vOffsets[v] = _simd16_mullo_epi32(vVertexBatch, _simd16_set1_epi32(vertexStrideBytes));
#else
            SIMDSCALARI vVertexBatch = _simd_srai_epi32(vIndices, simdShift);
            this->vOffsets[v] = _simd_mullo_epi32(vVertexBatch, _simd_set1_epi32(vertexStrideBytes));
#endif

            // step to index
            const uint32_t simdMask = SIMD_WIDTH - 1;
#if USE_SIMD16_FRONTEND
            SIMDSCALARI vVertexIndex = _simd16_and_si(vIndices, _simd16_set1_epi32(simdMask));
            this->vOffsets[v] = _simd16_add_epi32(this->vOffsets[v], _simd16_mullo_epi32(vVertexIndex, _simd16_set1_epi32(sizeof(float))));
#else
            SIMDSCALARI vVertexIndex = _simd_and_si(vIndices, _simd_set1_epi32(simdMask));
            this->vOffsets[v] = _simd_add_epi32(this->vOffsets[v], _simd_mullo_epi32(vVertexIndex, _simd_set1_epi32(sizeof(float))));
#endif
        }
    }

    bool Assemble(uint32_t slot, simdvector *verts)
    {
        // process any outstanding verts
        ProcessVerts();

        // return false if we don't have enough prims assembled
        if (this->numPrimsAssembled != SIMD_WIDTH && this->numRemainingVerts > 0)
        {
            return false;
        }

        // cache off gather offsets given the current SIMD set of indices the first time we get an assemble
        if (this->needOffsets)
        {
            ComputeOffsets();
            this->needOffsets = false;
        }

        for (uint32_t v = 0; v < this->vertsPerPrim; ++v)
        {
            SIMDSCALARI offsets = this->vOffsets[v];

            // step to attribute
#if USE_SIMD16_FRONTEND
            offsets = _simd16_add_epi32(offsets, _simd16_set1_epi32(slot * sizeof(SIMDVECTOR)));
#else
            offsets = _simd_add_epi32(offsets, _simd_set1_epi32(slot * sizeof(SIMDVECTOR)));
#endif

            float* pBase = (float*)this->pStreamBase;
            for (uint32_t c = 0; c < 4; ++c)
            {
#if USE_SIMD16_FRONTEND
                simd16scalar temp = _simd16_i32gather_ps(pBase, offsets, 1);

                verts[v].v[c] = useAlternateOffset ? _simd16_extract_ps(temp, 1) : _simd16_extract_ps(temp, 0);
#else
                verts[v].v[c] = _simd_i32gather_ps(pBase, offsets, 1);
#endif

                // move base to next component
                pBase += SIMD_WIDTH;
            }
        }

        return true;
    }

#if ENABLE_AVX512_SIMD16
    bool Assemble_simd16(uint32_t slot, simd16vector verts[])
    {
        // process any outstanding verts
        ProcessVerts();

        // return false if we don't have enough prims assembled
        if (this->numPrimsAssembled != SIMD_WIDTH && this->numRemainingVerts > 0)
        {
            return false;
        }

        // cache off gather offsets given the current SIMD set of indices the first time we get an assemble
        if (this->needOffsets)
        {
            ComputeOffsets();
            this->needOffsets = false;
        }

        for (uint32_t v = 0; v < this->vertsPerPrim; ++v)
        {
            SIMDSCALARI offsets = this->vOffsets[v];

            // step to attribute
#if USE_SIMD16_FRONTEND
            offsets = _simd16_add_epi32(offsets, _simd16_set1_epi32(slot * sizeof(SIMDVECTOR)));
#else
            offsets = _simd_add_epi32(offsets, _simd_set1_epi32(slot * sizeof(simdvector)));
#endif

            float* pBase = (float*)this->pStreamBase;
            for (uint32_t c = 0; c < 4; ++c)
            {
#if USE_SIMD16_FRONTEND
                verts[v].v[c] = _simd16_i32gather_ps(pBase, offsets, 1);
#else
                verts[v].v[c] = _simd16_insert_ps(_simd16_setzero_ps(), _simd_i32gather_ps(pBase, offsets, 1), 0);
#endif

                // move base to next component
                pBase += SIMD_WIDTH;
            }
        }

        return true;
    }

#endif
    void AssembleSingle(uint32_t slot, uint32_t triIndex, simd4scalar tri[3])
    {
        // move to slot
        for (uint32_t v = 0; v < this->vertsPerPrim; ++v)
        {
            uint32_t* pOffset = (uint32_t*)&this->vOffsets[v];
#if USE_SIMD16_FRONTEND
            uint32_t offset = useAlternateOffset ? pOffset[triIndex + SIMD_WIDTH_DIV2] : pOffset[triIndex];
#else
            uint32_t offset = pOffset[triIndex];
#endif
            offset += sizeof(SIMDVECTOR) * slot;
            float* pVert = (float*)&tri[v];
            for (uint32_t c = 0; c < 4; ++c)
            {
                float* pComponent = (float*)(this->pStreamBase + offset);
                pVert[c] = *pComponent;
                offset += SIMD_WIDTH * sizeof(float);
            }
        }
    }

    uint32_t NumPrims()
    {
        return this->numPrimsAssembled;
    }

    // Per-topology functions
    void ProcessVertTriStrip(uint32_t index, bool finish)
    {
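        // Example (illustrative): verts 0,1,2 emit tri (0,1,2); the next vert 3 then
        // emits tri (1,3,2), since reverseWinding swaps the last two gather indices to
        // keep a consistent facing for the alternating strip triangles.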
        this->vert[this->curIndex] = index;
        this->curIndex++;
        if (this->curIndex == 3)
        {
            // assembled enough verts for prim, add to gather indices
            this->indices[0][this->numPrimsAssembled] = this->vert[0];
            if (reverseWinding)
            {
                this->indices[1][this->numPrimsAssembled] = this->vert[2];
                this->indices[2][this->numPrimsAssembled] = this->vert[1];
            }
            else
            {
                this->indices[1][this->numPrimsAssembled] = this->vert[1];
                this->indices[2][this->numPrimsAssembled] = this->vert[2];
            }

            // increment numPrimsAssembled
            this->numPrimsAssembled++;

            // set up next prim state
            this->vert[0] = this->vert[1];
            this->vert[1] = this->vert[2];
            this->curIndex = 2;
            this->reverseWinding ^= 1;
        }
    }

    template<bool gsEnabled>
    void AssembleTriStripAdj()
    {
        if (!gsEnabled)
        {
            this->vert[1] = this->vert[2];
            this->vert[2] = this->vert[4];

            this->indices[0][this->numPrimsAssembled] = this->vert[0];
            this->indices[1][this->numPrimsAssembled] = this->vert[1];
            this->indices[2][this->numPrimsAssembled] = this->vert[2];

            this->vert[4] = this->vert[2];
            this->vert[2] = this->vert[1];
        }
        else
        {
            this->indices[0][this->numPrimsAssembled] = this->vert[0];
            this->indices[1][this->numPrimsAssembled] = this->vert[1];
            this->indices[2][this->numPrimsAssembled] = this->vert[2];
            this->indices[3][this->numPrimsAssembled] = this->vert[3];
            this->indices[4][this->numPrimsAssembled] = this->vert[4];
            this->indices[5][this->numPrimsAssembled] = this->vert[5];
        }
        this->numPrimsAssembled++;
    }


    template<bool gsEnabled>
    void ProcessVertTriStripAdj(uint32_t index, bool finish)
    {
        // handle last primitive of tristrip
        if (finish && this->adjExtraVert != -1)
        {
            this->vert[3] = this->adjExtraVert;
            AssembleTriStripAdj<gsEnabled>();
            this->adjExtraVert = -1;
            return;
        }

        switch (this->curIndex)
        {
        case 0:
        case 1:
        case 2:
        case 4:
            this->vert[this->curIndex] = index;
            this->curIndex++;
            break;
        case 3:
            this->vert[5] = index;
            this->curIndex++;
            break;
        case 5:
            if (this->adjExtraVert == -1)
            {
                this->adjExtraVert = index;
            }
            else
            {
                this->vert[3] = index;
                if (!gsEnabled)
                {
                    AssembleTriStripAdj<gsEnabled>();

                    uint32_t nextTri[6];
                    if (this->reverseWinding)
                    {
                        nextTri[0] = this->vert[4];
                        nextTri[1] = this->vert[0];
                        nextTri[2] = this->vert[2];
                        nextTri[4] = this->vert[3];
                        nextTri[5] = this->adjExtraVert;
                    }
                    else
                    {
                        nextTri[0] = this->vert[2];
                        nextTri[1] = this->adjExtraVert;
                        nextTri[2] = this->vert[3];
                        nextTri[4] = this->vert[4];
                        nextTri[5] = this->vert[0];
                    }
                    for (uint32_t i = 0; i < 6; ++i)
                    {
                        this->vert[i] = nextTri[i];
                    }

                    this->adjExtraVert = -1;
                    this->reverseWinding ^= 1;
                }
                else
                {
                    this->curIndex++;
                }
            }
            break;
        case 6:
            SWR_ASSERT(this->adjExtraVert != -1, "Algorithm failure!");
            AssembleTriStripAdj<gsEnabled>();

            uint32_t nextTri[6];
            if (this->reverseWinding)
            {
                nextTri[0] = this->vert[4];
                nextTri[1] = this->vert[0];
                nextTri[2] = this->vert[2];
                nextTri[4] = this->vert[3];
                nextTri[5] = this->adjExtraVert;
            }
            else
            {
                nextTri[0] = this->vert[2];
                nextTri[1] = this->adjExtraVert;
                nextTri[2] = this->vert[3];
                nextTri[4] = this->vert[4];
                nextTri[5] = this->vert[0];
            }
            for (uint32_t i = 0; i < 6; ++i)
            {
                this->vert[i] = nextTri[i];
            }
            this->reverseWinding ^= 1;
            this->adjExtraVert = index;
            this->curIndex--;
            break;
        }
    }

    void ProcessVertTriList(uint32_t index, bool finish)
    {
        this->vert[this->curIndex] = index;
        this->curIndex++;
        if (this->curIndex == 3)
        {
            // assembled enough verts for prim, add to gather indices
            this->indices[0][this->numPrimsAssembled] = this->vert[0];
            this->indices[1][this->numPrimsAssembled] = this->vert[1];
            this->indices[2][this->numPrimsAssembled] = this->vert[2];

            // increment numPrimsAssembled
            this->numPrimsAssembled++;

            // set up next prim state
            this->curIndex = 0;
        }
    }

    void ProcessVertTriListAdj(uint32_t index, bool finish)
    {
        this->vert[this->curIndex] = index;
        this->curIndex++;
        if (this->curIndex == 6)
        {
            // assembled enough verts for prim, add to gather indices
            this->indices[0][this->numPrimsAssembled] = this->vert[0];
            this->indices[1][this->numPrimsAssembled] = this->vert[1];
            this->indices[2][this->numPrimsAssembled] = this->vert[2];
            this->indices[3][this->numPrimsAssembled] = this->vert[3];
            this->indices[4][this->numPrimsAssembled] = this->vert[4];
            this->indices[5][this->numPrimsAssembled] = this->vert[5];

            // increment numPrimsAssembled
            this->numPrimsAssembled++;

            // set up next prim state
            this->curIndex = 0;
        }
    }

    void ProcessVertTriListAdjNoGs(uint32_t index, bool finish)
    {
        this->vert[this->curIndex] = index;
        this->curIndex++;
        if (this->curIndex == 6)
        {
            // assembled enough verts for prim, add to gather indices
            this->indices[0][this->numPrimsAssembled] = this->vert[0];
            this->indices[1][this->numPrimsAssembled] = this->vert[2];
            this->indices[2][this->numPrimsAssembled] = this->vert[4];

            // increment numPrimsAssembled
            this->numPrimsAssembled++;

            // set up next prim state
            this->curIndex = 0;
        }
    }


    void ProcessVertLineList(uint32_t index, bool finish)
    {
        this->vert[this->curIndex] = index;
        this->curIndex++;
        if (this->curIndex == 2)
        {
            this->indices[0][this->numPrimsAssembled] = this->vert[0];
            this->indices[1][this->numPrimsAssembled] = this->vert[1];

            this->numPrimsAssembled++;
            this->curIndex = 0;
        }
    }

    void ProcessVertLineStrip(uint32_t index, bool finish)
    {
        this->vert[this->curIndex] = index;
        this->curIndex++;
        if (this->curIndex == 2)
        {
            // assembled enough verts for prim, add to gather indices
            this->indices[0][this->numPrimsAssembled] = this->vert[0];
            this->indices[1][this->numPrimsAssembled] = this->vert[1];

            // increment numPrimsAssembled
            this->numPrimsAssembled++;

            // set up next prim state
            this->vert[0] = this->vert[1];
            this->curIndex = 1;
        }
    }

    void ProcessVertLineStripAdj(uint32_t index, bool finish)
    {
        this->vert[this->curIndex] = index;
        this->curIndex++;
        if (this->curIndex == 4)
        {
            // assembled enough verts for prim, add to gather indices
            this->indices[0][this->numPrimsAssembled] = this->vert[0];
            this->indices[1][this->numPrimsAssembled] = this->vert[1];
            this->indices[2][this->numPrimsAssembled] = this->vert[2];
            this->indices[3][this->numPrimsAssembled] = this->vert[3];

            // increment numPrimsAssembled
            this->numPrimsAssembled++;

            // set up next prim state
            this->vert[0] = this->vert[1];
            this->vert[1] = this->vert[2];
            this->vert[2] = this->vert[3];
            this->curIndex = 3;
        }
    }

    void ProcessVertLineStripAdjNoGs(uint32_t index, bool finish)
    {
        this->vert[this->curIndex] = index;
        this->curIndex++;
        if (this->curIndex == 4)
        {
            // assembled enough verts for prim, add to gather indices
            this->indices[0][this->numPrimsAssembled] = this->vert[1];
            this->indices[1][this->numPrimsAssembled] = this->vert[2];

            // increment numPrimsAssembled
            this->numPrimsAssembled++;

            // set up next prim state
            this->vert[0] = this->vert[1];
            this->vert[1] = this->vert[2];
            this->vert[2] = this->vert[3];
            this->curIndex = 3;
        }
    }

    void ProcessVertLineListAdj(uint32_t index, bool finish)
    {
        this->vert[this->curIndex] = index;
        this->curIndex++;
        if (this->curIndex == 4)
        {
            this->indices[0][this->numPrimsAssembled] = this->vert[0];
            this->indices[1][this->numPrimsAssembled] = this->vert[1];
            this->indices[2][this->numPrimsAssembled] = this->vert[2];
            this->indices[3][this->numPrimsAssembled] = this->vert[3];

            this->numPrimsAssembled++;
            this->curIndex = 0;
        }
    }

    void ProcessVertLineListAdjNoGs(uint32_t index, bool finish)
    {
        this->vert[this->curIndex] = index;
        this->curIndex++;
        if (this->curIndex == 4)
        {
            this->indices[0][this->numPrimsAssembled] = this->vert[1];
            this->indices[1][this->numPrimsAssembled] = this->vert[2];

            this->numPrimsAssembled++;
            this->curIndex = 0;
        }
    }

    void ProcessVertPointList(uint32_t index, bool finish)
    {
        this->vert[this->curIndex] = index;
        this->curIndex++;
        if (this->curIndex == 1)
        {
            this->indices[0][this->numPrimsAssembled] = this->vert[0];
            this->numPrimsAssembled++;
            this->curIndex = 0;
        }
    }
};

// Primitive Assembly for data output from the DomainShader.
struct PA_TESS : PA_STATE
{
    PA_TESS(
        DRAW_CONTEXT *in_pDC,
        const SIMDSCALAR* in_pVertData,
        uint32_t in_attributeStrideInVectors,
        uint32_t in_vertexStride,
        uint32_t in_numAttributes,
        uint32_t* (&in_ppIndices)[3],
        uint32_t in_numPrims,
        PRIMITIVE_TOPOLOGY in_binTopology) :

        PA_STATE(in_pDC, nullptr, 0, in_vertexStride),
        m_pVertexData(in_pVertData),
        m_attributeStrideInVectors(in_attributeStrideInVectors),
        m_numAttributes(in_numAttributes),
        m_numPrims(in_numPrims)
    {
#if USE_SIMD16_FRONTEND
        m_vPrimId = _simd16_setzero_si();
#else
        m_vPrimId = _simd_setzero_si();
#endif
        binTopology = in_binTopology;
        m_ppIndices[0] = in_ppIndices[0];
        m_ppIndices[1] = in_ppIndices[1];
        m_ppIndices[2] = in_ppIndices[2];

        switch (binTopology)
        {
        case TOP_POINT_LIST:
            m_numVertsPerPrim = 1;
            break;

        case TOP_LINE_LIST:
            m_numVertsPerPrim = 2;
            break;

        case TOP_TRIANGLE_LIST:
            m_numVertsPerPrim = 3;
            break;

        default:
            SWR_INVALID("Invalid binTopology (%d) for %s", binTopology, __FUNCTION__);
            break;
        }
    }

    bool HasWork()
    {
        return m_numPrims != 0;
    }

    simdvector& GetSimdVector(uint32_t index, uint32_t slot)
    {
        SWR_INVALID("%s NOT IMPLEMENTED", __FUNCTION__);
        return junkVector;
    }

#if ENABLE_AVX512_SIMD16
    simd16vector& GetSimdVector_simd16(uint32_t index, uint32_t slot)
    {
        SWR_INVALID("%s NOT IMPLEMENTED", __FUNCTION__);
        return junkVector_simd16;
    }

#endif
    static SIMDSCALARI GenPrimMask(uint32_t numPrims)
    {
        SWR_ASSERT(numPrims <= SIMD_WIDTH);
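        // Example (illustrative, non-SIMD16 path): numPrims == 3 loads 8 lanes starting
        // at maskGen[8 - 3], yielding { -1, -1, -1, 0, 0, 0, 0, 0 } -- i.e. only the
        // first 3 lanes enabled.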
#if USE_SIMD16_FRONTEND
        static const OSALIGNLINE(int32_t) maskGen[SIMD_WIDTH * 2] =
        {
            -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
             0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
        };

        return _simd16_loadu_si((const SIMDSCALARI*)&maskGen[SIMD_WIDTH - numPrims]);
#else
        static const OSALIGNLINE(int32_t) maskGen[SIMD_WIDTH * 2] =
        {
            -1, -1, -1, -1, -1, -1, -1, -1,
             0,  0,  0,  0,  0,  0,  0,  0
        };

        return _simd_loadu_si((const SIMDSCALARI*)&maskGen[SIMD_WIDTH - numPrims]);
#endif
    }

    bool Assemble(uint32_t slot, simdvector verts[])
    {
        SWR_ASSERT(slot < m_numAttributes);

        uint32_t numPrimsToAssemble = PA_TESS::NumPrims();
        if (0 == numPrimsToAssemble)
        {
            return false;
        }

        SIMDSCALARI mask = GenPrimMask(numPrimsToAssemble);

        const float* pBaseAttrib = (const float*)&m_pVertexData[slot * m_attributeStrideInVectors * 4];
        for (uint32_t i = 0; i < m_numVertsPerPrim; ++i)
        {
#if USE_SIMD16_FRONTEND
            SIMDSCALARI indices = _simd16_load_si((const SIMDSCALARI*)m_ppIndices[i]);
#else
            SIMDSCALARI indices = _simd_load_si((const SIMDSCALARI*)m_ppIndices[i]);
#endif

            const float* pBase = pBaseAttrib;
            for (uint32_t c = 0; c < 4; ++c)
            {
#if USE_SIMD16_FRONTEND
                simd16scalar temp = _simd16_mask_i32gather_ps(
                    _simd16_setzero_ps(),
                    pBase,
                    indices,
                    _simd16_castsi_ps(mask),
                    4 /* gcc doesn't like sizeof(float) */);

                verts[i].v[c] = useAlternateOffset ? _simd16_extract_ps(temp, 1) : _simd16_extract_ps(temp, 0);
#else
                verts[i].v[c] = _simd_mask_i32gather_ps(
                    _simd_setzero_ps(),
                    pBase,
                    indices,
                    _simd_castsi_ps(mask),
                    4); // gcc doesn't like sizeof(float)
#endif
                pBase += m_attributeStrideInVectors * SIMD_WIDTH;
            }
        }

        return true;
    }

#if ENABLE_AVX512_SIMD16
    bool Assemble_simd16(uint32_t slot, simd16vector verts[])
    {
        SWR_ASSERT(slot < m_numAttributes);

        uint32_t numPrimsToAssemble = PA_TESS::NumPrims();
        if (0 == numPrimsToAssemble)
        {
            return false;
        }

        SIMDSCALARI mask = GenPrimMask(numPrimsToAssemble);

        const float* pBaseAttrib = (const float*)&m_pVertexData[slot * m_attributeStrideInVectors * 4];
        for (uint32_t i = 0; i < m_numVertsPerPrim; ++i)
        {
#if USE_SIMD16_FRONTEND
            SIMDSCALARI indices = _simd16_load_si((const SIMDSCALARI*)m_ppIndices[i]);
#else
            SIMDSCALARI indices = _simd_load_si((const SIMDSCALARI*)m_ppIndices[i]);
#endif

            const float* pBase = pBaseAttrib;
            for (uint32_t c = 0; c < 4; ++c)
            {
#if USE_SIMD16_FRONTEND
                verts[i].v[c] = _simd16_mask_i32gather_ps(
                    _simd16_setzero_ps(),
                    pBase,
                    indices,
                    _simd16_castsi_ps(mask),
                    4 /* gcc doesn't like sizeof(float) */);
#else
                simdscalar temp = _simd_mask_i32gather_ps(
                    _simd_setzero_ps(),
                    pBase,
                    indices,
                    _simd_castsi_ps(mask),
                    4 /* gcc doesn't like sizeof(float) */);
                verts[i].v[c] = _simd16_insert_ps(_simd16_setzero_ps(), temp, 0);
#endif
                pBase += m_attributeStrideInVectors * SIMD_WIDTH;
            }
        }

        return true;
    }

#endif
    void AssembleSingle(uint32_t slot, uint32_t primIndex, simd4scalar verts[])
    {
        SWR_ASSERT(slot < m_numAttributes);
        SWR_ASSERT(primIndex < PA_TESS::NumPrims());

        const float* pVertDataBase = (const float*)&m_pVertexData[slot * m_attributeStrideInVectors * 4];
        for (uint32_t i = 0; i < m_numVertsPerPrim; ++i)
        {
#if USE_SIMD16_FRONTEND
            uint32_t index = useAlternateOffset ? m_ppIndices[i][primIndex + SIMD_WIDTH_DIV2] : m_ppIndices[i][primIndex];
#else
            uint32_t index = m_ppIndices[i][primIndex];
#endif
            const float* pVertData = pVertDataBase;
            float* pVert = (float*)&verts[i];

            for (uint32_t c = 0; c < 4; ++c)
            {
                pVert[c] = pVertData[index];
                pVertData += m_attributeStrideInVectors * SIMD_WIDTH;
            }
        }
    }

    bool NextPrim()
    {
        uint32_t numPrims = PA_TESS::NumPrims();
        m_numPrims -= numPrims;
        m_ppIndices[0] += numPrims;
        m_ppIndices[1] += numPrims;
        m_ppIndices[2] += numPrims;

        return HasWork();
    }

    SIMDVERTEX& GetNextVsOutput()
    {
        SWR_NOT_IMPL;
        return junkVertex;
    }

    bool GetNextStreamOutput()
    {
        SWR_NOT_IMPL;
        return false;
    }

    SIMDMASK& GetNextVsIndices()
    {
        SWR_NOT_IMPL;
        return junkIndices;
    }

    uint32_t NumPrims()
    {
        return std::min<uint32_t>(m_numPrims, SIMD_WIDTH);
    }

    void Reset()
    {
        SWR_NOT_IMPL;
    }

    SIMDSCALARI GetPrimID(uint32_t startID)
    {
#if USE_SIMD16_FRONTEND
        return _simd16_add_epi32(_simd16_set1_epi32(startID), m_vPrimId);
#else
        return _simd_add_epi32(_simd_set1_epi32(startID), m_vPrimId);
#endif
    }

private:
    const SIMDSCALAR* m_pVertexData = nullptr;
    uint32_t m_attributeStrideInVectors = 0;
    uint32_t m_numAttributes = 0;
    uint32_t m_numPrims = 0;
    uint32_t* m_ppIndices[3];

    uint32_t m_numVertsPerPrim = 0;

    SIMDSCALARI m_vPrimId;

    simdvector junkVector;          // junk simdvector for unimplemented API
#if ENABLE_AVX512_SIMD16
    simd16vector junkVector_simd16; // junk simd16vector for unimplemented API
#endif
    SIMDVERTEX junkVertex;          // junk SIMDVERTEX for unimplemented API
    SIMDMASK junkIndices;           // temporary index store for unused virtual function
};

// Primitive Assembler factory class, responsible for creating and initializing the correct assembler
// based on state.
template <typename IsIndexedT, typename IsCutIndexEnabledT>
struct PA_FACTORY
{
    PA_FACTORY(DRAW_CONTEXT* pDC, PRIMITIVE_TOPOLOGY in_topo, uint32_t numVerts, PA_STATE::SIMDVERTEX *pVertexStore, uint32_t vertexStoreSize, uint32_t vertexStride) : topo(in_topo)
    {
#if KNOB_ENABLE_CUT_AWARE_PA == TRUE
        const API_STATE& state = GetApiState(pDC);
        if ((IsIndexedT::value && IsCutIndexEnabledT::value && (
            topo == TOP_TRIANGLE_STRIP || topo == TOP_POINT_LIST ||
            topo == TOP_LINE_LIST || topo == TOP_LINE_STRIP ||
            topo == TOP_TRIANGLE_LIST)) ||

            // non-indexed draws with adjacency topologies must use cut-aware PA until we add support
            // for them in the optimized PA
            (topo == TOP_LINE_LIST_ADJ || topo == TOP_LISTSTRIP_ADJ || topo == TOP_TRI_LIST_ADJ || topo == TOP_TRI_STRIP_ADJ))
        {
            memset(&indexStore, 0, sizeof(indexStore));
            uint32_t numAttribs = state.feNumAttributes;

            new (&this->paCut) PA_STATE_CUT(pDC, reinterpret_cast<uint8_t *>(pVertexStore), vertexStoreSize * PA_STATE::SIMD_WIDTH,
                vertexStride, &this->indexStore[0], numVerts, numAttribs, state.topology, false);
            cutPA = true;
        }
        else
#endif
        {
            uint32_t numPrims = GetNumPrims(in_topo, numVerts);
            new (&this->paOpt) PA_STATE_OPT(pDC, numPrims, reinterpret_cast<uint8_t *>(pVertexStore), vertexStoreSize * PA_STATE::SIMD_WIDTH, vertexStride, false);
            cutPA = false;
        }
    }

    PA_STATE& GetPA()
    {
#if KNOB_ENABLE_CUT_AWARE_PA == TRUE
        if (cutPA)
        {
            return this->paCut;
        }
        else
#endif
        {
            return this->paOpt;
        }
    }

    PA_STATE_OPT paOpt;
    PA_STATE_CUT paCut;

    bool cutPA{ false };

    PRIMITIVE_TOPOLOGY topo{ TOP_UNKNOWN };

    PA_STATE::SIMDMASK indexStore[MAX_NUM_VERTS_PER_PRIM];
};
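
// Typical usage (illustrative sketch only; simplified from the SWR front end, and the
// local names here are hypothetical): construct a PA_FACTORY per draw, fetch the PA it
// selected, then alternate between filling SIMD vertices and assembling primitives.
//
//     PA_FACTORY<IsIndexedT, IsCutIndexEnabledT> paFactory(
//         pDC, topo, numVerts, pVertexStore, vertexStoreSize, vertexStride);
//     PA_STATE& pa = paFactory.GetPA();
//     while (pa.HasWork())
//     {
//         PA_STATE::SIMDVERTEX& vsOut = pa.GetNextVsOutput();
//         // ... run the fetch and vertex shaders, writing their output into vsOut ...
//         do
//         {
//             simdvector prim[MAX_NUM_VERTS_PER_PRIM];
//             if (pa.Assemble(VERTEX_POSITION_SLOT, prim))
//             {
//                 // ... bin/clip the pa.NumPrims() assembled primitives ...
//             }
//         } while (pa.NextPrim());
//     }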