swr/rast: Add support to PA for variable sized vertices
[mesa.git] src/gallium/drivers/swr/rasterizer/core/pa.h
/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file pa.h
*
* @brief Definitions for primitive assembly.
*        N primitives are assembled at a time, where N is the SIMD width.
*        A state machine, that is specific for a given topology, drives the
*        assembly of vertices into triangles.
*
******************************************************************************/
#pragma once

#include "frontend.h"

struct PA_STATE
{
#if USE_SIMD16_FRONTEND
    enum
    {
        SIMD_WIDTH = KNOB_SIMD16_WIDTH,
        SIMD_WIDTH_DIV2 = KNOB_SIMD16_WIDTH / 2,
        SIMD_WIDTH_LOG2 = 4
    };

    typedef simd16mask SIMDMASK;

    typedef simd16scalar SIMDSCALAR;
    typedef simd16vector SIMDVECTOR;
    typedef simd16vertex SIMDVERTEX;

    typedef simd16scalari SIMDSCALARI;

#else
    enum
    {
        SIMD_WIDTH = KNOB_SIMD_WIDTH,
        SIMD_WIDTH_DIV2 = KNOB_SIMD_WIDTH / 2,
        SIMD_WIDTH_LOG2 = 3
    };

    typedef simdmask SIMDMASK;

    typedef simdscalar SIMDSCALAR;
    typedef simdvector SIMDVECTOR;
    typedef simdvertex SIMDVERTEX;

    typedef simdscalari SIMDSCALARI;

#endif
    DRAW_CONTEXT *pDC{ nullptr };       // draw context
    uint8_t* pStreamBase{ nullptr };    // vertex stream
    uint32_t streamSizeInVerts{ 0 };    // total size of the input stream in verts
    uint32_t vertexStride{ 0 };         // stride of a vertex in simdvector units

    // The topology the binner will use. In some cases the FE changes the topology from the api state.
    PRIMITIVE_TOPOLOGY binTopology{ TOP_UNKNOWN };

#if ENABLE_AVX512_SIMD16
    bool useAlternateOffset{ false };

#endif
    PA_STATE() {}
    PA_STATE(DRAW_CONTEXT *in_pDC, uint8_t* in_pStreamBase, uint32_t in_streamSizeInVerts, uint32_t in_vertexStride) :
        pDC(in_pDC), pStreamBase(in_pStreamBase), streamSizeInVerts(in_streamSizeInVerts), vertexStride(in_vertexStride) {}

    virtual bool HasWork() = 0;
    virtual simdvector& GetSimdVector(uint32_t index, uint32_t slot) = 0;
#if ENABLE_AVX512_SIMD16
    virtual simd16vector& GetSimdVector_simd16(uint32_t index, uint32_t slot) = 0;
#endif
    virtual bool Assemble(uint32_t slot, simdvector verts[]) = 0;
#if ENABLE_AVX512_SIMD16
    virtual bool Assemble_simd16(uint32_t slot, simd16vector verts[]) = 0;
#endif
    virtual void AssembleSingle(uint32_t slot, uint32_t primIndex, __m128 verts[]) = 0;
    virtual bool NextPrim() = 0;
    virtual SIMDVERTEX& GetNextVsOutput() = 0;
    virtual bool GetNextStreamOutput() = 0;
    virtual SIMDMASK& GetNextVsIndices() = 0;
    virtual uint32_t NumPrims() = 0;
    virtual void Reset() = 0;
    virtual SIMDSCALARI GetPrimID(uint32_t startID) = 0;
};
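
// Illustrative usage sketch (not part of the original header): how a frontend
// consumer might drive any PA_STATE-derived assembler. The names 'pa' and
// 'slot' are hypothetical; the real driver loop lives in the frontend and
// differs in detail.
//
//     while (pa.HasWork())
//     {
//         // ... run the VS into pa.GetNextVsOutput(), then:
//         simdvector prims[MAX_NUM_VERTS_PER_PRIM];
//         if (pa.Assemble(slot, prims))
//         {
//             // bin/process pa.NumPrims() assembled primitives
//         }
//         pa.NextPrim();
//     }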

// The Optimized PA is a state machine that assembles triangles from vertex shader simd
// output. Here is the sequence:
//    1. Execute FS/VS to generate a simd vertex (4 vertices for SSE simd and 8 for AVX simd).
//    2. Execute PA function to assemble and bin triangles.
//        a. The PA function is a set of functions that collectively make up the
//           state machine for a given topology.
//              1. We use a state index to track which PA function to call.
//        b. Often the PA function needs 2 simd vertices in order to assemble the next triangle.
//              1. We call these the current and previous simd vertices.
//              2. The SSE simd is 4-wide, which is not a multiple of the 3 verts needed for
//                 triangles. In order to assemble the second triangle of a triangle list, we
//                 need the last vertex from the previous simd and the first 2 vertices from
//                 the current simd.
//              3. At times the PA can assemble multiple triangles from the 2 simd vertices.
//
// This optimized PA is not cut aware, so it should only be used for non-indexed draws or
// draws without cuts.
struct PA_STATE_OPT : public PA_STATE
{
    uint32_t numPrims{ 0 };             // Total number of primitives for draw.
    uint32_t numPrimsComplete{ 0 };     // Total number of complete primitives.

    uint32_t numSimdPrims{ 0 };         // Number of prims in current simd.

    uint32_t cur{ 0 };                  // index to current VS output.
    uint32_t prev{ 0 };                 // index to prev VS output. Not really needed in the state.
    const uint32_t first{ 0 };          // index to first VS output. Used for tri fan and line loop.

    uint32_t counter{ 0 };              // state counter
    bool reset{ false };                // reset state

    uint32_t primIDIncr{ 0 };           // how much to increment for each vector (typically vector / {1, 2})
    SIMDSCALARI primID;

    typedef bool(*PFN_PA_FUNC)(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
#if ENABLE_AVX512_SIMD16
    typedef bool(*PFN_PA_FUNC_SIMD16)(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
#endif
    typedef void(*PFN_PA_SINGLE_FUNC)(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]);

    PFN_PA_FUNC pfnPaFunc{ nullptr };               // PA state machine function for assembling 4 triangles.
#if ENABLE_AVX512_SIMD16
    PFN_PA_FUNC_SIMD16 pfnPaFunc_simd16{ nullptr };
#endif
    PFN_PA_SINGLE_FUNC pfnPaSingleFunc{ nullptr };  // PA state machine function for assembling single triangle.
    PFN_PA_FUNC pfnPaFuncReset{ nullptr };          // initial state to set on reset
#if ENABLE_AVX512_SIMD16
    PFN_PA_FUNC_SIMD16 pfnPaFuncReset_simd16{ nullptr };
#endif

    // state used to advance the PA when Next is called
    PFN_PA_FUNC pfnPaNextFunc{ nullptr };
#if ENABLE_AVX512_SIMD16
    PFN_PA_FUNC_SIMD16 pfnPaNextFunc_simd16{ nullptr };
#endif
    uint32_t nextNumSimdPrims{ 0 };
    uint32_t nextNumPrimsIncrement{ 0 };
    bool nextReset{ false };
    bool isStreaming{ false };

    SIMDMASK junkIndices{ 0 };          // temporary index store for unused virtual function

    PA_STATE_OPT() {}
    PA_STATE_OPT(DRAW_CONTEXT* pDC, uint32_t numPrims, uint8_t* pStream, uint32_t streamSizeInVerts,
        uint32_t vertexStride, bool in_isStreaming, PRIMITIVE_TOPOLOGY topo = TOP_UNKNOWN);

    bool HasWork()
    {
        return this->numPrimsComplete < this->numPrims;
    }

    simdvector& GetSimdVector(uint32_t index, uint32_t slot)
    {
        SWR_ASSERT(slot < vertexStride);
        uint32_t offset = index * vertexStride + slot;
        simdvector& vertexSlot = ((simdvector*)pStreamBase)[offset];
        return vertexSlot;
    }

#if ENABLE_AVX512_SIMD16
    simd16vector& GetSimdVector_simd16(uint32_t index, uint32_t slot)
    {
        SWR_ASSERT(slot < vertexStride);
        uint32_t offset = index * vertexStride + slot;
        simd16vector& vertexSlot = ((simd16vector*)pStreamBase)[offset];
        return vertexSlot;
    }

#endif
    // Assembles a SIMD batch of triangles. Each simdvector holds one vertex of
    // each triangle in SoA form (xxxx yyyy zzzz wwww), and there are 3 verts per triangle.
    bool Assemble(uint32_t slot, simdvector verts[])
    {
        return this->pfnPaFunc(*this, slot, verts);
    }

#if ENABLE_AVX512_SIMD16
    bool Assemble_simd16(uint32_t slot, simd16vector verts[])
    {
        return this->pfnPaFunc_simd16(*this, slot, verts);
    }

#endif
    // Assembles 1 primitive. Each simdscalar is a vertex (xyzw).
    void AssembleSingle(uint32_t slot, uint32_t primIndex, __m128 verts[])
    {
        return this->pfnPaSingleFunc(*this, slot, primIndex, verts);
    }

    bool NextPrim()
    {
        this->pfnPaFunc = this->pfnPaNextFunc;
#if ENABLE_AVX512_SIMD16
        this->pfnPaFunc_simd16 = this->pfnPaNextFunc_simd16;
#endif
        this->numSimdPrims = this->nextNumSimdPrims;
        this->numPrimsComplete += this->nextNumPrimsIncrement;
        this->reset = this->nextReset;

        if (this->isStreaming)
        {
            this->reset = false;
        }

        bool morePrims = false;

        if (this->numSimdPrims > 0)
        {
            morePrims = true;
            this->numSimdPrims--;
        }
        else
        {
            this->counter = (this->reset) ? 0 : (this->counter + 1);
            this->reset = false;
        }

        if (!HasWork())
        {
            morePrims = false;  // no more to do
        }

        return morePrims;
    }

    SIMDVERTEX& GetNextVsOutput()
    {
        const uint32_t numSimdVerts = streamSizeInVerts / SIMD_WIDTH;

        // increment cur and prev indices
        if (counter < numSimdVerts)
        {
            // prev undefined for first state
            prev = cur;
            cur = counter;
        }
        else
        {
            // swap/recycle last two simd verts for prev and cur, leave other simd verts intact in the buffer
            uint32_t temp = prev;

            prev = cur;
            cur = temp;
        }

        SWR_ASSERT(cur < numSimdVerts);
        SIMDVECTOR* pVertex = &((SIMDVECTOR*)pStreamBase)[cur * vertexStride];

        return *(SIMDVERTEX*)pVertex;
    }
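
    // Note on GetNextVsOutput() above (illustrative, not in the original
    // source): with numSimdVerts = 4, successive calls hand out simd verts
    // 0, 1, 2, 3 in order; once the store is exhausted, cur and prev ping-pong
    // between the last two slots (2 and 3), so strip topologies keep their
    // previous simd vertex live while the rest of the buffer stays intact.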

    SIMDMASK& GetNextVsIndices()
    {
        // unused in optimized PA, pass tmp buffer back
        return junkIndices;
    }

    bool GetNextStreamOutput()
    {
        this->prev = this->cur;
        this->cur = this->counter;

        return HasWork();
    }

    uint32_t NumPrims()
    {
        return (this->numPrimsComplete + this->nextNumPrimsIncrement > this->numPrims) ?
            (SIMD_WIDTH - (this->numPrimsComplete + this->nextNumPrimsIncrement - this->numPrims)) : SIMD_WIDTH;
    }
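
    // Worked example for NumPrims() above (illustrative, not in the original
    // source): with SIMD_WIDTH = 8, numPrims = 10 and nextNumPrimsIncrement = 8,
    // the first batch reports 8 prims; on the last batch numPrimsComplete = 8,
    // so NumPrims() returns 8 - (8 + 8 - 10) = 2, masking off the unused lanes.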

    void SetNextState(PA_STATE_OPT::PFN_PA_FUNC pfnPaNextFunc,
        PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc,
        uint32_t numSimdPrims = 0,
        uint32_t numPrimsIncrement = 0,
        bool reset = false)
    {
        this->pfnPaNextFunc = pfnPaNextFunc;
        this->nextNumSimdPrims = numSimdPrims;
        this->nextNumPrimsIncrement = numPrimsIncrement;
        this->nextReset = reset;

        this->pfnPaSingleFunc = pfnPaNextSingleFunc;
    }

#if ENABLE_AVX512_SIMD16
    void SetNextState_simd16(PA_STATE_OPT::PFN_PA_FUNC_SIMD16 pfnPaNextFunc_simd16,
        PA_STATE_OPT::PFN_PA_FUNC pfnPaNextFunc,
        PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc,
        uint32_t numSimdPrims = 0,
        uint32_t numPrimsIncrement = 0,
        bool reset = false)
    {
        this->pfnPaNextFunc_simd16 = pfnPaNextFunc_simd16;
        this->pfnPaNextFunc = pfnPaNextFunc;
        this->nextNumSimdPrims = numSimdPrims;
        this->nextNumPrimsIncrement = numPrimsIncrement;
        this->nextReset = reset;

        this->pfnPaSingleFunc = pfnPaNextSingleFunc;
    }

#endif
    void Reset()
    {
#if ENABLE_AVX512_SIMD16
        useAlternateOffset = false;

#endif
        this->pfnPaFunc = this->pfnPaFuncReset;
#if ENABLE_AVX512_SIMD16
        this->pfnPaFunc_simd16 = this->pfnPaFuncReset_simd16;
#endif
        this->numPrimsComplete = 0;
        this->numSimdPrims = 0;
        this->cur = 0;
        this->prev = 0;
        this->counter = 0;
        this->reset = false;
    }

    SIMDSCALARI GetPrimID(uint32_t startID)
    {
#if USE_SIMD16_FRONTEND
        return _simd16_add_epi32(this->primID,
            _simd16_set1_epi32(startID + this->primIDIncr * (this->numPrimsComplete / SIMD_WIDTH)));
#else
        return _simd_add_epi32(this->primID,
            _simd_set1_epi32(startID + this->primIDIncr * (this->numPrimsComplete / SIMD_WIDTH)));
#endif
    }
};

// helper C wrappers to avoid having to rewrite all the PA topology state functions
INLINE void SetNextPaState(PA_STATE_OPT& pa, PA_STATE_OPT::PFN_PA_FUNC pfnPaNextFunc,
    PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc,
    uint32_t numSimdPrims = 0,
    uint32_t numPrimsIncrement = 0,
    bool reset = false)
{
    return pa.SetNextState(pfnPaNextFunc, pfnPaNextSingleFunc, numSimdPrims, numPrimsIncrement, reset);
}

#if ENABLE_AVX512_SIMD16
INLINE void SetNextPaState_simd16(PA_STATE_OPT& pa, PA_STATE_OPT::PFN_PA_FUNC_SIMD16 pfnPaNextFunc_simd16,
    PA_STATE_OPT::PFN_PA_FUNC pfnPaNextFunc,
    PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc,
    uint32_t numSimdPrims = 0,
    uint32_t numPrimsIncrement = 0,
    bool reset = false)
{
    return pa.SetNextState_simd16(pfnPaNextFunc_simd16, pfnPaNextFunc, pfnPaNextSingleFunc, numSimdPrims, numPrimsIncrement, reset);
}

#endif
INLINE simdvector& PaGetSimdVector(PA_STATE& pa, uint32_t index, uint32_t slot)
{
    return pa.GetSimdVector(index, slot);
}

#if ENABLE_AVX512_SIMD16
INLINE simd16vector& PaGetSimdVector_simd16(PA_STATE& pa, uint32_t index, uint32_t slot)
{
    return pa.GetSimdVector_simd16(index, slot);
}

#endif
// Cut-aware primitive assembler.
struct PA_STATE_CUT : public PA_STATE
{
    SIMDMASK* pCutIndices{ nullptr };   // cut indices buffer, 1 bit per vertex
    uint32_t numVerts{ 0 };             // number of vertices available in buffer store
    uint32_t numAttribs{ 0 };           // number of attributes
    int32_t numRemainingVerts{ 0 };     // number of verts remaining to be assembled
    uint32_t numVertsToAssemble{ 0 };   // total number of verts to assemble for the draw
#if ENABLE_AVX512_SIMD16
    OSALIGNSIMD16(uint32_t) indices[MAX_NUM_VERTS_PER_PRIM][SIMD_WIDTH];    // current index buffer for gather
#else
    OSALIGNSIMD(uint32_t) indices[MAX_NUM_VERTS_PER_PRIM][SIMD_WIDTH];      // current index buffer for gather
#endif
    SIMDSCALARI vOffsets[MAX_NUM_VERTS_PER_PRIM];   // byte offsets for currently assembling simd
    uint32_t numPrimsAssembled{ 0 };    // number of primitives that are fully assembled
    uint32_t headVertex{ 0 };           // current unused vertex slot in vertex buffer store
    uint32_t tailVertex{ 0 };           // beginning vertex currently assembling
    uint32_t curVertex{ 0 };            // current unprocessed vertex
    uint32_t startPrimId{ 0 };          // starting prim id
    SIMDSCALARI vPrimId;                // vector of prim ID
    bool needOffsets{ false };          // need to compute gather offsets for current SIMD
    uint32_t vertsPerPrim{ 0 };
    bool processCutVerts{ false };      // If set, vertex indices with cuts are processed as normal;
                                        // otherwise they are ignored. The fetch shader sends invalid
                                        // verts on cuts that should be ignored, while the GS sends
                                        // valid verts for every index.

    simdvector junkVector;              // junk simdvector for unimplemented API
#if ENABLE_AVX512_SIMD16
    simd16vector junkVector_simd16;     // junk simd16vector for unimplemented API
#endif

    // Topology state tracking
    uint32_t vert[MAX_NUM_VERTS_PER_PRIM];
    uint32_t curIndex{ 0 };
    bool reverseWinding{ false };       // indicates reverse winding for strips
    int32_t adjExtraVert{ 0 };          // extra vert used for tristrip w/ adj

    typedef void(PA_STATE_CUT::* PFN_PA_FUNC)(uint32_t vert, bool finish);
    PFN_PA_FUNC pfnPa{ nullptr };       // per-topology function that processes a single vert

    PA_STATE_CUT() {}
    PA_STATE_CUT(DRAW_CONTEXT* pDC, uint8_t* in_pStream, uint32_t in_streamSizeInVerts, uint32_t in_vertexStride, SIMDMASK* in_pIndices, uint32_t in_numVerts,
        uint32_t in_numAttribs, PRIMITIVE_TOPOLOGY topo, bool in_processCutVerts)
        : PA_STATE(pDC, in_pStream, in_streamSizeInVerts, in_vertexStride)
    {
        numVerts = in_streamSizeInVerts;
        numAttribs = in_numAttribs;
        binTopology = topo;
        needOffsets = false;
        processCutVerts = in_processCutVerts;

        numVertsToAssemble = numRemainingVerts = in_numVerts;
        numPrimsAssembled = 0;
        headVertex = tailVertex = curVertex = 0;

        curIndex = 0;
        pCutIndices = in_pIndices;
        memset(indices, 0, sizeof(indices));
#if USE_SIMD16_FRONTEND
        vPrimId = _simd16_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
#else
        vPrimId = _simd_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
#endif
        reverseWinding = false;
        adjExtraVert = -1;

        bool gsEnabled = pDC->pState->state.gsState.gsEnable;
        vertsPerPrim = NumVertsPerPrim(topo, gsEnabled);

        switch (topo)
        {
        case TOP_TRIANGLE_LIST:     pfnPa = &PA_STATE_CUT::ProcessVertTriList; break;
        case TOP_TRI_LIST_ADJ:      pfnPa = gsEnabled ? &PA_STATE_CUT::ProcessVertTriListAdj : &PA_STATE_CUT::ProcessVertTriListAdjNoGs; break;
        case TOP_TRIANGLE_STRIP:    pfnPa = &PA_STATE_CUT::ProcessVertTriStrip; break;
        case TOP_TRI_STRIP_ADJ:     if (gsEnabled)
                                    {
                                        pfnPa = &PA_STATE_CUT::ProcessVertTriStripAdj<true>;
                                    }
                                    else
                                    {
                                        pfnPa = &PA_STATE_CUT::ProcessVertTriStripAdj<false>;
                                    }
                                    break;

        case TOP_POINT_LIST:        pfnPa = &PA_STATE_CUT::ProcessVertPointList; break;
        case TOP_LINE_LIST:         pfnPa = &PA_STATE_CUT::ProcessVertLineList; break;
        case TOP_LINE_LIST_ADJ:     pfnPa = gsEnabled ? &PA_STATE_CUT::ProcessVertLineListAdj : &PA_STATE_CUT::ProcessVertLineListAdjNoGs; break;
        case TOP_LINE_STRIP:        pfnPa = &PA_STATE_CUT::ProcessVertLineStrip; break;
        case TOP_LISTSTRIP_ADJ:     pfnPa = gsEnabled ? &PA_STATE_CUT::ProcessVertLineStripAdj : &PA_STATE_CUT::ProcessVertLineStripAdjNoGs; break;
        default: assert(0 && "Unimplemented topology");
        }
    }

    SIMDVERTEX& GetNextVsOutput()
    {
        uint32_t vertexIndex = this->headVertex / SIMD_WIDTH;
        this->headVertex = (this->headVertex + SIMD_WIDTH) % this->numVerts;
        this->needOffsets = true;
        SIMDVECTOR* pVertex = &((SIMDVECTOR*)pStreamBase)[vertexIndex * vertexStride];

        return *(SIMDVERTEX*)pVertex;
    }

    SIMDMASK& GetNextVsIndices()
    {
        uint32_t vertexIndex = this->headVertex / SIMD_WIDTH;
        SIMDMASK* pCurCutIndex = this->pCutIndices + vertexIndex;
        return *pCurCutIndex;
    }

    simdvector& GetSimdVector(uint32_t index, uint32_t slot)
    {
        // unused
        SWR_ASSERT(0 && "Not implemented");
        return junkVector;
    }

#if ENABLE_AVX512_SIMD16
    simd16vector& GetSimdVector_simd16(uint32_t index, uint32_t slot)
    {
        // unused
        SWR_ASSERT(0 && "Not implemented");
        return junkVector_simd16;
    }

#endif
    bool GetNextStreamOutput()
    {
        this->headVertex += SIMD_WIDTH;
        this->needOffsets = true;
        return HasWork();
    }

    SIMDSCALARI GetPrimID(uint32_t startID)
    {
#if USE_SIMD16_FRONTEND
        return _simd16_add_epi32(_simd16_set1_epi32(startID), this->vPrimId);
#else
        return _simd_add_epi32(_simd_set1_epi32(startID), this->vPrimId);
#endif
    }

    void Reset()
    {
#if ENABLE_AVX512_SIMD16
        useAlternateOffset = false;

#endif
        this->numRemainingVerts = this->numVertsToAssemble;
        this->numPrimsAssembled = 0;
        this->curIndex = 0;
        this->curVertex = 0;
        this->tailVertex = 0;
        this->headVertex = 0;
        this->reverseWinding = false;
        this->adjExtraVert = -1;
#if USE_SIMD16_FRONTEND
        this->vPrimId = _simd16_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
#else
        this->vPrimId = _simd_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
#endif
    }

    bool HasWork()
    {
        return this->numRemainingVerts > 0 || this->adjExtraVert != -1;
    }

    bool IsVertexStoreFull()
    {
        return ((this->headVertex + SIMD_WIDTH) % this->numVerts) == this->tailVertex;
    }

    void RestartTopology()
    {
        this->curIndex = 0;
        this->reverseWinding = false;
        this->adjExtraVert = -1;
    }

    bool IsCutIndex(uint32_t vertex)
    {
        uint32_t vertexIndex = vertex / SIMD_WIDTH;
        uint32_t vertexOffset = vertex & (SIMD_WIDTH - 1);
        return _bittest((const LONG*)&this->pCutIndices[vertexIndex], vertexOffset) == 1;
    }
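
    // Note on IsCutIndex() above (illustrative, not in the original source):
    // pCutIndices packs one cut bit per vertex, SIMD_WIDTH bits per SIMDMASK
    // entry. With SIMD_WIDTH = 8, vertex 13 maps to pCutIndices[1] (13 / 8),
    // bit 5 (13 & 7); a set bit marks that index as a strip cut.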

    // iterates across the unprocessed verts until we hit the end or we
    // have assembled SIMD prims
    void ProcessVerts()
    {
        while (this->numPrimsAssembled != SIMD_WIDTH &&
            this->numRemainingVerts > 0 &&
            this->curVertex != this->headVertex)
        {
            // if cut index, restart topology
            if (IsCutIndex(this->curVertex))
            {
                if (this->processCutVerts)
                {
                    (this->*pfnPa)(this->curVertex, false);
                }
                // finish off tri strip w/ adj before restarting topo
                if (this->adjExtraVert != -1)
                {
                    (this->*pfnPa)(this->curVertex, true);
                }
                RestartTopology();
            }
            else
            {
                (this->*pfnPa)(this->curVertex, false);
            }

            this->curVertex++;
            if (this->curVertex >= this->numVerts)
            {
                this->curVertex = 0;
            }
            this->numRemainingVerts--;
        }

        // special case last primitive for tri strip w/ adj
        if (this->numPrimsAssembled != SIMD_WIDTH && this->numRemainingVerts == 0 && this->adjExtraVert != -1)
        {
            (this->*pfnPa)(this->curVertex, true);
        }
    }

    void Advance()
    {
        // done with current batch
        // advance tail to the current unsubmitted vertex
        this->tailVertex = this->curVertex;
        this->numPrimsAssembled = 0;
#if USE_SIMD16_FRONTEND
        this->vPrimId = _simd16_add_epi32(vPrimId, _simd16_set1_epi32(SIMD_WIDTH));
#else
        this->vPrimId = _simd_add_epi32(vPrimId, _simd_set1_epi32(SIMD_WIDTH));
#endif
    }

    bool NextPrim()
    {
        // if we've assembled enough prims, we can advance to the next set of verts
        if (this->numPrimsAssembled == SIMD_WIDTH || this->numRemainingVerts <= 0)
        {
            Advance();
        }
        return false;
    }

    void ComputeOffsets()
    {
        for (uint32_t v = 0; v < this->vertsPerPrim; ++v)
        {
            uint32_t vertexStrideBytes = vertexStride * sizeof(SIMDVECTOR);
            SIMDSCALARI vIndices = *(SIMDSCALARI*)&this->indices[v][0];

            // step to simdvertex batch
            const uint32_t simdShift = SIMD_WIDTH_LOG2;
#if USE_SIMD16_FRONTEND
            SIMDSCALARI vVertexBatch = _simd16_srai_epi32(vIndices, simdShift);
            this->vOffsets[v] = _simd16_mullo_epi32(vVertexBatch, _simd16_set1_epi32(vertexStrideBytes));
#else
            SIMDSCALARI vVertexBatch = _simd_srai_epi32(vIndices, simdShift);
            this->vOffsets[v] = _simd_mullo_epi32(vVertexBatch, _simd_set1_epi32(vertexStrideBytes));
#endif

            // step to index
            const uint32_t simdMask = SIMD_WIDTH - 1;
#if USE_SIMD16_FRONTEND
            SIMDSCALARI vVertexIndex = _simd16_and_si(vIndices, _simd16_set1_epi32(simdMask));
            this->vOffsets[v] = _simd16_add_epi32(this->vOffsets[v], _simd16_mullo_epi32(vVertexIndex, _simd16_set1_epi32(sizeof(float))));
#else
            SIMDSCALARI vVertexIndex = _simd_and_si(vIndices, _simd_set1_epi32(simdMask));
            this->vOffsets[v] = _simd_add_epi32(this->vOffsets[v], _simd_mullo_epi32(vVertexIndex, _simd_set1_epi32(sizeof(float))));
#endif
        }
    }
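
    // Scalar equivalent of ComputeOffsets() above (illustrative, not in the
    // original source): for a vertex index i, the byte offset of its first
    // float component in the SoA vertex store is
    //
    //     offset = (i >> SIMD_WIDTH_LOG2) * vertexStride * sizeof(SIMDVECTOR)
    //            + (i & (SIMD_WIDTH - 1)) * sizeof(float);
    //
    // i.e., step to the owning simdvertex batch, then to the lane within it.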

    // disabling buffer overrun warning for this function for what appears to be a bug in MSVC 2017
    PRAGMA_WARNING_PUSH_DISABLE(4789)
    bool Assemble(uint32_t slot, simdvector *verts)
    {
        // process any outstanding verts
        ProcessVerts();

        // return false if we don't have enough prims assembled
        if (this->numPrimsAssembled != SIMD_WIDTH && this->numRemainingVerts > 0)
        {
            return false;
        }

        // cache off gather offsets given the current SIMD set of indices the first time we get an assemble
        if (this->needOffsets)
        {
            ComputeOffsets();
            this->needOffsets = false;
        }

        for (uint32_t v = 0; v < this->vertsPerPrim; ++v)
        {
            SIMDSCALARI offsets = this->vOffsets[v];

            // step to attribute
#if USE_SIMD16_FRONTEND
            offsets = _simd16_add_epi32(offsets, _simd16_set1_epi32(slot * sizeof(SIMDVECTOR)));
#else
            offsets = _simd_add_epi32(offsets, _simd_set1_epi32(slot * sizeof(SIMDVECTOR)));
#endif

            float* pBase = (float*)this->pStreamBase;
            for (uint32_t c = 0; c < 4; ++c)
            {
#if USE_SIMD16_FRONTEND
                simd16scalar temp = _simd16_i32gather_ps(pBase, offsets, 1);

                verts[v].v[c] = useAlternateOffset ? _simd16_extract_ps(temp, 1) : _simd16_extract_ps(temp, 0);
#else
                verts[v].v[c] = _simd_i32gather_ps(pBase, offsets, 1);
#endif

                // move base to next component
                pBase += SIMD_WIDTH;
            }
        }

        return true;
    }
    PRAGMA_WARNING_POP()

#if ENABLE_AVX512_SIMD16
    bool Assemble_simd16(uint32_t slot, simd16vector verts[])
    {
        // process any outstanding verts
        ProcessVerts();

        // return false if we don't have enough prims assembled
        if (this->numPrimsAssembled != SIMD_WIDTH && this->numRemainingVerts > 0)
        {
            return false;
        }

        // cache off gather offsets given the current SIMD set of indices the first time we get an assemble
        if (this->needOffsets)
        {
            ComputeOffsets();
            this->needOffsets = false;
        }

        for (uint32_t v = 0; v < this->vertsPerPrim; ++v)
        {
            SIMDSCALARI offsets = this->vOffsets[v];

            // step to attribute
#if USE_SIMD16_FRONTEND
            offsets = _simd16_add_epi32(offsets, _simd16_set1_epi32(slot * sizeof(SIMDVECTOR)));
#else
            offsets = _simd_add_epi32(offsets, _simd_set1_epi32(slot * sizeof(simdvector)));
#endif

            float* pBase = (float*)this->pStreamBase;
            for (uint32_t c = 0; c < 4; ++c)
            {
#if USE_SIMD16_FRONTEND
                verts[v].v[c] = _simd16_i32gather_ps(pBase, offsets, 1);
#else
                verts[v].v[c] = _simd16_insert_ps(_simd16_setzero_ps(), _simd_i32gather_ps(pBase, offsets, 1), 0);
#endif

                // move base to next component
                pBase += SIMD_WIDTH;
            }
        }

        return true;
    }

#endif
    void AssembleSingle(uint32_t slot, uint32_t triIndex, __m128 tri[3])
    {
        // move to slot
        for (uint32_t v = 0; v < this->vertsPerPrim; ++v)
        {
            uint32_t* pOffset = (uint32_t*)&this->vOffsets[v];
#if USE_SIMD16_FRONTEND
            uint32_t offset = useAlternateOffset ? pOffset[triIndex + SIMD_WIDTH_DIV2] : pOffset[triIndex];
#else
            uint32_t offset = pOffset[triIndex];
#endif
            offset += sizeof(SIMDVECTOR) * slot;
            float* pVert = (float*)&tri[v];
            for (uint32_t c = 0; c < 4; ++c)
            {
                float* pComponent = (float*)(this->pStreamBase + offset);
                pVert[c] = *pComponent;
                offset += SIMD_WIDTH * sizeof(float);
            }
        }
    }

    uint32_t NumPrims()
    {
        return this->numPrimsAssembled;
    }

    // Per-topology functions
    void ProcessVertTriStrip(uint32_t index, bool finish)
    {
        this->vert[this->curIndex] = index;
        this->curIndex++;
        if (this->curIndex == 3)
        {
            // assembled enough verts for prim, add to gather indices
            this->indices[0][this->numPrimsAssembled] = this->vert[0];
            if (reverseWinding)
            {
                this->indices[1][this->numPrimsAssembled] = this->vert[2];
                this->indices[2][this->numPrimsAssembled] = this->vert[1];
            }
            else
            {
                this->indices[1][this->numPrimsAssembled] = this->vert[1];
                this->indices[2][this->numPrimsAssembled] = this->vert[2];
            }

            // increment numPrimsAssembled
            this->numPrimsAssembled++;

            // set up next prim state
            this->vert[0] = this->vert[1];
            this->vert[1] = this->vert[2];
            this->curIndex = 2;
            this->reverseWinding ^= 1;
        }
    }
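
    // Walk-through of ProcessVertTriStrip() above (illustrative, not in the
    // original source): for strip vertices 0, 1, 2, 3, 4 it emits tris
    // (0, 1, 2), (1, 3, 2), (2, 3, 4); reverseWinding swaps the last two
    // gather indices on every other tri so all assembled tris face the same way.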

    template<bool gsEnabled>
    void AssembleTriStripAdj()
    {
        if (!gsEnabled)
        {
            this->vert[1] = this->vert[2];
            this->vert[2] = this->vert[4];

            this->indices[0][this->numPrimsAssembled] = this->vert[0];
            this->indices[1][this->numPrimsAssembled] = this->vert[1];
            this->indices[2][this->numPrimsAssembled] = this->vert[2];

            this->vert[4] = this->vert[2];
            this->vert[2] = this->vert[1];
        }
        else
        {
            this->indices[0][this->numPrimsAssembled] = this->vert[0];
            this->indices[1][this->numPrimsAssembled] = this->vert[1];
            this->indices[2][this->numPrimsAssembled] = this->vert[2];
            this->indices[3][this->numPrimsAssembled] = this->vert[3];
            this->indices[4][this->numPrimsAssembled] = this->vert[4];
            this->indices[5][this->numPrimsAssembled] = this->vert[5];
        }
        this->numPrimsAssembled++;
    }

    template<bool gsEnabled>
    void ProcessVertTriStripAdj(uint32_t index, bool finish)
    {
        // handle last primitive of tristrip
        if (finish && this->adjExtraVert != -1)
        {
            this->vert[3] = this->adjExtraVert;
            AssembleTriStripAdj<gsEnabled>();
            this->adjExtraVert = -1;
            return;
        }

        switch (this->curIndex)
        {
        case 0:
        case 1:
        case 2:
        case 4:
            this->vert[this->curIndex] = index;
            this->curIndex++;
            break;
        case 3:
            this->vert[5] = index;
            this->curIndex++;
            break;
        case 5:
            if (this->adjExtraVert == -1)
            {
                this->adjExtraVert = index;
            }
            else
            {
                this->vert[3] = index;
                if (!gsEnabled)
                {
                    AssembleTriStripAdj<gsEnabled>();

                    uint32_t nextTri[6];
                    if (this->reverseWinding)
                    {
                        nextTri[0] = this->vert[4];
                        nextTri[1] = this->vert[0];
                        nextTri[2] = this->vert[2];
                        nextTri[4] = this->vert[3];
                        nextTri[5] = this->adjExtraVert;
                    }
                    else
                    {
                        nextTri[0] = this->vert[2];
                        nextTri[1] = this->adjExtraVert;
                        nextTri[2] = this->vert[3];
                        nextTri[4] = this->vert[4];
                        nextTri[5] = this->vert[0];
                    }
                    for (uint32_t i = 0; i < 6; ++i)
                    {
                        this->vert[i] = nextTri[i];
                    }

                    this->adjExtraVert = -1;
                    this->reverseWinding ^= 1;
                }
                else
                {
                    this->curIndex++;
                }
            }
            break;
        case 6:
            SWR_ASSERT(this->adjExtraVert != -1, "Algorithm failure!");
            AssembleTriStripAdj<gsEnabled>();

            uint32_t nextTri[6];
            if (this->reverseWinding)
            {
                nextTri[0] = this->vert[4];
                nextTri[1] = this->vert[0];
                nextTri[2] = this->vert[2];
                nextTri[4] = this->vert[3];
                nextTri[5] = this->adjExtraVert;
            }
            else
            {
                nextTri[0] = this->vert[2];
                nextTri[1] = this->adjExtraVert;
                nextTri[2] = this->vert[3];
                nextTri[4] = this->vert[4];
                nextTri[5] = this->vert[0];
            }
            for (uint32_t i = 0; i < 6; ++i)
            {
                this->vert[i] = nextTri[i];
            }
            this->reverseWinding ^= 1;
            this->adjExtraVert = index;
            this->curIndex--;
            break;
        }
    }

    void ProcessVertTriList(uint32_t index, bool finish)
    {
        this->vert[this->curIndex] = index;
        this->curIndex++;
        if (this->curIndex == 3)
        {
            // assembled enough verts for prim, add to gather indices
            this->indices[0][this->numPrimsAssembled] = this->vert[0];
            this->indices[1][this->numPrimsAssembled] = this->vert[1];
            this->indices[2][this->numPrimsAssembled] = this->vert[2];

            // increment numPrimsAssembled
            this->numPrimsAssembled++;

            // set up next prim state
            this->curIndex = 0;
        }
    }

    void ProcessVertTriListAdj(uint32_t index, bool finish)
    {
        this->vert[this->curIndex] = index;
        this->curIndex++;
        if (this->curIndex == 6)
        {
            // assembled enough verts for prim, add to gather indices
            this->indices[0][this->numPrimsAssembled] = this->vert[0];
            this->indices[1][this->numPrimsAssembled] = this->vert[1];
            this->indices[2][this->numPrimsAssembled] = this->vert[2];
            this->indices[3][this->numPrimsAssembled] = this->vert[3];
            this->indices[4][this->numPrimsAssembled] = this->vert[4];
            this->indices[5][this->numPrimsAssembled] = this->vert[5];

            // increment numPrimsAssembled
            this->numPrimsAssembled++;

            // set up next prim state
            this->curIndex = 0;
        }
    }

    void ProcessVertTriListAdjNoGs(uint32_t index, bool finish)
    {
        this->vert[this->curIndex] = index;
        this->curIndex++;
        if (this->curIndex == 6)
        {
            // assembled enough verts for prim, add to gather indices
            this->indices[0][this->numPrimsAssembled] = this->vert[0];
            this->indices[1][this->numPrimsAssembled] = this->vert[2];
            this->indices[2][this->numPrimsAssembled] = this->vert[4];

            // increment numPrimsAssembled
            this->numPrimsAssembled++;

            // set up next prim state
            this->curIndex = 0;
        }
    }

    void ProcessVertLineList(uint32_t index, bool finish)
    {
        this->vert[this->curIndex] = index;
        this->curIndex++;
        if (this->curIndex == 2)
        {
            this->indices[0][this->numPrimsAssembled] = this->vert[0];
            this->indices[1][this->numPrimsAssembled] = this->vert[1];

            this->numPrimsAssembled++;
            this->curIndex = 0;
        }
    }

    void ProcessVertLineStrip(uint32_t index, bool finish)
    {
        this->vert[this->curIndex] = index;
        this->curIndex++;
        if (this->curIndex == 2)
        {
            // assembled enough verts for prim, add to gather indices
            this->indices[0][this->numPrimsAssembled] = this->vert[0];
            this->indices[1][this->numPrimsAssembled] = this->vert[1];

            // increment numPrimsAssembled
            this->numPrimsAssembled++;

            // set up next prim state
            this->vert[0] = this->vert[1];
            this->curIndex = 1;
        }
    }

    void ProcessVertLineStripAdj(uint32_t index, bool finish)
    {
        this->vert[this->curIndex] = index;
        this->curIndex++;
        if (this->curIndex == 4)
        {
            // assembled enough verts for prim, add to gather indices
            this->indices[0][this->numPrimsAssembled] = this->vert[0];
            this->indices[1][this->numPrimsAssembled] = this->vert[1];
            this->indices[2][this->numPrimsAssembled] = this->vert[2];
            this->indices[3][this->numPrimsAssembled] = this->vert[3];

            // increment numPrimsAssembled
            this->numPrimsAssembled++;

            // set up next prim state
            this->vert[0] = this->vert[1];
            this->vert[1] = this->vert[2];
            this->vert[2] = this->vert[3];
            this->curIndex = 3;
        }
    }

    void ProcessVertLineStripAdjNoGs(uint32_t index, bool finish)
    {
        this->vert[this->curIndex] = index;
        this->curIndex++;
        if (this->curIndex == 4)
        {
            // assembled enough verts for prim, add to gather indices
            this->indices[0][this->numPrimsAssembled] = this->vert[1];
            this->indices[1][this->numPrimsAssembled] = this->vert[2];

            // increment numPrimsAssembled
            this->numPrimsAssembled++;

            // set up next prim state
            this->vert[0] = this->vert[1];
            this->vert[1] = this->vert[2];
            this->vert[2] = this->vert[3];
            this->curIndex = 3;
        }
    }

    void ProcessVertLineListAdj(uint32_t index, bool finish)
    {
        this->vert[this->curIndex] = index;
        this->curIndex++;
        if (this->curIndex == 4)
        {
            this->indices[0][this->numPrimsAssembled] = this->vert[0];
            this->indices[1][this->numPrimsAssembled] = this->vert[1];
            this->indices[2][this->numPrimsAssembled] = this->vert[2];
            this->indices[3][this->numPrimsAssembled] = this->vert[3];

            this->numPrimsAssembled++;
            this->curIndex = 0;
        }
    }

    void ProcessVertLineListAdjNoGs(uint32_t index, bool finish)
    {
        this->vert[this->curIndex] = index;
        this->curIndex++;
        if (this->curIndex == 4)
        {
            this->indices[0][this->numPrimsAssembled] = this->vert[1];
            this->indices[1][this->numPrimsAssembled] = this->vert[2];

            this->numPrimsAssembled++;
            this->curIndex = 0;
        }
    }

    void ProcessVertPointList(uint32_t index, bool finish)
    {
        this->vert[this->curIndex] = index;
        this->curIndex++;
        if (this->curIndex == 1)
        {
            this->indices[0][this->numPrimsAssembled] = this->vert[0];
            this->numPrimsAssembled++;
            this->curIndex = 0;
        }
    }
};

// Primitive Assembly for data output from the DomainShader.
struct PA_TESS : PA_STATE
{
    PA_TESS(
        DRAW_CONTEXT *in_pDC,
        const SIMDSCALAR* in_pVertData,
        uint32_t in_attributeStrideInVectors,
        uint32_t in_vertexStride,
        uint32_t in_numAttributes,
        uint32_t* (&in_ppIndices)[3],
        uint32_t in_numPrims,
        PRIMITIVE_TOPOLOGY in_binTopology) :

        PA_STATE(in_pDC, nullptr, 0, in_vertexStride),
        m_pVertexData(in_pVertData),
        m_attributeStrideInVectors(in_attributeStrideInVectors),
        m_numAttributes(in_numAttributes),
        m_numPrims(in_numPrims)
    {
#if USE_SIMD16_FRONTEND
        m_vPrimId = _simd16_setzero_si();
#else
        m_vPrimId = _simd_setzero_si();
#endif
        binTopology = in_binTopology;
        m_ppIndices[0] = in_ppIndices[0];
        m_ppIndices[1] = in_ppIndices[1];
        m_ppIndices[2] = in_ppIndices[2];

        switch (binTopology)
        {
        case TOP_POINT_LIST:
            m_numVertsPerPrim = 1;
            break;

        case TOP_LINE_LIST:
            m_numVertsPerPrim = 2;
            break;

        case TOP_TRIANGLE_LIST:
            m_numVertsPerPrim = 3;
            break;

        default:
            SWR_INVALID("Invalid binTopology (%d) for %s", binTopology, __FUNCTION__);
            break;
        }
    }

    bool HasWork()
    {
        return m_numPrims != 0;
    }

    simdvector& GetSimdVector(uint32_t index, uint32_t slot)
    {
        SWR_INVALID("%s NOT IMPLEMENTED", __FUNCTION__);
        return junkVector;
    }

#if ENABLE_AVX512_SIMD16
    simd16vector& GetSimdVector_simd16(uint32_t index, uint32_t slot)
    {
        SWR_INVALID("%s NOT IMPLEMENTED", __FUNCTION__);
        return junkVector_simd16;
    }

#endif
    static SIMDSCALARI GenPrimMask(uint32_t numPrims)
    {
        SWR_ASSERT(numPrims <= SIMD_WIDTH);
#if USE_SIMD16_FRONTEND
        static const OSALIGNLINE(int32_t) maskGen[SIMD_WIDTH * 2] =
        {
            -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
             0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
        };

        return _simd16_loadu_si((const SIMDSCALARI*)&maskGen[SIMD_WIDTH - numPrims]);
#else
        static const OSALIGNLINE(int32_t) maskGen[SIMD_WIDTH * 2] =
        {
            -1, -1, -1, -1, -1, -1, -1, -1,
             0,  0,  0,  0,  0,  0,  0,  0
        };

        return _simd_loadu_si((const SIMDSCALARI*)&maskGen[SIMD_WIDTH - numPrims]);
#endif
    }
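
    // Worked example for GenPrimMask() above (illustrative, not in the
    // original source): with SIMD_WIDTH = 8 and numPrims = 3, the unaligned
    // load starts at maskGen[5] and yields { -1, -1, -1, 0, 0, 0, 0, 0 }:
    // all-ones lanes for the 3 live prims, zero for the masked-off tail.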

    bool Assemble(uint32_t slot, simdvector verts[])
    {
        SWR_ASSERT(slot < m_numAttributes);

        uint32_t numPrimsToAssemble = PA_TESS::NumPrims();
        if (0 == numPrimsToAssemble)
        {
            return false;
        }

        SIMDSCALARI mask = GenPrimMask(numPrimsToAssemble);

        const float* pBaseAttrib = (const float*)&m_pVertexData[slot * m_attributeStrideInVectors * 4];
        for (uint32_t i = 0; i < m_numVertsPerPrim; ++i)
        {
#if USE_SIMD16_FRONTEND
            SIMDSCALARI indices = _simd16_load_si((const SIMDSCALARI*)m_ppIndices[i]);
#else
            SIMDSCALARI indices = _simd_load_si((const SIMDSCALARI*)m_ppIndices[i]);
#endif

            const float* pBase = pBaseAttrib;
            for (uint32_t c = 0; c < 4; ++c)
            {
#if USE_SIMD16_FRONTEND
                simd16scalar temp = _simd16_mask_i32gather_ps(
                    _simd16_setzero_ps(),
                    pBase,
                    indices,
                    mask,
                    4 /* gcc doesn't like sizeof(float) */);

                verts[i].v[c] = useAlternateOffset ? _simd16_extract_ps(temp, 1) : _simd16_extract_ps(temp, 0);
#else
                verts[i].v[c] = _simd_mask_i32gather_ps(
                    _simd_setzero_ps(),
                    pBase,
                    indices,
                    _simd_castsi_ps(mask),
                    4 /* gcc doesn't like sizeof(float) */);
#endif
                pBase += m_attributeStrideInVectors * SIMD_WIDTH;
            }
        }

        return true;
    }

#if ENABLE_AVX512_SIMD16
    bool Assemble_simd16(uint32_t slot, simd16vector verts[])
    {
        SWR_ASSERT(slot < m_numAttributes);

        uint32_t numPrimsToAssemble = PA_TESS::NumPrims();
        if (0 == numPrimsToAssemble)
        {
            return false;
        }

        SIMDSCALARI mask = GenPrimMask(numPrimsToAssemble);

        const float* pBaseAttrib = (const float*)&m_pVertexData[slot * m_attributeStrideInVectors * 4];
        for (uint32_t i = 0; i < m_numVertsPerPrim; ++i)
        {
#if USE_SIMD16_FRONTEND
            SIMDSCALARI indices = _simd16_load_si((const SIMDSCALARI*)m_ppIndices[i]);
#else
            SIMDSCALARI indices = _simd_load_si((const SIMDSCALARI*)m_ppIndices[i]);
#endif

            const float* pBase = pBaseAttrib;
            for (uint32_t c = 0; c < 4; ++c)
            {
#if USE_SIMD16_FRONTEND
                verts[i].v[c] = _simd16_mask_i32gather_ps(
                    _simd16_setzero_ps(),
                    pBase,
                    indices,
                    mask,
                    4 /* gcc doesn't like sizeof(float) */);
#else
                simdscalar temp = _simd_mask_i32gather_ps(
                    _simd_setzero_ps(),
                    pBase,
                    indices,
                    _simd_castsi_ps(mask),
                    4 /* gcc doesn't like sizeof(float) */);
                verts[i].v[c] = _simd16_insert_ps(_simd16_setzero_ps(), temp, 0);
#endif
                pBase += m_attributeStrideInVectors * SIMD_WIDTH;
            }
        }

        return true;
    }

#endif
    void AssembleSingle(uint32_t slot, uint32_t primIndex, __m128 verts[])
    {
        SWR_ASSERT(slot < m_numAttributes);
        SWR_ASSERT(primIndex < PA_TESS::NumPrims());

        const float* pVertDataBase = (const float*)&m_pVertexData[slot * m_attributeStrideInVectors * 4];
        for (uint32_t i = 0; i < m_numVertsPerPrim; ++i)
        {
#if USE_SIMD16_FRONTEND
            uint32_t index = useAlternateOffset ? m_ppIndices[i][primIndex + SIMD_WIDTH_DIV2] : m_ppIndices[i][primIndex];
#else
            uint32_t index = m_ppIndices[i][primIndex];
#endif
            const float* pVertData = pVertDataBase;
            float* pVert = (float*)&verts[i];

            for (uint32_t c = 0; c < 4; ++c)
            {
                pVert[c] = pVertData[index];
                pVertData += m_attributeStrideInVectors * SIMD_WIDTH;
            }
        }
    }

    bool NextPrim()
    {
        uint32_t numPrims = PA_TESS::NumPrims();
        m_numPrims -= numPrims;
        m_ppIndices[0] += numPrims;
        m_ppIndices[1] += numPrims;
        m_ppIndices[2] += numPrims;

        return HasWork();
    }

    SIMDVERTEX& GetNextVsOutput()
    {
        SWR_NOT_IMPL;
        return junkVertex;
    }

    bool GetNextStreamOutput()
    {
        SWR_NOT_IMPL;
        return false;
    }

    SIMDMASK& GetNextVsIndices()
    {
        SWR_NOT_IMPL;
        return junkIndices;
    }

    uint32_t NumPrims()
    {
        return std::min<uint32_t>(m_numPrims, SIMD_WIDTH);
    }

    void Reset()
    {
        SWR_NOT_IMPL;
    }

    SIMDSCALARI GetPrimID(uint32_t startID)
    {
#if USE_SIMD16_FRONTEND
        return _simd16_add_epi32(_simd16_set1_epi32(startID), m_vPrimId);
#else
        return _simd_add_epi32(_simd_set1_epi32(startID), m_vPrimId);
#endif
    }

private:
    const SIMDSCALAR* m_pVertexData = nullptr;
    uint32_t m_attributeStrideInVectors = 0;
    uint32_t m_numAttributes = 0;
    uint32_t m_numPrims = 0;
    uint32_t* m_ppIndices[3];

    uint32_t m_numVertsPerPrim = 0;

    SIMDSCALARI m_vPrimId;

    simdvector junkVector;          // junk simdvector for unimplemented API
#if ENABLE_AVX512_SIMD16
    simd16vector junkVector_simd16; // junk simd16vector for unimplemented API
#endif
    SIMDVERTEX junkVertex;          // junk SIMDVERTEX for unimplemented API
    SIMDMASK junkIndices;           // temporary index store for unused virtual function
};

// Primitive Assembler factory class, responsible for creating and initializing the correct assembler
// based on state.
template <typename IsIndexedT, typename IsCutIndexEnabledT>
struct PA_FACTORY
{
    PA_FACTORY(DRAW_CONTEXT* pDC, PRIMITIVE_TOPOLOGY in_topo, uint32_t numVerts, PA_STATE::SIMDVERTEX *pVertexStore, uint32_t vertexStoreSize, uint32_t vertexStride) : topo(in_topo)
    {
#if KNOB_ENABLE_CUT_AWARE_PA == TRUE
        const API_STATE& state = GetApiState(pDC);
        if ((IsIndexedT::value && IsCutIndexEnabledT::value && (
            topo == TOP_TRIANGLE_STRIP || topo == TOP_POINT_LIST ||
            topo == TOP_LINE_LIST || topo == TOP_LINE_STRIP ||
            topo == TOP_TRIANGLE_LIST)) ||

            // non-indexed draws with adjacency topologies must use cut-aware PA until we add support
            // for them in the optimized PA
            (topo == TOP_LINE_LIST_ADJ || topo == TOP_LISTSTRIP_ADJ || topo == TOP_TRI_LIST_ADJ || topo == TOP_TRI_STRIP_ADJ))
        {
            memset(&indexStore, 0, sizeof(indexStore));
            uint32_t numAttribs = state.feNumAttributes;

            new (&this->paCut) PA_STATE_CUT(pDC, reinterpret_cast<uint8_t *>(pVertexStore), vertexStoreSize * PA_STATE::SIMD_WIDTH,
                vertexStride, &this->indexStore[0], numVerts, numAttribs, state.topology, false);
            cutPA = true;
        }
        else
#endif
        {
            uint32_t numPrims = GetNumPrims(in_topo, numVerts);
            new (&this->paOpt) PA_STATE_OPT(pDC, numPrims, reinterpret_cast<uint8_t *>(pVertexStore), vertexStoreSize * PA_STATE::SIMD_WIDTH, vertexStride, false);
            cutPA = false;
        }
    }

    PA_STATE& GetPA()
    {
#if KNOB_ENABLE_CUT_AWARE_PA == TRUE
        if (cutPA)
        {
            return this->paCut;
        }
        else
#endif
        {
            return this->paOpt;
        }
    }

    PA_STATE_OPT paOpt;
    PA_STATE_CUT paCut;

    bool cutPA{ false };

    PRIMITIVE_TOPOLOGY topo{ TOP_UNKNOWN };

    PA_STATE::SIMDMASK indexStore[MAX_NUM_VERTS_PER_PRIM];
};
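
// Illustrative usage sketch (not part of the original header): how a frontend
// might instantiate the factory and drive whichever assembler it selects. The
// local names (vertexStore, STORE_SIZE) are hypothetical.
//
//     PA_STATE::SIMDVERTEX vertexStore[STORE_SIZE];
//     PA_FACTORY<IsIndexedT, IsCutIndexEnabledT> paFactory(
//         pDC, topo, numVerts, vertexStore, STORE_SIZE, vertexStride);
//     PA_STATE& pa = paFactory.GetPA();
//     while (pa.HasWork()) { /* fetch/VS, Assemble, NextPrim, ... */ }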