/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @file pa.h
*
* @brief Definitions for primitive assembly.
*        N primitives are assembled at a time, where N is the SIMD width.
*        A state machine, that is specific for a given topology, drives the
*        assembly of vertices into triangles.
*
******************************************************************************/
#pragma once

#include "frontend.h"

struct PA_STATE
{
#if USE_SIMD16_FRONTEND
    enum
    {
        SIMD_WIDTH      = KNOB_SIMD16_WIDTH,
        SIMD_WIDTH_DIV2 = KNOB_SIMD16_WIDTH / 2,
        SIMD_WIDTH_LOG2 = 4
    };

    typedef simd16mask   SIMDMASK;

    typedef simd16scalar SIMDSCALAR;
    typedef simd16vector SIMDVECTOR;
    typedef simd16vertex SIMDVERTEX;

    typedef simd16scalari SIMDSCALARI;

#else
    enum
    {
        SIMD_WIDTH      = KNOB_SIMD_WIDTH,
        SIMD_WIDTH_DIV2 = KNOB_SIMD_WIDTH / 2,
        SIMD_WIDTH_LOG2 = 3
    };

    typedef simdmask   SIMDMASK;

    typedef simdscalar SIMDSCALAR;
    typedef simdvector SIMDVECTOR;
    typedef simdvertex SIMDVERTEX;

    typedef simdscalari SIMDSCALARI;

#endif
    DRAW_CONTEXT *pDC{ nullptr };       // draw context
    uint8_t* pStreamBase{ nullptr };    // vertex stream
    uint32_t streamSizeInVerts{ 0 };    // total size of the input stream in verts
    uint32_t vertexStride{ 0 };         // stride of a vertex in simdvector units

    // The topology the binner will use. In some cases the FE changes the topology from the api state.
    PRIMITIVE_TOPOLOGY binTopology{ TOP_UNKNOWN };

#if ENABLE_AVX512_SIMD16
    bool useAlternateOffset{ false };
#endif

    bool viewportArrayActive{ false };
    bool rtArrayActive{ false };
    uint32_t numVertsPerPrim{ 0 };

    PA_STATE() {}
    PA_STATE(DRAW_CONTEXT *in_pDC, uint8_t* in_pStreamBase, uint32_t in_streamSizeInVerts, uint32_t in_vertexStride, uint32_t in_numVertsPerPrim) :
        pDC(in_pDC), pStreamBase(in_pStreamBase), streamSizeInVerts(in_streamSizeInVerts), vertexStride(in_vertexStride), numVertsPerPrim(in_numVertsPerPrim) {}

    virtual bool HasWork() = 0;
    virtual simdvector& GetSimdVector(uint32_t index, uint32_t slot) = 0;
#if ENABLE_AVX512_SIMD16
    virtual simd16vector& GetSimdVector_simd16(uint32_t index, uint32_t slot) = 0;
#endif
    virtual bool Assemble(uint32_t slot, simdvector verts[]) = 0;
#if ENABLE_AVX512_SIMD16
    virtual bool Assemble(uint32_t slot, simd16vector verts[]) = 0;
#endif
    virtual void AssembleSingle(uint32_t slot, uint32_t primIndex, simd4scalar verts[]) = 0;
    virtual bool NextPrim() = 0;
    virtual SIMDVERTEX& GetNextVsOutput() = 0;
    virtual bool GetNextStreamOutput() = 0;
    virtual SIMDMASK& GetNextVsIndices() = 0;
    virtual uint32_t NumPrims() = 0;
    virtual void Reset() = 0;
    virtual SIMDSCALARI GetPrimID(uint32_t startID) = 0;
};

// The Optimized PA is a state machine that assembles triangles from vertex shader simd
// output. Here is the sequence
//    1. Execute FS/VS to generate a simd vertex (4 vertices for SSE simd and 8 for AVX simd).
//    2. Execute PA function to assemble and bin triangles.
//        a. The PA function is a set of functions that collectively make up the
//           state machine for a given topology.
//               1. We use a state index to track which PA function to call.
//        b. Often the PA function needs 2 simd vertices in order to assemble the next triangle.
//               1. We call this the current and previous simd vertex.
//               2. The SSE simd is 4-wide which is not a multiple of 3 needed for triangles. In
//                  order to assemble the second triangle, for a triangle list, we'll need the
//                  last vertex from the previous simd and the first 2 vertices from the current simd.
//        3. At times the PA can assemble multiple triangles from the 2 simd vertices.
//
// This optimized PA is not cut aware, so it should only be used for non-indexed draws or draws
// without cuts.
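//
// Illustrative sketch (not compiled, and not part of this header): the rough
// shape of the front-end loop that drives a PA. ExecuteVS and BinPrims are
// hypothetical placeholders; VERTEX_POSITION_SLOT and MAX_NUM_VERTS_PER_PRIM
// come from the surrounding SWR headers.
#if 0
static void DrivePaSketch(PA_STATE& pa)
{
    while (pa.HasWork())
    {
        ExecuteVS(pa.GetNextVsOutput());    // produce the next simd of VS output in-place

        bool morePrims;
        do
        {
            simdvector prim[MAX_NUM_VERTS_PER_PRIM];
            if (pa.Assemble(VERTEX_POSITION_SLOT, prim))    // gather one simd of assembled prims
            {
                BinPrims(pa, prim);         // hand the assembled prims to the binner
            }
            morePrims = pa.NextPrim();      // advance the topology state machine
        } while (morePrims);
    }
}
#endif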
struct PA_STATE_OPT : public PA_STATE
{
    uint32_t numPrims{ 0 };             // Total number of primitives for draw.
    uint32_t numPrimsComplete{ 0 };     // Total number of complete primitives.

    uint32_t numSimdPrims{ 0 };         // Number of prims in current simd.

    uint32_t cur{ 0 };                  // index to current VS output.
    uint32_t prev{ 0 };                 // index to prev VS output. Not really needed in the state.
    const uint32_t first{ 0 };          // index to first VS output. Used for tri fan and line loop.

    uint32_t counter{ 0 };              // state counter
    bool reset{ false };                // reset state

    uint32_t primIDIncr{ 0 };           // how much to increment for each vector (typically vector / {1, 2})
    SIMDSCALARI primID;

    typedef bool(*PFN_PA_FUNC)(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
#if ENABLE_AVX512_SIMD16
    typedef bool(*PFN_PA_FUNC_SIMD16)(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
#endif
    typedef void(*PFN_PA_SINGLE_FUNC)(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]);

    PFN_PA_FUNC pfnPaFunc{ nullptr };               // PA state machine function for assembling 4 triangles.
#if ENABLE_AVX512_SIMD16
    PFN_PA_FUNC_SIMD16 pfnPaFunc_simd16{ nullptr };
#endif
    PFN_PA_SINGLE_FUNC pfnPaSingleFunc{ nullptr };  // PA state machine function for assembling single triangle.
    PFN_PA_FUNC pfnPaFuncReset{ nullptr };          // initial state to set on reset
#if ENABLE_AVX512_SIMD16
    PFN_PA_FUNC_SIMD16 pfnPaFuncReset_simd16{ nullptr };
#endif

    // state used to advance the PA when Next is called
    PFN_PA_FUNC pfnPaNextFunc{ nullptr };
#if ENABLE_AVX512_SIMD16
    PFN_PA_FUNC_SIMD16 pfnPaNextFunc_simd16{ nullptr };
#endif
    uint32_t nextNumSimdPrims{ 0 };
    uint32_t nextNumPrimsIncrement{ 0 };
    bool nextReset{ false };
    bool isStreaming{ false };

    SIMDMASK junkIndices{ 0 };          // temporary index store for unused virtual function

    PA_STATE_OPT() {}
    PA_STATE_OPT(DRAW_CONTEXT* pDC, uint32_t numPrims, uint8_t* pStream, uint32_t streamSizeInVerts,
        uint32_t vertexStride, bool in_isStreaming, uint32_t numVertsPerPrim, PRIMITIVE_TOPOLOGY topo = TOP_UNKNOWN);

    bool HasWork()
    {
        return this->numPrimsComplete < this->numPrims;
    }

    simdvector& GetSimdVector(uint32_t index, uint32_t slot)
    {
        SWR_ASSERT(slot < vertexStride);
        uint32_t offset = index * vertexStride + slot;
        simdvector& vertexSlot = ((simdvector*)pStreamBase)[offset];
        return vertexSlot;
    }

#if ENABLE_AVX512_SIMD16
    simd16vector& GetSimdVector_simd16(uint32_t index, uint32_t slot)
    {
        SWR_ASSERT(slot < vertexStride);
        uint32_t offset = index * vertexStride + slot;
        simd16vector& vertexSlot = ((simd16vector*)pStreamBase)[offset];
        return vertexSlot;
    }

#endif
    // Assembles a simd's worth of triangles. Each simdvector in verts[] holds one vertex
    // slot of all the triangles in SoA form (xxxx yyyy zzzz wwww); there are 3 vertex
    // slots per triangle.
    bool Assemble(uint32_t slot, simdvector verts[])
    {
        return this->pfnPaFunc(*this, slot, verts);
    }
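    // For example (illustrative): after a successful Assemble, lane i of
    // verts[0..2] holds triangle i; verts[v].v[0] is the x components of
    // vertex slot v across all assembled triangles, verts[v].v[1] the y
    // components, and so on.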

#if ENABLE_AVX512_SIMD16
    bool Assemble(uint32_t slot, simd16vector verts[])
    {
        return this->pfnPaFunc_simd16(*this, slot, verts);
    }

#endif
    // Assembles 1 primitive. Each simd4scalar is a vertex (xyzw).
    void AssembleSingle(uint32_t slot, uint32_t primIndex, simd4scalar verts[])
    {
        this->pfnPaSingleFunc(*this, slot, primIndex, verts);
    }

    bool NextPrim()
    {
        this->pfnPaFunc = this->pfnPaNextFunc;
#if ENABLE_AVX512_SIMD16
        this->pfnPaFunc_simd16 = this->pfnPaNextFunc_simd16;
#endif
        this->numSimdPrims = this->nextNumSimdPrims;
        this->numPrimsComplete += this->nextNumPrimsIncrement;
        this->reset = this->nextReset;

        if (this->isStreaming)
        {
            this->reset = false;
        }

        bool morePrims = false;

        if (this->numSimdPrims > 0)
        {
            morePrims = true;
            this->numSimdPrims--;
        }
        else
        {
            this->counter = (this->reset) ? 0 : (this->counter + 1);
            this->reset = false;
        }

        if (!HasWork())
        {
            morePrims = false;    // no more to do
        }

        return morePrims;
    }

    SIMDVERTEX& GetNextVsOutput()
    {
        const uint32_t numSimdVerts = streamSizeInVerts / SIMD_WIDTH;

        // increment cur and prev indices
        if (counter < numSimdVerts)
        {
            // prev undefined for first state
            prev = cur;
            cur = counter;
        }
        else
        {
            // swap/recycle last two simd verts for prev and cur, leave other simd verts intact in the buffer
            uint32_t temp = prev;

            prev = cur;
            cur = temp;
        }

        SWR_ASSERT(cur < numSimdVerts);
        SIMDVECTOR* pVertex = &((SIMDVECTOR*)pStreamBase)[cur * vertexStride];

        return *(SIMDVERTEX*)pVertex;
    }

    SIMDMASK& GetNextVsIndices()
    {
        // unused in optimized PA, pass tmp buffer back
        return junkIndices;
    }

    bool GetNextStreamOutput()
    {
        this->prev = this->cur;
        this->cur = this->counter;

        return HasWork();
    }

    uint32_t NumPrims()
    {
        return (this->numPrimsComplete + this->nextNumPrimsIncrement > this->numPrims) ?
            (SIMD_WIDTH - (this->numPrimsComplete + this->nextNumPrimsIncrement - this->numPrims)) : SIMD_WIDTH;
    }
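
    // Worked example (illustrative, 8-wide, assuming nextNumPrimsIncrement == SIMD_WIDTH):
    // with numPrims == 10, the first batch reports 8 prims and the second reports
    // 8 - (16 - 10) == 2.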

    void SetNextState(PA_STATE_OPT::PFN_PA_FUNC pfnPaNextFunc,
                      PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc,
                      uint32_t numSimdPrims = 0,
                      uint32_t numPrimsIncrement = 0,
                      bool reset = false)
    {
        this->pfnPaNextFunc = pfnPaNextFunc;
        this->nextNumSimdPrims = numSimdPrims;
        this->nextNumPrimsIncrement = numPrimsIncrement;
        this->nextReset = reset;

        this->pfnPaSingleFunc = pfnPaNextSingleFunc;
    }

#if ENABLE_AVX512_SIMD16
    void SetNextState_simd16(PA_STATE_OPT::PFN_PA_FUNC_SIMD16 pfnPaNextFunc_simd16,
                             PA_STATE_OPT::PFN_PA_FUNC pfnPaNextFunc,
                             PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc,
                             uint32_t numSimdPrims = 0,
                             uint32_t numPrimsIncrement = 0,
                             bool reset = false)
    {
        this->pfnPaNextFunc_simd16 = pfnPaNextFunc_simd16;
        this->pfnPaNextFunc = pfnPaNextFunc;
        this->nextNumSimdPrims = numSimdPrims;
        this->nextNumPrimsIncrement = numPrimsIncrement;
        this->nextReset = reset;

        this->pfnPaSingleFunc = pfnPaNextSingleFunc;
    }

#endif
    void Reset()
    {
#if ENABLE_AVX512_SIMD16
        useAlternateOffset = false;

#endif
        this->pfnPaFunc = this->pfnPaFuncReset;
#if ENABLE_AVX512_SIMD16
        this->pfnPaFunc_simd16 = this->pfnPaFuncReset_simd16;
#endif
        this->numPrimsComplete = 0;
        this->numSimdPrims = 0;
        this->cur = 0;
        this->prev = 0;
        this->counter = 0;
        this->reset = false;
    }

    SIMDSCALARI GetPrimID(uint32_t startID)
    {
#if USE_SIMD16_FRONTEND
        return _simd16_add_epi32(this->primID,
            _simd16_set1_epi32(startID + this->primIDIncr * (this->numPrimsComplete / SIMD_WIDTH)));
#else
        return _simd_add_epi32(this->primID,
            _simd_set1_epi32(startID + this->primIDIncr * (this->numPrimsComplete / SIMD_WIDTH)));
#endif
    }
};

// helper C wrappers to avoid having to rewrite all the PA topology state functions
INLINE void SetNextPaState(PA_STATE_OPT& pa, PA_STATE_OPT::PFN_PA_FUNC pfnPaNextFunc,
                           PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc,
                           uint32_t numSimdPrims = 0,
                           uint32_t numPrimsIncrement = 0,
                           bool reset = false)
{
    return pa.SetNextState(pfnPaNextFunc, pfnPaNextSingleFunc, numSimdPrims, numPrimsIncrement, reset);
}

#if ENABLE_AVX512_SIMD16
INLINE void SetNextPaState_simd16(PA_STATE_OPT& pa, PA_STATE_OPT::PFN_PA_FUNC_SIMD16 pfnPaNextFunc_simd16,
                                  PA_STATE_OPT::PFN_PA_FUNC pfnPaNextFunc,
                                  PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc,
                                  uint32_t numSimdPrims = 0,
                                  uint32_t numPrimsIncrement = 0,
                                  bool reset = false)
{
    return pa.SetNextState_simd16(pfnPaNextFunc_simd16, pfnPaNextFunc, pfnPaNextSingleFunc, numSimdPrims, numPrimsIncrement, reset);
}

#endif
INLINE simdvector& PaGetSimdVector(PA_STATE& pa, uint32_t index, uint32_t slot)
{
    return pa.GetSimdVector(index, slot);
}

#if ENABLE_AVX512_SIMD16
INLINE simd16vector& PaGetSimdVector_simd16(PA_STATE& pa, uint32_t index, uint32_t slot)
{
    return pa.GetSimdVector_simd16(index, slot);
}

#endif
// Cut-aware primitive assembler.
struct PA_STATE_CUT : public PA_STATE
{
    SIMDMASK* pCutIndices{ nullptr };   // cut indices buffer, 1 bit per vertex
    uint32_t numVerts{ 0 };             // number of vertices available in buffer store
    uint32_t numAttribs{ 0 };           // number of attributes
    int32_t numRemainingVerts{ 0 };     // number of verts remaining to be assembled
    uint32_t numVertsToAssemble{ 0 };   // total number of verts to assemble for the draw
#if ENABLE_AVX512_SIMD16
    OSALIGNSIMD16(uint32_t) indices[MAX_NUM_VERTS_PER_PRIM][SIMD_WIDTH];    // current index buffer for gather
#else
    OSALIGNSIMD(uint32_t) indices[MAX_NUM_VERTS_PER_PRIM][SIMD_WIDTH];      // current index buffer for gather
#endif
    SIMDSCALARI vOffsets[MAX_NUM_VERTS_PER_PRIM];   // byte offsets for currently assembling simd
    uint32_t numPrimsAssembled{ 0 };    // number of primitives that are fully assembled
    uint32_t headVertex{ 0 };           // current unused vertex slot in vertex buffer store
    uint32_t tailVertex{ 0 };           // beginning vertex currently assembling
    uint32_t curVertex{ 0 };            // current unprocessed vertex
    uint32_t startPrimId{ 0 };          // starting prim id
    SIMDSCALARI vPrimId;                // vector of prim ID
    bool needOffsets{ false };          // need to compute gather offsets for current SIMD
    uint32_t vertsPerPrim{ 0 };
    bool processCutVerts{ false };      // if true, vertex indices with cuts are processed as normal;
                                        // otherwise they are ignored. The fetch shader sends invalid
                                        // verts on cuts that should be ignored, while the GS sends
                                        // valid verts for every index.

    simdvector junkVector;              // junk simdvector for unimplemented API
#if ENABLE_AVX512_SIMD16
    simd16vector junkVector_simd16;     // junk simd16vector for unimplemented API
#endif

    // Topology state tracking
    uint32_t vert[MAX_NUM_VERTS_PER_PRIM];
    uint32_t curIndex{ 0 };
    bool reverseWinding{ false };       // indicates reverse winding for strips
    int32_t adjExtraVert{ 0 };          // extra vert used for tristrip w/ adj

    typedef void(PA_STATE_CUT::* PFN_PA_FUNC)(uint32_t vert, bool finish);
    PFN_PA_FUNC pfnPa{ nullptr };       // per-topology function that processes a single vert

    PA_STATE_CUT() {}
    PA_STATE_CUT(DRAW_CONTEXT* pDC, uint8_t* in_pStream, uint32_t in_streamSizeInVerts, uint32_t in_vertexStride, SIMDMASK* in_pIndices, uint32_t in_numVerts,
                 uint32_t in_numAttribs, PRIMITIVE_TOPOLOGY topo, bool in_processCutVerts, uint32_t in_numVertsPerPrim)
        : PA_STATE(pDC, in_pStream, in_streamSizeInVerts, in_vertexStride, in_numVertsPerPrim)
    {
        numVerts = in_streamSizeInVerts;
        numAttribs = in_numAttribs;
        binTopology = topo;
        needOffsets = false;
        processCutVerts = in_processCutVerts;

        numVertsToAssemble = numRemainingVerts = in_numVerts;
        numPrimsAssembled = 0;
        headVertex = tailVertex = curVertex = 0;

        curIndex = 0;
        pCutIndices = in_pIndices;
        memset(indices, 0, sizeof(indices));
#if USE_SIMD16_FRONTEND
        vPrimId = _simd16_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
#else
        vPrimId = _simd_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
#endif
        reverseWinding = false;
        adjExtraVert = -1;

        bool gsEnabled = pDC->pState->state.gsState.gsEnable;
        vertsPerPrim = NumVertsPerPrim(topo, gsEnabled);

        switch (topo)
        {
        case TOP_TRIANGLE_LIST:     pfnPa = &PA_STATE_CUT::ProcessVertTriList; break;
        case TOP_TRI_LIST_ADJ:      pfnPa = gsEnabled ? &PA_STATE_CUT::ProcessVertTriListAdj : &PA_STATE_CUT::ProcessVertTriListAdjNoGs; break;
        case TOP_TRIANGLE_STRIP:    pfnPa = &PA_STATE_CUT::ProcessVertTriStrip; break;
        case TOP_TRI_STRIP_ADJ:     if (gsEnabled)
                                    {
                                        pfnPa = &PA_STATE_CUT::ProcessVertTriStripAdj<true>;
                                    }
                                    else
                                    {
                                        pfnPa = &PA_STATE_CUT::ProcessVertTriStripAdj<false>;
                                    }
                                    break;

        case TOP_POINT_LIST:        pfnPa = &PA_STATE_CUT::ProcessVertPointList; break;
        case TOP_LINE_LIST:         pfnPa = &PA_STATE_CUT::ProcessVertLineList; break;
        case TOP_LINE_LIST_ADJ:     pfnPa = gsEnabled ? &PA_STATE_CUT::ProcessVertLineListAdj : &PA_STATE_CUT::ProcessVertLineListAdjNoGs; break;
        case TOP_LINE_STRIP:        pfnPa = &PA_STATE_CUT::ProcessVertLineStrip; break;
        case TOP_LISTSTRIP_ADJ:     pfnPa = gsEnabled ? &PA_STATE_CUT::ProcessVertLineStripAdj : &PA_STATE_CUT::ProcessVertLineStripAdjNoGs; break;
        default: assert(0 && "Unimplemented topology");
        }
    }

    SIMDVERTEX& GetNextVsOutput()
    {
        uint32_t vertexIndex = this->headVertex / SIMD_WIDTH;
        this->headVertex = (this->headVertex + SIMD_WIDTH) % this->numVerts;
        this->needOffsets = true;
        SIMDVECTOR* pVertex = &((SIMDVECTOR*)pStreamBase)[vertexIndex * vertexStride];

        return *(SIMDVERTEX*)pVertex;
    }

    SIMDMASK& GetNextVsIndices()
    {
        uint32_t vertexIndex = this->headVertex / SIMD_WIDTH;
        SIMDMASK* pCurCutIndex = this->pCutIndices + vertexIndex;
        return *pCurCutIndex;
    }

    simdvector& GetSimdVector(uint32_t index, uint32_t slot)
    {
        // unused
        SWR_ASSERT(0 && "Not implemented");
        return junkVector;
    }

#if ENABLE_AVX512_SIMD16
    simd16vector& GetSimdVector_simd16(uint32_t index, uint32_t slot)
    {
        // unused
        SWR_ASSERT(0 && "Not implemented");
        return junkVector_simd16;
    }

#endif
    bool GetNextStreamOutput()
    {
        this->headVertex += SIMD_WIDTH;
        this->needOffsets = true;
        return HasWork();
    }

    SIMDSCALARI GetPrimID(uint32_t startID)
    {
#if USE_SIMD16_FRONTEND
        return _simd16_add_epi32(_simd16_set1_epi32(startID), this->vPrimId);
#else
        return _simd_add_epi32(_simd_set1_epi32(startID), this->vPrimId);
#endif
    }

    void Reset()
    {
#if ENABLE_AVX512_SIMD16
        useAlternateOffset = false;

#endif
        this->numRemainingVerts = this->numVertsToAssemble;
        this->numPrimsAssembled = 0;
        this->curIndex = 0;
        this->curVertex = 0;
        this->tailVertex = 0;
        this->headVertex = 0;
        this->reverseWinding = false;
        this->adjExtraVert = -1;
#if USE_SIMD16_FRONTEND
        this->vPrimId = _simd16_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
#else
        this->vPrimId = _simd_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
#endif
    }

    bool HasWork()
    {
        return this->numRemainingVerts > 0 || this->adjExtraVert != -1;
    }

    bool IsVertexStoreFull()
    {
        return ((this->headVertex + SIMD_WIDTH) % this->numVerts) == this->tailVertex;
    }

    void RestartTopology()
    {
        this->curIndex = 0;
        this->reverseWinding = false;
        this->adjExtraVert = -1;
    }

    bool IsCutIndex(uint32_t vertex)
    {
        uint32_t vertexIndex = vertex / SIMD_WIDTH;
        uint32_t vertexOffset = vertex & (SIMD_WIDTH - 1);
        return CheckBit(this->pCutIndices[vertexIndex], vertexOffset);
    }
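
    // For example (illustrative, 8-wide): vertex 19 tests bit 19 & 7 == 3 of
    // pCutIndices[19 / 8 == 2].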

    // iterates across the unprocessed verts until we hit the end or we
    // have assembled SIMD prims
    void ProcessVerts()
    {
        while (this->numPrimsAssembled != SIMD_WIDTH &&
               this->numRemainingVerts > 0 &&
               this->curVertex != this->headVertex)
        {
            // if cut index, restart topology
            if (IsCutIndex(this->curVertex))
            {
                if (this->processCutVerts)
                {
                    (this->*pfnPa)(this->curVertex, false);
                }
                // finish off tri strip w/ adj before restarting topo
                if (this->adjExtraVert != -1)
                {
                    (this->*pfnPa)(this->curVertex, true);
                }
                RestartTopology();
            }
            else
            {
                (this->*pfnPa)(this->curVertex, false);
            }

            this->curVertex++;
            if (this->curVertex >= this->numVerts)
            {
                this->curVertex = 0;
            }
            this->numRemainingVerts--;
        }

        // special case last primitive for tri strip w/ adj
        if (this->numPrimsAssembled != SIMD_WIDTH && this->numRemainingVerts == 0 && this->adjExtraVert != -1)
        {
            (this->*pfnPa)(this->curVertex, true);
        }
    }

    void Advance()
    {
        // done with current batch
        // advance tail to the current unsubmitted vertex
        this->tailVertex = this->curVertex;
        this->numPrimsAssembled = 0;
#if USE_SIMD16_FRONTEND
        this->vPrimId = _simd16_add_epi32(vPrimId, _simd16_set1_epi32(SIMD_WIDTH));
#else
        this->vPrimId = _simd_add_epi32(vPrimId, _simd_set1_epi32(SIMD_WIDTH));
#endif
    }

    bool NextPrim()
    {
        // if we've assembled enough prims, we can advance to the next set of verts
        if (this->numPrimsAssembled == SIMD_WIDTH || this->numRemainingVerts <= 0)
        {
            Advance();
        }
        return false;
    }

    void ComputeOffsets()
    {
        for (uint32_t v = 0; v < this->vertsPerPrim; ++v)
        {
            uint32_t vertexStrideBytes = vertexStride * sizeof(SIMDVECTOR);
            SIMDSCALARI vIndices = *(SIMDSCALARI*)&this->indices[v][0];

            // step to simdvertex batch
            const uint32_t simdShift = SIMD_WIDTH_LOG2;
#if USE_SIMD16_FRONTEND
            SIMDSCALARI vVertexBatch = _simd16_srai_epi32(vIndices, simdShift);
            this->vOffsets[v] = _simd16_mullo_epi32(vVertexBatch, _simd16_set1_epi32(vertexStrideBytes));
#else
            SIMDSCALARI vVertexBatch = _simd_srai_epi32(vIndices, simdShift);
            this->vOffsets[v] = _simd_mullo_epi32(vVertexBatch, _simd_set1_epi32(vertexStrideBytes));
#endif

            // step to index
            const uint32_t simdMask = SIMD_WIDTH - 1;
#if USE_SIMD16_FRONTEND
            SIMDSCALARI vVertexIndex = _simd16_and_si(vIndices, _simd16_set1_epi32(simdMask));
            this->vOffsets[v] = _simd16_add_epi32(this->vOffsets[v], _simd16_mullo_epi32(vVertexIndex, _simd16_set1_epi32(sizeof(float))));
#else
            SIMDSCALARI vVertexIndex = _simd_and_si(vIndices, _simd_set1_epi32(simdMask));
            this->vOffsets[v] = _simd_add_epi32(this->vOffsets[v], _simd_mullo_epi32(vVertexIndex, _simd_set1_epi32(sizeof(float))));
#endif
        }
    }
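
    // Worked example (illustrative, 8-wide): for vertex index 10, the batch is
    // 10 >> 3 == 1 and the lane is 10 & 7 == 2, giving a byte offset of
    // 1 * vertexStride * sizeof(SIMDVECTOR) + 2 * sizeof(float); Assemble later
    // adds slot * sizeof(SIMDVECTOR) and steps SIMD_WIDTH floats per component.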

    bool Assemble(uint32_t slot, simdvector *verts)
    {
        // process any outstanding verts
        ProcessVerts();

        // return false if we don't have enough prims assembled
        if (this->numPrimsAssembled != SIMD_WIDTH && this->numRemainingVerts > 0)
        {
            return false;
        }

        // cache off gather offsets given the current SIMD set of indices the first time we get an assemble
        if (this->needOffsets)
        {
            ComputeOffsets();
            this->needOffsets = false;
        }

        for (uint32_t v = 0; v < this->vertsPerPrim; ++v)
        {
            SIMDSCALARI offsets = this->vOffsets[v];

            // step to attribute
#if USE_SIMD16_FRONTEND
            offsets = _simd16_add_epi32(offsets, _simd16_set1_epi32(slot * sizeof(SIMDVECTOR)));
#else
            offsets = _simd_add_epi32(offsets, _simd_set1_epi32(slot * sizeof(SIMDVECTOR)));
#endif

            float* pBase = (float*)this->pStreamBase;
            for (uint32_t c = 0; c < 4; ++c)
            {
#if USE_SIMD16_FRONTEND
                simd16scalar temp = _simd16_i32gather_ps(pBase, offsets, 1);

                // Assigning to a temporary first to avoid an MSVC 2017 compiler bug
                simdscalar t = useAlternateOffset ? _simd16_extract_ps(temp, 1) : _simd16_extract_ps(temp, 0);
                verts[v].v[c] = t;
#else
                verts[v].v[c] = _simd_i32gather_ps(pBase, offsets, 1);
#endif

                // move base to next component
                pBase += SIMD_WIDTH;
            }
        }

        return true;
    }

#if ENABLE_AVX512_SIMD16
    bool Assemble(uint32_t slot, simd16vector verts[])
    {
        // process any outstanding verts
        ProcessVerts();

        // return false if we don't have enough prims assembled
        if (this->numPrimsAssembled != SIMD_WIDTH && this->numRemainingVerts > 0)
        {
            return false;
        }

        // cache off gather offsets given the current SIMD set of indices the first time we get an assemble
        if (this->needOffsets)
        {
            ComputeOffsets();
            this->needOffsets = false;
        }

        for (uint32_t v = 0; v < this->vertsPerPrim; ++v)
        {
            SIMDSCALARI offsets = this->vOffsets[v];

            // step to attribute
#if USE_SIMD16_FRONTEND
            offsets = _simd16_add_epi32(offsets, _simd16_set1_epi32(slot * sizeof(SIMDVECTOR)));
#else
            offsets = _simd_add_epi32(offsets, _simd_set1_epi32(slot * sizeof(simdvector)));
#endif

            float* pBase = (float*)this->pStreamBase;
            for (uint32_t c = 0; c < 4; ++c)
            {
#if USE_SIMD16_FRONTEND
                verts[v].v[c] = _simd16_i32gather_ps(pBase, offsets, 1);
#else
                verts[v].v[c] = _simd16_insert_ps(_simd16_setzero_ps(), _simd_i32gather_ps(pBase, offsets, 1), 0);
#endif

                // move base to next component
                pBase += SIMD_WIDTH;
            }
        }

        return true;
    }

#endif
    void AssembleSingle(uint32_t slot, uint32_t triIndex, simd4scalar tri[3])
    {
        // move to slot
        for (uint32_t v = 0; v < this->vertsPerPrim; ++v)
        {
            uint32_t* pOffset = (uint32_t*)&this->vOffsets[v];
#if USE_SIMD16_FRONTEND
            uint32_t offset = useAlternateOffset ? pOffset[triIndex + SIMD_WIDTH_DIV2] : pOffset[triIndex];
#else
            uint32_t offset = pOffset[triIndex];
#endif
            offset += sizeof(SIMDVECTOR) * slot;
            float* pVert = (float*)&tri[v];
            for (uint32_t c = 0; c < 4; ++c)
            {
                float* pComponent = (float*)(this->pStreamBase + offset);
                pVert[c] = *pComponent;
                offset += SIMD_WIDTH * sizeof(float);
            }
        }
    }

    uint32_t NumPrims()
    {
        return this->numPrimsAssembled;
    }

    // Per-topology functions
    void ProcessVertTriStrip(uint32_t index, bool finish)
    {
        this->vert[this->curIndex] = index;
        this->curIndex++;
        if (this->curIndex == 3)
        {
            // assembled enough verts for prim, add to gather indices
            this->indices[0][this->numPrimsAssembled] = this->vert[0];
            if (reverseWinding)
            {
                this->indices[1][this->numPrimsAssembled] = this->vert[2];
                this->indices[2][this->numPrimsAssembled] = this->vert[1];
            }
            else
            {
                this->indices[1][this->numPrimsAssembled] = this->vert[1];
                this->indices[2][this->numPrimsAssembled] = this->vert[2];
            }

            // increment numPrimsAssembled
            this->numPrimsAssembled++;

            // set up next prim state
            this->vert[0] = this->vert[1];
            this->vert[1] = this->vert[2];
            this->curIndex = 2;
            this->reverseWinding ^= 1;
        }
    }
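
    // For example (illustrative): a strip v0 v1 v2 v3 emits (v0, v1, v2) and then
    // (v1, v3, v2); the second triangle stores verts 1 and 2 swapped so the
    // winding order of the strip is preserved.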

    template<bool gsEnabled>
    void AssembleTriStripAdj()
    {
        if (!gsEnabled)
        {
            this->vert[1] = this->vert[2];
            this->vert[2] = this->vert[4];

            this->indices[0][this->numPrimsAssembled] = this->vert[0];
            this->indices[1][this->numPrimsAssembled] = this->vert[1];
            this->indices[2][this->numPrimsAssembled] = this->vert[2];

            this->vert[4] = this->vert[2];
            this->vert[2] = this->vert[1];
        }
        else
        {
            this->indices[0][this->numPrimsAssembled] = this->vert[0];
            this->indices[1][this->numPrimsAssembled] = this->vert[1];
            this->indices[2][this->numPrimsAssembled] = this->vert[2];
            this->indices[3][this->numPrimsAssembled] = this->vert[3];
            this->indices[4][this->numPrimsAssembled] = this->vert[4];
            this->indices[5][this->numPrimsAssembled] = this->vert[5];
        }
        this->numPrimsAssembled++;
    }

    template<bool gsEnabled>
    void ProcessVertTriStripAdj(uint32_t index, bool finish)
    {
        // handle last primitive of tristrip
        if (finish && this->adjExtraVert != -1)
        {
            this->vert[3] = this->adjExtraVert;
            AssembleTriStripAdj<gsEnabled>();
            this->adjExtraVert = -1;
            return;
        }

        switch (this->curIndex)
        {
        case 0:
        case 1:
        case 2:
        case 4:
            this->vert[this->curIndex] = index;
            this->curIndex++;
            break;
        case 3:
            this->vert[5] = index;
            this->curIndex++;
            break;
        case 5:
            if (this->adjExtraVert == -1)
            {
                this->adjExtraVert = index;
            }
            else
            {
                this->vert[3] = index;
                if (!gsEnabled)
                {
                    AssembleTriStripAdj<gsEnabled>();

                    uint32_t nextTri[6];
                    if (this->reverseWinding)
                    {
                        nextTri[0] = this->vert[4];
                        nextTri[1] = this->vert[0];
                        nextTri[2] = this->vert[2];
                        nextTri[4] = this->vert[3];
                        nextTri[5] = this->adjExtraVert;
                    }
                    else
                    {
                        nextTri[0] = this->vert[2];
                        nextTri[1] = this->adjExtraVert;
                        nextTri[2] = this->vert[3];
                        nextTri[4] = this->vert[4];
                        nextTri[5] = this->vert[0];
                    }
                    for (uint32_t i = 0; i < 6; ++i)
                    {
                        this->vert[i] = nextTri[i];
                    }

                    this->adjExtraVert = -1;
                    this->reverseWinding ^= 1;
                }
                else
                {
                    this->curIndex++;
                }
            }
            break;
        case 6:
            SWR_ASSERT(this->adjExtraVert != -1, "Algorithm failure!");
            AssembleTriStripAdj<gsEnabled>();

            uint32_t nextTri[6];
            if (this->reverseWinding)
            {
                nextTri[0] = this->vert[4];
                nextTri[1] = this->vert[0];
                nextTri[2] = this->vert[2];
                nextTri[4] = this->vert[3];
                nextTri[5] = this->adjExtraVert;
            }
            else
            {
                nextTri[0] = this->vert[2];
                nextTri[1] = this->adjExtraVert;
                nextTri[2] = this->vert[3];
                nextTri[4] = this->vert[4];
                nextTri[5] = this->vert[0];
            }
            for (uint32_t i = 0; i < 6; ++i)
            {
                this->vert[i] = nextTri[i];
            }
            this->reverseWinding ^= 1;
            this->adjExtraVert = index;
            this->curIndex--;
            break;
        }
    }

    void ProcessVertTriList(uint32_t index, bool finish)
    {
        this->vert[this->curIndex] = index;
        this->curIndex++;
        if (this->curIndex == 3)
        {
            // assembled enough verts for prim, add to gather indices
            this->indices[0][this->numPrimsAssembled] = this->vert[0];
            this->indices[1][this->numPrimsAssembled] = this->vert[1];
            this->indices[2][this->numPrimsAssembled] = this->vert[2];

            // increment numPrimsAssembled
            this->numPrimsAssembled++;

            // set up next prim state
            this->curIndex = 0;
        }
    }

    void ProcessVertTriListAdj(uint32_t index, bool finish)
    {
        this->vert[this->curIndex] = index;
        this->curIndex++;
        if (this->curIndex == 6)
        {
            // assembled enough verts for prim, add to gather indices
            this->indices[0][this->numPrimsAssembled] = this->vert[0];
            this->indices[1][this->numPrimsAssembled] = this->vert[1];
            this->indices[2][this->numPrimsAssembled] = this->vert[2];
            this->indices[3][this->numPrimsAssembled] = this->vert[3];
            this->indices[4][this->numPrimsAssembled] = this->vert[4];
            this->indices[5][this->numPrimsAssembled] = this->vert[5];

            // increment numPrimsAssembled
            this->numPrimsAssembled++;

            // set up next prim state
            this->curIndex = 0;
        }
    }

    void ProcessVertTriListAdjNoGs(uint32_t index, bool finish)
    {
        this->vert[this->curIndex] = index;
        this->curIndex++;
        if (this->curIndex == 6)
        {
            // assembled enough verts for prim, add to gather indices
            this->indices[0][this->numPrimsAssembled] = this->vert[0];
            this->indices[1][this->numPrimsAssembled] = this->vert[2];
            this->indices[2][this->numPrimsAssembled] = this->vert[4];

            // increment numPrimsAssembled
            this->numPrimsAssembled++;

            // set up next prim state
            this->curIndex = 0;
        }
    }

    void ProcessVertLineList(uint32_t index, bool finish)
    {
        this->vert[this->curIndex] = index;
        this->curIndex++;
        if (this->curIndex == 2)
        {
            this->indices[0][this->numPrimsAssembled] = this->vert[0];
            this->indices[1][this->numPrimsAssembled] = this->vert[1];

            this->numPrimsAssembled++;
            this->curIndex = 0;
        }
    }

    void ProcessVertLineStrip(uint32_t index, bool finish)
    {
        this->vert[this->curIndex] = index;
        this->curIndex++;
        if (this->curIndex == 2)
        {
            // assembled enough verts for prim, add to gather indices
            this->indices[0][this->numPrimsAssembled] = this->vert[0];
            this->indices[1][this->numPrimsAssembled] = this->vert[1];

            // increment numPrimsAssembled
            this->numPrimsAssembled++;

            // set up next prim state
            this->vert[0] = this->vert[1];
            this->curIndex = 1;
        }
    }

    void ProcessVertLineStripAdj(uint32_t index, bool finish)
    {
        this->vert[this->curIndex] = index;
        this->curIndex++;
        if (this->curIndex == 4)
        {
            // assembled enough verts for prim, add to gather indices
            this->indices[0][this->numPrimsAssembled] = this->vert[0];
            this->indices[1][this->numPrimsAssembled] = this->vert[1];
            this->indices[2][this->numPrimsAssembled] = this->vert[2];
            this->indices[3][this->numPrimsAssembled] = this->vert[3];

            // increment numPrimsAssembled
            this->numPrimsAssembled++;

            // set up next prim state
            this->vert[0] = this->vert[1];
            this->vert[1] = this->vert[2];
            this->vert[2] = this->vert[3];
            this->curIndex = 3;
        }
    }

    void ProcessVertLineStripAdjNoGs(uint32_t index, bool finish)
    {
        this->vert[this->curIndex] = index;
        this->curIndex++;
        if (this->curIndex == 4)
        {
            // assembled enough verts for prim, add to gather indices
            this->indices[0][this->numPrimsAssembled] = this->vert[1];
            this->indices[1][this->numPrimsAssembled] = this->vert[2];

            // increment numPrimsAssembled
            this->numPrimsAssembled++;

            // set up next prim state
            this->vert[0] = this->vert[1];
            this->vert[1] = this->vert[2];
            this->vert[2] = this->vert[3];
            this->curIndex = 3;
        }
    }

    void ProcessVertLineListAdj(uint32_t index, bool finish)
    {
        this->vert[this->curIndex] = index;
        this->curIndex++;
        if (this->curIndex == 4)
        {
            this->indices[0][this->numPrimsAssembled] = this->vert[0];
            this->indices[1][this->numPrimsAssembled] = this->vert[1];
            this->indices[2][this->numPrimsAssembled] = this->vert[2];
            this->indices[3][this->numPrimsAssembled] = this->vert[3];

            this->numPrimsAssembled++;
            this->curIndex = 0;
        }
    }

    void ProcessVertLineListAdjNoGs(uint32_t index, bool finish)
    {
        this->vert[this->curIndex] = index;
        this->curIndex++;
        if (this->curIndex == 4)
        {
            this->indices[0][this->numPrimsAssembled] = this->vert[1];
            this->indices[1][this->numPrimsAssembled] = this->vert[2];

            this->numPrimsAssembled++;
            this->curIndex = 0;
        }
    }

    void ProcessVertPointList(uint32_t index, bool finish)
    {
        this->vert[this->curIndex] = index;
        this->curIndex++;
        if (this->curIndex == 1)
        {
            this->indices[0][this->numPrimsAssembled] = this->vert[0];
            this->numPrimsAssembled++;
            this->curIndex = 0;
        }
    }
};

// Primitive Assembly for data output from the DomainShader.
struct PA_TESS : PA_STATE
{
    PA_TESS(
        DRAW_CONTEXT *in_pDC,
        const SIMDSCALAR* in_pVertData,
        uint32_t in_attributeStrideInVectors,
        uint32_t in_vertexStride,
        uint32_t in_numAttributes,
        uint32_t* (&in_ppIndices)[3],
        uint32_t in_numPrims,
        PRIMITIVE_TOPOLOGY in_binTopology,
        uint32_t numVertsPerPrim) :

        PA_STATE(in_pDC, nullptr, 0, in_vertexStride, numVertsPerPrim),
        m_pVertexData(in_pVertData),
        m_attributeStrideInVectors(in_attributeStrideInVectors),
        m_numAttributes(in_numAttributes),
        m_numPrims(in_numPrims)
    {
#if USE_SIMD16_FRONTEND
        m_vPrimId = _simd16_setzero_si();
#else
        m_vPrimId = _simd_setzero_si();
#endif
        binTopology = in_binTopology;
        m_ppIndices[0] = in_ppIndices[0];
        m_ppIndices[1] = in_ppIndices[1];
        m_ppIndices[2] = in_ppIndices[2];

        switch (binTopology)
        {
        case TOP_POINT_LIST:
            m_numVertsPerPrim = 1;
            break;

        case TOP_LINE_LIST:
            m_numVertsPerPrim = 2;
            break;

        case TOP_TRIANGLE_LIST:
            m_numVertsPerPrim = 3;
            break;

        default:
            SWR_INVALID("Invalid binTopology (%d) for %s", binTopology, __FUNCTION__);
            break;
        }
    }

    bool HasWork()
    {
        return m_numPrims != 0;
    }

    simdvector& GetSimdVector(uint32_t index, uint32_t slot)
    {
        SWR_INVALID("%s NOT IMPLEMENTED", __FUNCTION__);
        return junkVector;
    }

#if ENABLE_AVX512_SIMD16
    simd16vector& GetSimdVector_simd16(uint32_t index, uint32_t slot)
    {
        SWR_INVALID("%s NOT IMPLEMENTED", __FUNCTION__);
        return junkVector_simd16;
    }

#endif
    static SIMDSCALARI GenPrimMask(uint32_t numPrims)
    {
        SWR_ASSERT(numPrims <= SIMD_WIDTH);
#if USE_SIMD16_FRONTEND
        static const OSALIGNLINE(int32_t) maskGen[SIMD_WIDTH * 2] =
        {
            -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
             0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
        };

        return _simd16_loadu_si((const SIMDSCALARI*)&maskGen[SIMD_WIDTH - numPrims]);
#else
        static const OSALIGNLINE(int32_t) maskGen[SIMD_WIDTH * 2] =
        {
            -1, -1, -1, -1, -1, -1, -1, -1,
             0,  0,  0,  0,  0,  0,  0,  0
        };

        return _simd_loadu_si((const SIMDSCALARI*)&maskGen[SIMD_WIDTH - numPrims]);
#endif
    }
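
    // For example (illustrative, 8-wide): numPrims == 3 loads from &maskGen[5],
    // yielding { -1, -1, -1, 0, 0, 0, 0, 0 }, an unaligned window into the table
    // that enables exactly the first numPrims gather lanes.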

    bool Assemble(uint32_t slot, simdvector verts[])
    {
        SWR_ASSERT(slot < m_numAttributes);

        uint32_t numPrimsToAssemble = PA_TESS::NumPrims();
        if (0 == numPrimsToAssemble)
        {
            return false;
        }

        SIMDSCALARI mask = GenPrimMask(numPrimsToAssemble);

        const float* pBaseAttrib = (const float*)&m_pVertexData[slot * m_attributeStrideInVectors * 4];
        for (uint32_t i = 0; i < m_numVertsPerPrim; ++i)
        {
#if USE_SIMD16_FRONTEND
            SIMDSCALARI indices = _simd16_load_si((const SIMDSCALARI*)m_ppIndices[i]);
#else
            SIMDSCALARI indices = _simd_load_si((const SIMDSCALARI*)m_ppIndices[i]);
#endif

            const float* pBase = pBaseAttrib;
            for (uint32_t c = 0; c < 4; ++c)
            {
#if USE_SIMD16_FRONTEND
                simd16scalar temp = _simd16_mask_i32gather_ps(
                    _simd16_setzero_ps(),
                    pBase,
                    indices,
                    _simd16_castsi_ps(mask),
                    4 /* gcc doesn't like sizeof(float) */);

                verts[i].v[c] = useAlternateOffset ? _simd16_extract_ps(temp, 1) : _simd16_extract_ps(temp, 0);
#else
                verts[i].v[c] = _simd_mask_i32gather_ps(
                    _simd_setzero_ps(),
                    pBase,
                    indices,
                    _simd_castsi_ps(mask),
                    4); // gcc doesn't like sizeof(float)
#endif
                pBase += m_attributeStrideInVectors * SIMD_WIDTH;
            }
        }

        return true;
    }

#if ENABLE_AVX512_SIMD16
    bool Assemble(uint32_t slot, simd16vector verts[])
    {
        SWR_ASSERT(slot < m_numAttributes);

        uint32_t numPrimsToAssemble = PA_TESS::NumPrims();
        if (0 == numPrimsToAssemble)
        {
            return false;
        }

        SIMDSCALARI mask = GenPrimMask(numPrimsToAssemble);

        const float* pBaseAttrib = (const float*)&m_pVertexData[slot * m_attributeStrideInVectors * 4];
        for (uint32_t i = 0; i < m_numVertsPerPrim; ++i)
        {
#if USE_SIMD16_FRONTEND
            SIMDSCALARI indices = _simd16_load_si((const SIMDSCALARI*)m_ppIndices[i]);
#else
            SIMDSCALARI indices = _simd_load_si((const SIMDSCALARI*)m_ppIndices[i]);
#endif

            const float* pBase = pBaseAttrib;
            for (uint32_t c = 0; c < 4; ++c)
            {
#if USE_SIMD16_FRONTEND
                verts[i].v[c] = _simd16_mask_i32gather_ps(
                    _simd16_setzero_ps(),
                    pBase,
                    indices,
                    _simd16_castsi_ps(mask),
                    4 /* gcc doesn't like sizeof(float) */);
#else
                simdscalar temp = _simd_mask_i32gather_ps(
                    _simd_setzero_ps(),
                    pBase,
                    indices,
                    _simd_castsi_ps(mask),
                    4 /* gcc doesn't like sizeof(float) */);
                verts[i].v[c] = _simd16_insert_ps(_simd16_setzero_ps(), temp, 0);
#endif
                pBase += m_attributeStrideInVectors * SIMD_WIDTH;
            }
        }

        return true;
    }

#endif
    void AssembleSingle(uint32_t slot, uint32_t primIndex, simd4scalar verts[])
    {
        SWR_ASSERT(slot < m_numAttributes);
        SWR_ASSERT(primIndex < PA_TESS::NumPrims());

        const float* pVertDataBase = (const float*)&m_pVertexData[slot * m_attributeStrideInVectors * 4];
        for (uint32_t i = 0; i < m_numVertsPerPrim; ++i)
        {
#if USE_SIMD16_FRONTEND
            uint32_t index = useAlternateOffset ? m_ppIndices[i][primIndex + SIMD_WIDTH_DIV2] : m_ppIndices[i][primIndex];
#else
            uint32_t index = m_ppIndices[i][primIndex];
#endif
            const float* pVertData = pVertDataBase;
            float* pVert = (float*)&verts[i];

            for (uint32_t c = 0; c < 4; ++c)
            {
                pVert[c] = pVertData[index];
                pVertData += m_attributeStrideInVectors * SIMD_WIDTH;
            }
        }
    }

    bool NextPrim()
    {
        uint32_t numPrims = PA_TESS::NumPrims();
        m_numPrims -= numPrims;
        m_ppIndices[0] += numPrims;
        m_ppIndices[1] += numPrims;
        m_ppIndices[2] += numPrims;

        return HasWork();
    }

    SIMDVERTEX& GetNextVsOutput()
    {
        SWR_NOT_IMPL;
        return junkVertex;
    }

    bool GetNextStreamOutput()
    {
        SWR_NOT_IMPL;
        return false;
    }

    SIMDMASK& GetNextVsIndices()
    {
        SWR_NOT_IMPL;
        return junkIndices;
    }

    uint32_t NumPrims()
    {
        return std::min<uint32_t>(m_numPrims, SIMD_WIDTH);
    }

    void Reset()
    {
        SWR_NOT_IMPL;
    }

    SIMDSCALARI GetPrimID(uint32_t startID)
    {
#if USE_SIMD16_FRONTEND
        return _simd16_add_epi32(_simd16_set1_epi32(startID), m_vPrimId);
#else
        return _simd_add_epi32(_simd_set1_epi32(startID), m_vPrimId);
#endif
    }

private:
    const SIMDSCALAR* m_pVertexData = nullptr;
    uint32_t m_attributeStrideInVectors = 0;
    uint32_t m_numAttributes = 0;
    uint32_t m_numPrims = 0;
    uint32_t* m_ppIndices[3];

    uint32_t m_numVertsPerPrim = 0;

    SIMDSCALARI m_vPrimId;

    simdvector junkVector;              // junk simdvector for unimplemented API
#if ENABLE_AVX512_SIMD16
    simd16vector junkVector_simd16;     // junk simd16vector for unimplemented API
#endif
    SIMDVERTEX junkVertex;              // junk SIMDVERTEX for unimplemented API
    SIMDMASK junkIndices;               // temporary index store for unused virtual function
};

// Primitive Assembler factory class, responsible for creating and initializing the correct assembler
// based on state.
template <typename IsIndexedT, typename IsCutIndexEnabledT>
struct PA_FACTORY
{
    PA_FACTORY(DRAW_CONTEXT* pDC, PRIMITIVE_TOPOLOGY in_topo, uint32_t numVerts, PA_STATE::SIMDVERTEX *pVertexStore, uint32_t vertexStoreSize, uint32_t vertexStride, uint32_t numVertsPerPrim) : topo(in_topo)
    {
#if KNOB_ENABLE_CUT_AWARE_PA == TRUE
        const API_STATE& state = GetApiState(pDC);
        if ((IsIndexedT::value && IsCutIndexEnabledT::value && (
            topo == TOP_TRIANGLE_STRIP || topo == TOP_POINT_LIST ||
            topo == TOP_LINE_LIST || topo == TOP_LINE_STRIP ||
            topo == TOP_TRIANGLE_LIST)) ||

            // non-indexed draws with adjacency topologies must use cut-aware PA until we add support
            // for them in the optimized PA
            (topo == TOP_LINE_LIST_ADJ || topo == TOP_LISTSTRIP_ADJ || topo == TOP_TRI_LIST_ADJ || topo == TOP_TRI_STRIP_ADJ))
        {
            memset(&indexStore, 0, sizeof(indexStore));
            uint32_t numAttribs = state.feNumAttributes;

            new (&this->paCut) PA_STATE_CUT(pDC, reinterpret_cast<uint8_t *>(pVertexStore), vertexStoreSize * PA_STATE::SIMD_WIDTH,
                vertexStride, &this->indexStore[0], numVerts, numAttribs, state.topology, false, numVertsPerPrim);
            cutPA = true;
        }
        else
#endif
        {
            uint32_t numPrims = GetNumPrims(in_topo, numVerts);
            new (&this->paOpt) PA_STATE_OPT(pDC, numPrims, reinterpret_cast<uint8_t *>(pVertexStore), vertexStoreSize * PA_STATE::SIMD_WIDTH, vertexStride, false, numVertsPerPrim);
            cutPA = false;
        }
    }

    PA_STATE& GetPA()
    {
#if KNOB_ENABLE_CUT_AWARE_PA == TRUE
        if (cutPA)
        {
            return this->paCut;
        }
        else
#endif
        {
            return this->paOpt;
        }
    }

    PA_STATE_OPT paOpt;
    PA_STATE_CUT paCut;

    bool cutPA{ false };

    PRIMITIVE_TOPOLOGY topo{ TOP_UNKNOWN };

    PA_STATE::SIMDMASK indexStore[MAX_NUM_VERTS_PER_PRIM];
};
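
// Illustrative sketch (not compiled): constructing the factory and fetching the
// assembler it selected. The template arguments and surrounding variables are
// placeholders for the real front-end state.
#if 0
PA_FACTORY<IsIndexedT, IsCutIndexEnabledT> paFactory(pDC, topo, numVerts, pVertexStore,
                                                     vertexStoreSize, vertexStride, numVertsPerPrim);
PA_STATE& pa = paFactory.GetPA();   // PA_STATE_CUT or PA_STATE_OPT, chosen from draw state
#endif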