swr: [rasterizer core] Finish SIMD16 PA OPT except tessellation
[mesa.git] / src / gallium / drivers / swr / rasterizer / core / pa.h
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file pa.h
24 *
25 * @brief Definitions for primitive assembly.
26 * N primitives are assembled at a time, where N is the SIMD width.
27 * A state machine, that is specific for a given topology, drives the
28 * assembly of vertices into triangles.
29 *
30 ******************************************************************************/
31 #pragma once
32
33 #include "frontend.h"
34
// Abstract base for all primitive assemblers (optimized, cut-aware, etc.).
// Owns the draw context, the input vertex stream, and the working topology,
// and defines the virtual interface the binner drives.
struct PA_STATE
{
#if USE_SIMD16_FRONTEND
    // SIMD16 front end: 16 lanes per simd vertex/primitive batch.
    enum
    {
        SIMD_WIDTH = KNOB_SIMD16_WIDTH,
        SIMD_WIDTH_DIV2 = KNOB_SIMD16_WIDTH / 2,
        SIMD_WIDTH_LOG2 = 4           // log2(16), used for index-to-batch shifts
    };

    typedef simd16mask SIMDMASK;

    typedef simd16scalar SIMDSCALAR;
    typedef simd16vector SIMDVECTOR;
    typedef simd16vertex SIMDVERTEX;

    typedef simd16scalari SIMDSCALARI;

#else
    // Native (8-wide AVX) front end.
    enum
    {
        SIMD_WIDTH = KNOB_SIMD_WIDTH,
        SIMD_WIDTH_DIV2 = KNOB_SIMD_WIDTH / 2,
        SIMD_WIDTH_LOG2 = 3           // log2(8), used for index-to-batch shifts
    };

    typedef simdmask SIMDMASK;

    typedef simdscalar SIMDSCALAR;
    typedef simdvector SIMDVECTOR;
    typedef simdvertex SIMDVERTEX;

    typedef simdscalari SIMDSCALARI;

#endif
    DRAW_CONTEXT *pDC{ nullptr };       // draw context
    uint8_t* pStreamBase{ nullptr };    // vertex stream
    uint32_t streamSizeInVerts{ 0 };    // total size of the input stream in verts

    // The topology the binner will use. In some cases the FE changes the topology from the api state.
    PRIMITIVE_TOPOLOGY binTopology{ TOP_UNKNOWN };

#if ENABLE_AVX512_SIMD16
    // When true, AssembleSingle/Assemble read the upper half of a simd16 batch.
    bool useAlternateOffset{ false };

#endif
    PA_STATE() {}
    PA_STATE(DRAW_CONTEXT *in_pDC, uint8_t* in_pStreamBase, uint32_t in_streamSizeInVerts) :
        pDC(in_pDC), pStreamBase(in_pStreamBase), streamSizeInVerts(in_streamSizeInVerts) {}

    // True while more primitives remain to be assembled for the draw.
    virtual bool HasWork() = 0;
    // Direct access to attribute 'slot' of simd vertex 'index' in the stream.
    virtual simdvector& GetSimdVector(uint32_t index, uint32_t slot) = 0;
#if ENABLE_AVX512_SIMD16
    virtual simd16vector& GetSimdVector_simd16(uint32_t index, uint32_t slot) = 0;
#endif
    // Assemble a full simd batch of primitives for one attribute slot;
    // returns false if not enough primitives are ready yet.
    virtual bool Assemble(uint32_t slot, simdvector verts[]) = 0;
#if ENABLE_AVX512_SIMD16
    virtual bool Assemble_simd16(uint32_t slot, simd16vector verts[]) = 0;
#endif
    // Assemble one primitive (primIndex within the current batch) as scalar __m128 verts.
    virtual void AssembleSingle(uint32_t slot, uint32_t primIndex, __m128 verts[]) = 0;
    // Advance the assembler state; returns true if more prims remain in the current batch.
    virtual bool NextPrim() = 0;
    // Returns the next simd vertex slot the VS should write into.
    virtual SIMDVERTEX& GetNextVsOutput() = 0;
    virtual bool GetNextStreamOutput() = 0;
    // Returns the cut-index mask word for the next simd vertex batch.
    virtual SIMDMASK& GetNextVsIndices() = 0;
    // Number of valid primitives in the current batch (last batch may be partial).
    virtual uint32_t NumPrims() = 0;
    virtual void Reset() = 0;
    // Per-lane primitive IDs biased by the draw's starting prim ID.
    virtual SIMDSCALARI GetPrimID(uint32_t startID) = 0;
};
103
104 // The Optimized PA is a state machine that assembles triangles from vertex shader simd
105 // output. Here is the sequence
106 // 1. Execute FS/VS to generate a simd vertex (4 vertices for SSE simd and 8 for AVX simd).
107 // 2. Execute PA function to assemble and bin triangles.
108 // a. The PA function is a set of functions that collectively make up the
109 // state machine for a given topology.
110 // 1. We use a state index to track which PA function to call.
//          b. Often the PA function needs 2 simd vertices in order to assemble the next triangle.
112 // 1. We call this the current and previous simd vertex.
113 // 2. The SSE simd is 4-wide which is not a multiple of 3 needed for triangles. In
114 // order to assemble the second triangle, for a triangle list, we'll need the
115 // last vertex from the previous simd and the first 2 vertices from the current simd.
116 // 3. At times the PA can assemble multiple triangles from the 2 simd vertices.
117 //
118 // This optimized PA is not cut aware, so only should be used by non-indexed draws or draws without
119 // cuts
120 struct PA_STATE_OPT : public PA_STATE
121 {
122 uint32_t numPrims{ 0 }; // Total number of primitives for draw.
123 uint32_t numPrimsComplete{ 0 }; // Total number of complete primitives.
124
125 uint32_t numSimdPrims{ 0 }; // Number of prims in current simd.
126
127 uint32_t cur{ 0 }; // index to current VS output.
128 uint32_t prev{ 0 }; // index to prev VS output. Not really needed in the state.
129 uint32_t first{ 0 }; // index to first VS output. Used for trifan.
130
131 uint32_t counter{ 0 }; // state counter
132 bool reset{ false }; // reset state
133
134 uint32_t primIDIncr{ 0 }; // how much to increment for each vector (typically vector / {1, 2})
135 SIMDSCALARI primID;
136
137 typedef bool(*PFN_PA_FUNC)(PA_STATE_OPT& state, uint32_t slot, simdvector verts[]);
138 #if ENABLE_AVX512_SIMD16
139 typedef bool(*PFN_PA_FUNC_SIMD16)(PA_STATE_OPT& state, uint32_t slot, simd16vector verts[]);
140 #endif
141 typedef void(*PFN_PA_SINGLE_FUNC)(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]);
142
143 PFN_PA_FUNC pfnPaFunc{ nullptr }; // PA state machine function for assembling 4 triangles.
144 #if ENABLE_AVX512_SIMD16
145 PFN_PA_FUNC_SIMD16 pfnPaFunc_simd16{ nullptr };
146 #endif
147 PFN_PA_SINGLE_FUNC pfnPaSingleFunc{ nullptr }; // PA state machine function for assembling single triangle.
148 PFN_PA_FUNC pfnPaFuncReset{ nullptr }; // initial state to set on reset
149 #if ENABLE_AVX512_SIMD16
150 PFN_PA_FUNC_SIMD16 pfnPaFuncReset_simd16{ nullptr };
151 #endif
152
153 // state used to advance the PA when Next is called
154 PFN_PA_FUNC pfnPaNextFunc{ nullptr };
155 #if ENABLE_AVX512_SIMD16
156 PFN_PA_FUNC_SIMD16 pfnPaNextFunc_simd16{ nullptr };
157 #endif
158 uint32_t nextNumSimdPrims{ 0 };
159 uint32_t nextNumPrimsIncrement{ 0 };
160 bool nextReset{ false };
161 bool isStreaming{ false };
162
163 SIMDMASK tmpIndices{ 0 }; // temporary index store for unused virtual function
164
165 PA_STATE_OPT() {}
166 PA_STATE_OPT(DRAW_CONTEXT* pDC, uint32_t numPrims, uint8_t* pStream, uint32_t streamSizeInVerts,
167 bool in_isStreaming, PRIMITIVE_TOPOLOGY topo = TOP_UNKNOWN);
168
169 bool HasWork()
170 {
171 return (this->numPrimsComplete < this->numPrims) ? true : false;
172 }
173
174 simdvector& GetSimdVector(uint32_t index, uint32_t slot)
175 {
176 simdvertex* pVertex = (simdvertex*)pStreamBase;
177 return pVertex[index].attrib[slot];
178 }
179
180 #if ENABLE_AVX512_SIMD16
181 simd16vector& GetSimdVector_simd16(uint32_t index, uint32_t slot)
182 {
183 simd16vertex* pVertex = (simd16vertex*)pStreamBase;
184 return pVertex[index].attrib[slot];
185 }
186
187 #endif
188 // Assembles 4 triangles. Each simdvector is a single vertex from 4
189 // triangles (xxxx yyyy zzzz wwww) and there are 3 verts per triangle.
190 bool Assemble(uint32_t slot, simdvector verts[])
191 {
192 return this->pfnPaFunc(*this, slot, verts);
193 }
194
195 #if ENABLE_AVX512_SIMD16
196 bool Assemble_simd16(uint32_t slot, simd16vector verts[])
197 {
198 return this->pfnPaFunc_simd16(*this, slot, verts);
199 }
200
201 #endif
202 // Assembles 1 primitive. Each simdscalar is a vertex (xyzw).
203 void AssembleSingle(uint32_t slot, uint32_t primIndex, __m128 verts[])
204 {
205 return this->pfnPaSingleFunc(*this, slot, primIndex, verts);
206 }
207
208 bool NextPrim()
209 {
210 this->pfnPaFunc = this->pfnPaNextFunc;
211 #if ENABLE_AVX512_SIMD16
212 this->pfnPaFunc_simd16 = this->pfnPaNextFunc_simd16;
213 #endif
214 this->numSimdPrims = this->nextNumSimdPrims;
215 this->numPrimsComplete += this->nextNumPrimsIncrement;
216 this->reset = this->nextReset;
217
218 if (this->isStreaming)
219 {
220 this->reset = false;
221 }
222
223 bool morePrims = false;
224
225 if (this->numSimdPrims > 0)
226 {
227 morePrims = true;
228 this->numSimdPrims--;
229 }
230 else
231 {
232 this->counter = (this->reset) ? 0 : (this->counter + 1);
233 this->reset = false;
234 }
235
236 this->pfnPaFunc = this->pfnPaNextFunc;
237
238 if (!HasWork())
239 {
240 morePrims = false; // no more to do
241 }
242
243 return morePrims;
244 }
245
246 SIMDVERTEX& GetNextVsOutput()
247 {
248 // increment cur and prev indices
249 const uint32_t numSimdVerts = this->streamSizeInVerts / SIMD_WIDTH;
250 this->prev = this->cur; // prev is undefined for first state.
251 this->cur = this->counter % numSimdVerts;
252
253 SIMDVERTEX* pVertex = (SIMDVERTEX*)pStreamBase;
254 return pVertex[this->cur];
255 }
256
257 SIMDMASK& GetNextVsIndices()
258 {
259 // unused in optimized PA, pass tmp buffer back
260 return tmpIndices;
261 }
262
263 bool GetNextStreamOutput()
264 {
265 this->prev = this->cur;
266 this->cur = this->counter;
267
268 return HasWork();
269 }
270
271 uint32_t NumPrims()
272 {
273 return (this->numPrimsComplete + this->nextNumPrimsIncrement > this->numPrims) ?
274 (SIMD_WIDTH - (this->numPrimsComplete + this->nextNumPrimsIncrement - this->numPrims)) : SIMD_WIDTH;
275 }
276
277 void SetNextState(PA_STATE_OPT::PFN_PA_FUNC pfnPaNextFunc,
278 PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc,
279 uint32_t numSimdPrims = 0,
280 uint32_t numPrimsIncrement = 0,
281 bool reset = false)
282 {
283 this->pfnPaNextFunc = pfnPaNextFunc;
284 this->nextNumSimdPrims = numSimdPrims;
285 this->nextNumPrimsIncrement = numPrimsIncrement;
286 this->nextReset = reset;
287
288 this->pfnPaSingleFunc = pfnPaNextSingleFunc;
289 }
290
291 #if ENABLE_AVX512_SIMD16
292 void SetNextState_simd16(PA_STATE_OPT::PFN_PA_FUNC_SIMD16 pfnPaNextFunc_simd16,
293 PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc,
294 uint32_t numSimdPrims = 0,
295 uint32_t numPrimsIncrement = 0,
296 bool reset = false)
297 {
298 this->pfnPaNextFunc_simd16 = pfnPaNextFunc_simd16;
299 this->nextNumSimdPrims = numSimdPrims;
300 this->nextNumPrimsIncrement = numPrimsIncrement;
301 this->nextReset = reset;
302
303 this->pfnPaSingleFunc = pfnPaNextSingleFunc;
304 }
305
306 #endif
307 void Reset()
308 {
309 #if ENABLE_AVX512_SIMD16
310 useAlternateOffset = false;
311
312 #endif
313 this->pfnPaFunc = this->pfnPaFuncReset;
314 this->numPrimsComplete = 0;
315 this->numSimdPrims = 0;
316 this->cur = 0;
317 this->prev = 0;
318 this->first = 0;
319 this->counter = 0;
320 this->reset = false;
321 }
322
323 SIMDSCALARI GetPrimID(uint32_t startID)
324 {
325 #if USE_SIMD16_FRONTEND
326 return _simd16_add_epi32(this->primID,
327 _simd16_set1_epi32(startID + this->primIDIncr * (this->numPrimsComplete / SIMD_WIDTH)));
328 #else
329 return _simd_add_epi32(this->primID,
330 _simd_set1_epi32(startID + this->primIDIncr * (this->numPrimsComplete / SIMD_WIDTH)));
331 #endif
332 }
333 };
334
335 // helper C wrappers to avoid having to rewrite all the PA topology state functions
336 INLINE void SetNextPaState(PA_STATE_OPT& pa, PA_STATE_OPT::PFN_PA_FUNC pfnPaNextFunc,
337 PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc,
338 uint32_t numSimdPrims = 0,
339 uint32_t numPrimsIncrement = 0,
340 bool reset = false)
341 {
342 return pa.SetNextState(pfnPaNextFunc, pfnPaNextSingleFunc, numSimdPrims, numPrimsIncrement, reset);
343 }
344
#if ENABLE_AVX512_SIMD16
// SIMD16 counterpart of SetNextPaState.
INLINE void SetNextPaState_simd16(PA_STATE_OPT& pa, PA_STATE_OPT::PFN_PA_FUNC_SIMD16 pfnPaNextFunc_simd16,
    PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc,
    uint32_t numSimdPrims = 0,
    uint32_t numPrimsIncrement = 0,
    bool reset = false)
{
    pa.SetNextState_simd16(pfnPaNextFunc_simd16, pfnPaNextSingleFunc, numSimdPrims, numPrimsIncrement, reset);
}

#endif
356 INLINE simdvector& PaGetSimdVector(PA_STATE& pa, uint32_t index, uint32_t slot)
357 {
358 return pa.GetSimdVector(index, slot);
359 }
360
#if ENABLE_AVX512_SIMD16
// Free-function forwarder to the PA's simd16 vector accessor.
INLINE simd16vector& PaGetSimdVector_simd16(PA_STATE& pa, uint32_t index, uint32_t slot)
{
    simd16vector& attrib = pa.GetSimdVector_simd16(index, slot);
    return attrib;
}

#endif
368 // Cut-aware primitive assembler.
369 struct PA_STATE_CUT : public PA_STATE
370 {
371 SIMDMASK* pCutIndices{ nullptr }; // cut indices buffer, 1 bit per vertex
372 uint32_t numVerts{ 0 }; // number of vertices available in buffer store
373 uint32_t numAttribs{ 0 }; // number of attributes
374 int32_t numRemainingVerts{ 0 }; // number of verts remaining to be assembled
375 uint32_t numVertsToAssemble{ 0 }; // total number of verts to assemble for the draw
376 #if ENABLE_AVX512_SIMD16
377 OSALIGNSIMD16(uint32_t) indices[MAX_NUM_VERTS_PER_PRIM][SIMD_WIDTH]; // current index buffer for gather
378 #else
379 OSALIGNSIMD(uint32_t) indices[MAX_NUM_VERTS_PER_PRIM][SIMD_WIDTH]; // current index buffer for gather
380 #endif
381 SIMDSCALARI vOffsets[MAX_NUM_VERTS_PER_PRIM]; // byte offsets for currently assembling simd
382 uint32_t numPrimsAssembled{ 0 }; // number of primitives that are fully assembled
383 uint32_t headVertex{ 0 }; // current unused vertex slot in vertex buffer store
384 uint32_t tailVertex{ 0 }; // beginning vertex currently assembling
385 uint32_t curVertex{ 0 }; // current unprocessed vertex
386 uint32_t startPrimId{ 0 }; // starting prim id
387 SIMDSCALARI vPrimId; // vector of prim ID
388 bool needOffsets{ false }; // need to compute gather offsets for current SIMD
389 uint32_t vertsPerPrim{ 0 };
390 bool processCutVerts{ false }; // vertex indices with cuts should be processed as normal, otherwise they
391 // are ignored. Fetch shader sends invalid verts on cuts that should be ignored
392 // while the GS sends valid verts for every index
393 // Topology state tracking
394 uint32_t vert[MAX_NUM_VERTS_PER_PRIM];
395 uint32_t curIndex{ 0 };
396 bool reverseWinding{ false }; // indicates reverse winding for strips
397 int32_t adjExtraVert{ 0 }; // extra vert uses for tristrip w/ adj
398
399 typedef void(PA_STATE_CUT::* PFN_PA_FUNC)(uint32_t vert, bool finish);
400 PFN_PA_FUNC pfnPa{ nullptr }; // per-topology function that processes a single vert
401
PA_STATE_CUT() {}
// Constructs a cut-aware PA over a ring buffer of verts.
//  in_pStream         - backing vertex store (ring of in_streamSizeInVerts verts)
//  in_pIndices        - cut-index bitfield, 1 bit per vertex
//  in_numVerts        - total verts to assemble for the draw
//  topo               - primitive topology; selects the per-vertex state function
//  in_processCutVerts - if true, verts flagged as cuts are still fed to the
//                       topology function (GS path); otherwise they are dropped
PA_STATE_CUT(DRAW_CONTEXT* pDC, uint8_t* in_pStream, uint32_t in_streamSizeInVerts, SIMDMASK* in_pIndices, uint32_t in_numVerts,
    uint32_t in_numAttribs, PRIMITIVE_TOPOLOGY topo, bool in_processCutVerts)
    : PA_STATE(pDC, in_pStream, in_streamSizeInVerts)
{
    numVerts = in_streamSizeInVerts;
    numAttribs = in_numAttribs;
    binTopology = topo;
    needOffsets = false;
    processCutVerts = in_processCutVerts;

    numVertsToAssemble = numRemainingVerts = in_numVerts;
    numPrimsAssembled = 0;
    headVertex = tailVertex = curVertex = 0;

    curIndex = 0;
    pCutIndices = in_pIndices;
    memset(indices, 0, sizeof(indices));
    // seed per-lane prim IDs 0..SIMD_WIDTH-1
#if USE_SIMD16_FRONTEND
    vPrimId = _simd16_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
#else
    vPrimId = _simd_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
#endif
    reverseWinding = false;
    adjExtraVert = -1;

    bool gsEnabled = pDC->pState->state.gsState.gsEnable;
    vertsPerPrim = NumVertsPerPrim(topo, gsEnabled);

    // pick the per-vertex processing function for the topology; adjacency
    // topologies have separate no-GS variants that discard adjacency verts
    switch (topo)
    {
    case TOP_TRIANGLE_LIST:     pfnPa = &PA_STATE_CUT::ProcessVertTriList; break;
    case TOP_TRI_LIST_ADJ:      pfnPa = gsEnabled ? &PA_STATE_CUT::ProcessVertTriListAdj : &PA_STATE_CUT::ProcessVertTriListAdjNoGs; break;
    case TOP_TRIANGLE_STRIP:    pfnPa = &PA_STATE_CUT::ProcessVertTriStrip; break;
    case TOP_TRI_STRIP_ADJ:     if (gsEnabled)
                                {
                                    pfnPa = &PA_STATE_CUT::ProcessVertTriStripAdj < true > ;
                                }
                                else
                                {
                                    pfnPa = &PA_STATE_CUT::ProcessVertTriStripAdj < false > ;
                                }
                                break;

    case TOP_POINT_LIST:        pfnPa = &PA_STATE_CUT::ProcessVertPointList; break;
    case TOP_LINE_LIST:         pfnPa = &PA_STATE_CUT::ProcessVertLineList; break;
    case TOP_LINE_LIST_ADJ:     pfnPa = gsEnabled ? &PA_STATE_CUT::ProcessVertLineListAdj : &PA_STATE_CUT::ProcessVertLineListAdjNoGs; break;
    case TOP_LINE_STRIP:        pfnPa = &PA_STATE_CUT::ProcessVertLineStrip; break;
    case TOP_LISTSTRIP_ADJ:     pfnPa = gsEnabled ? &PA_STATE_CUT::ProcessVertLineStripAdj : &PA_STATE_CUT::ProcessVertLineStripAdjNoGs; break;
    default: assert(0 && "Unimplemented topology");
    }
}
454
455 SIMDVERTEX& GetNextVsOutput()
456 {
457 uint32_t vertexIndex = this->headVertex / SIMD_WIDTH;
458 this->headVertex = (this->headVertex + SIMD_WIDTH) % this->numVerts;
459 this->needOffsets = true;
460 return ((SIMDVERTEX*)pStreamBase)[vertexIndex];
461 }
462
463 SIMDMASK& GetNextVsIndices()
464 {
465 uint32_t vertexIndex = this->headVertex / SIMD_WIDTH;
466 SIMDMASK* pCurCutIndex = this->pCutIndices + vertexIndex;
467 return *pCurCutIndex;
468 }
469
470 simdvector& GetSimdVector(uint32_t index, uint32_t slot)
471 {
472 // unused
473 SWR_ASSERT(0 && "Not implemented");
474 static simdvector junk;
475 return junk;
476 }
477
#if ENABLE_AVX512_SIMD16
simd16vector& GetSimdVector_simd16(uint32_t index, uint32_t slot)
{
    // not used by the cut-aware PA; assert and return a dummy to satisfy
    // the virtual interface
    SWR_ASSERT(0 && "Not implemented");
    static simd16vector dummy;
    return dummy;
}

#endif
bool GetNextStreamOutput()
{
    // advance the head past one simd batch of verts
    // NOTE(review): unlike GetNextVsOutput there is no wrap by numVerts here —
    // presumably the streamout path never wraps the vertex store; confirm with callers
    this->headVertex += SIMD_WIDTH;
    this->needOffsets = true;
    return HasWork();
}
494
// Per-lane prim IDs for the current batch, biased by the draw's starting ID.
SIMDSCALARI GetPrimID(uint32_t startID)
{
#if USE_SIMD16_FRONTEND
    return _simd16_add_epi32(_simd16_set1_epi32(startID), this->vPrimId);
#else
    return _simd_add_epi32(_simd_set1_epi32(startID), this->vPrimId);
#endif
}
503
// Clears all assembly progress and re-seeds the per-lane prim IDs so the
// draw can be replayed from the beginning.
void Reset()
{
#if ENABLE_AVX512_SIMD16
    useAlternateOffset = false;

#endif
    this->numRemainingVerts = this->numVertsToAssemble;
    this->numPrimsAssembled = 0;
    this->curIndex = 0;
    this->curVertex = 0;
    this->tailVertex = 0;
    this->headVertex = 0;
    this->reverseWinding = false;
    this->adjExtraVert = -1;
    // re-seed lane-local prim IDs 0..SIMD_WIDTH-1
#if USE_SIMD16_FRONTEND
    this->vPrimId = _simd16_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
#else
    this->vPrimId = _simd_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
#endif
}
524
525 bool HasWork()
526 {
527 return this->numRemainingVerts > 0 || this->adjExtraVert != -1;
528 }
529
530 bool IsVertexStoreFull()
531 {
532 return ((this->headVertex + SIMD_WIDTH) % this->numVerts) == this->tailVertex;
533 }
534
535 void RestartTopology()
536 {
537 this->curIndex = 0;
538 this->reverseWinding = false;
539 this->adjExtraVert = -1;
540 }
541
// Tests the cut bit for a vertex; cut indices are packed 1 bit per vertex,
// SIMD_WIDTH bits per SIMDMASK word.
bool IsCutIndex(uint32_t vertex)
{
    uint32_t vertexIndex = vertex / SIMD_WIDTH;          // which mask word
    uint32_t vertexOffset = vertex & (SIMD_WIDTH - 1);   // bit within the word
    return _bittest((const LONG*)&this->pCutIndices[vertexIndex], vertexOffset) == 1;
}
548
// iterates across the unprocessed verts until we hit the end or we
// have assembled SIMD prims
void ProcessVerts()
{
    while (this->numPrimsAssembled != SIMD_WIDTH &&
        this->numRemainingVerts > 0 &&
        this->curVertex != this->headVertex)
    {
        // if cut index, restart topology
        if (IsCutIndex(this->curVertex))
        {
            if (this->processCutVerts)
            {
                // GS path: cut verts still carry valid data and must be processed
                (this->*pfnPa)(this->curVertex, false);
            }
            // finish off tri strip w/ adj before restarting topo
            if (this->adjExtraVert != -1)
            {
                (this->*pfnPa)(this->curVertex, true);
            }
            RestartTopology();
        }
        else
        {
            (this->*pfnPa)(this->curVertex, false);
        }

        // advance through the ring vertex store, wrapping at numVerts
        this->curVertex++;
        if (this->curVertex >= this->numVerts) {
           this->curVertex = 0;
        }
        this->numRemainingVerts--;
    }

    // special case last primitive for tri strip w/ adj
    if (this->numPrimsAssembled != SIMD_WIDTH && this->numRemainingVerts == 0 && this->adjExtraVert != -1)
    {
        (this->*pfnPa)(this->curVertex, true);
    }
}
589
// Retires the current batch: moves the tail up to the first unsubmitted
// vertex and bumps each lane's prim ID past the batch just submitted.
void Advance()
{
    // done with current batch
    // advance tail to the current unsubmitted vertex
    this->tailVertex = this->curVertex;
    this->numPrimsAssembled = 0;
#if USE_SIMD16_FRONTEND
    this->vPrimId = _simd16_add_epi32(vPrimId, _simd16_set1_epi32(SIMD_WIDTH));
#else
    this->vPrimId = _simd_add_epi32(vPrimId, _simd_set1_epi32(SIMD_WIDTH));
#endif
}
602
603 bool NextPrim()
604 {
605 // if we've assembled enough prims, we can advance to the next set of verts
606 if (this->numPrimsAssembled == SIMD_WIDTH || this->numRemainingVerts <= 0)
607 {
608 Advance();
609 }
610 return false;
611 }
612
// Converts the gathered vertex indices into per-lane byte offsets into the
// vertex store: offset = (index / SIMD_WIDTH) * sizeof(SIMDVERTEX)
//                      + (index % SIMD_WIDTH) * sizeof(float)
void ComputeOffsets()
{
    for (uint32_t v = 0; v < this->vertsPerPrim; ++v)
    {
        SIMDSCALARI vIndices = *(SIMDSCALARI*)&this->indices[v][0];

        // step to simdvertex batch
        const uint32_t simdShift = SIMD_WIDTH_LOG2;
#if USE_SIMD16_FRONTEND
        SIMDSCALARI vVertexBatch = _simd16_srai_epi32(vIndices, simdShift);
        this->vOffsets[v] = _simd16_mullo_epi32(vVertexBatch, _simd16_set1_epi32(sizeof(SIMDVERTEX)));
#else
        SIMDSCALARI vVertexBatch = _simd_srai_epi32(vIndices, simdShift);
        this->vOffsets[v] = _simd_mullo_epi32(vVertexBatch, _simd_set1_epi32(sizeof(SIMDVERTEX)));
#endif

        // step to index
        const uint32_t simdMask = SIMD_WIDTH - 1;
#if USE_SIMD16_FRONTEND
        SIMDSCALARI vVertexIndex = _simd16_and_si(vIndices, _simd16_set1_epi32(simdMask));
        this->vOffsets[v] = _simd16_add_epi32(this->vOffsets[v], _simd16_mullo_epi32(vVertexIndex, _simd16_set1_epi32(sizeof(float))));
#else
        SIMDSCALARI vVertexIndex = _simd_and_si(vIndices, _simd_set1_epi32(simdMask));
        this->vOffsets[v] = _simd_add_epi32(this->vOffsets[v], _simd_mullo_epi32(vVertexIndex, _simd_set1_epi32(sizeof(float))));
#endif
    }
}
640
// Assembles one simd batch of primitives for attribute 'slot' by gathering
// vertex data through the cached byte offsets. Returns false if not enough
// primitives have been assembled yet (and verts remain to process).
bool Assemble(uint32_t slot, simdvector verts[])
{
    // process any outstanding verts
    ProcessVerts();

    // return false if we don't have enough prims assembled
    if (this->numPrimsAssembled != SIMD_WIDTH && this->numRemainingVerts > 0)
    {
        return false;
    }

    // cache off gather offsets given the current SIMD set of indices the first time we get an assemble
    if (this->needOffsets)
    {
        ComputeOffsets();
        this->needOffsets = false;
    }

    for (uint32_t v = 0; v < this->vertsPerPrim; ++v)
    {
        SIMDSCALARI offsets = this->vOffsets[v];

        // step to attribute
#if USE_SIMD16_FRONTEND
        offsets = _simd16_add_epi32(offsets, _simd16_set1_epi32(slot * sizeof(SIMDVECTOR)));
#else
        offsets = _simd_add_epi32(offsets, _simd_set1_epi32(slot * sizeof(SIMDVECTOR)));
#endif

        float* pBase = (float*)this->pStreamBase;
        for (uint32_t c = 0; c < 4; ++c)
        {
#if USE_SIMD16_FRONTEND
            simd16scalar temp = _simd16_i32gather_ps(pBase, offsets, 1);

            // caller selects which half of the simd16 gather feeds this simd8 output
            verts[v].v[c] = useAlternateOffset ? temp.hi : temp.lo;
#else
            verts[v].v[c] = _simd_i32gather_ps(pBase, offsets, 1);
#endif

            // move base to next component
            pBase += SIMD_WIDTH;
        }
    }

    return true;
}
688
#if ENABLE_AVX512_SIMD16
// SIMD16 variant of Assemble: gathers a full simd16-wide set of primitives
// for attribute 'slot'. Returns false if not enough primitives are ready.
bool Assemble_simd16(uint32_t slot, simd16vector verts[])
{
    // process any outstanding verts
    ProcessVerts();

    // return false if we don't have enough prims assembled
    if (this->numPrimsAssembled != SIMD_WIDTH && this->numRemainingVerts > 0)
    {
        return false;
    }

    // cache off gather offsets given the current SIMD set of indices the first time we get an assemble
    if (this->needOffsets)
    {
        ComputeOffsets();
        this->needOffsets = false;
    }

    for (uint32_t v = 0; v < this->vertsPerPrim; ++v)
    {
        SIMDSCALARI offsets = this->vOffsets[v];

        // step to attribute
#if USE_SIMD16_FRONTEND
        offsets = _simd16_add_epi32(offsets, _simd16_set1_epi32(slot * sizeof(SIMDVECTOR)));
#else
        // use SIMDVECTOR (rather than spelling out simdvector) for consistency
        // with the simd16 path above; in this configuration SIMDVECTOR is a
        // typedef of simdvector, so the computed stride is unchanged
        offsets = _simd_add_epi32(offsets, _simd_set1_epi32(slot * sizeof(SIMDVECTOR)));
#endif

        float* pBase = (float*)this->pStreamBase;
        for (uint32_t c = 0; c < 4; ++c)
        {
#if USE_SIMD16_FRONTEND
            verts[v].v[c] = _simd16_i32gather_ps(pBase, offsets, 1);
#else
            // no simd16 front end: gather fills the low half, high half is zeroed
            verts[v].v[c].lo = _simd_i32gather_ps(pBase, offsets, 1);
            verts[v].v[c].hi = _simd_setzero_ps();
#endif

            // move base to next component
            pBase += SIMD_WIDTH;
        }
    }

    return true;
}

#endif
// Assembles one primitive (triIndex within the current batch) with scalar
// loads; tri[] receives one xyzw __m128 per vertex of the primitive.
void AssembleSingle(uint32_t slot, uint32_t triIndex, __m128 tri[3])
{
    // move to slot
    for (uint32_t v = 0; v < this->vertsPerPrim; ++v)
    {
        uint32_t* pOffset = (uint32_t*)&this->vOffsets[v];
#if USE_SIMD16_FRONTEND
        // alternate offset selects the upper half of the simd16 offset vector
        uint32_t offset = useAlternateOffset ? pOffset[triIndex + SIMD_WIDTH_DIV2] : pOffset[triIndex];
#else
        uint32_t offset = pOffset[triIndex];
#endif
        offset += sizeof(SIMDVECTOR) * slot;
        float* pVert = (float*)&tri[v];
        for (uint32_t c = 0; c < 4; ++c)
        {
            float* pComponent = (float*)(this->pStreamBase + offset);
            pVert[c] = *pComponent;
            // components are stored SIMD_WIDTH floats apart (SoA layout)
            offset += SIMD_WIDTH * sizeof(float);
        }
    }
}
759
760 uint32_t NumPrims()
761 {
762 return this->numPrimsAssembled;
763 }
764
765 // Per-topology functions
766 void ProcessVertTriStrip(uint32_t index, bool finish)
767 {
768 this->vert[this->curIndex] = index;
769 this->curIndex++;
770 if (this->curIndex == 3)
771 {
772 // assembled enough verts for prim, add to gather indices
773 this->indices[0][this->numPrimsAssembled] = this->vert[0];
774 if (reverseWinding)
775 {
776 this->indices[1][this->numPrimsAssembled] = this->vert[2];
777 this->indices[2][this->numPrimsAssembled] = this->vert[1];
778 }
779 else
780 {
781 this->indices[1][this->numPrimsAssembled] = this->vert[1];
782 this->indices[2][this->numPrimsAssembled] = this->vert[2];
783 }
784
785 // increment numPrimsAssembled
786 this->numPrimsAssembled++;
787
788 // set up next prim state
789 this->vert[0] = this->vert[1];
790 this->vert[1] = this->vert[2];
791 this->curIndex = 2;
792 this->reverseWinding ^= 1;
793 }
794 }
795
// Emits one tri-strip-with-adjacency primitive from the vert[] window.
// With a GS all six verts are emitted; without one, only the interior
// triangle (verts 0/2/4 collapsed into slots 0/1/2) is emitted.
template<bool gsEnabled>
void AssembleTriStripAdj()
{
    if (!gsEnabled)
    {
        // collapse to the interior triangle, then restore the window so the
        // strip state machine can continue
        this->vert[1] = this->vert[2];
        this->vert[2] = this->vert[4];

        this->indices[0][this->numPrimsAssembled] = this->vert[0];
        this->indices[1][this->numPrimsAssembled] = this->vert[1];
        this->indices[2][this->numPrimsAssembled] = this->vert[2];

        this->vert[4] = this->vert[2];
        this->vert[2] = this->vert[1];
    }
    else
    {
        this->indices[0][this->numPrimsAssembled] = this->vert[0];
        this->indices[1][this->numPrimsAssembled] = this->vert[1];
        this->indices[2][this->numPrimsAssembled] = this->vert[2];
        this->indices[3][this->numPrimsAssembled] = this->vert[3];
        this->indices[4][this->numPrimsAssembled] = this->vert[4];
        this->indices[5][this->numPrimsAssembled] = this->vert[5];
    }
    this->numPrimsAssembled++;
}
822
823
824 template<bool gsEnabled>
825 void ProcessVertTriStripAdj(uint32_t index, bool finish)
826 {
827 // handle last primitive of tristrip
828 if (finish && this->adjExtraVert != -1)
829 {
830 this->vert[3] = this->adjExtraVert;
831 AssembleTriStripAdj<gsEnabled>();
832 this->adjExtraVert = -1;
833 return;
834 }
835
836 switch (this->curIndex)
837 {
838 case 0:
839 case 1:
840 case 2:
841 case 4:
842 this->vert[this->curIndex] = index;
843 this->curIndex++;
844 break;
845 case 3:
846 this->vert[5] = index;
847 this->curIndex++;
848 break;
849 case 5:
850 if (this->adjExtraVert == -1)
851 {
852 this->adjExtraVert = index;
853 }
854 else
855 {
856 this->vert[3] = index;
857 if (!gsEnabled)
858 {
859 AssembleTriStripAdj<gsEnabled>();
860
861 uint32_t nextTri[6];
862 if (this->reverseWinding)
863 {
864 nextTri[0] = this->vert[4];
865 nextTri[1] = this->vert[0];
866 nextTri[2] = this->vert[2];
867 nextTri[4] = this->vert[3];
868 nextTri[5] = this->adjExtraVert;
869 }
870 else
871 {
872 nextTri[0] = this->vert[2];
873 nextTri[1] = this->adjExtraVert;
874 nextTri[2] = this->vert[3];
875 nextTri[4] = this->vert[4];
876 nextTri[5] = this->vert[0];
877 }
878 for (uint32_t i = 0; i < 6; ++i)
879 {
880 this->vert[i] = nextTri[i];
881 }
882
883 this->adjExtraVert = -1;
884 this->reverseWinding ^= 1;
885 }
886 else
887 {
888 this->curIndex++;
889 }
890 }
891 break;
892 case 6:
893 SWR_ASSERT(this->adjExtraVert != -1, "Algorith failure!");
894 AssembleTriStripAdj<gsEnabled>();
895
896 uint32_t nextTri[6];
897 if (this->reverseWinding)
898 {
899 nextTri[0] = this->vert[4];
900 nextTri[1] = this->vert[0];
901 nextTri[2] = this->vert[2];
902 nextTri[4] = this->vert[3];
903 nextTri[5] = this->adjExtraVert;
904 }
905 else
906 {
907 nextTri[0] = this->vert[2];
908 nextTri[1] = this->adjExtraVert;
909 nextTri[2] = this->vert[3];
910 nextTri[4] = this->vert[4];
911 nextTri[5] = this->vert[0];
912 }
913 for (uint32_t i = 0; i < 6; ++i)
914 {
915 this->vert[i] = nextTri[i];
916 }
917 this->reverseWinding ^= 1;
918 this->adjExtraVert = index;
919 this->curIndex--;
920 break;
921 }
922 }
923
924 void ProcessVertTriList(uint32_t index, bool finish)
925 {
926 this->vert[this->curIndex] = index;
927 this->curIndex++;
928 if (this->curIndex == 3)
929 {
930 // assembled enough verts for prim, add to gather indices
931 this->indices[0][this->numPrimsAssembled] = this->vert[0];
932 this->indices[1][this->numPrimsAssembled] = this->vert[1];
933 this->indices[2][this->numPrimsAssembled] = this->vert[2];
934
935 // increment numPrimsAssembled
936 this->numPrimsAssembled++;
937
938 // set up next prim state
939 this->curIndex = 0;
940 }
941 }
942
943 void ProcessVertTriListAdj(uint32_t index, bool finish)
944 {
945 this->vert[this->curIndex] = index;
946 this->curIndex++;
947 if (this->curIndex == 6)
948 {
949 // assembled enough verts for prim, add to gather indices
950 this->indices[0][this->numPrimsAssembled] = this->vert[0];
951 this->indices[1][this->numPrimsAssembled] = this->vert[1];
952 this->indices[2][this->numPrimsAssembled] = this->vert[2];
953 this->indices[3][this->numPrimsAssembled] = this->vert[3];
954 this->indices[4][this->numPrimsAssembled] = this->vert[4];
955 this->indices[5][this->numPrimsAssembled] = this->vert[5];
956
957 // increment numPrimsAssembled
958 this->numPrimsAssembled++;
959
960 // set up next prim state
961 this->curIndex = 0;
962 }
963 }
964
965 void ProcessVertTriListAdjNoGs(uint32_t index, bool finish)
966 {
967 this->vert[this->curIndex] = index;
968 this->curIndex++;
969 if (this->curIndex == 6)
970 {
971 // assembled enough verts for prim, add to gather indices
972 this->indices[0][this->numPrimsAssembled] = this->vert[0];
973 this->indices[1][this->numPrimsAssembled] = this->vert[2];
974 this->indices[2][this->numPrimsAssembled] = this->vert[4];
975
976 // increment numPrimsAssembled
977 this->numPrimsAssembled++;
978
979 // set up next prim state
980 this->curIndex = 0;
981 }
982 }
983
984
985 void ProcessVertLineList(uint32_t index, bool finish)
986 {
987 this->vert[this->curIndex] = index;
988 this->curIndex++;
989 if (this->curIndex == 2)
990 {
991 this->indices[0][this->numPrimsAssembled] = this->vert[0];
992 this->indices[1][this->numPrimsAssembled] = this->vert[1];
993
994 this->numPrimsAssembled++;
995 this->curIndex = 0;
996 }
997 }
998
999 void ProcessVertLineStrip(uint32_t index, bool finish)
1000 {
1001 this->vert[this->curIndex] = index;
1002 this->curIndex++;
1003 if (this->curIndex == 2)
1004 {
1005 // assembled enough verts for prim, add to gather indices
1006 this->indices[0][this->numPrimsAssembled] = this->vert[0];
1007 this->indices[1][this->numPrimsAssembled] = this->vert[1];
1008
1009 // increment numPrimsAssembled
1010 this->numPrimsAssembled++;
1011
1012 // set up next prim state
1013 this->vert[0] = this->vert[1];
1014 this->curIndex = 1;
1015 }
1016 }
1017
1018 void ProcessVertLineStripAdj(uint32_t index, bool finish)
1019 {
1020 this->vert[this->curIndex] = index;
1021 this->curIndex++;
1022 if (this->curIndex == 4)
1023 {
1024 // assembled enough verts for prim, add to gather indices
1025 this->indices[0][this->numPrimsAssembled] = this->vert[0];
1026 this->indices[1][this->numPrimsAssembled] = this->vert[1];
1027 this->indices[2][this->numPrimsAssembled] = this->vert[2];
1028 this->indices[3][this->numPrimsAssembled] = this->vert[3];
1029
1030 // increment numPrimsAssembled
1031 this->numPrimsAssembled++;
1032
1033 // set up next prim state
1034 this->vert[0] = this->vert[1];
1035 this->vert[1] = this->vert[2];
1036 this->vert[2] = this->vert[3];
1037 this->curIndex = 3;
1038 }
1039 }
1040
1041 void ProcessVertLineStripAdjNoGs(uint32_t index, bool finish)
1042 {
1043 this->vert[this->curIndex] = index;
1044 this->curIndex++;
1045 if (this->curIndex == 4)
1046 {
1047 // assembled enough verts for prim, add to gather indices
1048 this->indices[0][this->numPrimsAssembled] = this->vert[1];
1049 this->indices[1][this->numPrimsAssembled] = this->vert[2];
1050
1051 // increment numPrimsAssembled
1052 this->numPrimsAssembled++;
1053
1054 // set up next prim state
1055 this->vert[0] = this->vert[1];
1056 this->vert[1] = this->vert[2];
1057 this->vert[2] = this->vert[3];
1058 this->curIndex = 3;
1059 }
1060 }
1061
1062 void ProcessVertLineListAdj(uint32_t index, bool finish)
1063 {
1064 this->vert[this->curIndex] = index;
1065 this->curIndex++;
1066 if (this->curIndex == 4)
1067 {
1068 this->indices[0][this->numPrimsAssembled] = this->vert[0];
1069 this->indices[1][this->numPrimsAssembled] = this->vert[1];
1070 this->indices[2][this->numPrimsAssembled] = this->vert[2];
1071 this->indices[3][this->numPrimsAssembled] = this->vert[3];
1072
1073 this->numPrimsAssembled++;
1074 this->curIndex = 0;
1075 }
1076 }
1077
1078 void ProcessVertLineListAdjNoGs(uint32_t index, bool finish)
1079 {
1080 this->vert[this->curIndex] = index;
1081 this->curIndex++;
1082 if (this->curIndex == 4)
1083 {
1084 this->indices[0][this->numPrimsAssembled] = this->vert[1];
1085 this->indices[1][this->numPrimsAssembled] = this->vert[2];
1086
1087 this->numPrimsAssembled++;
1088 this->curIndex = 0;
1089 }
1090 }
1091
1092 void ProcessVertPointList(uint32_t index, bool finish)
1093 {
1094 this->vert[this->curIndex] = index;
1095 this->curIndex++;
1096 if (this->curIndex == 1)
1097 {
1098 this->indices[0][this->numPrimsAssembled] = this->vert[0];
1099 this->numPrimsAssembled++;
1100 this->curIndex = 0;
1101 }
1102 }
1103 };
1104
// Primitive Assembly for data output from the DomainShader.
// Consumes SOA vertex data plus per-vertex index streams and gathers them
// into SIMD-wide primitives; only point/line/triangle lists are supported.
struct PA_TESS : PA_STATE
{
    // in_pVertData:                SOA vertex data to gather from
    // in_attributeStrideInVectors: stride, in SIMD vectors, between components of an attribute
    // in_numAttributes:            number of attributes in the vertex data
    // in_ppIndices:                one index stream per vertex of the prim (up to 3 used)
    // in_numPrims:                 total number of primitives to assemble
    // in_binTopology:              must be a point, line, or triangle list
    PA_TESS(
        DRAW_CONTEXT *in_pDC,
        const SIMDSCALAR* in_pVertData,
        uint32_t in_attributeStrideInVectors,
        uint32_t in_numAttributes,
        uint32_t* (&in_ppIndices)[3],
        uint32_t in_numPrims,
        PRIMITIVE_TOPOLOGY in_binTopology) :

        PA_STATE(in_pDC, nullptr, 0),
        m_pVertexData(in_pVertData),
        m_attributeStrideInVectors(in_attributeStrideInVectors),
        m_numAttributes(in_numAttributes),
        m_numPrims(in_numPrims)
    {
        // per-lane prim id offsets start at zero; see GetPrimID
#if USE_SIMD16_FRONTEND
        m_vPrimId = _simd16_setzero_si();
#else
        m_vPrimId = _simd_setzero_si();
#endif
        binTopology = in_binTopology;
        m_ppIndices[0] = in_ppIndices[0];
        m_ppIndices[1] = in_ppIndices[1];
        m_ppIndices[2] = in_ppIndices[2];

        // derive verts-per-prim from the bin topology
        switch (binTopology)
        {
        case TOP_POINT_LIST:
            m_numVertsPerPrim = 1;
            break;

        case TOP_LINE_LIST:
            m_numVertsPerPrim = 2;
            break;

        case TOP_TRIANGLE_LIST:
            m_numVertsPerPrim = 3;
            break;

        default:
            SWR_ASSERT(0, "Invalid binTopology (%d) for %s", binTopology, __FUNCTION__);
            break;
        }
    }

    // True while primitives remain to be assembled.
    bool HasWork()
    {
        return m_numPrims != 0;
    }

    // Not supported for the tessellation PA; asserts and returns a dummy.
    simdvector& GetSimdVector(uint32_t index, uint32_t slot)
    {
        SWR_ASSERT(0, "%s NOT IMPLEMENTED", __FUNCTION__);
        static simdvector junk;
        return junk;
    }

#if ENABLE_AVX512_SIMD16
    // Not supported for the tessellation PA; asserts and returns a dummy.
    simd16vector& GetSimdVector_simd16(uint32_t index, uint32_t slot)
    {
        SWR_ASSERT(0, "%s NOT IMPLEMENTED", __FUNCTION__);
        static simd16vector junk;
        return junk;
    }

#endif
    // Build a per-lane gather mask with the low numPrims lanes active
    // (all-ones) and the rest inactive, by loading a sliding window from a
    // static table of -1s followed by 0s.
    static SIMDSCALARI GenPrimMask(uint32_t numPrims)
    {
        SWR_ASSERT(numPrims <= SIMD_WIDTH);
#if USE_SIMD16_FRONTEND
        static const OSALIGNLINE(int32_t) maskGen[SIMD_WIDTH * 2] =
        {
            -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
             0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
        };

        // unaligned load: the window start is not generally vector-aligned
        return _simd16_loadu_si((const SIMDSCALARI*)&maskGen[SIMD_WIDTH - numPrims]);
#else
        static const OSALIGNLINE(int32_t) maskGen[SIMD_WIDTH * 2] =
        {
            -1, -1, -1, -1, -1, -1, -1, -1,
             0,  0,  0,  0,  0,  0,  0,  0
        };

        // unaligned load: the window start is not generally vector-aligned
        return _simd_loadu_si((const SIMDSCALARI*)&maskGen[SIMD_WIDTH - numPrims]);
#endif
    }

    // Gather attribute 'slot' for each vertex of the current batch of prims
    // into verts[]; masked gather zeroes lanes beyond the prim count.
    // Returns false when no prims remain.
    bool Assemble(uint32_t slot, simdvector verts[])
    {
        SWR_ASSERT(slot < m_numAttributes);

        uint32_t numPrimsToAssemble = PA_TESS::NumPrims();
        if (0 == numPrimsToAssemble)
        {
            return false;
        }

        SIMDSCALARI mask = GenPrimMask(numPrimsToAssemble);

        const float* pBaseAttrib = (const float*)&m_pVertexData[slot * m_attributeStrideInVectors * 4];
        for (uint32_t i = 0; i < m_numVertsPerPrim; ++i)
        {
#if USE_SIMD16_FRONTEND
            SIMDSCALARI indices = _simd16_load_si((const SIMDSCALARI*)m_ppIndices[i]);
#else
            SIMDSCALARI indices = _simd_load_si((const SIMDSCALARI*)m_ppIndices[i]);
#endif

            const float* pBase = pBaseAttrib;
            for (uint32_t c = 0; c < 4; ++c)
            {
#if USE_SIMD16_FRONTEND
                simd16scalar temp = _simd16_mask_i32gather_ps(
                    _simd16_setzero_ps(),
                    pBase,
                    indices,
                    mask,
                    4 /* gcc doesn't like sizeof(float) */);

                // simd-wide caller only consumes half of the simd16 gather
                verts[i].v[c] = useAlternateOffset ? temp.hi : temp.lo;
#else
                verts[i].v[c] = _simd_mask_i32gather_ps(
                    _simd_setzero_ps(),
                    pBase,
                    indices,
                    _simd_castsi_ps(mask),
                    4 /* gcc doesn't like sizeof(float) */);
#endif
                // advance to the next component of this attribute
                pBase += m_attributeStrideInVectors * SIMD_WIDTH;
            }
        }

        return true;
    }

#if ENABLE_AVX512_SIMD16
    // simd16 variant of Assemble; without the SIMD16 frontend only the low
    // half is gathered and the high half is zeroed.
    bool Assemble_simd16(uint32_t slot, simd16vector verts[])
    {
        SWR_ASSERT(slot < m_numAttributes);

        uint32_t numPrimsToAssemble = PA_TESS::NumPrims();
        if (0 == numPrimsToAssemble)
        {
            return false;
        }

        SIMDSCALARI mask = GenPrimMask(numPrimsToAssemble);

        const float* pBaseAttrib = (const float*)&m_pVertexData[slot * m_attributeStrideInVectors * 4];
        for (uint32_t i = 0; i < m_numVertsPerPrim; ++i)
        {
#if USE_SIMD16_FRONTEND
            SIMDSCALARI indices = _simd16_load_si((const SIMDSCALARI*)m_ppIndices[i]);
#else
            SIMDSCALARI indices = _simd_load_si((const SIMDSCALARI*)m_ppIndices[i]);
#endif

            const float* pBase = pBaseAttrib;
            for (uint32_t c = 0; c < 4; ++c)
            {
#if USE_SIMD16_FRONTEND
                verts[i].v[c] = _simd16_mask_i32gather_ps(
                    _simd16_setzero_ps(),
                    pBase,
                    indices,
                    mask,
                    4 /* gcc doesn't like sizeof(float) */);
#else
                verts[i].v[c].lo = _simd_mask_i32gather_ps(
                    _simd_setzero_ps(),
                    pBase,
                    indices,
                    _simd_castsi_ps(mask),
                    4 /* gcc doesn't like sizeof(float) */);
                verts[i].v[c].hi = _simd_setzero_ps();
#endif
                // advance to the next component of this attribute
                pBase += m_attributeStrideInVectors * SIMD_WIDTH;
            }
        }

        return true;
    }

#endif
    // Scalar path: assemble attribute 'slot' of the single prim 'primIndex'
    // into verts[], one __m128 (xyzw) per vertex.
    void AssembleSingle(uint32_t slot, uint32_t primIndex, __m128 verts[])
    {
        SWR_ASSERT(slot < m_numAttributes);
        SWR_ASSERT(primIndex < PA_TESS::NumPrims());

        const float* pVertDataBase = (const float*)&m_pVertexData[slot * m_attributeStrideInVectors * 4];
        for (uint32_t i = 0; i < m_numVertsPerPrim; ++i)
        {
#if USE_SIMD16_FRONTEND
            // when the caller works on the high half, bias into the upper lanes
            uint32_t index = useAlternateOffset ? m_ppIndices[i][primIndex + SIMD_WIDTH_DIV2] : m_ppIndices[i][primIndex];
#else
            uint32_t index = m_ppIndices[i][primIndex];
#endif
            const float* pVertData = pVertDataBase;
            float* pVert = (float*)&verts[i];

            for (uint32_t c = 0; c < 4; ++c)
            {
                pVert[c] = pVertData[index];
                pVertData += m_attributeStrideInVectors * SIMD_WIDTH;
            }
        }
    }

    // Consume the current SIMD batch of prims and advance all index streams.
    // Returns true while more prims remain.
    bool NextPrim()
    {
        uint32_t numPrims = PA_TESS::NumPrims();
        m_numPrims -= numPrims;
        m_ppIndices[0] += numPrims;
        m_ppIndices[1] += numPrims;
        m_ppIndices[2] += numPrims;

        return HasWork();
    }

    // Not supported for the tessellation PA; asserts and returns a dummy.
    SIMDVERTEX& GetNextVsOutput()
    {
        SWR_ASSERT(0, "%s", __FUNCTION__);
        static SIMDVERTEX junk;
        return junk;
    }

    // Not supported for the tessellation PA; asserts and returns false.
    bool GetNextStreamOutput()
    {
        SWR_ASSERT(0, "%s", __FUNCTION__);
        return false;
    }

    // Not supported for the tessellation PA; asserts and returns a dummy.
    SIMDMASK& GetNextVsIndices()
    {
        SWR_ASSERT(0, "%s", __FUNCTION__);
        static SIMDMASK junk;
        return junk;
    }

    // Number of prims in the current SIMD batch (at most SIMD_WIDTH).
    uint32_t NumPrims()
    {
        return std::min<uint32_t>(m_numPrims, SIMD_WIDTH);
    }

    // Reset is not supported for the tessellation PA.
    void Reset() { SWR_ASSERT(0); };

    // Per-lane primitive IDs: startID broadcast plus the per-lane offsets
    // held in m_vPrimId.
    SIMDSCALARI GetPrimID(uint32_t startID)
    {
#if USE_SIMD16_FRONTEND
        return _simd16_add_epi32(_simd16_set1_epi32(startID), m_vPrimId);
#else
        return _simd_add_epi32(_simd_set1_epi32(startID), m_vPrimId);
#endif
    }

private:
    const SIMDSCALAR* m_pVertexData = nullptr;      // SOA vertex data from the DS
    uint32_t m_attributeStrideInVectors = 0;        // stride (SIMD vectors) between attribute components
    uint32_t m_numAttributes = 0;                   // attribute count in the vertex data
    uint32_t m_numPrims = 0;                        // prims remaining to assemble
    uint32_t* m_ppIndices[3];                       // index stream per vertex of the prim

    uint32_t m_numVertsPerPrim = 0;                 // 1, 2, or 3 per bin topology

    SIMDSCALARI m_vPrimId;                          // per-lane prim id offsets (see GetPrimID)
};
1375
// Primitive Assembler factory class, responsible for creating and initializing the correct assembler
// based on state.
// IsIndexedT / IsCutIndexEnabledT are compile-time bool wrappers selecting the
// cut-aware vs. optimized assembler path.
template <typename IsIndexedT, typename IsCutIndexEnabledT>
struct PA_FACTORY
{
    PA_FACTORY(DRAW_CONTEXT* pDC, PRIMITIVE_TOPOLOGY in_topo, uint32_t numVerts) : topo(in_topo)
    {
#if KNOB_ENABLE_CUT_AWARE_PA == TRUE
        const API_STATE& state = GetApiState(pDC);
        // Use the cut-aware PA for indexed draws with primitive restart on the
        // basic topologies, ...
        if ((IsIndexedT::value && IsCutIndexEnabledT::value && (
            topo == TOP_TRIANGLE_STRIP || topo == TOP_POINT_LIST ||
            topo == TOP_LINE_LIST || topo == TOP_LINE_STRIP ||
            topo == TOP_TRIANGLE_LIST)) ||

            // non-indexed draws with adjacency topologies must use cut-aware PA until we add support
            // for them in the optimized PA
            (topo == TOP_LINE_LIST_ADJ || topo == TOP_LISTSTRIP_ADJ || topo == TOP_TRI_LIST_ADJ || topo == TOP_TRI_STRIP_ADJ))
        {
            memset(&indexStore, 0, sizeof(indexStore));
            uint32_t numAttribs = state.feNumAttributes;

            // construct the cut-aware assembler in-place over the preallocated member
            new (&this->paCut) PA_STATE_CUT(pDC, (uint8_t*)&this->vertexStore[0], MAX_NUM_VERTS_PER_PRIM * PA_STATE::SIMD_WIDTH,
                &this->indexStore[0], numVerts, numAttribs, state.topology, false);
            cutPA = true;
        }
        else
#endif
        {
            // ... otherwise use the optimized assembler
            uint32_t numPrims = GetNumPrims(in_topo, numVerts);
            new (&this->paOpt) PA_STATE_OPT(pDC, numPrims, (uint8_t*)&this->vertexStore[0], MAX_NUM_VERTS_PER_PRIM * PA_STATE::SIMD_WIDTH, false);
            cutPA = false;
        }

    }

    // Return the assembler selected at construction time.
    PA_STATE& GetPA()
    {
#if KNOB_ENABLE_CUT_AWARE_PA == TRUE
        if (cutPA)
        {
            return this->paCut;
        }
        else
#endif
        {
            return this->paOpt;
        }
    }

    PA_STATE_OPT paOpt;         // optimized assembler (placement-new'd when !cutPA)
    PA_STATE_CUT paCut;         // cut-aware assembler (placement-new'd when cutPA)
    bool cutPA{ false };        // true when paCut is the active assembler

    PRIMITIVE_TOPOLOGY topo{ TOP_UNKNOWN };

    // backing storage handed to the constructed assembler
    PA_STATE::SIMDVERTEX vertexStore[MAX_NUM_VERTS_PER_PRIM];
    PA_STATE::SIMDMASK indexStore[MAX_NUM_VERTS_PER_PRIM];
};