src/gallium/drivers/swr/rasterizer/core/pa.h

   1 /****************************************************************************
   2 * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
   3 *
   4 * Permission is hereby granted, free of charge, to any person obtaining a
   5 * copy of this software and associated documentation files (the "Software"),
   6 * to deal in the Software without restriction, including without limitation
   7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8 * and/or sell copies of the Software, and to permit persons to whom the
   9 * Software is furnished to do so, subject to the following conditions:
  10 *
  11 * The above copyright notice and this permission notice (including the next
  12 * paragraph) shall be included in all copies or substantial portions of the
  13 * Software.
  14 *
  15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21 * IN THE SOFTWARE.
  22 *
  23 * @file pa.h
  24 *
  25 * @brief Definitions for primitive assembly.
  26 *        N primitives are assembled at a time, where N is the SIMD width.
  27 *        A state machine, that is specific for a given topology, drives the
  28 *        assembly of vertices into triangles.
  29 *
  30 ******************************************************************************/
  31 #pragma once
  32
  33 #include "frontend.h"
  34
  35 struct PA_STATE
  36 {
  37     DRAW_CONTEXT *pDC{ nullptr };              // draw context
  38     uint8_t* pStreamBase{ nullptr };           // vertex stream
  39     uint32_t streamSizeInVerts{ 0 };     // total size of the input stream in verts
  40
  41     // The topology the binner will use. In some cases the FE changes the topology from the api state.
  42     PRIMITIVE_TOPOLOGY binTopology{ TOP_UNKNOWN };
  43
  44     PA_STATE() {}
  45     PA_STATE(DRAW_CONTEXT *in_pDC, uint8_t* in_pStreamBase, uint32_t in_streamSizeInVerts) :
  46         pDC(in_pDC), pStreamBase(in_pStreamBase), streamSizeInVerts(in_streamSizeInVerts) {}
  47
  48     virtual bool HasWork() = 0;
  49     virtual simdvector& GetSimdVector(uint32_t index, uint32_t slot) = 0;
  50     virtual bool Assemble(uint32_t slot, simdvector verts[]) = 0;
  51     virtual void AssembleSingle(uint32_t slot, uint32_t primIndex, __m128 verts[]) = 0;
  52     virtual bool NextPrim() = 0;
  53     virtual simdvertex& GetNextVsOutput() = 0;
  54     virtual bool GetNextStreamOutput() = 0;
  55     virtual simdmask& GetNextVsIndices() = 0;
  56     virtual uint32_t NumPrims() = 0;
  57     virtual void Reset() = 0;
  58     virtual simdscalari GetPrimID(uint32_t startID) = 0;
  59 };
  60
  61 // The Optimized PA is a state machine that assembles triangles from vertex shader simd
  62 // output. Here is the sequence
  63 //    1. Execute FS/VS to generate a simd vertex (4 vertices for SSE simd and 8 for AVX simd).
  64 //    2. Execute PA function to assemble and bin triangles.
  65 //        a.    The PA function is a set of functions that collectively make up the
  66 //            state machine for a given topology.
  67 //                1.    We use a state index to track which PA function to call.
//        b. Often the PA function needs 2 simd vertices in order to assemble the next triangle.
  69 //                1.    We call this the current and previous simd vertex.
  70 //                2.    The SSE simd is 4-wide which is not a multiple of 3 needed for triangles. In
  71 //                    order to assemble the second triangle, for a triangle list, we'll need the
  72 //                    last vertex from the previous simd and the first 2 vertices from the current simd.
  73 //                3. At times the PA can assemble multiple triangles from the 2 simd vertices.
  74 //
  75 // This optimized PA is not cut aware, so only should be used by non-indexed draws or draws without
  76 // cuts
  77 struct PA_STATE_OPT : public PA_STATE
  78 {
  79     simdvertex leadingVertex;            // For tri-fan
  80     uint32_t numPrims{ 0 };              // Total number of primitives for draw.
  81     uint32_t numPrimsComplete{ 0 };      // Total number of complete primitives.
  82
  83     uint32_t numSimdPrims{ 0 };          // Number of prims in current simd.
  84
  85     uint32_t cur{ 0 };                   // index to current VS output.
  86     uint32_t prev{ 0 };                  // index to prev VS output. Not really needed in the state.
  87     uint32_t first{ 0 };                 // index to first VS output. Used for trifan.
  88
  89     uint32_t counter{ 0 };               // state counter
  90     bool reset{ false };                 // reset state
  91
  92     uint32_t primIDIncr{ 0 };            // how much to increment for each vector (typically vector / {1, 2})
  93     simdscalari primID;
  94
  95     typedef bool(*PFN_PA_FUNC)(PA_STATE_OPT& state, uint32_t slot, simdvector verts[]);
  96     typedef void(*PFN_PA_SINGLE_FUNC)(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]);
  97
  98     PFN_PA_FUNC        pfnPaFunc{ nullptr };        // PA state machine function for assembling 4 triangles.
  99     PFN_PA_SINGLE_FUNC pfnPaSingleFunc{ nullptr };  // PA state machine function for assembling single triangle.
 100     PFN_PA_FUNC        pfnPaFuncReset{ nullptr };   // initial state to set on reset
 101
 102     // state used to advance the PA when Next is called
 103     PFN_PA_FUNC        pfnPaNextFunc{ nullptr };
 104     uint32_t           nextNumSimdPrims{ 0 };
 105     uint32_t           nextNumPrimsIncrement{ 0 };
 106     bool               nextReset{ false };
 107     bool               isStreaming{ false };
 108
 109     simdmask tmpIndices{ 0 };            // temporary index store for unused virtual function
 110
 111     PA_STATE_OPT() {}
 112     PA_STATE_OPT(DRAW_CONTEXT* pDC, uint32_t numPrims, uint8_t* pStream, uint32_t streamSizeInVerts,
 113         bool in_isStreaming, PRIMITIVE_TOPOLOGY topo = TOP_UNKNOWN);
 114
 115     bool HasWork()
 116     {
 117         return (this->numPrimsComplete < this->numPrims) ? true : false;
 118     }
 119
 120     simdvector& GetSimdVector(uint32_t index, uint32_t slot)
 121     {
 122         simdvertex* pVertex = (simdvertex*)pStreamBase;
 123         return pVertex[index].attrib[slot];
 124     }
 125
 126     // Assembles 4 triangles. Each simdvector is a single vertex from 4
 127     // triangles (xxxx yyyy zzzz wwww) and there are 3 verts per triangle.
 128     bool Assemble(uint32_t slot, simdvector verts[])
 129     {
 130         return this->pfnPaFunc(*this, slot, verts);
 131     }
 132
 133     // Assembles 1 primitive. Each simdscalar is a vertex (xyzw).
 134     void AssembleSingle(uint32_t slot, uint32_t primIndex, __m128 verts[])
 135     {
 136         return this->pfnPaSingleFunc(*this, slot, primIndex, verts);
 137     }
 138
 139     bool NextPrim()
 140     {
 141         this->pfnPaFunc = this->pfnPaNextFunc;
 142         this->numSimdPrims = this->nextNumSimdPrims;
 143         this->numPrimsComplete += this->nextNumPrimsIncrement;
 144         this->reset = this->nextReset;
 145
 146         if (this->isStreaming)
 147         {
 148             this->reset = false;
 149         }
 150
 151         bool morePrims = false;
 152
 153         if (this->numSimdPrims > 0)
 154         {
 155             morePrims = true;
 156             this->numSimdPrims--;
 157         }
 158         else
 159         {
 160             this->counter = (this->reset) ? 0 : (this->counter + 1);
 161             this->reset = false;
 162         }
 163
 164         this->pfnPaFunc = this->pfnPaNextFunc;
 165
 166         if (!HasWork())
 167         {
 168             morePrims = false;    // no more to do
 169         }
 170
 171         return morePrims;
 172     }
 173
 174     simdvertex& GetNextVsOutput()
 175     {
 176         // increment cur and prev indices
 177         const uint32_t numSimdVerts = this->streamSizeInVerts / KNOB_SIMD_WIDTH;
 178         this->prev = this->cur;  // prev is undefined for first state.
 179         this->cur = this->counter % numSimdVerts;
 180
 181         simdvertex* pVertex = (simdvertex*)pStreamBase;
 182         return pVertex[this->cur];
 183     }
 184
 185     simdmask& GetNextVsIndices()
 186     {
 187         // unused in optimized PA, pass tmp buffer back
 188         return tmpIndices;
 189     }
 190
 191     bool GetNextStreamOutput()
 192     {
 193         this->prev = this->cur;
 194         this->cur = this->counter;
 195
 196         return HasWork();
 197     }
 198
 199     uint32_t NumPrims()
 200     {
 201         return (this->numPrimsComplete + this->nextNumPrimsIncrement > this->numPrims) ?
 202             (KNOB_SIMD_WIDTH - (this->numPrimsComplete + this->nextNumPrimsIncrement - this->numPrims)) : KNOB_SIMD_WIDTH;
 203     }
 204
 205     void SetNextState(PA_STATE_OPT::PFN_PA_FUNC pfnPaNextFunc,
 206         PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc,
 207         uint32_t numSimdPrims = 0,
 208         uint32_t numPrimsIncrement = 0,
 209         bool reset = false)
 210     {
 211         this->pfnPaNextFunc = pfnPaNextFunc;
 212         this->nextNumSimdPrims = numSimdPrims;
 213         this->nextNumPrimsIncrement = numPrimsIncrement;
 214         this->nextReset = reset;
 215
 216         this->pfnPaSingleFunc = pfnPaNextSingleFunc;
 217     }
 218
 219     void Reset()
 220     {
 221         this->pfnPaFunc = this->pfnPaFuncReset;
 222         this->numPrimsComplete = 0;
 223         this->numSimdPrims = 0;
 224         this->cur = 0;
 225         this->prev = 0;
 226         this->first = 0;
 227         this->counter = 0;
 228         this->reset = false;
 229     }
 230
 231     simdscalari GetPrimID(uint32_t startID)
 232     {
 233         return _simd_add_epi32(this->primID,
 234             _simd_set1_epi32(startID + this->primIDIncr * (this->numPrimsComplete / KNOB_SIMD_WIDTH)));
 235     }
 236 };
 237
 238 // helper C wrappers to avoid having to rewrite all the PA topology state functions
 239 INLINE void SetNextPaState(PA_STATE_OPT& pa, PA_STATE_OPT::PFN_PA_FUNC pfnPaNextFunc,
 240     PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc,
 241     uint32_t numSimdPrims = 0,
 242     uint32_t numPrimsIncrement = 0,
 243     bool reset = false)
 244 {
 245     return pa.SetNextState(pfnPaNextFunc, pfnPaNextSingleFunc, numSimdPrims, numPrimsIncrement, reset);
 246 }
 247 INLINE simdvector& PaGetSimdVector(PA_STATE& pa, uint32_t index, uint32_t slot)
 248 {
 249     return pa.GetSimdVector(index, slot);
 250 }
 251
 252 INLINE __m128 swizzleLane0(const simdvector &a)
 253 {
 254     simdscalar tmp0 = _mm256_unpacklo_ps(a.x, a.z);
 255     simdscalar tmp1 = _mm256_unpacklo_ps(a.y, a.w);
 256     return _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 0);
 257 }
 258
 259 INLINE __m128 swizzleLane1(const simdvector &a)
 260 {
 261     simdscalar tmp0 = _mm256_unpacklo_ps(a.x, a.z);
 262     simdscalar tmp1 = _mm256_unpacklo_ps(a.y, a.w);
 263     return _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 0);
 264 }
 265
 266 INLINE __m128 swizzleLane2(const simdvector &a)
 267 {
 268     simdscalar tmp0 = _mm256_unpackhi_ps(a.x, a.z);
 269     simdscalar tmp1 = _mm256_unpackhi_ps(a.y, a.w);
 270     return _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 0);
 271 }
 272
 273 INLINE __m128 swizzleLane3(const simdvector &a)
 274 {
 275     simdscalar tmp0 = _mm256_unpackhi_ps(a.x, a.z);
 276     simdscalar tmp1 = _mm256_unpackhi_ps(a.y, a.w);
 277     return _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 0);
 278 }
 279
 280 INLINE __m128 swizzleLane4(const simdvector &a)
 281 {
 282     simdscalar tmp0 = _mm256_unpacklo_ps(a.x, a.z);
 283     simdscalar tmp1 = _mm256_unpacklo_ps(a.y, a.w);
 284     return _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 1);
 285
 286 }
 287
 288 INLINE __m128 swizzleLane5(const simdvector &a)
 289 {
 290     simdscalar tmp0 = _mm256_unpacklo_ps(a.x, a.z);
 291     simdscalar tmp1 = _mm256_unpacklo_ps(a.y, a.w);
 292     return _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 1);
 293 }
 294
 295 INLINE __m128 swizzleLane6(const simdvector &a)
 296 {
 297     simdscalar tmp0 = _mm256_unpackhi_ps(a.x, a.z);
 298     simdscalar tmp1 = _mm256_unpackhi_ps(a.y, a.w);
 299     return _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 1);
 300 }
 301
 302 INLINE __m128 swizzleLane7(const simdvector &a)
 303 {
 304     simdscalar tmp0 = _mm256_unpackhi_ps(a.x, a.z);
 305     simdscalar tmp1 = _mm256_unpackhi_ps(a.y, a.w);
 306     return _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 1);
 307 }
 308
 309 INLINE __m128 swizzleLaneN(const simdvector &a, int lane)
 310 {
 311     switch (lane) {
 312     case 0:
 313         return swizzleLane0(a);
 314     case 1:
 315         return swizzleLane1(a);
 316     case 2:
 317         return swizzleLane2(a);
 318     case 3:
 319         return swizzleLane3(a);
 320     case 4:
 321         return swizzleLane4(a);
 322     case 5:
 323         return swizzleLane5(a);
 324     case 6:
 325         return swizzleLane6(a);
 326     case 7:
 327         return swizzleLane7(a);
 328     default:
 329         return _mm_setzero_ps();
 330     }
 331 }
 332
 333 // Cut-aware primitive assembler.
 334 struct PA_STATE_CUT : public PA_STATE
 335 {
 336     simdmask* pCutIndices{ nullptr };    // cut indices buffer, 1 bit per vertex
 337     uint32_t numVerts{ 0 };              // number of vertices available in buffer store
 338     uint32_t numAttribs{ 0 };            // number of attributes
 339     int32_t numRemainingVerts{ 0 };      // number of verts remaining to be assembled
 340     uint32_t numVertsToAssemble{ 0 };    // total number of verts to assemble for the draw
 341     OSALIGNSIMD(uint32_t) indices[MAX_NUM_VERTS_PER_PRIM][KNOB_SIMD_WIDTH];    // current index buffer for gather
 342     simdscalari vOffsets[MAX_NUM_VERTS_PER_PRIM];           // byte offsets for currently assembling simd
 343     uint32_t numPrimsAssembled{ 0 };     // number of primitives that are fully assembled
 344     uint32_t headVertex{ 0 };            // current unused vertex slot in vertex buffer store
 345     uint32_t tailVertex{ 0 };            // beginning vertex currently assembling
 346     uint32_t curVertex{ 0 };             // current unprocessed vertex
 347     uint32_t startPrimId{ 0 };           // starting prim id
 348     simdscalari vPrimId;                 // vector of prim ID
 349     bool needOffsets{ false };           // need to compute gather offsets for current SIMD
 350     uint32_t vertsPerPrim{ 0 };
 351     simdvertex tmpVertex;                // temporary simdvertex for unimplemented API
 352     bool processCutVerts{ false };       // vertex indices with cuts should be processed as normal, otherwise they
 353                                          // are ignored.  Fetch shader sends invalid verts on cuts that should be ignored
 354                                          // while the GS sends valid verts for every index
 355     // Topology state tracking
 356     uint32_t vert[MAX_NUM_VERTS_PER_PRIM];
 357     uint32_t curIndex{ 0 };
 358     bool reverseWinding{ false };        // indicates reverse winding for strips
 359     int32_t adjExtraVert{ 0 };           // extra vert uses for tristrip w/ adj
 360
 361     typedef void(PA_STATE_CUT::* PFN_PA_FUNC)(uint32_t vert, bool finish);
 362     PFN_PA_FUNC pfnPa{ nullptr };        // per-topology function that processes a single vert
 363
 364     PA_STATE_CUT() {}
 365     PA_STATE_CUT(DRAW_CONTEXT* pDC, uint8_t* in_pStream, uint32_t in_streamSizeInVerts, simdmask* in_pIndices, uint32_t in_numVerts,
 366         uint32_t in_numAttribs, PRIMITIVE_TOPOLOGY topo, bool in_processCutVerts)
 367         : PA_STATE(pDC, in_pStream, in_streamSizeInVerts)
 368     {
 369         numVerts = in_streamSizeInVerts;
 370         numAttribs = in_numAttribs;
 371         binTopology = topo;
 372         needOffsets = false;
 373         processCutVerts = in_processCutVerts;
 374
 375         numVertsToAssemble = numRemainingVerts = in_numVerts;
 376         numPrimsAssembled = 0;
 377         headVertex = tailVertex = curVertex = 0;
 378
 379         curIndex = 0;
 380         pCutIndices = in_pIndices;
 381         memset(indices, 0, sizeof(indices));
 382         vPrimId = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
 383         reverseWinding = false;
 384         adjExtraVert = -1;
 385
 386         bool gsEnabled = pDC->pState->state.gsState.gsEnable;
 387         vertsPerPrim = NumVertsPerPrim(topo, gsEnabled);
 388
 389         switch (topo)
 390         {
 391         case TOP_TRIANGLE_LIST:     pfnPa = &PA_STATE_CUT::ProcessVertTriList; break;
 392         case TOP_TRI_LIST_ADJ:      pfnPa = gsEnabled ? &PA_STATE_CUT::ProcessVertTriListAdj : &PA_STATE_CUT::ProcessVertTriListAdjNoGs; break;
 393         case TOP_TRIANGLE_STRIP:    pfnPa = &PA_STATE_CUT::ProcessVertTriStrip; break;
 394         case TOP_TRI_STRIP_ADJ:     if (gsEnabled)
 395                                     {
 396                                         pfnPa = &PA_STATE_CUT::ProcessVertTriStripAdj < true > ;
 397                                     }
 398                                     else
 399                                     {
 400                                         pfnPa = &PA_STATE_CUT::ProcessVertTriStripAdj < false > ;
 401                                     }
 402                                     break;
 403
 404         case TOP_POINT_LIST:        pfnPa = &PA_STATE_CUT::ProcessVertPointList; break;
 405         case TOP_LINE_LIST:         pfnPa = &PA_STATE_CUT::ProcessVertLineList; break;
 406         case TOP_LINE_LIST_ADJ:     pfnPa = gsEnabled ? &PA_STATE_CUT::ProcessVertLineListAdj : &PA_STATE_CUT::ProcessVertLineListAdjNoGs; break;
 407         case TOP_LINE_STRIP:        pfnPa = &PA_STATE_CUT::ProcessVertLineStrip; break;
 408         case TOP_LISTSTRIP_ADJ:     pfnPa = gsEnabled ? &PA_STATE_CUT::ProcessVertLineStripAdj : &PA_STATE_CUT::ProcessVertLineStripAdjNoGs; break;
 409         default: assert(0 && "Unimplemented topology");
 410         }
 411     }
 412
 413     simdvertex& GetNextVsOutput()
 414     {
 415         uint32_t vertexIndex = this->headVertex / KNOB_SIMD_WIDTH;
 416         this->headVertex = (this->headVertex + KNOB_SIMD_WIDTH) % this->numVerts;
 417         this->needOffsets = true;
 418         return ((simdvertex*)pStreamBase)[vertexIndex];
 419     }
 420
 421     simdmask& GetNextVsIndices()
 422     {
 423         uint32_t vertexIndex = this->headVertex / KNOB_SIMD_WIDTH;
 424         simdmask* pCurCutIndex = this->pCutIndices + vertexIndex;
 425         return *pCurCutIndex;
 426     }
 427
 428     simdvector& GetSimdVector(uint32_t index, uint32_t slot)
 429     {
 430         // unused
 431         SWR_ASSERT(0 && "Not implemented");
 432         return this->tmpVertex.attrib[0];
 433     }
 434
 435     bool GetNextStreamOutput()
 436     {
 437         this->headVertex += KNOB_SIMD_WIDTH;
 438         this->needOffsets = true;
 439         return HasWork();
 440     }
 441
 442     simdscalari GetPrimID(uint32_t startID)
 443     {
 444         return _simd_add_epi32(_simd_set1_epi32(startID), this->vPrimId);
 445     }
 446
 447     void Reset()
 448     {
 449         this->numRemainingVerts = this->numVertsToAssemble;
 450         this->numPrimsAssembled = 0;
 451         this->curIndex = 0;
 452         this->curVertex = 0;
 453         this->tailVertex = 0;
 454         this->headVertex = 0;
 455         this->reverseWinding = false;
 456         this->adjExtraVert = -1;
 457         this->vPrimId = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
 458     }
 459
 460     bool HasWork()
 461     {
 462         return this->numRemainingVerts > 0 || this->adjExtraVert != -1;
 463     }
 464
 465     bool IsVertexStoreFull()
 466     {
 467         return ((this->headVertex + KNOB_SIMD_WIDTH) % this->numVerts) == this->tailVertex;
 468     }
 469
 470     void RestartTopology()
 471     {
 472         this->curIndex = 0;
 473         this->reverseWinding = false;
 474         this->adjExtraVert = -1;
 475     }
 476
 477     bool IsCutIndex(uint32_t vertex)
 478     {
 479         uint32_t vertexIndex = vertex / KNOB_SIMD_WIDTH;
 480         uint32_t vertexOffset = vertex & (KNOB_SIMD_WIDTH - 1);
 481         return _bittest((const LONG*)&this->pCutIndices[vertexIndex], vertexOffset) == 1;
 482     }
 483
 484     // iterates across the unprocessed verts until we hit the end or we
 485     // have assembled SIMD prims
 486     void ProcessVerts()
 487     {
 488         while (this->numPrimsAssembled != KNOB_SIMD_WIDTH &&
 489             this->numRemainingVerts > 0 &&
 490             this->curVertex != this->headVertex)
 491         {
 492             // if cut index, restart topology
 493             if (IsCutIndex(this->curVertex))
 494             {
 495                 if (this->processCutVerts)
 496                 {
 497                     (this->*pfnPa)(this->curVertex, false);
 498                 }
 499                 // finish off tri strip w/ adj before restarting topo
 500                 if (this->adjExtraVert != -1)
 501                 {
 502                     (this->*pfnPa)(this->curVertex, true);
 503                 }
 504                 RestartTopology();
 505             }
 506             else
 507             {
 508                 (this->*pfnPa)(this->curVertex, false);
 509             }
 510
 511             this->curVertex++;
 512             if (this->curVertex >= this->numVerts) {
 513                this->curVertex = 0;
 514             }
 515             this->numRemainingVerts--;
 516         }
 517
 518         // special case last primitive for tri strip w/ adj
 519         if (this->numPrimsAssembled != KNOB_SIMD_WIDTH && this->numRemainingVerts == 0 && this->adjExtraVert != -1)
 520         {
 521             (this->*pfnPa)(this->curVertex, true);
 522         }
 523     }
 524
 525     void Advance()
 526     {
 527         // done with current batch
 528         // advance tail to the current unsubmitted vertex
 529         this->tailVertex = this->curVertex;
 530         this->numPrimsAssembled = 0;
 531         this->vPrimId = _simd_add_epi32(vPrimId, _simd_set1_epi32(KNOB_SIMD_WIDTH));
 532     }
 533
 534     bool NextPrim()
 535     {
 536         // if we've assembled enough prims, we can advance to the next set of verts
 537         if (this->numPrimsAssembled == KNOB_SIMD_WIDTH || this->numRemainingVerts <= 0)
 538         {
 539             Advance();
 540         }
 541         return false;
 542     }
 543
 544     void ComputeOffsets()
 545     {
 546         for (uint32_t v = 0; v < this->vertsPerPrim; ++v)
 547         {
 548             simdscalari vIndices = *(simdscalari*)&this->indices[v][0];
 549
 550             // step to simdvertex batch
 551             const uint32_t simdShift = 3; // @todo make knob
 552             simdscalari vVertexBatch = _simd_srai_epi32(vIndices, simdShift);
 553             this->vOffsets[v] = _simd_mullo_epi32(vVertexBatch, _simd_set1_epi32(sizeof(simdvertex)));
 554
 555             // step to index
 556             const uint32_t simdMask = 0x7; // @todo make knob
 557             simdscalari vVertexIndex = _simd_and_si(vIndices, _simd_set1_epi32(simdMask));
 558             this->vOffsets[v] = _simd_add_epi32(this->vOffsets[v], _simd_mullo_epi32(vVertexIndex, _simd_set1_epi32(sizeof(float))));
 559         }
 560     }
 561
 562     bool Assemble(uint32_t slot, simdvector result[])
 563     {
 564         // process any outstanding verts
 565         ProcessVerts();
 566
 567         // return false if we don't have enough prims assembled
 568         if (this->numPrimsAssembled != KNOB_SIMD_WIDTH && this->numRemainingVerts > 0)
 569         {
 570             return false;
 571         }
 572
 573         // cache off gather offsets given the current SIMD set of indices the first time we get an assemble
 574         if (this->needOffsets)
 575         {
 576             ComputeOffsets();
 577             this->needOffsets = false;
 578         }
 579
 580         for (uint32_t v = 0; v < this->vertsPerPrim; ++v)
 581         {
 582             simdscalari offsets = this->vOffsets[v];
 583
 584             // step to attribute
 585             offsets = _simd_add_epi32(offsets, _simd_set1_epi32(slot * sizeof(simdvector)));
 586
 587             float* pBase = (float*)this->pStreamBase;
 588             for (uint32_t c = 0; c < 4; ++c)
 589             {
 590                 result[v].v[c] = _simd_i32gather_ps(pBase, offsets, 1);
 591
 592                 // move base to next component
 593                 pBase += KNOB_SIMD_WIDTH;
 594             }
 595         }
 596
 597         return true;
 598     }
 599
 600     void AssembleSingle(uint32_t slot, uint32_t triIndex, __m128 tri[3])
 601     {
 602         // move to slot
 603         for (uint32_t v = 0; v < this->vertsPerPrim; ++v)
 604         {
 605             uint32_t* pOffset = (uint32_t*)&this->vOffsets[v];
 606             uint32_t offset = pOffset[triIndex];
 607             offset += sizeof(simdvector) * slot;
 608             float* pVert = (float*)&tri[v];
 609             for (uint32_t c = 0; c < 4; ++c)
 610             {
 611                 float* pComponent = (float*)(this->pStreamBase + offset);
 612                 pVert[c] = *pComponent;
 613                 offset += KNOB_SIMD_WIDTH * sizeof(float);
 614             }
 615         }
 616     }
 617
 618     uint32_t NumPrims()
 619     {
 620         return this->numPrimsAssembled;
 621     }
 622
 623     // Per-topology functions
 624     void ProcessVertTriStrip(uint32_t index, bool finish)
 625     {
 626         this->vert[this->curIndex] = index;
 627         this->curIndex++;
 628         if (this->curIndex == 3)
 629         {
 630             // assembled enough verts for prim, add to gather indices
 631             this->indices[0][this->numPrimsAssembled] = this->vert[0];
 632             if (reverseWinding)
 633             {
 634                 this->indices[1][this->numPrimsAssembled] = this->vert[2];
 635                 this->indices[2][this->numPrimsAssembled] = this->vert[1];
 636             }
 637             else
 638             {
 639                 this->indices[1][this->numPrimsAssembled] = this->vert[1];
 640                 this->indices[2][this->numPrimsAssembled] = this->vert[2];
 641             }
 642
 643             // increment numPrimsAssembled
 644             this->numPrimsAssembled++;
 645
 646             // set up next prim state
 647             this->vert[0] = this->vert[1];
 648             this->vert[1] = this->vert[2];
 649             this->curIndex = 2;
 650             this->reverseWinding ^= 1;
 651         }
 652     }
 653
 654     template<bool gsEnabled>
 655     void AssembleTriStripAdj()
 656     {
 657         if (!gsEnabled)
 658         {
 659             this->vert[1] = this->vert[2];
 660             this->vert[2] = this->vert[4];
 661
 662             this->indices[0][this->numPrimsAssembled] = this->vert[0];
 663             this->indices[1][this->numPrimsAssembled] = this->vert[1];
 664             this->indices[2][this->numPrimsAssembled] = this->vert[2];
 665
 666             this->vert[4] = this->vert[2];
 667             this->vert[2] = this->vert[1];
 668         }
 669         else
 670         {
 671             this->indices[0][this->numPrimsAssembled] = this->vert[0];
 672             this->indices[1][this->numPrimsAssembled] = this->vert[1];
 673             this->indices[2][this->numPrimsAssembled] = this->vert[2];
 674             this->indices[3][this->numPrimsAssembled] = this->vert[3];
 675             this->indices[4][this->numPrimsAssembled] = this->vert[4];
 676             this->indices[5][this->numPrimsAssembled] = this->vert[5];
 677         }
 678         this->numPrimsAssembled++;
 679     }
 680
 681
 682     template<bool gsEnabled>
 683     void ProcessVertTriStripAdj(uint32_t index, bool finish)
 684     {
 685         // handle last primitive of tristrip
 686         if (finish && this->adjExtraVert != -1)
 687         {
 688             this->vert[3] = this->adjExtraVert;
 689             AssembleTriStripAdj<gsEnabled>();
 690             this->adjExtraVert = -1;
 691             return;
 692         }
 693
 694         switch (this->curIndex)
 695         {
 696         case 0:
 697         case 1:
 698         case 2:
 699         case 4:
 700             this->vert[this->curIndex] = index;
 701             this->curIndex++;
 702             break;
 703         case 3:
 704             this->vert[5] = index;
 705             this->curIndex++;
 706             break;
 707         case 5:
 708             if (this->adjExtraVert == -1)
 709             {
 710                 this->adjExtraVert = index;
 711             }
 712             else
 713             {
 714                 this->vert[3] = index;
 715                 if (!gsEnabled)
 716                 {
 717                     AssembleTriStripAdj<gsEnabled>();
 718
 719                     uint32_t nextTri[6];
 720                     if (this->reverseWinding)
 721                     {
 722                         nextTri[0] = this->vert[4];
 723                         nextTri[1] = this->vert[0];
 724                         nextTri[2] = this->vert[2];
 725                         nextTri[4] = this->vert[3];
 726                         nextTri[5] = this->adjExtraVert;
 727                     }
 728                     else
 729                     {
 730                         nextTri[0] = this->vert[2];
 731                         nextTri[1] = this->adjExtraVert;
 732                         nextTri[2] = this->vert[3];
 733                         nextTri[4] = this->vert[4];
 734                         nextTri[5] = this->vert[0];
 735                     }
 736                     for (uint32_t i = 0; i < 6; ++i)
 737                     {
 738                         this->vert[i] = nextTri[i];
 739                     }
 740
 741                     this->adjExtraVert = -1;
 742                     this->reverseWinding ^= 1;
 743                 }
 744                 else
 745                 {
 746                     this->curIndex++;
 747                 }
 748             }
 749             break;
 750         case 6:
 751             SWR_ASSERT(this->adjExtraVert != -1, "Algorith failure!");
 752             AssembleTriStripAdj<gsEnabled>();
 753
 754             uint32_t nextTri[6];
 755             if (this->reverseWinding)
 756             {
 757                 nextTri[0] = this->vert[4];
 758                 nextTri[1] = this->vert[0];
 759                 nextTri[2] = this->vert[2];
 760                 nextTri[4] = this->vert[3];
 761                 nextTri[5] = this->adjExtraVert;
 762             }
 763             else
 764             {
 765                 nextTri[0] = this->vert[2];
 766                 nextTri[1] = this->adjExtraVert;
 767                 nextTri[2] = this->vert[3];
 768                 nextTri[4] = this->vert[4];
 769                 nextTri[5] = this->vert[0];
 770             }
 771             for (uint32_t i = 0; i < 6; ++i)
 772             {
 773                 this->vert[i] = nextTri[i];
 774             }
 775             this->reverseWinding ^= 1;
 776             this->adjExtraVert = index;
 777             this->curIndex--;
 778             break;
 779         }
 780     }
 781
 782     void ProcessVertTriList(uint32_t index, bool finish)
 783     {
 784         this->vert[this->curIndex] = index;
 785         this->curIndex++;
 786         if (this->curIndex == 3)
 787         {
 788             // assembled enough verts for prim, add to gather indices
 789             this->indices[0][this->numPrimsAssembled] = this->vert[0];
 790             this->indices[1][this->numPrimsAssembled] = this->vert[1];
 791             this->indices[2][this->numPrimsAssembled] = this->vert[2];
 792
 793             // increment numPrimsAssembled
 794             this->numPrimsAssembled++;
 795
 796             // set up next prim state
 797             this->curIndex = 0;
 798         }
 799     }
 800
 801     void ProcessVertTriListAdj(uint32_t index, bool finish)
 802     {
 803         this->vert[this->curIndex] = index;
 804         this->curIndex++;
 805         if (this->curIndex == 6)
 806         {
 807             // assembled enough verts for prim, add to gather indices
 808             this->indices[0][this->numPrimsAssembled] = this->vert[0];
 809             this->indices[1][this->numPrimsAssembled] = this->vert[1];
 810             this->indices[2][this->numPrimsAssembled] = this->vert[2];
 811             this->indices[3][this->numPrimsAssembled] = this->vert[3];
 812             this->indices[4][this->numPrimsAssembled] = this->vert[4];
 813             this->indices[5][this->numPrimsAssembled] = this->vert[5];
 814
 815             // increment numPrimsAssembled
 816             this->numPrimsAssembled++;
 817
 818             // set up next prim state
 819             this->curIndex = 0;
 820         }
 821     }
 822
 823     void ProcessVertTriListAdjNoGs(uint32_t index, bool finish)
 824     {
 825         this->vert[this->curIndex] = index;
 826         this->curIndex++;
 827         if (this->curIndex == 6)
 828         {
 829             // assembled enough verts for prim, add to gather indices
 830             this->indices[0][this->numPrimsAssembled] = this->vert[0];
 831             this->indices[1][this->numPrimsAssembled] = this->vert[2];
 832             this->indices[2][this->numPrimsAssembled] = this->vert[4];
 833
 834             // increment numPrimsAssembled
 835             this->numPrimsAssembled++;
 836
 837             // set up next prim state
 838             this->curIndex = 0;
 839         }
 840     }
 841
 842
 843     void ProcessVertLineList(uint32_t index, bool finish)
 844     {
 845         this->vert[this->curIndex] = index;
 846         this->curIndex++;
 847         if (this->curIndex == 2)
 848         {
 849             this->indices[0][this->numPrimsAssembled] = this->vert[0];
 850             this->indices[1][this->numPrimsAssembled] = this->vert[1];
 851
 852             this->numPrimsAssembled++;
 853             this->curIndex = 0;
 854         }
 855     }
 856
 857     void ProcessVertLineStrip(uint32_t index, bool finish)
 858     {
 859         this->vert[this->curIndex] = index;
 860         this->curIndex++;
 861         if (this->curIndex == 2)
 862         {
 863             // assembled enough verts for prim, add to gather indices
 864             this->indices[0][this->numPrimsAssembled] = this->vert[0];
 865             this->indices[1][this->numPrimsAssembled] = this->vert[1];
 866
 867             // increment numPrimsAssembled
 868             this->numPrimsAssembled++;
 869
 870             // set up next prim state
 871             this->vert[0] = this->vert[1];
 872             this->curIndex = 1;
 873         }
 874     }
 875
 876     void ProcessVertLineStripAdj(uint32_t index, bool finish)
 877     {
 878         this->vert[this->curIndex] = index;
 879         this->curIndex++;
 880         if (this->curIndex == 4)
 881         {
 882             // assembled enough verts for prim, add to gather indices
 883             this->indices[0][this->numPrimsAssembled] = this->vert[0];
 884             this->indices[1][this->numPrimsAssembled] = this->vert[1];
 885             this->indices[2][this->numPrimsAssembled] = this->vert[2];
 886             this->indices[3][this->numPrimsAssembled] = this->vert[3];
 887
 888             // increment numPrimsAssembled
 889             this->numPrimsAssembled++;
 890
 891             // set up next prim state
 892             this->vert[0] = this->vert[1];
 893             this->vert[1] = this->vert[2];
 894             this->vert[2] = this->vert[3];
 895             this->curIndex = 3;
 896         }
 897     }
 898
 899     void ProcessVertLineStripAdjNoGs(uint32_t index, bool finish)
 900     {
 901         this->vert[this->curIndex] = index;
 902         this->curIndex++;
 903         if (this->curIndex == 4)
 904         {
 905             // assembled enough verts for prim, add to gather indices
 906             this->indices[0][this->numPrimsAssembled] = this->vert[1];
 907             this->indices[1][this->numPrimsAssembled] = this->vert[2];
 908
 909             // increment numPrimsAssembled
 910             this->numPrimsAssembled++;
 911
 912             // set up next prim state
 913             this->vert[0] = this->vert[1];
 914             this->vert[1] = this->vert[2];
 915             this->vert[2] = this->vert[3];
 916             this->curIndex = 3;
 917         }
 918     }
 919
 920     void ProcessVertLineListAdj(uint32_t index, bool finish)
 921     {
 922         this->vert[this->curIndex] = index;
 923         this->curIndex++;
 924         if (this->curIndex == 4)
 925         {
 926             this->indices[0][this->numPrimsAssembled] = this->vert[0];
 927             this->indices[1][this->numPrimsAssembled] = this->vert[1];
 928             this->indices[2][this->numPrimsAssembled] = this->vert[2];
 929             this->indices[3][this->numPrimsAssembled] = this->vert[3];
 930
 931             this->numPrimsAssembled++;
 932             this->curIndex = 0;
 933         }
 934     }
 935
 936     void ProcessVertLineListAdjNoGs(uint32_t index, bool finish)
 937     {
 938         this->vert[this->curIndex] = index;
 939         this->curIndex++;
 940         if (this->curIndex == 4)
 941         {
 942             this->indices[0][this->numPrimsAssembled] = this->vert[1];
 943             this->indices[1][this->numPrimsAssembled] = this->vert[2];
 944
 945             this->numPrimsAssembled++;
 946             this->curIndex = 0;
 947         }
 948     }
 949
 950     void ProcessVertPointList(uint32_t index, bool finish)
 951     {
 952         this->vert[this->curIndex] = index;
 953         this->curIndex++;
 954         if (this->curIndex == 1)
 955         {
 956             this->indices[0][this->numPrimsAssembled] = this->vert[0];
 957             this->numPrimsAssembled++;
 958             this->curIndex = 0;
 959         }
 960     }
 961 };
 962
 963 // Primitive Assembly for data output from the DomainShader.
 964 struct PA_TESS : PA_STATE
 965 {
 966     PA_TESS(
 967         DRAW_CONTEXT *in_pDC,
 968         const simdscalar* in_pVertData,
 969         uint32_t in_attributeStrideInVectors,
 970         uint32_t in_numAttributes,
 971         uint32_t* (&in_ppIndices)[3],
 972         uint32_t in_numPrims,
 973         PRIMITIVE_TOPOLOGY in_binTopology) :
 974
 975         PA_STATE(in_pDC, nullptr, 0),
 976         m_pVertexData(in_pVertData),
 977         m_attributeStrideInVectors(in_attributeStrideInVectors),
 978         m_numAttributes(in_numAttributes),
 979         m_numPrims(in_numPrims)
 980     {
 981         m_vPrimId = _simd_setzero_si();
 982         binTopology = in_binTopology;
 983         m_ppIndices[0] = in_ppIndices[0];
 984         m_ppIndices[1] = in_ppIndices[1];
 985         m_ppIndices[2] = in_ppIndices[2];
 986
 987         switch (binTopology)
 988         {
 989         case TOP_POINT_LIST:
 990             m_numVertsPerPrim = 1;
 991             break;
 992
 993         case TOP_LINE_LIST:
 994             m_numVertsPerPrim = 2;
 995             break;
 996
 997         case TOP_TRIANGLE_LIST:
 998             m_numVertsPerPrim = 3;
 999             break;
1000
1001         default:
1002             SWR_ASSERT(0, "Invalid binTopology (%d) for %s", binTopology, __FUNCTION__);
1003             break;
1004         }
1005     }
1006
1007     bool HasWork()
1008     {
1009         return m_numPrims != 0;
1010     }
1011
1012     simdvector& GetSimdVector(uint32_t index, uint32_t slot)
1013     {
1014         SWR_ASSERT(0, "%s NOT IMPLEMENTED", __FUNCTION__);
1015         static simdvector junk;
1016         return junk;
1017     }
1018
1019     static simdscalari GenPrimMask(uint32_t numPrims)
1020     {
1021         SWR_ASSERT(numPrims <= KNOB_SIMD_WIDTH);
1022 #if KNOB_SIMD_WIDTH == 8
1023         static const OSALIGNLINE(int32_t) maskGen[KNOB_SIMD_WIDTH * 2] =
1024         {
1025             -1, -1, -1, -1, -1, -1, -1, -1,
1026              0,  0,  0,  0,  0,  0,  0,  0
1027         };
1028 #elif KNOB_SIMD_WIDTH == 16
1029         static const OSALIGNLINE(int32_t) maskGen[KNOB_SIMD_WIDTH * 2] =
1030         {
1031             -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
1032              0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
1033         };
1034 #else
1035 #error "Help, help, I can't get up!"
1036 #endif
1037
1038         return _simd_loadu_si((const simdscalari*)&maskGen[KNOB_SIMD_WIDTH - numPrims]);
1039     }
1040
1041     bool Assemble(uint32_t slot, simdvector verts[])
1042     {
1043         static_assert(KNOB_SIMD_WIDTH == 8, "Need to revisit this when AVX512 is implemented");
1044         SWR_ASSERT(slot < m_numAttributes);
1045
1046         uint32_t numPrimsToAssemble = PA_TESS::NumPrims();
1047         if (0 == numPrimsToAssemble)
1048         {
1049             return false;
1050         }
1051
1052         simdscalari mask = GenPrimMask(numPrimsToAssemble);
1053
1054         const float* pBaseAttrib = (const float*)&m_pVertexData[slot * m_attributeStrideInVectors * 4];
1055         for (uint32_t i = 0; i < m_numVertsPerPrim; ++i)
1056         {
1057             simdscalari indices = _simd_load_si((const simdscalari*)m_ppIndices[i]);
1058
1059             const float* pBase = pBaseAttrib;
1060             for (uint32_t c = 0; c < 4; ++c)
1061             {
1062                 verts[i].v[c] = _simd_mask_i32gather_ps(
1063                     _simd_setzero_ps(),
1064                     pBase,
1065                     indices,
1066                     _simd_castsi_ps(mask),
1067                     4 /* gcc doesn't like sizeof(float) */);
1068                 pBase += m_attributeStrideInVectors * KNOB_SIMD_WIDTH;
1069             }
1070         }
1071
1072         return true;
1073     }
1074
1075     void AssembleSingle(uint32_t slot, uint32_t primIndex, __m128 verts[])
1076     {
1077         SWR_ASSERT(slot < m_numAttributes);
1078         SWR_ASSERT(primIndex < PA_TESS::NumPrims());
1079
1080         const float* pVertDataBase = (const float*)&m_pVertexData[slot * m_attributeStrideInVectors * 4];
1081         for (uint32_t i = 0; i < m_numVertsPerPrim; ++i)
1082         {
1083             uint32_t index = m_ppIndices[i][primIndex];
1084             const float* pVertData = pVertDataBase;
1085             float* pVert = (float*)&verts[i];
1086
1087             for (uint32_t c = 0; c < 4; ++c)
1088             {
1089                 pVert[c] = pVertData[index];
1090                 pVertData += m_attributeStrideInVectors * KNOB_SIMD_WIDTH;
1091             }
1092         }
1093     }
1094
1095     bool NextPrim()
1096     {
1097         uint32_t numPrims = PA_TESS::NumPrims();
1098         m_numPrims -= numPrims;
1099         m_ppIndices[0] += numPrims;
1100         m_ppIndices[1] += numPrims;
1101         m_ppIndices[2] += numPrims;
1102
1103         return HasWork();
1104     }
1105
1106     simdvertex& GetNextVsOutput()
1107     {
1108         SWR_ASSERT(0, "%s", __FUNCTION__);
1109         static simdvertex junk;
1110         return junk;
1111     }
1112
1113     bool GetNextStreamOutput()
1114     {
1115         SWR_ASSERT(0, "%s", __FUNCTION__);
1116         return false;
1117     }
1118
1119     simdmask& GetNextVsIndices()
1120     {
1121         SWR_ASSERT(0, "%s", __FUNCTION__);
1122         static simdmask junk;
1123         return junk;
1124     }
1125
1126     uint32_t NumPrims()
1127     {
1128         return std::min<uint32_t>(m_numPrims, KNOB_SIMD_WIDTH);
1129     }
1130
1131     void Reset() { SWR_ASSERT(0); };
1132
1133     simdscalari GetPrimID(uint32_t startID)
1134     {
1135         return _simd_add_epi32(_simd_set1_epi32(startID), m_vPrimId);
1136     }
1137
1138 private:
1139     const simdscalar*   m_pVertexData = nullptr;
1140     uint32_t            m_attributeStrideInVectors = 0;
1141     uint32_t            m_numAttributes = 0;
1142     uint32_t            m_numPrims = 0;
1143     uint32_t*           m_ppIndices[3];
1144
1145     uint32_t            m_numVertsPerPrim = 0;
1146
1147     simdscalari         m_vPrimId;
1148 };
1149
1150 // Primitive Assembler factory class, responsible for creating and initializing the correct assembler
1151 // based on state.
1152 template <typename IsIndexedT, typename IsCutIndexEnabledT>
1153 struct PA_FACTORY
1154 {
1155     PA_FACTORY(DRAW_CONTEXT* pDC, PRIMITIVE_TOPOLOGY in_topo, uint32_t numVerts) : topo(in_topo)
1156     {
1157 #if KNOB_ENABLE_CUT_AWARE_PA == TRUE
1158         const API_STATE& state = GetApiState(pDC);
1159         if ((IsIndexedT::value && IsCutIndexEnabledT::value && (
1160             topo == TOP_TRIANGLE_STRIP || topo == TOP_POINT_LIST ||
1161             topo == TOP_LINE_LIST || topo == TOP_LINE_STRIP ||
1162             topo == TOP_TRIANGLE_LIST)) ||
1163
1164             // non-indexed draws with adjacency topologies must use cut-aware PA until we add support
1165             // for them in the optimized PA
1166             (topo == TOP_LINE_LIST_ADJ || topo == TOP_LISTSTRIP_ADJ || topo == TOP_TRI_LIST_ADJ || topo == TOP_TRI_STRIP_ADJ))
1167         {
1168             memset(&indexStore, 0, sizeof(indexStore));
1169             uint32_t numAttribs = state.feNumAttributes;
1170
1171             new (&this->paCut) PA_STATE_CUT(pDC, (uint8_t*)&this->vertexStore[0], MAX_NUM_VERTS_PER_PRIM * KNOB_SIMD_WIDTH,
1172                 &this->indexStore[0], numVerts, numAttribs, state.topology, false);
1173             cutPA = true;
1174         }
1175         else
1176 #endif
1177         {
1178             uint32_t numPrims = GetNumPrims(in_topo, numVerts);
1179             new (&this->paOpt) PA_STATE_OPT(pDC, numPrims, (uint8_t*)&this->vertexStore[0], MAX_NUM_VERTS_PER_PRIM * KNOB_SIMD_WIDTH, false);
1180             cutPA = false;
1181         }
1182
1183     }
1184
1185     PA_STATE& GetPA()
1186     {
1187 #if KNOB_ENABLE_CUT_AWARE_PA == TRUE
1188         if (cutPA)
1189         {
1190             return this->paCut;
1191         }
1192         else
1193 #endif
1194         {
1195             return this->paOpt;
1196         }
1197     }
1198
1199     PA_STATE_OPT paOpt;
1200     PA_STATE_CUT paCut;
1201     bool cutPA{ false };
1202
1203     PRIMITIVE_TOPOLOGY topo{ TOP_UNKNOWN };
1204
1205     simdvertex vertexStore[MAX_NUM_VERTS_PER_PRIM];
1206     simdmask indexStore[MAX_NUM_VERTS_PER_PRIM];
1207 };