src/gallium/drivers/swr/rasterizer/core/pa.h

   1 /****************************************************************************
   2 * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
   3 *
   4 * Permission is hereby granted, free of charge, to any person obtaining a
   5 * copy of this software and associated documentation files (the "Software"),
   6 * to deal in the Software without restriction, including without limitation
   7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8 * and/or sell copies of the Software, and to permit persons to whom the
   9 * Software is furnished to do so, subject to the following conditions:
  10 *
  11 * The above copyright notice and this permission notice (including the next
  12 * paragraph) shall be included in all copies or substantial portions of the
  13 * Software.
  14 *
  15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21 * IN THE SOFTWARE.
  22 *
  23 * @file pa.h
  24 *
  25 * @brief Definitions for primitive assembly.
  26 *        N primitives are assembled at a time, where N is the SIMD width.
  27 *        A state machine, that is specific for a given topology, drives the
  28 *        assembly of vertices into triangles.
  29 *
  30 ******************************************************************************/
  31 #pragma once
  32
  33 #include "frontend.h"
  34
  35 struct PA_STATE
  36 {
  37     DRAW_CONTEXT *pDC{ nullptr };              // draw context
  38     uint8_t* pStreamBase{ nullptr };           // vertex stream
  39     uint32_t streamSizeInVerts{ 0 };     // total size of the input stream in verts
  40
  41     // The topology the binner will use. In some cases the FE changes the topology from the api state.
  42     PRIMITIVE_TOPOLOGY binTopology{ TOP_UNKNOWN };
  43
  44     PA_STATE() {}
  45     PA_STATE(DRAW_CONTEXT *in_pDC, uint8_t* in_pStreamBase, uint32_t in_streamSizeInVerts) :
  46         pDC(in_pDC), pStreamBase(in_pStreamBase), streamSizeInVerts(in_streamSizeInVerts) {}
  47
  48     virtual bool HasWork() = 0;
  49     virtual simdvector& GetSimdVector(uint32_t index, uint32_t slot) = 0;
  50     virtual bool Assemble(uint32_t slot, simdvector verts[]) = 0;
  51     virtual void AssembleSingle(uint32_t slot, uint32_t primIndex, __m128 verts[]) = 0;
  52     virtual bool NextPrim() = 0;
  53     virtual simdvertex& GetNextVsOutput() = 0;
  54     virtual bool GetNextStreamOutput() = 0;
  55     virtual simdmask& GetNextVsIndices() = 0;
  56     virtual uint32_t NumPrims() = 0;
  57     virtual void Reset() = 0;
  58     virtual simdscalari GetPrimID(uint32_t startID) = 0;
  59 };
  60
// The Optimized PA is a state machine that assembles triangles from vertex shader simd
// output. Here is the sequence:
//    1. Execute FS/VS to generate a simd vertex (4 vertices for SSE simd and 8 for AVX simd).
//    2. Execute PA function to assemble and bin triangles.
//        a. The PA function is a set of functions that collectively make up the
//           state machine for a given topology.
//                1. We use a state index to track which PA function to call.
//        b. Often the PA function needs 2 simd vertices in order to assemble the next triangle.
//                1. We call these the current and previous simd vertex.
//                2. The SSE simd is 4-wide, which is not a multiple of the 3 vertices needed for
//                   triangles. In order to assemble the second triangle, for a triangle list,
//                   we'll need the last vertex from the previous simd and the first 2 vertices
//                   from the current simd.
//                3. At times the PA can assemble multiple triangles from the 2 simd vertices.
//
// This optimized PA is not cut aware, so it should only be used by non-indexed draws or draws
// without cuts.
  77 struct PA_STATE_OPT : public PA_STATE
  78 {
  79     simdvertex leadingVertex;            // For tri-fan
  80     uint32_t numPrims{ 0 };              // Total number of primitives for draw.
  81     uint32_t numPrimsComplete{ 0 };      // Total number of complete primitives.
  82
  83     uint32_t numSimdPrims{ 0 };          // Number of prims in current simd.
  84
  85     uint32_t cur{ 0 };                   // index to current VS output.
  86     uint32_t prev{ 0 };                  // index to prev VS output. Not really needed in the state.
  87     uint32_t first{ 0 };                 // index to first VS output. Used for trifan.
  88
  89     uint32_t counter{ 0 };               // state counter
  90     bool reset{ false };                 // reset state
  91
  92     uint32_t primIDIncr{ 0 };            // how much to increment for each vector (typically vector / {1, 2})
  93     simdscalari primID;
  94
  95     typedef bool(*PFN_PA_FUNC)(PA_STATE_OPT& state, uint32_t slot, simdvector verts[]);
  96     typedef void(*PFN_PA_SINGLE_FUNC)(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]);
  97
  98     PFN_PA_FUNC        pfnPaFunc{ nullptr };        // PA state machine function for assembling 4 triangles.
  99     PFN_PA_SINGLE_FUNC pfnPaSingleFunc{ nullptr };  // PA state machine function for assembling single triangle.
 100     PFN_PA_FUNC        pfnPaFuncReset{ nullptr };   // initial state to set on reset
 101
 102     // state used to advance the PA when Next is called
 103     PFN_PA_FUNC        pfnPaNextFunc{ nullptr };
 104     uint32_t           nextNumSimdPrims{ 0 };
 105     uint32_t           nextNumPrimsIncrement{ 0 };
 106     bool               nextReset{ false };
 107     bool               isStreaming{ false };
 108
 109     simdmask tmpIndices{ 0 };            // temporary index store for unused virtual function
 110
 111     PA_STATE_OPT() {}
 112     PA_STATE_OPT(DRAW_CONTEXT* pDC, uint32_t numPrims, uint8_t* pStream, uint32_t streamSizeInVerts,
 113         bool in_isStreaming, PRIMITIVE_TOPOLOGY topo = TOP_UNKNOWN);
 114
 115     bool HasWork()
 116     {
 117         return (this->numPrimsComplete < this->numPrims) ? true : false;
 118     }
 119
 120     simdvector& GetSimdVector(uint32_t index, uint32_t slot)
 121     {
 122         simdvertex* pVertex = (simdvertex*)pStreamBase;
 123         return pVertex[index].attrib[slot];
 124     }
 125
 126     // Assembles 4 triangles. Each simdvector is a single vertex from 4
 127     // triangles (xxxx yyyy zzzz wwww) and there are 3 verts per triangle.
 128     bool Assemble(uint32_t slot, simdvector verts[])
 129     {
 130         return this->pfnPaFunc(*this, slot, verts);
 131     }
 132
 133     // Assembles 1 primitive. Each simdscalar is a vertex (xyzw).
 134     void AssembleSingle(uint32_t slot, uint32_t primIndex, __m128 verts[])
 135     {
 136         return this->pfnPaSingleFunc(*this, slot, primIndex, verts);
 137     }
 138
 139     bool NextPrim()
 140     {
 141         this->pfnPaFunc = this->pfnPaNextFunc;
 142         this->numSimdPrims = this->nextNumSimdPrims;
 143         this->numPrimsComplete += this->nextNumPrimsIncrement;
 144         this->reset = this->nextReset;
 145
 146         if (this->isStreaming)
 147         {
 148             this->reset = false;
 149         }
 150
 151         bool morePrims = false;
 152
 153         if (this->numSimdPrims > 0)
 154         {
 155             morePrims = true;
 156             this->numSimdPrims--;
 157         }
 158         else
 159         {
 160             this->counter = (this->reset) ? 0 : (this->counter + 1);
 161             this->reset = false;
 162         }
 163
 164         this->pfnPaFunc = this->pfnPaNextFunc;
 165
 166         if (!HasWork())
 167         {
 168             morePrims = false;    // no more to do
 169         }
 170
 171         return morePrims;
 172     }
 173
 174     simdvertex& GetNextVsOutput()
 175     {
 176         // increment cur and prev indices
 177         const uint32_t numSimdVerts = this->streamSizeInVerts / KNOB_SIMD_WIDTH;
 178         this->prev = this->cur;  // prev is undefined for first state.
 179         this->cur = this->counter % numSimdVerts;
 180
 181         simdvertex* pVertex = (simdvertex*)pStreamBase;
 182         return pVertex[this->cur];
 183     }
 184
 185     simdmask& GetNextVsIndices()
 186     {
 187         // unused in optimized PA, pass tmp buffer back
 188         return tmpIndices;
 189     }
 190
 191     bool GetNextStreamOutput()
 192     {
 193         this->prev = this->cur;
 194         this->cur = this->counter;
 195
 196         return HasWork();
 197     }
 198
 199     uint32_t NumPrims()
 200     {
 201         return (this->numPrimsComplete + this->nextNumPrimsIncrement > this->numPrims) ?
 202             (KNOB_SIMD_WIDTH - (this->numPrimsComplete + this->nextNumPrimsIncrement - this->numPrims)) : KNOB_SIMD_WIDTH;
 203     }
 204
 205     void SetNextState(PA_STATE_OPT::PFN_PA_FUNC pfnPaNextFunc,
 206         PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc,
 207         uint32_t numSimdPrims = 0,
 208         uint32_t numPrimsIncrement = 0,
 209         bool reset = false)
 210     {
 211         this->pfnPaNextFunc = pfnPaNextFunc;
 212         this->nextNumSimdPrims = numSimdPrims;
 213         this->nextNumPrimsIncrement = numPrimsIncrement;
 214         this->nextReset = reset;
 215
 216         this->pfnPaSingleFunc = pfnPaNextSingleFunc;
 217     }
 218
 219     void Reset()
 220     {
 221         this->pfnPaFunc = this->pfnPaFuncReset;
 222         this->numPrimsComplete = 0;
 223         this->numSimdPrims = 0;
 224         this->cur = 0;
 225         this->prev = 0;
 226         this->first = 0;
 227         this->counter = 0;
 228         this->reset = false;
 229     }
 230
 231     simdscalari GetPrimID(uint32_t startID)
 232     {
 233         return _simd_add_epi32(this->primID,
 234             _simd_set1_epi32(startID + this->primIDIncr * (this->numPrimsComplete / KNOB_SIMD_WIDTH)));
 235     }
 236 };
 237
 238 // helper C wrappers to avoid having to rewrite all the PA topology state functions
 239 INLINE void SetNextPaState(PA_STATE_OPT& pa, PA_STATE_OPT::PFN_PA_FUNC pfnPaNextFunc,
 240     PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc,
 241     uint32_t numSimdPrims = 0,
 242     uint32_t numPrimsIncrement = 0,
 243     bool reset = false)
 244 {
 245     return pa.SetNextState(pfnPaNextFunc, pfnPaNextSingleFunc, numSimdPrims, numPrimsIncrement, reset);
 246 }
 247 INLINE simdvector& PaGetSimdVector(PA_STATE& pa, uint32_t index, uint32_t slot)
 248 {
 249     return pa.GetSimdVector(index, slot);
 250 }
 251
 252 INLINE __m128 swizzleLane0(const simdvector &a)
 253 {
 254     simdscalar tmp0 = _mm256_unpacklo_ps(a.x, a.z);
 255     simdscalar tmp1 = _mm256_unpacklo_ps(a.y, a.w);
 256     return _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 0);
 257 }
 258
 259 INLINE __m128 swizzleLane1(const simdvector &a)
 260 {
 261     simdscalar tmp0 = _mm256_unpacklo_ps(a.x, a.z);
 262     simdscalar tmp1 = _mm256_unpacklo_ps(a.y, a.w);
 263     return _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 0);
 264 }
 265
 266 INLINE __m128 swizzleLane2(const simdvector &a)
 267 {
 268     simdscalar tmp0 = _mm256_unpackhi_ps(a.x, a.z);
 269     simdscalar tmp1 = _mm256_unpackhi_ps(a.y, a.w);
 270     return _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 0);
 271 }
 272
 273 INLINE __m128 swizzleLane3(const simdvector &a)
 274 {
 275     simdscalar tmp0 = _mm256_unpackhi_ps(a.x, a.z);
 276     simdscalar tmp1 = _mm256_unpackhi_ps(a.y, a.w);
 277     return _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 0);
 278 }
 279
 280 INLINE __m128 swizzleLane4(const simdvector &a)
 281 {
 282     simdscalar tmp0 = _mm256_unpacklo_ps(a.x, a.z);
 283     simdscalar tmp1 = _mm256_unpacklo_ps(a.y, a.w);
 284     return _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 1);
 285
 286 }
 287
 288 INLINE __m128 swizzleLane5(const simdvector &a)
 289 {
 290     simdscalar tmp0 = _mm256_unpacklo_ps(a.x, a.z);
 291     simdscalar tmp1 = _mm256_unpacklo_ps(a.y, a.w);
 292     return _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 1);
 293 }
 294
 295 INLINE __m128 swizzleLane6(const simdvector &a)
 296 {
 297     simdscalar tmp0 = _mm256_unpackhi_ps(a.x, a.z);
 298     simdscalar tmp1 = _mm256_unpackhi_ps(a.y, a.w);
 299     return _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 1);
 300 }
 301
 302 INLINE __m128 swizzleLane7(const simdvector &a)
 303 {
 304     simdscalar tmp0 = _mm256_unpackhi_ps(a.x, a.z);
 305     simdscalar tmp1 = _mm256_unpackhi_ps(a.y, a.w);
 306     return _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 1);
 307 }
 308
 309 INLINE __m128 swizzleLaneN(const simdvector &a, int lane)
 310 {
 311     switch (lane) {
 312     case 0:
 313         return swizzleLane0(a);
 314     case 1:
 315         return swizzleLane1(a);
 316     case 2:
 317         return swizzleLane2(a);
 318     case 3:
 319         return swizzleLane3(a);
 320     case 4:
 321         return swizzleLane4(a);
 322     case 5:
 323         return swizzleLane5(a);
 324     case 6:
 325         return swizzleLane6(a);
 326     case 7:
 327         return swizzleLane7(a);
 328     default:
 329         return _mm_setzero_ps();
 330     }
 331 }
 332
 333 // Cut-aware primitive assembler.
 334 struct PA_STATE_CUT : public PA_STATE
 335 {
 336     simdmask* pCutIndices{ nullptr };    // cut indices buffer, 1 bit per vertex
 337     uint32_t numVerts{ 0 };              // number of vertices available in buffer store
 338     uint32_t numAttribs{ 0 };            // number of attributes
 339     int32_t numRemainingVerts{ 0 };      // number of verts remaining to be assembled
 340     uint32_t numVertsToAssemble{ 0 };    // total number of verts to assemble for the draw
 341     OSALIGNSIMD(uint32_t) indices[MAX_NUM_VERTS_PER_PRIM][KNOB_SIMD_WIDTH];    // current index buffer for gather
 342     simdscalari vOffsets[MAX_NUM_VERTS_PER_PRIM];           // byte offsets for currently assembling simd
 343     uint32_t numPrimsAssembled{ 0 };     // number of primitives that are fully assembled
 344     uint32_t headVertex{ 0 };            // current unused vertex slot in vertex buffer store
 345     uint32_t tailVertex{ 0 };            // beginning vertex currently assembling
 346     uint32_t curVertex{ 0 };             // current unprocessed vertex
 347     uint32_t startPrimId{ 0 };           // starting prim id
 348     simdscalari vPrimId;                 // vector of prim ID
 349     bool needOffsets{ false };           // need to compute gather offsets for current SIMD
 350     uint32_t vertsPerPrim{ 0 };
 351     simdvertex tmpVertex;                // temporary simdvertex for unimplemented API
 352     bool processCutVerts{ false };       // vertex indices with cuts should be processed as normal, otherwise they
 353                                          // are ignored.  Fetch shader sends invalid verts on cuts that should be ignored
 354                                          // while the GS sends valid verts for every index
 355     // Topology state tracking
 356     uint32_t vert[MAX_NUM_VERTS_PER_PRIM];
 357     uint32_t curIndex{ 0 };
 358     bool reverseWinding{ false };        // indicates reverse winding for strips
 359     int32_t adjExtraVert{ 0 };           // extra vert uses for tristrip w/ adj
 360
 361     typedef void(PA_STATE_CUT::* PFN_PA_FUNC)(uint32_t vert, bool finish);
 362     PFN_PA_FUNC pfnPa{ nullptr };        // per-topology function that processes a single vert
 363
 364     PA_STATE_CUT() {}
 365     PA_STATE_CUT(DRAW_CONTEXT* pDC, uint8_t* in_pStream, uint32_t in_streamSizeInVerts, simdmask* in_pIndices, uint32_t in_numVerts,
 366         uint32_t in_numAttribs, PRIMITIVE_TOPOLOGY topo, bool in_processCutVerts)
 367         : PA_STATE(pDC, in_pStream, in_streamSizeInVerts)
 368     {
 369         numVerts = in_streamSizeInVerts;
 370         numAttribs = in_numAttribs;
 371         binTopology = topo;
 372         needOffsets = false;
 373         processCutVerts = in_processCutVerts;
 374
 375         numVertsToAssemble = numRemainingVerts = in_numVerts;
 376         numPrimsAssembled = 0;
 377         headVertex = tailVertex = curVertex = 0;
 378
 379         curIndex = 0;
 380         pCutIndices = in_pIndices;
 381         memset(indices, 0, sizeof(indices));
 382         vPrimId = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
 383         reverseWinding = false;
 384         adjExtraVert = -1;
 385
 386         bool gsEnabled = pDC->pState->state.gsState.gsEnable;
 387         vertsPerPrim = NumVertsPerPrim(topo, gsEnabled);
 388
 389         switch (topo)
 390         {
 391         case TOP_TRIANGLE_LIST:     pfnPa = &PA_STATE_CUT::ProcessVertTriList; break;
 392         case TOP_TRI_LIST_ADJ:      pfnPa = gsEnabled ? &PA_STATE_CUT::ProcessVertTriListAdj : &PA_STATE_CUT::ProcessVertTriListAdjNoGs; break;
 393         case TOP_TRIANGLE_STRIP:    pfnPa = &PA_STATE_CUT::ProcessVertTriStrip; break;
 394         case TOP_TRI_STRIP_ADJ:     if (gsEnabled)
 395                                     {
 396                                         pfnPa = &PA_STATE_CUT::ProcessVertTriStripAdj < true > ;
 397                                     }
 398                                     else
 399                                     {
 400                                         pfnPa = &PA_STATE_CUT::ProcessVertTriStripAdj < false > ;
 401                                     }
 402                                     break;
 403
 404         case TOP_POINT_LIST:        pfnPa = &PA_STATE_CUT::ProcessVertPointList; break;
 405         case TOP_LINE_LIST:         pfnPa = &PA_STATE_CUT::ProcessVertLineList; break;
 406         case TOP_LINE_LIST_ADJ:     pfnPa = gsEnabled ? &PA_STATE_CUT::ProcessVertLineListAdj : &PA_STATE_CUT::ProcessVertLineListAdjNoGs; break;
 407         case TOP_LINE_STRIP:        pfnPa = &PA_STATE_CUT::ProcessVertLineStrip; break;
 408         case TOP_LISTSTRIP_ADJ:     pfnPa = gsEnabled ? &PA_STATE_CUT::ProcessVertLineStripAdj : &PA_STATE_CUT::ProcessVertLineStripAdjNoGs; break;
 409         default: assert(0 && "Unimplemented topology");
 410         }
 411     }
 412
 413     simdvertex& GetNextVsOutput()
 414     {
 415         uint32_t vertexIndex = this->headVertex / KNOB_SIMD_WIDTH;
 416         this->headVertex = (this->headVertex + KNOB_SIMD_WIDTH) % this->numVerts;
 417         this->needOffsets = true;
 418         return ((simdvertex*)pStreamBase)[vertexIndex];
 419     }
 420
 421     simdmask& GetNextVsIndices()
 422     {
 423         uint32_t vertexIndex = this->headVertex / KNOB_SIMD_WIDTH;
 424         simdmask* pCurCutIndex = this->pCutIndices + vertexIndex;
 425         return *pCurCutIndex;
 426     }
 427
 428     simdvector& GetSimdVector(uint32_t index, uint32_t slot)
 429     {
 430         // unused
 431         SWR_ASSERT(0 && "Not implemented");
 432         return this->tmpVertex.attrib[0];
 433     }
 434
 435     bool GetNextStreamOutput()
 436     {
 437         this->headVertex += KNOB_SIMD_WIDTH;
 438         this->needOffsets = true;
 439         return HasWork();
 440     }
 441
 442     simdscalari GetPrimID(uint32_t startID)
 443     {
 444         return _simd_add_epi32(_simd_set1_epi32(startID), this->vPrimId);
 445     }
 446
 447     void Reset()
 448     {
 449         this->numRemainingVerts = this->numVertsToAssemble;
 450         this->numPrimsAssembled = 0;
 451         this->curIndex = 0;
 452         this->curVertex = 0;
 453         this->tailVertex = 0;
 454         this->headVertex = 0;
 455         this->reverseWinding = false;
 456         this->adjExtraVert = -1;
 457         this->vPrimId = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
 458     }
 459
 460     bool HasWork()
 461     {
 462         return this->numRemainingVerts > 0 || this->adjExtraVert != -1;
 463     }
 464
 465     bool IsVertexStoreFull()
 466     {
 467         return ((this->headVertex + KNOB_SIMD_WIDTH) % this->numVerts) == this->tailVertex;
 468     }
 469
 470     void RestartTopology()
 471     {
 472         this->curIndex = 0;
 473         this->reverseWinding = false;
 474         this->adjExtraVert = -1;
 475     }
 476
 477     bool IsCutIndex(uint32_t vertex)
 478     {
 479         uint32_t vertexIndex = vertex / KNOB_SIMD_WIDTH;
 480         uint32_t vertexOffset = vertex & (KNOB_SIMD_WIDTH - 1);
 481         return _bittest((const LONG*)&this->pCutIndices[vertexIndex], vertexOffset) == 1;
 482     }
 483
 484     // iterates across the unprocessed verts until we hit the end or we
 485     // have assembled SIMD prims
 486     void ProcessVerts()
 487     {
 488         while (this->numPrimsAssembled != KNOB_SIMD_WIDTH &&
 489             this->numRemainingVerts > 0 &&
 490             this->curVertex != this->headVertex)
 491         {
 492             // if cut index, restart topology
 493             if (IsCutIndex(this->curVertex))
 494             {
 495                 if (this->processCutVerts)
 496                 {
 497                     (this->*pfnPa)(this->curVertex, false);
 498                 }
 499                 // finish off tri strip w/ adj before restarting topo
 500                 if (this->adjExtraVert != -1)
 501                 {
 502                     (this->*pfnPa)(this->curVertex, true);
 503                 }
 504                 RestartTopology();
 505             }
 506             else
 507             {
 508                 (this->*pfnPa)(this->curVertex, false);
 509             }
 510
 511             this->curVertex = (this->curVertex + 1) % this->numVerts;
 512             this->numRemainingVerts--;
 513         }
 514
 515         // special case last primitive for tri strip w/ adj
 516         if (this->numPrimsAssembled != KNOB_SIMD_WIDTH && this->numRemainingVerts == 0 && this->adjExtraVert != -1)
 517         {
 518             (this->*pfnPa)(this->curVertex, true);
 519         }
 520     }
 521
 522     void Advance()
 523     {
 524         // done with current batch
 525         // advance tail to the current unsubmitted vertex
 526         this->tailVertex = this->curVertex;
 527         this->numPrimsAssembled = 0;
 528         this->vPrimId = _simd_add_epi32(vPrimId, _simd_set1_epi32(KNOB_SIMD_WIDTH));
 529     }
 530
 531     bool NextPrim()
 532     {
 533         // if we've assembled enough prims, we can advance to the next set of verts
 534         if (this->numPrimsAssembled == KNOB_SIMD_WIDTH || this->numRemainingVerts <= 0)
 535         {
 536             Advance();
 537         }
 538         return false;
 539     }
 540
 541     void ComputeOffsets()
 542     {
 543         for (uint32_t v = 0; v < this->vertsPerPrim; ++v)
 544         {
 545             simdscalari vIndices = *(simdscalari*)&this->indices[v][0];
 546
 547             // step to simdvertex batch
 548             const uint32_t simdShift = 3; // @todo make knob
 549             simdscalari vVertexBatch = _simd_srai_epi32(vIndices, simdShift);
 550             this->vOffsets[v] = _simd_mullo_epi32(vVertexBatch, _simd_set1_epi32(sizeof(simdvertex)));
 551
 552             // step to index
 553             const uint32_t simdMask = 0x7; // @todo make knob
 554             simdscalari vVertexIndex = _simd_and_si(vIndices, _simd_set1_epi32(simdMask));
 555             this->vOffsets[v] = _simd_add_epi32(this->vOffsets[v], _simd_mullo_epi32(vVertexIndex, _simd_set1_epi32(sizeof(float))));
 556         }
 557     }
 558
 559     bool Assemble(uint32_t slot, simdvector result[])
 560     {
 561         // process any outstanding verts
 562         ProcessVerts();
 563
 564         // return false if we don't have enough prims assembled
 565         if (this->numPrimsAssembled != KNOB_SIMD_WIDTH && this->numRemainingVerts > 0)
 566         {
 567             return false;
 568         }
 569
 570         // cache off gather offsets given the current SIMD set of indices the first time we get an assemble
 571         if (this->needOffsets)
 572         {
 573             ComputeOffsets();
 574             this->needOffsets = false;
 575         }
 576
 577         for (uint32_t v = 0; v < this->vertsPerPrim; ++v)
 578         {
 579             simdscalari offsets = this->vOffsets[v];
 580
 581             // step to attribute
 582             offsets = _simd_add_epi32(offsets, _simd_set1_epi32(slot * sizeof(simdvector)));
 583
 584             float* pBase = (float*)this->pStreamBase;
 585             for (uint32_t c = 0; c < 4; ++c)
 586             {
 587                 result[v].v[c] = _simd_i32gather_ps(pBase, offsets, 1);
 588
 589                 // move base to next component
 590                 pBase += KNOB_SIMD_WIDTH;
 591             }
 592         }
 593
 594         return true;
 595     }
 596
 597     void AssembleSingle(uint32_t slot, uint32_t triIndex, __m128 tri[3])
 598     {
 599         // move to slot
 600         for (uint32_t v = 0; v < this->vertsPerPrim; ++v)
 601         {
 602             uint32_t* pOffset = (uint32_t*)&this->vOffsets[v];
 603             uint32_t offset = pOffset[triIndex];
 604             offset += sizeof(simdvector) * slot;
 605             float* pVert = (float*)&tri[v];
 606             for (uint32_t c = 0; c < 4; ++c)
 607             {
 608                 float* pComponent = (float*)(this->pStreamBase + offset);
 609                 pVert[c] = *pComponent;
 610                 offset += KNOB_SIMD_WIDTH * sizeof(float);
 611             }
 612         }
 613     }
 614
 615     uint32_t NumPrims()
 616     {
 617         return this->numPrimsAssembled;
 618     }
 619
 620     // Per-topology functions
 621     void ProcessVertTriStrip(uint32_t index, bool finish)
 622     {
 623         this->vert[this->curIndex] = index;
 624         this->curIndex++;
 625         if (this->curIndex == 3)
 626         {
 627             // assembled enough verts for prim, add to gather indices
 628             this->indices[0][this->numPrimsAssembled] = this->vert[0];
 629             if (reverseWinding)
 630             {
 631                 this->indices[1][this->numPrimsAssembled] = this->vert[2];
 632                 this->indices[2][this->numPrimsAssembled] = this->vert[1];
 633             }
 634             else
 635             {
 636                 this->indices[1][this->numPrimsAssembled] = this->vert[1];
 637                 this->indices[2][this->numPrimsAssembled] = this->vert[2];
 638             }
 639
 640             // increment numPrimsAssembled
 641             this->numPrimsAssembled++;
 642
 643             // set up next prim state
 644             this->vert[0] = this->vert[1];
 645             this->vert[1] = this->vert[2];
 646             this->curIndex = 2;
 647             this->reverseWinding ^= 1;
 648         }
 649     }
 650
 651     template<bool gsEnabled>
 652     void AssembleTriStripAdj()
 653     {
 654         if (!gsEnabled)
 655         {
 656             this->vert[1] = this->vert[2];
 657             this->vert[2] = this->vert[4];
 658
 659             this->indices[0][this->numPrimsAssembled] = this->vert[0];
 660             this->indices[1][this->numPrimsAssembled] = this->vert[1];
 661             this->indices[2][this->numPrimsAssembled] = this->vert[2];
 662
 663             this->vert[4] = this->vert[2];
 664             this->vert[2] = this->vert[1];
 665         }
 666         else
 667         {
 668             this->indices[0][this->numPrimsAssembled] = this->vert[0];
 669             this->indices[1][this->numPrimsAssembled] = this->vert[1];
 670             this->indices[2][this->numPrimsAssembled] = this->vert[2];
 671             this->indices[3][this->numPrimsAssembled] = this->vert[3];
 672             this->indices[4][this->numPrimsAssembled] = this->vert[4];
 673             this->indices[5][this->numPrimsAssembled] = this->vert[5];
 674         }
 675         this->numPrimsAssembled++;
 676     }
 677
 678
 679     template<bool gsEnabled>
 680     void ProcessVertTriStripAdj(uint32_t index, bool finish)
 681     {
 682         // handle last primitive of tristrip
 683         if (finish && this->adjExtraVert != -1)
 684         {
 685             this->vert[3] = this->adjExtraVert;
 686             AssembleTriStripAdj<gsEnabled>();
 687             this->adjExtraVert = -1;
 688             return;
 689         }
 690
 691         switch (this->curIndex)
 692         {
 693         case 0:
 694         case 1:
 695         case 2:
 696         case 4:
 697             this->vert[this->curIndex] = index;
 698             this->curIndex++;
 699             break;
 700         case 3:
 701             this->vert[5] = index;
 702             this->curIndex++;
 703             break;
 704         case 5:
 705             if (this->adjExtraVert == -1)
 706             {
 707                 this->adjExtraVert = index;
 708             }
 709             else
 710             {
 711                 this->vert[3] = index;
 712                 if (!gsEnabled)
 713                 {
 714                     AssembleTriStripAdj<gsEnabled>();
 715
 716                     uint32_t nextTri[6];
 717                     if (this->reverseWinding)
 718                     {
 719                         nextTri[0] = this->vert[4];
 720                         nextTri[1] = this->vert[0];
 721                         nextTri[2] = this->vert[2];
 722                         nextTri[4] = this->vert[3];
 723                         nextTri[5] = this->adjExtraVert;
 724                     }
 725                     else
 726                     {
 727                         nextTri[0] = this->vert[2];
 728                         nextTri[1] = this->adjExtraVert;
 729                         nextTri[2] = this->vert[3];
 730                         nextTri[4] = this->vert[4];
 731                         nextTri[5] = this->vert[0];
 732                     }
 733                     for (uint32_t i = 0; i < 6; ++i)
 734                     {
 735                         this->vert[i] = nextTri[i];
 736                     }
 737
 738                     this->adjExtraVert = -1;
 739                     this->reverseWinding ^= 1;
 740                 }
 741                 else
 742                 {
 743                     this->curIndex++;
 744                 }
 745             }
 746             break;
 747         case 6:
 748             SWR_ASSERT(this->adjExtraVert != -1, "Algorith failure!");
 749             AssembleTriStripAdj<gsEnabled>();
 750
 751             uint32_t nextTri[6];
 752             if (this->reverseWinding)
 753             {
 754                 nextTri[0] = this->vert[4];
 755                 nextTri[1] = this->vert[0];
 756                 nextTri[2] = this->vert[2];
 757                 nextTri[4] = this->vert[3];
 758                 nextTri[5] = this->adjExtraVert;
 759             }
 760             else
 761             {
 762                 nextTri[0] = this->vert[2];
 763                 nextTri[1] = this->adjExtraVert;
 764                 nextTri[2] = this->vert[3];
 765                 nextTri[4] = this->vert[4];
 766                 nextTri[5] = this->vert[0];
 767             }
 768             for (uint32_t i = 0; i < 6; ++i)
 769             {
 770                 this->vert[i] = nextTri[i];
 771             }
 772             this->reverseWinding ^= 1;
 773             this->adjExtraVert = index;
 774             this->curIndex--;
 775             break;
 776         }
 777     }
 778
 779     void ProcessVertTriList(uint32_t index, bool finish)
 780     {
 781         this->vert[this->curIndex] = index;
 782         this->curIndex++;
 783         if (this->curIndex == 3)
 784         {
 785             // assembled enough verts for prim, add to gather indices
 786             this->indices[0][this->numPrimsAssembled] = this->vert[0];
 787             this->indices[1][this->numPrimsAssembled] = this->vert[1];
 788             this->indices[2][this->numPrimsAssembled] = this->vert[2];
 789
 790             // increment numPrimsAssembled
 791             this->numPrimsAssembled++;
 792
 793             // set up next prim state
 794             this->curIndex = 0;
 795         }
 796     }
 797
 798     void ProcessVertTriListAdj(uint32_t index, bool finish)
 799     {
 800         this->vert[this->curIndex] = index;
 801         this->curIndex++;
 802         if (this->curIndex == 6)
 803         {
 804             // assembled enough verts for prim, add to gather indices
 805             this->indices[0][this->numPrimsAssembled] = this->vert[0];
 806             this->indices[1][this->numPrimsAssembled] = this->vert[1];
 807             this->indices[2][this->numPrimsAssembled] = this->vert[2];
 808             this->indices[3][this->numPrimsAssembled] = this->vert[3];
 809             this->indices[4][this->numPrimsAssembled] = this->vert[4];
 810             this->indices[5][this->numPrimsAssembled] = this->vert[5];
 811
 812             // increment numPrimsAssembled
 813             this->numPrimsAssembled++;
 814
 815             // set up next prim state
 816             this->curIndex = 0;
 817         }
 818     }
 819
 820     void ProcessVertTriListAdjNoGs(uint32_t index, bool finish)
 821     {
 822         this->vert[this->curIndex] = index;
 823         this->curIndex++;
 824         if (this->curIndex == 6)
 825         {
 826             // assembled enough verts for prim, add to gather indices
 827             this->indices[0][this->numPrimsAssembled] = this->vert[0];
 828             this->indices[1][this->numPrimsAssembled] = this->vert[2];
 829             this->indices[2][this->numPrimsAssembled] = this->vert[4];
 830
 831             // increment numPrimsAssembled
 832             this->numPrimsAssembled++;
 833
 834             // set up next prim state
 835             this->curIndex = 0;
 836         }
 837     }
 838
 839
 840     void ProcessVertLineList(uint32_t index, bool finish)
 841     {
 842         this->vert[this->curIndex] = index;
 843         this->curIndex++;
 844         if (this->curIndex == 2)
 845         {
 846             this->indices[0][this->numPrimsAssembled] = this->vert[0];
 847             this->indices[1][this->numPrimsAssembled] = this->vert[1];
 848
 849             this->numPrimsAssembled++;
 850             this->curIndex = 0;
 851         }
 852     }
 853
 854     void ProcessVertLineStrip(uint32_t index, bool finish)
 855     {
 856         this->vert[this->curIndex] = index;
 857         this->curIndex++;
 858         if (this->curIndex == 2)
 859         {
 860             // assembled enough verts for prim, add to gather indices
 861             this->indices[0][this->numPrimsAssembled] = this->vert[0];
 862             this->indices[1][this->numPrimsAssembled] = this->vert[1];
 863
 864             // increment numPrimsAssembled
 865             this->numPrimsAssembled++;
 866
 867             // set up next prim state
 868             this->vert[0] = this->vert[1];
 869             this->curIndex = 1;
 870         }
 871     }
 872
 873     void ProcessVertLineStripAdj(uint32_t index, bool finish)
 874     {
 875         this->vert[this->curIndex] = index;
 876         this->curIndex++;
 877         if (this->curIndex == 4)
 878         {
 879             // assembled enough verts for prim, add to gather indices
 880             this->indices[0][this->numPrimsAssembled] = this->vert[0];
 881             this->indices[1][this->numPrimsAssembled] = this->vert[1];
 882             this->indices[2][this->numPrimsAssembled] = this->vert[2];
 883             this->indices[3][this->numPrimsAssembled] = this->vert[3];
 884
 885             // increment numPrimsAssembled
 886             this->numPrimsAssembled++;
 887
 888             // set up next prim state
 889             this->vert[0] = this->vert[1];
 890             this->vert[1] = this->vert[2];
 891             this->vert[2] = this->vert[3];
 892             this->curIndex = 3;
 893         }
 894     }
 895
 896     void ProcessVertLineStripAdjNoGs(uint32_t index, bool finish)
 897     {
 898         this->vert[this->curIndex] = index;
 899         this->curIndex++;
 900         if (this->curIndex == 4)
 901         {
 902             // assembled enough verts for prim, add to gather indices
 903             this->indices[0][this->numPrimsAssembled] = this->vert[1];
 904             this->indices[1][this->numPrimsAssembled] = this->vert[2];
 905
 906             // increment numPrimsAssembled
 907             this->numPrimsAssembled++;
 908
 909             // set up next prim state
 910             this->vert[0] = this->vert[1];
 911             this->vert[1] = this->vert[2];
 912             this->vert[2] = this->vert[3];
 913             this->curIndex = 3;
 914         }
 915     }
 916
 917     void ProcessVertLineListAdj(uint32_t index, bool finish)
 918     {
 919         this->vert[this->curIndex] = index;
 920         this->curIndex++;
 921         if (this->curIndex == 4)
 922         {
 923             this->indices[0][this->numPrimsAssembled] = this->vert[0];
 924             this->indices[1][this->numPrimsAssembled] = this->vert[1];
 925             this->indices[2][this->numPrimsAssembled] = this->vert[2];
 926             this->indices[3][this->numPrimsAssembled] = this->vert[3];
 927
 928             this->numPrimsAssembled++;
 929             this->curIndex = 0;
 930         }
 931     }
 932
 933     void ProcessVertLineListAdjNoGs(uint32_t index, bool finish)
 934     {
 935         this->vert[this->curIndex] = index;
 936         this->curIndex++;
 937         if (this->curIndex == 4)
 938         {
 939             this->indices[0][this->numPrimsAssembled] = this->vert[1];
 940             this->indices[1][this->numPrimsAssembled] = this->vert[2];
 941
 942             this->numPrimsAssembled++;
 943             this->curIndex = 0;
 944         }
 945     }
 946
 947     void ProcessVertPointList(uint32_t index, bool finish)
 948     {
 949         this->vert[this->curIndex] = index;
 950         this->curIndex++;
 951         if (this->curIndex == 1)
 952         {
 953             this->indices[0][this->numPrimsAssembled] = this->vert[0];
 954             this->numPrimsAssembled++;
 955             this->curIndex = 0;
 956         }
 957     }
 958 };
 959
 960 // Primitive Assembly for data output from the DomainShader.
 961 struct PA_TESS : PA_STATE
 962 {
 963     PA_TESS(
 964         DRAW_CONTEXT *in_pDC,
 965         const simdscalar* in_pVertData,
 966         uint32_t in_attributeStrideInVectors,
 967         uint32_t in_numAttributes,
 968         uint32_t* (&in_ppIndices)[3],
 969         uint32_t in_numPrims,
 970         PRIMITIVE_TOPOLOGY in_binTopology) :
 971
 972         PA_STATE(in_pDC, nullptr, 0),
 973         m_pVertexData(in_pVertData),
 974         m_attributeStrideInVectors(in_attributeStrideInVectors),
 975         m_numAttributes(in_numAttributes),
 976         m_numPrims(in_numPrims)
 977     {
 978         m_vPrimId = _simd_setzero_si();
 979         binTopology = in_binTopology;
 980         m_ppIndices[0] = in_ppIndices[0];
 981         m_ppIndices[1] = in_ppIndices[1];
 982         m_ppIndices[2] = in_ppIndices[2];
 983
 984         switch (binTopology)
 985         {
 986         case TOP_POINT_LIST:
 987             m_numVertsPerPrim = 1;
 988             break;
 989
 990         case TOP_LINE_LIST:
 991             m_numVertsPerPrim = 2;
 992             break;
 993
 994         case TOP_TRIANGLE_LIST:
 995             m_numVertsPerPrim = 3;
 996             break;
 997
 998         default:
 999             SWR_ASSERT(0, "Invalid binTopology (%d) for %s", binTopology, __FUNCTION__);
1000             break;
1001         }
1002     }
1003
1004     bool HasWork()
1005     {
1006         return m_numPrims != 0;
1007     }
1008
1009     simdvector& GetSimdVector(uint32_t index, uint32_t slot)
1010     {
1011         SWR_ASSERT(0, "%s NOT IMPLEMENTED", __FUNCTION__);
1012         static simdvector junk = { 0 };
1013         return junk;
1014     }
1015
1016     static simdscalari GenPrimMask(uint32_t numPrims)
1017     {
1018         SWR_ASSERT(numPrims <= KNOB_SIMD_WIDTH);
1019 #if KNOB_SIMD_WIDTH == 8
1020         static const OSALIGN(int32_t, 64) maskGen[KNOB_SIMD_WIDTH * 2] =
1021         {
1022             -1, -1, -1, -1, -1, -1, -1, -1,
1023              0,  0,  0,  0,  0,  0,  0,  0
1024         };
1025 #elif KNOB_SIMD_WIDTH == 16
1026         static const OSALIGN(int32_t, 128) maskGen[KNOB_SIMD_WIDTH * 2] =
1027         {
1028             -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
1029              0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
1030         };
1031 #else
1032 #error "Help, help, I can't get up!"
1033 #endif
1034
1035         return _simd_loadu_si((const simdscalari*)&maskGen[KNOB_SIMD_WIDTH - numPrims]);
1036     }
1037
1038     bool Assemble(uint32_t slot, simdvector verts[])
1039     {
1040         static_assert(KNOB_SIMD_WIDTH == 8, "Need to revisit this when AVX512 is implemented");
1041         SWR_ASSERT(slot < m_numAttributes);
1042
1043         uint32_t numPrimsToAssemble = PA_TESS::NumPrims();
1044         if (0 == numPrimsToAssemble)
1045         {
1046             return false;
1047         }
1048
1049         simdscalari mask = GenPrimMask(numPrimsToAssemble);
1050
1051         const float* pBaseAttrib = (const float*)&m_pVertexData[slot * m_attributeStrideInVectors * 4];
1052         for (uint32_t i = 0; i < m_numVertsPerPrim; ++i)
1053         {
1054             simdscalari indices = _simd_load_si((const simdscalari*)m_ppIndices[i]);
1055
1056             const float* pBase = pBaseAttrib;
1057             for (uint32_t c = 0; c < 4; ++c)
1058             {
1059                 verts[i].v[c] = _simd_mask_i32gather_ps(
1060                     _simd_setzero_ps(),
1061                     pBase,
1062                     indices,
1063                     _simd_castsi_ps(mask),
1064                     4 /* gcc doesn't like sizeof(float) */);
1065                 pBase += m_attributeStrideInVectors * KNOB_SIMD_WIDTH;
1066             }
1067         }
1068
1069         return true;
1070     }
1071
1072     void AssembleSingle(uint32_t slot, uint32_t primIndex, __m128 verts[])
1073     {
1074         SWR_ASSERT(slot < m_numAttributes);
1075         SWR_ASSERT(primIndex < PA_TESS::NumPrims());
1076
1077         const float* pVertDataBase = (const float*)&m_pVertexData[slot * m_attributeStrideInVectors * 4];
1078         for (uint32_t i = 0; i < m_numVertsPerPrim; ++i)
1079         {
1080             uint32_t index = m_ppIndices[i][primIndex];
1081             const float* pVertData = pVertDataBase;
1082             float* pVert = (float*)&verts[i];
1083
1084             for (uint32_t c = 0; c < 4; ++c)
1085             {
1086                 pVert[c] = pVertData[index];
1087                 pVertData += m_attributeStrideInVectors * KNOB_SIMD_WIDTH;
1088             }
1089         }
1090     }
1091
1092     bool NextPrim()
1093     {
1094         uint32_t numPrims = PA_TESS::NumPrims();
1095         m_numPrims -= numPrims;
1096         m_ppIndices[0] += numPrims;
1097         m_ppIndices[1] += numPrims;
1098         m_ppIndices[2] += numPrims;
1099
1100         return HasWork();
1101     }
1102
1103     simdvertex& GetNextVsOutput()
1104     {
1105         SWR_ASSERT(0, "%s", __FUNCTION__);
1106         static simdvertex junk;
1107         return junk;
1108     }
1109
1110     bool GetNextStreamOutput()
1111     {
1112         SWR_ASSERT(0, "%s", __FUNCTION__);
1113         return false;
1114     }
1115
1116     simdmask& GetNextVsIndices()
1117     {
1118         SWR_ASSERT(0, "%s", __FUNCTION__);
1119         static simdmask junk;
1120         return junk;
1121     }
1122
1123     uint32_t NumPrims()
1124     {
1125         return std::min<uint32_t>(m_numPrims, KNOB_SIMD_WIDTH);
1126     }
1127
1128     void Reset() { SWR_ASSERT(0); };
1129
1130     simdscalari GetPrimID(uint32_t startID)
1131     {
1132         return _simd_add_epi32(_simd_set1_epi32(startID), m_vPrimId);
1133     }
1134
1135 private:
1136     const simdscalar*   m_pVertexData = nullptr;
1137     uint32_t            m_attributeStrideInVectors = 0;
1138     uint32_t            m_numAttributes = 0;
1139     uint32_t            m_numPrims = 0;
1140     uint32_t*           m_ppIndices[3];
1141
1142     uint32_t            m_numVertsPerPrim = 0;
1143
1144     simdscalari         m_vPrimId;
1145 };
1146
1147 // Primitive Assembler factory class, responsible for creating and initializing the correct assembler
1148 // based on state.
1149 template <bool IsIndexedT>
1150 struct PA_FACTORY
1151 {
1152     PA_FACTORY(DRAW_CONTEXT* pDC, PRIMITIVE_TOPOLOGY in_topo, uint32_t numVerts) : topo(in_topo)
1153     {
1154 #if KNOB_ENABLE_CUT_AWARE_PA == TRUE
1155         const API_STATE& state = GetApiState(pDC);
1156         if ((IsIndexedT && (
1157             topo == TOP_TRIANGLE_STRIP || topo == TOP_POINT_LIST ||
1158             topo == TOP_LINE_LIST || topo == TOP_LINE_STRIP ||
1159             topo == TOP_TRIANGLE_LIST || topo == TOP_LINE_LIST_ADJ ||
1160             topo == TOP_LISTSTRIP_ADJ || topo == TOP_TRI_LIST_ADJ ||
1161             topo == TOP_TRI_STRIP_ADJ)) ||
1162
1163             // non-indexed draws with adjacency topologies must use cut-aware PA until we add support
1164             // for them in the optimized PA
1165             (!IsIndexedT && (
1166             topo == TOP_LINE_LIST_ADJ || topo == TOP_LISTSTRIP_ADJ || topo == TOP_TRI_LIST_ADJ || topo == TOP_TRI_STRIP_ADJ)))
1167         {
1168             memset(&indexStore, 0, sizeof(indexStore));
1169             DWORD numAttribs;
1170             _BitScanReverse(&numAttribs, state.feAttribMask);
1171             numAttribs++;
1172             new (&this->paCut) PA_STATE_CUT(pDC, (uint8_t*)&this->vertexStore[0], MAX_NUM_VERTS_PER_PRIM * KNOB_SIMD_WIDTH,
1173                 &this->indexStore[0], numVerts, numAttribs, state.topology, false);
1174             cutPA = true;
1175         }
1176         else
1177 #endif
1178         {
1179             uint32_t numPrims = GetNumPrims(in_topo, numVerts);
1180             new (&this->paOpt) PA_STATE_OPT(pDC, numPrims, (uint8_t*)&this->vertexStore[0], MAX_NUM_VERTS_PER_PRIM * KNOB_SIMD_WIDTH, false);
1181             cutPA = false;
1182         }
1183
1184     }
1185
1186     PA_STATE& GetPA()
1187     {
1188 #if KNOB_ENABLE_CUT_AWARE_PA == TRUE
1189         if (cutPA)
1190         {
1191             return this->paCut;
1192         }
1193         else
1194 #endif
1195         {
1196             return this->paOpt;
1197         }
1198     }
1199
1200     PA_STATE_OPT paOpt;
1201     PA_STATE_CUT paCut;
1202     bool cutPA{ false };
1203
1204     PRIMITIVE_TOPOLOGY topo{ TOP_UNKNOWN };
1205
1206     simdvertex vertexStore[MAX_NUM_VERTS_PER_PRIM];
1207     simdmask indexStore[MAX_NUM_VERTS_PER_PRIM];
1208 };