7f60a04a9ff7e02f9850f11df671bdf66cd05c2b
[mesa.git] / src / gallium / drivers / swr / rasterizer / core / pa.h
1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * @file pa.h
24 *
25 * @brief Definitions for primitive assembly.
26 * N primitives are assembled at a time, where N is the SIMD width.
27 * A state machine, that is specific for a given topology, drives the
28 * assembly of vertices into triangles.
29 *
30 ******************************************************************************/
31 #pragma once
32
33 #include "frontend.h"
34
// Abstract interface for a primitive assembler: accumulates SIMD vertex-shader
// output and hands back assembled primitives a SIMD batch at a time.
struct PA_STATE
{
#if USE_SIMD16_FRONTEND
    // SIMD16 frontend: batches are 16 primitives wide.
    enum
    {
        SIMD_WIDTH = KNOB_SIMD16_WIDTH,
        SIMD_WIDTH_DIV2 = KNOB_SIMD16_WIDTH / 2,
        SIMD_WIDTH_LOG2 = 4
    };

    typedef simd16mask SIMDMASK;

    typedef simd16scalar SIMDSCALAR;
    typedef simd16vector SIMDVECTOR;
    typedef simd16vertex SIMDVERTEX;

    typedef simd16scalari SIMDSCALARI;

#else
    // Native SIMD frontend: batches are KNOB_SIMD_WIDTH primitives wide.
    enum
    {
        SIMD_WIDTH = KNOB_SIMD_WIDTH,
        SIMD_WIDTH_DIV2 = KNOB_SIMD_WIDTH / 2,
        SIMD_WIDTH_LOG2 = 3
    };

    typedef simdmask SIMDMASK;

    typedef simdscalar SIMDSCALAR;
    typedef simdvector SIMDVECTOR;
    typedef simdvertex SIMDVERTEX;

    typedef simdscalari SIMDSCALARI;

#endif
    DRAW_CONTEXT *pDC{ nullptr };       // draw context
    uint8_t* pStreamBase{ nullptr };    // vertex stream
    uint32_t streamSizeInVerts{ 0 };    // total size of the input stream in verts

    // The topology the binner will use. In some cases the FE changes the topology from the api state.
    PRIMITIVE_TOPOLOGY binTopology{ TOP_UNKNOWN };

#if ENABLE_AVX512_SIMD16
    // When true, simd16 assembly paths return the upper half of the 16-wide
    // result (see PA_STATE_CUT::Assemble / AssembleSingle).
    bool useAlternateOffset{ false };

#endif
    PA_STATE() {}
    PA_STATE(DRAW_CONTEXT *in_pDC, uint8_t* in_pStreamBase, uint32_t in_streamSizeInVerts) :
        pDC(in_pDC), pStreamBase(in_pStreamBase), streamSizeInVerts(in_streamSizeInVerts) {}

    // True while primitives remain to be assembled for the current draw.
    virtual bool HasWork() = 0;
    // Access one attribute slot of a stored simd vertex.
    virtual simdvector& GetSimdVector(uint32_t index, uint32_t slot) = 0;
#if ENABLE_AVX512_SIMD16
    virtual simd16vector& GetSimdVector_simd16(uint32_t index, uint32_t slot) = 0;
#endif
    // Gather one attribute slot for the current batch of assembled prims
    // into SOA form; returns false until a batch is ready.
    virtual bool Assemble(uint32_t slot, simdvector verts[]) = 0;
#if ENABLE_AVX512_SIMD16
    virtual bool Assemble_simd16(uint32_t slot, simd16vector verts[]) = 0;
#endif
    // Extract a single primitive's attribute slot as AOS xyzw __m128s.
    virtual void AssembleSingle(uint32_t slot, uint32_t primIndex, __m128 verts[]) = 0;
    // Advance the state machine; implementations return whether more prims
    // can be produced from what is already buffered.
    virtual bool NextPrim() = 0;
    // Slot the next vertex-shader invocation should write its output to.
    virtual SIMDVERTEX& GetNextVsOutput() = 0;
    virtual bool GetNextStreamOutput() = 0;
    virtual SIMDMASK& GetNextVsIndices() = 0;
    // Number of primitives assembled into the current batch.
    virtual uint32_t NumPrims() = 0;
    virtual void Reset() = 0;
    // Per-lane primitive IDs for the current batch, offset by startID.
    virtual SIMDSCALARI GetPrimID(uint32_t startID) = 0;
};
103
104 // The Optimized PA is a state machine that assembles triangles from vertex shader simd
105 // output. Here is the sequence
106 // 1. Execute FS/VS to generate a simd vertex (4 vertices for SSE simd and 8 for AVX simd).
107 // 2. Execute PA function to assemble and bin triangles.
108 // a. The PA function is a set of functions that collectively make up the
109 // state machine for a given topology.
110 // 1. We use a state index to track which PA function to call.
111 // b. Often the PA function needs to 2 simd vertices in order to assemble the next triangle.
112 // 1. We call this the current and previous simd vertex.
113 // 2. The SSE simd is 4-wide which is not a multiple of 3 needed for triangles. In
114 // order to assemble the second triangle, for a triangle list, we'll need the
115 // last vertex from the previous simd and the first 2 vertices from the current simd.
116 // 3. At times the PA can assemble multiple triangles from the 2 simd vertices.
117 //
118 // This optimized PA is not cut aware, so only should be used by non-indexed draws or draws without
119 // cuts
120 struct PA_STATE_OPT : public PA_STATE
121 {
122 SIMDVERTEX leadingVertex; // For tri-fan
123
124 uint32_t numPrims{ 0 }; // Total number of primitives for draw.
125 uint32_t numPrimsComplete{ 0 }; // Total number of complete primitives.
126
127 uint32_t numSimdPrims{ 0 }; // Number of prims in current simd.
128
129 uint32_t cur{ 0 }; // index to current VS output.
130 uint32_t prev{ 0 }; // index to prev VS output. Not really needed in the state.
131 uint32_t first{ 0 }; // index to first VS output. Used for trifan.
132
133 uint32_t counter{ 0 }; // state counter
134 bool reset{ false }; // reset state
135
136 uint32_t primIDIncr{ 0 }; // how much to increment for each vector (typically vector / {1, 2})
137 SIMDSCALARI primID;
138
139 typedef bool(*PFN_PA_FUNC)(PA_STATE_OPT& state, uint32_t slot, simdvector verts[]);
140 #if ENABLE_AVX512_SIMD16
141 typedef bool(*PFN_PA_FUNC_SIMD16)(PA_STATE_OPT& state, uint32_t slot, simd16vector verts[]);
142 #endif
143 typedef void(*PFN_PA_SINGLE_FUNC)(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]);
144
145 PFN_PA_FUNC pfnPaFunc{ nullptr }; // PA state machine function for assembling 4 triangles.
146 #if ENABLE_AVX512_SIMD16
147 PFN_PA_FUNC_SIMD16 pfnPaFunc_simd16{ nullptr };
148 #endif
149 PFN_PA_SINGLE_FUNC pfnPaSingleFunc{ nullptr }; // PA state machine function for assembling single triangle.
150 PFN_PA_FUNC pfnPaFuncReset{ nullptr }; // initial state to set on reset
151 #if ENABLE_AVX512_SIMD16
152 PFN_PA_FUNC_SIMD16 pfnPaFuncReset_simd16{ nullptr };
153 #endif
154
155 // state used to advance the PA when Next is called
156 PFN_PA_FUNC pfnPaNextFunc{ nullptr };
157 #if ENABLE_AVX512_SIMD16
158 PFN_PA_FUNC_SIMD16 pfnPaNextFunc_simd16{ nullptr };
159 #endif
160 uint32_t nextNumSimdPrims{ 0 };
161 uint32_t nextNumPrimsIncrement{ 0 };
162 bool nextReset{ false };
163 bool isStreaming{ false };
164
165 SIMDMASK tmpIndices{ 0 }; // temporary index store for unused virtual function
166
167 PA_STATE_OPT() {}
168 PA_STATE_OPT(DRAW_CONTEXT* pDC, uint32_t numPrims, uint8_t* pStream, uint32_t streamSizeInVerts,
169 bool in_isStreaming, PRIMITIVE_TOPOLOGY topo = TOP_UNKNOWN);
170
171 bool HasWork()
172 {
173 return (this->numPrimsComplete < this->numPrims) ? true : false;
174 }
175
176 simdvector& GetSimdVector(uint32_t index, uint32_t slot)
177 {
178 simdvertex* pVertex = (simdvertex*)pStreamBase;
179 return pVertex[index].attrib[slot];
180 }
181
182 #if ENABLE_AVX512_SIMD16
183 simd16vector& GetSimdVector_simd16(uint32_t index, uint32_t slot)
184 {
185 simd16vertex* pVertex = (simd16vertex*)pStreamBase;
186 return pVertex[index].attrib[slot];
187 }
188
189 #endif
190 // Assembles 4 triangles. Each simdvector is a single vertex from 4
191 // triangles (xxxx yyyy zzzz wwww) and there are 3 verts per triangle.
192 bool Assemble(uint32_t slot, simdvector verts[])
193 {
194 return this->pfnPaFunc(*this, slot, verts);
195 }
196
197 #if ENABLE_AVX512_SIMD16
198 bool Assemble_simd16(uint32_t slot, simd16vector verts[])
199 {
200 return this->pfnPaFunc_simd16(*this, slot, verts);
201 }
202
203 #endif
204 // Assembles 1 primitive. Each simdscalar is a vertex (xyzw).
205 void AssembleSingle(uint32_t slot, uint32_t primIndex, __m128 verts[])
206 {
207 return this->pfnPaSingleFunc(*this, slot, primIndex, verts);
208 }
209
210 bool NextPrim()
211 {
212 this->pfnPaFunc = this->pfnPaNextFunc;
213 #if ENABLE_AVX512_SIMD16
214 this->pfnPaFunc_simd16 = this->pfnPaNextFunc_simd16;
215 #endif
216 this->numSimdPrims = this->nextNumSimdPrims;
217 this->numPrimsComplete += this->nextNumPrimsIncrement;
218 this->reset = this->nextReset;
219
220 if (this->isStreaming)
221 {
222 this->reset = false;
223 }
224
225 bool morePrims = false;
226
227 if (this->numSimdPrims > 0)
228 {
229 morePrims = true;
230 this->numSimdPrims--;
231 }
232 else
233 {
234 this->counter = (this->reset) ? 0 : (this->counter + 1);
235 this->reset = false;
236 }
237
238 this->pfnPaFunc = this->pfnPaNextFunc;
239
240 if (!HasWork())
241 {
242 morePrims = false; // no more to do
243 }
244
245 return morePrims;
246 }
247
248 SIMDVERTEX& GetNextVsOutput()
249 {
250 // increment cur and prev indices
251 const uint32_t numSimdVerts = this->streamSizeInVerts / SIMD_WIDTH;
252 this->prev = this->cur; // prev is undefined for first state.
253 this->cur = this->counter % numSimdVerts;
254
255 SIMDVERTEX* pVertex = (SIMDVERTEX*)pStreamBase;
256 return pVertex[this->cur];
257 }
258
259 SIMDMASK& GetNextVsIndices()
260 {
261 // unused in optimized PA, pass tmp buffer back
262 return tmpIndices;
263 }
264
265 bool GetNextStreamOutput()
266 {
267 this->prev = this->cur;
268 this->cur = this->counter;
269
270 return HasWork();
271 }
272
273 uint32_t NumPrims()
274 {
275 return (this->numPrimsComplete + this->nextNumPrimsIncrement > this->numPrims) ?
276 (SIMD_WIDTH - (this->numPrimsComplete + this->nextNumPrimsIncrement - this->numPrims)) : SIMD_WIDTH;
277 }
278
279 void SetNextState(PA_STATE_OPT::PFN_PA_FUNC pfnPaNextFunc,
280 PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc,
281 uint32_t numSimdPrims = 0,
282 uint32_t numPrimsIncrement = 0,
283 bool reset = false)
284 {
285 this->pfnPaNextFunc = pfnPaNextFunc;
286 this->nextNumSimdPrims = numSimdPrims;
287 this->nextNumPrimsIncrement = numPrimsIncrement;
288 this->nextReset = reset;
289
290 this->pfnPaSingleFunc = pfnPaNextSingleFunc;
291 }
292
293 #if ENABLE_AVX512_SIMD16
294 void SetNextState_simd16(PA_STATE_OPT::PFN_PA_FUNC_SIMD16 pfnPaNextFunc_simd16,
295 PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc,
296 uint32_t numSimdPrims = 0,
297 uint32_t numPrimsIncrement = 0,
298 bool reset = false)
299 {
300 this->pfnPaNextFunc_simd16 = pfnPaNextFunc_simd16;
301 this->nextNumSimdPrims = numSimdPrims;
302 this->nextNumPrimsIncrement = numPrimsIncrement;
303 this->nextReset = reset;
304
305 this->pfnPaSingleFunc = pfnPaNextSingleFunc;
306 }
307
308 #endif
309 void Reset()
310 {
311 #if ENABLE_AVX512_SIMD16
312 useAlternateOffset = false;
313
314 #endif
315 this->pfnPaFunc = this->pfnPaFuncReset;
316 this->numPrimsComplete = 0;
317 this->numSimdPrims = 0;
318 this->cur = 0;
319 this->prev = 0;
320 this->first = 0;
321 this->counter = 0;
322 this->reset = false;
323 }
324
325 SIMDSCALARI GetPrimID(uint32_t startID)
326 {
327 #if USE_SIMD16_FRONTEND
328 return _simd16_add_epi32(this->primID,
329 _simd16_set1_epi32(startID + this->primIDIncr * (this->numPrimsComplete / SIMD_WIDTH)));
330 #else
331 return _simd_add_epi32(this->primID,
332 _simd_set1_epi32(startID + this->primIDIncr * (this->numPrimsComplete / SIMD_WIDTH)));
333 #endif
334 }
335 };
336
337 // helper C wrappers to avoid having to rewrite all the PA topology state functions
338 INLINE void SetNextPaState(PA_STATE_OPT& pa, PA_STATE_OPT::PFN_PA_FUNC pfnPaNextFunc,
339 PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc,
340 uint32_t numSimdPrims = 0,
341 uint32_t numPrimsIncrement = 0,
342 bool reset = false)
343 {
344 return pa.SetNextState(pfnPaNextFunc, pfnPaNextSingleFunc, numSimdPrims, numPrimsIncrement, reset);
345 }
346
#if ENABLE_AVX512_SIMD16
// simd16 variant of the SetNextPaState shim.
INLINE void SetNextPaState_simd16(PA_STATE_OPT& pa, PA_STATE_OPT::PFN_PA_FUNC_SIMD16 pfnPaNextFunc_simd16,
    PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc,
    uint32_t numSimdPrims = 0,
    uint32_t numPrimsIncrement = 0,
    bool reset = false)
{
    pa.SetNextState_simd16(pfnPaNextFunc_simd16, pfnPaNextSingleFunc, numSimdPrims, numPrimsIncrement, reset);
}

#endif
358 INLINE simdvector& PaGetSimdVector(PA_STATE& pa, uint32_t index, uint32_t slot)
359 {
360 return pa.GetSimdVector(index, slot);
361 }
362
#if ENABLE_AVX512_SIMD16
// Forwarding wrapper around the virtual simd16 vector accessor.
INLINE simd16vector& PaGetSimdVector_simd16(PA_STATE& pa, uint32_t index, uint32_t slot)
{
    simd16vector& attrib = pa.GetSimdVector_simd16(index, slot);
    return attrib;
}

#endif
370 INLINE __m128 swizzleLane0(const simdvector &a)
371 {
372 simdscalar tmp0 = _mm256_unpacklo_ps(a.x, a.z);
373 simdscalar tmp1 = _mm256_unpacklo_ps(a.y, a.w);
374 return _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 0);
375 }
376
377 INLINE __m128 swizzleLane1(const simdvector &a)
378 {
379 simdscalar tmp0 = _mm256_unpacklo_ps(a.x, a.z);
380 simdscalar tmp1 = _mm256_unpacklo_ps(a.y, a.w);
381 return _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 0);
382 }
383
384 INLINE __m128 swizzleLane2(const simdvector &a)
385 {
386 simdscalar tmp0 = _mm256_unpackhi_ps(a.x, a.z);
387 simdscalar tmp1 = _mm256_unpackhi_ps(a.y, a.w);
388 return _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 0);
389 }
390
391 INLINE __m128 swizzleLane3(const simdvector &a)
392 {
393 simdscalar tmp0 = _mm256_unpackhi_ps(a.x, a.z);
394 simdscalar tmp1 = _mm256_unpackhi_ps(a.y, a.w);
395 return _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 0);
396 }
397
398 INLINE __m128 swizzleLane4(const simdvector &a)
399 {
400 simdscalar tmp0 = _mm256_unpacklo_ps(a.x, a.z);
401 simdscalar tmp1 = _mm256_unpacklo_ps(a.y, a.w);
402 return _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 1);
403
404 }
405
406 INLINE __m128 swizzleLane5(const simdvector &a)
407 {
408 simdscalar tmp0 = _mm256_unpacklo_ps(a.x, a.z);
409 simdscalar tmp1 = _mm256_unpacklo_ps(a.y, a.w);
410 return _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 1);
411 }
412
413 INLINE __m128 swizzleLane6(const simdvector &a)
414 {
415 simdscalar tmp0 = _mm256_unpackhi_ps(a.x, a.z);
416 simdscalar tmp1 = _mm256_unpackhi_ps(a.y, a.w);
417 return _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 1);
418 }
419
420 INLINE __m128 swizzleLane7(const simdvector &a)
421 {
422 simdscalar tmp0 = _mm256_unpackhi_ps(a.x, a.z);
423 simdscalar tmp1 = _mm256_unpackhi_ps(a.y, a.w);
424 return _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 1);
425 }
426
427 INLINE __m128 swizzleLaneN(const simdvector &a, int lane)
428 {
429 switch (lane) {
430 case 0:
431 return swizzleLane0(a);
432 case 1:
433 return swizzleLane1(a);
434 case 2:
435 return swizzleLane2(a);
436 case 3:
437 return swizzleLane3(a);
438 case 4:
439 return swizzleLane4(a);
440 case 5:
441 return swizzleLane5(a);
442 case 6:
443 return swizzleLane6(a);
444 case 7:
445 return swizzleLane7(a);
446 default:
447 return _mm_setzero_ps();
448 }
449 }
450
451 // Cut-aware primitive assembler.
452 struct PA_STATE_CUT : public PA_STATE
453 {
    SIMDMASK* pCutIndices{ nullptr };   // cut indices buffer, 1 bit per vertex
    uint32_t numVerts{ 0 };             // number of vertices available in buffer store
    uint32_t numAttribs{ 0 };           // number of attributes
    int32_t numRemainingVerts{ 0 };     // number of verts remaining to be assembled
    uint32_t numVertsToAssemble{ 0 };   // total number of verts to assemble for the draw
#if ENABLE_AVX512_SIMD16
    OSALIGNSIMD16(uint32_t) indices[MAX_NUM_VERTS_PER_PRIM][SIMD_WIDTH];    // current index buffer for gather
#else
    OSALIGNSIMD(uint32_t) indices[MAX_NUM_VERTS_PER_PRIM][SIMD_WIDTH];      // current index buffer for gather
#endif
    SIMDSCALARI vOffsets[MAX_NUM_VERTS_PER_PRIM];   // byte offsets for currently assembling simd
    uint32_t numPrimsAssembled{ 0 };    // number of primitives that are fully assembled
    uint32_t headVertex{ 0 };           // current unused vertex slot in vertex buffer store
    uint32_t tailVertex{ 0 };           // beginning vertex currently assembling
    uint32_t curVertex{ 0 };            // current unprocessed vertex
    uint32_t startPrimId{ 0 };          // starting prim id
    SIMDSCALARI vPrimId;                // vector of prim ID
    bool needOffsets{ false };          // need to compute gather offsets for current SIMD
    uint32_t vertsPerPrim{ 0 };         // verts consumed per primitive for the bound topology
    SIMDVERTEX tmpVertex;               // temporary simdvertex for unimplemented API
    bool processCutVerts{ false };      // vertex indices with cuts should be processed as normal, otherwise they
                                        // are ignored. Fetch shader sends invalid verts on cuts that should be ignored
                                        // while the GS sends valid verts for every index

    // Topology state tracking
    uint32_t vert[MAX_NUM_VERTS_PER_PRIM];  // sliding window of vertex indices for the prim being built
    uint32_t curIndex{ 0 };                 // next write position within vert[]
    bool reverseWinding{ false };           // indicates reverse winding for strips
    int32_t adjExtraVert{ 0 };              // extra vert uses for tristrip w/ adj; -1 when none pending

    // per-topology member function that processes a single vert
    typedef void(PA_STATE_CUT::* PFN_PA_FUNC)(uint32_t vert, bool finish);
    PFN_PA_FUNC pfnPa{ nullptr };
485
    PA_STATE_CUT() {}

    // Binds the vertex store / cut-index buffer and selects the per-vertex
    // processing function for the given topology.
    PA_STATE_CUT(DRAW_CONTEXT* pDC, uint8_t* in_pStream, uint32_t in_streamSizeInVerts, SIMDMASK* in_pIndices, uint32_t in_numVerts,
        uint32_t in_numAttribs, PRIMITIVE_TOPOLOGY topo, bool in_processCutVerts)
        : PA_STATE(pDC, in_pStream, in_streamSizeInVerts)
    {
        numVerts = in_streamSizeInVerts;
        numAttribs = in_numAttribs;
        binTopology = topo;
        needOffsets = false;
        processCutVerts = in_processCutVerts;

        numVertsToAssemble = numRemainingVerts = in_numVerts;
        numPrimsAssembled = 0;
        headVertex = tailVertex = curVertex = 0;

        curIndex = 0;
        pCutIndices = in_pIndices;
        memset(indices, 0, sizeof(indices));
        // seed the per-lane prim IDs with 0..SIMD_WIDTH-1
#if USE_SIMD16_FRONTEND
        vPrimId = _simd16_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
#else
        vPrimId = _simd_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
#endif
        reverseWinding = false;
        adjExtraVert = -1;  // -1 == no deferred tri-strip-adj vertex pending

        // verts per prim depends on both the topology and whether a GS is bound
        bool gsEnabled = pDC->pState->state.gsState.gsEnable;
        vertsPerPrim = NumVertsPerPrim(topo, gsEnabled);

        // bind the per-vertex state machine for this topology
        switch (topo)
        {
        case TOP_TRIANGLE_LIST: pfnPa = &PA_STATE_CUT::ProcessVertTriList; break;
        case TOP_TRI_LIST_ADJ: pfnPa = gsEnabled ? &PA_STATE_CUT::ProcessVertTriListAdj : &PA_STATE_CUT::ProcessVertTriListAdjNoGs; break;
        case TOP_TRIANGLE_STRIP: pfnPa = &PA_STATE_CUT::ProcessVertTriStrip; break;
        case TOP_TRI_STRIP_ADJ: if (gsEnabled)
        {
            pfnPa = &PA_STATE_CUT::ProcessVertTriStripAdj < true > ;
        }
        else
        {
            pfnPa = &PA_STATE_CUT::ProcessVertTriStripAdj < false > ;
        }
        break;

        case TOP_POINT_LIST: pfnPa = &PA_STATE_CUT::ProcessVertPointList; break;
        case TOP_LINE_LIST: pfnPa = &PA_STATE_CUT::ProcessVertLineList; break;
        case TOP_LINE_LIST_ADJ: pfnPa = gsEnabled ? &PA_STATE_CUT::ProcessVertLineListAdj : &PA_STATE_CUT::ProcessVertLineListAdjNoGs; break;
        case TOP_LINE_STRIP: pfnPa = &PA_STATE_CUT::ProcessVertLineStrip; break;
        case TOP_LISTSTRIP_ADJ: pfnPa = gsEnabled ? &PA_STATE_CUT::ProcessVertLineStripAdj : &PA_STATE_CUT::ProcessVertLineStripAdjNoGs; break;
        default: assert(0 && "Unimplemented topology");
        }
    }
538
539 SIMDVERTEX& GetNextVsOutput()
540 {
541 uint32_t vertexIndex = this->headVertex / SIMD_WIDTH;
542 this->headVertex = (this->headVertex + SIMD_WIDTH) % this->numVerts;
543 this->needOffsets = true;
544 return ((SIMDVERTEX*)pStreamBase)[vertexIndex];
545 }
546
547 SIMDMASK& GetNextVsIndices()
548 {
549 uint32_t vertexIndex = this->headVertex / SIMD_WIDTH;
550 SIMDMASK* pCurCutIndex = this->pCutIndices + vertexIndex;
551 return *pCurCutIndex;
552 }
553
    // Not used by the cut-aware PA; asserts and returns a static dummy so the
    // interface contract (returning a reference) is still satisfied.
    simdvector& GetSimdVector(uint32_t index, uint32_t slot)
    {
        // unused
        SWR_ASSERT(0 && "Not implemented");
        static simdvector junk;
        return junk;
    }
561
#if ENABLE_AVX512_SIMD16
    // Not used by the cut-aware PA; asserts and returns a static dummy so the
    // interface contract (returning a reference) is still satisfied.
    simd16vector& GetSimdVector_simd16(uint32_t index, uint32_t slot)
    {
        // unused
        SWR_ASSERT(0 && "Not implemented");
        static simd16vector junk;
        return junk;
    }

#endif
572 bool GetNextStreamOutput()
573 {
574 this->headVertex += SIMD_WIDTH;
575 this->needOffsets = true;
576 return HasWork();
577 }
578
579 SIMDSCALARI GetPrimID(uint32_t startID)
580 {
581 #if USE_SIMD16_FRONTEND
582 return _simd16_add_epi32(_simd16_set1_epi32(startID), this->vPrimId);
583 #else
584 return _simd_add_epi32(_simd_set1_epi32(startID), this->vPrimId);
585 #endif
586 }
587
588 void Reset()
589 {
590 #if ENABLE_AVX512_SIMD16
591 useAlternateOffset = false;
592
593 #endif
594 this->numRemainingVerts = this->numVertsToAssemble;
595 this->numPrimsAssembled = 0;
596 this->curIndex = 0;
597 this->curVertex = 0;
598 this->tailVertex = 0;
599 this->headVertex = 0;
600 this->reverseWinding = false;
601 this->adjExtraVert = -1;
602 #if USE_SIMD16_FRONTEND
603 this->vPrimId = _simd16_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
604 #else
605 this->vPrimId = _simd_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
606 #endif
607 }
608
609 bool HasWork()
610 {
611 return this->numRemainingVerts > 0 || this->adjExtraVert != -1;
612 }
613
614 bool IsVertexStoreFull()
615 {
616 return ((this->headVertex + SIMD_WIDTH) % this->numVerts) == this->tailVertex;
617 }
618
619 void RestartTopology()
620 {
621 this->curIndex = 0;
622 this->reverseWinding = false;
623 this->adjExtraVert = -1;
624 }
625
626 bool IsCutIndex(uint32_t vertex)
627 {
628 uint32_t vertexIndex = vertex / SIMD_WIDTH;
629 uint32_t vertexOffset = vertex & (SIMD_WIDTH - 1);
630 return _bittest((const LONG*)&this->pCutIndices[vertexIndex], vertexOffset) == 1;
631 }
632
    // iterates across the unprocessed verts until we hit the end or we
    // have assembled SIMD prims
    void ProcessVerts()
    {
        while (this->numPrimsAssembled != SIMD_WIDTH &&
            this->numRemainingVerts > 0 &&
            this->curVertex != this->headVertex)
        {
            // if cut index, restart topology
            if (IsCutIndex(this->curVertex))
            {
                if (this->processCutVerts)
                {
                    // cut verts still carry valid data on this path; process
                    // the vert before restarting (see processCutVerts comment)
                    (this->*pfnPa)(this->curVertex, false);
                }
                // finish off tri strip w/ adj before restarting topo
                if (this->adjExtraVert != -1)
                {
                    (this->*pfnPa)(this->curVertex, true);
                }
                RestartTopology();
            }
            else
            {
                (this->*pfnPa)(this->curVertex, false);
            }

            // advance through the ring buffer, wrapping at numVerts
            this->curVertex++;
            if (this->curVertex >= this->numVerts) {
                this->curVertex = 0;
            }
            this->numRemainingVerts--;
        }

        // special case last primitive for tri strip w/ adj
        if (this->numPrimsAssembled != SIMD_WIDTH && this->numRemainingVerts == 0 && this->adjExtraVert != -1)
        {
            (this->*pfnPa)(this->curVertex, true);
        }
    }
673
674 void Advance()
675 {
676 // done with current batch
677 // advance tail to the current unsubmitted vertex
678 this->tailVertex = this->curVertex;
679 this->numPrimsAssembled = 0;
680 #if USE_SIMD16_FRONTEND
681 this->vPrimId = _simd16_add_epi32(vPrimId, _simd16_set1_epi32(SIMD_WIDTH));
682 #else
683 this->vPrimId = _simd_add_epi32(vPrimId, _simd_set1_epi32(SIMD_WIDTH));
684 #endif
685 }
686
687 bool NextPrim()
688 {
689 // if we've assembled enough prims, we can advance to the next set of verts
690 if (this->numPrimsAssembled == SIMD_WIDTH || this->numRemainingVerts <= 0)
691 {
692 Advance();
693 }
694 return false;
695 }
696
    // Converts the gathered vertex indices in indices[][] into byte offsets
    // (vOffsets) into the SOA vertex store, used by the gathers in Assemble.
    void ComputeOffsets()
    {
        for (uint32_t v = 0; v < this->vertsPerPrim; ++v)
        {
            SIMDSCALARI vIndices = *(SIMDSCALARI*)&this->indices[v][0];

            // step to simdvertex batch: batch = index >> SIMD_WIDTH_LOG2,
            // byte offset = batch * sizeof(SIMDVERTEX)
            const uint32_t simdShift = SIMD_WIDTH_LOG2;
#if USE_SIMD16_FRONTEND
            SIMDSCALARI vVertexBatch = _simd16_srai_epi32(vIndices, simdShift);
            this->vOffsets[v] = _simd16_mullo_epi32(vVertexBatch, _simd16_set1_epi32(sizeof(SIMDVERTEX)));
#else
            SIMDSCALARI vVertexBatch = _simd_srai_epi32(vIndices, simdShift);
            this->vOffsets[v] = _simd_mullo_epi32(vVertexBatch, _simd_set1_epi32(sizeof(SIMDVERTEX)));
#endif

            // step to index: lane-within-batch * sizeof(float) into the SOA data
            const uint32_t simdMask = SIMD_WIDTH - 1;
#if USE_SIMD16_FRONTEND
            SIMDSCALARI vVertexIndex = _simd16_and_si(vIndices, _simd16_set1_epi32(simdMask));
            this->vOffsets[v] = _simd16_add_epi32(this->vOffsets[v], _simd16_mullo_epi32(vVertexIndex, _simd16_set1_epi32(sizeof(float))));
#else
            SIMDSCALARI vVertexIndex = _simd_and_si(vIndices, _simd_set1_epi32(simdMask));
            this->vOffsets[v] = _simd_add_epi32(this->vOffsets[v], _simd_mullo_epi32(vVertexIndex, _simd_set1_epi32(sizeof(float))));
#endif
        }
    }
724
    // Gathers one attribute slot for the assembled primitives into SOA
    // simdvectors. Returns false until a full SIMD batch is assembled (or the
    // draw has run out of verts).
    bool Assemble(uint32_t slot, simdvector verts[])
    {
        // process any outstanding verts
        ProcessVerts();

        // return false if we don't have enough prims assembled
        if (this->numPrimsAssembled != SIMD_WIDTH && this->numRemainingVerts > 0)
        {
            return false;
        }

        // cache off gather offsets given the current SIMD set of indices the first time we get an assemble
        if (this->needOffsets)
        {
            ComputeOffsets();
            this->needOffsets = false;
        }

        for (uint32_t v = 0; v < this->vertsPerPrim; ++v)
        {
            SIMDSCALARI offsets = this->vOffsets[v];

            // step to attribute
#if USE_SIMD16_FRONTEND
            offsets = _simd16_add_epi32(offsets, _simd16_set1_epi32(slot * sizeof(SIMDVECTOR)));
#else
            offsets = _simd_add_epi32(offsets, _simd_set1_epi32(slot * sizeof(SIMDVECTOR)));
#endif

            float* pBase = (float*)this->pStreamBase;
            for (uint32_t c = 0; c < 4; ++c)
            {
#if USE_SIMD16_FRONTEND
                simd16scalar temp = _simd16_i32gather_ps(pBase, offsets, 1);

                // caller selects which half of the simd16 gather to consume
                verts[v].v[c] = useAlternateOffset ? temp.hi : temp.lo;
#else
                verts[v].v[c] = _simd_i32gather_ps(pBase, offsets, 1);
#endif

                // move base to next component
                pBase += SIMD_WIDTH;
            }
        }

        return true;
    }
772
#if ENABLE_AVX512_SIMD16
    // simd16 variant of Assemble(): same flow, but the output vectors are
    // 16 wide. Without the simd16 frontend, only the low half is populated.
    bool Assemble_simd16(uint32_t slot, simd16vector verts[])
    {
        // process any outstanding verts
        ProcessVerts();

        // return false if we don't have enough prims assembled
        if (this->numPrimsAssembled != SIMD_WIDTH && this->numRemainingVerts > 0)
        {
            return false;
        }

        // cache off gather offsets given the current SIMD set of indices the first time we get an assemble
        if (this->needOffsets)
        {
            ComputeOffsets();
            this->needOffsets = false;
        }

        for (uint32_t v = 0; v < this->vertsPerPrim; ++v)
        {
            SIMDSCALARI offsets = this->vOffsets[v];

            // step to attribute
#if USE_SIMD16_FRONTEND
            offsets = _simd16_add_epi32(offsets, _simd16_set1_epi32(slot * sizeof(SIMDVECTOR)));
#else
            offsets = _simd_add_epi32(offsets, _simd_set1_epi32(slot * sizeof(simdvector)));
#endif

            float* pBase = (float*)this->pStreamBase;
            for (uint32_t c = 0; c < 4; ++c)
            {
#if USE_SIMD16_FRONTEND
                verts[v].v[c] = _simd16_i32gather_ps(pBase, offsets, 1);
#else
                // only 8 prims are available; zero the upper half
                verts[v].v[c].lo = _simd_i32gather_ps(pBase, offsets, 1);
                verts[v].v[c].hi = _simd_setzero_ps();
#endif

                // move base to next component
                pBase += SIMD_WIDTH;
            }
        }

        return true;
    }

#endif
    // Copies a single primitive's attribute slot out of the vertex store into
    // AOS form: one xyzw __m128 per vertex of the primitive.
    void AssembleSingle(uint32_t slot, uint32_t triIndex, __m128 tri[3])
    {
        // move to slot
        for (uint32_t v = 0; v < this->vertsPerPrim; ++v)
        {
            uint32_t* pOffset = (uint32_t*)&this->vOffsets[v];
#if USE_SIMD16_FRONTEND
            // alternate offset selects lanes from the upper half of the simd16 batch
            uint32_t offset = useAlternateOffset ? pOffset[triIndex + SIMD_WIDTH_DIV2] : pOffset[triIndex];
#else
            uint32_t offset = pOffset[triIndex];
#endif
            offset += sizeof(SIMDVECTOR) * slot;
            float* pVert = (float*)&tri[v];
            for (uint32_t c = 0; c < 4; ++c)
            {
                // components of one attribute are SIMD_WIDTH floats apart (SOA layout)
                float* pComponent = (float*)(this->pStreamBase + offset);
                pVert[c] = *pComponent;
                offset += SIMD_WIDTH * sizeof(float);
            }
        }
    }
843
844 uint32_t NumPrims()
845 {
846 return this->numPrimsAssembled;
847 }
848
849 // Per-topology functions
850 void ProcessVertTriStrip(uint32_t index, bool finish)
851 {
852 this->vert[this->curIndex] = index;
853 this->curIndex++;
854 if (this->curIndex == 3)
855 {
856 // assembled enough verts for prim, add to gather indices
857 this->indices[0][this->numPrimsAssembled] = this->vert[0];
858 if (reverseWinding)
859 {
860 this->indices[1][this->numPrimsAssembled] = this->vert[2];
861 this->indices[2][this->numPrimsAssembled] = this->vert[1];
862 }
863 else
864 {
865 this->indices[1][this->numPrimsAssembled] = this->vert[1];
866 this->indices[2][this->numPrimsAssembled] = this->vert[2];
867 }
868
869 // increment numPrimsAssembled
870 this->numPrimsAssembled++;
871
872 // set up next prim state
873 this->vert[0] = this->vert[1];
874 this->vert[1] = this->vert[2];
875 this->curIndex = 2;
876 this->reverseWinding ^= 1;
877 }
878 }
879
    // Emits one tri-strip-adj primitive from the current vert[] window.
    // With a GS bound all 6 indices go out; without one, only the interior
    // triangle (3 indices) is emitted and the window is adjusted around it.
    template<bool gsEnabled>
    void AssembleTriStripAdj()
    {
        if (!gsEnabled)
        {
            // collapse the window so slots 0-2 hold the interior triangle
            this->vert[1] = this->vert[2];
            this->vert[2] = this->vert[4];

            this->indices[0][this->numPrimsAssembled] = this->vert[0];
            this->indices[1][this->numPrimsAssembled] = this->vert[1];
            this->indices[2][this->numPrimsAssembled] = this->vert[2];

            // restore the window layout after emitting
            this->vert[4] = this->vert[2];
            this->vert[2] = this->vert[1];
        }
        else
        {
            this->indices[0][this->numPrimsAssembled] = this->vert[0];
            this->indices[1][this->numPrimsAssembled] = this->vert[1];
            this->indices[2][this->numPrimsAssembled] = this->vert[2];
            this->indices[3][this->numPrimsAssembled] = this->vert[3];
            this->indices[4][this->numPrimsAssembled] = this->vert[4];
            this->indices[5][this->numPrimsAssembled] = this->vert[5];
        }
        this->numPrimsAssembled++;
    }
906
907
908 template<bool gsEnabled>
909 void ProcessVertTriStripAdj(uint32_t index, bool finish)
910 {
911 // handle last primitive of tristrip
912 if (finish && this->adjExtraVert != -1)
913 {
914 this->vert[3] = this->adjExtraVert;
915 AssembleTriStripAdj<gsEnabled>();
916 this->adjExtraVert = -1;
917 return;
918 }
919
920 switch (this->curIndex)
921 {
922 case 0:
923 case 1:
924 case 2:
925 case 4:
926 this->vert[this->curIndex] = index;
927 this->curIndex++;
928 break;
929 case 3:
930 this->vert[5] = index;
931 this->curIndex++;
932 break;
933 case 5:
934 if (this->adjExtraVert == -1)
935 {
936 this->adjExtraVert = index;
937 }
938 else
939 {
940 this->vert[3] = index;
941 if (!gsEnabled)
942 {
943 AssembleTriStripAdj<gsEnabled>();
944
945 uint32_t nextTri[6];
946 if (this->reverseWinding)
947 {
948 nextTri[0] = this->vert[4];
949 nextTri[1] = this->vert[0];
950 nextTri[2] = this->vert[2];
951 nextTri[4] = this->vert[3];
952 nextTri[5] = this->adjExtraVert;
953 }
954 else
955 {
956 nextTri[0] = this->vert[2];
957 nextTri[1] = this->adjExtraVert;
958 nextTri[2] = this->vert[3];
959 nextTri[4] = this->vert[4];
960 nextTri[5] = this->vert[0];
961 }
962 for (uint32_t i = 0; i < 6; ++i)
963 {
964 this->vert[i] = nextTri[i];
965 }
966
967 this->adjExtraVert = -1;
968 this->reverseWinding ^= 1;
969 }
970 else
971 {
972 this->curIndex++;
973 }
974 }
975 break;
976 case 6:
977 SWR_ASSERT(this->adjExtraVert != -1, "Algorith failure!");
978 AssembleTriStripAdj<gsEnabled>();
979
980 uint32_t nextTri[6];
981 if (this->reverseWinding)
982 {
983 nextTri[0] = this->vert[4];
984 nextTri[1] = this->vert[0];
985 nextTri[2] = this->vert[2];
986 nextTri[4] = this->vert[3];
987 nextTri[5] = this->adjExtraVert;
988 }
989 else
990 {
991 nextTri[0] = this->vert[2];
992 nextTri[1] = this->adjExtraVert;
993 nextTri[2] = this->vert[3];
994 nextTri[4] = this->vert[4];
995 nextTri[5] = this->vert[0];
996 }
997 for (uint32_t i = 0; i < 6; ++i)
998 {
999 this->vert[i] = nextTri[i];
1000 }
1001 this->reverseWinding ^= 1;
1002 this->adjExtraVert = index;
1003 this->curIndex--;
1004 break;
1005 }
1006 }
1007
1008 void ProcessVertTriList(uint32_t index, bool finish)
1009 {
1010 this->vert[this->curIndex] = index;
1011 this->curIndex++;
1012 if (this->curIndex == 3)
1013 {
1014 // assembled enough verts for prim, add to gather indices
1015 this->indices[0][this->numPrimsAssembled] = this->vert[0];
1016 this->indices[1][this->numPrimsAssembled] = this->vert[1];
1017 this->indices[2][this->numPrimsAssembled] = this->vert[2];
1018
1019 // increment numPrimsAssembled
1020 this->numPrimsAssembled++;
1021
1022 // set up next prim state
1023 this->curIndex = 0;
1024 }
1025 }
1026
1027 void ProcessVertTriListAdj(uint32_t index, bool finish)
1028 {
1029 this->vert[this->curIndex] = index;
1030 this->curIndex++;
1031 if (this->curIndex == 6)
1032 {
1033 // assembled enough verts for prim, add to gather indices
1034 this->indices[0][this->numPrimsAssembled] = this->vert[0];
1035 this->indices[1][this->numPrimsAssembled] = this->vert[1];
1036 this->indices[2][this->numPrimsAssembled] = this->vert[2];
1037 this->indices[3][this->numPrimsAssembled] = this->vert[3];
1038 this->indices[4][this->numPrimsAssembled] = this->vert[4];
1039 this->indices[5][this->numPrimsAssembled] = this->vert[5];
1040
1041 // increment numPrimsAssembled
1042 this->numPrimsAssembled++;
1043
1044 // set up next prim state
1045 this->curIndex = 0;
1046 }
1047 }
1048
1049 void ProcessVertTriListAdjNoGs(uint32_t index, bool finish)
1050 {
1051 this->vert[this->curIndex] = index;
1052 this->curIndex++;
1053 if (this->curIndex == 6)
1054 {
1055 // assembled enough verts for prim, add to gather indices
1056 this->indices[0][this->numPrimsAssembled] = this->vert[0];
1057 this->indices[1][this->numPrimsAssembled] = this->vert[2];
1058 this->indices[2][this->numPrimsAssembled] = this->vert[4];
1059
1060 // increment numPrimsAssembled
1061 this->numPrimsAssembled++;
1062
1063 // set up next prim state
1064 this->curIndex = 0;
1065 }
1066 }
1067
1068
1069 void ProcessVertLineList(uint32_t index, bool finish)
1070 {
1071 this->vert[this->curIndex] = index;
1072 this->curIndex++;
1073 if (this->curIndex == 2)
1074 {
1075 this->indices[0][this->numPrimsAssembled] = this->vert[0];
1076 this->indices[1][this->numPrimsAssembled] = this->vert[1];
1077
1078 this->numPrimsAssembled++;
1079 this->curIndex = 0;
1080 }
1081 }
1082
1083 void ProcessVertLineStrip(uint32_t index, bool finish)
1084 {
1085 this->vert[this->curIndex] = index;
1086 this->curIndex++;
1087 if (this->curIndex == 2)
1088 {
1089 // assembled enough verts for prim, add to gather indices
1090 this->indices[0][this->numPrimsAssembled] = this->vert[0];
1091 this->indices[1][this->numPrimsAssembled] = this->vert[1];
1092
1093 // increment numPrimsAssembled
1094 this->numPrimsAssembled++;
1095
1096 // set up next prim state
1097 this->vert[0] = this->vert[1];
1098 this->curIndex = 1;
1099 }
1100 }
1101
1102 void ProcessVertLineStripAdj(uint32_t index, bool finish)
1103 {
1104 this->vert[this->curIndex] = index;
1105 this->curIndex++;
1106 if (this->curIndex == 4)
1107 {
1108 // assembled enough verts for prim, add to gather indices
1109 this->indices[0][this->numPrimsAssembled] = this->vert[0];
1110 this->indices[1][this->numPrimsAssembled] = this->vert[1];
1111 this->indices[2][this->numPrimsAssembled] = this->vert[2];
1112 this->indices[3][this->numPrimsAssembled] = this->vert[3];
1113
1114 // increment numPrimsAssembled
1115 this->numPrimsAssembled++;
1116
1117 // set up next prim state
1118 this->vert[0] = this->vert[1];
1119 this->vert[1] = this->vert[2];
1120 this->vert[2] = this->vert[3];
1121 this->curIndex = 3;
1122 }
1123 }
1124
1125 void ProcessVertLineStripAdjNoGs(uint32_t index, bool finish)
1126 {
1127 this->vert[this->curIndex] = index;
1128 this->curIndex++;
1129 if (this->curIndex == 4)
1130 {
1131 // assembled enough verts for prim, add to gather indices
1132 this->indices[0][this->numPrimsAssembled] = this->vert[1];
1133 this->indices[1][this->numPrimsAssembled] = this->vert[2];
1134
1135 // increment numPrimsAssembled
1136 this->numPrimsAssembled++;
1137
1138 // set up next prim state
1139 this->vert[0] = this->vert[1];
1140 this->vert[1] = this->vert[2];
1141 this->vert[2] = this->vert[3];
1142 this->curIndex = 3;
1143 }
1144 }
1145
1146 void ProcessVertLineListAdj(uint32_t index, bool finish)
1147 {
1148 this->vert[this->curIndex] = index;
1149 this->curIndex++;
1150 if (this->curIndex == 4)
1151 {
1152 this->indices[0][this->numPrimsAssembled] = this->vert[0];
1153 this->indices[1][this->numPrimsAssembled] = this->vert[1];
1154 this->indices[2][this->numPrimsAssembled] = this->vert[2];
1155 this->indices[3][this->numPrimsAssembled] = this->vert[3];
1156
1157 this->numPrimsAssembled++;
1158 this->curIndex = 0;
1159 }
1160 }
1161
1162 void ProcessVertLineListAdjNoGs(uint32_t index, bool finish)
1163 {
1164 this->vert[this->curIndex] = index;
1165 this->curIndex++;
1166 if (this->curIndex == 4)
1167 {
1168 this->indices[0][this->numPrimsAssembled] = this->vert[1];
1169 this->indices[1][this->numPrimsAssembled] = this->vert[2];
1170
1171 this->numPrimsAssembled++;
1172 this->curIndex = 0;
1173 }
1174 }
1175
1176 void ProcessVertPointList(uint32_t index, bool finish)
1177 {
1178 this->vert[this->curIndex] = index;
1179 this->curIndex++;
1180 if (this->curIndex == 1)
1181 {
1182 this->indices[0][this->numPrimsAssembled] = this->vert[0];
1183 this->numPrimsAssembled++;
1184 this->curIndex = 0;
1185 }
1186 }
1187 };
1188
1189 // Primitive Assembly for data output from the DomainShader.
struct PA_TESS : PA_STATE
{
    /// @brief Construct a tessellation PA over pre-computed DS output.
    ///
    /// Vertex data is laid out attribute-major: each attribute slot occupies
    /// 4 components, and each component spans m_attributeStrideInVectors SIMD
    /// vectors (see the pointer math in Assemble/AssembleSingle).
    ///
    /// @param in_pDC                       owning draw context, forwarded to PA_STATE.
    /// @param in_pVertData                 SoA SIMD vertex data produced by the domain shader.
    /// @param in_attributeStrideInVectors  stride, in SIMD vectors, between components of one attribute.
    /// @param in_numAttributes             number of attribute slots in the vertex data.
    /// @param in_ppIndices                 three per-vertex index streams (one per vertex of a prim;
    ///                                     at most 3 verts per prim for point/line/tri topologies).
    /// @param in_numPrims                  total number of primitives to assemble.
    /// @param in_binTopology               output topology; must be point, line, or triangle list.
    PA_TESS(
        DRAW_CONTEXT *in_pDC,
        const SIMDSCALAR* in_pVertData,
        uint32_t in_attributeStrideInVectors,
        uint32_t in_numAttributes,
        uint32_t* (&in_ppIndices)[3],
        uint32_t in_numPrims,
        PRIMITIVE_TOPOLOGY in_binTopology) :

        PA_STATE(in_pDC, nullptr, 0),
        m_pVertexData(in_pVertData),
        m_attributeStrideInVectors(in_attributeStrideInVectors),
        m_numAttributes(in_numAttributes),
        m_numPrims(in_numPrims)
    {
#if USE_SIMD16_FRONTEND
        m_vPrimId = _simd16_setzero_si();
#else
        m_vPrimId = _simd_setzero_si();
#endif
        binTopology = in_binTopology;
        m_ppIndices[0] = in_ppIndices[0];
        m_ppIndices[1] = in_ppIndices[1];
        m_ppIndices[2] = in_ppIndices[2];

        // Derive vertex count per primitive from the bin topology; only the
        // three list topologies are legal coming out of tessellation here.
        switch (binTopology)
        {
        case TOP_POINT_LIST:
            m_numVertsPerPrim = 1;
            break;

        case TOP_LINE_LIST:
            m_numVertsPerPrim = 2;
            break;

        case TOP_TRIANGLE_LIST:
            m_numVertsPerPrim = 3;
            break;

        default:
            SWR_ASSERT(0, "Invalid binTopology (%d) for %s", binTopology, __FUNCTION__);
            break;
        }
    }

    // True while primitives remain to be assembled.
    bool HasWork()
    {
        return m_numPrims != 0;
    }

    // Not meaningful for the tessellation PA; asserts and returns a dummy.
    simdvector& GetSimdVector(uint32_t index, uint32_t slot)
    {
        SWR_ASSERT(0, "%s NOT IMPLEMENTED", __FUNCTION__);
        static simdvector junk;
        return junk;
    }

#if ENABLE_AVX512_SIMD16
    // Not meaningful for the tessellation PA; asserts and returns a dummy.
    simd16vector& GetSimdVector_simd16(uint32_t index, uint32_t slot)
    {
        SWR_ASSERT(0, "%s NOT IMPLEMENTED", __FUNCTION__);
        static simd16vector junk;
        return junk;
    }

#endif
    // Build a per-lane gather mask with -1 in the first numPrims lanes and 0
    // in the rest, by loading a sliding window from a constant -1/0 table.
    static SIMDSCALARI GenPrimMask(uint32_t numPrims)
    {
        SWR_ASSERT(numPrims <= SIMD_WIDTH);
#if USE_SIMD16_FRONTEND
        static const OSALIGNLINE(int32_t) maskGen[SIMD_WIDTH * 2] =
        {
            -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
        };

        return _simd16_loadu_si((const SIMDSCALARI*)&maskGen[SIMD_WIDTH - numPrims]);
#else
        static const OSALIGNLINE(int32_t) maskGen[SIMD_WIDTH * 2] =
        {
            -1, -1, -1, -1, -1, -1, -1, -1,
            0, 0, 0, 0, 0, 0, 0, 0
        };

        return _simd_loadu_si((const SIMDSCALARI*)&maskGen[SIMD_WIDTH - numPrims]);
#endif
    }

    // Gather one attribute slot for every vertex of the current batch of
    // prims into 'verts' (one simdvector per vertex of the prim). Lanes
    // beyond the live prim count are masked off. Returns false when no
    // prims remain.
    bool Assemble(uint32_t slot, simdvector verts[])
    {
        SWR_ASSERT(slot < m_numAttributes);

        uint32_t numPrimsToAssemble = PA_TESS::NumPrims();
        if (0 == numPrimsToAssemble)
        {
            return false;
        }

        SIMDSCALARI mask = GenPrimMask(numPrimsToAssemble);

        // Base of this attribute slot: 4 components per slot, each a run of
        // m_attributeStrideInVectors SIMD vectors.
        const float* pBaseAttrib = (const float*)&m_pVertexData[slot * m_attributeStrideInVectors * 4];
        for (uint32_t i = 0; i < m_numVertsPerPrim; ++i)
        {
#if USE_SIMD16_FRONTEND
            SIMDSCALARI indices = _simd16_load_si((const SIMDSCALARI*)m_ppIndices[i]);
#else
            SIMDSCALARI indices = _simd_load_si((const SIMDSCALARI*)m_ppIndices[i]);
#endif

            const float* pBase = pBaseAttrib;
            for (uint32_t c = 0; c < 4; ++c)
            {
#if USE_SIMD16_FRONTEND
                simd16scalar temp = _simd16_mask_i32gather_ps(
                    _simd16_setzero_ps(),
                    pBase,
                    indices,
                    mask,
                    4 /* gcc doesn't like sizeof(float) */);

                // simd16 gathers both halves; pick the half this simd8 pass wants.
                verts[i].v[c] = useAlternateOffset ? temp.hi : temp.lo;
#else
                verts[i].v[c] = _simd_mask_i32gather_ps(
                    _simd_setzero_ps(),
                    pBase,
                    indices,
                    _simd_castsi_ps(mask),
                    4 /* gcc doesn't like sizeof(float) */);
#endif
                pBase += m_attributeStrideInVectors * SIMD_WIDTH;
            }
        }

        return true;
    }

#if ENABLE_AVX512_SIMD16
    // simd16 variant of Assemble: same gather, but fills simd16vectors.
    // Without the simd16 frontend only the low half is populated; the high
    // half is zeroed.
    bool Assemble_simd16(uint32_t slot, simd16vector verts[])
    {
        SWR_ASSERT(slot < m_numAttributes);

        uint32_t numPrimsToAssemble = PA_TESS::NumPrims();
        if (0 == numPrimsToAssemble)
        {
            return false;
        }

        SIMDSCALARI mask = GenPrimMask(numPrimsToAssemble);

        const float* pBaseAttrib = (const float*)&m_pVertexData[slot * m_attributeStrideInVectors * 4];
        for (uint32_t i = 0; i < m_numVertsPerPrim; ++i)
        {
#if USE_SIMD16_FRONTEND
            SIMDSCALARI indices = _simd16_load_si((const SIMDSCALARI*)m_ppIndices[i]);
#else
            SIMDSCALARI indices = _simd_load_si((const SIMDSCALARI*)m_ppIndices[i]);
#endif

            const float* pBase = pBaseAttrib;
            for (uint32_t c = 0; c < 4; ++c)
            {
#if USE_SIMD16_FRONTEND
                verts[i].v[c] = _simd16_mask_i32gather_ps(
                    _simd16_setzero_ps(),
                    pBase,
                    indices,
                    mask,
                    4 /* gcc doesn't like sizeof(float) */);
#else
                verts[i].v[c].lo = _simd_mask_i32gather_ps(
                    _simd_setzero_ps(),
                    pBase,
                    indices,
                    _simd_castsi_ps(mask),
                    4 /* gcc doesn't like sizeof(float) */);
                verts[i].v[c].hi = _simd_setzero_ps();
#endif
                pBase += m_attributeStrideInVectors * SIMD_WIDTH;
            }
        }

        return true;
    }

#endif
    // Scalar fetch of one attribute slot for a single primitive (all of its
    // vertices), writing 4 floats per vertex into 'verts'.
    void AssembleSingle(uint32_t slot, uint32_t primIndex, __m128 verts[])
    {
        SWR_ASSERT(slot < m_numAttributes);
        SWR_ASSERT(primIndex < PA_TESS::NumPrims());

        const float* pVertDataBase = (const float*)&m_pVertexData[slot * m_attributeStrideInVectors * 4];
        for (uint32_t i = 0; i < m_numVertsPerPrim; ++i)
        {
#if USE_SIMD16_FRONTEND
            // With the simd16 frontend, the second simd8 pass reads indices
            // from the upper half of the simd16 lane range.
            uint32_t index = useAlternateOffset ? m_ppIndices[i][primIndex + SIMD_WIDTH_DIV2] : m_ppIndices[i][primIndex];
#else
            uint32_t index = m_ppIndices[i][primIndex];
#endif
            const float* pVertData = pVertDataBase;
            float* pVert = (float*)&verts[i];

            for (uint32_t c = 0; c < 4; ++c)
            {
                pVert[c] = pVertData[index];
                pVertData += m_attributeStrideInVectors * SIMD_WIDTH;
            }
        }
    }

    // Consume the current batch: decrement the remaining prim count and
    // advance all three index streams. Returns true while work remains.
    bool NextPrim()
    {
        uint32_t numPrims = PA_TESS::NumPrims();
        m_numPrims -= numPrims;
        m_ppIndices[0] += numPrims;
        m_ppIndices[1] += numPrims;
        m_ppIndices[2] += numPrims;

        return HasWork();
    }

    // Not applicable to the tessellation PA; asserts and returns a dummy.
    SIMDVERTEX& GetNextVsOutput()
    {
        SWR_ASSERT(0, "%s", __FUNCTION__);
        static SIMDVERTEX junk;
        return junk;
    }

    // Not applicable to the tessellation PA; asserts and returns false.
    bool GetNextStreamOutput()
    {
        SWR_ASSERT(0, "%s", __FUNCTION__);
        return false;
    }

    // Not applicable to the tessellation PA; asserts and returns a dummy.
    SIMDMASK& GetNextVsIndices()
    {
        SWR_ASSERT(0, "%s", __FUNCTION__);
        static SIMDMASK junk;
        return junk;
    }

    // Number of prims in the current SIMD batch (remaining, capped at SIMD_WIDTH).
    uint32_t NumPrims()
    {
        return std::min<uint32_t>(m_numPrims, SIMD_WIDTH);
    }

    // Reset is not supported for this PA.
    void Reset() { SWR_ASSERT(0); };

    // Per-lane primitive IDs: broadcast startID plus the lane offsets held
    // in m_vPrimId.
    SIMDSCALARI GetPrimID(uint32_t startID)
    {
#if USE_SIMD16_FRONTEND
        return _simd16_add_epi32(_simd16_set1_epi32(startID), m_vPrimId);
#else
        return _simd_add_epi32(_simd_set1_epi32(startID), m_vPrimId);
#endif
    }

private:
    const SIMDSCALAR* m_pVertexData = nullptr;           // SoA DS output (not owned)
    uint32_t m_attributeStrideInVectors = 0;             // SIMD vectors per attribute component
    uint32_t m_numAttributes = 0;                        // attribute slots available
    uint32_t m_numPrims = 0;                             // prims left to assemble
    uint32_t* m_ppIndices[3];                            // per-vertex index streams (advance as prims are consumed)

    uint32_t m_numVertsPerPrim = 0;                      // 1, 2, or 3 depending on binTopology

    SIMDSCALARI m_vPrimId;                               // per-lane prim-id offsets added in GetPrimID
};
1459
1460 // Primitive Assembler factory class, responsible for creating and initializing the correct assembler
1461 // based on state.
template <typename IsIndexedT, typename IsCutIndexEnabledT>
struct PA_FACTORY
{
    /// @brief Selects and placement-constructs the appropriate primitive
    ///        assembler (cut-aware or optimized) for the given topology.
    ///
    /// The cut-aware PA is used for indexed draws with primitive-restart
    /// (cut index) enabled on the basic topologies, and for all adjacency
    /// topologies. Everything else uses the optimized PA.
    ///
    /// @param pDC      draw context, forwarded to the constructed PA.
    /// @param in_topo  topology of this draw.
    /// @param numVerts vertex count for the draw.
    PA_FACTORY(DRAW_CONTEXT* pDC, PRIMITIVE_TOPOLOGY in_topo, uint32_t numVerts) : topo(in_topo)
    {
#if KNOB_ENABLE_CUT_AWARE_PA == TRUE
        const API_STATE& state = GetApiState(pDC);
        if ((IsIndexedT::value && IsCutIndexEnabledT::value && (
            topo == TOP_TRIANGLE_STRIP || topo == TOP_POINT_LIST ||
            topo == TOP_LINE_LIST || topo == TOP_LINE_STRIP ||
            topo == TOP_TRIANGLE_LIST)) ||

            // non-indexed draws with adjacency topologies must use cut-aware PA until we add support
            // for them in the optimized PA
            (topo == TOP_LINE_LIST_ADJ || topo == TOP_LISTSTRIP_ADJ || topo == TOP_TRI_LIST_ADJ || topo == TOP_TRI_STRIP_ADJ))
        {
            memset(&indexStore, 0, sizeof(indexStore));
            uint32_t numAttribs = state.feNumAttributes;

            // construct the cut-aware PA in-place over this factory's storage
            new (&this->paCut) PA_STATE_CUT(pDC, (uint8_t*)&this->vertexStore[0], MAX_NUM_VERTS_PER_PRIM * PA_STATE::SIMD_WIDTH,
                &this->indexStore[0], numVerts, numAttribs, state.topology, false);
            cutPA = true;
        }
        else
#endif
        {
            // default path: optimized PA, constructed in-place
            uint32_t numPrims = GetNumPrims(in_topo, numVerts);
            new (&this->paOpt) PA_STATE_OPT(pDC, numPrims, (uint8_t*)&this->vertexStore[0], MAX_NUM_VERTS_PER_PRIM * PA_STATE::SIMD_WIDTH, false);
            cutPA = false;
        }

    }

    // Returns the PA chosen by the constructor, as the common base type.
    PA_STATE& GetPA()
    {
#if KNOB_ENABLE_CUT_AWARE_PA == TRUE
        if (cutPA)
        {
            return this->paCut;
        }
        else
#endif
        {
            return this->paOpt;
        }
    }

    PA_STATE_OPT paOpt;                 // optimized assembler (valid when !cutPA)
    PA_STATE_CUT paCut;                 // cut-aware assembler (valid when cutPA)
    bool cutPA{ false };                // which of the two was constructed

    PRIMITIVE_TOPOLOGY topo{ TOP_UNKNOWN };

    // backing storage handed to whichever PA is constructed above
    PA_STATE::SIMDVERTEX vertexStore[MAX_NUM_VERTS_PER_PRIM];
    PA_STATE::SIMDMASK indexStore[MAX_NUM_VERTS_PER_PRIM];
};