1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
25 * @brief Definitions for primitive assembly.
26 * N primitives are assembled at a time, where N is the SIMD width.
27 * A state machine, that is specific for a given topology, drives the
28 * assembly of vertices into triangles.
30 ******************************************************************************/
37 #if USE_SIMD16_FRONTEND
40 SIMD_WIDTH
= KNOB_SIMD16_WIDTH
,
41 SIMD_WIDTH_DIV2
= KNOB_SIMD16_WIDTH
/ 2,
45 typedef simd16mask SIMDMASK
;
47 typedef simd16scalar SIMDSCALAR
;
48 typedef simd16vector SIMDVECTOR
;
49 typedef simd16vertex SIMDVERTEX
;
51 typedef simd16scalari SIMDSCALARI
;
56 SIMD_WIDTH
= KNOB_SIMD_WIDTH
,
57 SIMD_WIDTH_DIV2
= KNOB_SIMD_WIDTH
/ 2,
61 typedef simdmask SIMDMASK
;
63 typedef simdscalar SIMDSCALAR
;
64 typedef simdvector SIMDVECTOR
;
65 typedef simdvertex SIMDVERTEX
;
67 typedef simdscalari SIMDSCALARI
;
70 DRAW_CONTEXT
*pDC
{ nullptr }; // draw context
71 uint8_t* pStreamBase
{ nullptr }; // vertex stream
72 uint32_t streamSizeInVerts
{ 0 }; // total size of the input stream in verts
73 uint32_t vertexStride
{ 0 }; // stride of a vertex in simdvector units
75 // The topology the binner will use. In some cases the FE changes the topology from the api state.
76 PRIMITIVE_TOPOLOGY binTopology
{ TOP_UNKNOWN
};
78 #if ENABLE_AVX512_SIMD16
79 bool useAlternateOffset
{ false };
82 bool viewportArrayActive
{ false };
83 bool rtArrayActive
{ false };
84 uint32_t numVertsPerPrim
{ 0 };
87 PA_STATE(DRAW_CONTEXT
*in_pDC
, uint8_t* in_pStreamBase
, uint32_t in_streamSizeInVerts
, uint32_t in_vertexStride
, uint32_t in_numVertsPerPrim
) :
88 pDC(in_pDC
), pStreamBase(in_pStreamBase
), streamSizeInVerts(in_streamSizeInVerts
), vertexStride(in_vertexStride
), numVertsPerPrim(in_numVertsPerPrim
) {}
90 virtual bool HasWork() = 0;
91 virtual simdvector
& GetSimdVector(uint32_t index
, uint32_t slot
) = 0;
92 #if ENABLE_AVX512_SIMD16
93 virtual simd16vector
& GetSimdVector_simd16(uint32_t index
, uint32_t slot
) = 0;
95 virtual bool Assemble(uint32_t slot
, simdvector verts
[]) = 0;
96 #if ENABLE_AVX512_SIMD16
97 virtual bool Assemble(uint32_t slot
, simd16vector verts
[]) = 0;
99 virtual void AssembleSingle(uint32_t slot
, uint32_t primIndex
, simd4scalar verts
[]) = 0;
100 virtual bool NextPrim() = 0;
101 virtual SIMDVERTEX
& GetNextVsOutput() = 0;
102 virtual bool GetNextStreamOutput() = 0;
103 virtual SIMDMASK
& GetNextVsIndices() = 0;
104 virtual uint32_t NumPrims() = 0;
105 virtual void Reset() = 0;
106 virtual SIMDSCALARI
GetPrimID(uint32_t startID
) = 0;
109 // The Optimized PA is a state machine that assembles triangles from vertex shader simd
110 // output. Here is the sequence
111 // 1. Execute FS/VS to generate a simd vertex (4 vertices for SSE simd and 8 for AVX simd).
112 // 2. Execute PA function to assemble and bin triangles.
113 // a. The PA function is a set of functions that collectively make up the
114 // state machine for a given topology.
115 // 1. We use a state index to track which PA function to call.
116 //          b. Often the PA function needs 2 simd vertices in order to assemble the next triangle.
117 // 1. We call this the current and previous simd vertex.
118 // 2. The SSE simd is 4-wide which is not a multiple of 3 needed for triangles. In
119 // order to assemble the second triangle, for a triangle list, we'll need the
120 // last vertex from the previous simd and the first 2 vertices from the current simd.
121 // 3. At times the PA can assemble multiple triangles from the 2 simd vertices.
123 // This optimized PA is not cut aware, so only should be used by non-indexed draws or draws without
125 struct PA_STATE_OPT
: public PA_STATE
127 uint32_t numPrims
{ 0 }; // Total number of primitives for draw.
128 uint32_t numPrimsComplete
{ 0 }; // Total number of complete primitives.
130 uint32_t numSimdPrims
{ 0 }; // Number of prims in current simd.
132 uint32_t cur
{ 0 }; // index to current VS output.
133 uint32_t prev
{ 0 }; // index to prev VS output. Not really needed in the state.
134 const uint32_t first
{ 0 }; // index to first VS output. Used for tri fan and line loop.
136 uint32_t counter
{ 0 }; // state counter
137 bool reset
{ false }; // reset state
139 uint32_t primIDIncr
{ 0 }; // how much to increment for each vector (typically vector / {1, 2})
142 typedef bool(*PFN_PA_FUNC
)(PA_STATE_OPT
& pa
, uint32_t slot
, simdvector verts
[]);
143 #if ENABLE_AVX512_SIMD16
144 typedef bool(*PFN_PA_FUNC_SIMD16
)(PA_STATE_OPT
& pa
, uint32_t slot
, simd16vector verts
[]);
146 typedef void(*PFN_PA_SINGLE_FUNC
)(PA_STATE_OPT
& pa
, uint32_t slot
, uint32_t primIndex
, simd4scalar verts
[]);
148 PFN_PA_FUNC pfnPaFunc
{ nullptr }; // PA state machine function for assembling 4 triangles.
149 #if ENABLE_AVX512_SIMD16
150 PFN_PA_FUNC_SIMD16 pfnPaFunc_simd16
{ nullptr };
152 PFN_PA_SINGLE_FUNC pfnPaSingleFunc
{ nullptr }; // PA state machine function for assembling single triangle.
153 PFN_PA_FUNC pfnPaFuncReset
{ nullptr }; // initial state to set on reset
154 #if ENABLE_AVX512_SIMD16
155 PFN_PA_FUNC_SIMD16 pfnPaFuncReset_simd16
{ nullptr };
158 // state used to advance the PA when Next is called
159 PFN_PA_FUNC pfnPaNextFunc
{ nullptr };
160 #if ENABLE_AVX512_SIMD16
161 PFN_PA_FUNC_SIMD16 pfnPaNextFunc_simd16
{ nullptr };
163 uint32_t nextNumSimdPrims
{ 0 };
164 uint32_t nextNumPrimsIncrement
{ 0 };
165 bool nextReset
{ false };
166 bool isStreaming
{ false };
168 SIMDMASK junkIndices
{ 0 }; // temporary index store for unused virtual function
171 PA_STATE_OPT(DRAW_CONTEXT
* pDC
, uint32_t numPrims
, uint8_t* pStream
, uint32_t streamSizeInVerts
,
172 uint32_t vertexStride
, bool in_isStreaming
, uint32_t numVertsPerPrim
, PRIMITIVE_TOPOLOGY topo
= TOP_UNKNOWN
);
176 return (this->numPrimsComplete
< this->numPrims
) ? true : false;
179 simdvector
& GetSimdVector(uint32_t index
, uint32_t slot
)
181 SWR_ASSERT(slot
< vertexStride
);
182 uint32_t offset
= index
* vertexStride
+ slot
;
183 simdvector
& vertexSlot
= ((simdvector
*)pStreamBase
)[offset
];
187 #if ENABLE_AVX512_SIMD16
188 simd16vector
& GetSimdVector_simd16(uint32_t index
, uint32_t slot
)
190 SWR_ASSERT(slot
< vertexStride
);
191 uint32_t offset
= index
* vertexStride
+ slot
;
192 simd16vector
& vertexSlot
= ((simd16vector
*)pStreamBase
)[offset
];
197 // Assembles 4 triangles. Each simdvector is a single vertex from 4
198 // triangles (xxxx yyyy zzzz wwww) and there are 3 verts per triangle.
199 bool Assemble(uint32_t slot
, simdvector verts
[])
201 return this->pfnPaFunc(*this, slot
, verts
);
204 #if ENABLE_AVX512_SIMD16
205 bool Assemble(uint32_t slot
, simd16vector verts
[])
207 return this->pfnPaFunc_simd16(*this, slot
, verts
);
211 // Assembles 1 primitive. Each simdscalar is a vertex (xyzw).
212 void AssembleSingle(uint32_t slot
, uint32_t primIndex
, simd4scalar verts
[])
214 return this->pfnPaSingleFunc(*this, slot
, primIndex
, verts
);
219 this->pfnPaFunc
= this->pfnPaNextFunc
;
220 #if ENABLE_AVX512_SIMD16
221 this->pfnPaFunc_simd16
= this->pfnPaNextFunc_simd16
;
223 this->numSimdPrims
= this->nextNumSimdPrims
;
224 this->numPrimsComplete
+= this->nextNumPrimsIncrement
;
225 this->reset
= this->nextReset
;
227 if (this->isStreaming
)
232 bool morePrims
= false;
234 if (this->numSimdPrims
> 0)
237 this->numSimdPrims
--;
241 this->counter
= (this->reset
) ? 0 : (this->counter
+ 1);
247 morePrims
= false; // no more to do
253 SIMDVERTEX
& GetNextVsOutput()
255 const uint32_t numSimdVerts
= streamSizeInVerts
/ SIMD_WIDTH
;
257 // increment cur and prev indices
258 if (counter
< numSimdVerts
)
260 // prev undefined for first state
266 // swap/recycle last two simd verts for prev and cur, leave other simd verts intact in the buffer
267 uint32_t temp
= prev
;
273 SWR_ASSERT(cur
< numSimdVerts
);
274 SIMDVECTOR
* pVertex
= &((SIMDVECTOR
*)pStreamBase
)[cur
* vertexStride
];
276 return *(SIMDVERTEX
*)pVertex
;
279 SIMDMASK
& GetNextVsIndices()
281 // unused in optimized PA, pass tmp buffer back
285 bool GetNextStreamOutput()
287 this->prev
= this->cur
;
288 this->cur
= this->counter
;
295 return (this->numPrimsComplete
+ this->nextNumPrimsIncrement
> this->numPrims
) ?
296 (SIMD_WIDTH
- (this->numPrimsComplete
+ this->nextNumPrimsIncrement
- this->numPrims
)) : SIMD_WIDTH
;
299 void SetNextState(PA_STATE_OPT::PFN_PA_FUNC pfnPaNextFunc
,
300 PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc
,
301 uint32_t numSimdPrims
= 0,
302 uint32_t numPrimsIncrement
= 0,
305 this->pfnPaNextFunc
= pfnPaNextFunc
;
306 this->nextNumSimdPrims
= numSimdPrims
;
307 this->nextNumPrimsIncrement
= numPrimsIncrement
;
308 this->nextReset
= reset
;
310 this->pfnPaSingleFunc
= pfnPaNextSingleFunc
;
313 #if ENABLE_AVX512_SIMD16
314 void SetNextState_simd16(PA_STATE_OPT::PFN_PA_FUNC_SIMD16 pfnPaNextFunc_simd16
,
315 PA_STATE_OPT::PFN_PA_FUNC pfnPaNextFunc
,
316 PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc
,
317 uint32_t numSimdPrims
= 0,
318 uint32_t numPrimsIncrement
= 0,
321 this->pfnPaNextFunc_simd16
= pfnPaNextFunc_simd16
;
322 this->pfnPaNextFunc
= pfnPaNextFunc
;
323 this->nextNumSimdPrims
= numSimdPrims
;
324 this->nextNumPrimsIncrement
= numPrimsIncrement
;
325 this->nextReset
= reset
;
327 this->pfnPaSingleFunc
= pfnPaNextSingleFunc
;
333 #if ENABLE_AVX512_SIMD16
334 useAlternateOffset
= false;
337 this->pfnPaFunc
= this->pfnPaFuncReset
;
338 #if ENABLE_AVX512_SIMD16
339 this->pfnPaFunc_simd16
= this->pfnPaFuncReset_simd16
;
341 this->numPrimsComplete
= 0;
342 this->numSimdPrims
= 0;
349 SIMDSCALARI
GetPrimID(uint32_t startID
)
351 #if USE_SIMD16_FRONTEND
352 return _simd16_add_epi32(this->primID
,
353 _simd16_set1_epi32(startID
+ this->primIDIncr
* (this->numPrimsComplete
/ SIMD_WIDTH
)));
355 return _simd_add_epi32(this->primID
,
356 _simd_set1_epi32(startID
+ this->primIDIncr
* (this->numPrimsComplete
/ SIMD_WIDTH
)));
361 // helper C wrappers to avoid having to rewrite all the PA topology state functions
362 INLINE
void SetNextPaState(PA_STATE_OPT
& pa
, PA_STATE_OPT::PFN_PA_FUNC pfnPaNextFunc
,
363 PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc
,
364 uint32_t numSimdPrims
= 0,
365 uint32_t numPrimsIncrement
= 0,
368 return pa
.SetNextState(pfnPaNextFunc
, pfnPaNextSingleFunc
, numSimdPrims
, numPrimsIncrement
, reset
);
371 #if ENABLE_AVX512_SIMD16
372 INLINE
void SetNextPaState_simd16(PA_STATE_OPT
& pa
, PA_STATE_OPT::PFN_PA_FUNC_SIMD16 pfnPaNextFunc_simd16
,
373 PA_STATE_OPT::PFN_PA_FUNC pfnPaNextFunc
,
374 PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc
,
375 uint32_t numSimdPrims
= 0,
376 uint32_t numPrimsIncrement
= 0,
379 return pa
.SetNextState_simd16(pfnPaNextFunc_simd16
, pfnPaNextFunc
, pfnPaNextSingleFunc
, numSimdPrims
, numPrimsIncrement
, reset
);
383 INLINE simdvector
& PaGetSimdVector(PA_STATE
& pa
, uint32_t index
, uint32_t slot
)
385 return pa
.GetSimdVector(index
, slot
);
388 #if ENABLE_AVX512_SIMD16
389 INLINE simd16vector
& PaGetSimdVector_simd16(PA_STATE
& pa
, uint32_t index
, uint32_t slot
)
391 return pa
.GetSimdVector_simd16(index
, slot
);
395 // Cut-aware primitive assembler.
396 struct PA_STATE_CUT
: public PA_STATE
398 SIMDMASK
* pCutIndices
{ nullptr }; // cut indices buffer, 1 bit per vertex
399 uint32_t numVerts
{ 0 }; // number of vertices available in buffer store
400 uint32_t numAttribs
{ 0 }; // number of attributes
401 int32_t numRemainingVerts
{ 0 }; // number of verts remaining to be assembled
402 uint32_t numVertsToAssemble
{ 0 }; // total number of verts to assemble for the draw
403 #if ENABLE_AVX512_SIMD16
404 OSALIGNSIMD16(uint32_t) indices
[MAX_NUM_VERTS_PER_PRIM
][SIMD_WIDTH
]; // current index buffer for gather
406 OSALIGNSIMD(uint32_t) indices
[MAX_NUM_VERTS_PER_PRIM
][SIMD_WIDTH
]; // current index buffer for gather
408 SIMDSCALARI vOffsets
[MAX_NUM_VERTS_PER_PRIM
]; // byte offsets for currently assembling simd
409 uint32_t numPrimsAssembled
{ 0 }; // number of primitives that are fully assembled
410 uint32_t headVertex
{ 0 }; // current unused vertex slot in vertex buffer store
411 uint32_t tailVertex
{ 0 }; // beginning vertex currently assembling
412 uint32_t curVertex
{ 0 }; // current unprocessed vertex
413 uint32_t startPrimId
{ 0 }; // starting prim id
414 SIMDSCALARI vPrimId
; // vector of prim ID
415 bool needOffsets
{ false }; // need to compute gather offsets for current SIMD
416 uint32_t vertsPerPrim
{ 0 };
417 bool processCutVerts
{ false }; // vertex indices with cuts should be processed as normal, otherwise they
418 // are ignored. Fetch shader sends invalid verts on cuts that should be ignored
419 // while the GS sends valid verts for every index
421 simdvector junkVector
; // junk simdvector for unimplemented API
422 #if ENABLE_AVX512_SIMD16
423 simd16vector junkVector_simd16
; // junk simd16vector for unimplemented API
426 // Topology state tracking
427 uint32_t vert
[MAX_NUM_VERTS_PER_PRIM
];
428 uint32_t curIndex
{ 0 };
429 bool reverseWinding
{ false }; // indicates reverse winding for strips
430 int32_t adjExtraVert
{ 0 }; // extra vert uses for tristrip w/ adj
432 typedef void(PA_STATE_CUT::* PFN_PA_FUNC
)(uint32_t vert
, bool finish
);
433 PFN_PA_FUNC pfnPa
{ nullptr }; // per-topology function that processes a single vert
436 PA_STATE_CUT(DRAW_CONTEXT
* pDC
, uint8_t* in_pStream
, uint32_t in_streamSizeInVerts
, uint32_t in_vertexStride
, SIMDMASK
* in_pIndices
, uint32_t in_numVerts
,
437 uint32_t in_numAttribs
, PRIMITIVE_TOPOLOGY topo
, bool in_processCutVerts
, uint32_t in_numVertsPerPrim
)
438 : PA_STATE(pDC
, in_pStream
, in_streamSizeInVerts
, in_vertexStride
, in_numVertsPerPrim
)
440 numVerts
= in_streamSizeInVerts
;
441 numAttribs
= in_numAttribs
;
444 processCutVerts
= in_processCutVerts
;
446 numVertsToAssemble
= numRemainingVerts
= in_numVerts
;
447 numPrimsAssembled
= 0;
448 headVertex
= tailVertex
= curVertex
= 0;
451 pCutIndices
= in_pIndices
;
452 memset(indices
, 0, sizeof(indices
));
453 #if USE_SIMD16_FRONTEND
454 vPrimId
= _simd16_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
456 vPrimId
= _simd_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
458 reverseWinding
= false;
461 bool gsEnabled
= pDC
->pState
->state
.gsState
.gsEnable
;
462 vertsPerPrim
= NumVertsPerPrim(topo
, gsEnabled
);
466 case TOP_TRIANGLE_LIST
: pfnPa
= &PA_STATE_CUT::ProcessVertTriList
; break;
467 case TOP_TRI_LIST_ADJ
: pfnPa
= gsEnabled
? &PA_STATE_CUT::ProcessVertTriListAdj
: &PA_STATE_CUT::ProcessVertTriListAdjNoGs
; break;
468 case TOP_TRIANGLE_STRIP
: pfnPa
= &PA_STATE_CUT::ProcessVertTriStrip
; break;
469 case TOP_TRI_STRIP_ADJ
: if (gsEnabled
)
471 pfnPa
= &PA_STATE_CUT::ProcessVertTriStripAdj
< true > ;
475 pfnPa
= &PA_STATE_CUT::ProcessVertTriStripAdj
< false > ;
479 case TOP_POINT_LIST
: pfnPa
= &PA_STATE_CUT::ProcessVertPointList
; break;
480 case TOP_LINE_LIST
: pfnPa
= &PA_STATE_CUT::ProcessVertLineList
; break;
481 case TOP_LINE_LIST_ADJ
: pfnPa
= gsEnabled
? &PA_STATE_CUT::ProcessVertLineListAdj
: &PA_STATE_CUT::ProcessVertLineListAdjNoGs
; break;
482 case TOP_LINE_STRIP
: pfnPa
= &PA_STATE_CUT::ProcessVertLineStrip
; break;
483 case TOP_LISTSTRIP_ADJ
: pfnPa
= gsEnabled
? &PA_STATE_CUT::ProcessVertLineStripAdj
: &PA_STATE_CUT::ProcessVertLineStripAdjNoGs
; break;
484 default: assert(0 && "Unimplemented topology");
488 SIMDVERTEX
& GetNextVsOutput()
490 uint32_t vertexIndex
= this->headVertex
/ SIMD_WIDTH
;
491 this->headVertex
= (this->headVertex
+ SIMD_WIDTH
) % this->numVerts
;
492 this->needOffsets
= true;
493 SIMDVECTOR
* pVertex
= &((SIMDVECTOR
*)pStreamBase
)[vertexIndex
* vertexStride
];
495 return *(SIMDVERTEX
*)pVertex
;
498 SIMDMASK
& GetNextVsIndices()
500 uint32_t vertexIndex
= this->headVertex
/ SIMD_WIDTH
;
501 SIMDMASK
* pCurCutIndex
= this->pCutIndices
+ vertexIndex
;
502 return *pCurCutIndex
;
505 simdvector
& GetSimdVector(uint32_t index
, uint32_t slot
)
508 SWR_ASSERT(0 && "Not implemented");
512 #if ENABLE_AVX512_SIMD16
513 simd16vector
& GetSimdVector_simd16(uint32_t index
, uint32_t slot
)
516 SWR_ASSERT(0 && "Not implemented");
517 return junkVector_simd16
;
521 bool GetNextStreamOutput()
523 this->headVertex
+= SIMD_WIDTH
;
524 this->needOffsets
= true;
528 SIMDSCALARI
GetPrimID(uint32_t startID
)
530 #if USE_SIMD16_FRONTEND
531 return _simd16_add_epi32(_simd16_set1_epi32(startID
), this->vPrimId
);
533 return _simd_add_epi32(_simd_set1_epi32(startID
), this->vPrimId
);
539 #if ENABLE_AVX512_SIMD16
540 useAlternateOffset
= false;
543 this->numRemainingVerts
= this->numVertsToAssemble
;
544 this->numPrimsAssembled
= 0;
547 this->tailVertex
= 0;
548 this->headVertex
= 0;
549 this->reverseWinding
= false;
550 this->adjExtraVert
= -1;
551 #if USE_SIMD16_FRONTEND
552 this->vPrimId
= _simd16_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
554 this->vPrimId
= _simd_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
560 return this->numRemainingVerts
> 0 || this->adjExtraVert
!= -1;
563 bool IsVertexStoreFull()
565 return ((this->headVertex
+ SIMD_WIDTH
) % this->numVerts
) == this->tailVertex
;
568 void RestartTopology()
571 this->reverseWinding
= false;
572 this->adjExtraVert
= -1;
575 bool IsCutIndex(uint32_t vertex
)
577 uint32_t vertexIndex
= vertex
/ SIMD_WIDTH
;
578 uint32_t vertexOffset
= vertex
& (SIMD_WIDTH
- 1);
579 return CheckBit(this->pCutIndices
[vertexIndex
], vertexOffset
);
582 // iterates across the unprocessed verts until we hit the end or we
583 // have assembled SIMD prims
586 while (this->numPrimsAssembled
!= SIMD_WIDTH
&&
587 this->numRemainingVerts
> 0 &&
588 this->curVertex
!= this->headVertex
)
590 // if cut index, restart topology
591 if (IsCutIndex(this->curVertex
))
593 if (this->processCutVerts
)
595 (this->*pfnPa
)(this->curVertex
, false);
597 // finish off tri strip w/ adj before restarting topo
598 if (this->adjExtraVert
!= -1)
600 (this->*pfnPa
)(this->curVertex
, true);
606 (this->*pfnPa
)(this->curVertex
, false);
610 if (this->curVertex
>= this->numVerts
) {
613 this->numRemainingVerts
--;
616 // special case last primitive for tri strip w/ adj
617 if (this->numPrimsAssembled
!= SIMD_WIDTH
&& this->numRemainingVerts
== 0 && this->adjExtraVert
!= -1)
619 (this->*pfnPa
)(this->curVertex
, true);
625 // done with current batch
626 // advance tail to the current unsubmitted vertex
627 this->tailVertex
= this->curVertex
;
628 this->numPrimsAssembled
= 0;
629 #if USE_SIMD16_FRONTEND
630 this->vPrimId
= _simd16_add_epi32(vPrimId
, _simd16_set1_epi32(SIMD_WIDTH
));
632 this->vPrimId
= _simd_add_epi32(vPrimId
, _simd_set1_epi32(SIMD_WIDTH
));
638 // if we've assembled enough prims, we can advance to the next set of verts
639 if (this->numPrimsAssembled
== SIMD_WIDTH
|| this->numRemainingVerts
<= 0)
646 void ComputeOffsets()
648 for (uint32_t v
= 0; v
< this->vertsPerPrim
; ++v
)
650 uint32_t vertexStrideBytes
= vertexStride
* sizeof(SIMDVECTOR
);
651 SIMDSCALARI vIndices
= *(SIMDSCALARI
*)&this->indices
[v
][0];
653 // step to simdvertex batch
654 const uint32_t simdShift
= SIMD_WIDTH_LOG2
;
655 #if USE_SIMD16_FRONTEND
656 SIMDSCALARI vVertexBatch
= _simd16_srai_epi32(vIndices
, simdShift
);
657 this->vOffsets
[v
] = _simd16_mullo_epi32(vVertexBatch
, _simd16_set1_epi32(vertexStrideBytes
));
659 SIMDSCALARI vVertexBatch
= _simd_srai_epi32(vIndices
, simdShift
);
660 this->vOffsets
[v
] = _simd_mullo_epi32(vVertexBatch
, _simd_set1_epi32(vertexStrideBytes
));
664 const uint32_t simdMask
= SIMD_WIDTH
- 1;
665 #if USE_SIMD16_FRONTEND
666 SIMDSCALARI vVertexIndex
= _simd16_and_si(vIndices
, _simd16_set1_epi32(simdMask
));
667 this->vOffsets
[v
] = _simd16_add_epi32(this->vOffsets
[v
], _simd16_mullo_epi32(vVertexIndex
, _simd16_set1_epi32(sizeof(float))));
669 SIMDSCALARI vVertexIndex
= _simd_and_si(vIndices
, _simd_set1_epi32(simdMask
));
670 this->vOffsets
[v
] = _simd_add_epi32(this->vOffsets
[v
], _simd_mullo_epi32(vVertexIndex
, _simd_set1_epi32(sizeof(float))));
675 bool Assemble(uint32_t slot
, simdvector
*verts
)
677 // process any outstanding verts
680 // return false if we don't have enough prims assembled
681 if (this->numPrimsAssembled
!= SIMD_WIDTH
&& this->numRemainingVerts
> 0)
686 // cache off gather offsets given the current SIMD set of indices the first time we get an assemble
687 if (this->needOffsets
)
690 this->needOffsets
= false;
693 for (uint32_t v
= 0; v
< this->vertsPerPrim
; ++v
)
695 SIMDSCALARI offsets
= this->vOffsets
[v
];
698 #if USE_SIMD16_FRONTEND
699 offsets
= _simd16_add_epi32(offsets
, _simd16_set1_epi32(slot
* sizeof(SIMDVECTOR
)));
701 offsets
= _simd_add_epi32(offsets
, _simd_set1_epi32(slot
* sizeof(SIMDVECTOR
)));
704 float* pBase
= (float*)this->pStreamBase
;
705 for (uint32_t c
= 0; c
< 4; ++c
)
707 #if USE_SIMD16_FRONTEND
708 simd16scalar temp
= _simd16_i32gather_ps(pBase
, offsets
, 1);
710 // Assigning to a temporary first to avoid an MSVC 2017 compiler bug
711 simdscalar t
= useAlternateOffset
? _simd16_extract_ps(temp
, 1) : _simd16_extract_ps(temp
, 0);
714 verts
[v
].v
[c
] = _simd_i32gather_ps(pBase
, offsets
, 1);
717 // move base to next component
725 #if ENABLE_AVX512_SIMD16
726 bool Assemble(uint32_t slot
, simd16vector verts
[])
728 // process any outstanding verts
731 // return false if we don't have enough prims assembled
732 if (this->numPrimsAssembled
!= SIMD_WIDTH
&& this->numRemainingVerts
> 0)
737 // cache off gather offsets given the current SIMD set of indices the first time we get an assemble
738 if (this->needOffsets
)
741 this->needOffsets
= false;
744 for (uint32_t v
= 0; v
< this->vertsPerPrim
; ++v
)
746 SIMDSCALARI offsets
= this->vOffsets
[v
];
749 #if USE_SIMD16_FRONTEND
750 offsets
= _simd16_add_epi32(offsets
, _simd16_set1_epi32(slot
* sizeof(SIMDVECTOR
)));
752 offsets
= _simd_add_epi32(offsets
, _simd_set1_epi32(slot
* sizeof(simdvector
)));
755 float* pBase
= (float*)this->pStreamBase
;
756 for (uint32_t c
= 0; c
< 4; ++c
)
758 #if USE_SIMD16_FRONTEND
759 verts
[v
].v
[c
] = _simd16_i32gather_ps(pBase
, offsets
, 1);
761 verts
[v
].v
[c
] = _simd16_insert_ps(_simd16_setzero_ps(), _simd_i32gather_ps(pBase
, offsets
, 1), 0);
764 // move base to next component
773 void AssembleSingle(uint32_t slot
, uint32_t triIndex
, simd4scalar tri
[3])
776 for (uint32_t v
= 0; v
< this->vertsPerPrim
; ++v
)
778 uint32_t* pOffset
= (uint32_t*)&this->vOffsets
[v
];
779 #if USE_SIMD16_FRONTEND
780 uint32_t offset
= useAlternateOffset
? pOffset
[triIndex
+ SIMD_WIDTH_DIV2
] : pOffset
[triIndex
];
782 uint32_t offset
= pOffset
[triIndex
];
784 offset
+= sizeof(SIMDVECTOR
) * slot
;
785 float* pVert
= (float*)&tri
[v
];
786 for (uint32_t c
= 0; c
< 4; ++c
)
788 float* pComponent
= (float*)(this->pStreamBase
+ offset
);
789 pVert
[c
] = *pComponent
;
790 offset
+= SIMD_WIDTH
* sizeof(float);
797 return this->numPrimsAssembled
;
800 // Per-topology functions
801 void ProcessVertTriStrip(uint32_t index
, bool finish
)
803 this->vert
[this->curIndex
] = index
;
805 if (this->curIndex
== 3)
807 // assembled enough verts for prim, add to gather indices
808 this->indices
[0][this->numPrimsAssembled
] = this->vert
[0];
811 this->indices
[1][this->numPrimsAssembled
] = this->vert
[2];
812 this->indices
[2][this->numPrimsAssembled
] = this->vert
[1];
816 this->indices
[1][this->numPrimsAssembled
] = this->vert
[1];
817 this->indices
[2][this->numPrimsAssembled
] = this->vert
[2];
820 // increment numPrimsAssembled
821 this->numPrimsAssembled
++;
823 // set up next prim state
824 this->vert
[0] = this->vert
[1];
825 this->vert
[1] = this->vert
[2];
827 this->reverseWinding
^= 1;
831 template<bool gsEnabled
>
832 void AssembleTriStripAdj()
836 this->vert
[1] = this->vert
[2];
837 this->vert
[2] = this->vert
[4];
839 this->indices
[0][this->numPrimsAssembled
] = this->vert
[0];
840 this->indices
[1][this->numPrimsAssembled
] = this->vert
[1];
841 this->indices
[2][this->numPrimsAssembled
] = this->vert
[2];
843 this->vert
[4] = this->vert
[2];
844 this->vert
[2] = this->vert
[1];
848 this->indices
[0][this->numPrimsAssembled
] = this->vert
[0];
849 this->indices
[1][this->numPrimsAssembled
] = this->vert
[1];
850 this->indices
[2][this->numPrimsAssembled
] = this->vert
[2];
851 this->indices
[3][this->numPrimsAssembled
] = this->vert
[3];
852 this->indices
[4][this->numPrimsAssembled
] = this->vert
[4];
853 this->indices
[5][this->numPrimsAssembled
] = this->vert
[5];
855 this->numPrimsAssembled
++;
859 template<bool gsEnabled
>
860 void ProcessVertTriStripAdj(uint32_t index
, bool finish
)
862 // handle last primitive of tristrip
863 if (finish
&& this->adjExtraVert
!= -1)
865 this->vert
[3] = this->adjExtraVert
;
866 AssembleTriStripAdj
<gsEnabled
>();
867 this->adjExtraVert
= -1;
871 switch (this->curIndex
)
877 this->vert
[this->curIndex
] = index
;
881 this->vert
[5] = index
;
885 if (this->adjExtraVert
== -1)
887 this->adjExtraVert
= index
;
891 this->vert
[3] = index
;
894 AssembleTriStripAdj
<gsEnabled
>();
897 if (this->reverseWinding
)
899 nextTri
[0] = this->vert
[4];
900 nextTri
[1] = this->vert
[0];
901 nextTri
[2] = this->vert
[2];
902 nextTri
[4] = this->vert
[3];
903 nextTri
[5] = this->adjExtraVert
;
907 nextTri
[0] = this->vert
[2];
908 nextTri
[1] = this->adjExtraVert
;
909 nextTri
[2] = this->vert
[3];
910 nextTri
[4] = this->vert
[4];
911 nextTri
[5] = this->vert
[0];
913 for (uint32_t i
= 0; i
< 6; ++i
)
915 this->vert
[i
] = nextTri
[i
];
918 this->adjExtraVert
= -1;
919 this->reverseWinding
^= 1;
928 SWR_ASSERT(this->adjExtraVert
!= -1, "Algorith failure!");
929 AssembleTriStripAdj
<gsEnabled
>();
932 if (this->reverseWinding
)
934 nextTri
[0] = this->vert
[4];
935 nextTri
[1] = this->vert
[0];
936 nextTri
[2] = this->vert
[2];
937 nextTri
[4] = this->vert
[3];
938 nextTri
[5] = this->adjExtraVert
;
942 nextTri
[0] = this->vert
[2];
943 nextTri
[1] = this->adjExtraVert
;
944 nextTri
[2] = this->vert
[3];
945 nextTri
[4] = this->vert
[4];
946 nextTri
[5] = this->vert
[0];
948 for (uint32_t i
= 0; i
< 6; ++i
)
950 this->vert
[i
] = nextTri
[i
];
952 this->reverseWinding
^= 1;
953 this->adjExtraVert
= index
;
959 void ProcessVertTriList(uint32_t index
, bool finish
)
961 this->vert
[this->curIndex
] = index
;
963 if (this->curIndex
== 3)
965 // assembled enough verts for prim, add to gather indices
966 this->indices
[0][this->numPrimsAssembled
] = this->vert
[0];
967 this->indices
[1][this->numPrimsAssembled
] = this->vert
[1];
968 this->indices
[2][this->numPrimsAssembled
] = this->vert
[2];
970 // increment numPrimsAssembled
971 this->numPrimsAssembled
++;
973 // set up next prim state
978 void ProcessVertTriListAdj(uint32_t index
, bool finish
)
980 this->vert
[this->curIndex
] = index
;
982 if (this->curIndex
== 6)
984 // assembled enough verts for prim, add to gather indices
985 this->indices
[0][this->numPrimsAssembled
] = this->vert
[0];
986 this->indices
[1][this->numPrimsAssembled
] = this->vert
[1];
987 this->indices
[2][this->numPrimsAssembled
] = this->vert
[2];
988 this->indices
[3][this->numPrimsAssembled
] = this->vert
[3];
989 this->indices
[4][this->numPrimsAssembled
] = this->vert
[4];
990 this->indices
[5][this->numPrimsAssembled
] = this->vert
[5];
992 // increment numPrimsAssembled
993 this->numPrimsAssembled
++;
995 // set up next prim state
1000 void ProcessVertTriListAdjNoGs(uint32_t index
, bool finish
)
1002 this->vert
[this->curIndex
] = index
;
1004 if (this->curIndex
== 6)
1006 // assembled enough verts for prim, add to gather indices
1007 this->indices
[0][this->numPrimsAssembled
] = this->vert
[0];
1008 this->indices
[1][this->numPrimsAssembled
] = this->vert
[2];
1009 this->indices
[2][this->numPrimsAssembled
] = this->vert
[4];
1011 // increment numPrimsAssembled
1012 this->numPrimsAssembled
++;
1014 // set up next prim state
1020 void ProcessVertLineList(uint32_t index
, bool finish
)
1022 this->vert
[this->curIndex
] = index
;
1024 if (this->curIndex
== 2)
1026 this->indices
[0][this->numPrimsAssembled
] = this->vert
[0];
1027 this->indices
[1][this->numPrimsAssembled
] = this->vert
[1];
1029 this->numPrimsAssembled
++;
1034 void ProcessVertLineStrip(uint32_t index
, bool finish
)
1036 this->vert
[this->curIndex
] = index
;
1038 if (this->curIndex
== 2)
1040 // assembled enough verts for prim, add to gather indices
1041 this->indices
[0][this->numPrimsAssembled
] = this->vert
[0];
1042 this->indices
[1][this->numPrimsAssembled
] = this->vert
[1];
1044 // increment numPrimsAssembled
1045 this->numPrimsAssembled
++;
1047 // set up next prim state
1048 this->vert
[0] = this->vert
[1];
1053 void ProcessVertLineStripAdj(uint32_t index
, bool finish
)
1055 this->vert
[this->curIndex
] = index
;
1057 if (this->curIndex
== 4)
1059 // assembled enough verts for prim, add to gather indices
1060 this->indices
[0][this->numPrimsAssembled
] = this->vert
[0];
1061 this->indices
[1][this->numPrimsAssembled
] = this->vert
[1];
1062 this->indices
[2][this->numPrimsAssembled
] = this->vert
[2];
1063 this->indices
[3][this->numPrimsAssembled
] = this->vert
[3];
1065 // increment numPrimsAssembled
1066 this->numPrimsAssembled
++;
1068 // set up next prim state
1069 this->vert
[0] = this->vert
[1];
1070 this->vert
[1] = this->vert
[2];
1071 this->vert
[2] = this->vert
[3];
1076 void ProcessVertLineStripAdjNoGs(uint32_t index
, bool finish
)
1078 this->vert
[this->curIndex
] = index
;
1080 if (this->curIndex
== 4)
1082 // assembled enough verts for prim, add to gather indices
1083 this->indices
[0][this->numPrimsAssembled
] = this->vert
[1];
1084 this->indices
[1][this->numPrimsAssembled
] = this->vert
[2];
1086 // increment numPrimsAssembled
1087 this->numPrimsAssembled
++;
1089 // set up next prim state
1090 this->vert
[0] = this->vert
[1];
1091 this->vert
[1] = this->vert
[2];
1092 this->vert
[2] = this->vert
[3];
1097 void ProcessVertLineListAdj(uint32_t index
, bool finish
)
1099 this->vert
[this->curIndex
] = index
;
1101 if (this->curIndex
== 4)
1103 this->indices
[0][this->numPrimsAssembled
] = this->vert
[0];
1104 this->indices
[1][this->numPrimsAssembled
] = this->vert
[1];
1105 this->indices
[2][this->numPrimsAssembled
] = this->vert
[2];
1106 this->indices
[3][this->numPrimsAssembled
] = this->vert
[3];
1108 this->numPrimsAssembled
++;
1113 void ProcessVertLineListAdjNoGs(uint32_t index
, bool finish
)
1115 this->vert
[this->curIndex
] = index
;
1117 if (this->curIndex
== 4)
1119 this->indices
[0][this->numPrimsAssembled
] = this->vert
[1];
1120 this->indices
[1][this->numPrimsAssembled
] = this->vert
[2];
1122 this->numPrimsAssembled
++;
1127 void ProcessVertPointList(uint32_t index
, bool finish
)
1129 this->vert
[this->curIndex
] = index
;
1131 if (this->curIndex
== 1)
1133 this->indices
[0][this->numPrimsAssembled
] = this->vert
[0];
1134 this->numPrimsAssembled
++;
1140 // Primitive Assembly for data output from the DomainShader.
1141 struct PA_TESS
: PA_STATE
1144 DRAW_CONTEXT
*in_pDC
,
1145 const SIMDSCALAR
* in_pVertData
,
1146 uint32_t in_attributeStrideInVectors
,
1147 uint32_t in_vertexStride
,
1148 uint32_t in_numAttributes
,
1149 uint32_t* (&in_ppIndices
)[3],
1150 uint32_t in_numPrims
,
1151 PRIMITIVE_TOPOLOGY in_binTopology
,
1152 uint32_t numVertsPerPrim
) :
1154 PA_STATE(in_pDC
, nullptr, 0, in_vertexStride
, numVertsPerPrim
),
1155 m_pVertexData(in_pVertData
),
1156 m_attributeStrideInVectors(in_attributeStrideInVectors
),
1157 m_numAttributes(in_numAttributes
),
1158 m_numPrims(in_numPrims
)
1160 #if USE_SIMD16_FRONTEND
1161 m_vPrimId
= _simd16_setzero_si();
1163 m_vPrimId
= _simd_setzero_si();
1165 binTopology
= in_binTopology
;
1166 m_ppIndices
[0] = in_ppIndices
[0];
1167 m_ppIndices
[1] = in_ppIndices
[1];
1168 m_ppIndices
[2] = in_ppIndices
[2];
1170 switch (binTopology
)
1172 case TOP_POINT_LIST
:
1173 m_numVertsPerPrim
= 1;
1177 m_numVertsPerPrim
= 2;
1180 case TOP_TRIANGLE_LIST
:
1181 m_numVertsPerPrim
= 3;
1185 SWR_INVALID("Invalid binTopology (%d) for %s", binTopology
, __FUNCTION__
);
1192 return m_numPrims
!= 0;
1195 simdvector
& GetSimdVector(uint32_t index
, uint32_t slot
)
1197 SWR_INVALID("%s NOT IMPLEMENTED", __FUNCTION__
);
1201 #if ENABLE_AVX512_SIMD16
1202 simd16vector
& GetSimdVector_simd16(uint32_t index
, uint32_t slot
)
1204 SWR_INVALID("%s NOT IMPLEMENTED", __FUNCTION__
);
1205 return junkVector_simd16
;
1209 static SIMDSCALARI
GenPrimMask(uint32_t numPrims
)
1211 SWR_ASSERT(numPrims
<= SIMD_WIDTH
);
1212 #if USE_SIMD16_FRONTEND
1213 static const OSALIGNLINE(int32_t) maskGen
[SIMD_WIDTH
* 2] =
1215 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
1216 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
1219 return _simd16_loadu_si((const SIMDSCALARI
*)&maskGen
[SIMD_WIDTH
- numPrims
]);
1221 static const OSALIGNLINE(int32_t) maskGen
[SIMD_WIDTH
* 2] =
1223 -1, -1, -1, -1, -1, -1, -1, -1,
1224 0, 0, 0, 0, 0, 0, 0, 0
1227 return _simd_loadu_si((const SIMDSCALARI
*)&maskGen
[SIMD_WIDTH
- numPrims
]);
1231 bool Assemble(uint32_t slot
, simdvector verts
[])
1233 SWR_ASSERT(slot
< m_numAttributes
);
1235 uint32_t numPrimsToAssemble
= PA_TESS::NumPrims();
1236 if (0 == numPrimsToAssemble
)
1241 SIMDSCALARI mask
= GenPrimMask(numPrimsToAssemble
);
1243 const float* pBaseAttrib
= (const float*)&m_pVertexData
[slot
* m_attributeStrideInVectors
* 4];
1244 for (uint32_t i
= 0; i
< m_numVertsPerPrim
; ++i
)
1246 #if USE_SIMD16_FRONTEND
1247 SIMDSCALARI indices
= _simd16_load_si((const SIMDSCALARI
*)m_ppIndices
[i
]);
1249 SIMDSCALARI indices
= _simd_load_si((const SIMDSCALARI
*)m_ppIndices
[i
]);
1252 const float* pBase
= pBaseAttrib
;
1253 for (uint32_t c
= 0; c
< 4; ++c
)
1255 #if USE_SIMD16_FRONTEND
1256 simd16scalar temp
= _simd16_mask_i32gather_ps(
1257 _simd16_setzero_ps(),
1260 _simd16_castsi_ps(mask
),
1261 4 /* gcc doesn't like sizeof(float) */);
1263 verts
[i
].v
[c
] = useAlternateOffset
? _simd16_extract_ps(temp
, 1) : _simd16_extract_ps(temp
, 0);
1265 verts
[i
].v
[c
] = _simd_mask_i32gather_ps(
1269 _simd_castsi_ps(mask
),
1270 4); // gcc doesn't like sizeof(float)
1272 pBase
+= m_attributeStrideInVectors
* SIMD_WIDTH
;
1279 #if ENABLE_AVX512_SIMD16
1280 bool Assemble(uint32_t slot
, simd16vector verts
[])
1282 SWR_ASSERT(slot
< m_numAttributes
);
1284 uint32_t numPrimsToAssemble
= PA_TESS::NumPrims();
1285 if (0 == numPrimsToAssemble
)
1290 SIMDSCALARI mask
= GenPrimMask(numPrimsToAssemble
);
1292 const float* pBaseAttrib
= (const float*)&m_pVertexData
[slot
* m_attributeStrideInVectors
* 4];
1293 for (uint32_t i
= 0; i
< m_numVertsPerPrim
; ++i
)
1295 #if USE_SIMD16_FRONTEND
1296 SIMDSCALARI indices
= _simd16_load_si((const SIMDSCALARI
*)m_ppIndices
[i
]);
1298 SIMDSCALARI indices
= _simd_load_si((const SIMDSCALARI
*)m_ppIndices
[i
]);
1301 const float* pBase
= pBaseAttrib
;
1302 for (uint32_t c
= 0; c
< 4; ++c
)
1304 #if USE_SIMD16_FRONTEND
1305 verts
[i
].v
[c
] = _simd16_mask_i32gather_ps(
1306 _simd16_setzero_ps(),
1309 _simd16_castsi_ps(mask
),
1310 4 /* gcc doesn't like sizeof(float) */);
1312 simdscalar temp
= _simd_mask_i32gather_ps(
1316 _simd_castsi_ps(mask
),
1317 4 /* gcc doesn't like sizeof(float) */);
1318 verts
[i
].v
[c
] = _simd16_insert_ps(_simd16_setzero_ps(), temp
, 0);
1320 pBase
+= m_attributeStrideInVectors
* SIMD_WIDTH
;
1328 void AssembleSingle(uint32_t slot
, uint32_t primIndex
, simd4scalar verts
[])
1330 SWR_ASSERT(slot
< m_numAttributes
);
1331 SWR_ASSERT(primIndex
< PA_TESS::NumPrims());
1333 const float* pVertDataBase
= (const float*)&m_pVertexData
[slot
* m_attributeStrideInVectors
* 4];
1334 for (uint32_t i
= 0; i
< m_numVertsPerPrim
; ++i
)
1336 #if USE_SIMD16_FRONTEND
1337 uint32_t index
= useAlternateOffset
? m_ppIndices
[i
][primIndex
+ SIMD_WIDTH_DIV2
] : m_ppIndices
[i
][primIndex
];
1339 uint32_t index
= m_ppIndices
[i
][primIndex
];
1341 const float* pVertData
= pVertDataBase
;
1342 float* pVert
= (float*)&verts
[i
];
1344 for (uint32_t c
= 0; c
< 4; ++c
)
1346 pVert
[c
] = pVertData
[index
];
1347 pVertData
+= m_attributeStrideInVectors
* SIMD_WIDTH
;
1354 uint32_t numPrims
= PA_TESS::NumPrims();
1355 m_numPrims
-= numPrims
;
1356 m_ppIndices
[0] += numPrims
;
1357 m_ppIndices
[1] += numPrims
;
1358 m_ppIndices
[2] += numPrims
;
1363 SIMDVERTEX
& GetNextVsOutput()
1369 bool GetNextStreamOutput()
1375 SIMDMASK
& GetNextVsIndices()
1383 return std::min
<uint32_t>(m_numPrims
, SIMD_WIDTH
);
1391 SIMDSCALARI
GetPrimID(uint32_t startID
)
1393 #if USE_SIMD16_FRONTEND
1394 return _simd16_add_epi32(_simd16_set1_epi32(startID
), m_vPrimId
);
1396 return _simd_add_epi32(_simd_set1_epi32(startID
), m_vPrimId
);
1401 const SIMDSCALAR
* m_pVertexData
= nullptr;
1402 uint32_t m_attributeStrideInVectors
= 0;
1403 uint32_t m_numAttributes
= 0;
1404 uint32_t m_numPrims
= 0;
1405 uint32_t* m_ppIndices
[3];
1407 uint32_t m_numVertsPerPrim
= 0;
1409 SIMDSCALARI m_vPrimId
;
1411 simdvector junkVector
; // junk simdvector for unimplemented API
1412 #if ENABLE_AVX512_SIMD16
1413 simd16vector junkVector_simd16
; // junk simd16vector for unimplemented API
1415 SIMDVERTEX junkVertex
; // junk SIMDVERTEX for unimplemented API
1416 SIMDMASK junkIndices
; // temporary index store for unused virtual function
1419 // Primitive Assembler factory class, responsible for creating and initializing the correct assembler
1421 template <typename IsIndexedT
, typename IsCutIndexEnabledT
>
1424 PA_FACTORY(DRAW_CONTEXT
* pDC
, PRIMITIVE_TOPOLOGY in_topo
, uint32_t numVerts
, PA_STATE::SIMDVERTEX
*pVertexStore
, uint32_t vertexStoreSize
, uint32_t vertexStride
, uint32_t numVertsPerPrim
) : topo(in_topo
)
1426 #if KNOB_ENABLE_CUT_AWARE_PA == TRUE
1427 const API_STATE
& state
= GetApiState(pDC
);
1428 if ((IsIndexedT::value
&& IsCutIndexEnabledT::value
&& (
1429 topo
== TOP_TRIANGLE_STRIP
|| topo
== TOP_POINT_LIST
||
1430 topo
== TOP_LINE_LIST
|| topo
== TOP_LINE_STRIP
||
1431 topo
== TOP_TRIANGLE_LIST
)) ||
1433 // non-indexed draws with adjacency topologies must use cut-aware PA until we add support
1434 // for them in the optimized PA
1435 (topo
== TOP_LINE_LIST_ADJ
|| topo
== TOP_LISTSTRIP_ADJ
|| topo
== TOP_TRI_LIST_ADJ
|| topo
== TOP_TRI_STRIP_ADJ
))
1437 memset(&indexStore
, 0, sizeof(indexStore
));
1438 uint32_t numAttribs
= state
.feNumAttributes
;
1440 new (&this->paCut
) PA_STATE_CUT(pDC
, reinterpret_cast<uint8_t *>(pVertexStore
), vertexStoreSize
* PA_STATE::SIMD_WIDTH
,
1441 vertexStride
, &this->indexStore
[0], numVerts
, numAttribs
, state
.topology
, false, numVertsPerPrim
);
1447 uint32_t numPrims
= GetNumPrims(in_topo
, numVerts
);
1448 new (&this->paOpt
) PA_STATE_OPT(pDC
, numPrims
, reinterpret_cast<uint8_t *>(pVertexStore
), vertexStoreSize
* PA_STATE::SIMD_WIDTH
, vertexStride
, false, numVertsPerPrim
);
1456 #if KNOB_ENABLE_CUT_AWARE_PA == TRUE
1471 bool cutPA
{ false };
1473 PRIMITIVE_TOPOLOGY topo
{ TOP_UNKNOWN
};
1475 PA_STATE::SIMDMASK indexStore
[MAX_NUM_VERTS_PER_PRIM
];