1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
25 * @brief Definitions for primitive assembly.
26 * N primitives are assembled at a time, where N is the SIMD width.
27 * A state machine, that is specific for a given topology, drives the
28 * assembly of vertices into triangles.
30 ******************************************************************************/
37 #if USE_SIMD16_FRONTEND
40 SIMD_WIDTH
= KNOB_SIMD16_WIDTH
,
41 SIMD_WIDTH_DIV2
= KNOB_SIMD16_WIDTH
/ 2,
45 typedef simd16mask SIMDMASK
;
47 typedef simd16scalar SIMDSCALAR
;
48 typedef simd16vector SIMDVECTOR
;
49 typedef simd16vertex SIMDVERTEX
;
51 typedef simd16scalari SIMDSCALARI
;
56 SIMD_WIDTH
= KNOB_SIMD_WIDTH
,
57 SIMD_WIDTH_DIV2
= KNOB_SIMD_WIDTH
/ 2,
61 typedef simdmask SIMDMASK
;
63 typedef simdscalar SIMDSCALAR
;
64 typedef simdvector SIMDVECTOR
;
65 typedef simdvertex SIMDVERTEX
;
67 typedef simdscalari SIMDSCALARI
;
70 DRAW_CONTEXT
*pDC
{ nullptr }; // draw context
71 uint8_t* pStreamBase
{ nullptr }; // vertex stream
72 uint32_t streamSizeInVerts
{ 0 }; // total size of the input stream in verts
73 uint32_t vertexStride
{ 0 }; // stride of a vertex in simdvector units
75 // The topology the binner will use. In some cases the FE changes the topology from the api state.
76 PRIMITIVE_TOPOLOGY binTopology
{ TOP_UNKNOWN
};
78 #if ENABLE_AVX512_SIMD16
79 bool useAlternateOffset
{ false };
83 PA_STATE(DRAW_CONTEXT
*in_pDC
, uint8_t* in_pStreamBase
, uint32_t in_streamSizeInVerts
, uint32_t in_vertexStride
) :
84 pDC(in_pDC
), pStreamBase(in_pStreamBase
), streamSizeInVerts(in_streamSizeInVerts
), vertexStride(in_vertexStride
) {}
86 virtual bool HasWork() = 0;
87 virtual simdvector
& GetSimdVector(uint32_t index
, uint32_t slot
) = 0;
88 #if ENABLE_AVX512_SIMD16
89 virtual simd16vector
& GetSimdVector_simd16(uint32_t index
, uint32_t slot
) = 0;
91 virtual bool Assemble(uint32_t slot
, simdvector verts
[]) = 0;
92 #if ENABLE_AVX512_SIMD16
93 virtual bool Assemble_simd16(uint32_t slot
, simd16vector verts
[]) = 0;
95 virtual void AssembleSingle(uint32_t slot
, uint32_t primIndex
, simd4scalar verts
[]) = 0;
96 virtual bool NextPrim() = 0;
97 virtual SIMDVERTEX
& GetNextVsOutput() = 0;
98 virtual bool GetNextStreamOutput() = 0;
99 virtual SIMDMASK
& GetNextVsIndices() = 0;
100 virtual uint32_t NumPrims() = 0;
101 virtual void Reset() = 0;
102 virtual SIMDSCALARI
GetPrimID(uint32_t startID
) = 0;
105 // The Optimized PA is a state machine that assembles triangles from vertex shader simd
106 // output. Here is the sequence
107 // 1. Execute FS/VS to generate a simd vertex (4 vertices for SSE simd and 8 for AVX simd).
108 // 2. Execute PA function to assemble and bin triangles.
109 // a. The PA function is a set of functions that collectively make up the
110 // state machine for a given topology.
111 // 1. We use a state index to track which PA function to call.
112 //          b. Often the PA function needs 2 simd vertices in order to assemble the next triangle.
113 // 1. We call this the current and previous simd vertex.
114 // 2. The SSE simd is 4-wide which is not a multiple of 3 needed for triangles. In
115 // order to assemble the second triangle, for a triangle list, we'll need the
116 // last vertex from the previous simd and the first 2 vertices from the current simd.
117 // 3. At times the PA can assemble multiple triangles from the 2 simd vertices.
119 // This optimized PA is not cut aware, so it should only be used by non-indexed draws or draws without
121 struct PA_STATE_OPT
: public PA_STATE
123 uint32_t numPrims
{ 0 }; // Total number of primitives for draw.
124 uint32_t numPrimsComplete
{ 0 }; // Total number of complete primitives.
126 uint32_t numSimdPrims
{ 0 }; // Number of prims in current simd.
128 uint32_t cur
{ 0 }; // index to current VS output.
129 uint32_t prev
{ 0 }; // index to prev VS output. Not really needed in the state.
130 const uint32_t first
{ 0 }; // index to first VS output. Used for tri fan and line loop.
132 uint32_t counter
{ 0 }; // state counter
133 bool reset
{ false }; // reset state
135 uint32_t primIDIncr
{ 0 }; // how much to increment for each vector (typically vector / {1, 2})
138 typedef bool(*PFN_PA_FUNC
)(PA_STATE_OPT
& pa
, uint32_t slot
, simdvector verts
[]);
139 #if ENABLE_AVX512_SIMD16
140 typedef bool(*PFN_PA_FUNC_SIMD16
)(PA_STATE_OPT
& pa
, uint32_t slot
, simd16vector verts
[]);
142 typedef void(*PFN_PA_SINGLE_FUNC
)(PA_STATE_OPT
& pa
, uint32_t slot
, uint32_t primIndex
, simd4scalar verts
[]);
144 PFN_PA_FUNC pfnPaFunc
{ nullptr }; // PA state machine function for assembling 4 triangles.
145 #if ENABLE_AVX512_SIMD16
146 PFN_PA_FUNC_SIMD16 pfnPaFunc_simd16
{ nullptr };
148 PFN_PA_SINGLE_FUNC pfnPaSingleFunc
{ nullptr }; // PA state machine function for assembling single triangle.
149 PFN_PA_FUNC pfnPaFuncReset
{ nullptr }; // initial state to set on reset
150 #if ENABLE_AVX512_SIMD16
151 PFN_PA_FUNC_SIMD16 pfnPaFuncReset_simd16
{ nullptr };
154 // state used to advance the PA when Next is called
155 PFN_PA_FUNC pfnPaNextFunc
{ nullptr };
156 #if ENABLE_AVX512_SIMD16
157 PFN_PA_FUNC_SIMD16 pfnPaNextFunc_simd16
{ nullptr };
159 uint32_t nextNumSimdPrims
{ 0 };
160 uint32_t nextNumPrimsIncrement
{ 0 };
161 bool nextReset
{ false };
162 bool isStreaming
{ false };
164 SIMDMASK junkIndices
{ 0 }; // temporary index store for unused virtual function
167 PA_STATE_OPT(DRAW_CONTEXT
* pDC
, uint32_t numPrims
, uint8_t* pStream
, uint32_t streamSizeInVerts
,
168 uint32_t vertexStride
, bool in_isStreaming
, PRIMITIVE_TOPOLOGY topo
= TOP_UNKNOWN
);
172 return (this->numPrimsComplete
< this->numPrims
) ? true : false;
175 simdvector
& GetSimdVector(uint32_t index
, uint32_t slot
)
177 SWR_ASSERT(slot
< vertexStride
);
178 uint32_t offset
= index
* vertexStride
+ slot
;
179 simdvector
& vertexSlot
= ((simdvector
*)pStreamBase
)[offset
];
183 #if ENABLE_AVX512_SIMD16
184 simd16vector
& GetSimdVector_simd16(uint32_t index
, uint32_t slot
)
186 SWR_ASSERT(slot
< vertexStride
);
187 uint32_t offset
= index
* vertexStride
+ slot
;
188 simd16vector
& vertexSlot
= ((simd16vector
*)pStreamBase
)[offset
];
193 // Assembles 4 triangles. Each simdvector is a single vertex from 4
194 // triangles (xxxx yyyy zzzz wwww) and there are 3 verts per triangle.
195 bool Assemble(uint32_t slot
, simdvector verts
[])
197 return this->pfnPaFunc(*this, slot
, verts
);
200 #if ENABLE_AVX512_SIMD16
201 bool Assemble_simd16(uint32_t slot
, simd16vector verts
[])
203 return this->pfnPaFunc_simd16(*this, slot
, verts
);
207 // Assembles 1 primitive. Each simdscalar is a vertex (xyzw).
208 void AssembleSingle(uint32_t slot
, uint32_t primIndex
, simd4scalar verts
[])
210 return this->pfnPaSingleFunc(*this, slot
, primIndex
, verts
);
215 this->pfnPaFunc
= this->pfnPaNextFunc
;
216 #if ENABLE_AVX512_SIMD16
217 this->pfnPaFunc_simd16
= this->pfnPaNextFunc_simd16
;
219 this->numSimdPrims
= this->nextNumSimdPrims
;
220 this->numPrimsComplete
+= this->nextNumPrimsIncrement
;
221 this->reset
= this->nextReset
;
223 if (this->isStreaming
)
228 bool morePrims
= false;
230 if (this->numSimdPrims
> 0)
233 this->numSimdPrims
--;
237 this->counter
= (this->reset
) ? 0 : (this->counter
+ 1);
243 morePrims
= false; // no more to do
249 SIMDVERTEX
& GetNextVsOutput()
251 const uint32_t numSimdVerts
= streamSizeInVerts
/ SIMD_WIDTH
;
253 // increment cur and prev indices
254 if (counter
< numSimdVerts
)
256 // prev undefined for first state
262 // swap/recycle last two simd verts for prev and cur, leave other simd verts intact in the buffer
263 uint32_t temp
= prev
;
269 SWR_ASSERT(cur
< numSimdVerts
);
270 SIMDVECTOR
* pVertex
= &((SIMDVECTOR
*)pStreamBase
)[cur
* vertexStride
];
272 return *(SIMDVERTEX
*)pVertex
;
275 SIMDMASK
& GetNextVsIndices()
277 // unused in optimized PA, pass tmp buffer back
281 bool GetNextStreamOutput()
283 this->prev
= this->cur
;
284 this->cur
= this->counter
;
291 return (this->numPrimsComplete
+ this->nextNumPrimsIncrement
> this->numPrims
) ?
292 (SIMD_WIDTH
- (this->numPrimsComplete
+ this->nextNumPrimsIncrement
- this->numPrims
)) : SIMD_WIDTH
;
295 void SetNextState(PA_STATE_OPT::PFN_PA_FUNC pfnPaNextFunc
,
296 PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc
,
297 uint32_t numSimdPrims
= 0,
298 uint32_t numPrimsIncrement
= 0,
301 this->pfnPaNextFunc
= pfnPaNextFunc
;
302 this->nextNumSimdPrims
= numSimdPrims
;
303 this->nextNumPrimsIncrement
= numPrimsIncrement
;
304 this->nextReset
= reset
;
306 this->pfnPaSingleFunc
= pfnPaNextSingleFunc
;
309 #if ENABLE_AVX512_SIMD16
310 void SetNextState_simd16(PA_STATE_OPT::PFN_PA_FUNC_SIMD16 pfnPaNextFunc_simd16
,
311 PA_STATE_OPT::PFN_PA_FUNC pfnPaNextFunc
,
312 PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc
,
313 uint32_t numSimdPrims
= 0,
314 uint32_t numPrimsIncrement
= 0,
317 this->pfnPaNextFunc_simd16
= pfnPaNextFunc_simd16
;
318 this->pfnPaNextFunc
= pfnPaNextFunc
;
319 this->nextNumSimdPrims
= numSimdPrims
;
320 this->nextNumPrimsIncrement
= numPrimsIncrement
;
321 this->nextReset
= reset
;
323 this->pfnPaSingleFunc
= pfnPaNextSingleFunc
;
329 #if ENABLE_AVX512_SIMD16
330 useAlternateOffset
= false;
333 this->pfnPaFunc
= this->pfnPaFuncReset
;
334 #if ENABLE_AVX512_SIMD16
335 this->pfnPaFunc_simd16
= this->pfnPaFuncReset_simd16
;
337 this->numPrimsComplete
= 0;
338 this->numSimdPrims
= 0;
345 SIMDSCALARI
GetPrimID(uint32_t startID
)
347 #if USE_SIMD16_FRONTEND
348 return _simd16_add_epi32(this->primID
,
349 _simd16_set1_epi32(startID
+ this->primIDIncr
* (this->numPrimsComplete
/ SIMD_WIDTH
)));
351 return _simd_add_epi32(this->primID
,
352 _simd_set1_epi32(startID
+ this->primIDIncr
* (this->numPrimsComplete
/ SIMD_WIDTH
)));
357 // helper C wrappers to avoid having to rewrite all the PA topology state functions
358 INLINE
void SetNextPaState(PA_STATE_OPT
& pa
, PA_STATE_OPT::PFN_PA_FUNC pfnPaNextFunc
,
359 PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc
,
360 uint32_t numSimdPrims
= 0,
361 uint32_t numPrimsIncrement
= 0,
364 return pa
.SetNextState(pfnPaNextFunc
, pfnPaNextSingleFunc
, numSimdPrims
, numPrimsIncrement
, reset
);
367 #if ENABLE_AVX512_SIMD16
368 INLINE
void SetNextPaState_simd16(PA_STATE_OPT
& pa
, PA_STATE_OPT::PFN_PA_FUNC_SIMD16 pfnPaNextFunc_simd16
,
369 PA_STATE_OPT::PFN_PA_FUNC pfnPaNextFunc
,
370 PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc
,
371 uint32_t numSimdPrims
= 0,
372 uint32_t numPrimsIncrement
= 0,
375 return pa
.SetNextState_simd16(pfnPaNextFunc_simd16
, pfnPaNextFunc
, pfnPaNextSingleFunc
, numSimdPrims
, numPrimsIncrement
, reset
);
379 INLINE simdvector
& PaGetSimdVector(PA_STATE
& pa
, uint32_t index
, uint32_t slot
)
381 return pa
.GetSimdVector(index
, slot
);
384 #if ENABLE_AVX512_SIMD16
385 INLINE simd16vector
& PaGetSimdVector_simd16(PA_STATE
& pa
, uint32_t index
, uint32_t slot
)
387 return pa
.GetSimdVector_simd16(index
, slot
);
391 // Cut-aware primitive assembler.
392 struct PA_STATE_CUT
: public PA_STATE
394 SIMDMASK
* pCutIndices
{ nullptr }; // cut indices buffer, 1 bit per vertex
395 uint32_t numVerts
{ 0 }; // number of vertices available in buffer store
396 uint32_t numAttribs
{ 0 }; // number of attributes
397 int32_t numRemainingVerts
{ 0 }; // number of verts remaining to be assembled
398 uint32_t numVertsToAssemble
{ 0 }; // total number of verts to assemble for the draw
399 #if ENABLE_AVX512_SIMD16
400 OSALIGNSIMD16(uint32_t) indices
[MAX_NUM_VERTS_PER_PRIM
][SIMD_WIDTH
]; // current index buffer for gather
402 OSALIGNSIMD(uint32_t) indices
[MAX_NUM_VERTS_PER_PRIM
][SIMD_WIDTH
]; // current index buffer for gather
404 SIMDSCALARI vOffsets
[MAX_NUM_VERTS_PER_PRIM
]; // byte offsets for currently assembling simd
405 uint32_t numPrimsAssembled
{ 0 }; // number of primitives that are fully assembled
406 uint32_t headVertex
{ 0 }; // current unused vertex slot in vertex buffer store
407 uint32_t tailVertex
{ 0 }; // beginning vertex currently assembling
408 uint32_t curVertex
{ 0 }; // current unprocessed vertex
409 uint32_t startPrimId
{ 0 }; // starting prim id
410 SIMDSCALARI vPrimId
; // vector of prim ID
411 bool needOffsets
{ false }; // need to compute gather offsets for current SIMD
412 uint32_t vertsPerPrim
{ 0 };
413 bool processCutVerts
{ false }; // vertex indices with cuts should be processed as normal, otherwise they
414 // are ignored. Fetch shader sends invalid verts on cuts that should be ignored
415 // while the GS sends valid verts for every index
417 simdvector junkVector
; // junk simdvector for unimplemented API
418 #if ENABLE_AVX512_SIMD16
419 simd16vector junkVector_simd16
; // junk simd16vector for unimplemented API
422 // Topology state tracking
423 uint32_t vert
[MAX_NUM_VERTS_PER_PRIM
];
424 uint32_t curIndex
{ 0 };
425 bool reverseWinding
{ false }; // indicates reverse winding for strips
426 int32_t adjExtraVert
{ 0 }; // extra vert uses for tristrip w/ adj
428 typedef void(PA_STATE_CUT::* PFN_PA_FUNC
)(uint32_t vert
, bool finish
);
429 PFN_PA_FUNC pfnPa
{ nullptr }; // per-topology function that processes a single vert
432 PA_STATE_CUT(DRAW_CONTEXT
* pDC
, uint8_t* in_pStream
, uint32_t in_streamSizeInVerts
, uint32_t in_vertexStride
, SIMDMASK
* in_pIndices
, uint32_t in_numVerts
,
433 uint32_t in_numAttribs
, PRIMITIVE_TOPOLOGY topo
, bool in_processCutVerts
)
434 : PA_STATE(pDC
, in_pStream
, in_streamSizeInVerts
, in_vertexStride
)
436 numVerts
= in_streamSizeInVerts
;
437 numAttribs
= in_numAttribs
;
440 processCutVerts
= in_processCutVerts
;
442 numVertsToAssemble
= numRemainingVerts
= in_numVerts
;
443 numPrimsAssembled
= 0;
444 headVertex
= tailVertex
= curVertex
= 0;
447 pCutIndices
= in_pIndices
;
448 memset(indices
, 0, sizeof(indices
));
449 #if USE_SIMD16_FRONTEND
450 vPrimId
= _simd16_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
452 vPrimId
= _simd_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
454 reverseWinding
= false;
457 bool gsEnabled
= pDC
->pState
->state
.gsState
.gsEnable
;
458 vertsPerPrim
= NumVertsPerPrim(topo
, gsEnabled
);
462 case TOP_TRIANGLE_LIST
: pfnPa
= &PA_STATE_CUT::ProcessVertTriList
; break;
463 case TOP_TRI_LIST_ADJ
: pfnPa
= gsEnabled
? &PA_STATE_CUT::ProcessVertTriListAdj
: &PA_STATE_CUT::ProcessVertTriListAdjNoGs
; break;
464 case TOP_TRIANGLE_STRIP
: pfnPa
= &PA_STATE_CUT::ProcessVertTriStrip
; break;
465 case TOP_TRI_STRIP_ADJ
: if (gsEnabled
)
467 pfnPa
= &PA_STATE_CUT::ProcessVertTriStripAdj
< true > ;
471 pfnPa
= &PA_STATE_CUT::ProcessVertTriStripAdj
< false > ;
475 case TOP_POINT_LIST
: pfnPa
= &PA_STATE_CUT::ProcessVertPointList
; break;
476 case TOP_LINE_LIST
: pfnPa
= &PA_STATE_CUT::ProcessVertLineList
; break;
477 case TOP_LINE_LIST_ADJ
: pfnPa
= gsEnabled
? &PA_STATE_CUT::ProcessVertLineListAdj
: &PA_STATE_CUT::ProcessVertLineListAdjNoGs
; break;
478 case TOP_LINE_STRIP
: pfnPa
= &PA_STATE_CUT::ProcessVertLineStrip
; break;
479 case TOP_LISTSTRIP_ADJ
: pfnPa
= gsEnabled
? &PA_STATE_CUT::ProcessVertLineStripAdj
: &PA_STATE_CUT::ProcessVertLineStripAdjNoGs
; break;
480 default: assert(0 && "Unimplemented topology");
484 SIMDVERTEX
& GetNextVsOutput()
486 uint32_t vertexIndex
= this->headVertex
/ SIMD_WIDTH
;
487 this->headVertex
= (this->headVertex
+ SIMD_WIDTH
) % this->numVerts
;
488 this->needOffsets
= true;
489 SIMDVECTOR
* pVertex
= &((SIMDVECTOR
*)pStreamBase
)[vertexIndex
* vertexStride
];
491 return *(SIMDVERTEX
*)pVertex
;
494 SIMDMASK
& GetNextVsIndices()
496 uint32_t vertexIndex
= this->headVertex
/ SIMD_WIDTH
;
497 SIMDMASK
* pCurCutIndex
= this->pCutIndices
+ vertexIndex
;
498 return *pCurCutIndex
;
501 simdvector
& GetSimdVector(uint32_t index
, uint32_t slot
)
504 SWR_ASSERT(0 && "Not implemented");
508 #if ENABLE_AVX512_SIMD16
509 simd16vector
& GetSimdVector_simd16(uint32_t index
, uint32_t slot
)
512 SWR_ASSERT(0 && "Not implemented");
513 return junkVector_simd16
;
517 bool GetNextStreamOutput()
519 this->headVertex
+= SIMD_WIDTH
;
520 this->needOffsets
= true;
524 SIMDSCALARI
GetPrimID(uint32_t startID
)
526 #if USE_SIMD16_FRONTEND
527 return _simd16_add_epi32(_simd16_set1_epi32(startID
), this->vPrimId
);
529 return _simd_add_epi32(_simd_set1_epi32(startID
), this->vPrimId
);
535 #if ENABLE_AVX512_SIMD16
536 useAlternateOffset
= false;
539 this->numRemainingVerts
= this->numVertsToAssemble
;
540 this->numPrimsAssembled
= 0;
543 this->tailVertex
= 0;
544 this->headVertex
= 0;
545 this->reverseWinding
= false;
546 this->adjExtraVert
= -1;
547 #if USE_SIMD16_FRONTEND
548 this->vPrimId
= _simd16_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
550 this->vPrimId
= _simd_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
556 return this->numRemainingVerts
> 0 || this->adjExtraVert
!= -1;
559 bool IsVertexStoreFull()
561 return ((this->headVertex
+ SIMD_WIDTH
) % this->numVerts
) == this->tailVertex
;
564 void RestartTopology()
567 this->reverseWinding
= false;
568 this->adjExtraVert
= -1;
571 bool IsCutIndex(uint32_t vertex
)
573 uint32_t vertexIndex
= vertex
/ SIMD_WIDTH
;
574 uint32_t vertexOffset
= vertex
& (SIMD_WIDTH
- 1);
575 return _bittest((const LONG
*)&this->pCutIndices
[vertexIndex
], vertexOffset
) == 1;
578 // iterates across the unprocessed verts until we hit the end or we
579 // have assembled SIMD prims
582 while (this->numPrimsAssembled
!= SIMD_WIDTH
&&
583 this->numRemainingVerts
> 0 &&
584 this->curVertex
!= this->headVertex
)
586 // if cut index, restart topology
587 if (IsCutIndex(this->curVertex
))
589 if (this->processCutVerts
)
591 (this->*pfnPa
)(this->curVertex
, false);
593 // finish off tri strip w/ adj before restarting topo
594 if (this->adjExtraVert
!= -1)
596 (this->*pfnPa
)(this->curVertex
, true);
602 (this->*pfnPa
)(this->curVertex
, false);
606 if (this->curVertex
>= this->numVerts
) {
609 this->numRemainingVerts
--;
612 // special case last primitive for tri strip w/ adj
613 if (this->numPrimsAssembled
!= SIMD_WIDTH
&& this->numRemainingVerts
== 0 && this->adjExtraVert
!= -1)
615 (this->*pfnPa
)(this->curVertex
, true);
621 // done with current batch
622 // advance tail to the current unsubmitted vertex
623 this->tailVertex
= this->curVertex
;
624 this->numPrimsAssembled
= 0;
625 #if USE_SIMD16_FRONTEND
626 this->vPrimId
= _simd16_add_epi32(vPrimId
, _simd16_set1_epi32(SIMD_WIDTH
));
628 this->vPrimId
= _simd_add_epi32(vPrimId
, _simd_set1_epi32(SIMD_WIDTH
));
634 // if we've assembled enough prims, we can advance to the next set of verts
635 if (this->numPrimsAssembled
== SIMD_WIDTH
|| this->numRemainingVerts
<= 0)
642 void ComputeOffsets()
644 for (uint32_t v
= 0; v
< this->vertsPerPrim
; ++v
)
646 uint32_t vertexStrideBytes
= vertexStride
* sizeof(SIMDVECTOR
);
647 SIMDSCALARI vIndices
= *(SIMDSCALARI
*)&this->indices
[v
][0];
649 // step to simdvertex batch
650 const uint32_t simdShift
= SIMD_WIDTH_LOG2
;
651 #if USE_SIMD16_FRONTEND
652 SIMDSCALARI vVertexBatch
= _simd16_srai_epi32(vIndices
, simdShift
);
653 this->vOffsets
[v
] = _simd16_mullo_epi32(vVertexBatch
, _simd16_set1_epi32(vertexStrideBytes
));
655 SIMDSCALARI vVertexBatch
= _simd_srai_epi32(vIndices
, simdShift
);
656 this->vOffsets
[v
] = _simd_mullo_epi32(vVertexBatch
, _simd_set1_epi32(vertexStrideBytes
));
660 const uint32_t simdMask
= SIMD_WIDTH
- 1;
661 #if USE_SIMD16_FRONTEND
662 SIMDSCALARI vVertexIndex
= _simd16_and_si(vIndices
, _simd16_set1_epi32(simdMask
));
663 this->vOffsets
[v
] = _simd16_add_epi32(this->vOffsets
[v
], _simd16_mullo_epi32(vVertexIndex
, _simd16_set1_epi32(sizeof(float))));
665 SIMDSCALARI vVertexIndex
= _simd_and_si(vIndices
, _simd_set1_epi32(simdMask
));
666 this->vOffsets
[v
] = _simd_add_epi32(this->vOffsets
[v
], _simd_mullo_epi32(vVertexIndex
, _simd_set1_epi32(sizeof(float))));
671 bool Assemble(uint32_t slot
, simdvector
*verts
)
673 // process any outstanding verts
676 // return false if we don't have enough prims assembled
677 if (this->numPrimsAssembled
!= SIMD_WIDTH
&& this->numRemainingVerts
> 0)
682 // cache off gather offsets given the current SIMD set of indices the first time we get an assemble
683 if (this->needOffsets
)
686 this->needOffsets
= false;
689 for (uint32_t v
= 0; v
< this->vertsPerPrim
; ++v
)
691 SIMDSCALARI offsets
= this->vOffsets
[v
];
694 #if USE_SIMD16_FRONTEND
695 offsets
= _simd16_add_epi32(offsets
, _simd16_set1_epi32(slot
* sizeof(SIMDVECTOR
)));
697 offsets
= _simd_add_epi32(offsets
, _simd_set1_epi32(slot
* sizeof(SIMDVECTOR
)));
700 float* pBase
= (float*)this->pStreamBase
;
701 for (uint32_t c
= 0; c
< 4; ++c
)
703 #if USE_SIMD16_FRONTEND
704 simd16scalar temp
= _simd16_i32gather_ps(pBase
, offsets
, 1);
706 // Assigning to a temporary first to avoid an MSVC 2017 compiler bug
707 simdscalar t
= useAlternateOffset
? _simd16_extract_ps(temp
, 1) : _simd16_extract_ps(temp
, 0);
710 verts
[v
].v
[c
] = _simd_i32gather_ps(pBase
, offsets
, 1);
713 // move base to next component
#if ENABLE_AVX512_SIMD16
    // SIMD16 variant of Assemble: gathers into simd16vector outputs.
    bool Assemble_simd16(uint32_t slot, simd16vector verts[])
    {
        // process any outstanding verts
        ProcessVerts();

        // return false if we don't have enough prims assembled
        if (this->numPrimsAssembled != SIMD_WIDTH && this->numRemainingVerts > 0)
        {
            return false;
        }

        // cache off gather offsets given the current SIMD set of indices the first time we get an assemble
        if (this->needOffsets)
        {
            ComputeOffsets();
            this->needOffsets = false;
        }

        for (uint32_t v = 0; v < this->vertsPerPrim; ++v)
        {
            SIMDSCALARI offsets = this->vOffsets[v];

            // step to attribute
#if USE_SIMD16_FRONTEND
            offsets = _simd16_add_epi32(offsets, _simd16_set1_epi32(slot * sizeof(SIMDVECTOR)));
#else
            offsets = _simd_add_epi32(offsets, _simd_set1_epi32(slot * sizeof(simdvector)));
#endif

            float* pBase = (float*)this->pStreamBase;
            for (uint32_t c = 0; c < 4; ++c)
            {
#if USE_SIMD16_FRONTEND
                verts[v].v[c] = _simd16_i32gather_ps(pBase, offsets, 1);
#else
                // native-width gather widened into the low half of a simd16
                verts[v].v[c] =
                    _simd16_insert_ps(_simd16_setzero_ps(), _simd_i32gather_ps(pBase, offsets, 1), 0);
#endif

                // move base to next component
                pBase += SIMD_WIDTH;
            }
        }

        return true;
    }

#endif
769 void AssembleSingle(uint32_t slot
, uint32_t triIndex
, simd4scalar tri
[3])
772 for (uint32_t v
= 0; v
< this->vertsPerPrim
; ++v
)
774 uint32_t* pOffset
= (uint32_t*)&this->vOffsets
[v
];
775 #if USE_SIMD16_FRONTEND
776 uint32_t offset
= useAlternateOffset
? pOffset
[triIndex
+ SIMD_WIDTH_DIV2
] : pOffset
[triIndex
];
778 uint32_t offset
= pOffset
[triIndex
];
780 offset
+= sizeof(SIMDVECTOR
) * slot
;
781 float* pVert
= (float*)&tri
[v
];
782 for (uint32_t c
= 0; c
< 4; ++c
)
784 float* pComponent
= (float*)(this->pStreamBase
+ offset
);
785 pVert
[c
] = *pComponent
;
786 offset
+= SIMD_WIDTH
* sizeof(float);
793 return this->numPrimsAssembled
;
796 // Per-topology functions
797 void ProcessVertTriStrip(uint32_t index
, bool finish
)
799 this->vert
[this->curIndex
] = index
;
801 if (this->curIndex
== 3)
803 // assembled enough verts for prim, add to gather indices
804 this->indices
[0][this->numPrimsAssembled
] = this->vert
[0];
807 this->indices
[1][this->numPrimsAssembled
] = this->vert
[2];
808 this->indices
[2][this->numPrimsAssembled
] = this->vert
[1];
812 this->indices
[1][this->numPrimsAssembled
] = this->vert
[1];
813 this->indices
[2][this->numPrimsAssembled
] = this->vert
[2];
816 // increment numPrimsAssembled
817 this->numPrimsAssembled
++;
819 // set up next prim state
820 this->vert
[0] = this->vert
[1];
821 this->vert
[1] = this->vert
[2];
823 this->reverseWinding
^= 1;
827 template<bool gsEnabled
>
828 void AssembleTriStripAdj()
832 this->vert
[1] = this->vert
[2];
833 this->vert
[2] = this->vert
[4];
835 this->indices
[0][this->numPrimsAssembled
] = this->vert
[0];
836 this->indices
[1][this->numPrimsAssembled
] = this->vert
[1];
837 this->indices
[2][this->numPrimsAssembled
] = this->vert
[2];
839 this->vert
[4] = this->vert
[2];
840 this->vert
[2] = this->vert
[1];
844 this->indices
[0][this->numPrimsAssembled
] = this->vert
[0];
845 this->indices
[1][this->numPrimsAssembled
] = this->vert
[1];
846 this->indices
[2][this->numPrimsAssembled
] = this->vert
[2];
847 this->indices
[3][this->numPrimsAssembled
] = this->vert
[3];
848 this->indices
[4][this->numPrimsAssembled
] = this->vert
[4];
849 this->indices
[5][this->numPrimsAssembled
] = this->vert
[5];
851 this->numPrimsAssembled
++;
855 template<bool gsEnabled
>
856 void ProcessVertTriStripAdj(uint32_t index
, bool finish
)
858 // handle last primitive of tristrip
859 if (finish
&& this->adjExtraVert
!= -1)
861 this->vert
[3] = this->adjExtraVert
;
862 AssembleTriStripAdj
<gsEnabled
>();
863 this->adjExtraVert
= -1;
867 switch (this->curIndex
)
873 this->vert
[this->curIndex
] = index
;
877 this->vert
[5] = index
;
881 if (this->adjExtraVert
== -1)
883 this->adjExtraVert
= index
;
887 this->vert
[3] = index
;
890 AssembleTriStripAdj
<gsEnabled
>();
893 if (this->reverseWinding
)
895 nextTri
[0] = this->vert
[4];
896 nextTri
[1] = this->vert
[0];
897 nextTri
[2] = this->vert
[2];
898 nextTri
[4] = this->vert
[3];
899 nextTri
[5] = this->adjExtraVert
;
903 nextTri
[0] = this->vert
[2];
904 nextTri
[1] = this->adjExtraVert
;
905 nextTri
[2] = this->vert
[3];
906 nextTri
[4] = this->vert
[4];
907 nextTri
[5] = this->vert
[0];
909 for (uint32_t i
= 0; i
< 6; ++i
)
911 this->vert
[i
] = nextTri
[i
];
914 this->adjExtraVert
= -1;
915 this->reverseWinding
^= 1;
924 SWR_ASSERT(this->adjExtraVert
!= -1, "Algorith failure!");
925 AssembleTriStripAdj
<gsEnabled
>();
928 if (this->reverseWinding
)
930 nextTri
[0] = this->vert
[4];
931 nextTri
[1] = this->vert
[0];
932 nextTri
[2] = this->vert
[2];
933 nextTri
[4] = this->vert
[3];
934 nextTri
[5] = this->adjExtraVert
;
938 nextTri
[0] = this->vert
[2];
939 nextTri
[1] = this->adjExtraVert
;
940 nextTri
[2] = this->vert
[3];
941 nextTri
[4] = this->vert
[4];
942 nextTri
[5] = this->vert
[0];
944 for (uint32_t i
= 0; i
< 6; ++i
)
946 this->vert
[i
] = nextTri
[i
];
948 this->reverseWinding
^= 1;
949 this->adjExtraVert
= index
;
955 void ProcessVertTriList(uint32_t index
, bool finish
)
957 this->vert
[this->curIndex
] = index
;
959 if (this->curIndex
== 3)
961 // assembled enough verts for prim, add to gather indices
962 this->indices
[0][this->numPrimsAssembled
] = this->vert
[0];
963 this->indices
[1][this->numPrimsAssembled
] = this->vert
[1];
964 this->indices
[2][this->numPrimsAssembled
] = this->vert
[2];
966 // increment numPrimsAssembled
967 this->numPrimsAssembled
++;
969 // set up next prim state
974 void ProcessVertTriListAdj(uint32_t index
, bool finish
)
976 this->vert
[this->curIndex
] = index
;
978 if (this->curIndex
== 6)
980 // assembled enough verts for prim, add to gather indices
981 this->indices
[0][this->numPrimsAssembled
] = this->vert
[0];
982 this->indices
[1][this->numPrimsAssembled
] = this->vert
[1];
983 this->indices
[2][this->numPrimsAssembled
] = this->vert
[2];
984 this->indices
[3][this->numPrimsAssembled
] = this->vert
[3];
985 this->indices
[4][this->numPrimsAssembled
] = this->vert
[4];
986 this->indices
[5][this->numPrimsAssembled
] = this->vert
[5];
988 // increment numPrimsAssembled
989 this->numPrimsAssembled
++;
991 // set up next prim state
996 void ProcessVertTriListAdjNoGs(uint32_t index
, bool finish
)
998 this->vert
[this->curIndex
] = index
;
1000 if (this->curIndex
== 6)
1002 // assembled enough verts for prim, add to gather indices
1003 this->indices
[0][this->numPrimsAssembled
] = this->vert
[0];
1004 this->indices
[1][this->numPrimsAssembled
] = this->vert
[2];
1005 this->indices
[2][this->numPrimsAssembled
] = this->vert
[4];
1007 // increment numPrimsAssembled
1008 this->numPrimsAssembled
++;
1010 // set up next prim state
1016 void ProcessVertLineList(uint32_t index
, bool finish
)
1018 this->vert
[this->curIndex
] = index
;
1020 if (this->curIndex
== 2)
1022 this->indices
[0][this->numPrimsAssembled
] = this->vert
[0];
1023 this->indices
[1][this->numPrimsAssembled
] = this->vert
[1];
1025 this->numPrimsAssembled
++;
1030 void ProcessVertLineStrip(uint32_t index
, bool finish
)
1032 this->vert
[this->curIndex
] = index
;
1034 if (this->curIndex
== 2)
1036 // assembled enough verts for prim, add to gather indices
1037 this->indices
[0][this->numPrimsAssembled
] = this->vert
[0];
1038 this->indices
[1][this->numPrimsAssembled
] = this->vert
[1];
1040 // increment numPrimsAssembled
1041 this->numPrimsAssembled
++;
1043 // set up next prim state
1044 this->vert
[0] = this->vert
[1];
1049 void ProcessVertLineStripAdj(uint32_t index
, bool finish
)
1051 this->vert
[this->curIndex
] = index
;
1053 if (this->curIndex
== 4)
1055 // assembled enough verts for prim, add to gather indices
1056 this->indices
[0][this->numPrimsAssembled
] = this->vert
[0];
1057 this->indices
[1][this->numPrimsAssembled
] = this->vert
[1];
1058 this->indices
[2][this->numPrimsAssembled
] = this->vert
[2];
1059 this->indices
[3][this->numPrimsAssembled
] = this->vert
[3];
1061 // increment numPrimsAssembled
1062 this->numPrimsAssembled
++;
1064 // set up next prim state
1065 this->vert
[0] = this->vert
[1];
1066 this->vert
[1] = this->vert
[2];
1067 this->vert
[2] = this->vert
[3];
1072 void ProcessVertLineStripAdjNoGs(uint32_t index
, bool finish
)
1074 this->vert
[this->curIndex
] = index
;
1076 if (this->curIndex
== 4)
1078 // assembled enough verts for prim, add to gather indices
1079 this->indices
[0][this->numPrimsAssembled
] = this->vert
[1];
1080 this->indices
[1][this->numPrimsAssembled
] = this->vert
[2];
1082 // increment numPrimsAssembled
1083 this->numPrimsAssembled
++;
1085 // set up next prim state
1086 this->vert
[0] = this->vert
[1];
1087 this->vert
[1] = this->vert
[2];
1088 this->vert
[2] = this->vert
[3];
1093 void ProcessVertLineListAdj(uint32_t index
, bool finish
)
1095 this->vert
[this->curIndex
] = index
;
1097 if (this->curIndex
== 4)
1099 this->indices
[0][this->numPrimsAssembled
] = this->vert
[0];
1100 this->indices
[1][this->numPrimsAssembled
] = this->vert
[1];
1101 this->indices
[2][this->numPrimsAssembled
] = this->vert
[2];
1102 this->indices
[3][this->numPrimsAssembled
] = this->vert
[3];
1104 this->numPrimsAssembled
++;
1109 void ProcessVertLineListAdjNoGs(uint32_t index
, bool finish
)
1111 this->vert
[this->curIndex
] = index
;
1113 if (this->curIndex
== 4)
1115 this->indices
[0][this->numPrimsAssembled
] = this->vert
[1];
1116 this->indices
[1][this->numPrimsAssembled
] = this->vert
[2];
1118 this->numPrimsAssembled
++;
1123 void ProcessVertPointList(uint32_t index
, bool finish
)
1125 this->vert
[this->curIndex
] = index
;
1127 if (this->curIndex
== 1)
1129 this->indices
[0][this->numPrimsAssembled
] = this->vert
[0];
1130 this->numPrimsAssembled
++;
1136 // Primitive Assembly for data output from the DomainShader.
1137 struct PA_TESS
: PA_STATE
1140 DRAW_CONTEXT
*in_pDC
,
1141 const SIMDSCALAR
* in_pVertData
,
1142 uint32_t in_attributeStrideInVectors
,
1143 uint32_t in_vertexStride
,
1144 uint32_t in_numAttributes
,
1145 uint32_t* (&in_ppIndices
)[3],
1146 uint32_t in_numPrims
,
1147 PRIMITIVE_TOPOLOGY in_binTopology
) :
1149 PA_STATE(in_pDC
, nullptr, 0, in_vertexStride
),
1150 m_pVertexData(in_pVertData
),
1151 m_attributeStrideInVectors(in_attributeStrideInVectors
),
1152 m_numAttributes(in_numAttributes
),
1153 m_numPrims(in_numPrims
)
1155 #if USE_SIMD16_FRONTEND
1156 m_vPrimId
= _simd16_setzero_si();
1158 m_vPrimId
= _simd_setzero_si();
1160 binTopology
= in_binTopology
;
1161 m_ppIndices
[0] = in_ppIndices
[0];
1162 m_ppIndices
[1] = in_ppIndices
[1];
1163 m_ppIndices
[2] = in_ppIndices
[2];
1165 switch (binTopology
)
1167 case TOP_POINT_LIST
:
1168 m_numVertsPerPrim
= 1;
1172 m_numVertsPerPrim
= 2;
1175 case TOP_TRIANGLE_LIST
:
1176 m_numVertsPerPrim
= 3;
1180 SWR_INVALID("Invalid binTopology (%d) for %s", binTopology
, __FUNCTION__
);
1187 return m_numPrims
!= 0;
1190 simdvector
& GetSimdVector(uint32_t index
, uint32_t slot
)
1192 SWR_INVALID("%s NOT IMPLEMENTED", __FUNCTION__
);
1196 #if ENABLE_AVX512_SIMD16
1197 simd16vector
& GetSimdVector_simd16(uint32_t index
, uint32_t slot
)
1199 SWR_INVALID("%s NOT IMPLEMENTED", __FUNCTION__
);
1200 return junkVector_simd16
;
1204 static SIMDSCALARI
GenPrimMask(uint32_t numPrims
)
1206 SWR_ASSERT(numPrims
<= SIMD_WIDTH
);
1207 #if USE_SIMD16_FRONTEND
1208 static const OSALIGNLINE(int32_t) maskGen
[SIMD_WIDTH
* 2] =
1210 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
1211 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
1214 return _simd16_loadu_si((const SIMDSCALARI
*)&maskGen
[SIMD_WIDTH
- numPrims
]);
1216 static const OSALIGNLINE(int32_t) maskGen
[SIMD_WIDTH
* 2] =
1218 -1, -1, -1, -1, -1, -1, -1, -1,
1219 0, 0, 0, 0, 0, 0, 0, 0
1222 return _simd_loadu_si((const SIMDSCALARI
*)&maskGen
[SIMD_WIDTH
- numPrims
]);
1226 bool Assemble(uint32_t slot
, simdvector verts
[])
1228 SWR_ASSERT(slot
< m_numAttributes
);
1230 uint32_t numPrimsToAssemble
= PA_TESS::NumPrims();
1231 if (0 == numPrimsToAssemble
)
1236 SIMDSCALARI mask
= GenPrimMask(numPrimsToAssemble
);
1238 const float* pBaseAttrib
= (const float*)&m_pVertexData
[slot
* m_attributeStrideInVectors
* 4];
1239 for (uint32_t i
= 0; i
< m_numVertsPerPrim
; ++i
)
1241 #if USE_SIMD16_FRONTEND
1242 SIMDSCALARI indices
= _simd16_load_si((const SIMDSCALARI
*)m_ppIndices
[i
]);
1244 SIMDSCALARI indices
= _simd_load_si((const SIMDSCALARI
*)m_ppIndices
[i
]);
1247 const float* pBase
= pBaseAttrib
;
1248 for (uint32_t c
= 0; c
< 4; ++c
)
1250 #if USE_SIMD16_FRONTEND
1251 simd16scalar temp
= _simd16_mask_i32gather_ps(
1252 _simd16_setzero_ps(),
1255 _simd16_castsi_ps(mask
),
1256 4 /* gcc doesn't like sizeof(float) */);
1258 verts
[i
].v
[c
] = useAlternateOffset
? _simd16_extract_ps(temp
, 1) : _simd16_extract_ps(temp
, 0);
1260 verts
[i
].v
[c
] = _simd_mask_i32gather_ps(
1264 _simd_castsi_ps(mask
),
1265 4); // gcc doesn't like sizeof(float)
1267 pBase
+= m_attributeStrideInVectors
* SIMD_WIDTH
;
1274 #if ENABLE_AVX512_SIMD16
1275 bool Assemble_simd16(uint32_t slot
, simd16vector verts
[])
1277 SWR_ASSERT(slot
< m_numAttributes
);
1279 uint32_t numPrimsToAssemble
= PA_TESS::NumPrims();
1280 if (0 == numPrimsToAssemble
)
1285 SIMDSCALARI mask
= GenPrimMask(numPrimsToAssemble
);
1287 const float* pBaseAttrib
= (const float*)&m_pVertexData
[slot
* m_attributeStrideInVectors
* 4];
1288 for (uint32_t i
= 0; i
< m_numVertsPerPrim
; ++i
)
1290 #if USE_SIMD16_FRONTEND
1291 SIMDSCALARI indices
= _simd16_load_si((const SIMDSCALARI
*)m_ppIndices
[i
]);
1293 SIMDSCALARI indices
= _simd_load_si((const SIMDSCALARI
*)m_ppIndices
[i
]);
1296 const float* pBase
= pBaseAttrib
;
1297 for (uint32_t c
= 0; c
< 4; ++c
)
1299 #if USE_SIMD16_FRONTEND
1300 verts
[i
].v
[c
] = _simd16_mask_i32gather_ps(
1301 _simd16_setzero_ps(),
1304 _simd16_castsi_ps(mask
),
1305 4 /* gcc doesn't like sizeof(float) */);
1307 simdscalar temp
= _simd_mask_i32gather_ps(
1311 _simd_castsi_ps(mask
),
1312 4 /* gcc doesn't like sizeof(float) */);
1313 verts
[i
].v
[c
] = _simd16_insert_ps(_simd16_setzero_ps(), temp
, 0);
1315 pBase
+= m_attributeStrideInVectors
* SIMD_WIDTH
;
1323 void AssembleSingle(uint32_t slot
, uint32_t primIndex
, simd4scalar verts
[])
1325 SWR_ASSERT(slot
< m_numAttributes
);
1326 SWR_ASSERT(primIndex
< PA_TESS::NumPrims());
1328 const float* pVertDataBase
= (const float*)&m_pVertexData
[slot
* m_attributeStrideInVectors
* 4];
1329 for (uint32_t i
= 0; i
< m_numVertsPerPrim
; ++i
)
1331 #if USE_SIMD16_FRONTEND
1332 uint32_t index
= useAlternateOffset
? m_ppIndices
[i
][primIndex
+ SIMD_WIDTH_DIV2
] : m_ppIndices
[i
][primIndex
];
1334 uint32_t index
= m_ppIndices
[i
][primIndex
];
1336 const float* pVertData
= pVertDataBase
;
1337 float* pVert
= (float*)&verts
[i
];
1339 for (uint32_t c
= 0; c
< 4; ++c
)
1341 pVert
[c
] = pVertData
[index
];
1342 pVertData
+= m_attributeStrideInVectors
* SIMD_WIDTH
;
1349 uint32_t numPrims
= PA_TESS::NumPrims();
1350 m_numPrims
-= numPrims
;
1351 m_ppIndices
[0] += numPrims
;
1352 m_ppIndices
[1] += numPrims
;
1353 m_ppIndices
[2] += numPrims
;
1358 SIMDVERTEX
& GetNextVsOutput()
1364 bool GetNextStreamOutput()
1370 SIMDMASK
& GetNextVsIndices()
1378 return std::min
<uint32_t>(m_numPrims
, SIMD_WIDTH
);
1386 SIMDSCALARI
GetPrimID(uint32_t startID
)
1388 #if USE_SIMD16_FRONTEND
1389 return _simd16_add_epi32(_simd16_set1_epi32(startID
), m_vPrimId
);
1391 return _simd_add_epi32(_simd_set1_epi32(startID
), m_vPrimId
);
1396 const SIMDSCALAR
* m_pVertexData
= nullptr;
1397 uint32_t m_attributeStrideInVectors
= 0;
1398 uint32_t m_numAttributes
= 0;
1399 uint32_t m_numPrims
= 0;
1400 uint32_t* m_ppIndices
[3];
1402 uint32_t m_numVertsPerPrim
= 0;
1404 SIMDSCALARI m_vPrimId
;
1406 simdvector junkVector
; // junk simdvector for unimplemented API
1407 #if ENABLE_AVX512_SIMD16
1408 simd16vector junkVector_simd16
; // junk simd16vector for unimplemented API
1410 SIMDVERTEX junkVertex
; // junk SIMDVERTEX for unimplemented API
1411 SIMDMASK junkIndices
; // temporary index store for unused virtual function
1414 // Primitive Assembler factory class, responsible for creating and initializing the correct assembler
1416 template <typename IsIndexedT
, typename IsCutIndexEnabledT
>
1419 PA_FACTORY(DRAW_CONTEXT
* pDC
, PRIMITIVE_TOPOLOGY in_topo
, uint32_t numVerts
, PA_STATE::SIMDVERTEX
*pVertexStore
, uint32_t vertexStoreSize
, uint32_t vertexStride
) : topo(in_topo
)
1421 #if KNOB_ENABLE_CUT_AWARE_PA == TRUE
1422 const API_STATE
& state
= GetApiState(pDC
);
1423 if ((IsIndexedT::value
&& IsCutIndexEnabledT::value
&& (
1424 topo
== TOP_TRIANGLE_STRIP
|| topo
== TOP_POINT_LIST
||
1425 topo
== TOP_LINE_LIST
|| topo
== TOP_LINE_STRIP
||
1426 topo
== TOP_TRIANGLE_LIST
)) ||
1428 // non-indexed draws with adjacency topologies must use cut-aware PA until we add support
1429 // for them in the optimized PA
1430 (topo
== TOP_LINE_LIST_ADJ
|| topo
== TOP_LISTSTRIP_ADJ
|| topo
== TOP_TRI_LIST_ADJ
|| topo
== TOP_TRI_STRIP_ADJ
))
1432 memset(&indexStore
, 0, sizeof(indexStore
));
1433 uint32_t numAttribs
= state
.feNumAttributes
;
1435 new (&this->paCut
) PA_STATE_CUT(pDC
, reinterpret_cast<uint8_t *>(pVertexStore
), vertexStoreSize
* PA_STATE::SIMD_WIDTH
,
1436 vertexStride
, &this->indexStore
[0], numVerts
, numAttribs
, state
.topology
, false);
1442 uint32_t numPrims
= GetNumPrims(in_topo
, numVerts
);
1443 new (&this->paOpt
) PA_STATE_OPT(pDC
, numPrims
, reinterpret_cast<uint8_t *>(pVertexStore
), vertexStoreSize
* PA_STATE::SIMD_WIDTH
, vertexStride
, false);
1451 #if KNOB_ENABLE_CUT_AWARE_PA == TRUE
1466 bool cutPA
{ false };
1468 PRIMITIVE_TOPOLOGY topo
{ TOP_UNKNOWN
};
1470 PA_STATE::SIMDMASK indexStore
[MAX_NUM_VERTS_PER_PRIM
];