1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
25 * @brief Definitions for primitive assembly.
26 * N primitives are assembled at a time, where N is the SIMD width.
27 * A state machine, that is specific for a given topology, drives the
28 * assembly of vertices into triangles.
30 ******************************************************************************/
37 #if USE_SIMD16_FRONTEND
40 SIMD_WIDTH
= KNOB_SIMD16_WIDTH
,
41 SIMD_WIDTH_DIV2
= KNOB_SIMD16_WIDTH
/ 2,
45 typedef simd16mask SIMDMASK
;
47 typedef simd16scalar SIMDSCALAR
;
48 typedef simd16vector SIMDVECTOR
;
49 typedef simd16vertex SIMDVERTEX
;
51 typedef simd16scalari SIMDSCALARI
;
56 SIMD_WIDTH
= KNOB_SIMD_WIDTH
,
57 SIMD_WIDTH_DIV2
= KNOB_SIMD_WIDTH
/ 2,
61 typedef simdmask SIMDMASK
;
63 typedef simdscalar SIMDSCALAR
;
64 typedef simdvector SIMDVECTOR
;
65 typedef simdvertex SIMDVERTEX
;
67 typedef simdscalari SIMDSCALARI
;
// NOTE(review): this chunk appears corrupted — statements are split across lines and
// interior lines (braces, #else/#endif, the enclosing 'struct PA_STATE' header) are
// missing. Comments below annotate only what the visible code shows.
// Common state shared by every primitive-assembler variant (optimized / cut-aware / tess).
70 DRAW_CONTEXT
*pDC
{ nullptr }; // draw context
71 uint8_t* pStreamBase
{ nullptr }; // vertex stream
72 uint32_t streamSizeInVerts
{ 0 }; // total size of the input stream in verts
73 uint32_t vertexStride
{ 0 }; // stride of a vertex in simdvector units
75 // The topology the binner will use. In some cases the FE changes the topology from the api state.
76 PRIMITIVE_TOPOLOGY binTopology
{ TOP_UNKNOWN
};
78 #if ENABLE_AVX512_SIMD16
// When true, gathers/extracts appear to read the upper half of a simd16 result
// (see AssembleSingle's use below) — TODO confirm; matching #endif not visible here.
79 bool useAlternateOffset
{ false };
83 PA_STATE(DRAW_CONTEXT
*in_pDC
, uint8_t* in_pStreamBase
, uint32_t in_streamSizeInVerts
, uint32_t in_vertexStride
) :
84 pDC(in_pDC
), pStreamBase(in_pStreamBase
), streamSizeInVerts(in_streamSizeInVerts
), vertexStride(in_vertexStride
) {}
// Pure-virtual interface implemented by PA_STATE_OPT, PA_STATE_CUT and PA_TESS.
// Returns true while primitives remain to be assembled for the draw.
86 virtual bool HasWork() = 0;
// Access one attribute slot of a stored simd vertex by index.
87 virtual simdvector
& GetSimdVector(uint32_t index
, uint32_t slot
) = 0;
88 #if ENABLE_AVX512_SIMD16
89 virtual simd16vector
& GetSimdVector_simd16(uint32_t index
, uint32_t slot
) = 0;
// Assemble a SIMD batch of primitives for the given attribute slot; returns
// false when not enough primitives are available yet (per the OPT/CUT impls below).
91 virtual bool Assemble(uint32_t slot
, simdvector verts
[]) = 0;
92 #if ENABLE_AVX512_SIMD16
93 virtual bool Assemble_simd16(uint32_t slot
, simd16vector verts
[]) = 0;
// Assemble a single primitive's verts (xyzw per __m128) for one attribute slot.
95 virtual void AssembleSingle(uint32_t slot
, uint32_t primIndex
, __m128 verts
[]) = 0;
// Advance the state machine to the next primitive batch.
96 virtual bool NextPrim() = 0;
// Hand out the next SIMD vertex slot for the VS/DS to write into.
97 virtual SIMDVERTEX
& GetNextVsOutput() = 0;
98 virtual bool GetNextStreamOutput() = 0;
// Cut-index mask storage for the next SIMD vertex batch (unused by the OPT PA).
99 virtual SIMDMASK
& GetNextVsIndices() = 0;
// Number of valid primitives in the current assembled batch.
100 virtual uint32_t NumPrims() = 0;
101 virtual void Reset() = 0;
// Vector of primitive IDs for the current batch, offset by startID.
102 virtual SIMDSCALARI
GetPrimID(uint32_t startID
) = 0;
105 // The Optimized PA is a state machine that assembles triangles from vertex shader simd
106 // output. Here is the sequence
107 // 1. Execute FS/VS to generate a simd vertex (4 vertices for SSE simd and 8 for AVX simd).
108 // 2. Execute PA function to assemble and bin triangles.
109 // a. The PA function is a set of functions that collectively make up the
110 // state machine for a given topology.
111 // 1. We use a state index to track which PA function to call.
112 // b. Often the PA function needs to 2 simd vertices in order to assemble the next triangle.
113 // 1. We call this the current and previous simd vertex.
114 // 2. The SSE simd is 4-wide which is not a multiple of 3 needed for triangles. In
115 // order to assemble the second triangle, for a triangle list, we'll need the
116 // last vertex from the previous simd and the first 2 vertices from the current simd.
117 // 3. At times the PA can assemble multiple triangles from the 2 simd vertices.
119 // This optimized PA is not cut aware, so only should be used by non-indexed draws or draws without
// Optimized (non-cut-aware) primitive assembler: a per-topology function-pointer
// state machine over the VS output stream. NOTE(review): opening brace and several
// interior lines are missing from this view of the file.
121 struct PA_STATE_OPT
: public PA_STATE
123 uint32_t numPrims
{ 0 }; // Total number of primitives for draw.
124 uint32_t numPrimsComplete
{ 0 }; // Total number of complete primitives.
126 uint32_t numSimdPrims
{ 0 }; // Number of prims in current simd.
128 uint32_t cur
{ 0 }; // index to current VS output.
129 uint32_t prev
{ 0 }; // index to prev VS output. Not really needed in the state.
130 const uint32_t first
{ 0 }; // index to first VS output. Used for tri fan and line loop.
132 uint32_t counter
{ 0 }; // state counter
133 bool reset
{ false }; // reset state
135 uint32_t primIDIncr
{ 0 }; // how much to increment for each vector (typically vector / {1, 2})
// Signature of a per-topology state-machine step: assembles a SIMD batch of
// primitives for one attribute slot, returns whether a batch was produced.
138 typedef bool(*PFN_PA_FUNC
)(PA_STATE_OPT
& pa
, uint32_t slot
, simdvector verts
[]);
139 #if ENABLE_AVX512_SIMD16
140 typedef bool(*PFN_PA_FUNC_SIMD16
)(PA_STATE_OPT
& pa
, uint32_t slot
, simd16vector verts
[]);
// Single-primitive variant used by AssembleSingle.
142 typedef void(*PFN_PA_SINGLE_FUNC
)(PA_STATE_OPT
& pa
, uint32_t slot
, uint32_t primIndex
, __m128 verts
[]);
144 PFN_PA_FUNC pfnPaFunc
{ nullptr }; // PA state machine function for assembling 4 triangles.
145 #if ENABLE_AVX512_SIMD16
146 PFN_PA_FUNC_SIMD16 pfnPaFunc_simd16
{ nullptr };
148 PFN_PA_SINGLE_FUNC pfnPaSingleFunc
{ nullptr }; // PA state machine function for assembling single triangle.
149 PFN_PA_FUNC pfnPaFuncReset
{ nullptr }; // initial state to set on reset
150 #if ENABLE_AVX512_SIMD16
151 PFN_PA_FUNC_SIMD16 pfnPaFuncReset_simd16
{ nullptr };
154 // state used to advance the PA when Next is called
155 PFN_PA_FUNC pfnPaNextFunc
{ nullptr };
156 #if ENABLE_AVX512_SIMD16
157 PFN_PA_FUNC_SIMD16 pfnPaNextFunc_simd16
{ nullptr };
159 uint32_t nextNumSimdPrims
{ 0 };
160 uint32_t nextNumPrimsIncrement
{ 0 };
161 bool nextReset
{ false };
162 bool isStreaming
{ false };
164 SIMDMASK junkIndices
{ 0 }; // temporary index store for unused virtual function
// Constructor: 'topo' defaults to TOP_UNKNOWN; definition not visible in this chunk.
167 PA_STATE_OPT(DRAW_CONTEXT
* pDC
, uint32_t numPrims
, uint8_t* pStream
, uint32_t streamSizeInVerts
,
168 uint32_t vertexStride
, bool in_isStreaming
, PRIMITIVE_TOPOLOGY topo
= TOP_UNKNOWN
);
172 return (this->numPrimsComplete
< this->numPrims
) ? true : false;
175 simdvector
& GetSimdVector(uint32_t index
, uint32_t slot
)
177 SWR_ASSERT(slot
< vertexStride
);
178 uint32_t offset
= index
* vertexStride
+ slot
;
179 simdvector
& vertexSlot
= ((simdvector
*)pStreamBase
)[offset
];
183 #if ENABLE_AVX512_SIMD16
184 simd16vector
& GetSimdVector_simd16(uint32_t index
, uint32_t slot
)
186 SWR_ASSERT(slot
< vertexStride
);
187 uint32_t offset
= index
* vertexStride
+ slot
;
188 simd16vector
& vertexSlot
= ((simd16vector
*)pStreamBase
)[offset
];
193 // Assembles 4 triangles. Each simdvector is a single vertex from 4
194 // triangles (xxxx yyyy zzzz wwww) and there are 3 verts per triangle.
195 bool Assemble(uint32_t slot
, simdvector verts
[])
197 return this->pfnPaFunc(*this, slot
, verts
);
200 #if ENABLE_AVX512_SIMD16
201 bool Assemble_simd16(uint32_t slot
, simd16vector verts
[])
203 return this->pfnPaFunc_simd16(*this, slot
, verts
);
207 // Assembles 1 primitive. Each simdscalar is a vertex (xyzw).
208 void AssembleSingle(uint32_t slot
, uint32_t primIndex
, __m128 verts
[])
210 return this->pfnPaSingleFunc(*this, slot
, primIndex
, verts
);
215 this->pfnPaFunc
= this->pfnPaNextFunc
;
216 #if ENABLE_AVX512_SIMD16
217 this->pfnPaFunc_simd16
= this->pfnPaNextFunc_simd16
;
219 this->numSimdPrims
= this->nextNumSimdPrims
;
220 this->numPrimsComplete
+= this->nextNumPrimsIncrement
;
221 this->reset
= this->nextReset
;
223 if (this->isStreaming
)
228 bool morePrims
= false;
230 if (this->numSimdPrims
> 0)
233 this->numSimdPrims
--;
237 this->counter
= (this->reset
) ? 0 : (this->counter
+ 1);
243 morePrims
= false; // no more to do
// Returns the SIMD vertex slot the front end should write the next VS outputs into.
// NOTE(review): several interior lines (braces, the cur/prev update in both branches)
// are missing from this view; only the visible statements are annotated.
249 SIMDVERTEX
& GetNextVsOutput()
251 const uint32_t numSimdVerts
= streamSizeInVerts
/ SIMD_WIDTH
;
253 // increment cur and prev indices
254 if (counter
< numSimdVerts
)
256 // prev undefined for first state
262 // swap/recycle last two simd verts for prev and cur, leave other simd verts intact in the buffer
263 uint32_t temp
= prev
;
// Guard: 'cur' must index a valid SIMD vertex batch inside the stream buffer.
269 SWR_ASSERT(cur
< numSimdVerts
);
// Address of batch 'cur': pStreamBase is laid out as SIMDVECTORs with
// 'vertexStride' vectors per SIMD vertex.
270 SIMDVECTOR
* pVertex
= &((SIMDVECTOR
*)pStreamBase
)[cur
* vertexStride
];
272 return *(SIMDVERTEX
*)pVertex
;
275 SIMDMASK
& GetNextVsIndices()
277 // unused in optimized PA, pass tmp buffer back
281 bool GetNextStreamOutput()
283 this->prev
= this->cur
;
284 this->cur
= this->counter
;
291 return (this->numPrimsComplete
+ this->nextNumPrimsIncrement
> this->numPrims
) ?
292 (SIMD_WIDTH
- (this->numPrimsComplete
+ this->nextNumPrimsIncrement
- this->numPrims
)) : SIMD_WIDTH
;
295 void SetNextState(PA_STATE_OPT::PFN_PA_FUNC pfnPaNextFunc
,
296 PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc
,
297 uint32_t numSimdPrims
= 0,
298 uint32_t numPrimsIncrement
= 0,
301 this->pfnPaNextFunc
= pfnPaNextFunc
;
302 this->nextNumSimdPrims
= numSimdPrims
;
303 this->nextNumPrimsIncrement
= numPrimsIncrement
;
304 this->nextReset
= reset
;
306 this->pfnPaSingleFunc
= pfnPaNextSingleFunc
;
309 #if ENABLE_AVX512_SIMD16
310 void SetNextState_simd16(PA_STATE_OPT::PFN_PA_FUNC_SIMD16 pfnPaNextFunc_simd16
,
311 PA_STATE_OPT::PFN_PA_FUNC pfnPaNextFunc
,
312 PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc
,
313 uint32_t numSimdPrims
= 0,
314 uint32_t numPrimsIncrement
= 0,
317 this->pfnPaNextFunc_simd16
= pfnPaNextFunc_simd16
;
318 this->pfnPaNextFunc
= pfnPaNextFunc
;
319 this->nextNumSimdPrims
= numSimdPrims
;
320 this->nextNumPrimsIncrement
= numPrimsIncrement
;
321 this->nextReset
= reset
;
323 this->pfnPaSingleFunc
= pfnPaNextSingleFunc
;
329 #if ENABLE_AVX512_SIMD16
330 useAlternateOffset
= false;
333 this->pfnPaFunc
= this->pfnPaFuncReset
;
334 #if ENABLE_AVX512_SIMD16
335 this->pfnPaFunc_simd16
= this->pfnPaFuncReset_simd16
;
337 this->numPrimsComplete
= 0;
338 this->numSimdPrims
= 0;
345 SIMDSCALARI
GetPrimID(uint32_t startID
)
347 #if USE_SIMD16_FRONTEND
348 return _simd16_add_epi32(this->primID
,
349 _simd16_set1_epi32(startID
+ this->primIDIncr
* (this->numPrimsComplete
/ SIMD_WIDTH
)));
351 return _simd_add_epi32(this->primID
,
352 _simd_set1_epi32(startID
+ this->primIDIncr
* (this->numPrimsComplete
/ SIMD_WIDTH
)));
357 // helper C wrappers to avoid having to rewrite all the PA topology state functions
358 INLINE
void SetNextPaState(PA_STATE_OPT
& pa
, PA_STATE_OPT::PFN_PA_FUNC pfnPaNextFunc
,
359 PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc
,
360 uint32_t numSimdPrims
= 0,
361 uint32_t numPrimsIncrement
= 0,
364 return pa
.SetNextState(pfnPaNextFunc
, pfnPaNextSingleFunc
, numSimdPrims
, numPrimsIncrement
, reset
);
367 #if ENABLE_AVX512_SIMD16
368 INLINE
void SetNextPaState_simd16(PA_STATE_OPT
& pa
, PA_STATE_OPT::PFN_PA_FUNC_SIMD16 pfnPaNextFunc_simd16
,
369 PA_STATE_OPT::PFN_PA_FUNC pfnPaNextFunc
,
370 PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc
,
371 uint32_t numSimdPrims
= 0,
372 uint32_t numPrimsIncrement
= 0,
375 return pa
.SetNextState_simd16(pfnPaNextFunc_simd16
, pfnPaNextFunc
, pfnPaNextSingleFunc
, numSimdPrims
, numPrimsIncrement
, reset
);
379 INLINE simdvector
& PaGetSimdVector(PA_STATE
& pa
, uint32_t index
, uint32_t slot
)
381 return pa
.GetSimdVector(index
, slot
);
384 #if ENABLE_AVX512_SIMD16
385 INLINE simd16vector
& PaGetSimdVector_simd16(PA_STATE
& pa
, uint32_t index
, uint32_t slot
)
387 return pa
.GetSimdVector_simd16(index
, slot
);
391 // Cut-aware primitive assembler.
392 struct PA_STATE_CUT
: public PA_STATE
394 SIMDMASK
* pCutIndices
{ nullptr }; // cut indices buffer, 1 bit per vertex
395 uint32_t numVerts
{ 0 }; // number of vertices available in buffer store
396 uint32_t numAttribs
{ 0 }; // number of attributes
397 int32_t numRemainingVerts
{ 0 }; // number of verts remaining to be assembled
398 uint32_t numVertsToAssemble
{ 0 }; // total number of verts to assemble for the draw
399 #if ENABLE_AVX512_SIMD16
400 OSALIGNSIMD16(uint32_t) indices
[MAX_NUM_VERTS_PER_PRIM
][SIMD_WIDTH
]; // current index buffer for gather
402 OSALIGNSIMD(uint32_t) indices
[MAX_NUM_VERTS_PER_PRIM
][SIMD_WIDTH
]; // current index buffer for gather
404 SIMDSCALARI vOffsets
[MAX_NUM_VERTS_PER_PRIM
]; // byte offsets for currently assembling simd
405 uint32_t numPrimsAssembled
{ 0 }; // number of primitives that are fully assembled
406 uint32_t headVertex
{ 0 }; // current unused vertex slot in vertex buffer store
407 uint32_t tailVertex
{ 0 }; // beginning vertex currently assembling
408 uint32_t curVertex
{ 0 }; // current unprocessed vertex
409 uint32_t startPrimId
{ 0 }; // starting prim id
410 SIMDSCALARI vPrimId
; // vector of prim ID
411 bool needOffsets
{ false }; // need to compute gather offsets for current SIMD
412 uint32_t vertsPerPrim
{ 0 };
413 bool processCutVerts
{ false }; // vertex indices with cuts should be processed as normal, otherwise they
414 // are ignored. Fetch shader sends invalid verts on cuts that should be ignored
415 // while the GS sends valid verts for every index
417 simdvector junkVector
; // junk simdvector for unimplemented API
418 #if ENABLE_AVX512_SIMD16
419 simd16vector junkVector_simd16
; // junk simd16vector for unimplemented API
422 // Topology state tracking
423 uint32_t vert
[MAX_NUM_VERTS_PER_PRIM
];
424 uint32_t curIndex
{ 0 };
425 bool reverseWinding
{ false }; // indicates reverse winding for strips
426 int32_t adjExtraVert
{ 0 }; // extra vert uses for tristrip w/ adj
428 typedef void(PA_STATE_CUT::* PFN_PA_FUNC
)(uint32_t vert
, bool finish
);
429 PFN_PA_FUNC pfnPa
{ nullptr }; // per-topology function that processes a single vert
432 PA_STATE_CUT(DRAW_CONTEXT
* pDC
, uint8_t* in_pStream
, uint32_t in_streamSizeInVerts
, uint32_t in_vertexStride
, SIMDMASK
* in_pIndices
, uint32_t in_numVerts
,
433 uint32_t in_numAttribs
, PRIMITIVE_TOPOLOGY topo
, bool in_processCutVerts
)
434 : PA_STATE(pDC
, in_pStream
, in_streamSizeInVerts
, in_vertexStride
)
436 numVerts
= in_streamSizeInVerts
;
437 numAttribs
= in_numAttribs
;
440 processCutVerts
= in_processCutVerts
;
442 numVertsToAssemble
= numRemainingVerts
= in_numVerts
;
443 numPrimsAssembled
= 0;
444 headVertex
= tailVertex
= curVertex
= 0;
447 pCutIndices
= in_pIndices
;
448 memset(indices
, 0, sizeof(indices
));
449 #if USE_SIMD16_FRONTEND
450 vPrimId
= _simd16_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
452 vPrimId
= _simd_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
454 reverseWinding
= false;
457 bool gsEnabled
= pDC
->pState
->state
.gsState
.gsEnable
;
458 vertsPerPrim
= NumVertsPerPrim(topo
, gsEnabled
);
462 case TOP_TRIANGLE_LIST
: pfnPa
= &PA_STATE_CUT::ProcessVertTriList
; break;
463 case TOP_TRI_LIST_ADJ
: pfnPa
= gsEnabled
? &PA_STATE_CUT::ProcessVertTriListAdj
: &PA_STATE_CUT::ProcessVertTriListAdjNoGs
; break;
464 case TOP_TRIANGLE_STRIP
: pfnPa
= &PA_STATE_CUT::ProcessVertTriStrip
; break;
465 case TOP_TRI_STRIP_ADJ
: if (gsEnabled
)
467 pfnPa
= &PA_STATE_CUT::ProcessVertTriStripAdj
< true > ;
471 pfnPa
= &PA_STATE_CUT::ProcessVertTriStripAdj
< false > ;
475 case TOP_POINT_LIST
: pfnPa
= &PA_STATE_CUT::ProcessVertPointList
; break;
476 case TOP_LINE_LIST
: pfnPa
= &PA_STATE_CUT::ProcessVertLineList
; break;
477 case TOP_LINE_LIST_ADJ
: pfnPa
= gsEnabled
? &PA_STATE_CUT::ProcessVertLineListAdj
: &PA_STATE_CUT::ProcessVertLineListAdjNoGs
; break;
478 case TOP_LINE_STRIP
: pfnPa
= &PA_STATE_CUT::ProcessVertLineStrip
; break;
479 case TOP_LISTSTRIP_ADJ
: pfnPa
= gsEnabled
? &PA_STATE_CUT::ProcessVertLineStripAdj
: &PA_STATE_CUT::ProcessVertLineStripAdjNoGs
; break;
480 default: assert(0 && "Unimplemented topology");
484 SIMDVERTEX
& GetNextVsOutput()
486 uint32_t vertexIndex
= this->headVertex
/ SIMD_WIDTH
;
487 this->headVertex
= (this->headVertex
+ SIMD_WIDTH
) % this->numVerts
;
488 this->needOffsets
= true;
489 SIMDVECTOR
* pVertex
= &((SIMDVECTOR
*)pStreamBase
)[vertexIndex
* vertexStride
];
491 return *(SIMDVERTEX
*)pVertex
;
494 SIMDMASK
& GetNextVsIndices()
496 uint32_t vertexIndex
= this->headVertex
/ SIMD_WIDTH
;
497 SIMDMASK
* pCurCutIndex
= this->pCutIndices
+ vertexIndex
;
498 return *pCurCutIndex
;
501 simdvector
& GetSimdVector(uint32_t index
, uint32_t slot
)
504 SWR_ASSERT(0 && "Not implemented");
508 #if ENABLE_AVX512_SIMD16
509 simd16vector
& GetSimdVector_simd16(uint32_t index
, uint32_t slot
)
512 SWR_ASSERT(0 && "Not implemented");
513 return junkVector_simd16
;
517 bool GetNextStreamOutput()
519 this->headVertex
+= SIMD_WIDTH
;
520 this->needOffsets
= true;
524 SIMDSCALARI
GetPrimID(uint32_t startID
)
526 #if USE_SIMD16_FRONTEND
527 return _simd16_add_epi32(_simd16_set1_epi32(startID
), this->vPrimId
);
529 return _simd_add_epi32(_simd_set1_epi32(startID
), this->vPrimId
);
535 #if ENABLE_AVX512_SIMD16
536 useAlternateOffset
= false;
539 this->numRemainingVerts
= this->numVertsToAssemble
;
540 this->numPrimsAssembled
= 0;
543 this->tailVertex
= 0;
544 this->headVertex
= 0;
545 this->reverseWinding
= false;
546 this->adjExtraVert
= -1;
547 #if USE_SIMD16_FRONTEND
548 this->vPrimId
= _simd16_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
550 this->vPrimId
= _simd_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
556 return this->numRemainingVerts
> 0 || this->adjExtraVert
!= -1;
// True when advancing headVertex by one SIMD batch would collide with tailVertex —
// the vertex store is treated as a ring buffer modulo numVerts.
559 bool IsVertexStoreFull()
561 return ((this->headVertex
+ SIMD_WIDTH
) % this->numVerts
) == this->tailVertex
;
// Reset per-strip tracking after a cut index restarts the topology.
564 void RestartTopology()
567 this->reverseWinding
= false;
568 this->adjExtraVert
= -1;
// Tests the cut bit for 'vertex': pCutIndices holds one bit per vertex, packed
// SIMD_WIDTH bits per SIMDMASK entry. The '& (SIMD_WIDTH - 1)' lane mask relies on
// SIMD_WIDTH being a power of two.
571 bool IsCutIndex(uint32_t vertex
)
573 uint32_t vertexIndex
= vertex
/ SIMD_WIDTH
;
574 uint32_t vertexOffset
= vertex
& (SIMD_WIDTH
- 1);
575 return _bittest((const LONG
*)&this->pCutIndices
[vertexIndex
], vertexOffset
) == 1;
578 // iterates across the unprocessed verts until we hit the end or we
579 // have assembled SIMD prims
582 while (this->numPrimsAssembled
!= SIMD_WIDTH
&&
583 this->numRemainingVerts
> 0 &&
584 this->curVertex
!= this->headVertex
)
586 // if cut index, restart topology
587 if (IsCutIndex(this->curVertex
))
589 if (this->processCutVerts
)
591 (this->*pfnPa
)(this->curVertex
, false);
593 // finish off tri strip w/ adj before restarting topo
594 if (this->adjExtraVert
!= -1)
596 (this->*pfnPa
)(this->curVertex
, true);
602 (this->*pfnPa
)(this->curVertex
, false);
606 if (this->curVertex
>= this->numVerts
) {
609 this->numRemainingVerts
--;
612 // special case last primitive for tri strip w/ adj
613 if (this->numPrimsAssembled
!= SIMD_WIDTH
&& this->numRemainingVerts
== 0 && this->adjExtraVert
!= -1)
615 (this->*pfnPa
)(this->curVertex
, true);
621 // done with current batch
622 // advance tail to the current unsubmitted vertex
623 this->tailVertex
= this->curVertex
;
624 this->numPrimsAssembled
= 0;
625 #if USE_SIMD16_FRONTEND
626 this->vPrimId
= _simd16_add_epi32(vPrimId
, _simd16_set1_epi32(SIMD_WIDTH
));
628 this->vPrimId
= _simd_add_epi32(vPrimId
, _simd_set1_epi32(SIMD_WIDTH
));
634 // if we've assembled enough prims, we can advance to the next set of verts
635 if (this->numPrimsAssembled
== SIMD_WIDTH
|| this->numRemainingVerts
<= 0)
// Converts the gathered vertex indices into per-lane byte offsets into pStreamBase:
//   offset = (index / SIMD_WIDTH) * vertexStride * sizeof(SIMDVECTOR)   // batch base
//          + (index % SIMD_WIDTH) * sizeof(float)                       // lane within batch
// NOTE(review): #else/#endif lines for the USE_SIMD16_FRONTEND branches and the loop
// braces are missing from this view.
642 void ComputeOffsets()
644 for (uint32_t v
= 0; v
< this->vertsPerPrim
; ++v
)
646 uint32_t vertexStrideBytes
= vertexStride
* sizeof(SIMDVECTOR
);
647 SIMDSCALARI vIndices
= *(SIMDSCALARI
*)&this->indices
[v
][0];
649 // step to simdvertex batch
// Arithmetic shift by SIMD_WIDTH_LOG2 == divide index by SIMD_WIDTH.
650 const uint32_t simdShift
= SIMD_WIDTH_LOG2
;
651 #if USE_SIMD16_FRONTEND
652 SIMDSCALARI vVertexBatch
= _simd16_srai_epi32(vIndices
, simdShift
);
653 this->vOffsets
[v
] = _simd16_mullo_epi32(vVertexBatch
, _simd16_set1_epi32(vertexStrideBytes
));
655 SIMDSCALARI vVertexBatch
= _simd_srai_epi32(vIndices
, simdShift
);
656 this->vOffsets
[v
] = _simd_mullo_epi32(vVertexBatch
, _simd_set1_epi32(vertexStrideBytes
));
// Lane within the batch: mask assumes SIMD_WIDTH is a power of two.
660 const uint32_t simdMask
= SIMD_WIDTH
- 1;
661 #if USE_SIMD16_FRONTEND
662 SIMDSCALARI vVertexIndex
= _simd16_and_si(vIndices
, _simd16_set1_epi32(simdMask
));
663 this->vOffsets
[v
] = _simd16_add_epi32(this->vOffsets
[v
], _simd16_mullo_epi32(vVertexIndex
, _simd16_set1_epi32(sizeof(float))));
665 SIMDSCALARI vVertexIndex
= _simd_and_si(vIndices
, _simd_set1_epi32(simdMask
));
666 this->vOffsets
[v
] = _simd_add_epi32(this->vOffsets
[v
], _simd_mullo_epi32(vVertexIndex
, _simd_set1_epi32(sizeof(float))));
671 // disabling buffer overrun warning for this function for what appears to be a bug in MSVC 2017
672 PRAGMA_WARNING_PUSH_DISABLE(4789)
673 bool Assemble(uint32_t slot
, simdvector
*verts
)
675 // process any outstanding verts
678 // return false if we don't have enough prims assembled
679 if (this->numPrimsAssembled
!= SIMD_WIDTH
&& this->numRemainingVerts
> 0)
684 // cache off gather offsets given the current SIMD set of indices the first time we get an assemble
685 if (this->needOffsets
)
688 this->needOffsets
= false;
691 for (uint32_t v
= 0; v
< this->vertsPerPrim
; ++v
)
693 SIMDSCALARI offsets
= this->vOffsets
[v
];
696 #if USE_SIMD16_FRONTEND
697 offsets
= _simd16_add_epi32(offsets
, _simd16_set1_epi32(slot
* sizeof(SIMDVECTOR
)));
699 offsets
= _simd_add_epi32(offsets
, _simd_set1_epi32(slot
* sizeof(SIMDVECTOR
)));
702 float* pBase
= (float*)this->pStreamBase
;
703 for (uint32_t c
= 0; c
< 4; ++c
)
705 #if USE_SIMD16_FRONTEND
706 simd16scalar temp
= _simd16_i32gather_ps(pBase
, offsets
, 1);
708 verts
[v
].v
[c
] = useAlternateOffset
? _simd16_extract_ps(temp
, 1) : _simd16_extract_ps(temp
, 0);
710 verts
[v
].v
[c
] = _simd_i32gather_ps(pBase
, offsets
, 1);
713 // move base to next component
722 #if ENABLE_AVX512_SIMD16
723 bool Assemble_simd16(uint32_t slot
, simd16vector verts
[])
725 // process any outstanding verts
728 // return false if we don't have enough prims assembled
729 if (this->numPrimsAssembled
!= SIMD_WIDTH
&& this->numRemainingVerts
> 0)
734 // cache off gather offsets given the current SIMD set of indices the first time we get an assemble
735 if (this->needOffsets
)
738 this->needOffsets
= false;
741 for (uint32_t v
= 0; v
< this->vertsPerPrim
; ++v
)
743 SIMDSCALARI offsets
= this->vOffsets
[v
];
746 #if USE_SIMD16_FRONTEND
747 offsets
= _simd16_add_epi32(offsets
, _simd16_set1_epi32(slot
* sizeof(SIMDVECTOR
)));
749 offsets
= _simd_add_epi32(offsets
, _simd_set1_epi32(slot
* sizeof(simdvector
)));
752 float* pBase
= (float*)this->pStreamBase
;
753 for (uint32_t c
= 0; c
< 4; ++c
)
755 #if USE_SIMD16_FRONTEND
756 verts
[v
].v
[c
] = _simd16_i32gather_ps(pBase
, offsets
, 1);
758 verts
[v
].v
[c
] = _simd16_insert_ps(_simd16_setzero_ps(), _simd_i32gather_ps(pBase
, offsets
, 1), 0);
761 // move base to next component
770 void AssembleSingle(uint32_t slot
, uint32_t triIndex
, __m128 tri
[3])
773 for (uint32_t v
= 0; v
< this->vertsPerPrim
; ++v
)
775 uint32_t* pOffset
= (uint32_t*)&this->vOffsets
[v
];
776 #if USE_SIMD16_FRONTEND
777 uint32_t offset
= useAlternateOffset
? pOffset
[triIndex
+ SIMD_WIDTH_DIV2
] : pOffset
[triIndex
];
779 uint32_t offset
= pOffset
[triIndex
];
781 offset
+= sizeof(SIMDVECTOR
) * slot
;
782 float* pVert
= (float*)&tri
[v
];
783 for (uint32_t c
= 0; c
< 4; ++c
)
785 float* pComponent
= (float*)(this->pStreamBase
+ offset
);
786 pVert
[c
] = *pComponent
;
787 offset
+= SIMD_WIDTH
* sizeof(float);
794 return this->numPrimsAssembled
;
797 // Per-topology functions
// Triangle-strip step: buffers verts until 3 are accumulated, then emits one
// triangle into the gather index arrays. Alternating strip triangles have opposite
// winding, so the reverseWinding flag swaps verts 1 and 2 on odd triangles.
// NOTE(review): the if/else braces and the curIndex update lines are missing from
// this view of the file.
798 void ProcessVertTriStrip(uint32_t index
, bool finish
)
800 this->vert
[this->curIndex
] = index
;
802 if (this->curIndex
== 3)
804 // assembled enough verts for prim, add to gather indices
805 this->indices
[0][this->numPrimsAssembled
] = this->vert
[0];
// reverse-winding path: emit verts in 0,2,1 order
808 this->indices
[1][this->numPrimsAssembled
] = this->vert
[2];
809 this->indices
[2][this->numPrimsAssembled
] = this->vert
[1];
// normal-winding path: emit verts in 0,1,2 order
813 this->indices
[1][this->numPrimsAssembled
] = this->vert
[1];
814 this->indices
[2][this->numPrimsAssembled
] = this->vert
[2];
817 // increment numPrimsAssembled
818 this->numPrimsAssembled
++;
820 // set up next prim state
// slide the strip window: last two verts become the first two of the next tri
821 this->vert
[0] = this->vert
[1];
822 this->vert
[1] = this->vert
[2];
// flip winding for the next strip triangle
824 this->reverseWinding
^= 1;
828 template<bool gsEnabled
>
829 void AssembleTriStripAdj()
833 this->vert
[1] = this->vert
[2];
834 this->vert
[2] = this->vert
[4];
836 this->indices
[0][this->numPrimsAssembled
] = this->vert
[0];
837 this->indices
[1][this->numPrimsAssembled
] = this->vert
[1];
838 this->indices
[2][this->numPrimsAssembled
] = this->vert
[2];
840 this->vert
[4] = this->vert
[2];
841 this->vert
[2] = this->vert
[1];
845 this->indices
[0][this->numPrimsAssembled
] = this->vert
[0];
846 this->indices
[1][this->numPrimsAssembled
] = this->vert
[1];
847 this->indices
[2][this->numPrimsAssembled
] = this->vert
[2];
848 this->indices
[3][this->numPrimsAssembled
] = this->vert
[3];
849 this->indices
[4][this->numPrimsAssembled
] = this->vert
[4];
850 this->indices
[5][this->numPrimsAssembled
] = this->vert
[5];
852 this->numPrimsAssembled
++;
856 template<bool gsEnabled
>
857 void ProcessVertTriStripAdj(uint32_t index
, bool finish
)
859 // handle last primitive of tristrip
860 if (finish
&& this->adjExtraVert
!= -1)
862 this->vert
[3] = this->adjExtraVert
;
863 AssembleTriStripAdj
<gsEnabled
>();
864 this->adjExtraVert
= -1;
868 switch (this->curIndex
)
874 this->vert
[this->curIndex
] = index
;
878 this->vert
[5] = index
;
882 if (this->adjExtraVert
== -1)
884 this->adjExtraVert
= index
;
888 this->vert
[3] = index
;
891 AssembleTriStripAdj
<gsEnabled
>();
894 if (this->reverseWinding
)
896 nextTri
[0] = this->vert
[4];
897 nextTri
[1] = this->vert
[0];
898 nextTri
[2] = this->vert
[2];
899 nextTri
[4] = this->vert
[3];
900 nextTri
[5] = this->adjExtraVert
;
904 nextTri
[0] = this->vert
[2];
905 nextTri
[1] = this->adjExtraVert
;
906 nextTri
[2] = this->vert
[3];
907 nextTri
[4] = this->vert
[4];
908 nextTri
[5] = this->vert
[0];
910 for (uint32_t i
= 0; i
< 6; ++i
)
912 this->vert
[i
] = nextTri
[i
];
915 this->adjExtraVert
= -1;
916 this->reverseWinding
^= 1;
// NOTE(review): typo in the assert message — "Algorith" should be "Algorithm".
// (Left unchanged here; a doc-only edit must not alter runtime strings.)
925 SWR_ASSERT(this->adjExtraVert
!= -1, "Algorith failure!");
926 AssembleTriStripAdj
<gsEnabled
>();
929 if (this->reverseWinding
)
931 nextTri
[0] = this->vert
[4];
932 nextTri
[1] = this->vert
[0];
933 nextTri
[2] = this->vert
[2];
934 nextTri
[4] = this->vert
[3];
935 nextTri
[5] = this->adjExtraVert
;
939 nextTri
[0] = this->vert
[2];
940 nextTri
[1] = this->adjExtraVert
;
941 nextTri
[2] = this->vert
[3];
942 nextTri
[4] = this->vert
[4];
943 nextTri
[5] = this->vert
[0];
945 for (uint32_t i
= 0; i
< 6; ++i
)
947 this->vert
[i
] = nextTri
[i
];
949 this->reverseWinding
^= 1;
950 this->adjExtraVert
= index
;
956 void ProcessVertTriList(uint32_t index
, bool finish
)
958 this->vert
[this->curIndex
] = index
;
960 if (this->curIndex
== 3)
962 // assembled enough verts for prim, add to gather indices
963 this->indices
[0][this->numPrimsAssembled
] = this->vert
[0];
964 this->indices
[1][this->numPrimsAssembled
] = this->vert
[1];
965 this->indices
[2][this->numPrimsAssembled
] = this->vert
[2];
967 // increment numPrimsAssembled
968 this->numPrimsAssembled
++;
970 // set up next prim state
975 void ProcessVertTriListAdj(uint32_t index
, bool finish
)
977 this->vert
[this->curIndex
] = index
;
979 if (this->curIndex
== 6)
981 // assembled enough verts for prim, add to gather indices
982 this->indices
[0][this->numPrimsAssembled
] = this->vert
[0];
983 this->indices
[1][this->numPrimsAssembled
] = this->vert
[1];
984 this->indices
[2][this->numPrimsAssembled
] = this->vert
[2];
985 this->indices
[3][this->numPrimsAssembled
] = this->vert
[3];
986 this->indices
[4][this->numPrimsAssembled
] = this->vert
[4];
987 this->indices
[5][this->numPrimsAssembled
] = this->vert
[5];
989 // increment numPrimsAssembled
990 this->numPrimsAssembled
++;
992 // set up next prim state
997 void ProcessVertTriListAdjNoGs(uint32_t index
, bool finish
)
999 this->vert
[this->curIndex
] = index
;
1001 if (this->curIndex
== 6)
1003 // assembled enough verts for prim, add to gather indices
1004 this->indices
[0][this->numPrimsAssembled
] = this->vert
[0];
1005 this->indices
[1][this->numPrimsAssembled
] = this->vert
[2];
1006 this->indices
[2][this->numPrimsAssembled
] = this->vert
[4];
1008 // increment numPrimsAssembled
1009 this->numPrimsAssembled
++;
1011 // set up next prim state
1017 void ProcessVertLineList(uint32_t index
, bool finish
)
1019 this->vert
[this->curIndex
] = index
;
1021 if (this->curIndex
== 2)
1023 this->indices
[0][this->numPrimsAssembled
] = this->vert
[0];
1024 this->indices
[1][this->numPrimsAssembled
] = this->vert
[1];
1026 this->numPrimsAssembled
++;
1031 void ProcessVertLineStrip(uint32_t index
, bool finish
)
1033 this->vert
[this->curIndex
] = index
;
1035 if (this->curIndex
== 2)
1037 // assembled enough verts for prim, add to gather indices
1038 this->indices
[0][this->numPrimsAssembled
] = this->vert
[0];
1039 this->indices
[1][this->numPrimsAssembled
] = this->vert
[1];
1041 // increment numPrimsAssembled
1042 this->numPrimsAssembled
++;
1044 // set up next prim state
1045 this->vert
[0] = this->vert
[1];
1050 void ProcessVertLineStripAdj(uint32_t index
, bool finish
)
1052 this->vert
[this->curIndex
] = index
;
1054 if (this->curIndex
== 4)
1056 // assembled enough verts for prim, add to gather indices
1057 this->indices
[0][this->numPrimsAssembled
] = this->vert
[0];
1058 this->indices
[1][this->numPrimsAssembled
] = this->vert
[1];
1059 this->indices
[2][this->numPrimsAssembled
] = this->vert
[2];
1060 this->indices
[3][this->numPrimsAssembled
] = this->vert
[3];
1062 // increment numPrimsAssembled
1063 this->numPrimsAssembled
++;
1065 // set up next prim state
1066 this->vert
[0] = this->vert
[1];
1067 this->vert
[1] = this->vert
[2];
1068 this->vert
[2] = this->vert
[3];
1073 void ProcessVertLineStripAdjNoGs(uint32_t index
, bool finish
)
1075 this->vert
[this->curIndex
] = index
;
1077 if (this->curIndex
== 4)
1079 // assembled enough verts for prim, add to gather indices
1080 this->indices
[0][this->numPrimsAssembled
] = this->vert
[1];
1081 this->indices
[1][this->numPrimsAssembled
] = this->vert
[2];
1083 // increment numPrimsAssembled
1084 this->numPrimsAssembled
++;
1086 // set up next prim state
1087 this->vert
[0] = this->vert
[1];
1088 this->vert
[1] = this->vert
[2];
1089 this->vert
[2] = this->vert
[3];
1094 void ProcessVertLineListAdj(uint32_t index
, bool finish
)
1096 this->vert
[this->curIndex
] = index
;
1098 if (this->curIndex
== 4)
1100 this->indices
[0][this->numPrimsAssembled
] = this->vert
[0];
1101 this->indices
[1][this->numPrimsAssembled
] = this->vert
[1];
1102 this->indices
[2][this->numPrimsAssembled
] = this->vert
[2];
1103 this->indices
[3][this->numPrimsAssembled
] = this->vert
[3];
1105 this->numPrimsAssembled
++;
1110 void ProcessVertLineListAdjNoGs(uint32_t index
, bool finish
)
1112 this->vert
[this->curIndex
] = index
;
1114 if (this->curIndex
== 4)
1116 this->indices
[0][this->numPrimsAssembled
] = this->vert
[1];
1117 this->indices
[1][this->numPrimsAssembled
] = this->vert
[2];
1119 this->numPrimsAssembled
++;
1124 void ProcessVertPointList(uint32_t index
, bool finish
)
1126 this->vert
[this->curIndex
] = index
;
1128 if (this->curIndex
== 1)
1130 this->indices
[0][this->numPrimsAssembled
] = this->vert
[0];
1131 this->numPrimsAssembled
++;
// Primitive Assembly for data output from the DomainShader.
// PA_TESS reads post-tessellation vertex data laid out structure-of-arrays
// (one SIMD vector per attribute channel) and gathers it into primitives via
// three per-vertex index streams produced by the tessellator.
struct PA_TESS : PA_STATE
    // Constructor: records pointers to the externally owned vertex data and
    // index streams (no copies are made), zeroes the SIMD primitive-id
    // register, and derives verts-per-prim from the binning topology.
    // NOTE(review): the opening "PA_TESS(" token, the #else/#endif markers,
    // the 2-vertex topology case labels, the break statements and the
    // default: label appear to have been lost in extraction — confirm against
    // the upstream file before editing.
    DRAW_CONTEXT *in_pDC,                  // draw context, forwarded to PA_STATE
    const SIMDSCALAR* in_pVertData,        // SoA post-tessellation vertex data (not owned)
    uint32_t in_attributeStrideInVectors,  // SIMD vectors between attribute channels
    uint32_t in_vertexStride,              // forwarded to PA_STATE
    uint32_t in_numAttributes,             // number of valid attribute slots
    uint32_t* (&in_ppIndices)[3],          // three per-vertex index streams
    uint32_t in_numPrims,                  // total primitives to assemble
    PRIMITIVE_TOPOLOGY in_binTopology) :   // topology used for binning
    PA_STATE(in_pDC, nullptr, 0, in_vertexStride),
    m_pVertexData(in_pVertData),
    m_attributeStrideInVectors(in_attributeStrideInVectors),
    m_numAttributes(in_numAttributes),
    m_numPrims(in_numPrims)
#if USE_SIMD16_FRONTEND
    m_vPrimId = _simd16_setzero_si();
    m_vPrimId = _simd_setzero_si();        // non-SIMD16 path (its #else marker is missing in this view)
    binTopology = in_binTopology;
    m_ppIndices[0] = in_ppIndices[0];
    m_ppIndices[1] = in_ppIndices[1];
    m_ppIndices[2] = in_ppIndices[2];

    // vertices consumed per assembled primitive, selected by topology
    switch (binTopology)
    case TOP_POINT_LIST:
        m_numVertsPerPrim = 1;
        m_numVertsPerPrim = 2;             // presumably the line-topology cases — their labels are missing here
    case TOP_TRIANGLE_LIST:
        m_numVertsPerPrim = 3;
        SWR_INVALID("Invalid binTopology (%d) for %s", binTopology, __FUNCTION__);
    // Body fragment: reports whether any primitives remain to be assembled.
    // NOTE(review): the enclosing signature was lost in extraction —
    // presumably "bool HasWork()"; confirm against the upstream file.
    return m_numPrims != 0;
    // Whole-simdvector access is not meaningful for tessellated data —
    // PA_TESS gathers attributes on demand in Assemble() — so this virtual
    // is an always-invalid stub.
    // NOTE(review): the trailing "return junkVector;" appears to have been
    // dropped by extraction — confirm against the upstream file.
    simdvector& GetSimdVector(uint32_t index, uint32_t slot)
        SWR_INVALID("%s NOT IMPLEMENTED", __FUNCTION__);
#if ENABLE_AVX512_SIMD16
    // SIMD16 variant of the unimplemented accessor stub; returns the junk
    // member so callers still receive a stable (if meaningless) reference.
    simd16vector& GetSimdVector_simd16(uint32_t index, uint32_t slot)
        SWR_INVALID("%s NOT IMPLEMENTED", __FUNCTION__);
        return junkVector_simd16;
    // Builds a per-lane gather mask whose first numPrims lanes are enabled
    // (-1) and the rest disabled (0).  The static table holds SIMD_WIDTH
    // lanes of -1 followed by SIMD_WIDTH lanes of 0, so an unaligned load at
    // offset (SIMD_WIDTH - numPrims) yields exactly numPrims leading -1s.
    // @param numPrims number of active lanes; must not exceed SIMD_WIDTH
    static SIMDSCALARI GenPrimMask(uint32_t numPrims)
        SWR_ASSERT(numPrims <= SIMD_WIDTH);
#if USE_SIMD16_FRONTEND
        // 16-wide table: 16 enabled lanes followed by 16 disabled lanes
        static const OSALIGNLINE(int32_t) maskGen[SIMD_WIDTH * 2] =
            -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
             0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
        return _simd16_loadu_si((const SIMDSCALARI*)&maskGen[SIMD_WIDTH - numPrims]);
        // non-SIMD16 path: 8-wide table (the #else marker is missing in this view)
        static const OSALIGNLINE(int32_t) maskGen[SIMD_WIDTH * 2] =
            -1, -1, -1, -1, -1, -1, -1, -1,
             0,  0,  0,  0,  0,  0,  0,  0
        return _simd_loadu_si((const SIMDSCALARI*)&maskGen[SIMD_WIDTH - numPrims]);
    // Gathers one attribute slot for every vertex of up to SIMD_WIDTH
    // primitives into AoS form: verts[i].v[c] receives channel c of vertex i
    // across all active lanes.  Inactive lanes are masked via GenPrimMask so
    // the gather never dereferences indices beyond the valid primitives.
    // @param slot  attribute slot to fetch; must be < m_numAttributes
    // @param verts output, one simdvector per vertex of the primitive
    // @return presumably whether anything was assembled — the return
    //         statements were lost in extraction, confirm upstream.
    // NOTE(review): several gather-call arguments (base pointer, indices,
    // mask), the early-out body, braces and #else/#endif markers appear to
    // have been dropped from this view.
    bool Assemble(uint32_t slot, simdvector verts[])
        SWR_ASSERT(slot < m_numAttributes);

        uint32_t numPrimsToAssemble = PA_TESS::NumPrims();
        if (0 == numPrimsToAssemble)

        // mask off lanes beyond the primitives actually being assembled
        SIMDSCALARI mask = GenPrimMask(numPrimsToAssemble);

        // base of this slot's SoA data: 4 channels per attribute slot
        const float* pBaseAttrib = (const float*)&m_pVertexData[slot * m_attributeStrideInVectors * 4];
        for (uint32_t i = 0; i < m_numVertsPerPrim; ++i)
#if USE_SIMD16_FRONTEND
            SIMDSCALARI indices = _simd16_load_si((const SIMDSCALARI*)m_ppIndices[i]);
            SIMDSCALARI indices = _simd_load_si((const SIMDSCALARI*)m_ppIndices[i]);   // non-SIMD16 path

            const float* pBase = pBaseAttrib;
            for (uint32_t c = 0; c < 4; ++c)
#if USE_SIMD16_FRONTEND
                simd16scalar temp = _simd16_mask_i32gather_ps(
                    _simd16_setzero_ps(),
                    4 /* gcc doesn't like sizeof(float) */);

                // select upper or lower 8-wide half of the 16-wide gather
                verts[i].v[c] = useAlternateOffset ? _simd16_extract_ps(temp, 1) : _simd16_extract_ps(temp, 0);
                verts[i].v[c] = _simd_mask_i32gather_ps(
                    _simd_castsi_ps(mask),
                    4 /* gcc doesn't like sizeof(float) */);

                // advance to this attribute's next channel
                pBase += m_attributeStrideInVectors * SIMD_WIDTH;
#if ENABLE_AVX512_SIMD16
    // SIMD16-wide variant of Assemble(): same masked-gather scheme, but the
    // outputs are simd16vector.  On the non-SIMD16 path the 8-wide gather
    // result is widened into the low half of the 16-wide output.
    // @param slot  attribute slot to fetch; must be < m_numAttributes
    // @param verts output, one simd16vector per vertex of the primitive
    // @return presumably whether anything was assembled — the return
    //         statements were lost in extraction, confirm upstream.
    // NOTE(review): several gather-call arguments, the early-out body, braces
    // and #else/#endif markers appear dropped from this view.
    bool Assemble_simd16(uint32_t slot, simd16vector verts[])
        SWR_ASSERT(slot < m_numAttributes);

        uint32_t numPrimsToAssemble = PA_TESS::NumPrims();
        if (0 == numPrimsToAssemble)

        // mask off lanes beyond the primitives actually being assembled
        SIMDSCALARI mask = GenPrimMask(numPrimsToAssemble);

        // base of this slot's SoA data: 4 channels per attribute slot
        const float* pBaseAttrib = (const float*)&m_pVertexData[slot * m_attributeStrideInVectors * 4];
        for (uint32_t i = 0; i < m_numVertsPerPrim; ++i)
#if USE_SIMD16_FRONTEND
            SIMDSCALARI indices = _simd16_load_si((const SIMDSCALARI*)m_ppIndices[i]);
            SIMDSCALARI indices = _simd_load_si((const SIMDSCALARI*)m_ppIndices[i]);   // non-SIMD16 path

            const float* pBase = pBaseAttrib;
            for (uint32_t c = 0; c < 4; ++c)
#if USE_SIMD16_FRONTEND
                verts[i].v[c] = _simd16_mask_i32gather_ps(
                    _simd16_setzero_ps(),
                    4 /* gcc doesn't like sizeof(float) */);
                simdscalar temp = _simd_mask_i32gather_ps(
                    _simd_castsi_ps(mask),
                    4 /* gcc doesn't like sizeof(float) */);

                // widen the 8-wide gather into the low half of the output
                verts[i].v[c] = _simd16_insert_ps(_simd16_setzero_ps(), temp, 0);

                // advance to this attribute's next channel
                pBase += m_attributeStrideInVectors * SIMD_WIDTH;
    // Scalar (single-primitive) counterpart of Assemble(): fetches all four
    // channels of one attribute slot for each vertex of primitive primIndex.
    // @param slot      attribute slot to fetch; must be < m_numAttributes
    // @param primIndex primitive within the current batch; must be < NumPrims()
    // @param verts     output, one __m128 (4 floats) per vertex
    // NOTE(review): braces and #else/#endif markers appear to have been
    // dropped from this view — verify against the upstream file.
    void AssembleSingle(uint32_t slot, uint32_t primIndex, __m128 verts[])
        SWR_ASSERT(slot < m_numAttributes);
        SWR_ASSERT(primIndex < PA_TESS::NumPrims());

        // base of this slot's SoA data: 4 channels per attribute slot
        const float* pVertDataBase = (const float*)&m_pVertexData[slot * m_attributeStrideInVectors * 4];
        for (uint32_t i = 0; i < m_numVertsPerPrim; ++i)
#if USE_SIMD16_FRONTEND
            // the alternate (second) half of a simd16 batch lives
            // SIMD_WIDTH_DIV2 entries further into the index stream
            uint32_t index = useAlternateOffset ? m_ppIndices[i][primIndex + SIMD_WIDTH_DIV2] : m_ppIndices[i][primIndex];
            uint32_t index = m_ppIndices[i][primIndex];   // non-SIMD16 path

            const float* pVertData = pVertDataBase;
            float* pVert = (float*)&verts[i];

            for (uint32_t c = 0; c < 4; ++c)
                pVert[c] = pVertData[index];
                // step to the next channel of this attribute
                pVertData += m_attributeStrideInVectors * SIMD_WIDTH;
    // Body fragment: consumes the batch just assembled — shrinks the
    // remaining-primitive count and advances all three index streams past the
    // primitives handled this batch.
    // NOTE(review): the enclosing signature was lost in extraction —
    // presumably "void NextPrim()"; confirm against the upstream file.
    uint32_t numPrims = PA_TESS::NumPrims();

    m_numPrims -= numPrims;
    m_ppIndices[0] += numPrims;
    m_ppIndices[1] += numPrims;
    m_ppIndices[2] += numPrims;
    // Virtual-interface stubs: PA_TESS is fed by the tessellator, so the
    // VS-output / stream-output hooks have no real work to do here.
    // NOTE(review): the stub bodies (presumably returning the junk members /
    // false) appear to have been dropped by extraction — confirm upstream.
    SIMDVERTEX& GetNextVsOutput()
    bool GetNextStreamOutput()
    SIMDMASK& GetNextVsIndices()
        // NumPrims() body: primitives handled this batch, capped at one SIMD
        // batch worth of lanes
        return std::min<uint32_t>(m_numPrims, SIMD_WIDTH);
    // Returns per-lane primitive IDs for the current batch: startID broadcast
    // to every lane and offset by the m_vPrimId register (zeroed in the ctor).
    // @param startID primitive ID of the batch's first primitive
    // NOTE(review): #else/#endif markers appear dropped from this view.
    SIMDSCALARI GetPrimID(uint32_t startID)
#if USE_SIMD16_FRONTEND
        return _simd16_add_epi32(_simd16_set1_epi32(startID), m_vPrimId);
        return _simd_add_epi32(_simd_set1_epi32(startID), m_vPrimId);   // non-SIMD16 path
    // --- PA_TESS state (none of these are owned allocations) ---
    // NOTE(review): the access specifier and the #endif for the SIMD16 block
    // are not visible in this view — confirm against the upstream file.
    const SIMDSCALAR* m_pVertexData = nullptr;   // SoA post-tessellation vertex data
    uint32_t m_attributeStrideInVectors = 0;     // SIMD vectors between attribute channels
    uint32_t m_numAttributes = 0;                // valid attribute slot count
    uint32_t m_numPrims = 0;                     // primitives still to assemble
    uint32_t* m_ppIndices[3];                    // per-vertex index streams, advanced as batches complete

    uint32_t m_numVertsPerPrim = 0;              // 1/2/3 depending on binTopology (set in ctor)

    SIMDSCALARI m_vPrimId;                       // added to startID in GetPrimID; zeroed in ctor

    simdvector junkVector;          // junk simdvector for unimplemented API
#if ENABLE_AVX512_SIMD16
    simd16vector junkVector_simd16; // junk simd16vector for unimplemented API
    SIMDVERTEX junkVertex;          // junk SIMDVERTEX for unimplemented API
    SIMDMASK junkIndices;           // temporary index store for unused virtual function
1415 // Primitive Assembler factory class, responsible for creating and initializing the correct assembler
1417 template <typename IsIndexedT
, typename IsCutIndexEnabledT
>
1420 PA_FACTORY(DRAW_CONTEXT
* pDC
, PRIMITIVE_TOPOLOGY in_topo
, uint32_t numVerts
, PA_STATE::SIMDVERTEX
*pVertexStore
, uint32_t vertexStoreSize
, uint32_t vertexStride
) : topo(in_topo
)
1422 #if KNOB_ENABLE_CUT_AWARE_PA == TRUE
1423 const API_STATE
& state
= GetApiState(pDC
);
1424 if ((IsIndexedT::value
&& IsCutIndexEnabledT::value
&& (
1425 topo
== TOP_TRIANGLE_STRIP
|| topo
== TOP_POINT_LIST
||
1426 topo
== TOP_LINE_LIST
|| topo
== TOP_LINE_STRIP
||
1427 topo
== TOP_TRIANGLE_LIST
)) ||
1429 // non-indexed draws with adjacency topologies must use cut-aware PA until we add support
1430 // for them in the optimized PA
1431 (topo
== TOP_LINE_LIST_ADJ
|| topo
== TOP_LISTSTRIP_ADJ
|| topo
== TOP_TRI_LIST_ADJ
|| topo
== TOP_TRI_STRIP_ADJ
))
1433 memset(&indexStore
, 0, sizeof(indexStore
));
1434 uint32_t numAttribs
= state
.feNumAttributes
;
1436 new (&this->paCut
) PA_STATE_CUT(pDC
, reinterpret_cast<uint8_t *>(pVertexStore
), vertexStoreSize
* PA_STATE::SIMD_WIDTH
,
1437 vertexStride
, &this->indexStore
[0], numVerts
, numAttribs
, state
.topology
, false);
1443 uint32_t numPrims
= GetNumPrims(in_topo
, numVerts
);
1444 new (&this->paOpt
) PA_STATE_OPT(pDC
, numPrims
, reinterpret_cast<uint8_t *>(pVertexStore
), vertexStoreSize
* PA_STATE::SIMD_WIDTH
, vertexStride
, false);
1452 #if KNOB_ENABLE_CUT_AWARE_PA == TRUE
1467 bool cutPA
{ false };
1469 PRIMITIVE_TOPOLOGY topo
{ TOP_UNKNOWN
};
1471 PA_STATE::SIMDMASK indexStore
[MAX_NUM_VERTS_PER_PRIM
];