1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
25 * @brief Definitions for primitive assembly.
26 * N primitives are assembled at a time, where N is the SIMD width.
27 * A state machine, that is specific for a given topology, drives the
28 * assembly of vertices into triangles.
30 ******************************************************************************/
// NOTE(review): this chunk is a lossy extraction — enum/struct header lines,
// braces, and #else/#endif lines are missing from view, so code text is left
// byte-identical here and only comments are added.
//
// SIMD configuration for the primitive assembler (PA): when
// USE_SIMD16_FRONTEND is set the PA operates on 16-wide (simd16*) types,
// otherwise on the native KNOB_SIMD_WIDTH-wide simd* types.
// SIMD_WIDTH_DIV2 is half the PA width.
37 #if USE_SIMD16_FRONTEND
40 SIMD_WIDTH
= KNOB_SIMD16_WIDTH
,
41 SIMD_WIDTH_DIV2
= KNOB_SIMD16_WIDTH
/ 2,
45 typedef simd16mask SIMDMASK
;
47 typedef simd16scalar SIMDSCALAR
;
48 typedef simd16vector SIMDVECTOR
;
49 typedef simd16vertex SIMDVERTEX
;
51 typedef simd16scalari SIMDSCALARI
;
// (non-SIMD16 configuration — the selecting #else is one of the dropped lines)
56 SIMD_WIDTH
= KNOB_SIMD_WIDTH
,
57 SIMD_WIDTH_DIV2
= KNOB_SIMD_WIDTH
/ 2,
61 typedef simdmask SIMDMASK
;
63 typedef simdscalar SIMDSCALAR
;
64 typedef simdvector SIMDVECTOR
;
65 typedef simdvertex SIMDVERTEX
;
67 typedef simdscalari SIMDSCALARI
;
// Base PA state shared by all assembler variants: the draw context, the raw
// vertex stream the PA gathers attributes from, and the stream size in verts.
70 DRAW_CONTEXT
*pDC
{ nullptr }; // draw context
71 uint8_t* pStreamBase
{ nullptr }; // vertex stream
72 uint32_t streamSizeInVerts
{ 0 }; // total size of the input stream in verts
74 // The topology the binner will use. In some cases the FE changes the topology from the api state.
75 PRIMITIVE_TOPOLOGY binTopology
{ TOP_UNKNOWN
};
// With the SIMD16 frontend, selects which half of a simd16 result the
// consumer should read (used by the Assemble/AssembleSingle paths below).
77 #if ENABLE_AVX512_SIMD16
78 bool useAlternateOffset
{ false };
// PA_STATE constructor: captures the draw context, vertex-stream base pointer,
// and stream size; all other state is default-initialized above.
82 PA_STATE(DRAW_CONTEXT
*in_pDC
, uint8_t* in_pStreamBase
, uint32_t in_streamSizeInVerts
) :
83 pDC(in_pDC
), pStreamBase(in_pStreamBase
), streamSizeInVerts(in_streamSizeInVerts
) {}
// Pure-virtual interface every assembler variant (optimized, cut-aware, tess)
// implements: work query, vertex access, SIMD-wide and single-prim assembly,
// iteration/reset, and per-batch primitive-ID generation.
85 virtual bool HasWork() = 0;
86 virtual simdvector
& GetSimdVector(uint32_t index
, uint32_t slot
) = 0;
87 #if ENABLE_AVX512_SIMD16
88 virtual simd16vector
& GetSimdVector_simd16(uint32_t index
, uint32_t slot
) = 0;
// Assemble a full SIMD batch of prims for one attribute slot; returns false
// when not enough prims are available yet (per the implementations below).
90 virtual bool Assemble(uint32_t slot
, simdvector verts
[]) = 0;
91 #if ENABLE_AVX512_SIMD16
92 virtual bool Assemble_simd16(uint32_t slot
, simd16vector verts
[]) = 0;
// Assemble one primitive (primIndex within the current SIMD batch) into
// scalar __m128 vertices.
94 virtual void AssembleSingle(uint32_t slot
, uint32_t primIndex
, __m128 verts
[]) = 0;
95 virtual bool NextPrim() = 0;
96 virtual SIMDVERTEX
& GetNextVsOutput() = 0;
97 virtual bool GetNextStreamOutput() = 0;
98 virtual SIMDMASK
& GetNextVsIndices() = 0;
99 virtual uint32_t NumPrims() = 0;
100 virtual void Reset() = 0;
101 virtual SIMDSCALARI
GetPrimID(uint32_t startID
) = 0;
104 // The Optimized PA is a state machine that assembles triangles from vertex shader simd
105 // output. Here is the sequence
106 // 1. Execute FS/VS to generate a simd vertex (4 vertices for SSE simd and 8 for AVX simd).
107 // 2. Execute PA function to assemble and bin triangles.
108 // a. The PA function is a set of functions that collectively make up the
109 // state machine for a given topology.
110 // 1. We use a state index to track which PA function to call.
111 // b. Often the PA function needs 2 simd vertices in order to assemble the next triangle.
112 // 1. We call this the current and previous simd vertex.
113 // 2. The SSE simd is 4-wide which is not a multiple of 3 needed for triangles. In
114 // order to assemble the second triangle, for a triangle list, we'll need the
115 // last vertex from the previous simd and the first 2 vertices from the current simd.
116 // 3. At times the PA can assemble multiple triangles from the 2 simd vertices.
118 // This optimized PA is not cut aware, so only should be used by non-indexed draws or draws without
// PA_STATE_OPT: optimized, non-cut-aware primitive assembler. A per-topology
// state-machine function (pfnPaFunc) assembles primitives from the current
// and previous SIMD vertex; the pfnPaNext*/next* members hold the transition
// to apply when the machine advances (see SetNextState below).
120 struct PA_STATE_OPT
: public PA_STATE
122 uint32_t numPrims
{ 0 }; // Total number of primitives for draw.
123 uint32_t numPrimsComplete
{ 0 }; // Total number of complete primitives.
125 uint32_t numSimdPrims
{ 0 }; // Number of prims in current simd.
127 uint32_t cur
{ 0 }; // index to current VS output.
128 uint32_t prev
{ 0 }; // index to prev VS output. Not really needed in the state.
129 uint32_t first
{ 0 }; // index to first VS output. Used for trifan.
131 uint32_t counter
{ 0 }; // state counter
132 bool reset
{ false }; // reset state
134 uint32_t primIDIncr
{ 0 }; // how much to increment for each vector (typically vector / {1, 2})
// Signature of a state-machine step: assemble prims for one attribute slot
// into verts[]; returns whether a full SIMD batch was produced.
137 typedef bool(*PFN_PA_FUNC
)(PA_STATE_OPT
& state
, uint32_t slot
, simdvector verts
[]);
138 #if ENABLE_AVX512_SIMD16
139 typedef bool(*PFN_PA_FUNC_SIMD16
)(PA_STATE_OPT
& state
, uint32_t slot
, simd16vector verts
[]);
// Single-primitive variant: writes one prim's verts as scalar __m128 vectors.
141 typedef void(*PFN_PA_SINGLE_FUNC
)(PA_STATE_OPT
& pa
, uint32_t slot
, uint32_t primIndex
, __m128 verts
[]);
143 PFN_PA_FUNC pfnPaFunc
{ nullptr }; // PA state machine function for assembling 4 triangles.
144 #if ENABLE_AVX512_SIMD16
145 PFN_PA_FUNC_SIMD16 pfnPaFunc_simd16
{ nullptr };
147 PFN_PA_SINGLE_FUNC pfnPaSingleFunc
{ nullptr }; // PA state machine function for assembling single triangle.
148 PFN_PA_FUNC pfnPaFuncReset
{ nullptr }; // initial state to set on reset
149 #if ENABLE_AVX512_SIMD16
150 PFN_PA_FUNC_SIMD16 pfnPaFuncReset_simd16
{ nullptr };
153 // state used to advance the PA when Next is called
154 PFN_PA_FUNC pfnPaNextFunc
{ nullptr };
155 #if ENABLE_AVX512_SIMD16
156 PFN_PA_FUNC_SIMD16 pfnPaNextFunc_simd16
{ nullptr };
158 uint32_t nextNumSimdPrims
{ 0 };
159 uint32_t nextNumPrimsIncrement
{ 0 };
160 bool nextReset
{ false };
161 bool isStreaming
{ false };
163 SIMDMASK tmpIndices
{ 0 }; // temporary index store for unused virtual function
// Constructor (defined out of line): binds the draw context and vertex
// stream, and selects the initial state machine for the given topology.
166 PA_STATE_OPT(DRAW_CONTEXT
* pDC
, uint32_t numPrims
, uint8_t* pStream
, uint32_t streamSizeInVerts
,
167 bool in_isStreaming
, PRIMITIVE_TOPOLOGY topo
= TOP_UNKNOWN
);
// PA_STATE_OPT method bodies (fragmentary — enclosing declarations/braces
// were dropped by the extraction; comments only, code untouched).
//
// HasWork: more prims remain while numPrimsComplete < numPrims.
171 return (this->numPrimsComplete
< this->numPrims
) ? true : false;
// GetSimdVector: reinterpret the raw stream as simdvertex and return the
// requested attribute slot of the indexed SIMD vertex.
174 simdvector
& GetSimdVector(uint32_t index
, uint32_t slot
)
176 simdvertex
* pVertex
= (simdvertex
*)pStreamBase
;
177 return pVertex
[index
].attrib
[slot
];
180 #if ENABLE_AVX512_SIMD16
181 simd16vector
& GetSimdVector_simd16(uint32_t index
, uint32_t slot
)
183 simd16vertex
* pVertex
= (simd16vertex
*)pStreamBase
;
184 return pVertex
[index
].attrib
[slot
];
188 // Assembles 4 triangles. Each simdvector is a single vertex from 4
189 // triangles (xxxx yyyy zzzz wwww) and there are 3 verts per triangle.
190 bool Assemble(uint32_t slot
, simdvector verts
[])
192 return this->pfnPaFunc(*this, slot
, verts
);
195 #if ENABLE_AVX512_SIMD16
196 bool Assemble_simd16(uint32_t slot
, simd16vector verts
[])
198 return this->pfnPaFunc_simd16(*this, slot
, verts
);
202 // Assembles 1 primitive. Each simdscalar is a vertex (xyzw).
203 void AssembleSingle(uint32_t slot
, uint32_t primIndex
, __m128 verts
[])
205 return this->pfnPaSingleFunc(*this, slot
, primIndex
, verts
);
// State advance (presumably the body of NextPrim — its declaration is one of
// the dropped lines): install the queued next-state function pointers and
// counters recorded by SetNextState.
210 this->pfnPaFunc
= this->pfnPaNextFunc
;
211 #if ENABLE_AVX512_SIMD16
212 this->pfnPaFunc_simd16
= this->pfnPaNextFunc_simd16
;
214 this->numSimdPrims
= this->nextNumSimdPrims
;
215 this->numPrimsComplete
+= this->nextNumPrimsIncrement
;
216 this->reset
= this->nextReset
;
218 if (this->isStreaming
)
223 bool morePrims
= false;
// Consume prims from the current SIMD batch before advancing the counter.
225 if (this->numSimdPrims
> 0)
228 this->numSimdPrims
--;
// Advance (or reset) the running vertex counter for the state machine.
232 this->counter
= (this->reset
) ? 0 : (this->counter
+ 1);
236 this->pfnPaFunc
= this->pfnPaNextFunc
;
240 morePrims
= false; // no more to do
246 SIMDVERTEX
& GetNextVsOutput()
248 // increment cur and prev indices
249 const uint32_t numSimdVerts
= this->streamSizeInVerts
/ SIMD_WIDTH
;
250 this->prev
= this->cur
; // prev is undefined for first state.
251 this->cur
= this->counter
% numSimdVerts
;
253 SIMDVERTEX
* pVertex
= (SIMDVERTEX
*)pStreamBase
;
254 return pVertex
[this->cur
];
257 SIMDMASK
& GetNextVsIndices()
259 // unused in optimized PA, pass tmp buffer back
263 bool GetNextStreamOutput()
265 this->prev
= this->cur
;
266 this->cur
= this->counter
;
// NumPrims clamp (presumably the tail of NumPrims() — header dropped):
// the final vector may be partially full, so return SIMD_WIDTH minus the
// overshoot past the draw's total prim count.
273 return (this->numPrimsComplete
+ this->nextNumPrimsIncrement
> this->numPrims
) ?
274 (SIMD_WIDTH
- (this->numPrimsComplete
+ this->nextNumPrimsIncrement
- this->numPrims
)) : SIMD_WIDTH
;
// SetNextState: record the state-machine transition (function pointers and
// counters) to install on the next advance; also swaps in the single-prim
// assembly function immediately.
277 void SetNextState(PA_STATE_OPT::PFN_PA_FUNC pfnPaNextFunc
,
278 PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc
,
279 uint32_t numSimdPrims
= 0,
280 uint32_t numPrimsIncrement
= 0,
283 this->pfnPaNextFunc
= pfnPaNextFunc
;
284 this->nextNumSimdPrims
= numSimdPrims
;
285 this->nextNumPrimsIncrement
= numPrimsIncrement
;
286 this->nextReset
= reset
;
288 this->pfnPaSingleFunc
= pfnPaNextSingleFunc
;
291 #if ENABLE_AVX512_SIMD16
// SIMD16 variant of SetNextState — identical bookkeeping, different
// state-machine function-pointer type.
292 void SetNextState_simd16(PA_STATE_OPT::PFN_PA_FUNC_SIMD16 pfnPaNextFunc_simd16
,
293 PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc
,
294 uint32_t numSimdPrims
= 0,
295 uint32_t numPrimsIncrement
= 0,
298 this->pfnPaNextFunc_simd16
= pfnPaNextFunc_simd16
;
299 this->nextNumSimdPrims
= numSimdPrims
;
300 this->nextNumPrimsIncrement
= numPrimsIncrement
;
301 this->nextReset
= reset
;
303 this->pfnPaSingleFunc
= pfnPaNextSingleFunc
;
// Reset (header line dropped): restore per-draw initial state so the
// assembler can be reused.
309 #if ENABLE_AVX512_SIMD16
310 useAlternateOffset
= false;
313 this->pfnPaFunc
= this->pfnPaFuncReset
;
314 this->numPrimsComplete
= 0;
315 this->numSimdPrims
= 0;
// GetPrimID: base startID plus primIDIncr per completed SIMD batch, added to
// the per-lane primID vector.
323 SIMDSCALARI
GetPrimID(uint32_t startID
)
325 #if USE_SIMD16_FRONTEND
326 return _simd16_add_epi32(this->primID
,
327 _simd16_set1_epi32(startID
+ this->primIDIncr
* (this->numPrimsComplete
/ SIMD_WIDTH
)));
329 return _simd_add_epi32(this->primID
,
330 _simd_set1_epi32(startID
+ this->primIDIncr
* (this->numPrimsComplete
/ SIMD_WIDTH
)));
335 // helper C wrappers to avoid having to rewrite all the PA topology state functions
// Each wrapper simply forwards to the corresponding PA_STATE_OPT / PA_STATE
// member so topology state functions can use a plain-function call style.
336 INLINE
void SetNextPaState(PA_STATE_OPT
& pa
, PA_STATE_OPT::PFN_PA_FUNC pfnPaNextFunc
,
337 PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc
,
338 uint32_t numSimdPrims
= 0,
339 uint32_t numPrimsIncrement
= 0,
342 return pa
.SetNextState(pfnPaNextFunc
, pfnPaNextSingleFunc
, numSimdPrims
, numPrimsIncrement
, reset
);
345 #if ENABLE_AVX512_SIMD16
346 INLINE
void SetNextPaState_simd16(PA_STATE_OPT
& pa
, PA_STATE_OPT::PFN_PA_FUNC_SIMD16 pfnPaNextFunc_simd16
,
347 PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc
,
348 uint32_t numSimdPrims
= 0,
349 uint32_t numPrimsIncrement
= 0,
352 return pa
.SetNextState_simd16(pfnPaNextFunc_simd16
, pfnPaNextSingleFunc
, numSimdPrims
, numPrimsIncrement
, reset
);
// Forwarding accessor for the virtual GetSimdVector on any PA_STATE variant.
356 INLINE simdvector
& PaGetSimdVector(PA_STATE
& pa
, uint32_t index
, uint32_t slot
)
358 return pa
.GetSimdVector(index
, slot
);
361 #if ENABLE_AVX512_SIMD16
362 INLINE simd16vector
& PaGetSimdVector_simd16(PA_STATE
& pa
, uint32_t index
, uint32_t slot
)
364 return pa
.GetSimdVector_simd16(index
, slot
);
368 // Cut-aware primitive assembler.
// PA_STATE_CUT buffers incoming vertex indices (with a 1-bit-per-vertex cut
// mask) and gathers attributes from the vertex store at Assemble time. The
// constructor selects a per-topology member-function handler (pfnPa).
369 struct PA_STATE_CUT
: public PA_STATE
371 SIMDMASK
* pCutIndices
{ nullptr }; // cut indices buffer, 1 bit per vertex
372 uint32_t numVerts
{ 0 }; // number of vertices available in buffer store
373 uint32_t numAttribs
{ 0 }; // number of attributes
374 int32_t numRemainingVerts
{ 0 }; // number of verts remaining to be assembled
375 uint32_t numVertsToAssemble
{ 0 }; // total number of verts to assemble for the draw
376 #if ENABLE_AVX512_SIMD16
377 OSALIGNSIMD16(uint32_t) indices
[MAX_NUM_VERTS_PER_PRIM
][SIMD_WIDTH
]; // current index buffer for gather
379 OSALIGNSIMD(uint32_t) indices
[MAX_NUM_VERTS_PER_PRIM
][SIMD_WIDTH
]; // current index buffer for gather
381 SIMDSCALARI vOffsets
[MAX_NUM_VERTS_PER_PRIM
]; // byte offsets for currently assembling simd
382 uint32_t numPrimsAssembled
{ 0 }; // number of primitives that are fully assembled
383 uint32_t headVertex
{ 0 }; // current unused vertex slot in vertex buffer store
384 uint32_t tailVertex
{ 0 }; // beginning vertex currently assembling
385 uint32_t curVertex
{ 0 }; // current unprocessed vertex
386 uint32_t startPrimId
{ 0 }; // starting prim id
387 SIMDSCALARI vPrimId
; // vector of prim ID
388 bool needOffsets
{ false }; // need to compute gather offsets for current SIMD
389 uint32_t vertsPerPrim
{ 0 };
390 bool processCutVerts
{ false }; // vertex indices with cuts should be processed as normal, otherwise they
391 // are ignored. Fetch shader sends invalid verts on cuts that should be ignored
392 // while the GS sends valid verts for every index
393 // Topology state tracking
394 uint32_t vert
[MAX_NUM_VERTS_PER_PRIM
];
395 uint32_t curIndex
{ 0 };
396 bool reverseWinding
{ false }; // indicates reverse winding for strips
397 int32_t adjExtraVert
{ 0 }; // extra vert uses for tristrip w/ adj
399 typedef void(PA_STATE_CUT::* PFN_PA_FUNC
)(uint32_t vert
, bool finish
);
400 PFN_PA_FUNC pfnPa
{ nullptr }; // per-topology function that processes a single vert
// Constructor: bind stream/cut-index buffers, initialize counters and the
// per-lane prim-ID vector, then pick the topology handler.
403 PA_STATE_CUT(DRAW_CONTEXT
* pDC
, uint8_t* in_pStream
, uint32_t in_streamSizeInVerts
, SIMDMASK
* in_pIndices
, uint32_t in_numVerts
,
404 uint32_t in_numAttribs
, PRIMITIVE_TOPOLOGY topo
, bool in_processCutVerts
)
405 : PA_STATE(pDC
, in_pStream
, in_streamSizeInVerts
)
407 numVerts
= in_streamSizeInVerts
;
408 numAttribs
= in_numAttribs
;
411 processCutVerts
= in_processCutVerts
;
413 numVertsToAssemble
= numRemainingVerts
= in_numVerts
;
414 numPrimsAssembled
= 0;
415 headVertex
= tailVertex
= curVertex
= 0;
418 pCutIndices
= in_pIndices
;
419 memset(indices
, 0, sizeof(indices
));
420 #if USE_SIMD16_FRONTEND
421 vPrimId
= _simd16_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
423 vPrimId
= _simd_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
425 reverseWinding
= false;
428 bool gsEnabled
= pDC
->pState
->state
.gsState
.gsEnable
;
429 vertsPerPrim
= NumVertsPerPrim(topo
, gsEnabled
);
// Topology dispatch: *_NoGs handlers are used when no geometry shader is
// bound (the switch header is one of the dropped lines).
433 case TOP_TRIANGLE_LIST
: pfnPa
= &PA_STATE_CUT::ProcessVertTriList
; break;
434 case TOP_TRI_LIST_ADJ
: pfnPa
= gsEnabled
? &PA_STATE_CUT::ProcessVertTriListAdj
: &PA_STATE_CUT::ProcessVertTriListAdjNoGs
; break;
435 case TOP_TRIANGLE_STRIP
: pfnPa
= &PA_STATE_CUT::ProcessVertTriStrip
; break;
436 case TOP_TRI_STRIP_ADJ
: if (gsEnabled
)
438 pfnPa
= &PA_STATE_CUT::ProcessVertTriStripAdj
< true > ;
442 pfnPa
= &PA_STATE_CUT::ProcessVertTriStripAdj
< false > ;
446 case TOP_POINT_LIST
: pfnPa
= &PA_STATE_CUT::ProcessVertPointList
; break;
447 case TOP_LINE_LIST
: pfnPa
= &PA_STATE_CUT::ProcessVertLineList
; break;
448 case TOP_LINE_LIST_ADJ
: pfnPa
= gsEnabled
? &PA_STATE_CUT::ProcessVertLineListAdj
: &PA_STATE_CUT::ProcessVertLineListAdjNoGs
; break;
449 case TOP_LINE_STRIP
: pfnPa
= &PA_STATE_CUT::ProcessVertLineStrip
; break;
450 case TOP_LISTSTRIP_ADJ
: pfnPa
= gsEnabled
? &PA_STATE_CUT::ProcessVertLineStripAdj
: &PA_STATE_CUT::ProcessVertLineStripAdjNoGs
; break;
451 default: assert(0 && "Unimplemented topology");
// Vertex-store management, cut testing, and gather-offset computation for the
// cut-aware PA (comment-only pass; several brace/#else/#endif lines missing).
//
// GetNextVsOutput: hand out the next SIMD-wide slot in the ring of vertex
// storage; wraps headVertex modulo numVerts and marks offsets stale.
455 SIMDVERTEX
& GetNextVsOutput()
457 uint32_t vertexIndex
= this->headVertex
/ SIMD_WIDTH
;
458 this->headVertex
= (this->headVertex
+ SIMD_WIDTH
) % this->numVerts
;
459 this->needOffsets
= true;
460 return ((SIMDVERTEX
*)pStreamBase
)[vertexIndex
];
// GetNextVsIndices: the cut-mask word corresponding to the head SIMD batch.
463 SIMDMASK
& GetNextVsIndices()
465 uint32_t vertexIndex
= this->headVertex
/ SIMD_WIDTH
;
466 SIMDMASK
* pCurCutIndex
= this->pCutIndices
+ vertexIndex
;
467 return *pCurCutIndex
;
// Direct SIMD-vector access is unsupported in the cut-aware PA; asserts and
// returns a static dummy.
470 simdvector
& GetSimdVector(uint32_t index
, uint32_t slot
)
473 SWR_ASSERT(0 && "Not implemented");
474 static simdvector junk
;
478 #if ENABLE_AVX512_SIMD16
479 simd16vector
& GetSimdVector_simd16(uint32_t index
, uint32_t slot
)
482 SWR_ASSERT(0 && "Not implemented");
483 static simd16vector junk
;
488 bool GetNextStreamOutput()
490 this->headVertex
+= SIMD_WIDTH
;
491 this->needOffsets
= true;
// GetPrimID: startID broadcast plus the running per-lane prim-ID vector.
495 SIMDSCALARI
GetPrimID(uint32_t startID
)
497 #if USE_SIMD16_FRONTEND
498 return _simd16_add_epi32(_simd16_set1_epi32(startID
), this->vPrimId
);
500 return _simd_add_epi32(_simd_set1_epi32(startID
), this->vPrimId
);
// Reset (header line dropped): restore per-draw initial state for reuse.
506 #if ENABLE_AVX512_SIMD16
507 useAlternateOffset
= false;
510 this->numRemainingVerts
= this->numVertsToAssemble
;
511 this->numPrimsAssembled
= 0;
514 this->tailVertex
= 0;
515 this->headVertex
= 0;
516 this->reverseWinding
= false;
517 this->adjExtraVert
= -1;
518 #if USE_SIMD16_FRONTEND
519 this->vPrimId
= _simd16_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
521 this->vPrimId
= _simd_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
// HasWork (header dropped): verts remain, or a pending tristrip-adj extra vert.
527 return this->numRemainingVerts
> 0 || this->adjExtraVert
!= -1;
// Ring-buffer full test: advancing head by one SIMD batch would hit tail.
530 bool IsVertexStoreFull()
532 return ((this->headVertex
+ SIMD_WIDTH
) % this->numVerts
) == this->tailVertex
;
535 void RestartTopology()
538 this->reverseWinding
= false;
539 this->adjExtraVert
= -1;
// IsCutIndex: test the per-vertex cut bit (bit index = vertex % SIMD_WIDTH
// within word vertex / SIMD_WIDTH).
542 bool IsCutIndex(uint32_t vertex
)
544 uint32_t vertexIndex
= vertex
/ SIMD_WIDTH
;
545 uint32_t vertexOffset
= vertex
& (SIMD_WIDTH
- 1);
546 return _bittest((const LONG
*)&this->pCutIndices
[vertexIndex
], vertexOffset
) == 1;
549 // iterates across the unprocessed verts until we hit the end or we
550 // have assembled SIMD prims
553 while (this->numPrimsAssembled
!= SIMD_WIDTH
&&
554 this->numRemainingVerts
> 0 &&
555 this->curVertex
!= this->headVertex
)
557 // if cut index, restart topology
558 if (IsCutIndex(this->curVertex
))
560 if (this->processCutVerts
)
562 (this->*pfnPa
)(this->curVertex
, false);
564 // finish off tri strip w/ adj before restarting topo
565 if (this->adjExtraVert
!= -1)
567 (this->*pfnPa
)(this->curVertex
, true);
573 (this->*pfnPa
)(this->curVertex
, false);
577 if (this->curVertex
>= this->numVerts
) {
580 this->numRemainingVerts
--;
583 // special case last primitive for tri strip w/ adj
584 if (this->numPrimsAssembled
!= SIMD_WIDTH
&& this->numRemainingVerts
== 0 && this->adjExtraVert
!= -1)
586 (this->*pfnPa
)(this->curVertex
, true);
// Batch completion (presumably part of the advance path — enclosing function
// header was dropped): advance tail, reset the assembled count, and bump the
// per-lane prim-ID vector by one SIMD batch.
592 // done with current batch
593 // advance tail to the current unsubmitted vertex
594 this->tailVertex
= this->curVertex
;
595 this->numPrimsAssembled
= 0;
596 #if USE_SIMD16_FRONTEND
597 this->vPrimId
= _simd16_add_epi32(vPrimId
, _simd16_set1_epi32(SIMD_WIDTH
));
599 this->vPrimId
= _simd_add_epi32(vPrimId
, _simd_set1_epi32(SIMD_WIDTH
));
605 // if we've assembled enough prims, we can advance to the next set of verts
606 if (this->numPrimsAssembled
== SIMD_WIDTH
|| this->numRemainingVerts
<= 0)
// ComputeOffsets: convert the buffered per-prim vertex indices into byte
// offsets into the vertex store — batch offset
// (index >> SIMD_WIDTH_LOG2) * sizeof(SIMDVERTEX) plus lane offset
// (index & (SIMD_WIDTH-1)) * sizeof(float).
613 void ComputeOffsets()
615 for (uint32_t v
= 0; v
< this->vertsPerPrim
; ++v
)
617 SIMDSCALARI vIndices
= *(SIMDSCALARI
*)&this->indices
[v
][0];
619 // step to simdvertex batch
620 const uint32_t simdShift
= SIMD_WIDTH_LOG2
;
621 #if USE_SIMD16_FRONTEND
622 SIMDSCALARI vVertexBatch
= _simd16_srai_epi32(vIndices
, simdShift
);
623 this->vOffsets
[v
] = _simd16_mullo_epi32(vVertexBatch
, _simd16_set1_epi32(sizeof(SIMDVERTEX
)));
625 SIMDSCALARI vVertexBatch
= _simd_srai_epi32(vIndices
, simdShift
);
626 this->vOffsets
[v
] = _simd_mullo_epi32(vVertexBatch
, _simd_set1_epi32(sizeof(SIMDVERTEX
)));
630 const uint32_t simdMask
= SIMD_WIDTH
- 1;
631 #if USE_SIMD16_FRONTEND
632 SIMDSCALARI vVertexIndex
= _simd16_and_si(vIndices
, _simd16_set1_epi32(simdMask
));
633 this->vOffsets
[v
] = _simd16_add_epi32(this->vOffsets
[v
], _simd16_mullo_epi32(vVertexIndex
, _simd16_set1_epi32(sizeof(float))));
635 SIMDSCALARI vVertexIndex
= _simd_and_si(vIndices
, _simd_set1_epi32(simdMask
));
636 this->vOffsets
[v
] = _simd_add_epi32(this->vOffsets
[v
], _simd_mullo_epi32(vVertexIndex
, _simd_set1_epi32(sizeof(float))));
// Gather-based Assemble paths: fetch each prim vertex's 4 components via
// SIMD i32 gathers using the byte offsets from ComputeOffsets. AssembleSingle
// copies one primitive's vertices scalar-by-scalar.
641 bool Assemble(uint32_t slot
, simdvector verts
[])
643 // process any outstanding verts
646 // return false if we don't have enough prims assembled
647 if (this->numPrimsAssembled
!= SIMD_WIDTH
&& this->numRemainingVerts
> 0)
652 // cache off gather offsets given the current SIMD set of indices the first time we get an assemble
653 if (this->needOffsets
)
656 this->needOffsets
= false;
659 for (uint32_t v
= 0; v
< this->vertsPerPrim
; ++v
)
661 SIMDSCALARI offsets
= this->vOffsets
[v
];
// Advance the gather offsets to the requested attribute slot.
664 #if USE_SIMD16_FRONTEND
665 offsets
= _simd16_add_epi32(offsets
, _simd16_set1_epi32(slot
* sizeof(SIMDVECTOR
)));
667 offsets
= _simd_add_epi32(offsets
, _simd_set1_epi32(slot
* sizeof(SIMDVECTOR
)));
670 float* pBase
= (float*)this->pStreamBase
;
671 for (uint32_t c
= 0; c
< 4; ++c
)
673 #if USE_SIMD16_FRONTEND
// With the SIMD16 frontend, gather 16 wide and hand back whichever half
// useAlternateOffset selects.
674 simd16scalar temp
= _simd16_i32gather_ps(pBase
, offsets
, 1);
676 verts
[v
].v
[c
] = useAlternateOffset
? temp
.hi
: temp
.lo
;
678 verts
[v
].v
[c
] = _simd_i32gather_ps(pBase
, offsets
, 1);
681 // move base to next component
689 #if ENABLE_AVX512_SIMD16
// simd16-output variant: same structure; without the SIMD16 frontend the
// upper half of each output vector is zero-filled.
690 bool Assemble_simd16(uint32_t slot
, simd16vector verts
[])
692 // process any outstanding verts
695 // return false if we don't have enough prims assembled
696 if (this->numPrimsAssembled
!= SIMD_WIDTH
&& this->numRemainingVerts
> 0)
701 // cache off gather offsets given the current SIMD set of indices the first time we get an assemble
702 if (this->needOffsets
)
705 this->needOffsets
= false;
708 for (uint32_t v
= 0; v
< this->vertsPerPrim
; ++v
)
710 SIMDSCALARI offsets
= this->vOffsets
[v
];
713 #if USE_SIMD16_FRONTEND
714 offsets
= _simd16_add_epi32(offsets
, _simd16_set1_epi32(slot
* sizeof(SIMDVECTOR
)));
716 offsets
= _simd_add_epi32(offsets
, _simd_set1_epi32(slot
* sizeof(simdvector
)));
719 float* pBase
= (float*)this->pStreamBase
;
720 for (uint32_t c
= 0; c
< 4; ++c
)
722 #if USE_SIMD16_FRONTEND
723 verts
[v
].v
[c
] = _simd16_i32gather_ps(pBase
, offsets
, 1);
725 verts
[v
].v
[c
].lo
= _simd_i32gather_ps(pBase
, offsets
, 1);
726 verts
[v
].v
[c
].hi
= _simd_setzero_ps();
729 // move base to next component
// AssembleSingle: scalar path — read the precomputed byte offset for one
// prim (triIndex lane; alternate half under the SIMD16 frontend) and copy
// the 4 components of each vertex out of the SoA vertex store.
738 void AssembleSingle(uint32_t slot
, uint32_t triIndex
, __m128 tri
[3])
741 for (uint32_t v
= 0; v
< this->vertsPerPrim
; ++v
)
743 uint32_t* pOffset
= (uint32_t*)&this->vOffsets
[v
];
744 #if USE_SIMD16_FRONTEND
745 uint32_t offset
= useAlternateOffset
? pOffset
[triIndex
+ SIMD_WIDTH_DIV2
] : pOffset
[triIndex
];
747 uint32_t offset
= pOffset
[triIndex
];
749 offset
+= sizeof(SIMDVECTOR
) * slot
;
750 float* pVert
= (float*)&tri
[v
];
751 for (uint32_t c
= 0; c
< 4; ++c
)
753 float* pComponent
= (float*)(this->pStreamBase
+ offset
);
754 pVert
[c
] = *pComponent
;
// Components are stored SIMD_WIDTH floats apart (SoA layout).
755 offset
+= SIMD_WIDTH
* sizeof(float);
// NumPrims (header dropped): count of fully assembled prims in the batch.
762 return this->numPrimsAssembled
;
765 // Per-topology functions
// Each handler buffers the incoming vertex index into this->vert[] (using
// curIndex, whose increment/reset lines were dropped by the extraction) and,
// once a full primitive is present, writes the prim's vertex indices into
// this->indices[][numPrimsAssembled] for the later SIMD gather.
//
// Triangle strip: alternates winding each prim (reverseWinding) by swapping
// verts 1 and 2, then shifts the window by one vertex.
766 void ProcessVertTriStrip(uint32_t index
, bool finish
)
768 this->vert
[this->curIndex
] = index
;
770 if (this->curIndex
== 3)
772 // assembled enough verts for prim, add to gather indices
773 this->indices
[0][this->numPrimsAssembled
] = this->vert
[0];
776 this->indices
[1][this->numPrimsAssembled
] = this->vert
[2];
777 this->indices
[2][this->numPrimsAssembled
] = this->vert
[1];
781 this->indices
[1][this->numPrimsAssembled
] = this->vert
[1];
782 this->indices
[2][this->numPrimsAssembled
] = this->vert
[2];
785 // increment numPrimsAssembled
786 this->numPrimsAssembled
++;
788 // set up next prim state
789 this->vert
[0] = this->vert
[1];
790 this->vert
[1] = this->vert
[2];
792 this->reverseWinding
^= 1;
// Emit one tri-strip-adjacency prim: with a GS bound all 6 verts are
// emitted; without one, only the inner triangle (verts 0/1/2 after the
// gsEnabled-path shuffle) is emitted.
796 template<bool gsEnabled
>
797 void AssembleTriStripAdj()
801 this->vert
[1] = this->vert
[2];
802 this->vert
[2] = this->vert
[4];
804 this->indices
[0][this->numPrimsAssembled
] = this->vert
[0];
805 this->indices
[1][this->numPrimsAssembled
] = this->vert
[1];
806 this->indices
[2][this->numPrimsAssembled
] = this->vert
[2];
808 this->vert
[4] = this->vert
[2];
809 this->vert
[2] = this->vert
[1];
813 this->indices
[0][this->numPrimsAssembled
] = this->vert
[0];
814 this->indices
[1][this->numPrimsAssembled
] = this->vert
[1];
815 this->indices
[2][this->numPrimsAssembled
] = this->vert
[2];
816 this->indices
[3][this->numPrimsAssembled
] = this->vert
[3];
817 this->indices
[4][this->numPrimsAssembled
] = this->vert
[4];
818 this->indices
[5][this->numPrimsAssembled
] = this->vert
[5];
820 this->numPrimsAssembled
++;
// Tri strip with adjacency: the most intricate handler — tracks an extra
// adjacency vertex (adjExtraVert) across prims and rotates the 6-vert
// window (nextTri) with alternating winding. finish==true flushes the
// final prim of the strip.
824 template<bool gsEnabled
>
825 void ProcessVertTriStripAdj(uint32_t index
, bool finish
)
827 // handle last primitive of tristrip
828 if (finish
&& this->adjExtraVert
!= -1)
830 this->vert
[3] = this->adjExtraVert
;
831 AssembleTriStripAdj
<gsEnabled
>();
832 this->adjExtraVert
= -1;
836 switch (this->curIndex
)
842 this->vert
[this->curIndex
] = index
;
846 this->vert
[5] = index
;
850 if (this->adjExtraVert
== -1)
852 this->adjExtraVert
= index
;
856 this->vert
[3] = index
;
859 AssembleTriStripAdj
<gsEnabled
>();
862 if (this->reverseWinding
)
864 nextTri
[0] = this->vert
[4];
865 nextTri
[1] = this->vert
[0];
866 nextTri
[2] = this->vert
[2];
867 nextTri
[4] = this->vert
[3];
868 nextTri
[5] = this->adjExtraVert
;
872 nextTri
[0] = this->vert
[2];
873 nextTri
[1] = this->adjExtraVert
;
874 nextTri
[2] = this->vert
[3];
875 nextTri
[4] = this->vert
[4];
876 nextTri
[5] = this->vert
[0];
878 for (uint32_t i
= 0; i
< 6; ++i
)
880 this->vert
[i
] = nextTri
[i
];
883 this->adjExtraVert
= -1;
884 this->reverseWinding
^= 1;
// NOTE(review): "Algorith" is a typo in the assert message below; it is a
// runtime string, so it is left untouched in this comment-only pass.
893 SWR_ASSERT(this->adjExtraVert
!= -1, "Algorith failure!");
894 AssembleTriStripAdj
<gsEnabled
>();
897 if (this->reverseWinding
)
899 nextTri
[0] = this->vert
[4];
900 nextTri
[1] = this->vert
[0];
901 nextTri
[2] = this->vert
[2];
902 nextTri
[4] = this->vert
[3];
903 nextTri
[5] = this->adjExtraVert
;
907 nextTri
[0] = this->vert
[2];
908 nextTri
[1] = this->adjExtraVert
;
909 nextTri
[2] = this->vert
[3];
910 nextTri
[4] = this->vert
[4];
911 nextTri
[5] = this->vert
[0];
913 for (uint32_t i
= 0; i
< 6; ++i
)
915 this->vert
[i
] = nextTri
[i
];
917 this->reverseWinding
^= 1;
918 this->adjExtraVert
= index
;
// Triangle list: every 3 verts form an independent prim.
924 void ProcessVertTriList(uint32_t index
, bool finish
)
926 this->vert
[this->curIndex
] = index
;
928 if (this->curIndex
== 3)
930 // assembled enough verts for prim, add to gather indices
931 this->indices
[0][this->numPrimsAssembled
] = this->vert
[0];
932 this->indices
[1][this->numPrimsAssembled
] = this->vert
[1];
933 this->indices
[2][this->numPrimsAssembled
] = this->vert
[2];
935 // increment numPrimsAssembled
936 this->numPrimsAssembled
++;
938 // set up next prim state
// Triangle list with adjacency (GS bound): all 6 verts are emitted.
943 void ProcessVertTriListAdj(uint32_t index
, bool finish
)
945 this->vert
[this->curIndex
] = index
;
947 if (this->curIndex
== 6)
949 // assembled enough verts for prim, add to gather indices
950 this->indices
[0][this->numPrimsAssembled
] = this->vert
[0];
951 this->indices
[1][this->numPrimsAssembled
] = this->vert
[1];
952 this->indices
[2][this->numPrimsAssembled
] = this->vert
[2];
953 this->indices
[3][this->numPrimsAssembled
] = this->vert
[3];
954 this->indices
[4][this->numPrimsAssembled
] = this->vert
[4];
955 this->indices
[5][this->numPrimsAssembled
] = this->vert
[5];
957 // increment numPrimsAssembled
958 this->numPrimsAssembled
++;
960 // set up next prim state
// Triangle list with adjacency, no GS: only the inner triangle
// (verts 0, 2, 4) is emitted; adjacency verts are dropped.
965 void ProcessVertTriListAdjNoGs(uint32_t index
, bool finish
)
967 this->vert
[this->curIndex
] = index
;
969 if (this->curIndex
== 6)
971 // assembled enough verts for prim, add to gather indices
972 this->indices
[0][this->numPrimsAssembled
] = this->vert
[0];
973 this->indices
[1][this->numPrimsAssembled
] = this->vert
[2];
974 this->indices
[2][this->numPrimsAssembled
] = this->vert
[4];
976 // increment numPrimsAssembled
977 this->numPrimsAssembled
++;
979 // set up next prim state
// Line list: every 2 verts form an independent line.
985 void ProcessVertLineList(uint32_t index
, bool finish
)
987 this->vert
[this->curIndex
] = index
;
989 if (this->curIndex
== 2)
991 this->indices
[0][this->numPrimsAssembled
] = this->vert
[0];
992 this->indices
[1][this->numPrimsAssembled
] = this->vert
[1];
994 this->numPrimsAssembled
++;
// Line strip: each new vert closes a line with the previous one.
999 void ProcessVertLineStrip(uint32_t index
, bool finish
)
1001 this->vert
[this->curIndex
] = index
;
1003 if (this->curIndex
== 2)
1005 // assembled enough verts for prim, add to gather indices
1006 this->indices
[0][this->numPrimsAssembled
] = this->vert
[0];
1007 this->indices
[1][this->numPrimsAssembled
] = this->vert
[1];
1009 // increment numPrimsAssembled
1010 this->numPrimsAssembled
++;
1012 // set up next prim state
1013 this->vert
[0] = this->vert
[1];
// Line strip with adjacency (GS bound): emit all 4 verts, slide window by 1.
1018 void ProcessVertLineStripAdj(uint32_t index
, bool finish
)
1020 this->vert
[this->curIndex
] = index
;
1022 if (this->curIndex
== 4)
1024 // assembled enough verts for prim, add to gather indices
1025 this->indices
[0][this->numPrimsAssembled
] = this->vert
[0];
1026 this->indices
[1][this->numPrimsAssembled
] = this->vert
[1];
1027 this->indices
[2][this->numPrimsAssembled
] = this->vert
[2];
1028 this->indices
[3][this->numPrimsAssembled
] = this->vert
[3];
1030 // increment numPrimsAssembled
1031 this->numPrimsAssembled
++;
1033 // set up next prim state
1034 this->vert
[0] = this->vert
[1];
1035 this->vert
[1] = this->vert
[2];
1036 this->vert
[2] = this->vert
[3];
// Line strip with adjacency, no GS: emit only the inner line (verts 1, 2).
1041 void ProcessVertLineStripAdjNoGs(uint32_t index
, bool finish
)
1043 this->vert
[this->curIndex
] = index
;
1045 if (this->curIndex
== 4)
1047 // assembled enough verts for prim, add to gather indices
1048 this->indices
[0][this->numPrimsAssembled
] = this->vert
[1];
1049 this->indices
[1][this->numPrimsAssembled
] = this->vert
[2];
1051 // increment numPrimsAssembled
1052 this->numPrimsAssembled
++;
1054 // set up next prim state
1055 this->vert
[0] = this->vert
[1];
1056 this->vert
[1] = this->vert
[2];
1057 this->vert
[2] = this->vert
[3];
// Line list with adjacency (GS bound): every 4 verts form one prim.
1062 void ProcessVertLineListAdj(uint32_t index
, bool finish
)
1064 this->vert
[this->curIndex
] = index
;
1066 if (this->curIndex
== 4)
1068 this->indices
[0][this->numPrimsAssembled
] = this->vert
[0];
1069 this->indices
[1][this->numPrimsAssembled
] = this->vert
[1];
1070 this->indices
[2][this->numPrimsAssembled
] = this->vert
[2];
1071 this->indices
[3][this->numPrimsAssembled
] = this->vert
[3];
1073 this->numPrimsAssembled
++;
// Line list with adjacency, no GS: emit only the inner line (verts 1, 2).
1078 void ProcessVertLineListAdjNoGs(uint32_t index
, bool finish
)
1080 this->vert
[this->curIndex
] = index
;
1082 if (this->curIndex
== 4)
1084 this->indices
[0][this->numPrimsAssembled
] = this->vert
[1];
1085 this->indices
[1][this->numPrimsAssembled
] = this->vert
[2];
1087 this->numPrimsAssembled
++;
// Point list: each vert is its own prim.
1092 void ProcessVertPointList(uint32_t index
, bool finish
)
1094 this->vert
[this->curIndex
] = index
;
1096 if (this->curIndex
== 1)
1098 this->indices
[0][this->numPrimsAssembled
] = this->vert
[0];
1099 this->numPrimsAssembled
++;
//////////////////////////////////////////////////////////////////////////
/// PA_TESS - primitive assembly for vertex data produced by the tessellation
/// (domain shader) stage.  Attributes live in SoA form in m_pVertexData;
/// three per-vertex index streams (m_ppIndices) select each primitive's
/// vertices.  At most SIMD_WIDTH primitives are assembled per pass (see the
/// std::min in the NumPrims body below).
/// NOTE(review): this chunk was extracted lossily -- braces, blank lines and
/// several statement lines are missing -- so the comments below annotate only
/// the tokens that are visible.
1105 // Primitive Assembly for data output from the DomainShader.
1106 struct PA_TESS
: PA_STATE
// Constructor: forwards the draw context to PA_STATE, then captures the SoA
// vertex data, the attribute stride (measured in SIMD vectors), the attribute
// count, the three index stream pointers, and the primitive count.
1109 DRAW_CONTEXT
*in_pDC
,
1110 const SIMDSCALAR
* in_pVertData
,
1111 uint32_t in_attributeStrideInVectors
,
1112 uint32_t in_numAttributes
,
1113 uint32_t* (&in_ppIndices
)[3],
1114 uint32_t in_numPrims
,
1115 PRIMITIVE_TOPOLOGY in_binTopology
) :
1117 PA_STATE(in_pDC
, nullptr, 0),
1118 m_pVertexData(in_pVertData
),
1119 m_attributeStrideInVectors(in_attributeStrideInVectors
),
1120 m_numAttributes(in_numAttributes
),
1121 m_numPrims(in_numPrims
)
// Zero the per-lane primitive-id vector for whichever SIMD width is built.
1123 #if USE_SIMD16_FRONTEND
1124 m_vPrimId
= _simd16_setzero_si();
1126 m_vPrimId
= _simd_setzero_si();
// Record the binning topology and copy the three index stream pointers.
1128 binTopology
= in_binTopology
;
1129 m_ppIndices
[0] = in_ppIndices
[0];
1130 m_ppIndices
[1] = in_ppIndices
[1];
1131 m_ppIndices
[2] = in_ppIndices
[2];
// Derive verts-per-prim from the topology: 1 for points, 2 for the
// line topology, 3 for triangles; anything else is invalid here.
1133 switch (binTopology
)
1135 case TOP_POINT_LIST
:
1136 m_numVertsPerPrim
= 1;
// NOTE(review): the case label for the 2-vertex (line) topology was lost in
// extraction; only its assignment survives.
1140 m_numVertsPerPrim
= 2;
1143 case TOP_TRIANGLE_LIST
:
1144 m_numVertsPerPrim
= 3;
1148 SWR_ASSERT(0, "Invalid binTopology (%d) for %s", binTopology
, __FUNCTION__
);
// Work remains while primitives are left to assemble.
// NOTE(review): the enclosing signature line (presumably HasWork()) was lost
// in extraction -- confirm against the original header.
1155 return m_numPrims
!= 0;
// Raw SIMD-vector access is not supported for tessellation PA: assert and
// return a static dummy so the reference stays valid.
1158 simdvector
& GetSimdVector(uint32_t index
, uint32_t slot
)
1160 SWR_ASSERT(0, "%s NOT IMPLEMENTED", __FUNCTION__
);
1161 static simdvector junk
;
// simd16 variant of the stub above.
1165 #if ENABLE_AVX512_SIMD16
1166 simd16vector
& GetSimdVector_simd16(uint32_t index
, uint32_t slot
)
1168 SWR_ASSERT(0, "%s NOT IMPLEMENTED", __FUNCTION__
);
1169 static simd16vector junk
;
// Builds a lane mask with the first numPrims lanes set to -1: the table holds
// SIMD_WIDTH leading -1s followed by SIMD_WIDTH zeros, and an unaligned load
// at offset (SIMD_WIDTH - numPrims) slides the active window into place.
1174 static SIMDSCALARI
GenPrimMask(uint32_t numPrims
)
1176 SWR_ASSERT(numPrims
<= SIMD_WIDTH
);
1177 #if USE_SIMD16_FRONTEND
1178 static const OSALIGNLINE(int32_t) maskGen
[SIMD_WIDTH
* 2] =
1180 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
1181 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
1184 return _simd16_loadu_si((const SIMDSCALARI
*)&maskGen
[SIMD_WIDTH
- numPrims
]);
1186 static const OSALIGNLINE(int32_t) maskGen
[SIMD_WIDTH
* 2] =
1188 -1, -1, -1, -1, -1, -1, -1, -1,
1189 0, 0, 0, 0, 0, 0, 0, 0
1192 return _simd_loadu_si((const SIMDSCALARI
*)&maskGen
[SIMD_WIDTH
- numPrims
]);
// Assemble(): gathers all 4 components of attribute `slot` for each vertex of
// every active primitive, using a masked gather so inactive lanes are left
// zeroed.  NOTE(review): several argument lines of the gather calls (base
// pointer / index vector / mask) are missing from this extraction.
1196 bool Assemble(uint32_t slot
, simdvector verts
[])
1198 SWR_ASSERT(slot
< m_numAttributes
);
1200 uint32_t numPrimsToAssemble
= PA_TESS::NumPrims();
1201 if (0 == numPrimsToAssemble
)
// Mask off lanes beyond the primitives actually being assembled.
1206 SIMDSCALARI mask
= GenPrimMask(numPrimsToAssemble
);
// Attribute data is SoA: slot's data begins slot * stride * 4 vectors in.
1208 const float* pBaseAttrib
= (const float*)&m_pVertexData
[slot
* m_attributeStrideInVectors
* 4];
1209 for (uint32_t i
= 0; i
< m_numVertsPerPrim
; ++i
)
// Load the index vector selecting vertex i of each primitive.
1211 #if USE_SIMD16_FRONTEND
1212 SIMDSCALARI indices
= _simd16_load_si((const SIMDSCALARI
*)m_ppIndices
[i
]);
1214 SIMDSCALARI indices
= _simd_load_si((const SIMDSCALARI
*)m_ppIndices
[i
]);
1217 const float* pBase
= pBaseAttrib
;
1218 for (uint32_t c
= 0; c
< 4; ++c
)
1220 #if USE_SIMD16_FRONTEND
1221 simd16scalar temp
= _simd16_mask_i32gather_ps(
1222 _simd16_setzero_ps(),
1226 4 /* gcc doesn't like sizeof(float) */);
// simd16 builds write back only the half selected by useAlternateOffset.
1228 verts
[i
].v
[c
] = useAlternateOffset
? temp
.hi
: temp
.lo
;
1230 verts
[i
].v
[c
] = _simd_mask_i32gather_ps(
1234 _simd_castsi_ps(mask
),
1235 4 /* gcc doesn't like sizeof(float) */);
// Step to the next component's SIMD-wide row of the SoA layout.
1237 pBase
+= m_attributeStrideInVectors
* SIMD_WIDTH
;
// Assemble_simd16(): simd16-wide variant of Assemble().  On non-simd16
// builds the gather fills .lo and .hi is explicitly zeroed.
// NOTE(review): as above, some gather argument lines are missing here.
1244 #if ENABLE_AVX512_SIMD16
1245 bool Assemble_simd16(uint32_t slot
, simd16vector verts
[])
1247 SWR_ASSERT(slot
< m_numAttributes
);
1249 uint32_t numPrimsToAssemble
= PA_TESS::NumPrims();
1250 if (0 == numPrimsToAssemble
)
1255 SIMDSCALARI mask
= GenPrimMask(numPrimsToAssemble
);
1257 const float* pBaseAttrib
= (const float*)&m_pVertexData
[slot
* m_attributeStrideInVectors
* 4];
1258 for (uint32_t i
= 0; i
< m_numVertsPerPrim
; ++i
)
1260 #if USE_SIMD16_FRONTEND
1261 SIMDSCALARI indices
= _simd16_load_si((const SIMDSCALARI
*)m_ppIndices
[i
]);
1263 SIMDSCALARI indices
= _simd_load_si((const SIMDSCALARI
*)m_ppIndices
[i
]);
1266 const float* pBase
= pBaseAttrib
;
1267 for (uint32_t c
= 0; c
< 4; ++c
)
1269 #if USE_SIMD16_FRONTEND
1270 verts
[i
].v
[c
] = _simd16_mask_i32gather_ps(
1271 _simd16_setzero_ps(),
1275 4 /* gcc doesn't like sizeof(float) */);
1277 verts
[i
].v
[c
].lo
= _simd_mask_i32gather_ps(
1281 _simd_castsi_ps(mask
),
1282 4 /* gcc doesn't like sizeof(float) */);
// Upper half has no data on non-simd16 builds; keep it zero.
1283 verts
[i
].v
[c
].hi
= _simd_setzero_ps();
1285 pBase
+= m_attributeStrideInVectors
* SIMD_WIDTH
;
// AssembleSingle(): scalar path -- copies the 4 components of attribute
// `slot` for each vertex of one primitive (primIndex) into __m128 verts[].
1293 void AssembleSingle(uint32_t slot
, uint32_t primIndex
, __m128 verts
[])
1295 SWR_ASSERT(slot
< m_numAttributes
);
1296 SWR_ASSERT(primIndex
< PA_TESS::NumPrims());
1298 const float* pVertDataBase
= (const float*)&m_pVertexData
[slot
* m_attributeStrideInVectors
* 4];
1299 for (uint32_t i
= 0; i
< m_numVertsPerPrim
; ++i
)
// simd16 builds select the alternate half's index when requested.
1301 #if USE_SIMD16_FRONTEND
1302 uint32_t index
= useAlternateOffset
? m_ppIndices
[i
][primIndex
+ SIMD_WIDTH_DIV2
] : m_ppIndices
[i
][primIndex
];
1304 uint32_t index
= m_ppIndices
[i
][primIndex
];
1306 const float* pVertData
= pVertDataBase
;
1307 float* pVert
= (float*)&verts
[i
];
// Walk the SoA rows: one float per component at offset `index`.
1309 for (uint32_t c
= 0; c
< 4; ++c
)
1311 pVert
[c
] = pVertData
[index
];
1312 pVertData
+= m_attributeStrideInVectors
* SIMD_WIDTH
;
// Consume the primitives just assembled: shrink the outstanding count and
// advance all three index streams past them.
// NOTE(review): the enclosing function signature was lost in extraction --
// confirm which member (presumably the next-prim/advance hook) this body
// belongs to.
1319 uint32_t numPrims
= PA_TESS::NumPrims();
1320 m_numPrims
-= numPrims
;
1321 m_ppIndices
[0] += numPrims
;
1322 m_ppIndices
[1] += numPrims
;
1323 m_ppIndices
[2] += numPrims
;
// VS-output, stream-output and VS-index queries do not apply to the
// tessellation PA: assert and return a static dummy where a reference is
// required.
1328 SIMDVERTEX
& GetNextVsOutput()
1330 SWR_ASSERT(0, "%s", __FUNCTION__
);
1331 static SIMDVERTEX junk
;
1335 bool GetNextStreamOutput()
1337 SWR_ASSERT(0, "%s", __FUNCTION__
);
1341 SIMDMASK
& GetNextVsIndices()
1343 SWR_ASSERT(0, "%s", __FUNCTION__
);
1344 static SIMDMASK junk
;
// At most SIMD_WIDTH primitives are assembled per pass.
// NOTE(review): signature line lost in extraction; this is called above as
// PA_TESS::NumPrims().
1350 return std::min
<uint32_t>(m_numPrims
, SIMD_WIDTH
);
// Reset is not supported for this PA type.
1353 void Reset() { SWR_ASSERT(0); };
// Per-lane primitive IDs: startID broadcast plus the lane offsets held in
// m_vPrimId.
1355 SIMDSCALARI
GetPrimID(uint32_t startID
)
1357 #if USE_SIMD16_FRONTEND
1358 return _simd16_add_epi32(_simd16_set1_epi32(startID
), m_vPrimId
);
1360 return _simd_add_epi32(_simd_set1_epi32(startID
), m_vPrimId
);
// SoA domain-shader output this PA reads from (not owned).
1365 const SIMDSCALAR
* m_pVertexData
= nullptr;
// Distance between attributes, measured in SIMD vectors.
1366 uint32_t m_attributeStrideInVectors
= 0;
// Attribute count; Assemble*/AssembleSingle bounds-check `slot` against it.
1367 uint32_t m_numAttributes
= 0;
// Primitives still outstanding for this draw.
1368 uint32_t m_numPrims
= 0;
// Three per-vertex index streams, advanced as primitives are consumed.
1369 uint32_t* m_ppIndices
[3];
// 1 (points), 2 (lines) or 3 (triangles), set from the topology in the ctor.
1371 uint32_t m_numVertsPerPrim
= 0;
// Per-lane primitive-id offsets; zeroed in the constructor.
1373 SIMDSCALARI m_vPrimId
;
1376 // Primitive Assembler factory class, responsible for creating and initializing the correct assembler
1378 template <typename IsIndexedT
, typename IsCutIndexEnabledT
>
// Constructor: selects and placement-constructs the appropriate primitive
// assembler.  Indexed draws with cut-index (primitive restart) enabled on the
// common topologies, and adjacency topologies, take the cut-aware PA
// (PA_STATE_CUT); the visible tail constructs the optimized PA (PA_STATE_OPT).
// NOTE(review): this chunk's extraction is lossy -- braces, the else-branch
// structure and the cutPA flag assignment are not visible here; comments
// annotate only the visible tokens.
1381 PA_FACTORY(DRAW_CONTEXT
* pDC
, PRIMITIVE_TOPOLOGY in_topo
, uint32_t numVerts
) : topo(in_topo
)
1383 #if KNOB_ENABLE_CUT_AWARE_PA == TRUE
1384 const API_STATE
& state
= GetApiState(pDC
);
// Cut-aware PA is required for indexed + cut-index draws on the listed
// topologies, or (per the in-source comment) for any adjacency topology.
1385 if ((IsIndexedT::value
&& IsCutIndexEnabledT::value
&& (
1386 topo
== TOP_TRIANGLE_STRIP
|| topo
== TOP_POINT_LIST
||
1387 topo
== TOP_LINE_LIST
|| topo
== TOP_LINE_STRIP
||
1388 topo
== TOP_TRIANGLE_LIST
)) ||
1390 // non-indexed draws with adjacency topologies must use cut-aware PA until we add support
1391 // for them in the optimized PA
1392 (topo
== TOP_LINE_LIST_ADJ
|| topo
== TOP_LISTSTRIP_ADJ
|| topo
== TOP_TRI_LIST_ADJ
|| topo
== TOP_TRI_STRIP_ADJ
))
// Zero the index backing store before handing it to the cut-aware PA.
1394 memset(&indexStore
, 0, sizeof(indexStore
));
1395 uint32_t numAttribs
= state
.feNumAttributes
;
// Placement-construct the cut-aware PA over this factory's backing storage.
1397 new (&this->paCut
) PA_STATE_CUT(pDC
, (uint8_t*)&this->vertexStore
[0], MAX_NUM_VERTS_PER_PRIM
* PA_STATE::SIMD_WIDTH
,
1398 &this->indexStore
[0], numVerts
, numAttribs
, state
.topology
, false);
// Optimized-PA path: derive the primitive count from topology + vertex count
// and placement-construct PA_STATE_OPT over the same vertex storage.
1404 uint32_t numPrims
= GetNumPrims(in_topo
, numVerts
);
1405 new (&this->paOpt
) PA_STATE_OPT(pDC
, numPrims
, (uint8_t*)&this->vertexStore
[0], MAX_NUM_VERTS_PER_PRIM
* PA_STATE::SIMD_WIDTH
, false);
1413 #if KNOB_ENABLE_CUT_AWARE_PA == TRUE
// True when the cut-aware PA was selected.  NOTE(review): the assignment that
// sets this is not visible in this extraction -- presumably done in the
// constructor's cut-aware branch; confirm against the original file.
1427 bool cutPA
{ false };
// Topology of the current draw, captured by the constructor's init list.
1429 PRIMITIVE_TOPOLOGY topo
{ TOP_UNKNOWN
};
// Backing vertex storage handed (as uint8_t*) to both PA implementations.
1431 PA_STATE::SIMDVERTEX vertexStore
[MAX_NUM_VERTS_PER_PRIM
];
// Backing index storage; zeroed and handed to the cut-aware PA only.
1432 PA_STATE::SIMDMASK indexStore
[MAX_NUM_VERTS_PER_PRIM
];