1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
25 * @brief Definitions for primitive assembly.
26 * N primitives are assembled at a time, where N is the SIMD width.
27 * A state machine, that is specific for a given topology, drives the
28 * assembly of vertices into triangles.
30 ******************************************************************************/
// State common to every primitive assembler: the owning draw context, the base of the
// vertex stream, the stream size in verts and the topology handed to the binner.
37 DRAW_CONTEXT
*pDC
; // draw context
38 uint8_t* pStreamBase
; // vertex stream
39 uint32_t streamSizeInVerts
; // total size of the input stream in verts
41 // The topology the binner will use. In some cases the FE changes the topology from the api state.
42 PRIMITIVE_TOPOLOGY binTopology
;
// Stores the draw context, the stream base pointer and the stream size.
// binTopology is not in the initializer list, so this constructor leaves it unset.
45 PA_STATE(DRAW_CONTEXT
*in_pDC
, uint8_t* in_pStreamBase
, uint32_t in_streamSizeInVerts
) :
46 pDC(in_pDC
), pStreamBase(in_pStreamBase
), streamSizeInVerts(in_streamSizeInVerts
) {}
// Pure-virtual assembler interface. PA_STATE_OPT, PA_STATE_CUT and PA_TESS (later in this
// file) derive from this type and provide the implementations.
48 virtual bool HasWork() = 0;
49 virtual simdvector
& GetSimdVector(uint32_t index
, uint32_t slot
) = 0;
50 virtual bool Assemble(uint32_t slot
, simdvector verts
[]) = 0;
51 virtual void AssembleSingle(uint32_t slot
, uint32_t primIndex
, __m128 verts
[]) = 0;
52 virtual bool NextPrim() = 0;
53 virtual simdvertex
& GetNextVsOutput() = 0;
54 virtual bool GetNextStreamOutput() = 0;
55 virtual simdmask
& GetNextVsIndices() = 0;
56 virtual uint32_t NumPrims() = 0;
57 virtual void Reset() = 0;
58 virtual simdscalari
GetPrimID(uint32_t startID
) = 0;
61 // The Optimized PA is a state machine that assembles triangles from vertex shader simd
62 // output. Here is the sequence
63 // 1. Execute FS/VS to generate a simd vertex (4 vertices for SSE simd and 8 for AVX simd).
64 // 2. Execute PA function to assemble and bin triangles.
65 // a. The PA function is a set of functions that collectively make up the
66 // state machine for a given topology.
67 // 1. We use a state index to track which PA function to call.
68 // b. Often the PA function needs 2 simd vertices in order to assemble the next triangle.
69 // 1. We call this the current and previous simd vertex.
70 // 2. The SSE simd is 4-wide which is not a multiple of 3 needed for triangles. In
71 // order to assemble the second triangle, for a triangle list, we'll need the
72 // last vertex from the previous simd and the first 2 vertices from the current simd.
73 // 3. At times the PA can assemble multiple triangles from the 2 simd vertices.
75 // This optimized PA is not cut aware, so it should only be used by non-indexed draws or draws without
// Optimized (non cut-aware) primitive assembler state. Derives from PA_STATE and tracks
// the current state-machine function plus the values to apply on the next advance.
77 struct PA_STATE_OPT
: public PA_STATE
79 simdvertex leadingVertex
; // For tri-fan
80 uint32_t numPrims
; // Total number of primitives for draw.
81 uint32_t numPrimsComplete
; // Total number of complete primitives.
83 uint32_t numSimdPrims
; // Number of prims in current simd.
85 uint32_t cur
; // index to current VS output.
86 uint32_t prev
; // index to prev VS output. Not really needed in the state.
87 uint32_t first
; // index to first VS output. Used for trifan.
89 uint32_t counter
; // state counter
90 bool reset
; // reset state
92 uint32_t primIDIncr
; // how much to increment for each vector (typically vector / {1, 2})
// Signature of a state function that assembles a SIMD batch of primitives for one
// attribute slot; the bool result is what Assemble() returns to its caller.
95 typedef bool(*PFN_PA_FUNC
)(PA_STATE_OPT
& state
, uint32_t slot
, simdvector verts
[]);
// Signature of a state function that assembles the single primitive `primIndex`.
96 typedef void(*PFN_PA_SINGLE_FUNC
)(PA_STATE_OPT
& pa
, uint32_t slot
, uint32_t primIndex
, __m128 verts
[]);
98 PFN_PA_FUNC pfnPaFunc
; // PA state machine function for assembling 4 triangles.
99 PFN_PA_SINGLE_FUNC pfnPaSingleFunc
; // PA state machine function for assembling single triangle.
100 PFN_PA_FUNC pfnPaFuncReset
; // initial state to set on reset
102 // state used to advance the PA when Next is called
103 PFN_PA_FUNC pfnPaNextFunc
;
104 uint32_t nextNumSimdPrims
;
105 uint32_t nextNumPrimsIncrement
;
109 simdmask tmpIndices
; // temporary index store for unused virtual function
// Constructor declaration only; the definition is not part of this view.
// `topo` defaults to TOP_UNKNOWN.
112 PA_STATE_OPT(DRAW_CONTEXT
* pDC
, uint32_t numPrims
, uint8_t* pStream
, uint32_t streamSizeInVerts
,
113 bool in_isStreaming
, PRIMITIVE_TOPOLOGY topo
= TOP_UNKNOWN
);
117 return (this->numPrimsComplete
< this->numPrims
) ? true : false;
// Views pStreamBase as an array of simdvertex and returns a reference to attribute
// `slot` of element `index`. No bounds checking is performed here.
120 simdvector
& GetSimdVector(uint32_t index
, uint32_t slot
)
122 simdvertex
* pVertex
= (simdvertex
*)pStreamBase
;
123 return pVertex
[index
].attrib
[slot
];
126 // Assembles 4 triangles. Each simdvector is a single vertex from 4
127 // triangles (xxxx yyyy zzzz wwww) and there are 3 verts per triangle.
// Forwards to the current state function (pfnPaFunc) and returns its result.
128 bool Assemble(uint32_t slot
, simdvector verts
[])
130 return this->pfnPaFunc(*this, slot
, verts
);
133 // Assembles 1 primitive. Each simdscalar is a vertex (xyzw).
// Forwards to the current single-primitive state function (pfnPaSingleFunc).
134 void AssembleSingle(uint32_t slot
, uint32_t primIndex
, __m128 verts
[])
136 return this->pfnPaSingleFunc(*this, slot
, primIndex
, verts
);
141 this->pfnPaFunc
= this->pfnPaNextFunc
;
142 this->numSimdPrims
= this->nextNumSimdPrims
;
143 this->numPrimsComplete
+= this->nextNumPrimsIncrement
;
144 this->reset
= this->nextReset
;
146 if (this->isStreaming
)
151 bool morePrims
= false;
153 if (this->numSimdPrims
> 0)
156 this->numSimdPrims
--;
160 this->counter
= (this->reset
) ? 0 : (this->counter
+ 1);
164 this->pfnPaFunc
= this->pfnPaNextFunc
;
168 morePrims
= false; // no more to do
// Advances the prev/cur indices and returns the simdvertex the next VS output goes into.
// `cur` wraps around the number of whole simdvertices in the stream
// (streamSizeInVerts / KNOB_SIMD_WIDTH), driven by the state counter.
174 simdvertex
& GetNextVsOutput()
176 // increment cur and prev indices
177 const uint32_t numSimdVerts
= this->streamSizeInVerts
/ KNOB_SIMD_WIDTH
;
178 this->prev
= this->cur
; // prev is undefined for first state.
179 this->cur
= this->counter
% numSimdVerts
;
181 simdvertex
* pVertex
= (simdvertex
*)pStreamBase
;
182 return pVertex
[this->cur
];
// Index masks are not used by the optimized PA.
// NOTE(review): the return statement is not visible in this view; presumably it hands
// back the tmpIndices member — confirm.
185 simdmask
& GetNextVsIndices()
187 // unused in optimized PA, pass tmp buffer back
// Stream-output variant of the index advance: prev takes the old cur, and cur takes the
// state counter directly (no wrap, unlike GetNextVsOutput).
// NOTE(review): the return statement is not visible in this view.
191 bool GetNextStreamOutput()
193 this->prev
= this->cur
;
194 this->cur
= this->counter
;
201 return (this->numPrimsComplete
+ this->nextNumPrimsIncrement
> this->numPrims
) ?
202 (KNOB_SIMD_WIDTH
- (this->numPrimsComplete
+ this->nextNumPrimsIncrement
- this->numPrims
)) : KNOB_SIMD_WIDTH
;
// Records the state to switch to on the next advance: the next SIMD function, the next
// SIMD prim count, the prim-count increment and the reset flag are stored in the
// pfnPaNextFunc / next* fields. The single-primitive function is installed immediately.
// NOTE(review): the `reset` parameter declaration is not visible in this view.
205 void SetNextState(PA_STATE_OPT::PFN_PA_FUNC pfnPaNextFunc
,
206 PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc
,
207 uint32_t numSimdPrims
= 0,
208 uint32_t numPrimsIncrement
= 0,
211 this->pfnPaNextFunc
= pfnPaNextFunc
;
212 this->nextNumSimdPrims
= numSimdPrims
;
213 this->nextNumPrimsIncrement
= numPrimsIncrement
;
214 this->nextReset
= reset
;
216 this->pfnPaSingleFunc
= pfnPaNextSingleFunc
;
221 this->pfnPaFunc
= this->pfnPaFuncReset
;
222 this->numPrimsComplete
= 0;
223 this->numSimdPrims
= 0;
// Returns the per-lane primitive IDs for the current SIMD batch: the primID vector plus
// a broadcast of startID + primIDIncr * (numPrimsComplete / KNOB_SIMD_WIDTH).
231 simdscalari
GetPrimID(uint32_t startID
)
233 return _simd_add_epi32(this->primID
,
234 _simd_set1_epi32(startID
+ this->primIDIncr
* (this->numPrimsComplete
/ KNOB_SIMD_WIDTH
)));
238 // helper C wrappers to avoid having to rewrite all the PA topology state functions
// Free-function form of PA_STATE_OPT::SetNextState; forwards every argument unchanged.
// NOTE(review): the `reset` parameter declaration is not visible in this view.
239 INLINE
void SetNextPaState(PA_STATE_OPT
& pa
, PA_STATE_OPT::PFN_PA_FUNC pfnPaNextFunc
,
240 PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc
,
241 uint32_t numSimdPrims
= 0,
242 uint32_t numPrimsIncrement
= 0,
245 return pa
.SetNextState(pfnPaNextFunc
, pfnPaNextSingleFunc
, numSimdPrims
, numPrimsIncrement
, reset
);
// Free-function form of PA_STATE::GetSimdVector; dispatches virtually on `pa`.
247 INLINE simdvector
& PaGetSimdVector(PA_STATE
& pa
, uint32_t index
, uint32_t slot
)
249 return pa
.GetSimdVector(index
, slot
);
// Transposes SIMD lane 0 out of an SoA simdvector: returns {a.x[0], a.y[0], a.z[0], a.w[0]}.
// unpacklo(x,z) and unpacklo(y,w) are interleaved with unpacklo, then the low 128 bits are taken.
252 INLINE __m128
swizzleLane0(const simdvector
&a
)
254 simdscalar tmp0
= _mm256_unpacklo_ps(a
.x
, a
.z
);
255 simdscalar tmp1
= _mm256_unpacklo_ps(a
.y
, a
.w
);
256 return _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0
, tmp1
), 0);
// Transposes SIMD lane 1 out of an SoA simdvector: returns {a.x[1], a.y[1], a.z[1], a.w[1]}.
// Same low-half interleave as lane 0, but the final interleave is unpackhi.
259 INLINE __m128
swizzleLane1(const simdvector
&a
)
261 simdscalar tmp0
= _mm256_unpacklo_ps(a
.x
, a
.z
);
262 simdscalar tmp1
= _mm256_unpacklo_ps(a
.y
, a
.w
);
263 return _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0
, tmp1
), 0);
// Transposes SIMD lane 2 out of an SoA simdvector: returns {a.x[2], a.y[2], a.z[2], a.w[2]}.
// unpackhi selects elements 2-3 of each 128-bit half; unpacklo then picks element 2.
266 INLINE __m128
swizzleLane2(const simdvector
&a
)
268 simdscalar tmp0
= _mm256_unpackhi_ps(a
.x
, a
.z
);
269 simdscalar tmp1
= _mm256_unpackhi_ps(a
.y
, a
.w
);
270 return _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0
, tmp1
), 0);
// Transposes SIMD lane 3 out of an SoA simdvector: returns {a.x[3], a.y[3], a.z[3], a.w[3]}.
273 INLINE __m128
swizzleLane3(const simdvector
&a
)
275 simdscalar tmp0
= _mm256_unpackhi_ps(a
.x
, a
.z
);
276 simdscalar tmp1
= _mm256_unpackhi_ps(a
.y
, a
.w
);
277 return _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0
, tmp1
), 0);
// Transposes SIMD lane 4 out of an SoA simdvector: returns {a.x[4], a.y[4], a.z[4], a.w[4]}.
// Same shuffle as lane 0, but the upper 128-bit half (index 1) is extracted.
280 INLINE __m128
swizzleLane4(const simdvector
&a
)
282 simdscalar tmp0
= _mm256_unpacklo_ps(a
.x
, a
.z
);
283 simdscalar tmp1
= _mm256_unpacklo_ps(a
.y
, a
.w
);
284 return _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0
, tmp1
), 1);
// Transposes SIMD lane 5 out of an SoA simdvector: returns {a.x[5], a.y[5], a.z[5], a.w[5]}.
// Same shuffle as lane 1, but the upper 128-bit half (index 1) is extracted.
288 INLINE __m128
swizzleLane5(const simdvector
&a
)
290 simdscalar tmp0
= _mm256_unpacklo_ps(a
.x
, a
.z
);
291 simdscalar tmp1
= _mm256_unpacklo_ps(a
.y
, a
.w
);
292 return _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0
, tmp1
), 1);
// Transposes SIMD lane 6 out of an SoA simdvector: returns {a.x[6], a.y[6], a.z[6], a.w[6]}.
// Same shuffle as lane 2, but the upper 128-bit half (index 1) is extracted.
295 INLINE __m128
swizzleLane6(const simdvector
&a
)
297 simdscalar tmp0
= _mm256_unpackhi_ps(a
.x
, a
.z
);
298 simdscalar tmp1
= _mm256_unpackhi_ps(a
.y
, a
.w
);
299 return _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0
, tmp1
), 1);
// Transposes SIMD lane 7 out of an SoA simdvector: returns {a.x[7], a.y[7], a.z[7], a.w[7]}.
// Same shuffle as lane 3, but the upper 128-bit half (index 1) is extracted.
302 INLINE __m128
swizzleLane7(const simdvector
&a
)
304 simdscalar tmp0
= _mm256_unpackhi_ps(a
.x
, a
.z
);
305 simdscalar tmp1
= _mm256_unpackhi_ps(a
.y
, a
.w
);
306 return _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0
, tmp1
), 1);
// Runtime-indexed front end for swizzleLane0..swizzleLane7; the last return yields an
// all-zero vector.
// NOTE(review): the construct that maps `lane` to each call is not visible in this view;
// presumably a switch over 0..7 with the zero vector as the fallback — confirm.
309 INLINE __m128
swizzleLaneN(const simdvector
&a
, int lane
)
313 return swizzleLane0(a
);
315 return swizzleLane1(a
);
317 return swizzleLane2(a
);
319 return swizzleLane3(a
);
321 return swizzleLane4(a
);
323 return swizzleLane5(a
);
325 return swizzleLane6(a
);
327 return swizzleLane7(a
);
329 return _mm_setzero_ps();
333 // Cut-aware primitive assembler.
// Derives from PA_STATE. Vertices are consumed one at a time by a per-topology member
// function (pfnPa) that fills the `indices` gather table, one column per assembled prim.
334 struct PA_STATE_CUT
: public PA_STATE
336 simdmask
* pCutIndices
; // cut indices buffer, 1 bit per vertex
337 uint32_t numVerts
; // number of vertices available in buffer store
338 uint32_t numAttribs
; // number of attributes
339 int32_t numRemainingVerts
; // number of verts remaining to be assembled
340 uint32_t numVertsToAssemble
; // total number of verts to assemble for the draw
341 OSALIGNSIMD(uint32_t) indices
[MAX_NUM_VERTS_PER_PRIM
][KNOB_SIMD_WIDTH
]; // current index buffer for gather
342 simdscalari vOffsets
[MAX_NUM_VERTS_PER_PRIM
]; // byte offsets for currently assembling simd
343 uint32_t numPrimsAssembled
; // number of primitives that are fully assembled
344 uint32_t headVertex
; // current unused vertex slot in vertex buffer store
345 uint32_t tailVertex
; // beginning vertex currently assembling
346 uint32_t curVertex
; // current unprocessed vertex
347 uint32_t startPrimId
; // starting prim id
348 simdscalari vPrimId
; // vector of prim ID
349 bool needOffsets
; // need to compute gather offsets for current SIMD
350 uint32_t vertsPerPrim
;
351 simdvertex tmpVertex
; // temporary simdvertex for unimplemented API
352 bool processCutVerts
; // vertex indices with cuts should be processed as normal, otherwise they
353 // are ignored. Fetch shader sends invalid verts on cuts that should be ignored
354 // while the GS sends valid verts for every index
355 // Topology state tracking
356 uint32_t vert
[MAX_NUM_VERTS_PER_PRIM
];
358 bool reverseWinding
; // indicates reverse winding for strips
359 int32_t adjExtraVert
; // extra vert used for tristrip w/ adj; -1 means none pending
// Pointer-to-member type of the per-topology handlers (the ProcessVert* methods).
361 typedef void(PA_STATE_CUT::* PFN_PA_FUNC
)(uint32_t vert
, bool finish
);
362 PFN_PA_FUNC pfnPa
; // per-topology function that processes a single vert
// Initializes the cut-aware assembler and selects the per-topology handler.
// - numVerts (capacity of the vertex store) is taken from in_streamSizeInVerts, while
//   in_numVerts is the number of verts to assemble for the draw.
// - vertsPerPrim and several handler choices depend on whether the GS is enabled.
// NOTE(review): the selection statement enclosing the `case` labels is not visible in
// this view; presumably a switch on `topo` — confirm.
365 PA_STATE_CUT(DRAW_CONTEXT
* pDC
, uint8_t* in_pStream
, uint32_t in_streamSizeInVerts
, simdmask
* in_pIndices
, uint32_t in_numVerts
,
366 uint32_t in_numAttribs
, PRIMITIVE_TOPOLOGY topo
, bool in_processCutVerts
)
367 : PA_STATE(pDC
, in_pStream
, in_streamSizeInVerts
)
369 numVerts
= in_streamSizeInVerts
;
370 numAttribs
= in_numAttribs
;
373 processCutVerts
= in_processCutVerts
;
375 numVertsToAssemble
= numRemainingVerts
= in_numVerts
;
376 numPrimsAssembled
= 0;
377 headVertex
= tailVertex
= curVertex
= 0;
380 pCutIndices
= in_pIndices
;
381 memset(indices
, 0, sizeof(indices
));
// _mm256_set_epi32 lists elements from highest to lowest, so lane i holds the value i.
382 vPrimId
= _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
383 reverseWinding
= false;
386 bool gsEnabled
= pDC
->pState
->state
.gsState
.gsEnable
;
387 vertsPerPrim
= NumVertsPerPrim(topo
, gsEnabled
);
391 case TOP_TRIANGLE_LIST
: pfnPa
= &PA_STATE_CUT::ProcessVertTriList
; break;
392 case TOP_TRI_LIST_ADJ
: pfnPa
= gsEnabled
? &PA_STATE_CUT::ProcessVertTriListAdj
: &PA_STATE_CUT::ProcessVertTriListAdjNoGs
; break;
393 case TOP_TRIANGLE_STRIP
: pfnPa
= &PA_STATE_CUT::ProcessVertTriStrip
; break;
394 case TOP_TRI_STRIP_ADJ
: if (gsEnabled
)
396 pfnPa
= &PA_STATE_CUT::ProcessVertTriStripAdj
< true > ;
400 pfnPa
= &PA_STATE_CUT::ProcessVertTriStripAdj
< false > ;
404 case TOP_POINT_LIST
: pfnPa
= &PA_STATE_CUT::ProcessVertPointList
; break;
405 case TOP_LINE_LIST
: pfnPa
= &PA_STATE_CUT::ProcessVertLineList
; break;
406 case TOP_LINE_LIST_ADJ
: pfnPa
= gsEnabled
? &PA_STATE_CUT::ProcessVertLineListAdj
: &PA_STATE_CUT::ProcessVertLineListAdjNoGs
; break;
407 case TOP_LINE_STRIP
: pfnPa
= &PA_STATE_CUT::ProcessVertLineStrip
; break;
408 case TOP_LISTSTRIP_ADJ
: pfnPa
= gsEnabled
? &PA_STATE_CUT::ProcessVertLineStripAdj
: &PA_STATE_CUT::ProcessVertLineStripAdjNoGs
; break;
409 default: assert(0 && "Unimplemented topology");
// Returns the simdvertex at the current head of the vertex store, then advances the head
// by one SIMD width (wrapping at numVerts) and flags that gather offsets must be recomputed.
// The returned slot is the one addressed before the advance.
413 simdvertex
& GetNextVsOutput()
415 uint32_t vertexIndex
= this->headVertex
/ KNOB_SIMD_WIDTH
;
416 this->headVertex
= (this->headVertex
+ KNOB_SIMD_WIDTH
) % this->numVerts
;
417 this->needOffsets
= true;
418 return ((simdvertex
*)pStreamBase
)[vertexIndex
];
// Returns the cut-index mask that corresponds to the current head of the vertex store.
// Does not advance headVertex.
421 simdmask
& GetNextVsIndices()
423 uint32_t vertexIndex
= this->headVertex
/ KNOB_SIMD_WIDTH
;
424 simdmask
* pCurCutIndex
= this->pCutIndices
+ vertexIndex
;
425 return *pCurCutIndex
;
// Not implemented for the cut-aware PA: asserts, and returns attribute 0 of tmpVertex so
// the reference return type is satisfied. `index` and `slot` are ignored.
428 simdvector
& GetSimdVector(uint32_t index
, uint32_t slot
)
431 SWR_ASSERT(0 && "Not implemented");
432 return this->tmpVertex
.attrib
[0];
// Advances the head by one SIMD width (no wrap here, unlike GetNextVsOutput) and flags
// that gather offsets must be recomputed.
// NOTE(review): the return statement is not visible in this view.
435 bool GetNextStreamOutput()
437 this->headVertex
+= KNOB_SIMD_WIDTH
;
438 this->needOffsets
= true;
// Returns startID broadcast to every lane plus the running per-lane prim ID vector.
442 simdscalari
GetPrimID(uint32_t startID
)
444 return _simd_add_epi32(_simd_set1_epi32(startID
), this->vPrimId
);
449 this->numRemainingVerts
= this->numVertsToAssemble
;
450 this->numPrimsAssembled
= 0;
453 this->tailVertex
= 0;
454 this->headVertex
= 0;
455 this->reverseWinding
= false;
456 this->adjExtraVert
= -1;
457 this->vPrimId
= _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
462 return this->numRemainingVerts
> 0 || this->adjExtraVert
!= -1;
// True when advancing the head by one SIMD width (with wrap at numVerts) would land on
// the tail, i.e. there is no room for another simdvertex in the ring.
465 bool IsVertexStoreFull()
467 return ((this->headVertex
+ KNOB_SIMD_WIDTH
) % this->numVerts
) == this->tailVertex
;
// Restarts per-topology tracking: clears the strip winding flag and drops any pending
// extra adjacency vertex.
// NOTE(review): other statements of this function may be missing from this view.
470 void RestartTopology()
473 this->reverseWinding
= false;
474 this->adjExtraVert
= -1;
// Tests the cut bit of `vertex`: selects the simdmask word vertex / KNOB_SIMD_WIDTH and
// tests bit vertex & (KNOB_SIMD_WIDTH - 1) in it. The bit mask equals the remainder of
// the division only when KNOB_SIMD_WIDTH is a power of two.
477 bool IsCutIndex(uint32_t vertex
)
479 uint32_t vertexIndex
= vertex
/ KNOB_SIMD_WIDTH
;
480 uint32_t vertexOffset
= vertex
& (KNOB_SIMD_WIDTH
- 1);
481 return _bittest((const LONG
*)&this->pCutIndices
[vertexIndex
], vertexOffset
) == 1;
484 // iterates across the unprocessed verts until we hit the end or we
485 // have assembled SIMD prims
488 while (this->numPrimsAssembled
!= KNOB_SIMD_WIDTH
&&
489 this->numRemainingVerts
> 0 &&
490 this->curVertex
!= this->headVertex
)
492 // if cut index, restart topology
493 if (IsCutIndex(this->curVertex
))
495 if (this->processCutVerts
)
497 (this->*pfnPa
)(this->curVertex
, false);
499 // finish off tri strip w/ adj before restarting topo
500 if (this->adjExtraVert
!= -1)
502 (this->*pfnPa
)(this->curVertex
, true);
508 (this->*pfnPa
)(this->curVertex
, false);
511 this->curVertex
= (this->curVertex
+ 1) % this->numVerts
;
512 this->numRemainingVerts
--;
515 // special case last primitive for tri strip w/ adj
516 if (this->numPrimsAssembled
!= KNOB_SIMD_WIDTH
&& this->numRemainingVerts
== 0 && this->adjExtraVert
!= -1)
518 (this->*pfnPa
)(this->curVertex
, true);
524 // done with current batch
525 // advance tail to the current unsubmitted vertex
526 this->tailVertex
= this->curVertex
;
527 this->numPrimsAssembled
= 0;
528 this->vPrimId
= _simd_add_epi32(vPrimId
, _simd_set1_epi32(KNOB_SIMD_WIDTH
));
533 // if we've assembled enough prims, we can advance to the next set of verts
534 if (this->numPrimsAssembled
== KNOB_SIMD_WIDTH
|| this->numRemainingVerts
<= 0)
// Converts the gathered vertex indices into byte offsets from the stream base, one offset
// vector per vertex of the primitive:
//   offset = (index >> 3) * sizeof(simdvertex) + (index & 7) * sizeof(float)
// The shift of 3 and mask of 0x7 are hard-coded for an 8-wide SIMD (see the @todo notes).
541 void ComputeOffsets()
543 for (uint32_t v
= 0; v
< this->vertsPerPrim
; ++v
)
545 simdscalari vIndices
= *(simdscalari
*)&this->indices
[v
][0];
547 // step to simdvertex batch
548 const uint32_t simdShift
= 3; // @todo make knob
549 simdscalari vVertexBatch
= _simd_srai_epi32(vIndices
, simdShift
);
550 this->vOffsets
[v
] = _simd_mullo_epi32(vVertexBatch
, _simd_set1_epi32(sizeof(simdvertex
)));
// step to the vertex's lane within that simdvertex
553 const uint32_t simdMask
= 0x7; // @todo make knob
554 simdscalari vVertexIndex
= _simd_and_si(vIndices
, _simd_set1_epi32(simdMask
));
555 this->vOffsets
[v
] = _simd_add_epi32(this->vOffsets
[v
], _simd_mullo_epi32(vVertexIndex
, _simd_set1_epi32(sizeof(float))));
// Gathers attribute `slot` for every vertex of the currently assembled SIMD of prims.
// For each vertex v, the cached byte offsets are biased by slot * sizeof(simdvector) and
// four gathers (scale 1, so offsets are bytes) fetch the components; the base pointer
// advances by KNOB_SIMD_WIDTH floats between components.
// NOTE(review): the bodies of the two `if` statements and the return statements are not
// visible in this view.
559 bool Assemble(uint32_t slot
, simdvector result
[])
561 // process any outstanding verts
564 // return false if we don't have enough prims assembled
565 if (this->numPrimsAssembled
!= KNOB_SIMD_WIDTH
&& this->numRemainingVerts
> 0)
570 // cache off gather offsets given the current SIMD set of indices the first time we get an assemble
571 if (this->needOffsets
)
574 this->needOffsets
= false;
577 for (uint32_t v
= 0; v
< this->vertsPerPrim
; ++v
)
579 simdscalari offsets
= this->vOffsets
[v
];
582 offsets
= _simd_add_epi32(offsets
, _simd_set1_epi32(slot
* sizeof(simdvector
)));
584 float* pBase
= (float*)this->pStreamBase
;
585 for (uint32_t c
= 0; c
< 4; ++c
)
587 result
[v
].v
[c
] = _simd_i32gather_ps(pBase
, offsets
, 1);
589 // move base to next component
590 pBase
+= KNOB_SIMD_WIDTH
;
// Scalar counterpart of Assemble(): fetches attribute `slot` of primitive `triIndex`.
// For each vertex v it reads lane `triIndex` of the cached byte offsets, biases it by
// sizeof(simdvector) * slot, and copies four floats into tri[v], stepping
// KNOB_SIMD_WIDTH * sizeof(float) bytes between components.
// The loop runs to vertsPerPrim, so `tri` must hold at least that many entries.
597 void AssembleSingle(uint32_t slot
, uint32_t triIndex
, __m128 tri
[3])
600 for (uint32_t v
= 0; v
< this->vertsPerPrim
; ++v
)
602 uint32_t* pOffset
= (uint32_t*)&this->vOffsets
[v
];
603 uint32_t offset
= pOffset
[triIndex
];
604 offset
+= sizeof(simdvector
) * slot
;
605 float* pVert
= (float*)&tri
[v
];
606 for (uint32_t c
= 0; c
< 4; ++c
)
608 float* pComponent
= (float*)(this->pStreamBase
+ offset
);
609 pVert
[c
] = *pComponent
;
610 offset
+= KNOB_SIMD_WIDTH
* sizeof(float);
617 return this->numPrimsAssembled
;
620 // Per-topology functions
// Triangle-strip handler: records `index`, and once three verts are collected writes one
// triangle into column numPrimsAssembled of the gather table. Rows 1 and 2 are written
// either as (vert[2], vert[1]) or as (vert[1], vert[2]); afterwards the last two verts
// slide down and the winding flag is toggled. `finish` is not read in the visible code.
// NOTE(review): the condition choosing between the two orders (presumably
// reverseWinding) and the statements that advance curIndex are not visible in this view.
621 void ProcessVertTriStrip(uint32_t index
, bool finish
)
623 this->vert
[this->curIndex
] = index
;
625 if (this->curIndex
== 3)
627 // assembled enough verts for prim, add to gather indices
628 this->indices
[0][this->numPrimsAssembled
] = this->vert
[0];
631 this->indices
[1][this->numPrimsAssembled
] = this->vert
[2];
632 this->indices
[2][this->numPrimsAssembled
] = this->vert
[1];
636 this->indices
[1][this->numPrimsAssembled
] = this->vert
[1];
637 this->indices
[2][this->numPrimsAssembled
] = this->vert
[2];
640 // increment numPrimsAssembled
641 this->numPrimsAssembled
++;
643 // set up next prim state
644 this->vert
[0] = this->vert
[1];
645 this->vert
[1] = this->vert
[2];
647 this->reverseWinding
^= 1;
// Emits the current tri-strip-with-adjacency primitive into column numPrimsAssembled of
// the gather table. Two forms are visible: a 3-row form that temporarily moves
// vert[2]/vert[4] into slots 1/2 (so rows 0..2 receive the original vert[0], vert[2],
// vert[4]) and then moves them back, and a 6-row form that writes vert[0..5] unchanged.
// NOTE(review): the condition selecting between the two forms is not visible in this
// view; presumably it is the gsEnabled template argument — confirm.
651 template<bool gsEnabled
>
652 void AssembleTriStripAdj()
656 this->vert
[1] = this->vert
[2];
657 this->vert
[2] = this->vert
[4];
659 this->indices
[0][this->numPrimsAssembled
] = this->vert
[0];
660 this->indices
[1][this->numPrimsAssembled
] = this->vert
[1];
661 this->indices
[2][this->numPrimsAssembled
] = this->vert
[2];
663 this->vert
[4] = this->vert
[2];
664 this->vert
[2] = this->vert
[1];
668 this->indices
[0][this->numPrimsAssembled
] = this->vert
[0];
669 this->indices
[1][this->numPrimsAssembled
] = this->vert
[1];
670 this->indices
[2][this->numPrimsAssembled
] = this->vert
[2];
671 this->indices
[3][this->numPrimsAssembled
] = this->vert
[3];
672 this->indices
[4][this->numPrimsAssembled
] = this->vert
[4];
673 this->indices
[5][this->numPrimsAssembled
] = this->vert
[5];
675 this->numPrimsAssembled
++;
// Triangle-strip-with-adjacency handler. When `finish` is set and an extra adjacency
// vertex is pending, that vertex becomes vert[3], the final primitive is emitted and the
// pending vertex is cleared. Otherwise behaviour depends on curIndex: incoming indices
// are stored, primitives are emitted through AssembleTriStripAdj, and the six tracked
// verts are re-seeded from a `nextTri` scratch array whose layout depends on
// reverseWinding.
// NOTE(review): the `case` labels of the switch, the declaration of nextTri and the
// curIndex updates are not visible in this view.
// NOTE(review): nextTri[3] is not assigned in any branch visible here, yet both copy
// loops read all six entries — confirm nextTri is initialized where it is declared.
679 template<bool gsEnabled
>
680 void ProcessVertTriStripAdj(uint32_t index
, bool finish
)
682 // handle last primitive of tristrip
683 if (finish
&& this->adjExtraVert
!= -1)
685 this->vert
[3] = this->adjExtraVert
;
686 AssembleTriStripAdj
<gsEnabled
>();
687 this->adjExtraVert
= -1;
691 switch (this->curIndex
)
697 this->vert
[this->curIndex
] = index
;
701 this->vert
[5] = index
;
705 if (this->adjExtraVert
== -1)
707 this->adjExtraVert
= index
;
711 this->vert
[3] = index
;
714 AssembleTriStripAdj
<gsEnabled
>();
717 if (this->reverseWinding
)
719 nextTri
[0] = this->vert
[4];
720 nextTri
[1] = this->vert
[0];
721 nextTri
[2] = this->vert
[2];
722 nextTri
[4] = this->vert
[3];
723 nextTri
[5] = this->adjExtraVert
;
727 nextTri
[0] = this->vert
[2];
728 nextTri
[1] = this->adjExtraVert
;
729 nextTri
[2] = this->vert
[3];
730 nextTri
[4] = this->vert
[4];
731 nextTri
[5] = this->vert
[0];
733 for (uint32_t i
= 0; i
< 6; ++i
)
735 this->vert
[i
] = nextTri
[i
];
738 this->adjExtraVert
= -1;
739 this->reverseWinding
^= 1;
748 SWR_ASSERT(this->adjExtraVert
!= -1, "Algorith failure!");
749 AssembleTriStripAdj
<gsEnabled
>();
752 if (this->reverseWinding
)
754 nextTri
[0] = this->vert
[4];
755 nextTri
[1] = this->vert
[0];
756 nextTri
[2] = this->vert
[2];
757 nextTri
[4] = this->vert
[3];
758 nextTri
[5] = this->adjExtraVert
;
762 nextTri
[0] = this->vert
[2];
763 nextTri
[1] = this->adjExtraVert
;
764 nextTri
[2] = this->vert
[3];
765 nextTri
[4] = this->vert
[4];
766 nextTri
[5] = this->vert
[0];
768 for (uint32_t i
= 0; i
< 6; ++i
)
770 this->vert
[i
] = nextTri
[i
];
772 this->reverseWinding
^= 1;
773 this->adjExtraVert
= index
;
// Triangle-list handler: records `index`, and once three verts are collected writes
// vert[0..2] into column numPrimsAssembled of the gather table and bumps the prim count.
// `finish` is not read in the visible code.
// NOTE(review): the statements that advance and reset curIndex are not visible in this view.
779 void ProcessVertTriList(uint32_t index
, bool finish
)
781 this->vert
[this->curIndex
] = index
;
783 if (this->curIndex
== 3)
785 // assembled enough verts for prim, add to gather indices
786 this->indices
[0][this->numPrimsAssembled
] = this->vert
[0];
787 this->indices
[1][this->numPrimsAssembled
] = this->vert
[1];
788 this->indices
[2][this->numPrimsAssembled
] = this->vert
[2];
790 // increment numPrimsAssembled
791 this->numPrimsAssembled
++;
793 // set up next prim state
// Triangle-list-with-adjacency handler (the constructor selects it when the GS is
// enabled): once six verts are collected, all six are written to gather rows 0..5 of
// column numPrimsAssembled. `finish` is not read in the visible code.
// NOTE(review): the statements that advance and reset curIndex are not visible in this view.
798 void ProcessVertTriListAdj(uint32_t index
, bool finish
)
800 this->vert
[this->curIndex
] = index
;
802 if (this->curIndex
== 6)
804 // assembled enough verts for prim, add to gather indices
805 this->indices
[0][this->numPrimsAssembled
] = this->vert
[0];
806 this->indices
[1][this->numPrimsAssembled
] = this->vert
[1];
807 this->indices
[2][this->numPrimsAssembled
] = this->vert
[2];
808 this->indices
[3][this->numPrimsAssembled
] = this->vert
[3];
809 this->indices
[4][this->numPrimsAssembled
] = this->vert
[4];
810 this->indices
[5][this->numPrimsAssembled
] = this->vert
[5];
812 // increment numPrimsAssembled
813 this->numPrimsAssembled
++;
815 // set up next prim state
// Triangle-list-with-adjacency handler used when the GS is disabled: six verts are still
// collected per primitive, but only vert[0], vert[2] and vert[4] are written to gather
// rows 0..2, so the odd-numbered verts are dropped. `finish` is not read in the visible code.
// NOTE(review): the statements that advance and reset curIndex are not visible in this view.
820 void ProcessVertTriListAdjNoGs(uint32_t index
, bool finish
)
822 this->vert
[this->curIndex
] = index
;
824 if (this->curIndex
== 6)
826 // assembled enough verts for prim, add to gather indices
827 this->indices
[0][this->numPrimsAssembled
] = this->vert
[0];
828 this->indices
[1][this->numPrimsAssembled
] = this->vert
[2];
829 this->indices
[2][this->numPrimsAssembled
] = this->vert
[4];
831 // increment numPrimsAssembled
832 this->numPrimsAssembled
++;
834 // set up next prim state
// Line-list handler: once two verts are collected, writes vert[0] and vert[1] to gather
// rows 0..1 of column numPrimsAssembled and bumps the prim count.
// `finish` is not read in the visible code.
// NOTE(review): the statements that advance and reset curIndex are not visible in this view.
840 void ProcessVertLineList(uint32_t index
, bool finish
)
842 this->vert
[this->curIndex
] = index
;
844 if (this->curIndex
== 2)
846 this->indices
[0][this->numPrimsAssembled
] = this->vert
[0];
847 this->indices
[1][this->numPrimsAssembled
] = this->vert
[1];
849 this->numPrimsAssembled
++;
// Line-strip handler: once two verts are collected, emits the segment (vert[0], vert[1])
// and keeps the second vert as the start of the next segment.
// `finish` is not read in the visible code.
// NOTE(review): the statements that advance and rewind curIndex are not visible in this view.
854 void ProcessVertLineStrip(uint32_t index
, bool finish
)
856 this->vert
[this->curIndex
] = index
;
858 if (this->curIndex
== 2)
860 // assembled enough verts for prim, add to gather indices
861 this->indices
[0][this->numPrimsAssembled
] = this->vert
[0];
862 this->indices
[1][this->numPrimsAssembled
] = this->vert
[1];
864 // increment numPrimsAssembled
865 this->numPrimsAssembled
++;
867 // set up next prim state
868 this->vert
[0] = this->vert
[1];
// Line-strip-with-adjacency handler (the constructor selects it when the GS is enabled):
// once four verts are collected, all four are written to gather rows 0..3, then the
// window slides down by one vert. `finish` is not read in the visible code.
// NOTE(review): the statements that advance and rewind curIndex are not visible in this view.
873 void ProcessVertLineStripAdj(uint32_t index
, bool finish
)
875 this->vert
[this->curIndex
] = index
;
877 if (this->curIndex
== 4)
879 // assembled enough verts for prim, add to gather indices
880 this->indices
[0][this->numPrimsAssembled
] = this->vert
[0];
881 this->indices
[1][this->numPrimsAssembled
] = this->vert
[1];
882 this->indices
[2][this->numPrimsAssembled
] = this->vert
[2];
883 this->indices
[3][this->numPrimsAssembled
] = this->vert
[3];
885 // increment numPrimsAssembled
886 this->numPrimsAssembled
++;
888 // set up next prim state
889 this->vert
[0] = this->vert
[1];
890 this->vert
[1] = this->vert
[2];
891 this->vert
[2] = this->vert
[3];
// Line-strip-with-adjacency handler used when the GS is disabled: four verts are still
// tracked, but only the inner pair (vert[1], vert[2]) is written to gather rows 0..1;
// the window then slides down by one vert. `finish` is not read in the visible code.
// NOTE(review): the statements that advance and rewind curIndex are not visible in this view.
896 void ProcessVertLineStripAdjNoGs(uint32_t index
, bool finish
)
898 this->vert
[this->curIndex
] = index
;
900 if (this->curIndex
== 4)
902 // assembled enough verts for prim, add to gather indices
903 this->indices
[0][this->numPrimsAssembled
] = this->vert
[1];
904 this->indices
[1][this->numPrimsAssembled
] = this->vert
[2];
906 // increment numPrimsAssembled
907 this->numPrimsAssembled
++;
909 // set up next prim state
910 this->vert
[0] = this->vert
[1];
911 this->vert
[1] = this->vert
[2];
912 this->vert
[2] = this->vert
[3];
// Line-list-with-adjacency handler (the constructor selects it when the GS is enabled):
// once four verts are collected, all four are written to gather rows 0..3 of column
// numPrimsAssembled. `finish` is not read in the visible code.
// NOTE(review): the statements that advance and reset curIndex are not visible in this view.
917 void ProcessVertLineListAdj(uint32_t index
, bool finish
)
919 this->vert
[this->curIndex
] = index
;
921 if (this->curIndex
== 4)
923 this->indices
[0][this->numPrimsAssembled
] = this->vert
[0];
924 this->indices
[1][this->numPrimsAssembled
] = this->vert
[1];
925 this->indices
[2][this->numPrimsAssembled
] = this->vert
[2];
926 this->indices
[3][this->numPrimsAssembled
] = this->vert
[3];
928 this->numPrimsAssembled
++;
// Line-list-with-adjacency handler used when the GS is disabled: four verts are collected
// per primitive, but only the inner pair (vert[1], vert[2]) is written to gather rows 0..1.
// `finish` is not read in the visible code.
// NOTE(review): the statements that advance and reset curIndex are not visible in this view.
933 void ProcessVertLineListAdjNoGs(uint32_t index
, bool finish
)
935 this->vert
[this->curIndex
] = index
;
937 if (this->curIndex
== 4)
939 this->indices
[0][this->numPrimsAssembled
] = this->vert
[1];
940 this->indices
[1][this->numPrimsAssembled
] = this->vert
[2];
942 this->numPrimsAssembled
++;
// Point-list handler: every collected vert becomes one primitive, written to gather row 0
// of column numPrimsAssembled. `finish` is not read in the visible code.
// NOTE(review): the statements that advance and reset curIndex are not visible in this view.
947 void ProcessVertPointList(uint32_t index
, bool finish
)
949 this->vert
[this->curIndex
] = index
;
951 if (this->curIndex
== 1)
953 this->indices
[0][this->numPrimsAssembled
] = this->vert
[0];
954 this->numPrimsAssembled
++;
960 // Primitive Assembly for data output from the DomainShader.
// Derives from PA_STATE, which is constructed with a null stream and zero size: vertex
// data is read through m_pVertexData and the three per-vertex index streams instead.
// The constructor zeroes the prim ID vector, records the bin topology, copies the three
// index pointers and derives m_numVertsPerPrim (1, 2 or 3) from binTopology; any other
// topology trips the SWR_ASSERT.
// NOTE(review): the constructor name line, the selection statement and all case labels
// except TOP_TRIANGLE_LIST are not visible in this view.
961 struct PA_TESS
: PA_STATE
964 DRAW_CONTEXT
*in_pDC
,
965 const simdscalar
* in_pVertData
,
966 uint32_t in_attributeStrideInVectors
,
967 uint32_t in_numAttributes
,
968 uint32_t* (&in_ppIndices
)[3],
969 uint32_t in_numPrims
,
970 PRIMITIVE_TOPOLOGY in_binTopology
) :
972 PA_STATE(in_pDC
, nullptr, 0),
973 m_pVertexData(in_pVertData
),
974 m_attributeStrideInVectors(in_attributeStrideInVectors
),
975 m_numAttributes(in_numAttributes
),
976 m_numPrims(in_numPrims
)
978 m_vPrimId
= _simd_setzero_si();
979 binTopology
= in_binTopology
;
980 m_ppIndices
[0] = in_ppIndices
[0];
981 m_ppIndices
[1] = in_ppIndices
[1];
982 m_ppIndices
[2] = in_ppIndices
[2];
987 m_numVertsPerPrim
= 1;
991 m_numVertsPerPrim
= 2;
994 case TOP_TRIANGLE_LIST
:
995 m_numVertsPerPrim
= 3;
999 SWR_ASSERT(0, "Invalid binTopology (%d) for %s", binTopology
, __FUNCTION__
);
1006 return m_numPrims
!= 0;
// Not implemented for the tessellation PA: asserts. A function-local static `junk`
// vector exists so the reference return type can be satisfied.
// NOTE(review): the return statement is not visible in this view.
1009 simdvector
& GetSimdVector(uint32_t index
, uint32_t slot
)
1011 SWR_ASSERT(0, "%s NOT IMPLEMENTED", __FUNCTION__
);
1012 static simdvector junk
= { 0 };
// Builds a lane mask with the first `numPrims` lanes set to -1 and the rest 0.
// maskGen holds KNOB_SIMD_WIDTH entries of -1 followed by KNOB_SIMD_WIDTH zeros; an
// unaligned load starting at element KNOB_SIMD_WIDTH - numPrims slides the boundary to
// the requested lane. numPrims must not exceed KNOB_SIMD_WIDTH (asserted).
// Only SIMD widths 8 and 16 have a table; the visible #error covers other widths.
1016 static simdscalari
GenPrimMask(uint32_t numPrims
)
1018 SWR_ASSERT(numPrims
<= KNOB_SIMD_WIDTH
);
1019 #if KNOB_SIMD_WIDTH == 8
1020 static const OSALIGN(int32_t, 64) maskGen
[KNOB_SIMD_WIDTH
* 2] =
1022 -1, -1, -1, -1, -1, -1, -1, -1,
1023 0, 0, 0, 0, 0, 0, 0, 0
1025 #elif KNOB_SIMD_WIDTH == 16
1026 static const OSALIGN(int32_t, 128) maskGen
[KNOB_SIMD_WIDTH
* 2] =
1028 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
1029 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
1032 #error "Help, help, I can't get up!"
1035 return _simd_loadu_si((const simdscalari
*)&maskGen
[KNOB_SIMD_WIDTH
- numPrims
]);
// Gathers attribute `slot` for up to KNOB_SIMD_WIDTH primitives. For each vertex of the
// primitive, a SIMD of indices is loaded from m_ppIndices[i] and four masked gathers
// fetch the components; the base pointer advances by
// m_attributeStrideInVectors * KNOB_SIMD_WIDTH floats between components.
// The mask from GenPrimMask covers only the first NumPrims() lanes.
// Compile-time restricted to an 8-wide SIMD by the static_assert.
// NOTE(review): the early-out body, some gather arguments and the final return are not
// visible in this view.
1038 bool Assemble(uint32_t slot
, simdvector verts
[])
1040 static_assert(KNOB_SIMD_WIDTH
== 8, "Need to revisit this when AVX512 is implemented");
1041 SWR_ASSERT(slot
< m_numAttributes
);
1043 uint32_t numPrimsToAssemble
= PA_TESS::NumPrims();
1044 if (0 == numPrimsToAssemble
)
1049 simdscalari mask
= GenPrimMask(numPrimsToAssemble
);
1051 const float* pBaseAttrib
= (const float*)&m_pVertexData
[slot
* m_attributeStrideInVectors
* 4];
1052 for (uint32_t i
= 0; i
< m_numVertsPerPrim
; ++i
)
1054 simdscalari indices
= _simd_load_si((const simdscalari
*)m_ppIndices
[i
]);
1056 const float* pBase
= pBaseAttrib
;
1057 for (uint32_t c
= 0; c
< 4; ++c
)
1059 verts
[i
].v
[c
] = _simd_mask_i32gather_ps(
1063 _simd_castsi_ps(mask
),
1064 4 /* gcc doesn't like sizeof(float) */);
1065 pBase
+= m_attributeStrideInVectors
* KNOB_SIMD_WIDTH
;
// Scalar counterpart of Assemble(): fetches attribute `slot` of primitive `primIndex`.
// For each vertex of the primitive, the vertex index is read from
// m_ppIndices[i][primIndex] and four floats are copied into verts[i], stepping
// m_attributeStrideInVectors * KNOB_SIMD_WIDTH floats between components.
// Both `slot` and `primIndex` are range-checked by assertion only.
1072 void AssembleSingle(uint32_t slot
, uint32_t primIndex
, __m128 verts
[])
1074 SWR_ASSERT(slot
< m_numAttributes
);
1075 SWR_ASSERT(primIndex
< PA_TESS::NumPrims());
1077 const float* pVertDataBase
= (const float*)&m_pVertexData
[slot
* m_attributeStrideInVectors
* 4];
1078 for (uint32_t i
= 0; i
< m_numVertsPerPrim
; ++i
)
1080 uint32_t index
= m_ppIndices
[i
][primIndex
];
1081 const float* pVertData
= pVertDataBase
;
1082 float* pVert
= (float*)&verts
[i
];
1084 for (uint32_t c
= 0; c
< 4; ++c
)
1086 pVert
[c
] = pVertData
[index
];
1087 pVertData
+= m_attributeStrideInVectors
* KNOB_SIMD_WIDTH
;
1094 uint32_t numPrims
= PA_TESS::NumPrims();
1095 m_numPrims
-= numPrims
;
1096 m_ppIndices
[0] += numPrims
;
1097 m_ppIndices
[1] += numPrims
;
1098 m_ppIndices
[2] += numPrims
;
// Not supported by the tessellation PA: asserts. A function-local static `junk` exists
// so the reference return type can be satisfied.
// NOTE(review): the return statement is not visible in this view.
1103 simdvertex
& GetNextVsOutput()
1105 SWR_ASSERT(0, "%s", __FUNCTION__
);
1106 static simdvertex junk
;
// Not supported by the tessellation PA: asserts.
// NOTE(review): the return statement is not visible in this view.
1110 bool GetNextStreamOutput()
1112 SWR_ASSERT(0, "%s", __FUNCTION__
);
// Not supported by the tessellation PA: asserts. A function-local static `junk` exists
// so the reference return type can be satisfied.
// NOTE(review): the return statement is not visible in this view.
1116 simdmask
& GetNextVsIndices()
1118 SWR_ASSERT(0, "%s", __FUNCTION__
);
1119 static simdmask junk
;
1125 return std::min
<uint32_t>(m_numPrims
, KNOB_SIMD_WIDTH
);
// Reset is not supported by the tessellation PA; calling it trips the assertion.
1128 void Reset() { SWR_ASSERT(0); };
// Returns startID broadcast to every lane plus m_vPrimId (zeroed by the constructor).
1130 simdscalari
GetPrimID(uint32_t startID
)
1132 return _simd_add_epi32(_simd_set1_epi32(startID
), m_vPrimId
);
// PA_TESS data members.
// Vertex data base pointer; Assemble()/AssembleSingle() index it by
// slot * m_attributeStrideInVectors * 4.
1136 const simdscalar
* m_pVertexData
= nullptr;
// Stride between components of an attribute, in SIMD vectors.
1137 uint32_t m_attributeStrideInVectors
= 0;
// Attribute count; only used to range-check `slot` in assertions.
1138 uint32_t m_numAttributes
= 0;
// Primitives still to be assembled.
1139 uint32_t m_numPrims
= 0;
// One index stream per vertex of the primitive (up to three).
1140 uint32_t* m_ppIndices
[3];
// 1, 2 or 3, derived from the bin topology in the constructor.
1142 uint32_t m_numVertsPerPrim
= 0;
// Per-lane prim ID offsets added to startID by GetPrimID().
1144 simdscalari m_vPrimId
;
1147 // Primitive Assembler factory class, responsible for creating and initializing the correct assembler
// The constructor placement-news either a PA_STATE_CUT (into paCut) or a PA_STATE_OPT
// (into paOpt). Both use vertexStore as the vertex stream, with a capacity of
// MAX_NUM_VERTS_PER_PRIM * KNOB_SIMD_WIDTH verts. The cut-aware path is compiled in only
// when KNOB_ENABLE_CUT_AWARE_PA is TRUE.
// NOTE(review): the struct header, the non-indexed half of the condition, the
// declaration of numAttribs and the accessor code are not visible in this view.
1149 template <bool IsIndexedT
>
1152 PA_FACTORY(DRAW_CONTEXT
* pDC
, PRIMITIVE_TOPOLOGY in_topo
, uint32_t numVerts
) : topo(in_topo
)
1154 #if KNOB_ENABLE_CUT_AWARE_PA == TRUE
1155 const API_STATE
& state
= GetApiState(pDC
);
1156 if ((IsIndexedT
&& (
1157 topo
== TOP_TRIANGLE_STRIP
|| topo
== TOP_POINT_LIST
||
1158 topo
== TOP_LINE_LIST
|| topo
== TOP_LINE_STRIP
||
1159 topo
== TOP_TRIANGLE_LIST
|| topo
== TOP_LINE_LIST_ADJ
||
1160 topo
== TOP_LISTSTRIP_ADJ
|| topo
== TOP_TRI_LIST_ADJ
||
1161 topo
== TOP_TRI_STRIP_ADJ
)) ||
1163 // non-indexed draws with adjacency topologies must use cut-aware PA until we add support
1164 // for them in the optimized PA
1166 topo
== TOP_LINE_LIST_ADJ
|| topo
== TOP_LISTSTRIP_ADJ
|| topo
== TOP_TRI_LIST_ADJ
|| topo
== TOP_TRI_STRIP_ADJ
)))
// Start with no cut bits set.
1168 memset(&indexStore
, 0, sizeof(indexStore
));
// numAttribs receives the bit index of the highest set bit in feAttribMask.
1170 _BitScanReverse(&numAttribs
, state
.feAttribMask
);
// NOTE(review): the cut-aware assembler is given state.topology, whereas the stored
// `topo` and the optimized path use in_topo — confirm the two always agree.
1172 new (&this->paCut
) PA_STATE_CUT(pDC
, (uint8_t*)&this->vertexStore
[0], MAX_NUM_VERTS_PER_PRIM
* KNOB_SIMD_WIDTH
,
1173 &this->indexStore
[0], numVerts
, numAttribs
, state
.topology
, false);
1179 uint32_t numPrims
= GetNumPrims(in_topo
, numVerts
);
1180 new (&this->paOpt
) PA_STATE_OPT(pDC
, numPrims
, (uint8_t*)&this->vertexStore
[0], MAX_NUM_VERTS_PER_PRIM
* KNOB_SIMD_WIDTH
, false);
1188 #if KNOB_ENABLE_CUT_AWARE_PA == TRUE
1204 PRIMITIVE_TOPOLOGY topo
;
// Backing storage handed to the assemblers by the constructor above.
1206 simdvertex vertexStore
[MAX_NUM_VERTS_PER_PRIM
];
1207 simdmask indexStore
[MAX_NUM_VERTS_PER_PRIM
];