1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
25 * @brief Definitions for primitive assembly.
26 * N primitives are assembled at a time, where N is the SIMD width.
27 * A state machine, that is specific for a given topology, drives the
28 * assembly of vertices into triangles.
30 ******************************************************************************/
// Body of the abstract primitive-assembler base (the constructor below names it
// PA_STATE). It provides width-dependent constants and type aliases, the shared
// stream state, and the pure-virtual interface each assembler implements.
// NOTE(review): the struct header, the enum wrappers and the #else/#endif lines
// are not visible in this extract; the two alias groups below are presumably the
// two arms of the USE_SIMD16_FRONTEND conditional - confirm against the full file.
37 #if USE_SIMD16_FRONTEND
40 SIMD_WIDTH
= KNOB_SIMD16_WIDTH
,
41 SIMD_WIDTH_DIV2
= KNOB_SIMD16_WIDTH
/ 2,
45 typedef simd16mask SIMDMASK
;
47 typedef simd16scalar SIMDSCALAR
;
48 typedef simd16vector SIMDVECTOR
;
49 typedef simd16vertex SIMDVERTEX
;
51 typedef simd16scalari SIMDSCALARI
;
// Second group: the same names bound to the KNOB_SIMD_WIDTH types.
56 SIMD_WIDTH
= KNOB_SIMD_WIDTH
,
57 SIMD_WIDTH_DIV2
= KNOB_SIMD_WIDTH
/ 2,
61 typedef simdmask SIMDMASK
;
63 typedef simdscalar SIMDSCALAR
;
64 typedef simdvector SIMDVECTOR
;
65 typedef simdvertex SIMDVERTEX
;
67 typedef simdscalari SIMDSCALARI
;
// Shared state; every member has an in-class default initializer.
70 DRAW_CONTEXT
*pDC
{ nullptr }; // draw context
71 uint8_t* pStreamBase
{ nullptr }; // vertex stream
72 uint32_t streamSizeInVerts
{ 0 }; // total size of the input stream in verts
74 // The topology the binner will use. In some cases the FE changes the topology from the api state.
75 PRIMITIVE_TOPOLOGY binTopology
{ TOP_UNKNOWN
};
77 #if ENABLE_AVX512_SIMD16
78 bool useAlternateOffset
{ false };
// Stores the draw context, stream base pointer and stream size; the remaining
// members keep their default initializers.
82 PA_STATE(DRAW_CONTEXT
*in_pDC
, uint8_t* in_pStreamBase
, uint32_t in_streamSizeInVerts
) :
83 pDC(in_pDC
), pStreamBase(in_pStreamBase
), streamSizeInVerts(in_streamSizeInVerts
) {}
// Pure-virtual interface. The *_simd16 variants are declared only when
// ENABLE_AVX512_SIMD16 is set.
85 virtual bool HasWork() = 0;
86 virtual simdvector
& GetSimdVector(uint32_t index
, uint32_t slot
) = 0;
87 #if ENABLE_AVX512_SIMD16
88 virtual simd16vector
& GetSimdVector_simd16(uint32_t index
, uint32_t slot
) = 0;
90 virtual bool Assemble(uint32_t slot
, simdvector verts
[]) = 0;
91 #if ENABLE_AVX512_SIMD16
92 virtual bool Assemble_simd16(uint32_t slot
, simd16vector verts
[]) = 0;
94 virtual void AssembleSingle(uint32_t slot
, uint32_t primIndex
, __m128 verts
[]) = 0;
95 virtual bool NextPrim() = 0;
96 virtual SIMDVERTEX
& GetNextVsOutput() = 0;
97 virtual bool GetNextStreamOutput() = 0;
98 virtual SIMDMASK
& GetNextVsIndices() = 0;
99 virtual uint32_t NumPrims() = 0;
100 virtual void Reset() = 0;
101 virtual SIMDSCALARI
GetPrimID(uint32_t startID
) = 0;
104 // The Optimized PA is a state machine that assembles triangles from vertex shader simd
105 // output. Here is the sequence
106 // 1. Execute FS/VS to generate a simd vertex (4 vertices for SSE simd and 8 for AVX simd).
107 // 2. Execute PA function to assemble and bin triangles.
108 // a. The PA function is a set of functions that collectively make up the
109 // state machine for a given topology.
110 // 1. We use a state index to track which PA function to call.
111 // b. Often the PA function needs 2 simd vertices in order to assemble the next triangle.
112 // 1. We call this the current and previous simd vertex.
113 // 2. The SSE simd is 4-wide which is not a multiple of 3 needed for triangles. In
114 // order to assemble the second triangle, for a triangle list, we'll need the
115 // last vertex from the previous simd and the first 2 vertices from the current simd.
116 // 3. At times the PA can assemble multiple triangles from the 2 simd vertices.
118 // This optimized PA is not cut aware, so it should only be used by non-indexed draws or draws without cuts.
// Data members, function-pointer types and constructor declaration of the
// optimized (non cut-aware) primitive assembler. The state machine is held as
// function pointers: the current function (pfnPaFunc*), the function restored
// on reset (pfnPaFuncReset*), and the pending "next" function plus the counters
// that are latched by the fragment which copies the next* fields.
// NOTE(review): braces and #endif lines are not visible in this extract.
120 struct PA_STATE_OPT
: public PA_STATE
122 SIMDVERTEX leadingVertex
; // For tri-fan
124 uint32_t numPrims
{ 0 }; // Total number of primitives for draw.
125 uint32_t numPrimsComplete
{ 0 }; // Total number of complete primitives.
127 uint32_t numSimdPrims
{ 0 }; // Number of prims in current simd.
129 uint32_t cur
{ 0 }; // index to current VS output.
130 uint32_t prev
{ 0 }; // index to prev VS output. Not really needed in the state.
131 uint32_t first
{ 0 }; // index to first VS output. Used for trifan.
133 uint32_t counter
{ 0 }; // state counter
134 bool reset
{ false }; // reset state
136 uint32_t primIDIncr
{ 0 }; // how much to increment for each vector (typically vector / {1, 2})
// Signatures of the per-topology state functions: a SIMD-batch assembler
// (simdvector and simd16vector flavours) and a single-primitive assembler.
139 typedef bool(*PFN_PA_FUNC
)(PA_STATE_OPT
& state
, uint32_t slot
, simdvector verts
[]);
140 #if ENABLE_AVX512_SIMD16
141 typedef bool(*PFN_PA_FUNC_SIMD16
)(PA_STATE_OPT
& state
, uint32_t slot
, simd16vector verts
[]);
143 typedef void(*PFN_PA_SINGLE_FUNC
)(PA_STATE_OPT
& pa
, uint32_t slot
, uint32_t primIndex
, __m128 verts
[]);
145 PFN_PA_FUNC pfnPaFunc
{ nullptr }; // PA state machine function for assembling 4 triangles.
146 #if ENABLE_AVX512_SIMD16
147 PFN_PA_FUNC_SIMD16 pfnPaFunc_simd16
{ nullptr };
149 PFN_PA_SINGLE_FUNC pfnPaSingleFunc
{ nullptr }; // PA state machine function for assembling single triangle.
150 PFN_PA_FUNC pfnPaFuncReset
{ nullptr }; // initial state to set on reset
151 #if ENABLE_AVX512_SIMD16
152 PFN_PA_FUNC_SIMD16 pfnPaFuncReset_simd16
{ nullptr };
155 // state used to advance the PA when Next is called
156 PFN_PA_FUNC pfnPaNextFunc
{ nullptr };
157 #if ENABLE_AVX512_SIMD16
158 PFN_PA_FUNC_SIMD16 pfnPaNextFunc_simd16
{ nullptr };
160 uint32_t nextNumSimdPrims
{ 0 };
161 uint32_t nextNumPrimsIncrement
{ 0 };
162 bool nextReset
{ false };
163 bool isStreaming
{ false };
165 SIMDMASK tmpIndices
{ 0 }; // temporary index store for unused virtual function
// Constructor is only declared here; its definition is not part of this extract.
168 PA_STATE_OPT(DRAW_CONTEXT
* pDC
, uint32_t numPrims
, uint8_t* pStream
, uint32_t streamSizeInVerts
,
169 bool in_isStreaming
, PRIMITIVE_TOPOLOGY topo
= TOP_UNKNOWN
);
173 return (this->numPrimsComplete
< this->numPrims
) ? true : false;
176 simdvector
& GetSimdVector(uint32_t index
, uint32_t slot
)
178 simdvertex
* pVertex
= (simdvertex
*)pStreamBase
;
179 return pVertex
[index
].attrib
[slot
];
#if ENABLE_AVX512_SIMD16
// simd16 counterpart of GetSimdVector: the stream is viewed as an array of
// simd16vertex and attribute `slot` of element `index` is returned.
simd16vector& GetSimdVector_simd16(uint32_t index, uint32_t slot)
{
    simd16vertex& vertex = ((simd16vertex*)pStreamBase)[index];
    return vertex.attrib[slot];
}
#endif
190 // Assembles 4 triangles. Each simdvector is a single vertex from 4
191 // triangles (xxxx yyyy zzzz wwww) and there are 3 verts per triangle.
192 bool Assemble(uint32_t slot
, simdvector verts
[])
194 return this->pfnPaFunc(*this, slot
, verts
);
#if ENABLE_AVX512_SIMD16
// simd16 flavour of Assemble: forwards to the current simd16 state function
// (pfnPaFunc_simd16) and returns its result.
bool Assemble_simd16(uint32_t slot, simd16vector verts[])
{
    PA_STATE_OPT& self = *this;
    return this->pfnPaFunc_simd16(self, slot, verts);
}
#endif
204 // Assembles 1 primitive. Each simdscalar is a vertex (xyzw).
205 void AssembleSingle(uint32_t slot
, uint32_t primIndex
, __m128 verts
[])
207 return this->pfnPaSingleFunc(*this, slot
, primIndex
, verts
);
// Fragment that latches the pending "next" state (function pointers, prim
// counters and reset flag) into the current state and then decides whether more
// primitives can be produced from the current SIMD batch.
// NOTE(review): presumably the body of NextPrim() - the function header, the
// bodies of the `if` statements, any `else`, the condition guarding the final
// `morePrims = false` and the return statement are not visible in this extract;
// confirm against the full file before relying on this description.
212 this->pfnPaFunc
= this->pfnPaNextFunc
;
213 #if ENABLE_AVX512_SIMD16
214 this->pfnPaFunc_simd16
= this->pfnPaNextFunc_simd16
;
// Commit the counters staged alongside the next state function.
216 this->numSimdPrims
= this->nextNumSimdPrims
;
217 this->numPrimsComplete
+= this->nextNumPrimsIncrement
;
218 this->reset
= this->nextReset
;
220 if (this->isStreaming
)
225 bool morePrims
= false;
227 if (this->numSimdPrims
> 0)
230 this->numSimdPrims
--;
// The state counter restarts at 0 when a reset is pending, otherwise it advances.
234 this->counter
= (this->reset
) ? 0 : (this->counter
+ 1);
// NOTE(review): this repeats the assignment made at the top of the fragment.
238 this->pfnPaFunc
= this->pfnPaNextFunc
;
242 morePrims
= false; // no more to do
248 SIMDVERTEX
& GetNextVsOutput()
250 // increment cur and prev indices
251 const uint32_t numSimdVerts
= this->streamSizeInVerts
/ SIMD_WIDTH
;
252 this->prev
= this->cur
; // prev is undefined for first state.
253 this->cur
= this->counter
% numSimdVerts
;
255 SIMDVERTEX
* pVertex
= (SIMDVERTEX
*)pStreamBase
;
256 return pVertex
[this->cur
];
259 SIMDMASK
& GetNextVsIndices()
261 // unused in optimized PA, pass tmp buffer back
// Stream-output variant of advancing the batch indices: the old `cur` becomes
// `prev`, and `cur` takes the raw state counter (no wrap-around, unlike
// GetNextVsOutput).
// NOTE(review): the braces and the bool return statement are not visible in
// this extract - confirm the returned value against the full file.
265 bool GetNextStreamOutput()
267 this->prev
= this->cur
;
268 this->cur
= this->counter
;
275 return (this->numPrimsComplete
+ this->nextNumPrimsIncrement
> this->numPrims
) ?
276 (SIMD_WIDTH
- (this->numPrimsComplete
+ this->nextNumPrimsIncrement
- this->numPrims
)) : SIMD_WIDTH
;
279 void SetNextState(PA_STATE_OPT::PFN_PA_FUNC pfnPaNextFunc
,
280 PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc
,
281 uint32_t numSimdPrims
= 0,
282 uint32_t numPrimsIncrement
= 0,
285 this->pfnPaNextFunc
= pfnPaNextFunc
;
286 this->nextNumSimdPrims
= numSimdPrims
;
287 this->nextNumPrimsIncrement
= numPrimsIncrement
;
288 this->nextReset
= reset
;
290 this->pfnPaSingleFunc
= pfnPaNextSingleFunc
;
#if ENABLE_AVX512_SIMD16
// simd16 flavour of SetNextState: stages pfnPaNextFunc_simd16 together with
// the same counters and reset flag, and installs the single-primitive function
// immediately. The `reset` parameter is supplied by SetNextPaState_simd16,
// which passes five arguments.
void SetNextState_simd16(PA_STATE_OPT::PFN_PA_FUNC_SIMD16 pfnPaNextFunc_simd16,
                         PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc,
                         uint32_t numSimdPrims = 0,
                         uint32_t numPrimsIncrement = 0,
                         bool reset = false)
{
    this->pfnPaNextFunc_simd16  = pfnPaNextFunc_simd16;
    this->nextNumSimdPrims      = numSimdPrims;
    this->nextNumPrimsIncrement = numPrimsIncrement;
    this->nextReset             = reset;

    this->pfnPaSingleFunc = pfnPaNextSingleFunc;
}
#endif
// Fragment that returns the assembler to its initial state: clears the
// alternate-offset flag (AVX512 builds), restores the reset state function and
// zeroes the completed / in-flight primitive counters.
// NOTE(review): presumably the body of Reset(); the function header, the
// #endif and any further statements are not visible in this extract.
// NOTE(review): only pfnPaFunc is restored from pfnPaFuncReset in the visible
// lines; pfnPaFunc_simd16 / pfnPaFuncReset_simd16 are not touched - confirm
// whether that is intended.
311 #if ENABLE_AVX512_SIMD16
312 useAlternateOffset
= false;
315 this->pfnPaFunc
= this->pfnPaFuncReset
;
316 this->numPrimsComplete
= 0;
317 this->numSimdPrims
= 0;
325 SIMDSCALARI
GetPrimID(uint32_t startID
)
327 #if USE_SIMD16_FRONTEND
328 return _simd16_add_epi32(this->primID
,
329 _simd16_set1_epi32(startID
+ this->primIDIncr
* (this->numPrimsComplete
/ SIMD_WIDTH
)));
331 return _simd_add_epi32(this->primID
,
332 _simd_set1_epi32(startID
+ this->primIDIncr
* (this->numPrimsComplete
/ SIMD_WIDTH
)));
337 // helper C wrappers to avoid having to rewrite all the PA topology state functions
338 INLINE
void SetNextPaState(PA_STATE_OPT
& pa
, PA_STATE_OPT::PFN_PA_FUNC pfnPaNextFunc
,
339 PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc
,
340 uint32_t numSimdPrims
= 0,
341 uint32_t numPrimsIncrement
= 0,
344 return pa
.SetNextState(pfnPaNextFunc
, pfnPaNextSingleFunc
, numSimdPrims
, numPrimsIncrement
, reset
);
#if ENABLE_AVX512_SIMD16
// Free-function form of PA_STATE_OPT::SetNextState_simd16; every argument is
// forwarded unchanged. `reset` is forwarded by the body and is therefore
// declared as the final, defaulted parameter.
INLINE void SetNextPaState_simd16(PA_STATE_OPT& pa, PA_STATE_OPT::PFN_PA_FUNC_SIMD16 pfnPaNextFunc_simd16,
                                  PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc,
                                  uint32_t numSimdPrims = 0,
                                  uint32_t numPrimsIncrement = 0,
                                  bool reset = false)
{
    pa.SetNextState_simd16(pfnPaNextFunc_simd16, pfnPaNextSingleFunc, numSimdPrims, numPrimsIncrement, reset);
}
#endif
358 INLINE simdvector
& PaGetSimdVector(PA_STATE
& pa
, uint32_t index
, uint32_t slot
)
360 return pa
.GetSimdVector(index
, slot
);
#if ENABLE_AVX512_SIMD16
// Free-function wrapper around the virtual PA_STATE::GetSimdVector_simd16.
INLINE simd16vector& PaGetSimdVector_simd16(PA_STATE& pa, uint32_t index, uint32_t slot)
{
    simd16vector& attribute = pa.GetSimdVector_simd16(index, slot);
    return attribute;
}
#endif
370 INLINE __m128
swizzleLane0(const simdvector
&a
)
372 simdscalar tmp0
= _mm256_unpacklo_ps(a
.x
, a
.z
);
373 simdscalar tmp1
= _mm256_unpacklo_ps(a
.y
, a
.w
);
374 return _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0
, tmp1
), 0);
377 INLINE __m128
swizzleLane1(const simdvector
&a
)
379 simdscalar tmp0
= _mm256_unpacklo_ps(a
.x
, a
.z
);
380 simdscalar tmp1
= _mm256_unpacklo_ps(a
.y
, a
.w
);
381 return _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0
, tmp1
), 0);
384 INLINE __m128
swizzleLane2(const simdvector
&a
)
386 simdscalar tmp0
= _mm256_unpackhi_ps(a
.x
, a
.z
);
387 simdscalar tmp1
= _mm256_unpackhi_ps(a
.y
, a
.w
);
388 return _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0
, tmp1
), 0);
391 INLINE __m128
swizzleLane3(const simdvector
&a
)
393 simdscalar tmp0
= _mm256_unpackhi_ps(a
.x
, a
.z
);
394 simdscalar tmp1
= _mm256_unpackhi_ps(a
.y
, a
.w
);
395 return _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0
, tmp1
), 0);
398 INLINE __m128
swizzleLane4(const simdvector
&a
)
400 simdscalar tmp0
= _mm256_unpacklo_ps(a
.x
, a
.z
);
401 simdscalar tmp1
= _mm256_unpacklo_ps(a
.y
, a
.w
);
402 return _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0
, tmp1
), 1);
406 INLINE __m128
swizzleLane5(const simdvector
&a
)
408 simdscalar tmp0
= _mm256_unpacklo_ps(a
.x
, a
.z
);
409 simdscalar tmp1
= _mm256_unpacklo_ps(a
.y
, a
.w
);
410 return _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0
, tmp1
), 1);
413 INLINE __m128
swizzleLane6(const simdvector
&a
)
415 simdscalar tmp0
= _mm256_unpackhi_ps(a
.x
, a
.z
);
416 simdscalar tmp1
= _mm256_unpackhi_ps(a
.y
, a
.w
);
417 return _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0
, tmp1
), 1);
420 INLINE __m128
swizzleLane7(const simdvector
&a
)
422 simdscalar tmp0
= _mm256_unpackhi_ps(a
.x
, a
.z
);
423 simdscalar tmp1
= _mm256_unpackhi_ps(a
.y
, a
.w
);
424 return _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0
, tmp1
), 1);
427 INLINE __m128
swizzleLaneN(const simdvector
&a
, int lane
)
431 return swizzleLane0(a
);
433 return swizzleLane1(a
);
435 return swizzleLane2(a
);
437 return swizzleLane3(a
);
439 return swizzleLane4(a
);
441 return swizzleLane5(a
);
443 return swizzleLane6(a
);
445 return swizzleLane7(a
);
447 return _mm_setzero_ps();
451 // Cut-aware primitive assembler.
// Data members of PA_STATE_CUT. Vertices are consumed one at a time through a
// per-topology member function (pfnPa); the vertex indices of each assembled
// primitive are recorded in `indices`, from which per-lane byte offsets
// (vOffsets) are derived for gathering.
// NOTE(review): braces and the #else/#endif around the two `indices`
// declarations are not visible in this extract; the two declarations are
// presumably alternative arms of the ENABLE_AVX512_SIMD16 conditional.
452 struct PA_STATE_CUT
: public PA_STATE
454 SIMDMASK
* pCutIndices
{ nullptr }; // cut indices buffer, 1 bit per vertex
455 uint32_t numVerts
{ 0 }; // number of vertices available in buffer store
456 uint32_t numAttribs
{ 0 }; // number of attributes
457 int32_t numRemainingVerts
{ 0 }; // number of verts remaining to be assembled
458 uint32_t numVertsToAssemble
{ 0 }; // total number of verts to assemble for the draw
459 #if ENABLE_AVX512_SIMD16
460 OSALIGNSIMD16(uint32_t) indices
[MAX_NUM_VERTS_PER_PRIM
][SIMD_WIDTH
]; // current index buffer for gather
462 OSALIGNSIMD(uint32_t) indices
[MAX_NUM_VERTS_PER_PRIM
][SIMD_WIDTH
]; // current index buffer for gather
464 SIMDSCALARI vOffsets
[MAX_NUM_VERTS_PER_PRIM
]; // byte offsets for currently assembling simd
465 uint32_t numPrimsAssembled
{ 0 }; // number of primitives that are fully assembled
466 uint32_t headVertex
{ 0 }; // current unused vertex slot in vertex buffer store
467 uint32_t tailVertex
{ 0 }; // beginning vertex currently assembling
468 uint32_t curVertex
{ 0 }; // current unprocessed vertex
469 uint32_t startPrimId
{ 0 }; // starting prim id
470 SIMDSCALARI vPrimId
; // vector of prim ID
471 bool needOffsets
{ false }; // need to compute gather offsets for current SIMD
472 uint32_t vertsPerPrim
{ 0 };
473 SIMDVERTEX tmpVertex
; // temporary simdvertex for unimplemented API
474 bool processCutVerts
{ false }; // vertex indices with cuts should be processed as normal, otherwise they
475 // are ignored. Fetch shader sends invalid verts on cuts that should be ignored
476 // while the GS sends valid verts for every index
477 // Topology state tracking
478 uint32_t vert
[MAX_NUM_VERTS_PER_PRIM
];
479 uint32_t curIndex
{ 0 };
480 bool reverseWinding
{ false }; // indicates reverse winding for strips
481 int32_t adjExtraVert
{ 0 }; // extra vert uses for tristrip w/ adj
// Pointer-to-member type of the per-topology vertex handlers (ProcessVert*).
483 typedef void(PA_STATE_CUT::* PFN_PA_FUNC
)(uint32_t vert
, bool finish
);
484 PFN_PA_FUNC pfnPa
{ nullptr }; // per-topology function that processes a single vert
// Constructor: forwards the draw context, stream pointer and stream size to
// PA_STATE, initializes the counters, zeroes the gather index table, seeds
// vPrimId with the lane numbers 0..SIMD_WIDTH-1, and selects the per-topology
// vertex handler (pfnPa) from `topo` and the GS-enable state.
// Note that numVerts (the store capacity) is taken from in_streamSizeInVerts,
// while in_numVerts is the number of vertices to assemble for the draw.
// NOTE(review): braces, the `switch` header, #else/#endif, the `else`/`break`
// of the TOP_TRI_STRIP_ADJ case and some statements are not visible in this
// extract; do not treat the visible lines as the complete initialization.
487 PA_STATE_CUT(DRAW_CONTEXT
* pDC
, uint8_t* in_pStream
, uint32_t in_streamSizeInVerts
, SIMDMASK
* in_pIndices
, uint32_t in_numVerts
,
488 uint32_t in_numAttribs
, PRIMITIVE_TOPOLOGY topo
, bool in_processCutVerts
)
489 : PA_STATE(pDC
, in_pStream
, in_streamSizeInVerts
)
491 numVerts
= in_streamSizeInVerts
;
492 numAttribs
= in_numAttribs
;
495 processCutVerts
= in_processCutVerts
;
497 numVertsToAssemble
= numRemainingVerts
= in_numVerts
;
498 numPrimsAssembled
= 0;
499 headVertex
= tailVertex
= curVertex
= 0;
502 pCutIndices
= in_pIndices
;
503 memset(indices
, 0, sizeof(indices
));
504 #if USE_SIMD16_FRONTEND
505 vPrimId
= _simd16_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
507 vPrimId
= _simd_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
509 reverseWinding
= false;
// With a geometry shader enabled the adjacency topologies pass all of their
// vertices through; otherwise the NoGs handlers are selected.
512 bool gsEnabled
= pDC
->pState
->state
.gsState
.gsEnable
;
513 vertsPerPrim
= NumVertsPerPrim(topo
, gsEnabled
);
517 case TOP_TRIANGLE_LIST
: pfnPa
= &PA_STATE_CUT::ProcessVertTriList
; break;
518 case TOP_TRI_LIST_ADJ
: pfnPa
= gsEnabled
? &PA_STATE_CUT::ProcessVertTriListAdj
: &PA_STATE_CUT::ProcessVertTriListAdjNoGs
; break;
519 case TOP_TRIANGLE_STRIP
: pfnPa
= &PA_STATE_CUT::ProcessVertTriStrip
; break;
520 case TOP_TRI_STRIP_ADJ
: if (gsEnabled
)
522 pfnPa
= &PA_STATE_CUT::ProcessVertTriStripAdj
< true > ;
526 pfnPa
= &PA_STATE_CUT::ProcessVertTriStripAdj
< false > ;
530 case TOP_POINT_LIST
: pfnPa
= &PA_STATE_CUT::ProcessVertPointList
; break;
531 case TOP_LINE_LIST
: pfnPa
= &PA_STATE_CUT::ProcessVertLineList
; break;
532 case TOP_LINE_LIST_ADJ
: pfnPa
= gsEnabled
? &PA_STATE_CUT::ProcessVertLineListAdj
: &PA_STATE_CUT::ProcessVertLineListAdjNoGs
; break;
533 case TOP_LINE_STRIP
: pfnPa
= &PA_STATE_CUT::ProcessVertLineStrip
; break;
534 case TOP_LISTSTRIP_ADJ
: pfnPa
= gsEnabled
? &PA_STATE_CUT::ProcessVertLineStripAdj
: &PA_STATE_CUT::ProcessVertLineStripAdjNoGs
; break;
535 default: assert(0 && "Unimplemented topology");
539 SIMDVERTEX
& GetNextVsOutput()
541 uint32_t vertexIndex
= this->headVertex
/ SIMD_WIDTH
;
542 this->headVertex
= (this->headVertex
+ SIMD_WIDTH
) % this->numVerts
;
543 this->needOffsets
= true;
544 return ((SIMDVERTEX
*)pStreamBase
)[vertexIndex
];
547 SIMDMASK
& GetNextVsIndices()
549 uint32_t vertexIndex
= this->headVertex
/ SIMD_WIDTH
;
550 SIMDMASK
* pCurCutIndex
= this->pCutIndices
+ vertexIndex
;
551 return *pCurCutIndex
;
554 simdvector
& GetSimdVector(uint32_t index
, uint32_t slot
)
557 SWR_ASSERT(0 && "Not implemented");
558 static simdvector junk
;
#if ENABLE_AVX512_SIMD16
// Not supported by the cut-aware PA: asserts, then returns a reference to a
// function-local static placeholder so the function always has a defined
// result. Both parameters are ignored.
simd16vector& GetSimdVector_simd16(uint32_t index, uint32_t slot)
{
    SWR_ASSERT(0 && "Not implemented");
    static simd16vector junk;
    return junk;
}
#endif
// Stream-output variant of advancing the head: moves headVertex forward by one
// SIMD batch (no wrap-around here, unlike GetNextVsOutput) and flags the gather
// offsets for recomputation.
// NOTE(review): braces and the bool return statement are not visible in this
// extract - confirm the returned value against the full file.
572 bool GetNextStreamOutput()
574 this->headVertex
+= SIMD_WIDTH
;
575 this->needOffsets
= true;
579 SIMDSCALARI
GetPrimID(uint32_t startID
)
581 #if USE_SIMD16_FRONTEND
582 return _simd16_add_epi32(_simd16_set1_epi32(startID
), this->vPrimId
);
584 return _simd_add_epi32(_simd_set1_epi32(startID
), this->vPrimId
);
// Fragment that rewinds the assembler for another pass over the same draw:
// restores the remaining-vertex count from numVertsToAssemble, clears the
// assembled-prim count, head/tail positions and strip state, and re-seeds
// vPrimId with the lane numbers.
// NOTE(review): presumably the body of Reset(); the function header,
// #else/#endif lines and possibly further statements are not visible in this
// extract.
590 #if ENABLE_AVX512_SIMD16
591 useAlternateOffset
= false;
594 this->numRemainingVerts
= this->numVertsToAssemble
;
595 this->numPrimsAssembled
= 0;
598 this->tailVertex
= 0;
599 this->headVertex
= 0;
600 this->reverseWinding
= false;
// -1 is the "no pending extra vertex" sentinel tested elsewhere in this struct.
601 this->adjExtraVert
= -1;
602 #if USE_SIMD16_FRONTEND
603 this->vPrimId
= _simd16_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
605 this->vPrimId
= _simd_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
611 return this->numRemainingVerts
> 0 || this->adjExtraVert
!= -1;
614 bool IsVertexStoreFull()
616 return ((this->headVertex
+ SIMD_WIDTH
) % this->numVerts
) == this->tailVertex
;
// Restarts the per-topology state: clears the strip winding flag and the
// pending adjacency vertex (-1 means "none").
// NOTE(review): braces and at least one statement of the body are not visible
// in this extract.
619 void RestartTopology()
622 this->reverseWinding
= false;
623 this->adjExtraVert
= -1;
626 bool IsCutIndex(uint32_t vertex
)
628 uint32_t vertexIndex
= vertex
/ SIMD_WIDTH
;
629 uint32_t vertexOffset
= vertex
& (SIMD_WIDTH
- 1);
630 return _bittest((const LONG
*)&this->pCutIndices
[vertexIndex
], vertexOffset
) == 1;
633 // iterates across the unprocessed verts until we hit the end or we
634 // have assembled SIMD prims
// Each vertex is fed to the per-topology handler through the pfnPa member
// pointer; the handler's second argument (`finish`) is true only when a pending
// tri-strip-with-adjacency primitive has to be flushed.
// NOTE(review): the function header, braces, `else` branches and several
// statements (advancing curVertex, restarting the topology) are not visible in
// this extract, so the nesting of the conditions below cannot be confirmed from
// here.
637 while (this->numPrimsAssembled
!= SIMD_WIDTH
&&
638 this->numRemainingVerts
> 0 &&
639 this->curVertex
!= this->headVertex
)
641 // if cut index, restart topology
642 if (IsCutIndex(this->curVertex
))
644 if (this->processCutVerts
)
646 (this->*pfnPa
)(this->curVertex
, false);
648 // finish off tri strip w/ adj before restarting topo
649 if (this->adjExtraVert
!= -1)
651 (this->*pfnPa
)(this->curVertex
, true);
657 (this->*pfnPa
)(this->curVertex
, false);
// Bounds check of curVertex against the vertex store size (body not visible).
661 if (this->curVertex
>= this->numVerts
) {
664 this->numRemainingVerts
--;
667 // special case last primitive for tri strip w/ adj
668 if (this->numPrimsAssembled
!= SIMD_WIDTH
&& this->numRemainingVerts
== 0 && this->adjExtraVert
!= -1)
670 (this->*pfnPa
)(this->curVertex
, true);
// Fragment that retires the current SIMD batch: the tail catches up with the
// current vertex, the assembled-prim count restarts at zero, and every lane of
// vPrimId moves forward by SIMD_WIDTH.
// NOTE(review): the enclosing function header, braces and #else/#endif are
// not visible in this extract.
676 // done with current batch
677 // advance tail to the current unsubmitted vertex
678 this->tailVertex
= this->curVertex
;
679 this->numPrimsAssembled
= 0;
680 #if USE_SIMD16_FRONTEND
681 this->vPrimId
= _simd16_add_epi32(vPrimId
, _simd16_set1_epi32(SIMD_WIDTH
));
683 this->vPrimId
= _simd_add_epi32(vPrimId
, _simd_set1_epi32(SIMD_WIDTH
));
// Condition under which the current batch is considered finished: a full SIMD
// batch of prims has been assembled, or no vertices remain.
// NOTE(review): the enclosing function header, the body of the `if` and the
// return statement are not visible in this extract.
689 // if we've assembled enough prims, we can advance to the next set of verts
690 if (this->numPrimsAssembled
== SIMD_WIDTH
|| this->numRemainingVerts
<= 0)
697 void ComputeOffsets()
699 for (uint32_t v
= 0; v
< this->vertsPerPrim
; ++v
)
701 SIMDSCALARI vIndices
= *(SIMDSCALARI
*)&this->indices
[v
][0];
703 // step to simdvertex batch
704 const uint32_t simdShift
= SIMD_WIDTH_LOG2
;
705 #if USE_SIMD16_FRONTEND
706 SIMDSCALARI vVertexBatch
= _simd16_srai_epi32(vIndices
, simdShift
);
707 this->vOffsets
[v
] = _simd16_mullo_epi32(vVertexBatch
, _simd16_set1_epi32(sizeof(SIMDVERTEX
)));
709 SIMDSCALARI vVertexBatch
= _simd_srai_epi32(vIndices
, simdShift
);
710 this->vOffsets
[v
] = _simd_mullo_epi32(vVertexBatch
, _simd_set1_epi32(sizeof(SIMDVERTEX
)));
714 const uint32_t simdMask
= SIMD_WIDTH
- 1;
715 #if USE_SIMD16_FRONTEND
716 SIMDSCALARI vVertexIndex
= _simd16_and_si(vIndices
, _simd16_set1_epi32(simdMask
));
717 this->vOffsets
[v
] = _simd16_add_epi32(this->vOffsets
[v
], _simd16_mullo_epi32(vVertexIndex
, _simd16_set1_epi32(sizeof(float))));
719 SIMDSCALARI vVertexIndex
= _simd_and_si(vIndices
, _simd_set1_epi32(simdMask
));
720 this->vOffsets
[v
] = _simd_add_epi32(this->vOffsets
[v
], _simd_mullo_epi32(vVertexIndex
, _simd_set1_epi32(sizeof(float))));
// Gathers attribute `slot` for every vertex of the assembled primitives into
// `verts` (one simdvector per primitive vertex, four components each). The
// cached per-lane byte offsets are advanced by slot * sizeof(SIMDVECTOR) and
// each component is fetched with a gather from pStreamBase using byte scale 1.
// With USE_SIMD16_FRONTEND the 16-wide gather result is narrowed to its hi or
// lo half according to useAlternateOffset.
// NOTE(review): braces, #else/#endif, the calls announced by the comments
// below, the per-component pointer advance and the return statements are not
// visible in this extract.
725 bool Assemble(uint32_t slot
, simdvector verts
[])
727 // process any outstanding verts
730 // return false if we don't have enough prims assembled
731 if (this->numPrimsAssembled
!= SIMD_WIDTH
&& this->numRemainingVerts
> 0)
736 // cache off gather offsets given the current SIMD set of indices the first time we get an assemble
737 if (this->needOffsets
)
740 this->needOffsets
= false;
743 for (uint32_t v
= 0; v
< this->vertsPerPrim
; ++v
)
745 SIMDSCALARI offsets
= this->vOffsets
[v
];
748 #if USE_SIMD16_FRONTEND
749 offsets
= _simd16_add_epi32(offsets
, _simd16_set1_epi32(slot
* sizeof(SIMDVECTOR
)));
751 offsets
= _simd_add_epi32(offsets
, _simd_set1_epi32(slot
* sizeof(SIMDVECTOR
)));
754 float* pBase
= (float*)this->pStreamBase
;
755 for (uint32_t c
= 0; c
< 4; ++c
)
757 #if USE_SIMD16_FRONTEND
758 simd16scalar temp
= _simd16_i32gather_ps(pBase
, offsets
, 1);
760 verts
[v
].v
[c
] = useAlternateOffset
? temp
.hi
: temp
.lo
;
762 verts
[v
].v
[c
] = _simd_i32gather_ps(pBase
, offsets
, 1);
765 // move base to next component
// simd16 flavour of Assemble: same gather scheme, writing simd16vector outputs.
// Without USE_SIMD16_FRONTEND only the lo half of each component is gathered
// and the hi half is zeroed.
// NOTE(review): the non-SIMD16 slot offset below uses sizeof(simdvector) where
// Assemble uses sizeof(SIMDVECTOR); the two name the same type only when
// USE_SIMD16_FRONTEND is off, which is the arm this line appears to belong to -
// confirm.
// NOTE(review): braces, #else/#endif, the calls announced by the comments
// below, the per-component pointer advance and the return statements are not
// visible in this extract.
773 #if ENABLE_AVX512_SIMD16
774 bool Assemble_simd16(uint32_t slot
, simd16vector verts
[])
776 // process any outstanding verts
779 // return false if we don't have enough prims assembled
780 if (this->numPrimsAssembled
!= SIMD_WIDTH
&& this->numRemainingVerts
> 0)
785 // cache off gather offsets given the current SIMD set of indices the first time we get an assemble
786 if (this->needOffsets
)
789 this->needOffsets
= false;
792 for (uint32_t v
= 0; v
< this->vertsPerPrim
; ++v
)
794 SIMDSCALARI offsets
= this->vOffsets
[v
];
797 #if USE_SIMD16_FRONTEND
798 offsets
= _simd16_add_epi32(offsets
, _simd16_set1_epi32(slot
* sizeof(SIMDVECTOR
)));
800 offsets
= _simd_add_epi32(offsets
, _simd_set1_epi32(slot
* sizeof(simdvector
)));
803 float* pBase
= (float*)this->pStreamBase
;
804 for (uint32_t c
= 0; c
< 4; ++c
)
806 #if USE_SIMD16_FRONTEND
807 verts
[v
].v
[c
] = _simd16_i32gather_ps(pBase
, offsets
, 1);
809 verts
[v
].v
[c
].lo
= _simd_i32gather_ps(pBase
, offsets
, 1);
810 verts
[v
].v
[c
].hi
= _simd_setzero_ps();
813 // move base to next component
822 void AssembleSingle(uint32_t slot
, uint32_t triIndex
, __m128 tri
[3])
825 for (uint32_t v
= 0; v
< this->vertsPerPrim
; ++v
)
827 uint32_t* pOffset
= (uint32_t*)&this->vOffsets
[v
];
828 #if USE_SIMD16_FRONTEND
829 uint32_t offset
= useAlternateOffset
? pOffset
[triIndex
+ SIMD_WIDTH_DIV2
] : pOffset
[triIndex
];
831 uint32_t offset
= pOffset
[triIndex
];
833 offset
+= sizeof(SIMDVECTOR
) * slot
;
834 float* pVert
= (float*)&tri
[v
];
835 for (uint32_t c
= 0; c
< 4; ++c
)
837 float* pComponent
= (float*)(this->pStreamBase
+ offset
);
838 pVert
[c
] = *pComponent
;
839 offset
+= SIMD_WIDTH
* sizeof(float);
846 return this->numPrimsAssembled
;
849 // Per-topology functions
850 void ProcessVertTriStrip(uint32_t index
, bool finish
)
852 this->vert
[this->curIndex
] = index
;
854 if (this->curIndex
== 3)
856 // assembled enough verts for prim, add to gather indices
857 this->indices
[0][this->numPrimsAssembled
] = this->vert
[0];
860 this->indices
[1][this->numPrimsAssembled
] = this->vert
[2];
861 this->indices
[2][this->numPrimsAssembled
] = this->vert
[1];
865 this->indices
[1][this->numPrimsAssembled
] = this->vert
[1];
866 this->indices
[2][this->numPrimsAssembled
] = this->vert
[2];
869 // increment numPrimsAssembled
870 this->numPrimsAssembled
++;
872 // set up next prim state
873 this->vert
[0] = this->vert
[1];
874 this->vert
[1] = this->vert
[2];
876 this->reverseWinding
^= 1;
880 template<bool gsEnabled
>
881 void AssembleTriStripAdj()
885 this->vert
[1] = this->vert
[2];
886 this->vert
[2] = this->vert
[4];
888 this->indices
[0][this->numPrimsAssembled
] = this->vert
[0];
889 this->indices
[1][this->numPrimsAssembled
] = this->vert
[1];
890 this->indices
[2][this->numPrimsAssembled
] = this->vert
[2];
892 this->vert
[4] = this->vert
[2];
893 this->vert
[2] = this->vert
[1];
897 this->indices
[0][this->numPrimsAssembled
] = this->vert
[0];
898 this->indices
[1][this->numPrimsAssembled
] = this->vert
[1];
899 this->indices
[2][this->numPrimsAssembled
] = this->vert
[2];
900 this->indices
[3][this->numPrimsAssembled
] = this->vert
[3];
901 this->indices
[4][this->numPrimsAssembled
] = this->vert
[4];
902 this->indices
[5][this->numPrimsAssembled
] = this->vert
[5];
904 this->numPrimsAssembled
++;
// Triangle strip with adjacency: consumes one vertex index and drives a small
// state machine keyed on curIndex. Primitives are recorded through
// AssembleTriStripAdj<gsEnabled>(); adjExtraVert holds a vertex that has been
// read but not yet placed (-1 means none) and reverseWinding flips after each
// emitted triangle. When `finish` is set and an extra vertex is pending, that
// vertex becomes vert[3] and the final primitive of the strip is flushed.
// NOTE(review): braces, the case labels of the switch, `else`/`break` lines,
// the declaration of nextTri, nextTri[3] assignments and the curIndex updates
// are not visible in this extract; the two similar blocks below are presumably
// separate switch cases - confirm against the full file.
908 template<bool gsEnabled
>
909 void ProcessVertTriStripAdj(uint32_t index
, bool finish
)
911 // handle last primitive of tristrip
912 if (finish
&& this->adjExtraVert
!= -1)
914 this->vert
[3] = this->adjExtraVert
;
915 AssembleTriStripAdj
<gsEnabled
>();
916 this->adjExtraVert
= -1;
920 switch (this->curIndex
)
926 this->vert
[this->curIndex
] = index
;
930 this->vert
[5] = index
;
934 if (this->adjExtraVert
== -1)
936 this->adjExtraVert
= index
;
940 this->vert
[3] = index
;
943 AssembleTriStripAdj
<gsEnabled
>();
// Build the vertex set of the following triangle; the mapping depends on the
// current winding.
946 if (this->reverseWinding
)
948 nextTri
[0] = this->vert
[4];
949 nextTri
[1] = this->vert
[0];
950 nextTri
[2] = this->vert
[2];
951 nextTri
[4] = this->vert
[3];
952 nextTri
[5] = this->adjExtraVert
;
956 nextTri
[0] = this->vert
[2];
957 nextTri
[1] = this->adjExtraVert
;
958 nextTri
[2] = this->vert
[3];
959 nextTri
[4] = this->vert
[4];
960 nextTri
[5] = this->vert
[0];
962 for (uint32_t i
= 0; i
< 6; ++i
)
964 this->vert
[i
] = nextTri
[i
];
967 this->adjExtraVert
= -1;
968 this->reverseWinding
^= 1;
977 SWR_ASSERT(this->adjExtraVert
!= -1, "Algorith failure!");
978 AssembleTriStripAdj
<gsEnabled
>();
981 if (this->reverseWinding
)
983 nextTri
[0] = this->vert
[4];
984 nextTri
[1] = this->vert
[0];
985 nextTri
[2] = this->vert
[2];
986 nextTri
[4] = this->vert
[3];
987 nextTri
[5] = this->adjExtraVert
;
991 nextTri
[0] = this->vert
[2];
992 nextTri
[1] = this->adjExtraVert
;
993 nextTri
[2] = this->vert
[3];
994 nextTri
[4] = this->vert
[4];
995 nextTri
[5] = this->vert
[0];
997 for (uint32_t i
= 0; i
< 6; ++i
)
999 this->vert
[i
] = nextTri
[i
];
// The incoming vertex becomes the new pending extra vertex.
1001 this->reverseWinding
^= 1;
1002 this->adjExtraVert
= index
;
1008 void ProcessVertTriList(uint32_t index
, bool finish
)
1010 this->vert
[this->curIndex
] = index
;
1012 if (this->curIndex
== 3)
1014 // assembled enough verts for prim, add to gather indices
1015 this->indices
[0][this->numPrimsAssembled
] = this->vert
[0];
1016 this->indices
[1][this->numPrimsAssembled
] = this->vert
[1];
1017 this->indices
[2][this->numPrimsAssembled
] = this->vert
[2];
1019 // increment numPrimsAssembled
1020 this->numPrimsAssembled
++;
1022 // set up next prim state
1027 void ProcessVertTriListAdj(uint32_t index
, bool finish
)
1029 this->vert
[this->curIndex
] = index
;
1031 if (this->curIndex
== 6)
1033 // assembled enough verts for prim, add to gather indices
1034 this->indices
[0][this->numPrimsAssembled
] = this->vert
[0];
1035 this->indices
[1][this->numPrimsAssembled
] = this->vert
[1];
1036 this->indices
[2][this->numPrimsAssembled
] = this->vert
[2];
1037 this->indices
[3][this->numPrimsAssembled
] = this->vert
[3];
1038 this->indices
[4][this->numPrimsAssembled
] = this->vert
[4];
1039 this->indices
[5][this->numPrimsAssembled
] = this->vert
[5];
1041 // increment numPrimsAssembled
1042 this->numPrimsAssembled
++;
1044 // set up next prim state
1049 void ProcessVertTriListAdjNoGs(uint32_t index
, bool finish
)
1051 this->vert
[this->curIndex
] = index
;
1053 if (this->curIndex
== 6)
1055 // assembled enough verts for prim, add to gather indices
1056 this->indices
[0][this->numPrimsAssembled
] = this->vert
[0];
1057 this->indices
[1][this->numPrimsAssembled
] = this->vert
[2];
1058 this->indices
[2][this->numPrimsAssembled
] = this->vert
[4];
1060 // increment numPrimsAssembled
1061 this->numPrimsAssembled
++;
1063 // set up next prim state
1069 void ProcessVertLineList(uint32_t index
, bool finish
)
1071 this->vert
[this->curIndex
] = index
;
1073 if (this->curIndex
== 2)
1075 this->indices
[0][this->numPrimsAssembled
] = this->vert
[0];
1076 this->indices
[1][this->numPrimsAssembled
] = this->vert
[1];
1078 this->numPrimsAssembled
++;
1083 void ProcessVertLineStrip(uint32_t index
, bool finish
)
1085 this->vert
[this->curIndex
] = index
;
1087 if (this->curIndex
== 2)
1089 // assembled enough verts for prim, add to gather indices
1090 this->indices
[0][this->numPrimsAssembled
] = this->vert
[0];
1091 this->indices
[1][this->numPrimsAssembled
] = this->vert
[1];
1093 // increment numPrimsAssembled
1094 this->numPrimsAssembled
++;
1096 // set up next prim state
1097 this->vert
[0] = this->vert
[1];
1102 void ProcessVertLineStripAdj(uint32_t index
, bool finish
)
1104 this->vert
[this->curIndex
] = index
;
1106 if (this->curIndex
== 4)
1108 // assembled enough verts for prim, add to gather indices
1109 this->indices
[0][this->numPrimsAssembled
] = this->vert
[0];
1110 this->indices
[1][this->numPrimsAssembled
] = this->vert
[1];
1111 this->indices
[2][this->numPrimsAssembled
] = this->vert
[2];
1112 this->indices
[3][this->numPrimsAssembled
] = this->vert
[3];
1114 // increment numPrimsAssembled
1115 this->numPrimsAssembled
++;
1117 // set up next prim state
1118 this->vert
[0] = this->vert
[1];
1119 this->vert
[1] = this->vert
[2];
1120 this->vert
[2] = this->vert
[3];
1125 void ProcessVertLineStripAdjNoGs(uint32_t index
, bool finish
)
1127 this->vert
[this->curIndex
] = index
;
1129 if (this->curIndex
== 4)
1131 // assembled enough verts for prim, add to gather indices
1132 this->indices
[0][this->numPrimsAssembled
] = this->vert
[1];
1133 this->indices
[1][this->numPrimsAssembled
] = this->vert
[2];
1135 // increment numPrimsAssembled
1136 this->numPrimsAssembled
++;
1138 // set up next prim state
1139 this->vert
[0] = this->vert
[1];
1140 this->vert
[1] = this->vert
[2];
1141 this->vert
[2] = this->vert
[3];
// Line-list-with-adjacency vertex handler. Stores `index` in vert[curIndex]; when
// curIndex equals 4, vert[0]..vert[3] are written to indices[0]..indices[3] at slot
// numPrimsAssembled and the primitive count is incremented. Unlike the strip variants,
// no vertices are carried over to the next primitive in the statements shown.
// `finish` is not referenced by any statement shown here.
// NOTE(review): no statement shown here advances or resets curIndex; presumably that
// happens on lines missing from this listing - confirm against the complete file.
1146 void ProcessVertLineListAdj(uint32_t index
, bool finish
)
1148 this->vert
[this->curIndex
] = index
;
1150 if (this->curIndex
== 4)
1152 this->indices
[0][this->numPrimsAssembled
] = this->vert
[0];
1153 this->indices
[1][this->numPrimsAssembled
] = this->vert
[1];
1154 this->indices
[2][this->numPrimsAssembled
] = this->vert
[2];
1155 this->indices
[3][this->numPrimsAssembled
] = this->vert
[3];
1157 this->numPrimsAssembled
++;
// Variant of the line-list-with-adjacency handler that emits only two indices per
// primitive: when curIndex equals 4, vert[1] and vert[2] are written to indices[0] /
// indices[1] at slot numPrimsAssembled; vert[0] and vert[3] are not emitted.
// `finish` is not referenced by any statement shown here.
// NOTE(review): the "NoGs" suffix presumably means "no geometry shader bound" - confirm.
// NOTE(review): no statement shown here advances or resets curIndex; presumably that
// happens on lines missing from this listing - confirm against the complete file.
1162 void ProcessVertLineListAdjNoGs(uint32_t index
, bool finish
)
1164 this->vert
[this->curIndex
] = index
;
1166 if (this->curIndex
== 4)
1168 this->indices
[0][this->numPrimsAssembled
] = this->vert
[1];
1169 this->indices
[1][this->numPrimsAssembled
] = this->vert
[2];
1171 this->numPrimsAssembled
++;
// Point-list vertex handler. Stores `index` in vert[curIndex]; when curIndex equals 1,
// vert[0] is written to indices[0] at slot numPrimsAssembled and the primitive count is
// incremented (one vertex per primitive).
// `finish` is not referenced by any statement shown here.
// NOTE(review): no statement shown here advances or resets curIndex; presumably that
// happens on lines missing from this listing - confirm against the complete file.
1176 void ProcessVertPointList(uint32_t index
, bool finish
)
1178 this->vert
[this->curIndex
] = index
;
1180 if (this->curIndex
== 1)
1182 this->indices
[0][this->numPrimsAssembled
] = this->vert
[0];
1183 this->numPrimsAssembled
++;
1189 // Primitive Assembly for data output from the DomainShader.
1190 struct PA_TESS
: PA_STATE
// Constructor parameters and body.
//   in_pDC                      - draw context, forwarded to the PA_STATE base.
//   in_pVertData                - vertex data; stored in m_pVertexData.
//   in_attributeStrideInVectors - stored in m_attributeStrideInVectors; used later to
//                                 step between attribute slots and components.
//   in_numAttributes            - stored in m_numAttributes (bounds-checked against
//                                 `slot` by the Assemble functions).
//   in_ppIndices                - reference to an array of three index-list pointers;
//                                 each pointer is copied into m_ppIndices.
//   in_numPrims                 - stored in m_numPrims.
//   in_binTopology              - stored in binTopology and used to pick
//                                 m_numVertsPerPrim.
1193 DRAW_CONTEXT
*in_pDC
,
1194 const SIMDSCALAR
* in_pVertData
,
1195 uint32_t in_attributeStrideInVectors
,
1196 uint32_t in_numAttributes
,
1197 uint32_t* (&in_ppIndices
)[3],
1198 uint32_t in_numPrims
,
1199 PRIMITIVE_TOPOLOGY in_binTopology
) :
// the base class is constructed with a null pointer and 0 for its second and third
// arguments (their meaning is defined by PA_STATE, which is not visible here)
1201 PA_STATE(in_pDC
, nullptr, 0),
1202 m_pVertexData(in_pVertData
),
1203 m_attributeStrideInVectors(in_attributeStrideInVectors
),
1204 m_numAttributes(in_numAttributes
),
1205 m_numPrims(in_numPrims
)
// m_vPrimId starts as an all-zero integer vector (16-wide or 8-wide depending on
// USE_SIMD16_FRONTEND)
1207 #if USE_SIMD16_FRONTEND
1208 m_vPrimId
= _simd16_setzero_si();
1210 m_vPrimId
= _simd_setzero_si();
1212 binTopology
= in_binTopology
;
1213 m_ppIndices
[0] = in_ppIndices
[0];
1214 m_ppIndices
[1] = in_ppIndices
[1];
1215 m_ppIndices
[2] = in_ppIndices
[2];
// vertices per primitive: 1 for TOP_POINT_LIST, 3 for TOP_TRIANGLE_LIST.
// NOTE(review): the case label for the value 2 is not shown in this listing
// (presumably a line topology) - confirm against the complete file.
1217 switch (binTopology
)
1219 case TOP_POINT_LIST
:
1220 m_numVertsPerPrim
= 1;
1224 m_numVertsPerPrim
= 2;
1227 case TOP_TRIANGLE_LIST
:
1228 m_numVertsPerPrim
= 3;
// any other topology trips the assert below
1232 SWR_ASSERT(0, "Invalid binTopology (%d) for %s", binTopology
, __FUNCTION__
);
// Evaluates to true while m_numPrims is non-zero.
// NOTE(review): the signature of the function containing this return is not shown in
// this listing - confirm which member this belongs to.
1239 return m_numPrims
!= 0;
// Not implemented for PA_TESS: unconditionally fires SWR_ASSERT(0, ...) with a
// "NOT IMPLEMENTED" message. A function-local static `junk` is declared, presumably so
// that a reference can still be returned - the return statement is not shown here.
// `index` and `slot` are not referenced by any statement shown.
1242 simdvector
& GetSimdVector(uint32_t index
, uint32_t slot
)
1244 SWR_ASSERT(0, "%s NOT IMPLEMENTED", __FUNCTION__
);
1245 static simdvector junk
;
1249 #if ENABLE_AVX512_SIMD16
// 16-wide counterpart of GetSimdVector, compiled only when ENABLE_AVX512_SIMD16 is set.
// Not implemented for PA_TESS: unconditionally fires SWR_ASSERT(0, ...) with a
// "NOT IMPLEMENTED" message. A function-local static `junk` is declared, presumably so
// that a reference can still be returned - the return statement is not shown here.
// `index` and `slot` are not referenced by any statement shown.
1250 simd16vector
& GetSimdVector_simd16(uint32_t index
, uint32_t slot
)
1252 SWR_ASSERT(0, "%s NOT IMPLEMENTED", __FUNCTION__
);
1253 static simd16vector junk
;
// Builds a per-lane integer mask whose first `numPrims` lanes are -1 (all bits set) and
// whose remaining lanes are 0.
// It does this with a lookup table of 2 * SIMD_WIDTH entries - SIMD_WIDTH entries of -1
// followed by SIMD_WIDTH entries of 0 - and an unaligned vector load that starts at
// element (SIMD_WIDTH - numPrims). numPrims must not exceed SIMD_WIDTH (asserted).
1258 static SIMDSCALARI
GenPrimMask(uint32_t numPrims
)
1260 SWR_ASSERT(numPrims
<= SIMD_WIDTH
);
1261 #if USE_SIMD16_FRONTEND
// 16-wide table: 16 x -1 then 16 x 0
1262 static const OSALIGNLINE(int32_t) maskGen
[SIMD_WIDTH
* 2] =
1264 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
1265 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
// loadu: the start offset depends on numPrims, so the address is not vector-aligned in general
1268 return _simd16_loadu_si((const SIMDSCALARI
*)&maskGen
[SIMD_WIDTH
- numPrims
]);
// 8-wide table: 8 x -1 then 8 x 0
1270 static const OSALIGNLINE(int32_t) maskGen
[SIMD_WIDTH
* 2] =
1272 -1, -1, -1, -1, -1, -1, -1, -1,
1273 0, 0, 0, 0, 0, 0, 0, 0
1276 return _simd_loadu_si((const SIMDSCALARI
*)&maskGen
[SIMD_WIDTH
- numPrims
]);
// Gathers one attribute slot for the current batch of primitives into `verts`.
//   slot  - attribute slot to assemble; must be < m_numAttributes (asserted).
//   verts - output; verts[i].v[c] receives component c of vertex i of each primitive,
//           for i in [0, m_numVertsPerPrim) and c in [0, 4).
// The number of primitives handled is PA_TESS::NumPrims(); a lane mask for that count
// comes from GenPrimMask.
// NOTE(review): the return statements (including the one for the zero-primitive case)
// and several gather operands are not shown in this listing - confirm against the
// complete file before relying on the return value or on masked-lane contents.
1280 bool Assemble(uint32_t slot
, simdvector verts
[])
1282 SWR_ASSERT(slot
< m_numAttributes
);
1284 uint32_t numPrimsToAssemble
= PA_TESS::NumPrims();
1285 if (0 == numPrimsToAssemble
)
1290 SIMDSCALARI mask
= GenPrimMask(numPrimsToAssemble
);
// float view of m_pVertexData starting at vector index slot * m_attributeStrideInVectors * 4
// (presumably 4 = components per attribute, matching the c < 4 loop below - confirm)
1292 const float* pBaseAttrib
= (const float*)&m_pVertexData
[slot
* m_attributeStrideInVectors
* 4];
1293 for (uint32_t i
= 0; i
< m_numVertsPerPrim
; ++i
)
// load one SIMD register of vertex indices for the i-th vertex of every primitive
1295 #if USE_SIMD16_FRONTEND
1296 SIMDSCALARI indices
= _simd16_load_si((const SIMDSCALARI
*)m_ppIndices
[i
]);
1298 SIMDSCALARI indices
= _simd_load_si((const SIMDSCALARI
*)m_ppIndices
[i
]);
1301 const float* pBase
= pBaseAttrib
;
1302 for (uint32_t c
= 0; c
< 4; ++c
)
1304 #if USE_SIMD16_FRONTEND
// 16-wide frontend: gather 16 lanes, then keep only the half selected by
// useAlternateOffset (hi when set, lo otherwise) since `verts` is 8-wide here
1305 simd16scalar temp
= _simd16_mask_i32gather_ps(
1306 _simd16_setzero_ps(),
1310 4 /* gcc doesn't like sizeof(float) */);
1312 verts
[i
].v
[c
] = useAlternateOffset
? temp
.hi
: temp
.lo
;
// 8-wide frontend: masked gather straight into the output, with the GenPrimMask
// result reinterpreted as a float mask; the literal 4 stands in for sizeof(float)
1314 verts
[i
].v
[c
] = _simd_mask_i32gather_ps(
1318 _simd_castsi_ps(mask
),
1319 4 /* gcc doesn't like sizeof(float) */);
// step the base pointer by m_attributeStrideInVectors * SIMD_WIDTH floats
1321 pBase
+= m_attributeStrideInVectors
* SIMD_WIDTH
;
1328 #if ENABLE_AVX512_SIMD16
// 16-wide-output counterpart of Assemble, compiled only when ENABLE_AVX512_SIMD16 is set.
//   slot  - attribute slot to assemble; must be < m_numAttributes (asserted).
//   verts - output; verts[i].v[c] receives component c of vertex i of each primitive,
//           for i in [0, m_numVertsPerPrim) and c in [0, 4).
// With the 16-wide frontend the gather result is stored whole; with the 8-wide frontend
// the gather fills the .lo half and the .hi half is set to zero.
// NOTE(review): the return statements (including the one for the zero-primitive case)
// and several gather operands are not shown in this listing - confirm against the
// complete file before relying on the return value or on masked-lane contents.
1329 bool Assemble_simd16(uint32_t slot
, simd16vector verts
[])
1331 SWR_ASSERT(slot
< m_numAttributes
);
1333 uint32_t numPrimsToAssemble
= PA_TESS::NumPrims();
1334 if (0 == numPrimsToAssemble
)
1339 SIMDSCALARI mask
= GenPrimMask(numPrimsToAssemble
);
// float view of m_pVertexData starting at vector index slot * m_attributeStrideInVectors * 4
// (presumably 4 = components per attribute, matching the c < 4 loop below - confirm)
1341 const float* pBaseAttrib
= (const float*)&m_pVertexData
[slot
* m_attributeStrideInVectors
* 4];
1342 for (uint32_t i
= 0; i
< m_numVertsPerPrim
; ++i
)
// load one SIMD register of vertex indices for the i-th vertex of every primitive
1344 #if USE_SIMD16_FRONTEND
1345 SIMDSCALARI indices
= _simd16_load_si((const SIMDSCALARI
*)m_ppIndices
[i
]);
1347 SIMDSCALARI indices
= _simd_load_si((const SIMDSCALARI
*)m_ppIndices
[i
]);
1350 const float* pBase
= pBaseAttrib
;
1351 for (uint32_t c
= 0; c
< 4; ++c
)
1353 #if USE_SIMD16_FRONTEND
// 16-wide frontend: the full 16-lane gather result is the output
1354 verts
[i
].v
[c
] = _simd16_mask_i32gather_ps(
1355 _simd16_setzero_ps(),
1359 4 /* gcc doesn't like sizeof(float) */);
// 8-wide frontend: gather 8 lanes into the low half, zero the high half
1361 verts
[i
].v
[c
].lo
= _simd_mask_i32gather_ps(
1365 _simd_castsi_ps(mask
),
1366 4 /* gcc doesn't like sizeof(float) */);
1367 verts
[i
].v
[c
].hi
= _simd_setzero_ps();
// step the base pointer by m_attributeStrideInVectors * SIMD_WIDTH floats
1369 pBase
+= m_attributeStrideInVectors
* SIMD_WIDTH
;
// Scalar assembly of one attribute slot for a single primitive.
//   slot      - attribute slot; must be < m_numAttributes (asserted).
//   primIndex - primitive within the current batch; must be < PA_TESS::NumPrims()
//               (asserted).
//   verts     - output; verts[i] receives four floats for vertex i of the primitive,
//               for i in [0, m_numVertsPerPrim).
// For each vertex, the vertex index is read from m_ppIndices[i] and four floats are
// copied from the vertex data, stepping m_attributeStrideInVectors * SIMD_WIDTH floats
// between reads.
1377 void AssembleSingle(uint32_t slot
, uint32_t primIndex
, __m128 verts
[])
1379 SWR_ASSERT(slot
< m_numAttributes
);
1380 SWR_ASSERT(primIndex
< PA_TESS::NumPrims());
// float view of m_pVertexData starting at vector index slot * m_attributeStrideInVectors * 4
1382 const float* pVertDataBase
= (const float*)&m_pVertexData
[slot
* m_attributeStrideInVectors
* 4];
1383 for (uint32_t i
= 0; i
< m_numVertsPerPrim
; ++i
)
1385 #if USE_SIMD16_FRONTEND
// 16-wide frontend: when useAlternateOffset is set, primIndex addresses the upper
// half of the index list, so SIMD_WIDTH_DIV2 is added
1386 uint32_t index
= useAlternateOffset
? m_ppIndices
[i
][primIndex
+ SIMD_WIDTH_DIV2
] : m_ppIndices
[i
][primIndex
];
1388 uint32_t index
= m_ppIndices
[i
][primIndex
];
1390 const float* pVertData
= pVertDataBase
;
// write the output __m128 as four consecutive floats
1391 float* pVert
= (float*)&verts
[i
];
1393 for (uint32_t c
= 0; c
< 4; ++c
)
1395 pVert
[c
] = pVertData
[index
];
1396 pVertData
+= m_attributeStrideInVectors
* SIMD_WIDTH
;
// Consumes the current batch: takes the batch size from PA_TESS::NumPrims(), subtracts
// it from m_numPrims, and advances all three index-list pointers by the same amount so
// they point at the next batch.
// NOTE(review): the signature and return statement of the function containing these
// statements are not shown in this listing - confirm which member this belongs to.
1403 uint32_t numPrims
= PA_TESS::NumPrims();
1404 m_numPrims
-= numPrims
;
1405 m_ppIndices
[0] += numPrims
;
1406 m_ppIndices
[1] += numPrims
;
1407 m_ppIndices
[2] += numPrims
;
// Not supported by PA_TESS: unconditionally fires SWR_ASSERT(0, ...) with the function
// name. A function-local static `junk` is declared, presumably so that a reference can
// still be returned - the return statement is not shown here.
1412 SIMDVERTEX
& GetNextVsOutput()
1414 SWR_ASSERT(0, "%s", __FUNCTION__
);
1415 static SIMDVERTEX junk
;
// Not supported by PA_TESS: unconditionally fires SWR_ASSERT(0, ...) with the function
// name. NOTE(review): the returned value is not shown in this listing.
1419 bool GetNextStreamOutput()
1421 SWR_ASSERT(0, "%s", __FUNCTION__
);
// Not supported by PA_TESS: unconditionally fires SWR_ASSERT(0, ...) with the function
// name. A function-local static `junk` is declared, presumably so that a reference can
// still be returned - the return statement is not shown here.
1425 SIMDMASK
& GetNextVsIndices()
1427 SWR_ASSERT(0, "%s", __FUNCTION__
);
1428 static SIMDMASK junk
;
// Yields the smaller of the remaining primitive count (m_numPrims) and SIMD_WIDTH,
// i.e. at most one SIMD register's worth of primitives.
// NOTE(review): the enclosing signature is not shown in this listing; presumably this is
// the body of NumPrims(), which the Assemble functions call - confirm.
1434 return std::min
<uint32_t>(m_numPrims
, SIMD_WIDTH
);
// Not supported by PA_TESS: the body only fires SWR_ASSERT(0).
1437 void Reset() { SWR_ASSERT(0); };
// Returns a vector of primitive IDs: `startID` broadcast to every lane, added lane-wise
// to m_vPrimId. The 16-wide or 8-wide intrinsics are chosen by USE_SIMD16_FRONTEND.
1439 SIMDSCALARI
GetPrimID(uint32_t startID
)
1441 #if USE_SIMD16_FRONTEND
1442 return _simd16_add_epi32(_simd16_set1_epi32(startID
), m_vPrimId
);
1444 return _simd_add_epi32(_simd_set1_epi32(startID
), m_vPrimId
);
// Vertex data the Assemble functions read from; set by the constructor.
1449 const SIMDSCALAR
* m_pVertexData
= nullptr;
// Multiplied by SIMD_WIDTH to step between components, and by 4 * slot to locate an
// attribute slot within m_pVertexData.
1450 uint32_t m_attributeStrideInVectors
= 0;
// Upper bound (exclusive) for the `slot` argument of the Assemble functions.
1451 uint32_t m_numAttributes
= 0;
// Primitives still to be assembled; decremented as batches are consumed.
1452 uint32_t m_numPrims
= 0;
// One index list per vertex of a primitive (up to three); advanced as batches are consumed.
1453 uint32_t* m_ppIndices
[3];
// 1, 2 or 3, chosen from the bin topology in the constructor.
1455 uint32_t m_numVertsPerPrim
= 0;
// Per-lane offsets added to the start ID in GetPrimID; zeroed in the constructor.
1457 SIMDSCALARI m_vPrimId
;
1460 // Primitive Assembler factory class, responsible for creating and initializing the correct assembler
1462 template <typename IsIndexedT
, typename IsCutIndexEnabledT
>
// Constructs the factory and placement-constructs one of its two assemblers in member
// storage.
//   pDC      - draw context; passed to the chosen assembler and used to fetch API state.
//   in_topo  - primitive topology; stored in `topo`.
//   numVerts - vertex count for the draw.
// When KNOB_ENABLE_CUT_AWARE_PA is TRUE, PA_STATE_CUT is constructed into paCut if either
//   (a) the draw is indexed with cut index enabled and the topology is one of triangle
//       strip, point list, line list, line strip or triangle list, or
//   (b) the topology is one of the four adjacency topologies.
// The statements after that construct PA_STATE_OPT into paOpt.
// NOTE(review): brace/else/#endif lines and any assignment to cutPA are not shown in this
// listing; presumably the PA_STATE_OPT construction is the fallback branch - confirm.
1465 PA_FACTORY(DRAW_CONTEXT
* pDC
, PRIMITIVE_TOPOLOGY in_topo
, uint32_t numVerts
) : topo(in_topo
)
1467 #if KNOB_ENABLE_CUT_AWARE_PA == TRUE
1468 const API_STATE
& state
= GetApiState(pDC
);
1469 if ((IsIndexedT::value
&& IsCutIndexEnabledT::value
&& (
1470 topo
== TOP_TRIANGLE_STRIP
|| topo
== TOP_POINT_LIST
||
1471 topo
== TOP_LINE_LIST
|| topo
== TOP_LINE_STRIP
||
1472 topo
== TOP_TRIANGLE_LIST
)) ||
1474 // non-indexed draws with adjacency topologies must use cut-aware PA until we add support
1475 // for them in the optimized PA
// NOTE(review): as written, this clause does not test IsIndexedT, so it selects the
// cut-aware PA for every draw with an adjacency topology, indexed or not
1476 (topo
== TOP_LINE_LIST_ADJ
|| topo
== TOP_LISTSTRIP_ADJ
|| topo
== TOP_TRI_LIST_ADJ
|| topo
== TOP_TRI_STRIP_ADJ
))
// clear the index storage before handing it to the cut-aware assembler
1478 memset(&indexStore
, 0, sizeof(indexStore
));
1479 uint32_t numAttribs
= state
.feNumAttributes
;
// placement new: the assembler lives inside this factory object and uses
// vertexStore / indexStore as its working storage
1481 new (&this->paCut
) PA_STATE_CUT(pDC
, (uint8_t*)&this->vertexStore
[0], MAX_NUM_VERTS_PER_PRIM
* PA_STATE::SIMD_WIDTH
,
1482 &this->indexStore
[0], numVerts
, numAttribs
, state
.topology
, false);
// optimized assembler: sized by the primitive count derived from topology and vertex count
1488 uint32_t numPrims
= GetNumPrims(in_topo
, numVerts
);
1489 new (&this->paOpt
) PA_STATE_OPT(pDC
, numPrims
, (uint8_t*)&this->vertexStore
[0], MAX_NUM_VERTS_PER_PRIM
* PA_STATE::SIMD_WIDTH
, false);
1497 #if KNOB_ENABLE_CUT_AWARE_PA == TRUE
// Defaults to false. NOTE(review): presumably records that the cut-aware assembler was
// selected; the code that sets or reads it is not shown in this listing - confirm.
1511 bool cutPA
{ false };
// Topology for this draw; defaults to TOP_UNKNOWN and is set from the constructor argument.
1513 PRIMITIVE_TOPOLOGY topo
{ TOP_UNKNOWN
};
// Vertex storage whose address is handed to whichever assembler the constructor creates.
1515 PA_STATE::SIMDVERTEX vertexStore
[MAX_NUM_VERTS_PER_PRIM
];
// Index storage handed to the cut-aware assembler (zeroed by the constructor on that path).
1516 PA_STATE::SIMDMASK indexStore
[MAX_NUM_VERTS_PER_PRIM
];