1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
25 * @brief Implementation for Frontend which handles vertex processing,
26 * primitive assembly, clipping, binning, etc.
28 ******************************************************************************/
34 #include "rdtsc_core.h"
40 #include "tessellator.h"
44 //////////////////////////////////////////////////////////////////////////
45 /// @brief Helper macro to generate a bitmask
46 static INLINE
uint32_t GenMask(uint32_t numBits
)
48 SWR_ASSERT(numBits
<= (sizeof(uint32_t) * 8), "Too many bits (%d) for %s", numBits
, __FUNCTION__
);
49 return ((1U << numBits
) - 1);
52 //////////////////////////////////////////////////////////////////////////
53 /// @brief FE handler for SwrSync.
54 /// @param pContext - pointer to SWR context.
55 /// @param pDC - pointer to draw context.
56 /// @param workerId - thread's worker id. Even thread has a unique id.
57 /// @param pUserData - Pointer to user data passed back to sync callback.
58 /// @todo This should go away when we switch this to use compute threading.
60 SWR_CONTEXT
*pContext
,
67 work
.pfnWork
= ProcessSyncBE
;
69 MacroTileMgr
*pTileMgr
= pDC
->pTileMgr
;
70 pTileMgr
->enqueue(0, 0, &work
);
73 //////////////////////////////////////////////////////////////////////////
74 /// @brief FE handler for SwrDestroyContext.
75 /// @param pContext - pointer to SWR context.
76 /// @param pDC - pointer to draw context.
77 /// @param workerId - thread's worker id. Even thread has a unique id.
78 /// @param pUserData - Pointer to user data passed back to sync callback.
80 SWR_CONTEXT
*pContext
,
87 work
.pfnWork
= ProcessShutdownBE
;
89 MacroTileMgr
*pTileMgr
= pDC
->pTileMgr
;
90 // Enqueue at least 1 work item for each worker thread
91 // account for number of numa nodes
92 uint32_t numNumaNodes
= pContext
->threadPool
.numaMask
+ 1;
94 for (uint32_t i
= 0; i
< pContext
->threadPool
.numThreads
; ++i
)
96 for (uint32_t n
= 0; n
< numNumaNodes
; ++n
)
98 pTileMgr
->enqueue(i
, n
, &work
);
103 //////////////////////////////////////////////////////////////////////////
104 /// @brief FE handler for SwrClearRenderTarget.
105 /// @param pContext - pointer to SWR context.
106 /// @param pDC - pointer to draw context.
107 /// @param workerId - thread's worker id. Even thread has a unique id.
108 /// @param pUserData - Pointer to user data passed back to clear callback.
109 /// @todo This should go away when we switch this to use compute threading.
111 SWR_CONTEXT
*pContext
,
116 CLEAR_DESC
*pDesc
= (CLEAR_DESC
*)pUserData
;
117 MacroTileMgr
*pTileMgr
= pDC
->pTileMgr
;
119 // queue a clear to each macro tile
120 // compute macro tile bounds for the specified rect
121 uint32_t macroTileXMin
= pDesc
->rect
.xmin
/ KNOB_MACROTILE_X_DIM
;
122 uint32_t macroTileXMax
= (pDesc
->rect
.xmax
- 1) / KNOB_MACROTILE_X_DIM
;
123 uint32_t macroTileYMin
= pDesc
->rect
.ymin
/ KNOB_MACROTILE_Y_DIM
;
124 uint32_t macroTileYMax
= (pDesc
->rect
.ymax
- 1) / KNOB_MACROTILE_Y_DIM
;
128 work
.pfnWork
= ProcessClearBE
;
129 work
.desc
.clear
= *pDesc
;
131 for (uint32_t y
= macroTileYMin
; y
<= macroTileYMax
; ++y
)
133 for (uint32_t x
= macroTileXMin
; x
<= macroTileXMax
; ++x
)
135 pTileMgr
->enqueue(x
, y
, &work
);
140 //////////////////////////////////////////////////////////////////////////
141 /// @brief FE handler for SwrStoreTiles.
142 /// @param pContext - pointer to SWR context.
143 /// @param pDC - pointer to draw context.
144 /// @param workerId - thread's worker id. Even thread has a unique id.
145 /// @param pUserData - Pointer to user data passed back to callback.
146 /// @todo This should go away when we switch this to use compute threading.
147 void ProcessStoreTiles(
148 SWR_CONTEXT
*pContext
,
153 RDTSC_BEGIN(FEProcessStoreTiles
, pDC
->drawId
);
154 MacroTileMgr
*pTileMgr
= pDC
->pTileMgr
;
155 STORE_TILES_DESC
* pDesc
= (STORE_TILES_DESC
*)pUserData
;
157 // queue a store to each macro tile
158 // compute macro tile bounds for the specified rect
159 uint32_t macroTileXMin
= pDesc
->rect
.xmin
/ KNOB_MACROTILE_X_DIM
;
160 uint32_t macroTileXMax
= (pDesc
->rect
.xmax
- 1) / KNOB_MACROTILE_X_DIM
;
161 uint32_t macroTileYMin
= pDesc
->rect
.ymin
/ KNOB_MACROTILE_Y_DIM
;
162 uint32_t macroTileYMax
= (pDesc
->rect
.ymax
- 1) / KNOB_MACROTILE_Y_DIM
;
166 work
.type
= STORETILES
;
167 work
.pfnWork
= ProcessStoreTilesBE
;
168 work
.desc
.storeTiles
= *pDesc
;
170 for (uint32_t y
= macroTileYMin
; y
<= macroTileYMax
; ++y
)
172 for (uint32_t x
= macroTileXMin
; x
<= macroTileXMax
; ++x
)
174 pTileMgr
->enqueue(x
, y
, &work
);
178 RDTSC_END(FEProcessStoreTiles
, 0);
181 //////////////////////////////////////////////////////////////////////////
182 /// @brief FE handler for SwrInvalidateTiles.
183 /// @param pContext - pointer to SWR context.
184 /// @param pDC - pointer to draw context.
185 /// @param workerId - thread's worker id. Even thread has a unique id.
186 /// @param pUserData - Pointer to user data passed back to callback.
187 /// @todo This should go away when we switch this to use compute threading.
188 void ProcessDiscardInvalidateTiles(
189 SWR_CONTEXT
*pContext
,
194 RDTSC_BEGIN(FEProcessInvalidateTiles
, pDC
->drawId
);
195 DISCARD_INVALIDATE_TILES_DESC
*pDesc
= (DISCARD_INVALIDATE_TILES_DESC
*)pUserData
;
196 MacroTileMgr
*pTileMgr
= pDC
->pTileMgr
;
198 // compute macro tile bounds for the specified rect
199 uint32_t macroTileXMin
= (pDesc
->rect
.xmin
+ KNOB_MACROTILE_X_DIM
- 1) / KNOB_MACROTILE_X_DIM
;
200 uint32_t macroTileXMax
= (pDesc
->rect
.xmax
/ KNOB_MACROTILE_X_DIM
) - 1;
201 uint32_t macroTileYMin
= (pDesc
->rect
.ymin
+ KNOB_MACROTILE_Y_DIM
- 1) / KNOB_MACROTILE_Y_DIM
;
202 uint32_t macroTileYMax
= (pDesc
->rect
.ymax
/ KNOB_MACROTILE_Y_DIM
) - 1;
204 if (pDesc
->fullTilesOnly
== false)
206 // include partial tiles
207 macroTileXMin
= pDesc
->rect
.xmin
/ KNOB_MACROTILE_X_DIM
;
208 macroTileXMax
= (pDesc
->rect
.xmax
- 1) / KNOB_MACROTILE_X_DIM
;
209 macroTileYMin
= pDesc
->rect
.ymin
/ KNOB_MACROTILE_Y_DIM
;
210 macroTileYMax
= (pDesc
->rect
.ymax
- 1) / KNOB_MACROTILE_Y_DIM
;
213 SWR_ASSERT(macroTileXMax
<= KNOB_NUM_HOT_TILES_X
);
214 SWR_ASSERT(macroTileYMax
<= KNOB_NUM_HOT_TILES_Y
);
216 macroTileXMax
= std::min
<int32_t>(macroTileXMax
, KNOB_NUM_HOT_TILES_X
);
217 macroTileYMax
= std::min
<int32_t>(macroTileYMax
, KNOB_NUM_HOT_TILES_Y
);
221 work
.type
= DISCARDINVALIDATETILES
;
222 work
.pfnWork
= ProcessDiscardInvalidateTilesBE
;
223 work
.desc
.discardInvalidateTiles
= *pDesc
;
225 for (uint32_t x
= macroTileXMin
; x
<= macroTileXMax
; ++x
)
227 for (uint32_t y
= macroTileYMin
; y
<= macroTileYMax
; ++y
)
229 pTileMgr
->enqueue(x
, y
, &work
);
233 RDTSC_END(FEProcessInvalidateTiles
, 0);
236 //////////////////////////////////////////////////////////////////////////
237 /// @brief Computes the number of primitives given the number of verts.
238 /// @param mode - primitive topology for draw operation.
239 /// @param numPrims - number of vertices or indices for draw.
240 /// @todo Frontend needs to be refactored. This will go in appropriate place then.
241 uint32_t GetNumPrims(
242 PRIMITIVE_TOPOLOGY mode
,
247 case TOP_POINT_LIST
: return numPrims
;
248 case TOP_TRIANGLE_LIST
: return numPrims
/ 3;
249 case TOP_TRIANGLE_STRIP
: return numPrims
< 3 ? 0 : numPrims
- 2;
250 case TOP_TRIANGLE_FAN
: return numPrims
< 3 ? 0 : numPrims
- 2;
251 case TOP_TRIANGLE_DISC
: return numPrims
< 2 ? 0 : numPrims
- 1;
252 case TOP_QUAD_LIST
: return numPrims
/ 4;
253 case TOP_QUAD_STRIP
: return numPrims
< 4 ? 0 : (numPrims
- 2) / 2;
254 case TOP_LINE_STRIP
: return numPrims
< 2 ? 0 : numPrims
- 1;
255 case TOP_LINE_LIST
: return numPrims
/ 2;
256 case TOP_LINE_LOOP
: return numPrims
;
257 case TOP_RECT_LIST
: return numPrims
/ 3;
258 case TOP_LINE_LIST_ADJ
: return numPrims
/ 4;
259 case TOP_LISTSTRIP_ADJ
: return numPrims
< 3 ? 0 : numPrims
- 3;
260 case TOP_TRI_LIST_ADJ
: return numPrims
/ 6;
261 case TOP_TRI_STRIP_ADJ
: return numPrims
< 4 ? 0 : (numPrims
/ 2) - 2;
263 case TOP_PATCHLIST_1
:
264 case TOP_PATCHLIST_2
:
265 case TOP_PATCHLIST_3
:
266 case TOP_PATCHLIST_4
:
267 case TOP_PATCHLIST_5
:
268 case TOP_PATCHLIST_6
:
269 case TOP_PATCHLIST_7
:
270 case TOP_PATCHLIST_8
:
271 case TOP_PATCHLIST_9
:
272 case TOP_PATCHLIST_10
:
273 case TOP_PATCHLIST_11
:
274 case TOP_PATCHLIST_12
:
275 case TOP_PATCHLIST_13
:
276 case TOP_PATCHLIST_14
:
277 case TOP_PATCHLIST_15
:
278 case TOP_PATCHLIST_16
:
279 case TOP_PATCHLIST_17
:
280 case TOP_PATCHLIST_18
:
281 case TOP_PATCHLIST_19
:
282 case TOP_PATCHLIST_20
:
283 case TOP_PATCHLIST_21
:
284 case TOP_PATCHLIST_22
:
285 case TOP_PATCHLIST_23
:
286 case TOP_PATCHLIST_24
:
287 case TOP_PATCHLIST_25
:
288 case TOP_PATCHLIST_26
:
289 case TOP_PATCHLIST_27
:
290 case TOP_PATCHLIST_28
:
291 case TOP_PATCHLIST_29
:
292 case TOP_PATCHLIST_30
:
293 case TOP_PATCHLIST_31
:
294 case TOP_PATCHLIST_32
:
295 return numPrims
/ (mode
- TOP_PATCHLIST_BASE
);
298 case TOP_POINT_LIST_BF
:
299 case TOP_LINE_STRIP_CONT
:
300 case TOP_LINE_STRIP_BF
:
301 case TOP_LINE_STRIP_CONT_BF
:
302 case TOP_TRIANGLE_FAN_NOSTIPPLE
:
303 case TOP_TRI_STRIP_REVERSE
:
304 case TOP_PATCHLIST_BASE
:
306 SWR_INVALID("Unsupported topology: %d", mode
);
313 //////////////////////////////////////////////////////////////////////////
314 /// @brief Computes the number of verts given the number of primitives.
315 /// @param mode - primitive topology for draw operation.
316 /// @param numPrims - number of primitives for draw.
317 uint32_t GetNumVerts(
318 PRIMITIVE_TOPOLOGY mode
,
323 case TOP_POINT_LIST
: return numPrims
;
324 case TOP_TRIANGLE_LIST
: return numPrims
* 3;
325 case TOP_TRIANGLE_STRIP
: return numPrims
? numPrims
+ 2 : 0;
326 case TOP_TRIANGLE_FAN
: return numPrims
? numPrims
+ 2 : 0;
327 case TOP_TRIANGLE_DISC
: return numPrims
? numPrims
+ 1 : 0;
328 case TOP_QUAD_LIST
: return numPrims
* 4;
329 case TOP_QUAD_STRIP
: return numPrims
? numPrims
* 2 + 2 : 0;
330 case TOP_LINE_STRIP
: return numPrims
? numPrims
+ 1 : 0;
331 case TOP_LINE_LIST
: return numPrims
* 2;
332 case TOP_LINE_LOOP
: return numPrims
;
333 case TOP_RECT_LIST
: return numPrims
* 3;
334 case TOP_LINE_LIST_ADJ
: return numPrims
* 4;
335 case TOP_LISTSTRIP_ADJ
: return numPrims
? numPrims
+ 3 : 0;
336 case TOP_TRI_LIST_ADJ
: return numPrims
* 6;
337 case TOP_TRI_STRIP_ADJ
: return numPrims
? (numPrims
+ 2) * 2 : 0;
339 case TOP_PATCHLIST_1
:
340 case TOP_PATCHLIST_2
:
341 case TOP_PATCHLIST_3
:
342 case TOP_PATCHLIST_4
:
343 case TOP_PATCHLIST_5
:
344 case TOP_PATCHLIST_6
:
345 case TOP_PATCHLIST_7
:
346 case TOP_PATCHLIST_8
:
347 case TOP_PATCHLIST_9
:
348 case TOP_PATCHLIST_10
:
349 case TOP_PATCHLIST_11
:
350 case TOP_PATCHLIST_12
:
351 case TOP_PATCHLIST_13
:
352 case TOP_PATCHLIST_14
:
353 case TOP_PATCHLIST_15
:
354 case TOP_PATCHLIST_16
:
355 case TOP_PATCHLIST_17
:
356 case TOP_PATCHLIST_18
:
357 case TOP_PATCHLIST_19
:
358 case TOP_PATCHLIST_20
:
359 case TOP_PATCHLIST_21
:
360 case TOP_PATCHLIST_22
:
361 case TOP_PATCHLIST_23
:
362 case TOP_PATCHLIST_24
:
363 case TOP_PATCHLIST_25
:
364 case TOP_PATCHLIST_26
:
365 case TOP_PATCHLIST_27
:
366 case TOP_PATCHLIST_28
:
367 case TOP_PATCHLIST_29
:
368 case TOP_PATCHLIST_30
:
369 case TOP_PATCHLIST_31
:
370 case TOP_PATCHLIST_32
:
371 return numPrims
* (mode
- TOP_PATCHLIST_BASE
);
374 case TOP_POINT_LIST_BF
:
375 case TOP_LINE_STRIP_CONT
:
376 case TOP_LINE_STRIP_BF
:
377 case TOP_LINE_STRIP_CONT_BF
:
378 case TOP_TRIANGLE_FAN_NOSTIPPLE
:
379 case TOP_TRI_STRIP_REVERSE
:
380 case TOP_PATCHLIST_BASE
:
382 SWR_INVALID("Unsupported topology: %d", mode
);
389 //////////////////////////////////////////////////////////////////////////
390 /// @brief Return number of verts per primitive.
391 /// @param topology - topology
392 /// @param includeAdjVerts - include adjacent verts in primitive vertices
393 INLINE
uint32_t NumVertsPerPrim(PRIMITIVE_TOPOLOGY topology
, bool includeAdjVerts
)
395 uint32_t numVerts
= 0;
399 case TOP_POINT_LIST_BF
:
404 case TOP_LINE_LIST_ADJ
:
406 case TOP_LINE_STRIP_CONT
:
407 case TOP_LINE_STRIP_BF
:
408 case TOP_LISTSTRIP_ADJ
:
411 case TOP_TRIANGLE_LIST
:
412 case TOP_TRIANGLE_STRIP
:
413 case TOP_TRIANGLE_FAN
:
414 case TOP_TRI_LIST_ADJ
:
415 case TOP_TRI_STRIP_ADJ
:
416 case TOP_TRI_STRIP_REVERSE
:
424 case TOP_PATCHLIST_1
:
425 case TOP_PATCHLIST_2
:
426 case TOP_PATCHLIST_3
:
427 case TOP_PATCHLIST_4
:
428 case TOP_PATCHLIST_5
:
429 case TOP_PATCHLIST_6
:
430 case TOP_PATCHLIST_7
:
431 case TOP_PATCHLIST_8
:
432 case TOP_PATCHLIST_9
:
433 case TOP_PATCHLIST_10
:
434 case TOP_PATCHLIST_11
:
435 case TOP_PATCHLIST_12
:
436 case TOP_PATCHLIST_13
:
437 case TOP_PATCHLIST_14
:
438 case TOP_PATCHLIST_15
:
439 case TOP_PATCHLIST_16
:
440 case TOP_PATCHLIST_17
:
441 case TOP_PATCHLIST_18
:
442 case TOP_PATCHLIST_19
:
443 case TOP_PATCHLIST_20
:
444 case TOP_PATCHLIST_21
:
445 case TOP_PATCHLIST_22
:
446 case TOP_PATCHLIST_23
:
447 case TOP_PATCHLIST_24
:
448 case TOP_PATCHLIST_25
:
449 case TOP_PATCHLIST_26
:
450 case TOP_PATCHLIST_27
:
451 case TOP_PATCHLIST_28
:
452 case TOP_PATCHLIST_29
:
453 case TOP_PATCHLIST_30
:
454 case TOP_PATCHLIST_31
:
455 case TOP_PATCHLIST_32
:
456 numVerts
= topology
- TOP_PATCHLIST_BASE
;
459 SWR_INVALID("Unsupported topology: %d", topology
);
467 case TOP_LISTSTRIP_ADJ
:
468 case TOP_LINE_LIST_ADJ
: numVerts
= 4; break;
469 case TOP_TRI_STRIP_ADJ
:
470 case TOP_TRI_LIST_ADJ
: numVerts
= 6; break;
478 //////////////////////////////////////////////////////////////////////////
479 /// @brief Generate mask from remaining work.
480 /// @param numWorkItems - Number of items being worked on by a SIMD.
481 static INLINE simdscalari
GenerateMask(uint32_t numItemsRemaining
)
483 uint32_t numActive
= (numItemsRemaining
>= KNOB_SIMD_WIDTH
) ? KNOB_SIMD_WIDTH
: numItemsRemaining
;
484 uint32_t mask
= (numActive
> 0) ? ((1 << numActive
) - 1) : 0;
485 return _simd_castps_si(_simd_vmask_ps(mask
));
488 static INLINE simd16scalari
GenerateMask16(uint32_t numItemsRemaining
)
490 uint32_t numActive
= (numItemsRemaining
>= KNOB_SIMD16_WIDTH
) ? KNOB_SIMD16_WIDTH
: numItemsRemaining
;
491 uint32_t mask
= (numActive
> 0) ? ((1 << numActive
) - 1) : 0;
492 return _simd16_castps_si(_simd16_vmask_ps(mask
));
495 //////////////////////////////////////////////////////////////////////////
496 /// @brief StreamOut - Streams vertex data out to SO buffers.
497 /// Generally, we are only streaming out a SIMDs worth of triangles.
498 /// @param pDC - pointer to draw context.
499 /// @param workerId - thread's worker id. Even thread has a unique id.
500 /// @param numPrims - Number of prims to streamout (e.g. points, lines, tris)
501 static void StreamOut(
506 uint32_t streamIndex
)
508 RDTSC_BEGIN(FEStreamout
, pDC
->drawId
);
510 const API_STATE
& state
= GetApiState(pDC
);
511 const SWR_STREAMOUT_STATE
&soState
= state
.soState
;
513 uint32_t soVertsPerPrim
= NumVertsPerPrim(pa
.binTopology
, false);
515 // The pPrimData buffer is sparse in that we allocate memory for all 32 attributes for each vertex.
516 uint32_t primDataDwordVertexStride
= (SWR_VTX_NUM_SLOTS
* sizeof(float) * 4) / sizeof(uint32_t);
518 SWR_STREAMOUT_CONTEXT soContext
= { 0 };
520 // Setup buffer state pointers.
521 for (uint32_t i
= 0; i
< 4; ++i
)
523 soContext
.pBuffer
[i
] = &state
.soBuffer
[i
];
526 uint32_t numPrims
= pa
.NumPrims();
528 for (uint32_t primIndex
= 0; primIndex
< numPrims
; ++primIndex
)
531 uint32_t soMask
= soState
.streamMasks
[streamIndex
];
533 // Write all entries into primitive data buffer for SOS.
534 while (_BitScanForward(&slot
, soMask
))
536 simd4scalar attrib
[MAX_NUM_VERTS_PER_PRIM
]; // prim attribs (always 4 wide)
537 uint32_t paSlot
= slot
+ soState
.vertexAttribOffset
[streamIndex
];
538 pa
.AssembleSingle(paSlot
, primIndex
, attrib
);
540 // Attribute offset is relative offset from start of vertex.
541 // Note that attributes start at slot 1 in the PA buffer. We need to write this
542 // to prim data starting at slot 0. Which is why we do (slot - 1).
543 // Also note: GL works slightly differently, and needs slot 0
544 uint32_t primDataAttribOffset
= slot
* sizeof(float) * 4 / sizeof(uint32_t);
546 // Store each vertex's attrib at appropriate locations in pPrimData buffer.
547 for (uint32_t v
= 0; v
< soVertsPerPrim
; ++v
)
549 uint32_t* pPrimDataAttrib
= pPrimData
+ primDataAttribOffset
+ (v
* primDataDwordVertexStride
);
551 _mm_store_ps((float*)pPrimDataAttrib
, attrib
[v
]);
554 soMask
&= ~(1 << slot
);
557 // Update pPrimData pointer
558 soContext
.pPrimData
= pPrimData
;
561 SWR_ASSERT(state
.pfnSoFunc
[streamIndex
] != nullptr, "Trying to execute uninitialized streamout jit function.");
562 state
.pfnSoFunc
[streamIndex
](soContext
);
565 // Update SO write offset. The driver provides memory for the update.
566 for (uint32_t i
= 0; i
< 4; ++i
)
568 if (state
.soBuffer
[i
].pWriteOffset
)
570 *state
.soBuffer
[i
].pWriteOffset
= soContext
.pBuffer
[i
]->streamOffset
* sizeof(uint32_t);
573 if (state
.soBuffer
[i
].soWriteEnable
)
575 pDC
->dynState
.SoWriteOffset
[i
] = soContext
.pBuffer
[i
]->streamOffset
* sizeof(uint32_t);
576 pDC
->dynState
.SoWriteOffsetDirty
[i
] = true;
580 UPDATE_STAT_FE(SoPrimStorageNeeded
[streamIndex
], soContext
.numPrimStorageNeeded
);
581 UPDATE_STAT_FE(SoNumPrimsWritten
[streamIndex
], soContext
.numPrimsWritten
);
583 RDTSC_END(FEStreamout
, 1);
586 #if USE_SIMD16_FRONTEND
587 //////////////////////////////////////////////////////////////////////////
588 /// Is value an even number (a multiple of two)
590 template <typename T
>
591 INLINE
static bool IsEven(T value
)
593 return (value
& 1) == 0;
596 //////////////////////////////////////////////////////////////////////////
597 /// Round up value to an even number (a multiple of two)
599 template <typename T
>
600 INLINE
static T
RoundUpEven(T value
)
602 return (value
+ 1) & ~1;
605 //////////////////////////////////////////////////////////////////////////
606 /// Round down value to an even number (a multiple of two)
608 template <typename T
>
609 INLINE
static T
RoundDownEven(T value
)
614 //////////////////////////////////////////////////////////////////////////
615 /// Pack pairs of simdvertexes into simd16vertexes, assume non-overlapping
617 /// vertexCount is in terms of the source simdvertexes and must be even
619 /// attribCount will limit the vector copies to those attribs specified
621 /// note: the stride between vertexes is determinded by SWR_VTX_NUM_SLOTS
623 void PackPairsOfSimdVertexIntoSimd16Vertex(simd16vertex
*vertex_simd16
, const simdvertex
*vertex
, uint32_t vertexCount
, uint32_t attribCount
)
626 SWR_ASSERT(vertex_simd16
);
627 SWR_ASSERT(attribCount
<= SWR_VTX_NUM_SLOTS
);
631 for (uint32_t i
= 0; i
< vertexCount
; i
+= 2)
633 for (uint32_t j
= 0; j
< attribCount
; j
+= 1)
635 for (uint32_t k
= 0; k
< 4; k
+= 1)
637 temp
.attrib
[j
][k
] = _simd16_insert_ps(_simd16_setzero_ps(), vertex
[i
].attrib
[j
][k
], 0);
639 if ((i
+ 1) < vertexCount
)
641 temp
.attrib
[j
][k
] = _simd16_insert_ps(temp
.attrib
[j
][k
], vertex
[i
+ 1].attrib
[j
][k
], 1);
646 for (uint32_t j
= 0; j
< attribCount
; j
+= 1)
648 vertex_simd16
[i
>> 1].attrib
[j
] = temp
.attrib
[j
];
654 //////////////////////////////////////////////////////////////////////////
655 /// @brief Computes number of invocations. The current index represents
656 /// the start of the SIMD. The max index represents how much work
657 /// items are remaining. If there is less then a SIMD's xmin of work
658 /// then return the remaining amount of work.
659 /// @param curIndex - The start index for the SIMD.
660 /// @param maxIndex - The last index for all work items.
661 static INLINE
uint32_t GetNumInvocations(
665 uint32_t remainder
= (maxIndex
- curIndex
);
666 #if USE_SIMD16_FRONTEND
667 return (remainder
>= KNOB_SIMD16_WIDTH
) ? KNOB_SIMD16_WIDTH
: remainder
;
669 return (remainder
>= KNOB_SIMD_WIDTH
) ? KNOB_SIMD_WIDTH
: remainder
;
673 //////////////////////////////////////////////////////////////////////////
674 /// @brief Converts a streamId buffer to a cut buffer for the given stream id.
675 /// The geometry shader will loop over each active streamout buffer, assembling
676 /// primitives for the downstream stages. When multistream output is enabled,
677 /// the generated stream ID buffer from the GS needs to be converted to a cut
678 /// buffer for the primitive assembler.
679 /// @param stream - stream id to generate the cut buffer for
680 /// @param pStreamIdBase - pointer to the stream ID buffer
681 /// @param numEmittedVerts - Number of total verts emitted by the GS
682 /// @param pCutBuffer - output buffer to write cuts to
683 void ProcessStreamIdBuffer(uint32_t stream
, uint8_t* pStreamIdBase
, uint32_t numEmittedVerts
, uint8_t *pCutBuffer
)
685 SWR_ASSERT(stream
< MAX_SO_STREAMS
);
687 uint32_t numInputBytes
= (numEmittedVerts
* 2 + 7) / 8;
688 uint32_t numOutputBytes
= std::max(numInputBytes
/ 2, 1U);
690 for (uint32_t b
= 0; b
< numOutputBytes
; ++b
)
692 uint8_t curInputByte
= pStreamIdBase
[2*b
];
694 for (uint32_t i
= 0; i
< 4; ++i
)
696 if ((curInputByte
& 0x3) != stream
)
703 curInputByte
= pStreamIdBase
[2 * b
+ 1];
704 for (uint32_t i
= 0; i
< 4; ++i
)
706 if ((curInputByte
& 0x3) != stream
)
708 outByte
|= (1 << (i
+ 4));
713 *pCutBuffer
++ = outByte
;
717 // Buffers that are allocated if GS is enabled
721 uint8_t* pGsOut
[KNOB_SIMD_WIDTH
];
722 uint8_t* pGsTransposed
;
723 void* pStreamCutBuffer
;
726 //////////////////////////////////////////////////////////////////////////
727 /// @brief Transposes GS output from SOA to AOS to feed the primitive assembler
728 /// @param pDst - Destination buffer in AOS form for the current SIMD width, fed into the primitive assembler
729 /// @param pSrc - Buffer of vertices in SOA form written by the geometry shader
730 /// @param numVerts - Number of vertices outputted by the GS
731 /// @param numAttribs - Number of attributes per vertex
732 template<typename SIMD_T
, uint32_t SimdWidth
>
733 void TransposeSOAtoAOS(uint8_t* pDst
, uint8_t* pSrc
, uint32_t numVerts
, uint32_t numAttribs
)
735 uint32_t srcVertexStride
= numAttribs
* sizeof(float) * 4;
736 uint32_t dstVertexStride
= numAttribs
* sizeof(Float
<SIMD_T
>) * 4;
738 OSALIGNSIMD16(uint32_t) gatherOffsets
[SimdWidth
];
740 for (uint32_t i
= 0; i
< SimdWidth
; ++i
)
742 gatherOffsets
[i
] = srcVertexStride
* i
;
744 auto vGatherOffsets
= SIMD_T::load_si((Integer
<SIMD_T
>*)&gatherOffsets
[0]);
746 uint32_t numSimd
= AlignUp(numVerts
, SimdWidth
) / SimdWidth
;
747 uint32_t remainingVerts
= numVerts
;
749 for (uint32_t s
= 0; s
< numSimd
; ++s
)
751 uint8_t* pSrcBase
= pSrc
+ s
* srcVertexStride
* SimdWidth
;
752 uint8_t* pDstBase
= pDst
+ s
* dstVertexStride
;
754 // Compute mask to prevent src overflow
755 uint32_t mask
= std::min(remainingVerts
, SimdWidth
);
756 mask
= GenMask(mask
);
757 auto vMask
= SIMD_T::vmask_ps(mask
);
758 auto viMask
= SIMD_T::castps_si(vMask
);
760 for (uint32_t a
= 0; a
< numAttribs
; ++a
)
762 auto attribGatherX
= SIMD_T::template mask_i32gather_ps
<ScaleFactor
<SIMD_T
>(1)>(SIMD_T::setzero_ps(), (const float*)pSrcBase
, vGatherOffsets
, vMask
);
763 auto attribGatherY
= SIMD_T::template mask_i32gather_ps
<ScaleFactor
<SIMD_T
>(1)>(SIMD_T::setzero_ps(), (const float*)(pSrcBase
+ sizeof(float)), vGatherOffsets
, vMask
);
764 auto attribGatherZ
= SIMD_T::template mask_i32gather_ps
<ScaleFactor
<SIMD_T
>(1)>(SIMD_T::setzero_ps(), (const float*)(pSrcBase
+ sizeof(float) * 2), vGatherOffsets
, vMask
);
765 auto attribGatherW
= SIMD_T::template mask_i32gather_ps
<ScaleFactor
<SIMD_T
>(1)>(SIMD_T::setzero_ps(), (const float*)(pSrcBase
+ sizeof(float) * 3), vGatherOffsets
, vMask
);
767 SIMD_T::maskstore_ps((float*)pDstBase
, viMask
, attribGatherX
);
768 SIMD_T::maskstore_ps((float*)(pDstBase
+ sizeof(Float
<SIMD_T
>)), viMask
, attribGatherY
);
769 SIMD_T::maskstore_ps((float*)(pDstBase
+ sizeof(Float
<SIMD_T
>) * 2), viMask
, attribGatherZ
);
770 SIMD_T::maskstore_ps((float*)(pDstBase
+ sizeof(Float
<SIMD_T
>) * 3), viMask
, attribGatherW
);
772 pSrcBase
+= sizeof(float) * 4;
773 pDstBase
+= sizeof(Float
<SIMD_T
>) * 4;
775 remainingVerts
-= SimdWidth
;
780 //////////////////////////////////////////////////////////////////////////
781 /// @brief Implements GS stage.
782 /// @param pDC - pointer to draw context.
783 /// @param workerId - thread's worker id. Even thread has a unique id.
784 /// @param pa - The primitive assembly object.
785 /// @param pGsOut - output stream for GS
787 typename HasStreamOutT
,
789 static void GeometryShaderStage(
793 GsBuffers
* pGsBuffers
,
794 uint32_t* pSoPrimData
,
795 #if USE_SIMD16_FRONTEND
796 uint32_t numPrims_simd8
,
798 simdscalari
const &primID
)
800 RDTSC_BEGIN(FEGeometryShader
, pDC
->drawId
);
802 const API_STATE
& state
= GetApiState(pDC
);
803 const SWR_GS_STATE
* pState
= &state
.gsState
;
804 SWR_GS_CONTEXT gsContext
;
806 static uint8_t sNullBuffer
[128] = { 0 };
808 for (uint32_t i
= 0; i
< KNOB_SIMD_WIDTH
; ++i
)
810 gsContext
.pStreams
[i
] = pGsBuffers
->pGsOut
[i
];
812 gsContext
.pVerts
= (simdvector
*)pGsBuffers
->pGsIn
;
813 gsContext
.PrimitiveID
= primID
;
815 uint32_t numVertsPerPrim
= NumVertsPerPrim(pa
.binTopology
, true);
816 simdvector attrib
[MAX_NUM_VERTS_PER_PRIM
];
818 // assemble all attributes for the input primitive
819 gsContext
.inputVertStride
= pState
->inputVertStride
;
820 for (uint32_t slot
= 0; slot
< pState
->numInputAttribs
; ++slot
)
822 uint32_t srcAttribSlot
= pState
->srcVertexAttribOffset
+ slot
;
823 uint32_t attribSlot
= pState
->vertexAttribOffset
+ slot
;
824 pa
.Assemble(srcAttribSlot
, attrib
);
826 for (uint32_t i
= 0; i
< numVertsPerPrim
; ++i
)
828 gsContext
.pVerts
[attribSlot
+ pState
->inputVertStride
* i
] = attrib
[i
];
833 pa
.Assemble(VERTEX_POSITION_SLOT
, attrib
);
834 for (uint32_t i
= 0; i
< numVertsPerPrim
; ++i
)
836 gsContext
.pVerts
[VERTEX_POSITION_SLOT
+ pState
->inputVertStride
* i
] = attrib
[i
];
839 // record valid prims from the frontend to avoid over binning the newly generated
841 #if USE_SIMD16_FRONTEND
842 uint32_t numInputPrims
= numPrims_simd8
;
844 uint32_t numInputPrims
= pa
.NumPrims();
847 for (uint32_t instance
= 0; instance
< pState
->instanceCount
; ++instance
)
849 gsContext
.InstanceID
= instance
;
850 gsContext
.mask
= GenerateMask(numInputPrims
);
852 // execute the geometry shader
853 state
.pfnGsFunc(GetPrivateState(pDC
), &gsContext
);
854 AR_EVENT(GSStats(gsContext
.stats
.numInstExecuted
));
856 for (uint32_t i
= 0; i
< KNOB_SIMD_WIDTH
; ++i
)
858 gsContext
.pStreams
[i
] += pState
->allocationSize
;
862 // set up new binner and state for the GS output topology
863 #if USE_SIMD16_FRONTEND
864 PFN_PROCESS_PRIMS_SIMD16 pfnClipFunc
= nullptr;
867 switch (pState
->outputTopology
)
869 case TOP_TRIANGLE_STRIP
: pfnClipFunc
= ClipTriangles_simd16
; break;
870 case TOP_LINE_STRIP
: pfnClipFunc
= ClipLines_simd16
; break;
871 case TOP_POINT_LIST
: pfnClipFunc
= ClipPoints_simd16
; break;
872 default: SWR_INVALID("Unexpected GS output topology: %d", pState
->outputTopology
);
877 PFN_PROCESS_PRIMS pfnClipFunc
= nullptr;
880 switch (pState
->outputTopology
)
882 case TOP_TRIANGLE_STRIP
: pfnClipFunc
= ClipTriangles
; break;
883 case TOP_LINE_STRIP
: pfnClipFunc
= ClipLines
; break;
884 case TOP_POINT_LIST
: pfnClipFunc
= ClipPoints
; break;
885 default: SWR_INVALID("Unexpected GS output topology: %d", pState
->outputTopology
);
890 // foreach input prim:
891 // - setup a new PA based on the emitted verts for that prim
892 // - loop over the new verts, calling PA to assemble each prim
893 uint32_t* pPrimitiveId
= (uint32_t*)&primID
;
895 uint32_t totalPrimsGenerated
= 0;
896 for (uint32_t inputPrim
= 0; inputPrim
< numInputPrims
; ++inputPrim
)
898 uint8_t* pInstanceBase
= (uint8_t*)pGsBuffers
->pGsOut
[inputPrim
];
900 // Vertex count is either emitted by shader or static
901 uint32_t vertexCount
= 0;
902 if (pState
->staticVertexCount
)
904 vertexCount
= pState
->staticVertexCount
;
908 // If emitted in shader, it should be the stored in the first dword of the output buffer
909 vertexCount
= *(uint32_t*)pInstanceBase
;
912 for (uint32_t instance
= 0; instance
< pState
->instanceCount
; ++instance
)
914 uint32_t numEmittedVerts
= vertexCount
;
915 if (numEmittedVerts
== 0)
920 uint8_t* pBase
= pInstanceBase
+ instance
* pState
->allocationSize
;
921 uint8_t* pCutBase
= pState
->controlDataSize
== 0 ? &sNullBuffer
[0] : pBase
+ pState
->controlDataOffset
;
922 uint8_t* pVertexBaseAOS
= pBase
+ pState
->outputVertexOffset
;
924 #if USE_SIMD16_FRONTEND
925 TransposeSOAtoAOS
<SIMD512
, KNOB_SIMD16_WIDTH
>((uint8_t*)pGsBuffers
->pGsTransposed
, pVertexBaseAOS
, vertexCount
, pState
->outputVertexSize
);
927 TransposeSOAtoAOS
<SIMD256
, KNOB_SIMD_WIDTH
>((uint8_t*)pGsBuffers
->pGsTransposed
, pVertexBaseAOS
, vertexCount
, pState
->outputVertexSize
);
930 uint32_t numAttribs
= state
.feNumAttributes
;
932 for (uint32_t stream
= 0; stream
< MAX_SO_STREAMS
; ++stream
)
934 bool processCutVerts
= false;
935 uint8_t* pCutBuffer
= pCutBase
;
937 // assign default stream ID, only relevant when GS is outputting a single stream
938 uint32_t streamID
= 0;
939 if (pState
->isSingleStream
)
941 processCutVerts
= true;
942 streamID
= pState
->singleStreamID
;
943 if (streamID
!= stream
) continue;
947 // early exit if this stream is not enabled for streamout
948 if (HasStreamOutT::value
&& !state
.soState
.streamEnable
[stream
])
953 // multi-stream output, need to translate StreamID buffer to a cut buffer
954 ProcessStreamIdBuffer(stream
, pCutBase
, numEmittedVerts
, (uint8_t*)pGsBuffers
->pStreamCutBuffer
);
955 pCutBuffer
= (uint8_t*)pGsBuffers
->pStreamCutBuffer
;
956 processCutVerts
= false;
959 #if USE_SIMD16_FRONTEND
960 PA_STATE_CUT
gsPa(pDC
, (uint8_t*)pGsBuffers
->pGsTransposed
, numEmittedVerts
, pState
->outputVertexSize
, reinterpret_cast<simd16mask
*>(pCutBuffer
), numEmittedVerts
, numAttribs
, pState
->outputTopology
, processCutVerts
, pa
.numVertsPerPrim
);
963 PA_STATE_CUT
gsPa(pDC
, (uint8_t*)pGsBuffers
->pGsTransposed
, numEmittedVerts
, pState
->outputVertexSize
, pCutBuffer
, numEmittedVerts
, numAttribs
, pState
->outputTopology
, processCutVerts
, pa
.numVertsPerPrim
);
966 while (gsPa
.GetNextStreamOutput())
970 #if USE_SIMD16_FRONTEND
971 simd16vector attrib_simd16
[3];
973 bool assemble
= gsPa
.Assemble(VERTEX_POSITION_SLOT
, attrib_simd16
);
976 bool assemble
= gsPa
.Assemble(VERTEX_POSITION_SLOT
, attrib
);
981 totalPrimsGenerated
+= gsPa
.NumPrims();
983 if (HasStreamOutT::value
)
985 #if ENABLE_AVX512_SIMD16
986 gsPa
.useAlternateOffset
= false;
988 StreamOut(pDC
, gsPa
, workerId
, pSoPrimData
, stream
);
991 if (HasRastT::value
&& state
.soState
.streamToRasterizer
== stream
)
993 #if USE_SIMD16_FRONTEND
994 simd16scalari vPrimId
= _simd16_set1_epi32(pPrimitiveId
[inputPrim
]);
996 // Gather data from the SVG if provided.
997 simd16scalari vViewportIdx
= SIMD16::setzero_si();
998 simd16scalari vRtIdx
= SIMD16::setzero_si();
999 SIMD16::Vec4 svgAttrib
[4];
1001 if (state
.backendState
.readViewportArrayIndex
|| state
.backendState
.readRenderTargetArrayIndex
)
1003 gsPa
.Assemble(VERTEX_SGV_SLOT
, svgAttrib
);
1007 if (state
.backendState
.readViewportArrayIndex
)
1009 vViewportIdx
= SIMD16::castps_si(svgAttrib
[0][VERTEX_SGV_VAI_COMP
]);
1010 gsPa
.viewportArrayActive
= true;
1012 if (state
.backendState
.readRenderTargetArrayIndex
)
1014 vRtIdx
= SIMD16::castps_si(svgAttrib
[0][VERTEX_SGV_RTAI_COMP
]);
1015 gsPa
.rtArrayActive
= true;
1019 // OOB VPAI indices => forced to zero.
1020 vViewportIdx
= SIMD16::max_epi32(vViewportIdx
, SIMD16::setzero_si());
1021 simd16scalari vNumViewports
= SIMD16::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS
);
1022 simd16scalari vClearMask
= SIMD16::cmplt_epi32(vViewportIdx
, vNumViewports
);
1023 vViewportIdx
= SIMD16::and_si(vClearMask
, vViewportIdx
);
1025 gsPa
.useAlternateOffset
= false;
1026 pfnClipFunc(pDC
, gsPa
, workerId
, attrib_simd16
, GenMask(gsPa
.NumPrims()), vPrimId
, vViewportIdx
, vRtIdx
);
1029 simdscalari vPrimId
= _simd_set1_epi32(pPrimitiveId
[inputPrim
]);
1031 // Gather data from the SVG if provided.
1032 simdscalari vViewportIdx
= SIMD::setzero_si();
1033 simdscalari vRtIdx
= SIMD::setzero_si();
1034 SIMD::Vec4 svgAttrib
[4];
1036 if (state
.backendState
.readViewportArrayIndex
|| state
.backendState
.readRenderTargetArrayIndex
)
1038 gsPa
.Assemble(VERTEX_SGV_SLOT
, svgAttrib
);
1042 if (state
.backendState
.readViewportArrayIndex
)
1044 vViewportIdx
= SIMD::castps_si(svgAttrib
[0][VERTEX_SGV_VAI_COMP
]);
1046 // OOB VPAI indices => forced to zero.
1047 vViewportIdx
= SIMD::max_epi32(vViewportIdx
, SIMD::setzero_si());
1048 simdscalari vNumViewports
= SIMD::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS
);
1049 simdscalari vClearMask
= SIMD::cmplt_epi32(vViewportIdx
, vNumViewports
);
1050 vViewportIdx
= SIMD::and_si(vClearMask
, vViewportIdx
);
1051 gsPa
.viewportArrayActive
= true;
1053 if (state
.backendState
.readRenderTargetArrayIndex
)
1055 vRtIdx
= SIMD::castps_si(svgAttrib
[0][VERTEX_SGV_RTAI_COMP
]);
1056 gsPa
.rtArrayActive
= true;
1059 pfnClipFunc(pDC
, gsPa
, workerId
, attrib
, GenMask(gsPa
.NumPrims()), vPrimId
, vViewportIdx
, vRtIdx
);
1063 } while (gsPa
.NextPrim());
1069 // update GS pipeline stats
1070 UPDATE_STAT_FE(GsInvocations
, numInputPrims
* pState
->instanceCount
);
1071 UPDATE_STAT_FE(GsPrimitives
, totalPrimsGenerated
);
1072 AR_EVENT(GSPrimInfo(numInputPrims
, totalPrimsGenerated
, numVertsPerPrim
*numInputPrims
));
1073 RDTSC_END(FEGeometryShader
, 1);
1076 //////////////////////////////////////////////////////////////////////////
1077 /// @brief Allocate GS buffers
1078 /// @param pDC - pointer to draw context.
1079 /// @param state - API state
1080 /// @param ppGsOut - pointer to GS output buffer allocation
1081 /// @param ppCutBuffer - pointer to GS output cut buffer allocation
1082 template<typename SIMD_T
, uint32_t SIMD_WIDTH
>
1083 static INLINE
void AllocateGsBuffers(DRAW_CONTEXT
* pDC
, const API_STATE
& state
, uint32_t vertsPerPrim
, GsBuffers
* pGsBuffers
)
1085 auto pArena
= pDC
->pArena
;
1086 SWR_ASSERT(pArena
!= nullptr);
1087 SWR_ASSERT(state
.gsState
.gsEnable
);
1089 const SWR_GS_STATE
& gsState
= state
.gsState
;
1091 // Allocate storage for vertex inputs
1092 uint32_t vertexInBufferSize
= gsState
.inputVertStride
* sizeof(simdvector
) * vertsPerPrim
;
1093 pGsBuffers
->pGsIn
= (uint8_t*)pArena
->AllocAligned(vertexInBufferSize
, 32);
1095 // Allocate arena space to hold GS output verts
1096 const uint32_t vertexBufferSize
= gsState
.instanceCount
* gsState
.allocationSize
;
1098 for (uint32_t i
= 0; i
< KNOB_SIMD_WIDTH
; ++i
)
1100 pGsBuffers
->pGsOut
[i
] = (uint8_t*)pArena
->AllocAligned(vertexBufferSize
, 32);
1103 // Allocate storage for transposed GS output
1104 uint32_t numSimdBatches
= AlignUp(gsState
.maxNumVerts
, SIMD_WIDTH
) / SIMD_WIDTH
;
1105 uint32_t transposedBufferSize
= numSimdBatches
* gsState
.outputVertexSize
* sizeof(Vec4
<SIMD_T
>);
1106 pGsBuffers
->pGsTransposed
= (uint8_t*)pArena
->AllocAligned(transposedBufferSize
, 32);
1108 // Allocate storage to hold temporary stream->cut buffer, if necessary
1109 if (state
.gsState
.isSingleStream
)
1111 pGsBuffers
->pStreamCutBuffer
= nullptr;
1115 pGsBuffers
->pStreamCutBuffer
= (uint8_t*)pArena
->AllocAligned(AlignUp(gsState
.maxNumVerts
* 2, 32), 32);
1119 //////////////////////////////////////////////////////////////////////////
1120 /// @brief Contains all data generated by the HS and passed to the
1121 /// tessellator and DS.  Lives in thread-local storage (see
1121 /// AllocateTessellationData below) and is reused across draws.
1122 struct TessellationThreadLocalData
// Hull-shader execution context reused by TessellationStages each draw.
1124 SWR_HS_CONTEXT hsContext
;
// One scalar control-point patch per SIMD lane of HS output.
1125 ScalarPatch patchData
[KNOB_SIMD_WIDTH
];
// NOTE(review): TessellationStages also dereferences pTxCtx / tsCtxSize
// members of this struct; their declarations appear elided from this
// excerpt — confirm against the full file.
// Scratch buffer for domain-shader output; grown on demand inside
// TessellationStages and freed/reset at the end of the draw.
1129 simdscalar
* pDSOutput
;
// Current byte size of the pDSOutput allocation.
1130 size_t dsOutputAllocSize
;
// Per-worker-thread instance; lazily allocated, zero-initialized.
1133 THREAD TessellationThreadLocalData
* gt_pTessellationThreadData
= nullptr;
1135 //////////////////////////////////////////////////////////////////////////
1136 /// @brief Allocate tessellation data for this worker thread.
1138 static void AllocateTessellationData(SWR_CONTEXT
* pContext
)
1140 /// @TODO - Don't use thread local storage. Use Worker local storage instead.
1141 if (gt_pTessellationThreadData
== nullptr)
1143 gt_pTessellationThreadData
= (TessellationThreadLocalData
*)
1144 AlignedMalloc(sizeof(TessellationThreadLocalData
), 64);
1145 memset(gt_pTessellationThreadData
, 0, sizeof(*gt_pTessellationThreadData
));
1149 //////////////////////////////////////////////////////////////////////////
1150 /// @brief Implements Tessellation Stages.
1151 /// @param pDC - pointer to draw context.
1152 /// @param workerId - thread's worker id. Even thread has a unique id.
1153 /// @param pa - The primitive assembly object.
1154 /// @param pGsOut - output stream for GS
1156 typename HasGeometryShaderT
,
1157 typename HasStreamOutT
,
1159 static void TessellationStages(
1163 GsBuffers
* pGsBuffers
,
1164 uint32_t* pSoPrimData
,
1165 #if USE_SIMD16_FRONTEND
1166 uint32_t numPrims_simd8
,
1168 simdscalari
const &primID
)
1170 const API_STATE
& state
= GetApiState(pDC
);
1171 const SWR_TS_STATE
& tsState
= state
.tsState
;
1173 SWR_ASSERT(gt_pTessellationThreadData
);
1175 HANDLE tsCtx
= TSInitCtx(
1177 tsState
.partitioning
,
1178 tsState
.tsOutputTopology
,
1179 gt_pTessellationThreadData
->pTxCtx
,
1180 gt_pTessellationThreadData
->tsCtxSize
);
1181 if (tsCtx
== nullptr)
1183 gt_pTessellationThreadData
->pTxCtx
= AlignedMalloc(gt_pTessellationThreadData
->tsCtxSize
, 64);
1186 tsState
.partitioning
,
1187 tsState
.tsOutputTopology
,
1188 gt_pTessellationThreadData
->pTxCtx
,
1189 gt_pTessellationThreadData
->tsCtxSize
);
1193 #if USE_SIMD16_FRONTEND
1194 PFN_PROCESS_PRIMS_SIMD16 pfnClipFunc
= nullptr;
1195 if (HasRastT::value
)
1197 switch (tsState
.postDSTopology
)
1199 case TOP_TRIANGLE_LIST
: pfnClipFunc
= ClipTriangles_simd16
; break;
1200 case TOP_LINE_LIST
: pfnClipFunc
= ClipLines_simd16
; break;
1201 case TOP_POINT_LIST
: pfnClipFunc
= ClipPoints_simd16
; break;
1202 default: SWR_INVALID("Unexpected DS output topology: %d", tsState
.postDSTopology
);
1207 PFN_PROCESS_PRIMS pfnClipFunc
= nullptr;
1208 if (HasRastT::value
)
1210 switch (tsState
.postDSTopology
)
1212 case TOP_TRIANGLE_LIST
: pfnClipFunc
= ClipTriangles
; break;
1213 case TOP_LINE_LIST
: pfnClipFunc
= ClipLines
; break;
1214 case TOP_POINT_LIST
: pfnClipFunc
= ClipPoints
; break;
1215 default: SWR_INVALID("Unexpected DS output topology: %d", tsState
.postDSTopology
);
1220 SWR_HS_CONTEXT
& hsContext
= gt_pTessellationThreadData
->hsContext
;
1221 hsContext
.pCPout
= gt_pTessellationThreadData
->patchData
;
1222 hsContext
.PrimitiveID
= primID
;
1224 uint32_t numVertsPerPrim
= NumVertsPerPrim(pa
.binTopology
, false);
1225 // Max storage for one attribute for an entire simdprimitive
1226 simdvector simdattrib
[MAX_NUM_VERTS_PER_PRIM
];
1228 // assemble all attributes for the input primitives
1229 for (uint32_t slot
= 0; slot
< tsState
.numHsInputAttribs
; ++slot
)
1231 uint32_t attribSlot
= tsState
.vertexAttribOffset
+ slot
;
1232 pa
.Assemble(attribSlot
, simdattrib
);
1234 for (uint32_t i
= 0; i
< numVertsPerPrim
; ++i
)
1236 hsContext
.vert
[i
].attrib
[VERTEX_ATTRIB_START_SLOT
+ slot
] = simdattrib
[i
];
1241 memset(hsContext
.pCPout
, 0x90, sizeof(ScalarPatch
) * KNOB_SIMD_WIDTH
);
1244 #if USE_SIMD16_FRONTEND
1245 uint32_t numPrims
= numPrims_simd8
;
1247 uint32_t numPrims
= pa
.NumPrims();
1249 hsContext
.mask
= GenerateMask(numPrims
);
1252 RDTSC_BEGIN(FEHullShader
, pDC
->drawId
);
1253 state
.pfnHsFunc(GetPrivateState(pDC
), &hsContext
);
1254 RDTSC_END(FEHullShader
, 0);
1256 UPDATE_STAT_FE(HsInvocations
, numPrims
);
1257 AR_EVENT(HSStats(hsContext
.stats
.numInstExecuted
));
1259 const uint32_t* pPrimId
= (const uint32_t*)&primID
;
1261 for (uint32_t p
= 0; p
< numPrims
; ++p
)
1264 SWR_TS_TESSELLATED_DATA tsData
= { 0 };
1265 RDTSC_BEGIN(FETessellation
, pDC
->drawId
);
1266 TSTessellate(tsCtx
, hsContext
.pCPout
[p
].tessFactors
, tsData
);
1267 AR_EVENT(TessPrimCount(1));
1268 RDTSC_END(FETessellation
, 0);
1270 if (tsData
.NumPrimitives
== 0)
1274 SWR_ASSERT(tsData
.NumDomainPoints
);
1276 // Allocate DS Output memory
1277 uint32_t requiredDSVectorInvocations
= AlignUp(tsData
.NumDomainPoints
, KNOB_SIMD_WIDTH
) / KNOB_SIMD_WIDTH
;
1278 #if USE_SIMD16_FRONTEND
1279 size_t requiredAllocSize
= sizeof(simdvector
) * RoundUpEven(requiredDSVectorInvocations
) * tsState
.dsAllocationSize
; // simd8 -> simd16, padding
1281 size_t requiredDSOutputVectors
= requiredDSVectorInvocations
* tsState
.dsAllocationSize
;
1282 size_t requiredAllocSize
= sizeof(simdvector
) * requiredDSOutputVectors
;
1284 if (requiredAllocSize
> gt_pTessellationThreadData
->dsOutputAllocSize
)
1286 AlignedFree(gt_pTessellationThreadData
->pDSOutput
);
1287 gt_pTessellationThreadData
->pDSOutput
= (simdscalar
*)AlignedMalloc(requiredAllocSize
, 64);
1288 gt_pTessellationThreadData
->dsOutputAllocSize
= requiredAllocSize
;
1290 SWR_ASSERT(gt_pTessellationThreadData
->pDSOutput
);
1291 SWR_ASSERT(gt_pTessellationThreadData
->dsOutputAllocSize
>= requiredAllocSize
);
1294 memset(gt_pTessellationThreadData
->pDSOutput
, 0x90, requiredAllocSize
);
1297 // Run Domain Shader
1298 SWR_DS_CONTEXT dsContext
;
1299 dsContext
.PrimitiveID
= pPrimId
[p
];
1300 dsContext
.pCpIn
= &hsContext
.pCPout
[p
];
1301 dsContext
.pDomainU
= (simdscalar
*)tsData
.pDomainPointsU
;
1302 dsContext
.pDomainV
= (simdscalar
*)tsData
.pDomainPointsV
;
1303 dsContext
.pOutputData
= gt_pTessellationThreadData
->pDSOutput
;
1304 dsContext
.outVertexAttribOffset
= tsState
.dsOutVtxAttribOffset
;
1305 #if USE_SIMD16_FRONTEND
1306 dsContext
.vectorStride
= RoundUpEven(requiredDSVectorInvocations
); // simd8 -> simd16
1308 dsContext
.vectorStride
= requiredDSVectorInvocations
;
1311 uint32_t dsInvocations
= 0;
1313 for (dsContext
.vectorOffset
= 0; dsContext
.vectorOffset
< requiredDSVectorInvocations
; ++dsContext
.vectorOffset
)
1315 dsContext
.mask
= GenerateMask(tsData
.NumDomainPoints
- dsInvocations
);
1317 RDTSC_BEGIN(FEDomainShader
, pDC
->drawId
);
1318 state
.pfnDsFunc(GetPrivateState(pDC
), &dsContext
);
1319 RDTSC_END(FEDomainShader
, 0);
1321 AR_EVENT(DSStats(dsContext
.stats
.numInstExecuted
));
1323 dsInvocations
+= KNOB_SIMD_WIDTH
;
1325 UPDATE_STAT_FE(DsInvocations
, tsData
.NumDomainPoints
);
1327 #if USE_SIMD16_FRONTEND
1328 SWR_ASSERT(IsEven(dsContext
.vectorStride
)); // simd8 -> simd16
1333 #if USE_SIMD16_FRONTEND
1334 reinterpret_cast<const simd16scalar
*>(dsContext
.pOutputData
), // simd8 -> simd16
1335 dsContext
.vectorStride
/ 2, // simd8 -> simd16
1337 dsContext
.pOutputData
,
1338 dsContext
.vectorStride
,
1341 tsState
.numDsOutputAttribs
+ tsState
.dsOutVtxAttribOffset
,
1343 tsData
.NumPrimitives
,
1344 tsState
.postDSTopology
,
1345 NumVertsPerPrim(tsState
.postDSTopology
, false));
1347 while (tessPa
.HasWork())
1349 #if USE_SIMD16_FRONTEND
1350 const uint32_t numPrims
= tessPa
.NumPrims();
1351 const uint32_t numPrims_lo
= std::min
<uint32_t>(numPrims
, KNOB_SIMD_WIDTH
);
1352 const uint32_t numPrims_hi
= std::max
<uint32_t>(numPrims
, KNOB_SIMD_WIDTH
) - KNOB_SIMD_WIDTH
;
1354 const simd16scalari primID
= _simd16_set1_epi32(dsContext
.PrimitiveID
);
1355 const simdscalari primID_lo
= _simd16_extract_si(primID
, 0);
1356 const simdscalari primID_hi
= _simd16_extract_si(primID
, 1);
1359 if (HasGeometryShaderT::value
)
1361 #if USE_SIMD16_FRONTEND
1362 tessPa
.useAlternateOffset
= false;
1363 GeometryShaderStage
<HasStreamOutT
, HasRastT
>(pDC
, workerId
, tessPa
, pGsBuffers
, pSoPrimData
, numPrims_lo
, primID_lo
);
1367 tessPa
.useAlternateOffset
= true;
1368 GeometryShaderStage
<HasStreamOutT
, HasRastT
>(pDC
, workerId
, tessPa
, pGsBuffers
, pSoPrimData
, numPrims_hi
, primID_hi
);
1371 GeometryShaderStage
<HasStreamOutT
, HasRastT
>(
1372 pDC
, workerId
, tessPa
, pGsBuffers
, pSoPrimData
, _simd_set1_epi32(dsContext
.PrimitiveID
));
1377 if (HasStreamOutT::value
)
1379 #if ENABLE_AVX512_SIMD16
1380 tessPa
.useAlternateOffset
= false;
1382 StreamOut(pDC
, tessPa
, workerId
, pSoPrimData
, 0);
1385 if (HasRastT::value
)
1387 #if USE_SIMD16_FRONTEND
1388 simd16vector prim_simd16
[3]; // Only deal with triangles, lines, or points
1390 simdvector prim
[3]; // Only deal with triangles, lines, or points
1392 RDTSC_BEGIN(FEPAAssemble
, pDC
->drawId
);
1394 #if USE_SIMD16_FRONTEND
1395 tessPa
.Assemble(VERTEX_POSITION_SLOT
, prim_simd16
);
1397 tessPa
.Assemble(VERTEX_POSITION_SLOT
, prim
);
1399 RDTSC_END(FEPAAssemble
, 1);
1400 SWR_ASSERT(assemble
);
1402 SWR_ASSERT(pfnClipFunc
);
1403 #if USE_SIMD16_FRONTEND
1404 // Gather data from the SGV if provided.
1405 simd16scalari vViewportIdx
= SIMD16::setzero_si();
1406 simd16scalari vRtIdx
= SIMD16::setzero_si();
1407 SIMD16::Vec4 svgAttrib
[4];
1409 if (state
.backendState
.readViewportArrayIndex
|| state
.backendState
.readRenderTargetArrayIndex
)
1411 tessPa
.Assemble(VERTEX_SGV_SLOT
, svgAttrib
);
1415 if (state
.backendState
.readViewportArrayIndex
)
1417 vViewportIdx
= SIMD16::castps_si(svgAttrib
[0][VERTEX_SGV_VAI_COMP
]);
1418 tessPa
.viewportArrayActive
= true;
1420 if (state
.backendState
.readRenderTargetArrayIndex
)
1422 vRtIdx
= SIMD16::castps_si(svgAttrib
[0][VERTEX_SGV_RTAI_COMP
]);
1423 tessPa
.rtArrayActive
= true;
1428 // OOB VPAI indices => forced to zero.
1429 vViewportIdx
= SIMD16::max_epi32(vViewportIdx
, SIMD16::setzero_si());
1430 simd16scalari vNumViewports
= SIMD16::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS
);
1431 simd16scalari vClearMask
= SIMD16::cmplt_epi32(vViewportIdx
, vNumViewports
);
1432 vViewportIdx
= SIMD16::and_si(vClearMask
, vViewportIdx
);
1434 tessPa
.useAlternateOffset
= false;
1435 pfnClipFunc(pDC
, tessPa
, workerId
, prim_simd16
, GenMask(numPrims
), primID
, vViewportIdx
, vRtIdx
);
1438 // Gather data from the SGV if provided.
1439 simdscalari vViewportIdx
= SIMD::setzero_si();
1440 simdscalari vRtIdx
= SIMD::setzero_si();
1441 SIMD::Vec4 svgAttrib
[4];
1443 if (state
.backendState
.readViewportArrayIndex
|| state
.backendState
.readRenderTargetArrayIndex
)
1445 tessPa
.Assemble(VERTEX_SGV_SLOT
, svgAttrib
);
1448 if (state
.backendState
.readViewportArrayIndex
)
1450 vViewportIdx
= SIMD::castps_si(svgAttrib
[0][VERTEX_SGV_VAI_COMP
]);
1452 // OOB VPAI indices => forced to zero.
1453 vViewportIdx
= SIMD::max_epi32(vViewportIdx
, SIMD::setzero_si());
1454 simdscalari vNumViewports
= SIMD::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS
);
1455 simdscalari vClearMask
= SIMD::cmplt_epi32(vViewportIdx
, vNumViewports
);
1456 vViewportIdx
= SIMD::and_si(vClearMask
, vViewportIdx
);
1457 tessPa
.viewportArrayActive
= true;
1459 if (state
.backendState
.readRenderTargetArrayIndex
)
1461 vRtIdx
= SIMD::castps_si(svgAttrib
[0][VERTEX_SGV_RTAI_COMP
]);
1462 tessPa
.rtArrayActive
= true;
1464 pfnClipFunc(pDC
, tessPa
, workerId
, prim
,
1465 GenMask(tessPa
.NumPrims()), _simd_set1_epi32(dsContext
.PrimitiveID
), vViewportIdx
, vRtIdx
);
1472 } // while (tessPa.HasWork())
1473 } // for (uint32_t p = 0; p < numPrims; ++p)
1475 #if USE_SIMD16_FRONTEND
1476 if (gt_pTessellationThreadData
->pDSOutput
!= nullptr)
1478 AlignedFree(gt_pTessellationThreadData
->pDSOutput
);
1479 gt_pTessellationThreadData
->pDSOutput
= nullptr;
1481 gt_pTessellationThreadData
->dsOutputAllocSize
= 0;
1484 TSDestroyCtx(tsCtx
);
// Per-worker-thread vertex store backing the primitive assembler in
// ProcessDraw; grown on demand when a draw needs more space and reused
// across draws (freed and reallocated, never shrunk).
1487 THREAD
PA_STATE::SIMDVERTEX
*gpVertexStore
= nullptr;
// Current byte capacity of gpVertexStore.
1488 THREAD
uint32_t gVertexStoreSize
= 0;
1490 //////////////////////////////////////////////////////////////////////////
1491 /// @brief FE handler for SwrDraw.
1492 /// @tparam IsIndexedT - Is indexed drawing enabled
1493 /// @tparam HasTessellationT - Is tessellation enabled
1494 /// @tparam HasGeometryShaderT - Is the geometry shader stage enabled
1495 /// @tparam HasStreamOutT - Is stream-out enabled
1496 /// @tparam HasRastT - Is rasterization enabled
1497 /// @param pContext - pointer to SWR context.
1498 /// @param pDC - pointer to draw context.
1499 /// @param workerId - thread's worker id.
1500 /// @param pUserData - Pointer to DRAW_WORK
1502 typename IsIndexedT
,
1503 typename IsCutIndexEnabledT
,
1504 typename HasTessellationT
,
1505 typename HasGeometryShaderT
,
1506 typename HasStreamOutT
,
1509 SWR_CONTEXT
*pContext
,
1515 #if KNOB_ENABLE_TOSS_POINTS
1516 if (KNOB_TOSS_QUEUE_FE
)
1522 RDTSC_BEGIN(FEProcessDraw
, pDC
->drawId
);
1524 DRAW_WORK
& work
= *(DRAW_WORK
*)pUserData
;
1525 const API_STATE
& state
= GetApiState(pDC
);
1527 uint32_t indexSize
= 0;
1528 uint32_t endVertex
= work
.numVerts
;
1530 gfxptr_t xpLastRequestedIndex
= 0;
1531 if (IsIndexedT::value
)
1536 indexSize
= sizeof(uint32_t);
1539 indexSize
= sizeof(uint16_t);
1542 indexSize
= sizeof(uint8_t);
1545 SWR_INVALID("Invalid work.type: %d", work
.type
);
1547 xpLastRequestedIndex
= work
.xpIB
+ endVertex
* indexSize
;
1551 // No cuts, prune partial primitives.
1552 endVertex
= GetNumVerts(state
.topology
, GetNumPrims(state
.topology
, work
.numVerts
));
1555 #if defined(KNOB_ENABLE_RDTSC) || defined(KNOB_ENABLE_AR)
1556 uint32_t numPrims
= GetNumPrims(state
.topology
, work
.numVerts
);
1559 GsBuffers gsBuffers
;
1560 if (HasGeometryShaderT::value
)
1562 #if USE_SIMD16_FRONTEND
1563 AllocateGsBuffers
<SIMD512
, KNOB_SIMD16_WIDTH
>(pDC
, state
, NumVertsPerPrim(state
.topology
, true), &gsBuffers
);
1565 AllocateGsBuffers
<SIMD256
, KNOB_SIMD_WIDTH
>(pDC
, state
, NumVertsPerPrim(state
.topology
, true), &gsBuffers
);
1569 if (HasTessellationT::value
)
1571 SWR_ASSERT(state
.tsState
.tsEnable
== true);
1572 SWR_ASSERT(state
.pfnHsFunc
!= nullptr);
1573 SWR_ASSERT(state
.pfnDsFunc
!= nullptr);
1575 AllocateTessellationData(pContext
);
1579 SWR_ASSERT(state
.tsState
.tsEnable
== false);
1580 SWR_ASSERT(state
.pfnHsFunc
== nullptr);
1581 SWR_ASSERT(state
.pfnDsFunc
== nullptr);
1584 // allocate space for streamout input prim data
1585 uint32_t* pSoPrimData
= nullptr;
1586 if (HasStreamOutT::value
)
1588 pSoPrimData
= (uint32_t*)pDC
->pArena
->AllocAligned(4096, 16);
1591 const uint32_t vertexCount
= NumVertsPerPrim(state
.topology
, true);
1592 #if USE_SIMD16_FRONTEND
1593 uint32_t simdVertexSizeBytes
= state
.frontendState
.vsVertexSize
* sizeof(simd16vector
);
1595 uint32_t simdVertexSizeBytes
= state
.frontendState
.vsVertexSize
* sizeof(simdvector
);
1598 SWR_ASSERT(vertexCount
<= MAX_NUM_VERTS_PER_PRIM
);
1600 // Compute storage requirements for vertex store
1601 // TODO: allocation needs to be rethought for better cut support
1602 uint32_t numVerts
= vertexCount
+ 2; // Need extra space for PA state machine
1603 uint32_t vertexStoreSize
= numVerts
* simdVertexSizeBytes
;
1605 // grow the vertex store for the PA as necessary
1606 if (gVertexStoreSize
< vertexStoreSize
)
1608 if (gpVertexStore
!= nullptr)
1610 AlignedFree(gpVertexStore
);
1611 gpVertexStore
= nullptr;
1614 SWR_ASSERT(gpVertexStore
== nullptr);
1616 gpVertexStore
= reinterpret_cast<PA_STATE::SIMDVERTEX
*>(AlignedMalloc(vertexStoreSize
, 64));
1617 gVertexStoreSize
= vertexStoreSize
;
1619 SWR_ASSERT(gpVertexStore
!= nullptr);
1622 // choose primitive assembler
1624 PA_FACTORY
<IsIndexedT
, IsCutIndexEnabledT
> paFactory(pDC
, state
.topology
, work
.numVerts
, gpVertexStore
, numVerts
, state
.frontendState
.vsVertexSize
, GetNumVerts(state
.topology
, 1));
1625 PA_STATE
& pa
= paFactory
.GetPA();
1627 #if USE_SIMD16_FRONTEND
1628 #if USE_SIMD16_SHADERS
1634 SWR_VS_CONTEXT vsContext_lo
;
1635 SWR_VS_CONTEXT vsContext_hi
;
1637 #if USE_SIMD16_SHADERS
1638 vsContext_lo
.pVin
= reinterpret_cast<simdvertex
*>(&vin
);
1639 vsContext_hi
.pVin
= reinterpret_cast<simdvertex
*>(&vin
);
1641 vsContext_lo
.pVin
= &vin_lo
;
1642 vsContext_hi
.pVin
= &vin_hi
;
1644 vsContext_lo
.AlternateOffset
= 0;
1645 vsContext_hi
.AlternateOffset
= 1;
1647 SWR_FETCH_CONTEXT fetchInfo_lo
= { 0 };
1649 fetchInfo_lo
.pStreams
= &state
.vertexBuffers
[0];
1650 fetchInfo_lo
.StartInstance
= work
.startInstance
;
1651 fetchInfo_lo
.StartVertex
= 0;
1653 if (IsIndexedT::value
)
1655 fetchInfo_lo
.BaseVertex
= work
.baseVertex
;
1657 // if the entire index buffer isn't being consumed, set the last index
1658 // so that fetches < a SIMD wide will be masked off
1659 fetchInfo_lo
.xpLastIndex
= state
.indexBuffer
.xpIndices
+ state
.indexBuffer
.size
;
1660 if (xpLastRequestedIndex
< fetchInfo_lo
.xpLastIndex
)
1662 fetchInfo_lo
.xpLastIndex
= xpLastRequestedIndex
;
1667 fetchInfo_lo
.StartVertex
= work
.startVertex
;
1670 SWR_FETCH_CONTEXT fetchInfo_hi
= fetchInfo_lo
;
1672 const simd16scalari vScale
= _simd16_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
1674 for (uint32_t instanceNum
= 0; instanceNum
< work
.numInstances
; instanceNum
++)
1678 simd16scalari vIndex
;
1680 if (IsIndexedT::value
)
1682 fetchInfo_lo
.xpIndices
= work
.xpIB
;
1683 fetchInfo_hi
.xpIndices
= fetchInfo_lo
.xpIndices
+ KNOB_SIMD_WIDTH
* indexSize
; // 1/2 of KNOB_SIMD16_WIDTH
1687 vIndex
= _simd16_add_epi32(_simd16_set1_epi32(work
.startVertexID
), vScale
);
1689 fetchInfo_lo
.xpIndices
= (gfxptr_t
)&vIndex
;
1690 fetchInfo_hi
.xpIndices
= (gfxptr_t
)&vIndex
+ KNOB_SIMD_WIDTH
* sizeof(int32_t); // 1/2 of KNOB_SIMD16_WIDTH
1693 fetchInfo_lo
.CurInstance
= instanceNum
;
1694 fetchInfo_hi
.CurInstance
= instanceNum
;
1696 vsContext_lo
.InstanceID
= instanceNum
;
1697 vsContext_hi
.InstanceID
= instanceNum
;
1699 while (pa
.HasWork())
1701 // GetNextVsOutput currently has the side effect of updating some PA state machine state.
1702 // So we need to keep this outside of (i < endVertex) check.
1704 simdmask
*pvCutIndices_lo
= nullptr;
1705 simdmask
*pvCutIndices_hi
= nullptr;
1707 if (IsIndexedT::value
)
1709 // simd16mask <=> simdmask[2]
1711 pvCutIndices_lo
= &reinterpret_cast<simdmask
*>(&pa
.GetNextVsIndices())[0];
1712 pvCutIndices_hi
= &reinterpret_cast<simdmask
*>(&pa
.GetNextVsIndices())[1];
1715 simd16vertex
&vout
= pa
.GetNextVsOutput();
1717 vsContext_lo
.pVout
= reinterpret_cast<simdvertex
*>(&vout
);
1718 vsContext_hi
.pVout
= reinterpret_cast<simdvertex
*>(&vout
);
1722 if (!IsIndexedT::value
)
1724 fetchInfo_lo
.xpLastIndex
= fetchInfo_lo
.xpIndices
;
1726 offset
= std::min(endVertex
-i
, (uint32_t) KNOB_SIMD16_WIDTH
);
1727 #if USE_SIMD16_SHADERS
1728 offset
*= 4; // convert from index to address
1729 fetchInfo_lo
.xpLastIndex
+= offset
;
1731 fetchInfo_lo
.xpLastIndex
+= std::min(offset
, (uint32_t) KNOB_SIMD_WIDTH
) * 4; // * 4 for converting index to address
1732 uint32_t offset2
= std::min(offset
, (uint32_t) KNOB_SIMD16_WIDTH
)-KNOB_SIMD_WIDTH
;
1733 assert(offset
>= 0);
1734 fetchInfo_hi
.xpLastIndex
= fetchInfo_hi
.xpIndices
;
1735 fetchInfo_hi
.xpLastIndex
+= offset2
* 4; // * 4 for converting index to address
1738 // 1. Execute FS/VS for a single SIMD.
1739 RDTSC_BEGIN(FEFetchShader
, pDC
->drawId
);
1740 #if USE_SIMD16_SHADERS
1741 state
.pfnFetchFunc(GetPrivateState(pDC
), fetchInfo_lo
, vin
);
1743 state
.pfnFetchFunc(GetPrivateState(pDC
), fetchInfo_lo
, vin_lo
);
1745 if ((i
+ KNOB_SIMD_WIDTH
) < endVertex
) // 1/2 of KNOB_SIMD16_WIDTH
1747 state
.pfnFetchFunc(GetPrivateState(pDC
), fetchInfo_hi
, vin_hi
);
1750 RDTSC_END(FEFetchShader
, 0);
1752 // forward fetch generated vertex IDs to the vertex shader
1753 #if USE_SIMD16_SHADERS
1755 vsContext_lo
.VertexID16
= _simd16_insert_si(
1756 vsContext_lo
.VertexID16
, fetchInfo_lo
.VertexID
, 0);
1757 vsContext_lo
.VertexID16
= _simd16_insert_si(
1758 vsContext_lo
.VertexID16
, fetchInfo_lo
.VertexID2
, 1);
1760 vsContext_lo
.VertexID
= fetchInfo_lo
.VertexID
;
1761 vsContext_hi
.VertexID
= fetchInfo_lo
.VertexID2
;
1764 vsContext_lo
.VertexID
= fetchInfo_lo
.VertexID
;
1765 vsContext_hi
.VertexID
= fetchInfo_hi
.VertexID
;
1768 // Setup active mask for vertex shader.
1770 vsContext_lo
.mask16
= GenerateMask16(endVertex
- i
);
1772 vsContext_lo
.mask
= GenerateMask(endVertex
- i
);
1773 vsContext_hi
.mask
= GenerateMask(endVertex
- (i
+ KNOB_SIMD_WIDTH
));
1776 // forward cut mask to the PA
1777 if (IsIndexedT::value
)
1779 #if USE_SIMD16_SHADERS
1780 *pvCutIndices_lo
= _simd_movemask_ps(_simd_castsi_ps(fetchInfo_lo
.CutMask
));
1781 *pvCutIndices_hi
= _simd_movemask_ps(_simd_castsi_ps(fetchInfo_lo
.CutMask2
));
1783 *pvCutIndices_lo
= _simd_movemask_ps(_simd_castsi_ps(fetchInfo_lo
.CutMask
));
1784 *pvCutIndices_hi
= _simd_movemask_ps(_simd_castsi_ps(fetchInfo_hi
.CutMask
));
1788 UPDATE_STAT_FE(IaVertices
, GetNumInvocations(i
, endVertex
));
1790 #if KNOB_ENABLE_TOSS_POINTS
1791 if (!KNOB_TOSS_FETCH
)
1794 RDTSC_BEGIN(FEVertexShader
, pDC
->drawId
);
1796 state
.pfnVertexFunc(GetPrivateState(pDC
), &vsContext_lo
);
1797 AR_EVENT(VSStats(vsContext_lo
.stats
.numInstExecuted
));
1799 state
.pfnVertexFunc(GetPrivateState(pDC
), &vsContext_lo
);
1800 AR_EVENT(VSStats(vsContext_lo
.stats
.numInstExecuted
));
1802 if ((i
+ KNOB_SIMD_WIDTH
) < endVertex
) // 1/2 of KNOB_SIMD16_WIDTH
1804 state
.pfnVertexFunc(GetPrivateState(pDC
), &vsContext_hi
);
1805 AR_EVENT(VSStats(vsContext_hi
.stats
.numInstExecuted
));
1808 RDTSC_END(FEVertexShader
, 0);
1810 UPDATE_STAT_FE(VsInvocations
, GetNumInvocations(i
, endVertex
));
1814 // 2. Assemble primitives given the last two SIMD.
1817 simd16vector prim_simd16
[MAX_NUM_VERTS_PER_PRIM
];
1819 RDTSC_START(FEPAAssemble
);
1820 bool assemble
= pa
.Assemble(VERTEX_POSITION_SLOT
, prim_simd16
);
1821 RDTSC_STOP(FEPAAssemble
, 1, 0);
1823 #if KNOB_ENABLE_TOSS_POINTS
1824 if (!KNOB_TOSS_FETCH
)
1827 #if KNOB_ENABLE_TOSS_POINTS
1833 UPDATE_STAT_FE(IaPrimitives
, pa
.NumPrims());
1835 const uint32_t numPrims
= pa
.NumPrims();
1836 const uint32_t numPrims_lo
= std::min
<uint32_t>(numPrims
, KNOB_SIMD_WIDTH
);
1837 const uint32_t numPrims_hi
= std::max
<uint32_t>(numPrims
, KNOB_SIMD_WIDTH
) - KNOB_SIMD_WIDTH
;
1839 const simd16scalari primID
= pa
.GetPrimID(work
.startPrimID
);
1840 const simdscalari primID_lo
= _simd16_extract_si(primID
, 0);
1841 const simdscalari primID_hi
= _simd16_extract_si(primID
, 1);
1843 if (HasTessellationT::value
)
1845 pa
.useAlternateOffset
= false;
1846 TessellationStages
<HasGeometryShaderT
, HasStreamOutT
, HasRastT
>(pDC
, workerId
, pa
, &gsBuffers
, pSoPrimData
, numPrims_lo
, primID_lo
);
1850 pa
.useAlternateOffset
= true;
1851 TessellationStages
<HasGeometryShaderT
, HasStreamOutT
, HasRastT
>(pDC
, workerId
, pa
, &gsBuffers
, pSoPrimData
, numPrims_hi
, primID_hi
);
1854 else if (HasGeometryShaderT::value
)
1856 pa
.useAlternateOffset
= false;
1857 GeometryShaderStage
<HasStreamOutT
, HasRastT
>(pDC
, workerId
, pa
, &gsBuffers
, pSoPrimData
, numPrims_lo
, primID_lo
);
1861 pa
.useAlternateOffset
= true;
1862 GeometryShaderStage
<HasStreamOutT
, HasRastT
>(pDC
, workerId
, pa
, &gsBuffers
, pSoPrimData
, numPrims_hi
, primID_hi
);
1867 // If streamout is enabled then stream vertices out to memory.
1868 if (HasStreamOutT::value
)
1870 pa
.useAlternateOffset
= false;
1871 StreamOut(pDC
, pa
, workerId
, pSoPrimData
, 0);
1874 if (HasRastT::value
)
1876 SWR_ASSERT(pDC
->pState
->pfnProcessPrims_simd16
);
1877 // Gather data from the SGV if provided.
1878 simd16scalari vpai
= SIMD16::setzero_si();
1879 simd16scalari rtai
= SIMD16::setzero_si();
1880 SIMD16::Vec4 svgAttrib
[4];
1882 if (state
.backendState
.readViewportArrayIndex
|| state
.backendState
.readRenderTargetArrayIndex
)
1884 pa
.Assemble(VERTEX_SGV_SLOT
, svgAttrib
);
1888 if (state
.backendState
.readViewportArrayIndex
)
1890 vpai
= SIMD16::castps_si(svgAttrib
[0][VERTEX_SGV_VAI_COMP
]);
1891 pa
.viewportArrayActive
= true;
1893 if (state
.backendState
.readRenderTargetArrayIndex
)
1895 rtai
= SIMD16::castps_si(svgAttrib
[0][VERTEX_SGV_RTAI_COMP
]);
1896 pa
.rtArrayActive
= true;
1900 // OOB VPAI indices => forced to zero.
1901 vpai
= SIMD16::max_epi32(vpai
, SIMD16::setzero_si());
1902 simd16scalari vNumViewports
= SIMD16::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS
);
1903 simd16scalari vClearMask
= SIMD16::cmplt_epi32(vpai
, vNumViewports
);
1904 vpai
= SIMD16::and_si(vClearMask
, vpai
);
1906 pa
.useAlternateOffset
= false;
1907 pDC
->pState
->pfnProcessPrims_simd16(pDC
, pa
, workerId
, prim_simd16
, GenMask(numPrims
), primID
, vpai
, rtai
);
1914 } while (pa
.NextPrim());
1916 if (IsIndexedT::value
)
1918 fetchInfo_lo
.xpIndices
= fetchInfo_lo
.xpIndices
+ KNOB_SIMD16_WIDTH
* indexSize
;
1919 fetchInfo_hi
.xpIndices
= fetchInfo_hi
.xpIndices
+ KNOB_SIMD16_WIDTH
* indexSize
;
1923 vIndex
= _simd16_add_epi32(vIndex
, _simd16_set1_epi32(KNOB_SIMD16_WIDTH
));
1926 i
+= KNOB_SIMD16_WIDTH
;
1933 SWR_VS_CONTEXT vsContext
;
1934 SWR_FETCH_CONTEXT fetchInfo
= { 0 };
1936 fetchInfo
.pStreams
= &state
.vertexBuffers
[0];
1937 fetchInfo
.StartInstance
= work
.startInstance
;
1938 fetchInfo
.StartVertex
= 0;
1940 if (IsIndexedT::value
)
1942 fetchInfo
.BaseVertex
= work
.baseVertex
;
1944 // if the entire index buffer isn't being consumed, set the last index
1945 // so that fetches < a SIMD wide will be masked off
1946 fetchInfo
.pLastIndex
= (const int32_t*)(((uint8_t*)state
.indexBuffer
.pIndices
) + state
.indexBuffer
.size
);
1947 if (xpLastRequestedIndex
< fetchInfo
.pLastIndex
)
1949 fetchInfo
.pLastIndex
= xpLastRequestedIndex
;
1954 fetchInfo
.StartVertex
= work
.startVertex
;
1957 const simdscalari vScale
= _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
1959 /// @todo: temporarily move instance loop in the FE to ensure SO ordering
1960 for (uint32_t instanceNum
= 0; instanceNum
< work
.numInstances
; instanceNum
++)
1965 if (IsIndexedT::value
)
1967 fetchInfo
.pIndices
= work
.pIB
;
1971 vIndex
= _simd_add_epi32(_simd_set1_epi32(work
.startVertexID
), vScale
);
1972 fetchInfo
.pIndices
= (const int32_t*)&vIndex
;
1975 fetchInfo
.CurInstance
= instanceNum
;
1976 vsContext
.InstanceID
= instanceNum
;
1978 while (pa
.HasWork())
1980 // GetNextVsOutput currently has the side effect of updating some PA state machine state.
1981 // So we need to keep this outside of (i < endVertex) check.
1982 simdmask
* pvCutIndices
= nullptr;
1983 if (IsIndexedT::value
)
1985 pvCutIndices
= &pa
.GetNextVsIndices();
1988 simdvertex
& vout
= pa
.GetNextVsOutput();
1989 vsContext
.pVin
= &vout
;
1990 vsContext
.pVout
= &vout
;
1995 // 1. Execute FS/VS for a single SIMD.
1996 RDTSC_BEGIN(FEFetchShader
, pDC
->drawId
);
1997 state
.pfnFetchFunc(GetPrivateState(pDC
), fetchInfo
, vout
);
1998 RDTSC_END(FEFetchShader
, 0);
2000 // forward fetch generated vertex IDs to the vertex shader
2001 vsContext
.VertexID
= fetchInfo
.VertexID
;
2003 // Setup active mask for vertex shader.
2004 vsContext
.mask
= GenerateMask(endVertex
- i
);
2006 // forward cut mask to the PA
2007 if (IsIndexedT::value
)
2009 *pvCutIndices
= _simd_movemask_ps(_simd_castsi_ps(fetchInfo
.CutMask
));
2012 UPDATE_STAT_FE(IaVertices
, GetNumInvocations(i
, endVertex
));
2014 #if KNOB_ENABLE_TOSS_POINTS
2015 if (!KNOB_TOSS_FETCH
)
2018 RDTSC_BEGIN(FEVertexShader
, pDC
->drawId
);
2019 state
.pfnVertexFunc(GetPrivateState(pDC
), &vsContext
);
2020 RDTSC_END(FEVertexShader
, 0);
2022 UPDATE_STAT_FE(VsInvocations
, GetNumInvocations(i
, endVertex
));
2023 AR_EVENT(VSStats(vsContext
.stats
.numInstExecuted
));
2027 // 2. Assemble primitives given the last two SIMD.
2030 simdvector prim
[MAX_NUM_VERTS_PER_PRIM
];
2031 // PaAssemble returns false if there is not enough verts to assemble.
2032 RDTSC_BEGIN(FEPAAssemble
, pDC
->drawId
);
2033 bool assemble
= pa
.Assemble(VERTEX_POSITION_SLOT
, prim
);
2034 RDTSC_END(FEPAAssemble
, 1);
2036 #if KNOB_ENABLE_TOSS_POINTS
2037 if (!KNOB_TOSS_FETCH
)
2040 #if KNOB_ENABLE_TOSS_POINTS
2046 UPDATE_STAT_FE(IaPrimitives
, pa
.NumPrims());
2048 if (HasTessellationT::value
)
2050 TessellationStages
<HasGeometryShaderT
, HasStreamOutT
, HasRastT
>(
2051 pDC
, workerId
, pa
, &gsBuffers
, pSoPrimData
, pa
.GetPrimID(work
.startPrimID
));
2053 else if (HasGeometryShaderT::value
)
2055 GeometryShaderStage
<HasStreamOutT
, HasRastT
>(
2056 pDC
, workerId
, pa
, &gsBuffers
, pSoPrimData
, pa
.GetPrimID(work
.startPrimID
));
2060 // If streamout is enabled then stream vertices out to memory.
2061 if (HasStreamOutT::value
)
2063 StreamOut(pDC
, pa
, workerId
, pSoPrimData
, 0);
2066 if (HasRastT::value
)
2068 SWR_ASSERT(pDC
->pState
->pfnProcessPrims
);
2070 // Gather data from the SGV if provided.
2071 simdscalari vViewportIdx
= SIMD::setzero_si();
2072 simdscalari vRtIdx
= SIMD::setzero_si();
2073 SIMD::Vec4 svgAttrib
[4];
2075 if (state
.backendState
.readViewportArrayIndex
|| state
.backendState
.readRenderTargetArrayIndex
)
2077 pa
.Assemble(VERTEX_SGV_SLOT
, svgAttrib
);
2080 if (state
.backendState
.readViewportArrayIndex
)
2082 vViewportIdx
= SIMD::castps_si(svgAttrib
[0][VERTEX_SGV_VAI_COMP
]);
2084 // OOB VPAI indices => forced to zero.
2085 vViewportIdx
= SIMD::max_epi32(vViewportIdx
, SIMD::setzero_si());
2086 simdscalari vNumViewports
= SIMD::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS
);
2087 simdscalari vClearMask
= SIMD::cmplt_epi32(vViewportIdx
, vNumViewports
);
2088 vViewportIdx
= SIMD::and_si(vClearMask
, vViewportIdx
);
2089 pa
.viewportArrayActive
= true;
2091 if (state
.backendState
.readRenderTargetArrayIndex
)
2093 vRtIdx
= SIMD::castps_si(svgAttrib
[0][VERTEX_SGV_RTAI_COMP
]);
2094 pa
.rtArrayActive
= true;
2097 pDC
->pState
->pfnProcessPrims(pDC
, pa
, workerId
, prim
,
2098 GenMask(pa
.NumPrims()), pa
.GetPrimID(work
.startPrimID
), vViewportIdx
, vRtIdx
);
2104 } while (pa
.NextPrim());
2106 if (IsIndexedT::value
)
2108 fetchInfo
.pIndices
= (int*)((uint8_t*)fetchInfo
.pIndices
+ KNOB_SIMD_WIDTH
* indexSize
);
2112 vIndex
= _simd_add_epi32(vIndex
, _simd_set1_epi32(KNOB_SIMD_WIDTH
));
2115 i
+= KNOB_SIMD_WIDTH
;
2122 RDTSC_END(FEProcessDraw
, numPrims
* work
.numInstances
);
2125 struct FEDrawChooser
2127 typedef PFN_FE_WORK_FUNC FuncType
;
2129 template <typename
... ArgsB
>
2130 static FuncType
GetFunc()
2132 return ProcessDraw
<ArgsB
...>;
2137 // Selector for correct templated Draw front-end function
2138 PFN_FE_WORK_FUNC
GetProcessDrawFunc(
2140 bool IsCutIndexEnabled
,
2141 bool HasTessellation
,
2142 bool HasGeometryShader
,
2144 bool HasRasterization
)
2146 return TemplateArgUnroller
<FEDrawChooser
>::GetFunc(IsIndexed
, IsCutIndexEnabled
, HasTessellation
, HasGeometryShader
, HasStreamOut
, HasRasterization
);