/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @brief Implementation for Frontend which handles vertex processing,
*        primitive assembly, clipping, binning, etc.
*
******************************************************************************/
34 #include "rdtsc_core.h"
40 #include "tessellator.h"
43 //////////////////////////////////////////////////////////////////////////
44 /// @brief Helper macro to generate a bitmask
45 static INLINE
uint32_t GenMask(uint32_t numBits
)
47 SWR_ASSERT(numBits
<= (sizeof(uint32_t) * 8), "Too many bits (%d) for %s", numBits
, __FUNCTION__
);
48 return ((1U << numBits
) - 1);
51 //////////////////////////////////////////////////////////////////////////
52 /// @brief FE handler for SwrSync.
53 /// @param pContext - pointer to SWR context.
54 /// @param pDC - pointer to draw context.
55 /// @param workerId - thread's worker id. Even thread has a unique id.
56 /// @param pUserData - Pointer to user data passed back to sync callback.
57 /// @todo This should go away when we switch this to use compute threading.
59 SWR_CONTEXT
*pContext
,
66 work
.pfnWork
= ProcessSyncBE
;
68 MacroTileMgr
*pTileMgr
= pDC
->pTileMgr
;
69 pTileMgr
->enqueue(0, 0, &work
);
72 //////////////////////////////////////////////////////////////////////////
73 /// @brief FE handler for SwrDestroyContext.
74 /// @param pContext - pointer to SWR context.
75 /// @param pDC - pointer to draw context.
76 /// @param workerId - thread's worker id. Even thread has a unique id.
77 /// @param pUserData - Pointer to user data passed back to sync callback.
79 SWR_CONTEXT
*pContext
,
86 work
.pfnWork
= ProcessShutdownBE
;
88 MacroTileMgr
*pTileMgr
= pDC
->pTileMgr
;
89 // Enqueue at least 1 work item for each worker thread
90 // account for number of numa nodes
91 uint32_t numNumaNodes
= pContext
->threadPool
.numaMask
+ 1;
93 for (uint32_t i
= 0; i
< pContext
->threadPool
.numThreads
; ++i
)
95 for (uint32_t n
= 0; n
< numNumaNodes
; ++n
)
97 pTileMgr
->enqueue(i
, n
, &work
);
102 //////////////////////////////////////////////////////////////////////////
103 /// @brief FE handler for SwrClearRenderTarget.
104 /// @param pContext - pointer to SWR context.
105 /// @param pDC - pointer to draw context.
106 /// @param workerId - thread's worker id. Even thread has a unique id.
107 /// @param pUserData - Pointer to user data passed back to clear callback.
108 /// @todo This should go away when we switch this to use compute threading.
110 SWR_CONTEXT
*pContext
,
115 CLEAR_DESC
*pDesc
= (CLEAR_DESC
*)pUserData
;
116 MacroTileMgr
*pTileMgr
= pDC
->pTileMgr
;
118 // queue a clear to each macro tile
119 // compute macro tile bounds for the specified rect
120 uint32_t macroTileXMin
= pDesc
->rect
.xmin
/ KNOB_MACROTILE_X_DIM
;
121 uint32_t macroTileXMax
= (pDesc
->rect
.xmax
- 1) / KNOB_MACROTILE_X_DIM
;
122 uint32_t macroTileYMin
= pDesc
->rect
.ymin
/ KNOB_MACROTILE_Y_DIM
;
123 uint32_t macroTileYMax
= (pDesc
->rect
.ymax
- 1) / KNOB_MACROTILE_Y_DIM
;
127 work
.pfnWork
= ProcessClearBE
;
128 work
.desc
.clear
= *pDesc
;
130 for (uint32_t y
= macroTileYMin
; y
<= macroTileYMax
; ++y
)
132 for (uint32_t x
= macroTileXMin
; x
<= macroTileXMax
; ++x
)
134 pTileMgr
->enqueue(x
, y
, &work
);
139 //////////////////////////////////////////////////////////////////////////
140 /// @brief FE handler for SwrStoreTiles.
141 /// @param pContext - pointer to SWR context.
142 /// @param pDC - pointer to draw context.
143 /// @param workerId - thread's worker id. Even thread has a unique id.
144 /// @param pUserData - Pointer to user data passed back to callback.
145 /// @todo This should go away when we switch this to use compute threading.
146 void ProcessStoreTiles(
147 SWR_CONTEXT
*pContext
,
152 AR_BEGIN(FEProcessStoreTiles
, pDC
->drawId
);
153 MacroTileMgr
*pTileMgr
= pDC
->pTileMgr
;
154 STORE_TILES_DESC
* pDesc
= (STORE_TILES_DESC
*)pUserData
;
156 // queue a store to each macro tile
157 // compute macro tile bounds for the specified rect
158 uint32_t macroTileXMin
= pDesc
->rect
.xmin
/ KNOB_MACROTILE_X_DIM
;
159 uint32_t macroTileXMax
= (pDesc
->rect
.xmax
- 1) / KNOB_MACROTILE_X_DIM
;
160 uint32_t macroTileYMin
= pDesc
->rect
.ymin
/ KNOB_MACROTILE_Y_DIM
;
161 uint32_t macroTileYMax
= (pDesc
->rect
.ymax
- 1) / KNOB_MACROTILE_Y_DIM
;
165 work
.type
= STORETILES
;
166 work
.pfnWork
= ProcessStoreTilesBE
;
167 work
.desc
.storeTiles
= *pDesc
;
169 for (uint32_t y
= macroTileYMin
; y
<= macroTileYMax
; ++y
)
171 for (uint32_t x
= macroTileXMin
; x
<= macroTileXMax
; ++x
)
173 pTileMgr
->enqueue(x
, y
, &work
);
177 AR_END(FEProcessStoreTiles
, 0);
180 //////////////////////////////////////////////////////////////////////////
181 /// @brief FE handler for SwrInvalidateTiles.
182 /// @param pContext - pointer to SWR context.
183 /// @param pDC - pointer to draw context.
184 /// @param workerId - thread's worker id. Even thread has a unique id.
185 /// @param pUserData - Pointer to user data passed back to callback.
186 /// @todo This should go away when we switch this to use compute threading.
187 void ProcessDiscardInvalidateTiles(
188 SWR_CONTEXT
*pContext
,
193 AR_BEGIN(FEProcessInvalidateTiles
, pDC
->drawId
);
194 DISCARD_INVALIDATE_TILES_DESC
*pDesc
= (DISCARD_INVALIDATE_TILES_DESC
*)pUserData
;
195 MacroTileMgr
*pTileMgr
= pDC
->pTileMgr
;
197 // compute macro tile bounds for the specified rect
198 uint32_t macroTileXMin
= (pDesc
->rect
.xmin
+ KNOB_MACROTILE_X_DIM
- 1) / KNOB_MACROTILE_X_DIM
;
199 uint32_t macroTileXMax
= (pDesc
->rect
.xmax
/ KNOB_MACROTILE_X_DIM
) - 1;
200 uint32_t macroTileYMin
= (pDesc
->rect
.ymin
+ KNOB_MACROTILE_Y_DIM
- 1) / KNOB_MACROTILE_Y_DIM
;
201 uint32_t macroTileYMax
= (pDesc
->rect
.ymax
/ KNOB_MACROTILE_Y_DIM
) - 1;
203 if (pDesc
->fullTilesOnly
== false)
205 // include partial tiles
206 macroTileXMin
= pDesc
->rect
.xmin
/ KNOB_MACROTILE_X_DIM
;
207 macroTileXMax
= (pDesc
->rect
.xmax
- 1) / KNOB_MACROTILE_X_DIM
;
208 macroTileYMin
= pDesc
->rect
.ymin
/ KNOB_MACROTILE_Y_DIM
;
209 macroTileYMax
= (pDesc
->rect
.ymax
- 1) / KNOB_MACROTILE_Y_DIM
;
212 SWR_ASSERT(macroTileXMax
<= KNOB_NUM_HOT_TILES_X
);
213 SWR_ASSERT(macroTileYMax
<= KNOB_NUM_HOT_TILES_Y
);
215 macroTileXMax
= std::min
<int32_t>(macroTileXMax
, KNOB_NUM_HOT_TILES_X
);
216 macroTileYMax
= std::min
<int32_t>(macroTileYMax
, KNOB_NUM_HOT_TILES_Y
);
220 work
.type
= DISCARDINVALIDATETILES
;
221 work
.pfnWork
= ProcessDiscardInvalidateTilesBE
;
222 work
.desc
.discardInvalidateTiles
= *pDesc
;
224 for (uint32_t x
= macroTileXMin
; x
<= macroTileXMax
; ++x
)
226 for (uint32_t y
= macroTileYMin
; y
<= macroTileYMax
; ++y
)
228 pTileMgr
->enqueue(x
, y
, &work
);
232 AR_END(FEProcessInvalidateTiles
, 0);
235 //////////////////////////////////////////////////////////////////////////
236 /// @brief Computes the number of primitives given the number of verts.
237 /// @param mode - primitive topology for draw operation.
238 /// @param numPrims - number of vertices or indices for draw.
239 /// @todo Frontend needs to be refactored. This will go in appropriate place then.
240 uint32_t GetNumPrims(
241 PRIMITIVE_TOPOLOGY mode
,
246 case TOP_POINT_LIST
: return numPrims
;
247 case TOP_TRIANGLE_LIST
: return numPrims
/ 3;
248 case TOP_TRIANGLE_STRIP
: return numPrims
< 3 ? 0 : numPrims
- 2;
249 case TOP_TRIANGLE_FAN
: return numPrims
< 3 ? 0 : numPrims
- 2;
250 case TOP_TRIANGLE_DISC
: return numPrims
< 2 ? 0 : numPrims
- 1;
251 case TOP_QUAD_LIST
: return numPrims
/ 4;
252 case TOP_QUAD_STRIP
: return numPrims
< 4 ? 0 : (numPrims
- 2) / 2;
253 case TOP_LINE_STRIP
: return numPrims
< 2 ? 0 : numPrims
- 1;
254 case TOP_LINE_LIST
: return numPrims
/ 2;
255 case TOP_LINE_LOOP
: return numPrims
;
256 case TOP_RECT_LIST
: return numPrims
/ 3;
257 case TOP_LINE_LIST_ADJ
: return numPrims
/ 4;
258 case TOP_LISTSTRIP_ADJ
: return numPrims
< 3 ? 0 : numPrims
- 3;
259 case TOP_TRI_LIST_ADJ
: return numPrims
/ 6;
260 case TOP_TRI_STRIP_ADJ
: return numPrims
< 4 ? 0 : (numPrims
/ 2) - 2;
262 case TOP_PATCHLIST_1
:
263 case TOP_PATCHLIST_2
:
264 case TOP_PATCHLIST_3
:
265 case TOP_PATCHLIST_4
:
266 case TOP_PATCHLIST_5
:
267 case TOP_PATCHLIST_6
:
268 case TOP_PATCHLIST_7
:
269 case TOP_PATCHLIST_8
:
270 case TOP_PATCHLIST_9
:
271 case TOP_PATCHLIST_10
:
272 case TOP_PATCHLIST_11
:
273 case TOP_PATCHLIST_12
:
274 case TOP_PATCHLIST_13
:
275 case TOP_PATCHLIST_14
:
276 case TOP_PATCHLIST_15
:
277 case TOP_PATCHLIST_16
:
278 case TOP_PATCHLIST_17
:
279 case TOP_PATCHLIST_18
:
280 case TOP_PATCHLIST_19
:
281 case TOP_PATCHLIST_20
:
282 case TOP_PATCHLIST_21
:
283 case TOP_PATCHLIST_22
:
284 case TOP_PATCHLIST_23
:
285 case TOP_PATCHLIST_24
:
286 case TOP_PATCHLIST_25
:
287 case TOP_PATCHLIST_26
:
288 case TOP_PATCHLIST_27
:
289 case TOP_PATCHLIST_28
:
290 case TOP_PATCHLIST_29
:
291 case TOP_PATCHLIST_30
:
292 case TOP_PATCHLIST_31
:
293 case TOP_PATCHLIST_32
:
294 return numPrims
/ (mode
- TOP_PATCHLIST_BASE
);
297 case TOP_POINT_LIST_BF
:
298 case TOP_LINE_STRIP_CONT
:
299 case TOP_LINE_STRIP_BF
:
300 case TOP_LINE_STRIP_CONT_BF
:
301 case TOP_TRIANGLE_FAN_NOSTIPPLE
:
302 case TOP_TRI_STRIP_REVERSE
:
303 case TOP_PATCHLIST_BASE
:
305 SWR_INVALID("Unsupported topology: %d", mode
);
312 //////////////////////////////////////////////////////////////////////////
313 /// @brief Computes the number of verts given the number of primitives.
314 /// @param mode - primitive topology for draw operation.
315 /// @param numPrims - number of primitives for draw.
316 uint32_t GetNumVerts(
317 PRIMITIVE_TOPOLOGY mode
,
322 case TOP_POINT_LIST
: return numPrims
;
323 case TOP_TRIANGLE_LIST
: return numPrims
* 3;
324 case TOP_TRIANGLE_STRIP
: return numPrims
? numPrims
+ 2 : 0;
325 case TOP_TRIANGLE_FAN
: return numPrims
? numPrims
+ 2 : 0;
326 case TOP_TRIANGLE_DISC
: return numPrims
? numPrims
+ 1 : 0;
327 case TOP_QUAD_LIST
: return numPrims
* 4;
328 case TOP_QUAD_STRIP
: return numPrims
? numPrims
* 2 + 2 : 0;
329 case TOP_LINE_STRIP
: return numPrims
? numPrims
+ 1 : 0;
330 case TOP_LINE_LIST
: return numPrims
* 2;
331 case TOP_LINE_LOOP
: return numPrims
;
332 case TOP_RECT_LIST
: return numPrims
* 3;
333 case TOP_LINE_LIST_ADJ
: return numPrims
* 4;
334 case TOP_LISTSTRIP_ADJ
: return numPrims
? numPrims
+ 3 : 0;
335 case TOP_TRI_LIST_ADJ
: return numPrims
* 6;
336 case TOP_TRI_STRIP_ADJ
: return numPrims
? (numPrims
+ 2) * 2 : 0;
338 case TOP_PATCHLIST_1
:
339 case TOP_PATCHLIST_2
:
340 case TOP_PATCHLIST_3
:
341 case TOP_PATCHLIST_4
:
342 case TOP_PATCHLIST_5
:
343 case TOP_PATCHLIST_6
:
344 case TOP_PATCHLIST_7
:
345 case TOP_PATCHLIST_8
:
346 case TOP_PATCHLIST_9
:
347 case TOP_PATCHLIST_10
:
348 case TOP_PATCHLIST_11
:
349 case TOP_PATCHLIST_12
:
350 case TOP_PATCHLIST_13
:
351 case TOP_PATCHLIST_14
:
352 case TOP_PATCHLIST_15
:
353 case TOP_PATCHLIST_16
:
354 case TOP_PATCHLIST_17
:
355 case TOP_PATCHLIST_18
:
356 case TOP_PATCHLIST_19
:
357 case TOP_PATCHLIST_20
:
358 case TOP_PATCHLIST_21
:
359 case TOP_PATCHLIST_22
:
360 case TOP_PATCHLIST_23
:
361 case TOP_PATCHLIST_24
:
362 case TOP_PATCHLIST_25
:
363 case TOP_PATCHLIST_26
:
364 case TOP_PATCHLIST_27
:
365 case TOP_PATCHLIST_28
:
366 case TOP_PATCHLIST_29
:
367 case TOP_PATCHLIST_30
:
368 case TOP_PATCHLIST_31
:
369 case TOP_PATCHLIST_32
:
370 return numPrims
* (mode
- TOP_PATCHLIST_BASE
);
373 case TOP_POINT_LIST_BF
:
374 case TOP_LINE_STRIP_CONT
:
375 case TOP_LINE_STRIP_BF
:
376 case TOP_LINE_STRIP_CONT_BF
:
377 case TOP_TRIANGLE_FAN_NOSTIPPLE
:
378 case TOP_TRI_STRIP_REVERSE
:
379 case TOP_PATCHLIST_BASE
:
381 SWR_INVALID("Unsupported topology: %d", mode
);
388 //////////////////////////////////////////////////////////////////////////
389 /// @brief Return number of verts per primitive.
390 /// @param topology - topology
391 /// @param includeAdjVerts - include adjacent verts in primitive vertices
392 INLINE
uint32_t NumVertsPerPrim(PRIMITIVE_TOPOLOGY topology
, bool includeAdjVerts
)
394 uint32_t numVerts
= 0;
398 case TOP_POINT_LIST_BF
:
403 case TOP_LINE_LIST_ADJ
:
405 case TOP_LINE_STRIP_CONT
:
406 case TOP_LINE_STRIP_BF
:
407 case TOP_LISTSTRIP_ADJ
:
410 case TOP_TRIANGLE_LIST
:
411 case TOP_TRIANGLE_STRIP
:
412 case TOP_TRIANGLE_FAN
:
413 case TOP_TRI_LIST_ADJ
:
414 case TOP_TRI_STRIP_ADJ
:
415 case TOP_TRI_STRIP_REVERSE
:
423 case TOP_PATCHLIST_1
:
424 case TOP_PATCHLIST_2
:
425 case TOP_PATCHLIST_3
:
426 case TOP_PATCHLIST_4
:
427 case TOP_PATCHLIST_5
:
428 case TOP_PATCHLIST_6
:
429 case TOP_PATCHLIST_7
:
430 case TOP_PATCHLIST_8
:
431 case TOP_PATCHLIST_9
:
432 case TOP_PATCHLIST_10
:
433 case TOP_PATCHLIST_11
:
434 case TOP_PATCHLIST_12
:
435 case TOP_PATCHLIST_13
:
436 case TOP_PATCHLIST_14
:
437 case TOP_PATCHLIST_15
:
438 case TOP_PATCHLIST_16
:
439 case TOP_PATCHLIST_17
:
440 case TOP_PATCHLIST_18
:
441 case TOP_PATCHLIST_19
:
442 case TOP_PATCHLIST_20
:
443 case TOP_PATCHLIST_21
:
444 case TOP_PATCHLIST_22
:
445 case TOP_PATCHLIST_23
:
446 case TOP_PATCHLIST_24
:
447 case TOP_PATCHLIST_25
:
448 case TOP_PATCHLIST_26
:
449 case TOP_PATCHLIST_27
:
450 case TOP_PATCHLIST_28
:
451 case TOP_PATCHLIST_29
:
452 case TOP_PATCHLIST_30
:
453 case TOP_PATCHLIST_31
:
454 case TOP_PATCHLIST_32
:
455 numVerts
= topology
- TOP_PATCHLIST_BASE
;
458 SWR_INVALID("Unsupported topology: %d", topology
);
466 case TOP_LISTSTRIP_ADJ
:
467 case TOP_LINE_LIST_ADJ
: numVerts
= 4; break;
468 case TOP_TRI_STRIP_ADJ
:
469 case TOP_TRI_LIST_ADJ
: numVerts
= 6; break;
477 //////////////////////////////////////////////////////////////////////////
478 /// @brief Generate mask from remaining work.
479 /// @param numWorkItems - Number of items being worked on by a SIMD.
480 static INLINE simdscalari
GenerateMask(uint32_t numItemsRemaining
)
482 uint32_t numActive
= (numItemsRemaining
>= KNOB_SIMD_WIDTH
) ? KNOB_SIMD_WIDTH
: numItemsRemaining
;
483 uint32_t mask
= (numActive
> 0) ? ((1 << numActive
) - 1) : 0;
484 return _simd_castps_si(vMask(mask
));
487 //////////////////////////////////////////////////////////////////////////
488 /// @brief StreamOut - Streams vertex data out to SO buffers.
489 /// Generally, we are only streaming out a SIMDs worth of triangles.
490 /// @param pDC - pointer to draw context.
491 /// @param workerId - thread's worker id. Even thread has a unique id.
492 /// @param numPrims - Number of prims to streamout (e.g. points, lines, tris)
493 static void StreamOut(
498 #if USE_SIMD16_FRONTEND
499 uint32_t numPrims_simd8
,
501 uint32_t streamIndex
)
503 SWR_CONTEXT
*pContext
= pDC
->pContext
;
505 AR_BEGIN(FEStreamout
, pDC
->drawId
);
507 const API_STATE
& state
= GetApiState(pDC
);
508 const SWR_STREAMOUT_STATE
&soState
= state
.soState
;
510 uint32_t soVertsPerPrim
= NumVertsPerPrim(pa
.binTopology
, false);
512 // The pPrimData buffer is sparse in that we allocate memory for all 32 attributes for each vertex.
513 uint32_t primDataDwordVertexStride
= (SWR_VTX_NUM_SLOTS
* sizeof(float) * 4) / sizeof(uint32_t);
515 SWR_STREAMOUT_CONTEXT soContext
= { 0 };
517 // Setup buffer state pointers.
518 for (uint32_t i
= 0; i
< 4; ++i
)
520 soContext
.pBuffer
[i
] = &state
.soBuffer
[i
];
523 #if USE_SIMD16_FRONTEND
524 uint32_t numPrims
= numPrims_simd8
;
526 uint32_t numPrims
= pa
.NumPrims();
529 for (uint32_t primIndex
= 0; primIndex
< numPrims
; ++primIndex
)
532 uint32_t soMask
= soState
.streamMasks
[streamIndex
];
534 // Write all entries into primitive data buffer for SOS.
535 while (_BitScanForward(&slot
, soMask
))
537 __m128 attrib
[MAX_NUM_VERTS_PER_PRIM
]; // prim attribs (always 4 wide)
538 uint32_t paSlot
= slot
+ VERTEX_ATTRIB_START_SLOT
;
539 pa
.AssembleSingle(paSlot
, primIndex
, attrib
);
541 // Attribute offset is relative offset from start of vertex.
542 // Note that attributes start at slot 1 in the PA buffer. We need to write this
543 // to prim data starting at slot 0. Which is why we do (slot - 1).
544 // Also note: GL works slightly differently, and needs slot 0
545 uint32_t primDataAttribOffset
= slot
* sizeof(float) * 4 / sizeof(uint32_t);
547 // Store each vertex's attrib at appropriate locations in pPrimData buffer.
548 for (uint32_t v
= 0; v
< soVertsPerPrim
; ++v
)
550 uint32_t* pPrimDataAttrib
= pPrimData
+ primDataAttribOffset
+ (v
* primDataDwordVertexStride
);
552 _mm_store_ps((float*)pPrimDataAttrib
, attrib
[v
]);
555 soMask
&= ~(1 << slot
);
558 // Update pPrimData pointer
559 soContext
.pPrimData
= pPrimData
;
562 SWR_ASSERT(state
.pfnSoFunc
[streamIndex
] != nullptr, "Trying to execute uninitialized streamout jit function.");
563 state
.pfnSoFunc
[streamIndex
](soContext
);
566 // Update SO write offset. The driver provides memory for the update.
567 for (uint32_t i
= 0; i
< 4; ++i
)
569 if (state
.soBuffer
[i
].pWriteOffset
)
571 *state
.soBuffer
[i
].pWriteOffset
= soContext
.pBuffer
[i
]->streamOffset
* sizeof(uint32_t);
574 if (state
.soBuffer
[i
].soWriteEnable
)
576 pDC
->dynState
.SoWriteOffset
[i
] = soContext
.pBuffer
[i
]->streamOffset
* sizeof(uint32_t);
577 pDC
->dynState
.SoWriteOffsetDirty
[i
] = true;
581 UPDATE_STAT_FE(SoPrimStorageNeeded
[streamIndex
], soContext
.numPrimStorageNeeded
);
582 UPDATE_STAT_FE(SoNumPrimsWritten
[streamIndex
], soContext
.numPrimsWritten
);
584 AR_END(FEStreamout
, 1);
587 #if USE_SIMD16_FRONTEND
588 //////////////////////////////////////////////////////////////////////////
589 /// Is value an even number (a multiple of two)
591 template <typename T
>
592 INLINE
static bool IsEven(T value
)
594 return (value
& 1) == 0;
597 //////////////////////////////////////////////////////////////////////////
598 /// Round up value to an even number (a multiple of two)
600 template <typename T
>
601 INLINE
static T
RoundUpEven(T value
)
603 return (value
+ 1) & ~1;
606 //////////////////////////////////////////////////////////////////////////
607 /// Round down value to an even number (a multiple of two)
609 template <typename T
>
610 INLINE
static T
RoundDownEven(T value
)
615 //////////////////////////////////////////////////////////////////////////
616 /// Pack pairs of simdvertexes into simd16vertexes, assume non-overlapping
618 /// vertexCount is in terms of the source simdvertexes and must be even
620 /// attribCount will limit the vector copies to those attribs specified
622 /// note: the stride between vertexes is determinded by SWR_VTX_NUM_SLOTS
624 void PackPairsOfSimdVertexIntoSimd16Vertex(simd16vertex
*vertex_simd16
, const simdvertex
*vertex
, uint32_t vertexCount
, uint32_t attribCount
)
627 SWR_ASSERT(vertex_simd16
);
628 SWR_ASSERT(attribCount
<= SWR_VTX_NUM_SLOTS
);
632 for (uint32_t i
= 0; i
< vertexCount
; i
+= 2)
634 for (uint32_t j
= 0; j
< attribCount
; j
+= 1)
636 for (uint32_t k
= 0; k
< 4; k
+= 1)
638 temp
.attrib
[j
][k
] = _simd16_insert_ps(_simd16_setzero_ps(), vertex
[i
].attrib
[j
][k
], 0);
640 if ((i
+ 1) < vertexCount
)
642 temp
.attrib
[j
][k
] = _simd16_insert_ps(temp
.attrib
[j
][k
], vertex
[i
+ 1].attrib
[j
][k
], 1);
647 for (uint32_t j
= 0; j
< attribCount
; j
+= 1)
649 vertex_simd16
[i
>> 1].attrib
[j
] = temp
.attrib
[j
];
655 //////////////////////////////////////////////////////////////////////////
656 /// @brief Computes number of invocations. The current index represents
657 /// the start of the SIMD. The max index represents how much work
658 /// items are remaining. If there is less then a SIMD's xmin of work
659 /// then return the remaining amount of work.
660 /// @param curIndex - The start index for the SIMD.
661 /// @param maxIndex - The last index for all work items.
662 static INLINE
uint32_t GetNumInvocations(
666 uint32_t remainder
= (maxIndex
- curIndex
);
667 #if USE_SIMD16_FRONTEND
668 return (remainder
>= KNOB_SIMD16_WIDTH
) ? KNOB_SIMD16_WIDTH
: remainder
;
670 return (remainder
>= KNOB_SIMD_WIDTH
) ? KNOB_SIMD_WIDTH
: remainder
;
674 //////////////////////////////////////////////////////////////////////////
675 /// @brief Converts a streamId buffer to a cut buffer for the given stream id.
676 /// The geometry shader will loop over each active streamout buffer, assembling
677 /// primitives for the downstream stages. When multistream output is enabled,
678 /// the generated stream ID buffer from the GS needs to be converted to a cut
679 /// buffer for the primitive assembler.
680 /// @param stream - stream id to generate the cut buffer for
681 /// @param pStreamIdBase - pointer to the stream ID buffer
682 /// @param numEmittedVerts - Number of total verts emitted by the GS
683 /// @param pCutBuffer - output buffer to write cuts to
684 void ProcessStreamIdBuffer(uint32_t stream
, uint8_t* pStreamIdBase
, uint32_t numEmittedVerts
, uint8_t *pCutBuffer
)
686 SWR_ASSERT(stream
< MAX_SO_STREAMS
);
688 uint32_t numInputBytes
= (numEmittedVerts
* 2 + 7) / 8;
689 uint32_t numOutputBytes
= std::max(numInputBytes
/ 2, 1U);
691 for (uint32_t b
= 0; b
< numOutputBytes
; ++b
)
693 uint8_t curInputByte
= pStreamIdBase
[2*b
];
695 for (uint32_t i
= 0; i
< 4; ++i
)
697 if ((curInputByte
& 0x3) != stream
)
704 curInputByte
= pStreamIdBase
[2 * b
+ 1];
705 for (uint32_t i
= 0; i
< 4; ++i
)
707 if ((curInputByte
& 0x3) != stream
)
709 outByte
|= (1 << (i
+ 4));
714 *pCutBuffer
++ = outByte
;
718 THREAD SWR_GS_CONTEXT tlsGsContext
;
720 template<typename SIMDVERTEX
, uint32_t SIMD_WIDTH
>
723 GsBufferInfo(const SWR_GS_STATE
&gsState
)
725 const uint32_t vertexCount
= gsState
.maxNumVerts
;
726 const uint32_t vertexStride
= sizeof(SIMDVERTEX
);
727 const uint32_t numSimdBatches
= (vertexCount
+ SIMD_WIDTH
- 1) / SIMD_WIDTH
;
729 vertexPrimitiveStride
= vertexStride
* numSimdBatches
;
730 vertexInstanceStride
= vertexPrimitiveStride
* SIMD_WIDTH
;
732 if (gsState
.isSingleStream
)
734 cutPrimitiveStride
= (vertexCount
+ 7) / 8;
735 cutInstanceStride
= cutPrimitiveStride
* SIMD_WIDTH
;
737 streamCutPrimitiveStride
= 0;
738 streamCutInstanceStride
= 0;
742 cutPrimitiveStride
= AlignUp(vertexCount
* 2 / 8, 4);
743 cutInstanceStride
= cutPrimitiveStride
* SIMD_WIDTH
;
745 streamCutPrimitiveStride
= (vertexCount
+ 7) / 8;
746 streamCutInstanceStride
= streamCutPrimitiveStride
* SIMD_WIDTH
;
750 uint32_t vertexPrimitiveStride
;
751 uint32_t vertexInstanceStride
;
753 uint32_t cutPrimitiveStride
;
754 uint32_t cutInstanceStride
;
756 uint32_t streamCutPrimitiveStride
;
757 uint32_t streamCutInstanceStride
;
760 //////////////////////////////////////////////////////////////////////////
761 /// @brief Implements GS stage.
762 /// @param pDC - pointer to draw context.
763 /// @param workerId - thread's worker id. Even thread has a unique id.
764 /// @param pa - The primitive assembly object.
765 /// @param pGsOut - output stream for GS
767 typename HasStreamOutT
,
769 static void GeometryShaderStage(
775 void* pStreamCutBuffer
,
776 uint32_t* pSoPrimData
,
777 #if USE_SIMD16_FRONTEND
778 uint32_t numPrims_simd8
,
782 SWR_CONTEXT
*pContext
= pDC
->pContext
;
784 AR_BEGIN(FEGeometryShader
, pDC
->drawId
);
786 const API_STATE
& state
= GetApiState(pDC
);
787 const SWR_GS_STATE
* pState
= &state
.gsState
;
789 SWR_ASSERT(pGsOut
!= nullptr, "GS output buffer should be initialized");
790 SWR_ASSERT(pCutBuffer
!= nullptr, "GS output cut buffer should be initialized");
792 tlsGsContext
.pStream
= (uint8_t*)pGsOut
;
793 tlsGsContext
.pCutOrStreamIdBuffer
= (uint8_t*)pCutBuffer
;
794 tlsGsContext
.PrimitiveID
= primID
;
796 uint32_t numVertsPerPrim
= NumVertsPerPrim(pa
.binTopology
, true);
797 simdvector attrib
[MAX_ATTRIBUTES
];
799 // assemble all attributes for the input primitive
800 for (uint32_t slot
= 0; slot
< pState
->numInputAttribs
; ++slot
)
802 uint32_t attribSlot
= VERTEX_ATTRIB_START_SLOT
+ slot
;
803 pa
.Assemble(attribSlot
, attrib
);
805 for (uint32_t i
= 0; i
< numVertsPerPrim
; ++i
)
807 tlsGsContext
.vert
[i
].attrib
[attribSlot
] = attrib
[i
];
812 pa
.Assemble(VERTEX_POSITION_SLOT
, attrib
);
813 for (uint32_t i
= 0; i
< numVertsPerPrim
; ++i
)
815 tlsGsContext
.vert
[i
].attrib
[VERTEX_POSITION_SLOT
] = attrib
[i
];
818 #if USE_SIMD16_FRONTEND
819 const GsBufferInfo
<simd16vertex
, KNOB_SIMD16_WIDTH
> bufferInfo(state
.gsState
);
821 const GsBufferInfo
<simdvertex
, KNOB_SIMD_WIDTH
> bufferInfo(state
.gsState
);
824 // record valid prims from the frontend to avoid over binning the newly generated
826 #if USE_SIMD16_FRONTEND
827 uint32_t numInputPrims
= numPrims_simd8
;
829 uint32_t numInputPrims
= pa
.NumPrims();
832 for (uint32_t instance
= 0; instance
< pState
->instanceCount
; ++instance
)
834 tlsGsContext
.InstanceID
= instance
;
835 tlsGsContext
.mask
= GenerateMask(numInputPrims
);
837 // execute the geometry shader
838 state
.pfnGsFunc(GetPrivateState(pDC
), &tlsGsContext
);
840 tlsGsContext
.pStream
+= bufferInfo
.vertexInstanceStride
;
841 tlsGsContext
.pCutOrStreamIdBuffer
+= bufferInfo
.cutInstanceStride
;
844 // set up new binner and state for the GS output topology
845 #if USE_SIMD16_FRONTEND
846 PFN_PROCESS_PRIMS_SIMD16 pfnClipFunc
= nullptr;
849 switch (pState
->outputTopology
)
851 case TOP_TRIANGLE_STRIP
: pfnClipFunc
= ClipTriangles_simd16
; break;
852 case TOP_LINE_STRIP
: pfnClipFunc
= ClipLines_simd16
; break;
853 case TOP_POINT_LIST
: pfnClipFunc
= ClipPoints_simd16
; break;
854 default: SWR_INVALID("Unexpected GS output topology: %d", pState
->outputTopology
);
859 PFN_PROCESS_PRIMS pfnClipFunc
= nullptr;
862 switch (pState
->outputTopology
)
864 case TOP_TRIANGLE_STRIP
: pfnClipFunc
= ClipTriangles
; break;
865 case TOP_LINE_STRIP
: pfnClipFunc
= ClipLines
; break;
866 case TOP_POINT_LIST
: pfnClipFunc
= ClipPoints
; break;
867 default: SWR_INVALID("Unexpected GS output topology: %d", pState
->outputTopology
);
872 // foreach input prim:
873 // - setup a new PA based on the emitted verts for that prim
874 // - loop over the new verts, calling PA to assemble each prim
875 uint32_t* pVertexCount
= (uint32_t*)&tlsGsContext
.vertexCount
;
876 uint32_t* pPrimitiveId
= (uint32_t*)&primID
;
878 uint32_t totalPrimsGenerated
= 0;
879 for (uint32_t inputPrim
= 0; inputPrim
< numInputPrims
; ++inputPrim
)
881 uint8_t* pInstanceBase
= (uint8_t*)pGsOut
+ inputPrim
* bufferInfo
.vertexPrimitiveStride
;
882 uint8_t* pCutBufferBase
= (uint8_t*)pCutBuffer
+ inputPrim
* bufferInfo
.cutPrimitiveStride
;
884 for (uint32_t instance
= 0; instance
< pState
->instanceCount
; ++instance
)
886 uint32_t numEmittedVerts
= pVertexCount
[inputPrim
];
887 if (numEmittedVerts
== 0)
892 uint8_t* pBase
= pInstanceBase
+ instance
* bufferInfo
.vertexInstanceStride
;
893 uint8_t* pCutBase
= pCutBufferBase
+ instance
* bufferInfo
.cutInstanceStride
;
895 uint32_t numAttribs
= state
.feNumAttributes
;
897 for (uint32_t stream
= 0; stream
< MAX_SO_STREAMS
; ++stream
)
899 bool processCutVerts
= false;
901 uint8_t* pCutBuffer
= pCutBase
;
903 // assign default stream ID, only relevant when GS is outputting a single stream
904 uint32_t streamID
= 0;
905 if (pState
->isSingleStream
)
907 processCutVerts
= true;
908 streamID
= pState
->singleStreamID
;
909 if (streamID
!= stream
) continue;
913 // early exit if this stream is not enabled for streamout
914 if (HasStreamOutT::value
&& !state
.soState
.streamEnable
[stream
])
919 // multi-stream output, need to translate StreamID buffer to a cut buffer
920 ProcessStreamIdBuffer(stream
, pCutBase
, numEmittedVerts
, (uint8_t*)pStreamCutBuffer
);
921 pCutBuffer
= (uint8_t*)pStreamCutBuffer
;
922 processCutVerts
= false;
925 #if USE_SIMD16_FRONTEND
926 PA_STATE_CUT
gsPa(pDC
, pBase
, numEmittedVerts
, reinterpret_cast<simd16mask
*>(pCutBuffer
), numEmittedVerts
, numAttribs
, pState
->outputTopology
, processCutVerts
);
929 PA_STATE_CUT
gsPa(pDC
, pBase
, numEmittedVerts
, pCutBuffer
, numEmittedVerts
, numAttribs
, pState
->outputTopology
, processCutVerts
);
932 while (gsPa
.GetNextStreamOutput())
936 #if USE_SIMD16_FRONTEND
937 simd16vector attrib_simd16
[3];
939 bool assemble
= gsPa
.Assemble_simd16(VERTEX_POSITION_SLOT
, attrib_simd16
);
942 bool assemble
= gsPa
.Assemble(VERTEX_POSITION_SLOT
, attrib
);
947 totalPrimsGenerated
+= gsPa
.NumPrims();
949 if (HasStreamOutT::value
)
951 #if USE_SIMD16_FRONTEND
952 const uint32_t numPrims
= gsPa
.NumPrims();
953 const uint32_t numPrims_lo
= std::min
<uint32_t>(numPrims
, KNOB_SIMD_WIDTH
);
954 const uint32_t numPrims_hi
= std::max
<uint32_t>(numPrims
, KNOB_SIMD_WIDTH
) - KNOB_SIMD_WIDTH
;
956 gsPa
.useAlternateOffset
= false;
957 StreamOut(pDC
, gsPa
, workerId
, pSoPrimData
, numPrims_lo
, stream
);
961 gsPa
.useAlternateOffset
= true;
962 StreamOut(pDC
, gsPa
, workerId
, pSoPrimData
, numPrims_hi
, stream
);
965 StreamOut(pDC
, gsPa
, workerId
, pSoPrimData
, stream
);
969 if (HasRastT::value
&& state
.soState
.streamToRasterizer
== stream
)
971 #if USE_SIMD16_FRONTEND
972 simd16scalari vPrimId
;
973 // pull primitiveID from the GS output if available
974 if (state
.gsState
.emitsPrimitiveID
)
976 simd16vector primIdAttrib
[3];
977 gsPa
.Assemble_simd16(VERTEX_PRIMID_SLOT
, primIdAttrib
);
978 vPrimId
= _simd16_castps_si(primIdAttrib
[state
.frontendState
.topologyProvokingVertex
].x
);
982 vPrimId
= _simd16_set1_epi32(pPrimitiveId
[inputPrim
]);
985 // use viewport array index if GS declares it as an output attribute. Otherwise use index 0.
986 simd16scalari vViewPortIdx
;
987 if (state
.gsState
.emitsViewportArrayIndex
)
989 simd16vector vpiAttrib
[3];
990 gsPa
.Assemble_simd16(VERTEX_VIEWPORT_ARRAY_INDEX_SLOT
, vpiAttrib
);
992 // OOB indices => forced to zero.
993 simd16scalari vNumViewports
= _simd16_set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS
);
994 simd16scalari vClearMask
= _simd16_cmplt_epi32(_simd16_castps_si(vpiAttrib
[0].x
), vNumViewports
);
995 vpiAttrib
[0].x
= _simd16_and_ps(_simd16_castsi_ps(vClearMask
), vpiAttrib
[0].x
);
997 vViewPortIdx
= _simd16_castps_si(vpiAttrib
[0].x
);
1001 vViewPortIdx
= _simd16_set1_epi32(0);
1004 gsPa
.useAlternateOffset
= false;
1005 pfnClipFunc(pDC
, gsPa
, workerId
, attrib_simd16
, GenMask(gsPa
.NumPrims()), vPrimId
, vViewPortIdx
);
1007 simdscalari vPrimId
;
1008 // pull primitiveID from the GS output if available
1009 if (state
.gsState
.emitsPrimitiveID
)
1011 simdvector primIdAttrib
[3];
1012 gsPa
.Assemble(VERTEX_PRIMID_SLOT
, primIdAttrib
);
1013 vPrimId
= _simd_castps_si(primIdAttrib
[state
.frontendState
.topologyProvokingVertex
].x
);
1017 vPrimId
= _simd_set1_epi32(pPrimitiveId
[inputPrim
]);
1020 // use viewport array index if GS declares it as an output attribute. Otherwise use index 0.
1021 simdscalari vViewPortIdx
;
1022 if (state
.gsState
.emitsViewportArrayIndex
)
1024 simdvector vpiAttrib
[3];
1025 gsPa
.Assemble(VERTEX_VIEWPORT_ARRAY_INDEX_SLOT
, vpiAttrib
);
1027 // OOB indices => forced to zero.
1028 simdscalari vNumViewports
= _simd_set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS
);
1029 simdscalari vClearMask
= _simd_cmplt_epi32(_simd_castps_si(vpiAttrib
[0].x
), vNumViewports
);
1030 vpiAttrib
[0].x
= _simd_and_ps(_simd_castsi_ps(vClearMask
), vpiAttrib
[0].x
);
1032 vViewPortIdx
= _simd_castps_si(vpiAttrib
[0].x
);
1036 vViewPortIdx
= _simd_set1_epi32(0);
1039 pfnClipFunc(pDC
, gsPa
, workerId
, attrib
, GenMask(gsPa
.NumPrims()), vPrimId
, vViewPortIdx
);
1043 } while (gsPa
.NextPrim());
1049 // update GS pipeline stats
1050 UPDATE_STAT_FE(GsInvocations
, numInputPrims
* pState
->instanceCount
);
1051 UPDATE_STAT_FE(GsPrimitives
, totalPrimsGenerated
);
1052 AR_EVENT(GSPrimInfo(numInputPrims
, totalPrimsGenerated
, numVertsPerPrim
*numInputPrims
));
1053 AR_END(FEGeometryShader
, 1);
1056 //////////////////////////////////////////////////////////////////////////
1057 /// @brief Allocate GS buffers
1058 /// @param pDC - pointer to draw context.
1059 /// @param state - API state
1060 /// @param ppGsOut - pointer to GS output buffer allocation
1061 /// @param ppCutBuffer - pointer to GS output cut buffer allocation
1062 template<typename SIMDVERTEX
, uint32_t SIMD_WIDTH
>
1063 static INLINE
void AllocateGsBuffers(DRAW_CONTEXT
* pDC
, const API_STATE
& state
, void** ppGsOut
, void** ppCutBuffer
,
1064 void **ppStreamCutBuffer
)
1066 auto pArena
= pDC
->pArena
;
1067 SWR_ASSERT(pArena
!= nullptr);
1068 SWR_ASSERT(state
.gsState
.gsEnable
);
1070 // allocate arena space to hold GS output verts
1071 // @todo pack attribs
1072 // @todo support multiple streams
1074 const GsBufferInfo
<SIMDVERTEX
, SIMD_WIDTH
> bufferInfo(state
.gsState
);
1076 const uint32_t vertexBufferSize
= state
.gsState
.instanceCount
* bufferInfo
.vertexInstanceStride
;
1078 *ppGsOut
= pArena
->AllocAligned(vertexBufferSize
, SIMD_WIDTH
* sizeof(float));
1080 // allocate arena space to hold cut or streamid buffer, which is essentially a bitfield sized to the
1081 // maximum vertex output as defined by the GS state, per SIMD lane, per GS instance
1083 // allocate space for temporary per-stream cut buffer if multi-stream is enabled
1084 if (state
.gsState
.isSingleStream
)
1086 const uint32_t cutBufferSize
= state
.gsState
.instanceCount
* bufferInfo
.cutInstanceStride
;
1088 *ppCutBuffer
= pArena
->AllocAligned(cutBufferSize
, SIMD_WIDTH
* sizeof(float));
1089 *ppStreamCutBuffer
= nullptr;
1093 const uint32_t cutBufferSize
= state
.gsState
.instanceCount
* bufferInfo
.cutInstanceStride
;
1094 const uint32_t streamCutBufferSize
= state
.gsState
.instanceCount
* bufferInfo
.streamCutInstanceStride
;
1096 *ppCutBuffer
= pArena
->AllocAligned(cutBufferSize
, SIMD_WIDTH
* sizeof(float));
1097 *ppStreamCutBuffer
= pArena
->AllocAligned(streamCutBufferSize
, SIMD_WIDTH
* sizeof(float));
1101 //////////////////////////////////////////////////////////////////////////
1102 /// @brief Contains all data generated by the HS and passed to the
1103 /// tessellator and DS.
1104 struct TessellationThreadLocalData
1106 SWR_HS_CONTEXT hsContext
;
1107 ScalarPatch patchData
[KNOB_SIMD_WIDTH
];
1111 simdscalar
* pDSOutput
;
1112 size_t numDSOutputVectors
;
// Per-worker-thread tessellation scratch data (THREAD is presumably the project's
// thread-local-storage macro — confirm in knobs/os headers). Allocated lazily by
// AllocateTessellationData on first use.
THREAD TessellationThreadLocalData* gt_pTessellationThreadData = nullptr;
1117 //////////////////////////////////////////////////////////////////////////
1118 /// @brief Allocate tessellation data for this worker thread.
1120 static void AllocateTessellationData(SWR_CONTEXT
* pContext
)
1122 /// @TODO - Don't use thread local storage. Use Worker local storage instead.
1123 if (gt_pTessellationThreadData
== nullptr)
1125 gt_pTessellationThreadData
= (TessellationThreadLocalData
*)
1126 AlignedMalloc(sizeof(TessellationThreadLocalData
), 64);
1127 memset(gt_pTessellationThreadData
, 0, sizeof(*gt_pTessellationThreadData
));
1131 //////////////////////////////////////////////////////////////////////////
1132 /// @brief Implements Tessellation Stages.
1133 /// @param pDC - pointer to draw context.
1134 /// @param workerId - thread's worker id. Even thread has a unique id.
1135 /// @param pa - The primitive assembly object.
1136 /// @param pGsOut - output stream for GS
template <
    typename HasGeometryShaderT,
    typename HasStreamOutT,
    typename HasRastT>
static void TessellationStages(
    DRAW_CONTEXT *pDC,
    uint32_t workerId,
    PA_STATE& pa,
    void* pGsOut,
    void* pCutBuffer,
    void* pCutStreamBuffer,
    uint32_t* pSoPrimData,
#if USE_SIMD16_FRONTEND
    uint32_t numPrims_simd8,
#endif
    simdscalari primID)
{
    SWR_CONTEXT *pContext = pDC->pContext;
    const API_STATE& state = GetApiState(pDC);
    const SWR_TS_STATE& tsState = state.tsState;

    SWR_ASSERT(gt_pTessellationThreadData);

    // Initialize the tessellator in this thread's scratch memory; a null handle
    // means the scratch buffer is too small, so allocate it and retry.
    // NOTE(review): the first argument line was lost in extraction; tsState.domain
    // matches the tessellator interface — confirm against tessellator.h.
    HANDLE tsCtx = TSInitCtx(
        tsState.domain,
        tsState.partitioning,
        tsState.tsOutputTopology,
        gt_pTessellationThreadData->pTxCtx,
        gt_pTessellationThreadData->tsCtxSize);
    if (tsCtx == nullptr)
    {
        gt_pTessellationThreadData->pTxCtx = AlignedMalloc(gt_pTessellationThreadData->tsCtxSize, 64);
        tsCtx = TSInitCtx(
            tsState.domain,
            tsState.partitioning,
            tsState.tsOutputTopology,
            gt_pTessellationThreadData->pTxCtx,
            gt_pTessellationThreadData->tsCtxSize);
    }
    SWR_ASSERT(tsCtx);

    // Select the clip/bin entry point matching the topology the DS produces.
#if USE_SIMD16_FRONTEND
    PFN_PROCESS_PRIMS_SIMD16 pfnClipFunc = nullptr;
    if (HasRastT::value)
    {
        switch (tsState.postDSTopology)
        {
        case TOP_TRIANGLE_LIST: pfnClipFunc = ClipTriangles_simd16; break;
        case TOP_LINE_LIST:     pfnClipFunc = ClipLines_simd16; break;
        case TOP_POINT_LIST:    pfnClipFunc = ClipPoints_simd16; break;
        default: SWR_INVALID("Unexpected DS output topology: %d", tsState.postDSTopology);
        }
    }
#else
    PFN_PROCESS_PRIMS pfnClipFunc = nullptr;
    if (HasRastT::value)
    {
        switch (tsState.postDSTopology)
        {
        case TOP_TRIANGLE_LIST: pfnClipFunc = ClipTriangles; break;
        case TOP_LINE_LIST:     pfnClipFunc = ClipLines; break;
        case TOP_POINT_LIST:    pfnClipFunc = ClipPoints; break;
        default: SWR_INVALID("Unexpected DS output topology: %d", tsState.postDSTopology);
        }
    }
#endif

    SWR_HS_CONTEXT& hsContext = gt_pTessellationThreadData->hsContext;
    hsContext.pCPout = gt_pTessellationThreadData->patchData;
    hsContext.PrimitiveID = primID;

    uint32_t numVertsPerPrim = NumVertsPerPrim(pa.binTopology, false);
    // Max storage for one attribute for an entire simdprimitive
    simdvector simdattrib[MAX_NUM_VERTS_PER_PRIM];

    // assemble all attributes for the input primitives into the HS input verts
    for (uint32_t slot = 0; slot < tsState.numHsInputAttribs; ++slot)
    {
        uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + slot;
        pa.Assemble(attribSlot, simdattrib);

        for (uint32_t i = 0; i < numVertsPerPrim; ++i)
        {
            hsContext.vert[i].attrib[attribSlot] = simdattrib[i];
        }
    }

#if defined(_DEBUG)
    // Poison the HS output so stale data is obvious when debugging.
    memset(hsContext.pCPout, 0x90, sizeof(ScalarPatch) * KNOB_SIMD_WIDTH);
#endif

#if USE_SIMD16_FRONTEND
    uint32_t numPrims = numPrims_simd8;
#else
    uint32_t numPrims = pa.NumPrims();
#endif
    hsContext.mask = GenerateMask(numPrims);

    // Run the hull shader for the whole SIMD batch of patches.
    AR_BEGIN(FEHullShader, pDC->drawId);
    state.pfnHsFunc(GetPrivateState(pDC), &hsContext);
    AR_END(FEHullShader, 0);

    UPDATE_STAT_FE(HsInvocations, numPrims);

    // primID is a SIMD register; view it as scalar lanes for per-patch IDs.
    const uint32_t* pPrimId = (const uint32_t*)&primID;

    for (uint32_t p = 0; p < numPrims; ++p)
    {
        // Run the fixed-function tessellator for patch p.
        SWR_TS_TESSELLATED_DATA tsData = { 0 };
        AR_BEGIN(FETessellation, pDC->drawId);
        TSTessellate(tsCtx, hsContext.pCPout[p].tessFactors, tsData);
        AR_EVENT(TessPrimCount(1));
        AR_END(FETessellation, 0);

        if (tsData.NumPrimitives == 0)
        {
            // Patch was culled by the tessellator; nothing to shade or rasterize.
            continue;
        }
        SWR_ASSERT(tsData.NumDomainPoints);

        // Allocate DS Output memory: one SIMD vector per KNOB_SIMD_WIDTH domain
        // points, per DS output attribute.
        uint32_t requiredDSVectorInvocations = AlignUp(tsData.NumDomainPoints, KNOB_SIMD_WIDTH) / KNOB_SIMD_WIDTH;
        size_t requiredDSOutputVectors = requiredDSVectorInvocations * tsState.numDsOutputAttribs;
#if USE_SIMD16_FRONTEND
        size_t requiredAllocSize = sizeof(simdvector) * RoundUpEven(requiredDSVectorInvocations) * tsState.numDsOutputAttribs;      // simd8 -> simd16, padding
#else
        size_t requiredAllocSize = sizeof(simdvector) * requiredDSOutputVectors;
#endif
        if (requiredDSOutputVectors > gt_pTessellationThreadData->numDSOutputVectors)
        {
            // Grow the thread-local DS output buffer (free + realloc, no copy needed).
            AlignedFree(gt_pTessellationThreadData->pDSOutput);
            gt_pTessellationThreadData->pDSOutput = (simdscalar*)AlignedMalloc(requiredAllocSize, 64);
#if USE_SIMD16_FRONTEND
            gt_pTessellationThreadData->numDSOutputVectors = RoundUpEven(requiredDSVectorInvocations) * tsState.numDsOutputAttribs; // simd8 -> simd16, padding
#else
            gt_pTessellationThreadData->numDSOutputVectors = requiredDSOutputVectors;
#endif
        }
        SWR_ASSERT(gt_pTessellationThreadData->pDSOutput);
        SWR_ASSERT(gt_pTessellationThreadData->numDSOutputVectors >= requiredDSOutputVectors);

#if defined(_DEBUG)
        // Poison the DS output buffer.
        memset(gt_pTessellationThreadData->pDSOutput, 0x90, requiredAllocSize);
#endif

        // Run Domain Shader over all generated domain points.
        SWR_DS_CONTEXT dsContext;
        dsContext.PrimitiveID = pPrimId[p];
        dsContext.pCpIn = &hsContext.pCPout[p];
        dsContext.pDomainU = (simdscalar*)tsData.pDomainPointsU;
        dsContext.pDomainV = (simdscalar*)tsData.pDomainPointsV;
        dsContext.pOutputData = gt_pTessellationThreadData->pDSOutput;
#if USE_SIMD16_FRONTEND
        dsContext.vectorStride = RoundUpEven(requiredDSVectorInvocations);  // simd8 -> simd16
#else
        dsContext.vectorStride = requiredDSVectorInvocations;
#endif

        uint32_t dsInvocations = 0;

        for (dsContext.vectorOffset = 0; dsContext.vectorOffset < requiredDSVectorInvocations; ++dsContext.vectorOffset)
        {
            // Mask off lanes beyond the last real domain point in the final vector.
            dsContext.mask = GenerateMask(tsData.NumDomainPoints - dsInvocations);

            AR_BEGIN(FEDomainShader, pDC->drawId);
            state.pfnDsFunc(GetPrivateState(pDC), &dsContext);
            AR_END(FEDomainShader, 0);

            dsInvocations += KNOB_SIMD_WIDTH;
        }
        UPDATE_STAT_FE(DsInvocations, tsData.NumDomainPoints);

#if USE_SIMD16_FRONTEND
        SWR_ASSERT(IsEven(dsContext.vectorStride));                         // simd8 -> simd16
#endif

        // Assemble the tessellator-generated primitives from the DS output.
        PA_TESS tessPa(
            pDC,
#if USE_SIMD16_FRONTEND
            reinterpret_cast<const simd16scalar *>(dsContext.pOutputData),  // simd8 -> simd16
            dsContext.vectorStride / 2,                                     // simd8 -> simd16
#else
            dsContext.pOutputData,
            dsContext.vectorStride,
#endif
            tsState.numDsOutputAttribs,
            tsData.ppIndices,
            tsData.NumPrimitives,
            tsState.postDSTopology);

        while (tessPa.HasWork())
        {
#if USE_SIMD16_FRONTEND
            // Split the simd16 batch into two simd8 halves for the downstream stages.
            const uint32_t numPrims = tessPa.NumPrims();
            const uint32_t numPrims_lo = std::min<uint32_t>(numPrims, KNOB_SIMD_WIDTH);
            const uint32_t numPrims_hi = std::max<uint32_t>(numPrims, KNOB_SIMD_WIDTH) - KNOB_SIMD_WIDTH;

            const simd16scalari primID = _simd16_set1_epi32(dsContext.PrimitiveID);
            const simdscalari primID_lo = _simd16_extract_si(primID, 0);
            const simdscalari primID_hi = _simd16_extract_si(primID, 1);

#endif
            if (HasGeometryShaderT::value)
            {
#if USE_SIMD16_FRONTEND
                tessPa.useAlternateOffset = false;
                GeometryShaderStage<HasStreamOutT, HasRastT>(pDC, workerId, tessPa, pGsOut, pCutBuffer, pCutStreamBuffer, pSoPrimData, numPrims_lo, primID_lo);

                if (numPrims_hi)
                {
                    tessPa.useAlternateOffset = true;
                    GeometryShaderStage<HasStreamOutT, HasRastT>(pDC, workerId, tessPa, pGsOut, pCutBuffer, pCutStreamBuffer, pSoPrimData, numPrims_hi, primID_hi);
                }
#else
                GeometryShaderStage<HasStreamOutT, HasRastT>(
                    pDC, workerId, tessPa, pGsOut, pCutBuffer, pCutStreamBuffer, pSoPrimData,
                    _simd_set1_epi32(dsContext.PrimitiveID));
#endif
            }

            if (HasStreamOutT::value)
            {
#if USE_SIMD16_FRONTEND
                tessPa.useAlternateOffset = false;
                StreamOut(pDC, tessPa, workerId, pSoPrimData, numPrims_lo, 0);

                if (numPrims_hi)
                {
                    tessPa.useAlternateOffset = true;
                    StreamOut(pDC, tessPa, workerId, pSoPrimData, numPrims_hi, 0);
                }
#else
                StreamOut(pDC, tessPa, workerId, pSoPrimData, 0);
#endif
            }

            if (HasRastT::value)
            {
#if USE_SIMD16_FRONTEND
                simd16vector prim_simd16[3];    // Only deal with triangles, lines, or points
#else
                simdvector prim[3];             // Only deal with triangles, lines, or points
#endif
                AR_BEGIN(FEPAAssemble, pDC->drawId);
                bool assemble =
#if USE_SIMD16_FRONTEND
                    tessPa.Assemble_simd16(VERTEX_POSITION_SLOT, prim_simd16);
#else
                    tessPa.Assemble(VERTEX_POSITION_SLOT, prim);
#endif
                AR_END(FEPAAssemble, 1);
                SWR_ASSERT(assemble);

                SWR_ASSERT(pfnClipFunc);
#if USE_SIMD16_FRONTEND
                tessPa.useAlternateOffset = false;
                pfnClipFunc(pDC, tessPa, workerId, prim_simd16, GenMask(numPrims), primID, _simd16_set1_epi32(0));
#else
                pfnClipFunc(pDC, tessPa, workerId, prim,
                    GenMask(tessPa.NumPrims()), _simd_set1_epi32(dsContext.PrimitiveID), _simd_set1_epi32(0));
#endif
            }

            // Advance the tessellation PA to the next SIMD batch of primitives.
            tessPa.NextPrim();

        } // while (tessPa.HasWork())
    } // for (uint32_t p = 0; p < numPrims; ++p)

#if USE_SIMD16_FRONTEND
    // Release the per-patch DS output buffer between draws in the simd16 path.
    if (gt_pTessellationThreadData->pDSOutput != nullptr)
    {
        AlignedFree(gt_pTessellationThreadData->pDSOutput);
        gt_pTessellationThreadData->pDSOutput = nullptr;
    }
    gt_pTessellationThreadData->numDSOutputVectors = 0;

#endif
    TSDestroyCtx(tsCtx);
}
// Per-worker-thread vertex store handed to the primitive assembler; grown on
// demand by ProcessDraw (see the gVertexStoreSize growth loop there).
// gVertexStoreSize is the store's current capacity in SIMDVERTEX units.
THREAD PA_STATE::SIMDVERTEX *pVertexStore = nullptr;
THREAD uint32_t gVertexStoreSize = 0;
1425 //////////////////////////////////////////////////////////////////////////
1426 /// @brief FE handler for SwrDraw.
1427 /// @tparam IsIndexedT - Is indexed drawing enabled
1428 /// @tparam HasTessellationT - Is tessellation enabled
1429 /// @tparam HasGeometryShaderT::value - Is the geometry shader stage enabled
1430 /// @tparam HasStreamOutT - Is stream-out enabled
1431 /// @tparam HasRastT - Is rasterization enabled
1432 /// @param pContext - pointer to SWR context.
1433 /// @param pDC - pointer to draw context.
1434 /// @param workerId - thread's worker id.
1435 /// @param pUserData - Pointer to DRAW_WORK
template <
    typename IsIndexedT,
    typename IsCutIndexEnabledT,
    typename HasTessellationT,
    typename HasGeometryShaderT,
    typename HasStreamOutT,
    typename HasRastT>
// NOTE(review): the signature lines between pContext and pUserData were lost in
// extraction; reconstructed to the PFN_FE_WORK_FUNC shape (workerId and
// pUserData are used below; threadId is part of the work-function contract) —
// confirm against the PFN_FE_WORK_FUNC typedef.
void ProcessDraw(
    SWR_CONTEXT *pContext,
    DRAW_CONTEXT *pDC,
    uint32_t workerId,
    uint32_t threadId,
    void *pUserData)
{
#if KNOB_ENABLE_TOSS_POINTS
    if (KNOB_TOSS_QUEUE_FE)
    {
        return;
    }
#endif

    AR_BEGIN(FEProcessDraw, pDC->drawId);

    DRAW_WORK& work = *(DRAW_WORK*)pUserData;
    const API_STATE& state = GetApiState(pDC);

    uint32_t indexSize = 0;
    uint32_t endVertex = work.numVerts;

    // For indexed draws, compute the index element size and the address just past
    // the last index this draw will consume (used to mask trailing fetches).
    const int32_t* pLastRequestedIndex = nullptr;
    if (IsIndexedT::value)
    {
        switch (work.type)
        {
        case R32_UINT:
            indexSize = sizeof(uint32_t);
            pLastRequestedIndex = &(work.pIB[endVertex]);
            break;
        case R16_UINT:
            indexSize = sizeof(uint16_t);
            // nasty address offset to last index
            pLastRequestedIndex = (int32_t*)(&(((uint16_t*)work.pIB)[endVertex]));
            break;
        case R8_UINT:
            indexSize = sizeof(uint8_t);
            // nasty address offset to last index
            pLastRequestedIndex = (int32_t*)(&(((uint8_t*)work.pIB)[endVertex]));
            break;
        default:
            SWR_INVALID("Invalid work.type: %d", work.type);
        }
    }
    else
    {
        // No cuts, prune partial primitives.
        endVertex = GetNumVerts(state.topology, GetNumPrims(state.topology, work.numVerts));
    }

#if defined(KNOB_ENABLE_RDTSC) || defined(KNOB_ENABLE_AR)
    uint32_t numPrims = GetNumPrims(state.topology, work.numVerts);
#endif

    void* pGsOut = nullptr;
    void* pCutBuffer = nullptr;
    void* pStreamCutBuffer = nullptr;
    if (HasGeometryShaderT::value)
    {
#if USE_SIMD16_FRONTEND
        AllocateGsBuffers<simd16vertex, KNOB_SIMD16_WIDTH>(pDC, state, &pGsOut, &pCutBuffer, &pStreamCutBuffer);
#else
        AllocateGsBuffers<simdvertex, KNOB_SIMD_WIDTH>(pDC, state, &pGsOut, &pCutBuffer, &pStreamCutBuffer);
#endif
    }

    if (HasTessellationT::value)
    {
        SWR_ASSERT(state.tsState.tsEnable == true);
        SWR_ASSERT(state.pfnHsFunc != nullptr);
        SWR_ASSERT(state.pfnDsFunc != nullptr);

        AllocateTessellationData(pContext);
    }
    else
    {
        SWR_ASSERT(state.tsState.tsEnable == false);
        SWR_ASSERT(state.pfnHsFunc == nullptr);
        SWR_ASSERT(state.pfnDsFunc == nullptr);
    }

    // allocate space for streamout input prim data
    uint32_t* pSoPrimData = nullptr;
    if (HasStreamOutT::value)
    {
        pSoPrimData = (uint32_t*)pDC->pArena->AllocAligned(4096, 16);
    }

    const uint32_t vertexCount = NumVertsPerPrim(state.topology, state.gsState.gsEnable);

    SWR_ASSERT(vertexCount <= MAX_NUM_VERTS_PER_PRIM);

    // grow the vertex store for the PA as necessary
    if (gVertexStoreSize < vertexCount)
    {
        if (pVertexStore != nullptr)
        {
            AlignedFree(pVertexStore);
        }

        while (gVertexStoreSize < vertexCount)
        {
#if USE_SIMD16_FRONTEND
            gVertexStoreSize += 4; // grow in chunks of 4 simd16vertex
#else
            gVertexStoreSize += 8; // grow in chunks of 8 simdvertex
#endif
        }

        SWR_ASSERT(gVertexStoreSize <= MAX_NUM_VERTS_PER_PRIM);

        pVertexStore = reinterpret_cast<PA_STATE::SIMDVERTEX *>(AlignedMalloc(gVertexStoreSize * sizeof(pVertexStore[0]), 64));

        SWR_ASSERT(pVertexStore != nullptr);
    }

    // choose primitive assembler
    PA_FACTORY<IsIndexedT, IsCutIndexEnabledT> paFactory(pDC, state.topology, work.numVerts, pVertexStore, gVertexStoreSize);
    PA_STATE& pa = paFactory.GetPA();

#if USE_SIMD16_FRONTEND
    // simd16 front end: fetch/shade two simd8 halves (lo/hi) per iteration.
    simdvertex          vin_lo;
    simdvertex          vin_hi;
    SWR_VS_CONTEXT      vsContext_lo;
    SWR_VS_CONTEXT      vsContext_hi;

    vsContext_lo.pVin = &vin_lo;
    vsContext_hi.pVin = &vin_hi;
    vsContext_lo.AlternateOffset = 0;
    vsContext_hi.AlternateOffset = 1;

    SWR_FETCH_CONTEXT fetchInfo_lo = { 0 };

    fetchInfo_lo.pStreams = &state.vertexBuffers[0];
    fetchInfo_lo.StartInstance = work.startInstance;
    fetchInfo_lo.StartVertex = 0;

    if (IsIndexedT::value)
    {
        fetchInfo_lo.BaseVertex = work.baseVertex;

        // if the entire index buffer isn't being consumed, set the last index
        // so that fetches < a SIMD wide will be masked off
        fetchInfo_lo.pLastIndex = (const int32_t*)(((uint8_t*)state.indexBuffer.pIndices) + state.indexBuffer.size);
        if (pLastRequestedIndex < fetchInfo_lo.pLastIndex)
        {
            fetchInfo_lo.pLastIndex = pLastRequestedIndex;
        }
    }
    else
    {
        fetchInfo_lo.StartVertex = work.startVertex;
    }

    SWR_FETCH_CONTEXT fetchInfo_hi = fetchInfo_lo;

    const simd16scalari vScale = _simd16_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);

    for (uint32_t instanceNum = 0; instanceNum < work.numInstances; instanceNum++)
    {
        uint32_t i = 0;

        simd16scalari vIndex;

        if (IsIndexedT::value)
        {
            fetchInfo_lo.pIndices = work.pIB;
            fetchInfo_hi.pIndices = (int32_t *)((uint8_t *)fetchInfo_lo.pIndices + KNOB_SIMD_WIDTH * indexSize);    // 1/2 of KNOB_SIMD16_WIDTH
        }
        else
        {
            // Auto-generated sequential vertex IDs for non-indexed draws.
            vIndex = _simd16_add_epi32(_simd16_set1_epi32(work.startVertexID), vScale);

            fetchInfo_lo.pIndices = (const int32_t *)&vIndex;
            fetchInfo_hi.pIndices = (const int32_t *)&vIndex + KNOB_SIMD_WIDTH; // 1/2 of KNOB_SIMD16_WIDTH
        }

        fetchInfo_lo.CurInstance = instanceNum;
        fetchInfo_hi.CurInstance = instanceNum;

        vsContext_lo.InstanceID = instanceNum;
        vsContext_hi.InstanceID = instanceNum;

        while (pa.HasWork())
        {
            // GetNextVsOutput currently has the side effect of updating some PA state machine state.
            // So we need to keep this outside of (i < endVertex) check.

            simdmask *pvCutIndices_lo = nullptr;
            simdmask *pvCutIndices_hi = nullptr;

            if (IsIndexedT::value)
            {
                // simd16mask <=> simdmask[2]

                pvCutIndices_lo = &reinterpret_cast<simdmask *>(&pa.GetNextVsIndices())[0];
                pvCutIndices_hi = &reinterpret_cast<simdmask *>(&pa.GetNextVsIndices())[1];
            }

            simd16vertex &vout = pa.GetNextVsOutput();

            vsContext_lo.pVout = reinterpret_cast<simdvertex *>(&vout);
            vsContext_hi.pVout = reinterpret_cast<simdvertex *>(&vout);

            if (i < endVertex)
            {
                // 1. Execute FS/VS for a single SIMD.
                AR_BEGIN(FEFetchShader, pDC->drawId);
                state.pfnFetchFunc(fetchInfo_lo, vin_lo);

                if ((i + KNOB_SIMD_WIDTH) < endVertex)  // 1/2 of KNOB_SIMD16_WIDTH
                {
                    state.pfnFetchFunc(fetchInfo_hi, vin_hi);
                }
                AR_END(FEFetchShader, 0);

                // forward fetch generated vertex IDs to the vertex shader
                vsContext_lo.VertexID = fetchInfo_lo.VertexID;
                vsContext_hi.VertexID = fetchInfo_hi.VertexID;

                // Setup active mask for vertex shader.
                vsContext_lo.mask = GenerateMask(endVertex - i);
                vsContext_hi.mask = GenerateMask(endVertex - (i + KNOB_SIMD_WIDTH));

                // forward cut mask to the PA
                if (IsIndexedT::value)
                {
                    *pvCutIndices_lo = _simd_movemask_ps(_simd_castsi_ps(fetchInfo_lo.CutMask));
                    *pvCutIndices_hi = _simd_movemask_ps(_simd_castsi_ps(fetchInfo_hi.CutMask));
                }

                UPDATE_STAT_FE(IaVertices, GetNumInvocations(i, endVertex));

#if KNOB_ENABLE_TOSS_POINTS
                if (!KNOB_TOSS_FETCH)
#endif
                {
                    AR_BEGIN(FEVertexShader, pDC->drawId);
                    state.pfnVertexFunc(GetPrivateState(pDC), &vsContext_lo);

                    if ((i + KNOB_SIMD_WIDTH) < endVertex)  // 1/2 of KNOB_SIMD16_WIDTH
                    {
                        state.pfnVertexFunc(GetPrivateState(pDC), &vsContext_hi);
                    }
                    AR_END(FEVertexShader, 0);

                    UPDATE_STAT_FE(VsInvocations, GetNumInvocations(i, endVertex));
                }
            }

            // 2. Assemble primitives given the last two SIMD.
            do
            {
                simd16vector prim_simd16[MAX_NUM_VERTS_PER_PRIM];

                RDTSC_START(FEPAAssemble);
                bool assemble = pa.Assemble_simd16(VERTEX_POSITION_SLOT, prim_simd16);
                RDTSC_STOP(FEPAAssemble, 1, 0);

#if KNOB_ENABLE_TOSS_POINTS
                if (!KNOB_TOSS_FETCH)
#endif
                {
#if KNOB_ENABLE_TOSS_POINTS
                    if (!KNOB_TOSS_IA)
#endif
                    {
                        if (assemble)
                        {
                            UPDATE_STAT_FE(IaPrimitives, pa.NumPrims());

                            // Split into two simd8 halves for the downstream stages.
                            const uint32_t numPrims = pa.NumPrims();
                            const uint32_t numPrims_lo = std::min<uint32_t>(numPrims, KNOB_SIMD_WIDTH);
                            const uint32_t numPrims_hi = std::max<uint32_t>(numPrims, KNOB_SIMD_WIDTH) - KNOB_SIMD_WIDTH;

                            const simd16scalari primID = pa.GetPrimID(work.startPrimID);
                            const simdscalari primID_lo = _simd16_extract_si(primID, 0);
                            const simdscalari primID_hi = _simd16_extract_si(primID, 1);

                            if (HasTessellationT::value)
                            {
                                pa.useAlternateOffset = false;
                                TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>(pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, numPrims_lo, primID_lo);

                                if (numPrims_hi)
                                {
                                    pa.useAlternateOffset = true;
                                    TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>(pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, numPrims_hi, primID_hi);
                                }
                            }
                            else if (HasGeometryShaderT::value)
                            {
                                pa.useAlternateOffset = false;
                                GeometryShaderStage<HasStreamOutT, HasRastT>(pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, numPrims_lo, primID_lo);

                                if (numPrims_hi)
                                {
                                    pa.useAlternateOffset = true;
                                    GeometryShaderStage<HasStreamOutT, HasRastT>(pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, numPrims_hi, primID_hi);
                                }
                            }
                            else
                            {
                                // If streamout is enabled then stream vertices out to memory.
                                if (HasStreamOutT::value)
                                {
                                    pa.useAlternateOffset = false;
                                    StreamOut(pDC, pa, workerId, pSoPrimData, numPrims_lo, 0);

                                    if (numPrims_hi)
                                    {
                                        pa.useAlternateOffset = true;
                                        StreamOut(pDC, pa, workerId, pSoPrimData, numPrims_hi, 0);
                                    }
#if 0
                                    // retained disabled variant from the original source
                                    pa.useAlternateOffset = false;
                                    StreamOut(pDC, pa, workerId, pSoPrimData, 0);
#endif
                                }

                                if (HasRastT::value)
                                {
                                    SWR_ASSERT(pDC->pState->pfnProcessPrims_simd16);

                                    pa.useAlternateOffset = false;
                                    pDC->pState->pfnProcessPrims_simd16(pDC, pa, workerId, prim_simd16, GenMask(numPrims), primID, _simd16_setzero_si());
                                }
                            }
                        }
                    }
                }
            } while (pa.NextPrim());

            if (IsIndexedT::value)
            {
                fetchInfo_lo.pIndices = (int32_t *)((uint8_t*)fetchInfo_lo.pIndices + KNOB_SIMD16_WIDTH * indexSize);
                fetchInfo_hi.pIndices = (int32_t *)((uint8_t*)fetchInfo_hi.pIndices + KNOB_SIMD16_WIDTH * indexSize);
            }
            else
            {
                vIndex = _simd16_add_epi32(vIndex, _simd16_set1_epi32(KNOB_SIMD16_WIDTH));
            }

            i += KNOB_SIMD16_WIDTH;
        }
    }

#else
    // simd8 front end.
    simdvertex          vin;
    SWR_VS_CONTEXT      vsContext;

    vsContext.pVin = &vin;

    SWR_FETCH_CONTEXT   fetchInfo = { 0 };

    fetchInfo.pStreams = &state.vertexBuffers[0];
    fetchInfo.StartInstance = work.startInstance;
    fetchInfo.StartVertex = 0;

    if (IsIndexedT::value)
    {
        fetchInfo.BaseVertex = work.baseVertex;

        // if the entire index buffer isn't being consumed, set the last index
        // so that fetches < a SIMD wide will be masked off
        fetchInfo.pLastIndex = (const int32_t*)(((uint8_t*)state.indexBuffer.pIndices) + state.indexBuffer.size);
        if (pLastRequestedIndex < fetchInfo.pLastIndex)
        {
            fetchInfo.pLastIndex = pLastRequestedIndex;
        }
    }
    else
    {
        fetchInfo.StartVertex = work.startVertex;
    }

    const simdscalari vScale = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);

    /// @todo: temporarily move instance loop in the FE to ensure SO ordering
    for (uint32_t instanceNum = 0; instanceNum < work.numInstances; instanceNum++)
    {
        simdscalari vIndex;
        uint32_t i = 0;

        if (IsIndexedT::value)
        {
            fetchInfo.pIndices = work.pIB;
        }
        else
        {
            vIndex = _simd_add_epi32(_simd_set1_epi32(work.startVertexID), vScale);
            fetchInfo.pIndices = (const int32_t*)&vIndex;
        }

        fetchInfo.CurInstance = instanceNum;
        vsContext.InstanceID = instanceNum;

        while (pa.HasWork())
        {
            // GetNextVsOutput currently has the side effect of updating some PA state machine state.
            // So we need to keep this outside of (i < endVertex) check.
            simdmask* pvCutIndices = nullptr;
            if (IsIndexedT::value)
            {
                pvCutIndices = &pa.GetNextVsIndices();
            }

            simdvertex& vout = pa.GetNextVsOutput();
            vsContext.pVout = &vout;

            if (i < endVertex)
            {
                // 1. Execute FS/VS for a single SIMD.
                AR_BEGIN(FEFetchShader, pDC->drawId);
                state.pfnFetchFunc(fetchInfo, vin);
                AR_END(FEFetchShader, 0);

                // forward fetch generated vertex IDs to the vertex shader
                vsContext.VertexID = fetchInfo.VertexID;

                // Setup active mask for vertex shader.
                vsContext.mask = GenerateMask(endVertex - i);

                // forward cut mask to the PA
                if (IsIndexedT::value)
                {
                    *pvCutIndices = _simd_movemask_ps(_simd_castsi_ps(fetchInfo.CutMask));
                }

                UPDATE_STAT_FE(IaVertices, GetNumInvocations(i, endVertex));

#if KNOB_ENABLE_TOSS_POINTS
                if (!KNOB_TOSS_FETCH)
#endif
                {
                    AR_BEGIN(FEVertexShader, pDC->drawId);
                    state.pfnVertexFunc(GetPrivateState(pDC), &vsContext);
                    AR_END(FEVertexShader, 0);

                    UPDATE_STAT_FE(VsInvocations, GetNumInvocations(i, endVertex));
                }
            }

            // 2. Assemble primitives given the last two SIMD.
            do
            {
                simdvector prim[MAX_NUM_VERTS_PER_PRIM];
                // PaAssemble returns false if there is not enough verts to assemble.
                AR_BEGIN(FEPAAssemble, pDC->drawId);
                bool assemble = pa.Assemble(VERTEX_POSITION_SLOT, prim);
                AR_END(FEPAAssemble, 1);

#if KNOB_ENABLE_TOSS_POINTS
                if (!KNOB_TOSS_FETCH)
#endif
                {
#if KNOB_ENABLE_TOSS_POINTS
                    if (!KNOB_TOSS_IA)
#endif
                    {
                        if (assemble)
                        {
                            UPDATE_STAT_FE(IaPrimitives, pa.NumPrims());

                            if (HasTessellationT::value)
                            {
                                TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>(
                                    pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, pa.GetPrimID(work.startPrimID));
                            }
                            else if (HasGeometryShaderT::value)
                            {
                                GeometryShaderStage<HasStreamOutT, HasRastT>(
                                    pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, pa.GetPrimID(work.startPrimID));
                            }
                            else
                            {
                                // If streamout is enabled then stream vertices out to memory.
                                if (HasStreamOutT::value)
                                {
                                    StreamOut(pDC, pa, workerId, pSoPrimData, 0);
                                }

                                if (HasRastT::value)
                                {
                                    SWR_ASSERT(pDC->pState->pfnProcessPrims);

                                    pDC->pState->pfnProcessPrims(pDC, pa, workerId, prim,
                                        GenMask(pa.NumPrims()), pa.GetPrimID(work.startPrimID), _simd_set1_epi32(0));
                                }
                            }
                        }
                    }
                }
            } while (pa.NextPrim());

            if (IsIndexedT::value)
            {
                fetchInfo.pIndices = (int*)((uint8_t*)fetchInfo.pIndices + KNOB_SIMD_WIDTH * indexSize);
            }
            else
            {
                vIndex = _simd_add_epi32(vIndex, _simd_set1_epi32(KNOB_SIMD_WIDTH));
            }

            i += KNOB_SIMD_WIDTH;
        }
    }

#endif

    AR_END(FEProcessDraw, numPrims * work.numInstances);
}
1962 struct FEDrawChooser
1964 typedef PFN_FE_WORK_FUNC FuncType
;
1966 template <typename
... ArgsB
>
1967 static FuncType
GetFunc()
1969 return ProcessDraw
<ArgsB
...>;
1974 // Selector for correct templated Draw front-end function
// Maps the runtime pipeline-enable flags onto the matching ProcessDraw template
// instantiation via TemplateArgUnroller/FEDrawChooser.
// NOTE(review): the `IsIndexed` and `HasStreamOut` parameter lines were lost in
// extraction; reconstructed from the forwarding call below — confirm order.
PFN_FE_WORK_FUNC GetProcessDrawFunc(
    bool IsIndexed,
    bool IsCutIndexEnabled,
    bool HasTessellation,
    bool HasGeometryShader,
    bool HasStreamOut,
    bool HasRasterization)
{
    return TemplateArgUnroller<FEDrawChooser>::GetFunc(IsIndexed, IsCutIndexEnabled, HasTessellation, HasGeometryShader, HasStreamOut, HasRasterization);
}