1 /****************************************************************************
2 * Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved.
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
25 * @brief Implementation for Frontend which handles vertex processing,
26 * primitive assembly, clipping, binning, etc.
28 ******************************************************************************/
34 #include "rdtsc_core.h"
40 #include "tessellator.h"
44 //////////////////////////////////////////////////////////////////////////
45 /// @brief Helper macro to generate a bitmask
46 static INLINE
uint32_t GenMask(uint32_t numBits
)
49 numBits
<= (sizeof(uint32_t) * 8), "Too many bits (%d) for %s", numBits
, __FUNCTION__
);
50 return ((1U << numBits
) - 1);
53 //////////////////////////////////////////////////////////////////////////
54 /// @brief FE handler for SwrSync.
55 /// @param pContext - pointer to SWR context.
56 /// @param pDC - pointer to draw context.
57 /// @param workerId - thread's worker id. Even thread has a unique id.
58 /// @param pUserData - Pointer to user data passed back to sync callback.
59 /// @todo This should go away when we switch this to use compute threading.
60 void ProcessSync(SWR_CONTEXT
* pContext
, DRAW_CONTEXT
* pDC
, uint32_t workerId
, void* pUserData
)
64 work
.pfnWork
= ProcessSyncBE
;
66 MacroTileMgr
* pTileMgr
= pDC
->pTileMgr
;
67 pTileMgr
->enqueue(0, 0, &work
);
70 //////////////////////////////////////////////////////////////////////////
71 /// @brief FE handler for SwrDestroyContext.
72 /// @param pContext - pointer to SWR context.
73 /// @param pDC - pointer to draw context.
74 /// @param workerId - thread's worker id. Even thread has a unique id.
75 /// @param pUserData - Pointer to user data passed back to sync callback.
76 void ProcessShutdown(SWR_CONTEXT
* pContext
, DRAW_CONTEXT
* pDC
, uint32_t workerId
, void* pUserData
)
80 work
.pfnWork
= ProcessShutdownBE
;
82 MacroTileMgr
* pTileMgr
= pDC
->pTileMgr
;
83 // Enqueue at least 1 work item for each worker thread
84 // account for number of numa nodes
85 uint32_t numNumaNodes
= pContext
->threadPool
.numaMask
+ 1;
87 for (uint32_t i
= 0; i
< pContext
->threadPool
.numThreads
; ++i
)
89 for (uint32_t n
= 0; n
< numNumaNodes
; ++n
)
91 pTileMgr
->enqueue(i
, n
, &work
);
96 //////////////////////////////////////////////////////////////////////////
97 /// @brief FE handler for SwrClearRenderTarget.
98 /// @param pContext - pointer to SWR context.
99 /// @param pDC - pointer to draw context.
100 /// @param workerId - thread's worker id. Even thread has a unique id.
101 /// @param pUserData - Pointer to user data passed back to clear callback.
102 /// @todo This should go away when we switch this to use compute threading.
103 void ProcessClear(SWR_CONTEXT
* pContext
, DRAW_CONTEXT
* pDC
, uint32_t workerId
, void* pUserData
)
105 CLEAR_DESC
* pDesc
= (CLEAR_DESC
*)pUserData
;
106 MacroTileMgr
* pTileMgr
= pDC
->pTileMgr
;
108 // queue a clear to each macro tile
109 // compute macro tile bounds for the specified rect
110 uint32_t macroTileXMin
= pDesc
->rect
.xmin
/ KNOB_MACROTILE_X_DIM
;
111 uint32_t macroTileXMax
= (pDesc
->rect
.xmax
- 1) / KNOB_MACROTILE_X_DIM
;
112 uint32_t macroTileYMin
= pDesc
->rect
.ymin
/ KNOB_MACROTILE_Y_DIM
;
113 uint32_t macroTileYMax
= (pDesc
->rect
.ymax
- 1) / KNOB_MACROTILE_Y_DIM
;
117 work
.pfnWork
= ProcessClearBE
;
118 work
.desc
.clear
= *pDesc
;
120 for (uint32_t y
= macroTileYMin
; y
<= macroTileYMax
; ++y
)
122 for (uint32_t x
= macroTileXMin
; x
<= macroTileXMax
; ++x
)
124 pTileMgr
->enqueue(x
, y
, &work
);
129 //////////////////////////////////////////////////////////////////////////
130 /// @brief FE handler for SwrStoreTiles.
131 /// @param pContext - pointer to SWR context.
132 /// @param pDC - pointer to draw context.
133 /// @param workerId - thread's worker id. Even thread has a unique id.
134 /// @param pUserData - Pointer to user data passed back to callback.
135 /// @todo This should go away when we switch this to use compute threading.
136 void ProcessStoreTiles(SWR_CONTEXT
* pContext
, DRAW_CONTEXT
* pDC
, uint32_t workerId
, void* pUserData
)
138 RDTSC_BEGIN(FEProcessStoreTiles
, pDC
->drawId
);
139 MacroTileMgr
* pTileMgr
= pDC
->pTileMgr
;
140 STORE_TILES_DESC
* pDesc
= (STORE_TILES_DESC
*)pUserData
;
142 // queue a store to each macro tile
143 // compute macro tile bounds for the specified rect
144 uint32_t macroTileXMin
= pDesc
->rect
.xmin
/ KNOB_MACROTILE_X_DIM
;
145 uint32_t macroTileXMax
= (pDesc
->rect
.xmax
- 1) / KNOB_MACROTILE_X_DIM
;
146 uint32_t macroTileYMin
= pDesc
->rect
.ymin
/ KNOB_MACROTILE_Y_DIM
;
147 uint32_t macroTileYMax
= (pDesc
->rect
.ymax
- 1) / KNOB_MACROTILE_Y_DIM
;
151 work
.type
= STORETILES
;
152 work
.pfnWork
= ProcessStoreTilesBE
;
153 work
.desc
.storeTiles
= *pDesc
;
155 for (uint32_t y
= macroTileYMin
; y
<= macroTileYMax
; ++y
)
157 for (uint32_t x
= macroTileXMin
; x
<= macroTileXMax
; ++x
)
159 pTileMgr
->enqueue(x
, y
, &work
);
163 RDTSC_END(FEProcessStoreTiles
, 0);
166 //////////////////////////////////////////////////////////////////////////
167 /// @brief FE handler for SwrInvalidateTiles.
168 /// @param pContext - pointer to SWR context.
169 /// @param pDC - pointer to draw context.
170 /// @param workerId - thread's worker id. Even thread has a unique id.
171 /// @param pUserData - Pointer to user data passed back to callback.
172 /// @todo This should go away when we switch this to use compute threading.
173 void ProcessDiscardInvalidateTiles(SWR_CONTEXT
* pContext
,
178 RDTSC_BEGIN(FEProcessInvalidateTiles
, pDC
->drawId
);
179 DISCARD_INVALIDATE_TILES_DESC
* pDesc
= (DISCARD_INVALIDATE_TILES_DESC
*)pUserData
;
180 MacroTileMgr
* pTileMgr
= pDC
->pTileMgr
;
182 // compute macro tile bounds for the specified rect
183 uint32_t macroTileXMin
= (pDesc
->rect
.xmin
+ KNOB_MACROTILE_X_DIM
- 1) / KNOB_MACROTILE_X_DIM
;
184 uint32_t macroTileXMax
= (pDesc
->rect
.xmax
/ KNOB_MACROTILE_X_DIM
) - 1;
185 uint32_t macroTileYMin
= (pDesc
->rect
.ymin
+ KNOB_MACROTILE_Y_DIM
- 1) / KNOB_MACROTILE_Y_DIM
;
186 uint32_t macroTileYMax
= (pDesc
->rect
.ymax
/ KNOB_MACROTILE_Y_DIM
) - 1;
188 if (pDesc
->fullTilesOnly
== false)
190 // include partial tiles
191 macroTileXMin
= pDesc
->rect
.xmin
/ KNOB_MACROTILE_X_DIM
;
192 macroTileXMax
= (pDesc
->rect
.xmax
- 1) / KNOB_MACROTILE_X_DIM
;
193 macroTileYMin
= pDesc
->rect
.ymin
/ KNOB_MACROTILE_Y_DIM
;
194 macroTileYMax
= (pDesc
->rect
.ymax
- 1) / KNOB_MACROTILE_Y_DIM
;
197 SWR_ASSERT(macroTileXMax
<= KNOB_NUM_HOT_TILES_X
);
198 SWR_ASSERT(macroTileYMax
<= KNOB_NUM_HOT_TILES_Y
);
200 macroTileXMax
= std::min
<int32_t>(macroTileXMax
, KNOB_NUM_HOT_TILES_X
);
201 macroTileYMax
= std::min
<int32_t>(macroTileYMax
, KNOB_NUM_HOT_TILES_Y
);
205 work
.type
= DISCARDINVALIDATETILES
;
206 work
.pfnWork
= ProcessDiscardInvalidateTilesBE
;
207 work
.desc
.discardInvalidateTiles
= *pDesc
;
209 for (uint32_t x
= macroTileXMin
; x
<= macroTileXMax
; ++x
)
211 for (uint32_t y
= macroTileYMin
; y
<= macroTileYMax
; ++y
)
213 pTileMgr
->enqueue(x
, y
, &work
);
217 RDTSC_END(FEProcessInvalidateTiles
, 0);
220 //////////////////////////////////////////////////////////////////////////
221 /// @brief Computes the number of primitives given the number of verts.
222 /// @param mode - primitive topology for draw operation.
223 /// @param numPrims - number of vertices or indices for draw.
224 /// @todo Frontend needs to be refactored. This will go in appropriate place then.
225 uint32_t GetNumPrims(PRIMITIVE_TOPOLOGY mode
, uint32_t numPrims
)
231 case TOP_TRIANGLE_LIST
:
233 case TOP_TRIANGLE_STRIP
:
234 return numPrims
< 3 ? 0 : numPrims
- 2;
235 case TOP_TRIANGLE_FAN
:
236 return numPrims
< 3 ? 0 : numPrims
- 2;
237 case TOP_TRIANGLE_DISC
:
238 return numPrims
< 2 ? 0 : numPrims
- 1;
242 return numPrims
< 4 ? 0 : (numPrims
- 2) / 2;
244 return numPrims
< 2 ? 0 : numPrims
- 1;
251 case TOP_LINE_LIST_ADJ
:
253 case TOP_LISTSTRIP_ADJ
:
254 return numPrims
< 3 ? 0 : numPrims
- 3;
255 case TOP_TRI_LIST_ADJ
:
257 case TOP_TRI_STRIP_ADJ
:
258 return numPrims
< 4 ? 0 : (numPrims
/ 2) - 2;
260 case TOP_PATCHLIST_1
:
261 case TOP_PATCHLIST_2
:
262 case TOP_PATCHLIST_3
:
263 case TOP_PATCHLIST_4
:
264 case TOP_PATCHLIST_5
:
265 case TOP_PATCHLIST_6
:
266 case TOP_PATCHLIST_7
:
267 case TOP_PATCHLIST_8
:
268 case TOP_PATCHLIST_9
:
269 case TOP_PATCHLIST_10
:
270 case TOP_PATCHLIST_11
:
271 case TOP_PATCHLIST_12
:
272 case TOP_PATCHLIST_13
:
273 case TOP_PATCHLIST_14
:
274 case TOP_PATCHLIST_15
:
275 case TOP_PATCHLIST_16
:
276 case TOP_PATCHLIST_17
:
277 case TOP_PATCHLIST_18
:
278 case TOP_PATCHLIST_19
:
279 case TOP_PATCHLIST_20
:
280 case TOP_PATCHLIST_21
:
281 case TOP_PATCHLIST_22
:
282 case TOP_PATCHLIST_23
:
283 case TOP_PATCHLIST_24
:
284 case TOP_PATCHLIST_25
:
285 case TOP_PATCHLIST_26
:
286 case TOP_PATCHLIST_27
:
287 case TOP_PATCHLIST_28
:
288 case TOP_PATCHLIST_29
:
289 case TOP_PATCHLIST_30
:
290 case TOP_PATCHLIST_31
:
291 case TOP_PATCHLIST_32
:
292 return numPrims
/ (mode
- TOP_PATCHLIST_BASE
);
295 case TOP_POINT_LIST_BF
:
296 case TOP_LINE_STRIP_CONT
:
297 case TOP_LINE_STRIP_BF
:
298 case TOP_LINE_STRIP_CONT_BF
:
299 case TOP_TRIANGLE_FAN_NOSTIPPLE
:
300 case TOP_TRI_STRIP_REVERSE
:
301 case TOP_PATCHLIST_BASE
:
303 SWR_INVALID("Unsupported topology: %d", mode
);
310 //////////////////////////////////////////////////////////////////////////
311 /// @brief Computes the number of verts given the number of primitives.
312 /// @param mode - primitive topology for draw operation.
313 /// @param numPrims - number of primitives for draw.
314 uint32_t GetNumVerts(PRIMITIVE_TOPOLOGY mode
, uint32_t numPrims
)
320 case TOP_TRIANGLE_LIST
:
322 case TOP_TRIANGLE_STRIP
:
323 return numPrims
? numPrims
+ 2 : 0;
324 case TOP_TRIANGLE_FAN
:
325 return numPrims
? numPrims
+ 2 : 0;
326 case TOP_TRIANGLE_DISC
:
327 return numPrims
? numPrims
+ 1 : 0;
331 return numPrims
? numPrims
* 2 + 2 : 0;
333 return numPrims
? numPrims
+ 1 : 0;
340 case TOP_LINE_LIST_ADJ
:
342 case TOP_LISTSTRIP_ADJ
:
343 return numPrims
? numPrims
+ 3 : 0;
344 case TOP_TRI_LIST_ADJ
:
346 case TOP_TRI_STRIP_ADJ
:
347 return numPrims
? (numPrims
+ 2) * 2 : 0;
349 case TOP_PATCHLIST_1
:
350 case TOP_PATCHLIST_2
:
351 case TOP_PATCHLIST_3
:
352 case TOP_PATCHLIST_4
:
353 case TOP_PATCHLIST_5
:
354 case TOP_PATCHLIST_6
:
355 case TOP_PATCHLIST_7
:
356 case TOP_PATCHLIST_8
:
357 case TOP_PATCHLIST_9
:
358 case TOP_PATCHLIST_10
:
359 case TOP_PATCHLIST_11
:
360 case TOP_PATCHLIST_12
:
361 case TOP_PATCHLIST_13
:
362 case TOP_PATCHLIST_14
:
363 case TOP_PATCHLIST_15
:
364 case TOP_PATCHLIST_16
:
365 case TOP_PATCHLIST_17
:
366 case TOP_PATCHLIST_18
:
367 case TOP_PATCHLIST_19
:
368 case TOP_PATCHLIST_20
:
369 case TOP_PATCHLIST_21
:
370 case TOP_PATCHLIST_22
:
371 case TOP_PATCHLIST_23
:
372 case TOP_PATCHLIST_24
:
373 case TOP_PATCHLIST_25
:
374 case TOP_PATCHLIST_26
:
375 case TOP_PATCHLIST_27
:
376 case TOP_PATCHLIST_28
:
377 case TOP_PATCHLIST_29
:
378 case TOP_PATCHLIST_30
:
379 case TOP_PATCHLIST_31
:
380 case TOP_PATCHLIST_32
:
381 return numPrims
* (mode
- TOP_PATCHLIST_BASE
);
384 case TOP_POINT_LIST_BF
:
385 case TOP_LINE_STRIP_CONT
:
386 case TOP_LINE_STRIP_BF
:
387 case TOP_LINE_STRIP_CONT_BF
:
388 case TOP_TRIANGLE_FAN_NOSTIPPLE
:
389 case TOP_TRI_STRIP_REVERSE
:
390 case TOP_PATCHLIST_BASE
:
392 SWR_INVALID("Unsupported topology: %d", mode
);
399 //////////////////////////////////////////////////////////////////////////
400 /// @brief Return number of verts per primitive.
401 /// @param topology - topology
402 /// @param includeAdjVerts - include adjacent verts in primitive vertices
403 INLINE
uint32_t NumVertsPerPrim(PRIMITIVE_TOPOLOGY topology
, bool includeAdjVerts
)
405 uint32_t numVerts
= 0;
409 case TOP_POINT_LIST_BF
:
414 case TOP_LINE_LIST_ADJ
:
416 case TOP_LINE_STRIP_CONT
:
417 case TOP_LINE_STRIP_BF
:
418 case TOP_LISTSTRIP_ADJ
:
421 case TOP_TRIANGLE_LIST
:
422 case TOP_TRIANGLE_STRIP
:
423 case TOP_TRIANGLE_FAN
:
424 case TOP_TRI_LIST_ADJ
:
425 case TOP_TRI_STRIP_ADJ
:
426 case TOP_TRI_STRIP_REVERSE
:
434 case TOP_PATCHLIST_1
:
435 case TOP_PATCHLIST_2
:
436 case TOP_PATCHLIST_3
:
437 case TOP_PATCHLIST_4
:
438 case TOP_PATCHLIST_5
:
439 case TOP_PATCHLIST_6
:
440 case TOP_PATCHLIST_7
:
441 case TOP_PATCHLIST_8
:
442 case TOP_PATCHLIST_9
:
443 case TOP_PATCHLIST_10
:
444 case TOP_PATCHLIST_11
:
445 case TOP_PATCHLIST_12
:
446 case TOP_PATCHLIST_13
:
447 case TOP_PATCHLIST_14
:
448 case TOP_PATCHLIST_15
:
449 case TOP_PATCHLIST_16
:
450 case TOP_PATCHLIST_17
:
451 case TOP_PATCHLIST_18
:
452 case TOP_PATCHLIST_19
:
453 case TOP_PATCHLIST_20
:
454 case TOP_PATCHLIST_21
:
455 case TOP_PATCHLIST_22
:
456 case TOP_PATCHLIST_23
:
457 case TOP_PATCHLIST_24
:
458 case TOP_PATCHLIST_25
:
459 case TOP_PATCHLIST_26
:
460 case TOP_PATCHLIST_27
:
461 case TOP_PATCHLIST_28
:
462 case TOP_PATCHLIST_29
:
463 case TOP_PATCHLIST_30
:
464 case TOP_PATCHLIST_31
:
465 case TOP_PATCHLIST_32
:
466 numVerts
= topology
- TOP_PATCHLIST_BASE
;
469 SWR_INVALID("Unsupported topology: %d", topology
);
477 case TOP_LISTSTRIP_ADJ
:
478 case TOP_LINE_LIST_ADJ
:
481 case TOP_TRI_STRIP_ADJ
:
482 case TOP_TRI_LIST_ADJ
:
493 //////////////////////////////////////////////////////////////////////////
494 /// @brief Generate mask from remaining work.
495 /// @param numWorkItems - Number of items being worked on by a SIMD.
496 static INLINE simdscalari
GenerateMask(uint32_t numItemsRemaining
)
499 (numItemsRemaining
>= KNOB_SIMD_WIDTH
) ? KNOB_SIMD_WIDTH
: numItemsRemaining
;
500 uint32_t mask
= (numActive
> 0) ? ((1 << numActive
) - 1) : 0;
501 return _simd_castps_si(_simd_vmask_ps(mask
));
504 static INLINE simd16scalari
GenerateMask16(uint32_t numItemsRemaining
)
507 (numItemsRemaining
>= KNOB_SIMD16_WIDTH
) ? KNOB_SIMD16_WIDTH
: numItemsRemaining
;
508 uint32_t mask
= (numActive
> 0) ? ((1 << numActive
) - 1) : 0;
509 return _simd16_castps_si(_simd16_vmask_ps(mask
));
512 //////////////////////////////////////////////////////////////////////////
513 /// @brief StreamOut - Streams vertex data out to SO buffers.
514 /// Generally, we are only streaming out a SIMDs worth of triangles.
515 /// @param pDC - pointer to draw context.
516 /// @param workerId - thread's worker id. Even thread has a unique id.
517 /// @param numPrims - Number of prims to streamout (e.g. points, lines, tris)
518 static void StreamOut(
519 DRAW_CONTEXT
* pDC
, PA_STATE
& pa
, uint32_t workerId
, uint32_t* pPrimData
, uint32_t streamIndex
)
521 RDTSC_BEGIN(FEStreamout
, pDC
->drawId
);
523 const API_STATE
& state
= GetApiState(pDC
);
524 const SWR_STREAMOUT_STATE
& soState
= state
.soState
;
526 uint32_t soVertsPerPrim
= NumVertsPerPrim(pa
.binTopology
, false);
528 // The pPrimData buffer is sparse in that we allocate memory for all 32 attributes for each
530 uint32_t primDataDwordVertexStride
= (SWR_VTX_NUM_SLOTS
* sizeof(float) * 4) / sizeof(uint32_t);
532 SWR_STREAMOUT_CONTEXT soContext
= {0};
534 // Setup buffer state pointers.
535 for (uint32_t i
= 0; i
< 4; ++i
)
537 soContext
.pBuffer
[i
] = &state
.soBuffer
[i
];
540 uint32_t numPrims
= pa
.NumPrims();
542 for (uint32_t primIndex
= 0; primIndex
< numPrims
; ++primIndex
)
545 uint64_t soMask
= soState
.streamMasks
[streamIndex
];
547 // Write all entries into primitive data buffer for SOS.
548 while (_BitScanForward64(&slot
, soMask
))
550 simd4scalar attrib
[MAX_NUM_VERTS_PER_PRIM
]; // prim attribs (always 4 wide)
551 uint32_t paSlot
= slot
+ soState
.vertexAttribOffset
[streamIndex
];
552 pa
.AssembleSingle(paSlot
, primIndex
, attrib
);
554 // Attribute offset is relative offset from start of vertex.
555 // Note that attributes start at slot 1 in the PA buffer. We need to write this
556 // to prim data starting at slot 0. Which is why we do (slot - 1).
557 // Also note: GL works slightly differently, and needs slot 0
558 uint32_t primDataAttribOffset
= slot
* sizeof(float) * 4 / sizeof(uint32_t);
560 // Store each vertex's attrib at appropriate locations in pPrimData buffer.
561 for (uint32_t v
= 0; v
< soVertsPerPrim
; ++v
)
563 uint32_t* pPrimDataAttrib
=
564 pPrimData
+ primDataAttribOffset
+ (v
* primDataDwordVertexStride
);
566 _mm_store_ps((float*)pPrimDataAttrib
, attrib
[v
]);
569 soMask
&= ~(uint64_t(1) << slot
);
572 // Update pPrimData pointer
573 soContext
.pPrimData
= pPrimData
;
576 SWR_ASSERT(state
.pfnSoFunc
[streamIndex
] != nullptr,
577 "Trying to execute uninitialized streamout jit function.");
578 state
.pfnSoFunc
[streamIndex
](soContext
);
581 // Update SO write offset. The driver provides memory for the update.
582 for (uint32_t i
= 0; i
< 4; ++i
)
584 if (state
.soBuffer
[i
].pWriteOffset
)
586 *state
.soBuffer
[i
].pWriteOffset
= soContext
.pBuffer
[i
]->streamOffset
* sizeof(uint32_t);
589 if (state
.soBuffer
[i
].soWriteEnable
)
591 pDC
->dynState
.SoWriteOffset
[i
] = soContext
.pBuffer
[i
]->streamOffset
* sizeof(uint32_t);
592 pDC
->dynState
.SoWriteOffsetDirty
[i
] = true;
596 UPDATE_STAT_FE(SoPrimStorageNeeded
[streamIndex
], soContext
.numPrimStorageNeeded
);
597 UPDATE_STAT_FE(SoNumPrimsWritten
[streamIndex
], soContext
.numPrimsWritten
);
599 RDTSC_END(FEStreamout
, 1);
602 #if USE_SIMD16_FRONTEND
603 //////////////////////////////////////////////////////////////////////////
604 /// Is value an even number (a multiple of two)
606 template <typename T
>
607 INLINE
static bool IsEven(T value
)
609 return (value
& 1) == 0;
612 //////////////////////////////////////////////////////////////////////////
613 /// Round up value to an even number (a multiple of two)
615 template <typename T
>
616 INLINE
static T
RoundUpEven(T value
)
618 return (value
+ 1) & ~1;
621 //////////////////////////////////////////////////////////////////////////
622 /// Round down value to an even number (a multiple of two)
624 template <typename T
>
625 INLINE
static T
RoundDownEven(T value
)
630 //////////////////////////////////////////////////////////////////////////
631 /// Pack pairs of simdvertexes into simd16vertexes, assume non-overlapping
633 /// vertexCount is in terms of the source simdvertexes and must be even
635 /// attribCount will limit the vector copies to those attribs specified
637 /// note: the stride between vertexes is determinded by SWR_VTX_NUM_SLOTS
639 void PackPairsOfSimdVertexIntoSimd16Vertex(simd16vertex
* vertex_simd16
,
640 const simdvertex
* vertex
,
641 uint32_t vertexCount
,
642 uint32_t attribCount
)
645 SWR_ASSERT(vertex_simd16
);
646 SWR_ASSERT(attribCount
<= SWR_VTX_NUM_SLOTS
);
650 for (uint32_t i
= 0; i
< vertexCount
; i
+= 2)
652 for (uint32_t j
= 0; j
< attribCount
; j
+= 1)
654 for (uint32_t k
= 0; k
< 4; k
+= 1)
657 _simd16_insert_ps(_simd16_setzero_ps(), vertex
[i
].attrib
[j
][k
], 0);
659 if ((i
+ 1) < vertexCount
)
662 _simd16_insert_ps(temp
.attrib
[j
][k
], vertex
[i
+ 1].attrib
[j
][k
], 1);
667 for (uint32_t j
= 0; j
< attribCount
; j
+= 1)
669 vertex_simd16
[i
>> 1].attrib
[j
] = temp
.attrib
[j
];
675 //////////////////////////////////////////////////////////////////////////
676 /// @brief Computes number of invocations. The current index represents
677 /// the start of the SIMD. The max index represents how much work
678 /// items are remaining. If there is less than a SIMD's worth of work
679 /// then return the remaining amount of work.
680 /// @param curIndex - The start index for the SIMD.
681 /// @param maxIndex - The last index for all work items.
682 static INLINE
uint32_t GetNumInvocations(uint32_t curIndex
, uint32_t maxIndex
)
684 uint32_t remainder
= (maxIndex
- curIndex
);
685 #if USE_SIMD16_FRONTEND
686 return (remainder
>= KNOB_SIMD16_WIDTH
) ? KNOB_SIMD16_WIDTH
: remainder
;
688 return (remainder
>= KNOB_SIMD_WIDTH
) ? KNOB_SIMD_WIDTH
: remainder
;
692 //////////////////////////////////////////////////////////////////////////
693 /// @brief Converts a streamId buffer to a cut buffer for the given stream id.
694 /// The geometry shader will loop over each active streamout buffer, assembling
695 /// primitives for the downstream stages. When multistream output is enabled,
696 /// the generated stream ID buffer from the GS needs to be converted to a cut
697 /// buffer for the primitive assembler.
698 /// @param stream - stream id to generate the cut buffer for
699 /// @param pStreamIdBase - pointer to the stream ID buffer
700 /// @param numEmittedVerts - Number of total verts emitted by the GS
701 /// @param pCutBuffer - output buffer to write cuts to
702 void ProcessStreamIdBuffer(uint32_t stream
,
703 uint8_t* pStreamIdBase
,
704 uint32_t numEmittedVerts
,
707 SWR_ASSERT(stream
< MAX_SO_STREAMS
);
709 uint32_t numInputBytes
= (numEmittedVerts
* 2 + 7) / 8;
710 uint32_t numOutputBytes
= std::max(numInputBytes
/ 2, 1U);
712 for (uint32_t b
= 0; b
< numOutputBytes
; ++b
)
714 uint8_t curInputByte
= pStreamIdBase
[2 * b
];
716 for (uint32_t i
= 0; i
< 4; ++i
)
718 if ((curInputByte
& 0x3) != stream
)
725 curInputByte
= pStreamIdBase
[2 * b
+ 1];
726 for (uint32_t i
= 0; i
< 4; ++i
)
728 if ((curInputByte
& 0x3) != stream
)
730 outByte
|= (1 << (i
+ 4));
735 *pCutBuffer
++ = outByte
;
739 // Buffers that are allocated if GS is enabled
743 uint8_t* pGsOut
[KNOB_SIMD_WIDTH
];
744 uint8_t* pGsTransposed
;
745 void* pStreamCutBuffer
;
748 //////////////////////////////////////////////////////////////////////////
749 /// @brief Transposes GS output from SOA to AOS to feed the primitive assembler
750 /// @param pDst - Destination buffer in AOS form for the current SIMD width, fed into the primitive
752 /// @param pSrc - Buffer of vertices in SOA form written by the geometry shader
753 /// @param numVerts - Number of vertices outputted by the GS
754 /// @param numAttribs - Number of attributes per vertex
755 template <typename SIMD_T
, uint32_t SimdWidth
>
756 void TransposeSOAtoAOS(uint8_t* pDst
, uint8_t* pSrc
, uint32_t numVerts
, uint32_t numAttribs
)
758 uint32_t srcVertexStride
= numAttribs
* sizeof(float) * 4;
759 uint32_t dstVertexStride
= numAttribs
* sizeof(Float
<SIMD_T
>) * 4;
761 OSALIGNSIMD16(uint32_t) gatherOffsets
[SimdWidth
];
763 for (uint32_t i
= 0; i
< SimdWidth
; ++i
)
765 gatherOffsets
[i
] = srcVertexStride
* i
;
767 auto vGatherOffsets
= SIMD_T::load_si((Integer
<SIMD_T
>*)&gatherOffsets
[0]);
769 uint32_t numSimd
= AlignUp(numVerts
, SimdWidth
) / SimdWidth
;
770 uint32_t remainingVerts
= numVerts
;
772 for (uint32_t s
= 0; s
< numSimd
; ++s
)
774 uint8_t* pSrcBase
= pSrc
+ s
* srcVertexStride
* SimdWidth
;
775 uint8_t* pDstBase
= pDst
+ s
* dstVertexStride
;
777 // Compute mask to prevent src overflow
778 uint32_t mask
= std::min(remainingVerts
, SimdWidth
);
779 mask
= GenMask(mask
);
780 auto vMask
= SIMD_T::vmask_ps(mask
);
781 auto viMask
= SIMD_T::castps_si(vMask
);
783 for (uint32_t a
= 0; a
< numAttribs
; ++a
)
785 auto attribGatherX
= SIMD_T::mask_i32gather_ps(
786 SIMD_T::setzero_ps(), (const float*)pSrcBase
, vGatherOffsets
, vMask
);
787 auto attribGatherY
= SIMD_T::mask_i32gather_ps(
788 SIMD_T::setzero_ps(),
789 (const float*)(pSrcBase
+ sizeof(float)),
792 auto attribGatherZ
= SIMD_T::mask_i32gather_ps(
793 SIMD_T::setzero_ps(),
794 (const float*)(pSrcBase
+ sizeof(float) * 2),
797 auto attribGatherW
= SIMD_T::mask_i32gather_ps(
798 SIMD_T::setzero_ps(),
799 (const float*)(pSrcBase
+ sizeof(float) * 3),
803 SIMD_T::maskstore_ps((float*)pDstBase
, viMask
, attribGatherX
);
804 SIMD_T::maskstore_ps((float*)(pDstBase
+ sizeof(Float
<SIMD_T
>)), viMask
, attribGatherY
);
805 SIMD_T::maskstore_ps(
806 (float*)(pDstBase
+ sizeof(Float
<SIMD_T
>) * 2), viMask
, attribGatherZ
);
807 SIMD_T::maskstore_ps(
808 (float*)(pDstBase
+ sizeof(Float
<SIMD_T
>) * 3), viMask
, attribGatherW
);
810 pSrcBase
+= sizeof(float) * 4;
811 pDstBase
+= sizeof(Float
<SIMD_T
>) * 4;
813 remainingVerts
-= SimdWidth
;
818 //////////////////////////////////////////////////////////////////////////
819 /// @brief Implements GS stage.
820 /// @param pDC - pointer to draw context.
821 /// @param workerId - thread's worker id. Even thread has a unique id.
822 /// @param pa - The primitive assembly object.
823 /// @param pGsOut - output stream for GS
824 template <typename HasStreamOutT
, typename HasRastT
>
825 static void GeometryShaderStage(DRAW_CONTEXT
* pDC
,
828 GsBuffers
* pGsBuffers
,
829 uint32_t* pSoPrimData
,
830 #if USE_SIMD16_FRONTEND
831 uint32_t numPrims_simd8
,
833 simdscalari
const& primID
)
835 RDTSC_BEGIN(FEGeometryShader
, pDC
->drawId
);
837 void* pWorkerData
= pDC
->pContext
->threadPool
.pThreadData
[workerId
].pWorkerPrivateData
;
839 const API_STATE
& state
= GetApiState(pDC
);
840 const SWR_GS_STATE
* pState
= &state
.gsState
;
841 SWR_GS_CONTEXT gsContext
;
843 static uint8_t sNullBuffer
[128] = {0};
845 for (uint32_t i
= 0; i
< KNOB_SIMD_WIDTH
; ++i
)
847 gsContext
.pStreams
[i
] = pGsBuffers
->pGsOut
[i
];
849 gsContext
.pVerts
= (simdvector
*)pGsBuffers
->pGsIn
;
850 gsContext
.PrimitiveID
= primID
;
852 uint32_t numVertsPerPrim
= NumVertsPerPrim(pa
.binTopology
, true);
853 simdvector attrib
[MAX_NUM_VERTS_PER_PRIM
];
855 // assemble all attributes for the input primitive
856 gsContext
.inputVertStride
= pState
->inputVertStride
;
857 for (uint32_t slot
= 0; slot
< pState
->numInputAttribs
; ++slot
)
859 uint32_t srcAttribSlot
= pState
->srcVertexAttribOffset
+ slot
;
860 uint32_t attribSlot
= pState
->vertexAttribOffset
+ slot
;
861 pa
.Assemble(srcAttribSlot
, attrib
);
863 for (uint32_t i
= 0; i
< numVertsPerPrim
; ++i
)
865 gsContext
.pVerts
[attribSlot
+ pState
->inputVertStride
* i
] = attrib
[i
];
870 pa
.Assemble(VERTEX_POSITION_SLOT
, attrib
);
871 for (uint32_t i
= 0; i
< numVertsPerPrim
; ++i
)
873 gsContext
.pVerts
[VERTEX_POSITION_SLOT
+ pState
->inputVertStride
* i
] = attrib
[i
];
876 // record valid prims from the frontend to avoid over binning the newly generated
878 #if USE_SIMD16_FRONTEND
879 uint32_t numInputPrims
= numPrims_simd8
;
881 uint32_t numInputPrims
= pa
.NumPrims();
884 for (uint32_t instance
= 0; instance
< pState
->instanceCount
; ++instance
)
886 gsContext
.InstanceID
= instance
;
887 gsContext
.mask
= GenerateMask(numInputPrims
);
889 // execute the geometry shader
890 state
.pfnGsFunc(GetPrivateState(pDC
), pWorkerData
, &gsContext
);
891 AR_EVENT(GSStats((HANDLE
)&gsContext
.stats
));
893 for (uint32_t i
= 0; i
< KNOB_SIMD_WIDTH
; ++i
)
895 gsContext
.pStreams
[i
] += pState
->allocationSize
;
899 // set up new binner and state for the GS output topology
900 #if USE_SIMD16_FRONTEND
901 PFN_PROCESS_PRIMS_SIMD16 pfnClipFunc
= nullptr;
904 switch (pState
->outputTopology
)
907 pfnClipFunc
= ClipRectangles_simd16
;
909 case TOP_TRIANGLE_STRIP
:
910 pfnClipFunc
= ClipTriangles_simd16
;
913 pfnClipFunc
= ClipLines_simd16
;
916 pfnClipFunc
= ClipPoints_simd16
;
919 SWR_INVALID("Unexpected GS output topology: %d", pState
->outputTopology
);
924 PFN_PROCESS_PRIMS pfnClipFunc
= nullptr;
927 switch (pState
->outputTopology
)
930 pfnClipFunc
= ClipRectangles
;
932 case TOP_TRIANGLE_STRIP
:
933 pfnClipFunc
= ClipTriangles
;
936 pfnClipFunc
= ClipLines
;
939 pfnClipFunc
= ClipPoints
;
942 SWR_INVALID("Unexpected GS output topology: %d", pState
->outputTopology
);
947 // foreach input prim:
948 // - setup a new PA based on the emitted verts for that prim
949 // - loop over the new verts, calling PA to assemble each prim
950 uint32_t* pPrimitiveId
= (uint32_t*)&primID
;
952 uint32_t totalPrimsGenerated
= 0;
953 for (uint32_t inputPrim
= 0; inputPrim
< numInputPrims
; ++inputPrim
)
955 uint8_t* pInstanceBase
= (uint8_t*)pGsBuffers
->pGsOut
[inputPrim
];
957 // Vertex count is either emitted by shader or static
958 uint32_t vertexCount
= 0;
959 if (pState
->staticVertexCount
)
961 vertexCount
= pState
->staticVertexCount
;
965 // If emitted in shader, it should be the stored in the first dword of the output buffer
966 vertexCount
= *(uint32_t*)pInstanceBase
;
969 for (uint32_t instance
= 0; instance
< pState
->instanceCount
; ++instance
)
971 uint32_t numEmittedVerts
= vertexCount
;
972 if (numEmittedVerts
== 0)
977 uint8_t* pBase
= pInstanceBase
+ instance
* pState
->allocationSize
;
979 pState
->controlDataSize
== 0 ? &sNullBuffer
[0] : pBase
+ pState
->controlDataOffset
;
980 uint8_t* pVertexBaseAOS
= pBase
+ pState
->outputVertexOffset
;
982 #if USE_SIMD16_FRONTEND
983 TransposeSOAtoAOS
<SIMD512
, KNOB_SIMD16_WIDTH
>((uint8_t*)pGsBuffers
->pGsTransposed
,
986 pState
->outputVertexSize
);
988 TransposeSOAtoAOS
<SIMD256
, KNOB_SIMD_WIDTH
>((uint8_t*)pGsBuffers
->pGsTransposed
,
991 pState
->outputVertexSize
);
994 uint32_t numAttribs
= state
.feNumAttributes
;
996 for (uint32_t stream
= 0; stream
< MAX_SO_STREAMS
; ++stream
)
998 bool processCutVerts
= false;
999 uint8_t* pCutBuffer
= pCutBase
;
1001 // assign default stream ID, only relevant when GS is outputting a single stream
1002 uint32_t streamID
= 0;
1003 if (pState
->isSingleStream
)
1005 processCutVerts
= true;
1006 streamID
= pState
->singleStreamID
;
1007 if (streamID
!= stream
)
1012 // early exit if this stream is not enabled for streamout
1013 if (HasStreamOutT::value
&& !state
.soState
.streamEnable
[stream
])
1018 // multi-stream output, need to translate StreamID buffer to a cut buffer
1019 ProcessStreamIdBuffer(
1020 stream
, pCutBase
, numEmittedVerts
, (uint8_t*)pGsBuffers
->pStreamCutBuffer
);
1021 pCutBuffer
= (uint8_t*)pGsBuffers
->pStreamCutBuffer
;
1022 processCutVerts
= false;
1025 #if USE_SIMD16_FRONTEND
1026 PA_STATE_CUT
gsPa(pDC
,
1027 (uint8_t*)pGsBuffers
->pGsTransposed
,
1029 pState
->outputVertexSize
,
1030 reinterpret_cast<simd16mask
*>(pCutBuffer
),
1033 pState
->outputTopology
,
1035 pa
.numVertsPerPrim
);
1038 PA_STATE_CUT
gsPa(pDC
,
1039 (uint8_t*)pGsBuffers
->pGsTransposed
,
1041 pState
->outputVertexSize
,
1045 pState
->outputTopology
,
1047 pa
.numVertsPerPrim
);
1050 while (gsPa
.GetNextStreamOutput())
1054 #if USE_SIMD16_FRONTEND
1055 simd16vector attrib_simd16
[3];
1057 bool assemble
= gsPa
.Assemble(VERTEX_POSITION_SLOT
, attrib_simd16
);
1060 bool assemble
= gsPa
.Assemble(VERTEX_POSITION_SLOT
, attrib
);
1065 totalPrimsGenerated
+= gsPa
.NumPrims();
1067 if (HasStreamOutT::value
)
1069 #if ENABLE_AVX512_SIMD16
1070 gsPa
.useAlternateOffset
= false;
1072 StreamOut(pDC
, gsPa
, workerId
, pSoPrimData
, stream
);
1075 if (HasRastT::value
&& state
.soState
.streamToRasterizer
== stream
)
1077 #if USE_SIMD16_FRONTEND
1078 simd16scalari vPrimId
= _simd16_set1_epi32(pPrimitiveId
[inputPrim
]);
1080 // Gather data from the SVG if provided.
1081 simd16scalari vViewportIdx
= SIMD16::setzero_si();
1082 simd16scalari vRtIdx
= SIMD16::setzero_si();
1083 SIMD16::Vec4 svgAttrib
[4];
1085 if (state
.backendState
.readViewportArrayIndex
||
1086 state
.backendState
.readRenderTargetArrayIndex
)
1088 gsPa
.Assemble(VERTEX_SGV_SLOT
, svgAttrib
);
1091 if (state
.backendState
.readViewportArrayIndex
)
1094 SIMD16::castps_si(svgAttrib
[0][VERTEX_SGV_VAI_COMP
]);
1095 gsPa
.viewportArrayActive
= true;
1097 if (state
.backendState
.readRenderTargetArrayIndex
)
1099 vRtIdx
= SIMD16::castps_si(svgAttrib
[0][VERTEX_SGV_RTAI_COMP
]);
1100 gsPa
.rtArrayActive
= true;
1104 // OOB VPAI indices => forced to zero.
1106 SIMD16::max_epi32(vViewportIdx
, SIMD16::setzero_si());
1107 simd16scalari vNumViewports
=
1108 SIMD16::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS
);
1109 simd16scalari vClearMask
=
1110 SIMD16::cmplt_epi32(vViewportIdx
, vNumViewports
);
1111 vViewportIdx
= SIMD16::and_si(vClearMask
, vViewportIdx
);
1113 gsPa
.useAlternateOffset
= false;
1118 GenMask(gsPa
.NumPrims()),
1124 simdscalari vPrimId
= _simd_set1_epi32(pPrimitiveId
[inputPrim
]);
1126 // Gather data from the SVG if provided.
1127 simdscalari vViewportIdx
= SIMD::setzero_si();
1128 simdscalari vRtIdx
= SIMD::setzero_si();
1129 SIMD::Vec4 svgAttrib
[4];
1131 if (state
.backendState
.readViewportArrayIndex
||
1132 state
.backendState
.readRenderTargetArrayIndex
)
1134 gsPa
.Assemble(VERTEX_SGV_SLOT
, svgAttrib
);
1137 if (state
.backendState
.readViewportArrayIndex
)
1140 SIMD::castps_si(svgAttrib
[0][VERTEX_SGV_VAI_COMP
]);
1142 // OOB VPAI indices => forced to zero.
1144 SIMD::max_epi32(vViewportIdx
, SIMD::setzero_si());
1145 simdscalari vNumViewports
=
1146 SIMD::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS
);
1147 simdscalari vClearMask
=
1148 SIMD::cmplt_epi32(vViewportIdx
, vNumViewports
);
1149 vViewportIdx
= SIMD::and_si(vClearMask
, vViewportIdx
);
1150 gsPa
.viewportArrayActive
= true;
1152 if (state
.backendState
.readRenderTargetArrayIndex
)
1154 vRtIdx
= SIMD::castps_si(svgAttrib
[0][VERTEX_SGV_RTAI_COMP
]);
1155 gsPa
.rtArrayActive
= true;
1162 GenMask(gsPa
.NumPrims()),
1169 } while (gsPa
.NextPrim());
1175 // update GS pipeline stats
1176 UPDATE_STAT_FE(GsInvocations
, numInputPrims
* pState
->instanceCount
);
1177 UPDATE_STAT_FE(GsPrimitives
, totalPrimsGenerated
);
1178 AR_EVENT(GSPrimInfo(numInputPrims
, totalPrimsGenerated
, numVertsPerPrim
* numInputPrims
));
1179 RDTSC_END(FEGeometryShader
, 1);
1182 //////////////////////////////////////////////////////////////////////////
1183 /// @brief Allocate GS buffers
1184 /// @param pDC - pointer to draw context.
1185 /// @param state - API state
1186 /// @param ppGsOut - pointer to GS output buffer allocation
1187 /// @param ppCutBuffer - pointer to GS output cut buffer allocation
1188 template <typename SIMD_T
, uint32_t SIMD_WIDTH
>
1189 static INLINE
void AllocateGsBuffers(DRAW_CONTEXT
* pDC
,
1190 const API_STATE
& state
,
1191 uint32_t vertsPerPrim
,
1192 GsBuffers
* pGsBuffers
)
1194 auto pArena
= pDC
->pArena
;
1195 SWR_ASSERT(pArena
!= nullptr);
1196 SWR_ASSERT(state
.gsState
.gsEnable
);
1198 const SWR_GS_STATE
& gsState
= state
.gsState
;
1200 // Allocate storage for vertex inputs
1201 uint32_t vertexInBufferSize
= gsState
.inputVertStride
* sizeof(simdvector
) * vertsPerPrim
;
1202 pGsBuffers
->pGsIn
= (uint8_t*)pArena
->AllocAligned(vertexInBufferSize
, 32);
1204 // Allocate arena space to hold GS output verts
1205 const uint32_t vertexBufferSize
= gsState
.instanceCount
* gsState
.allocationSize
;
1207 for (uint32_t i
= 0; i
< KNOB_SIMD_WIDTH
; ++i
)
1209 pGsBuffers
->pGsOut
[i
] = (uint8_t*)pArena
->AllocAligned(vertexBufferSize
, 32);
1212 // Allocate storage for transposed GS output
1213 uint32_t numSimdBatches
= AlignUp(gsState
.maxNumVerts
, SIMD_WIDTH
) / SIMD_WIDTH
;
1214 uint32_t transposedBufferSize
=
1215 numSimdBatches
* gsState
.outputVertexSize
* sizeof(Vec4
<SIMD_T
>);
1216 pGsBuffers
->pGsTransposed
= (uint8_t*)pArena
->AllocAligned(transposedBufferSize
, 32);
1218 // Allocate storage to hold temporary stream->cut buffer, if necessary
1219 if (state
.gsState
.isSingleStream
)
1221 pGsBuffers
->pStreamCutBuffer
= nullptr;
1225 pGsBuffers
->pStreamCutBuffer
=
1226 (uint8_t*)pArena
->AllocAligned(AlignUp(gsState
.maxNumVerts
* 2, 32), 32);
1230 //////////////////////////////////////////////////////////////////////////
1231 /// @brief Contains all data generated by the HS and passed to the
1232 /// tessellator and DS.
1233 struct TessellationThreadLocalData
1235 SWR_HS_CONTEXT hsContext
;
1236 ScalarPatch patchData
[KNOB_SIMD_WIDTH
];
1240 simdscalar
* pDSOutput
;
1241 size_t dsOutputAllocSize
;
1244 THREAD TessellationThreadLocalData
* gt_pTessellationThreadData
= nullptr;
1246 //////////////////////////////////////////////////////////////////////////
1247 /// @brief Allocate tessellation data for this worker thread.
1249 static void AllocateTessellationData(SWR_CONTEXT
* pContext
)
1251 /// @TODO - Don't use thread local storage. Use Worker local storage instead.
1252 if (gt_pTessellationThreadData
== nullptr)
1254 gt_pTessellationThreadData
=
1255 (TessellationThreadLocalData
*)AlignedMalloc(sizeof(TessellationThreadLocalData
), 64);
1256 memset(gt_pTessellationThreadData
, 0, sizeof(*gt_pTessellationThreadData
));
1260 //////////////////////////////////////////////////////////////////////////
1261 /// @brief Implements Tessellation Stages.
1262 /// @param pDC - pointer to draw context.
1263 /// @param workerId - thread's worker id. Even thread has a unique id.
1264 /// @param pa - The primitive assembly object.
1265 /// @param pGsOut - output stream for GS
1266 template <typename HasGeometryShaderT
, typename HasStreamOutT
, typename HasRastT
>
1267 static void TessellationStages(DRAW_CONTEXT
* pDC
,
1270 GsBuffers
* pGsBuffers
,
1271 uint32_t* pSoPrimData
,
1272 #if USE_SIMD16_FRONTEND
1273 uint32_t numPrims_simd8
,
1275 simdscalari
const& primID
)
1277 const API_STATE
& state
= GetApiState(pDC
);
1278 const SWR_TS_STATE
& tsState
= state
.tsState
;
1279 void* pWorkerData
= pDC
->pContext
->threadPool
.pThreadData
[workerId
].pWorkerPrivateData
;
1281 SWR_ASSERT(gt_pTessellationThreadData
);
1283 HANDLE tsCtx
= TSInitCtx(tsState
.domain
,
1284 tsState
.partitioning
,
1285 tsState
.tsOutputTopology
,
1286 gt_pTessellationThreadData
->pTxCtx
,
1287 gt_pTessellationThreadData
->tsCtxSize
);
1288 if (tsCtx
== nullptr)
1290 gt_pTessellationThreadData
->pTxCtx
=
1291 AlignedMalloc(gt_pTessellationThreadData
->tsCtxSize
, 64);
1292 tsCtx
= TSInitCtx(tsState
.domain
,
1293 tsState
.partitioning
,
1294 tsState
.tsOutputTopology
,
1295 gt_pTessellationThreadData
->pTxCtx
,
1296 gt_pTessellationThreadData
->tsCtxSize
);
1300 #if USE_SIMD16_FRONTEND
1301 PFN_PROCESS_PRIMS_SIMD16 pfnClipFunc
= nullptr;
1302 if (HasRastT::value
)
1304 switch (tsState
.postDSTopology
)
1306 case TOP_TRIANGLE_LIST
:
1307 pfnClipFunc
= ClipTriangles_simd16
;
1310 pfnClipFunc
= ClipLines_simd16
;
1312 case TOP_POINT_LIST
:
1313 pfnClipFunc
= ClipPoints_simd16
;
1316 SWR_INVALID("Unexpected DS output topology: %d", tsState
.postDSTopology
);
1321 PFN_PROCESS_PRIMS pfnClipFunc
= nullptr;
1322 if (HasRastT::value
)
1324 switch (tsState
.postDSTopology
)
1326 case TOP_TRIANGLE_LIST
:
1327 pfnClipFunc
= ClipTriangles
;
1330 pfnClipFunc
= ClipLines
;
1332 case TOP_POINT_LIST
:
1333 pfnClipFunc
= ClipPoints
;
1336 SWR_INVALID("Unexpected DS output topology: %d", tsState
.postDSTopology
);
1341 SWR_HS_CONTEXT
& hsContext
= gt_pTessellationThreadData
->hsContext
;
1342 hsContext
.pCPout
= gt_pTessellationThreadData
->patchData
;
1343 hsContext
.PrimitiveID
= primID
;
1345 uint32_t numVertsPerPrim
= NumVertsPerPrim(pa
.binTopology
, false);
1346 // Max storage for one attribute for an entire simdprimitive
1347 simdvector simdattrib
[MAX_NUM_VERTS_PER_PRIM
];
1349 // assemble all attributes for the input primitives
1350 for (uint32_t slot
= 0; slot
< tsState
.numHsInputAttribs
; ++slot
)
1352 uint32_t attribSlot
= tsState
.vertexAttribOffset
+ slot
;
1353 pa
.Assemble(attribSlot
, simdattrib
);
1355 for (uint32_t i
= 0; i
< numVertsPerPrim
; ++i
)
1357 hsContext
.vert
[i
].attrib
[VERTEX_ATTRIB_START_SLOT
+ slot
] = simdattrib
[i
];
1362 memset(hsContext
.pCPout
, 0x90, sizeof(ScalarPatch
) * KNOB_SIMD_WIDTH
);
1365 #if USE_SIMD16_FRONTEND
1366 uint32_t numPrims
= numPrims_simd8
;
1368 uint32_t numPrims
= pa
.NumPrims();
1370 hsContext
.mask
= GenerateMask(numPrims
);
1373 RDTSC_BEGIN(FEHullShader
, pDC
->drawId
);
1374 state
.pfnHsFunc(GetPrivateState(pDC
), pWorkerData
, &hsContext
);
1375 RDTSC_END(FEHullShader
, 0);
1377 UPDATE_STAT_FE(HsInvocations
, numPrims
);
1378 AR_EVENT(HSStats((HANDLE
)&hsContext
.stats
));
1380 const uint32_t* pPrimId
= (const uint32_t*)&primID
;
1382 for (uint32_t p
= 0; p
< numPrims
; ++p
)
1385 SWR_TS_TESSELLATED_DATA tsData
= {0};
1386 RDTSC_BEGIN(FETessellation
, pDC
->drawId
);
1387 TSTessellate(tsCtx
, hsContext
.pCPout
[p
].tessFactors
, tsData
);
1388 AR_EVENT(TessPrimCount(1));
1389 RDTSC_END(FETessellation
, 0);
1391 if (tsData
.NumPrimitives
== 0)
1395 SWR_ASSERT(tsData
.NumDomainPoints
);
1397 // Allocate DS Output memory
1398 uint32_t requiredDSVectorInvocations
=
1399 AlignUp(tsData
.NumDomainPoints
, KNOB_SIMD_WIDTH
) / KNOB_SIMD_WIDTH
;
1400 #if USE_SIMD16_FRONTEND
1401 size_t requiredAllocSize
= sizeof(simdvector
) * RoundUpEven(requiredDSVectorInvocations
) *
1402 tsState
.dsAllocationSize
; // simd8 -> simd16, padding
1404 size_t requiredDSOutputVectors
= requiredDSVectorInvocations
* tsState
.dsAllocationSize
;
1405 size_t requiredAllocSize
= sizeof(simdvector
) * requiredDSOutputVectors
;
1407 if (requiredAllocSize
> gt_pTessellationThreadData
->dsOutputAllocSize
)
1409 AlignedFree(gt_pTessellationThreadData
->pDSOutput
);
1410 gt_pTessellationThreadData
->pDSOutput
=
1411 (simdscalar
*)AlignedMalloc(requiredAllocSize
, 64);
1412 gt_pTessellationThreadData
->dsOutputAllocSize
= requiredAllocSize
;
1414 SWR_ASSERT(gt_pTessellationThreadData
->pDSOutput
);
1415 SWR_ASSERT(gt_pTessellationThreadData
->dsOutputAllocSize
>= requiredAllocSize
);
1418 memset(gt_pTessellationThreadData
->pDSOutput
, 0x90, requiredAllocSize
);
1421 // Run Domain Shader
1422 SWR_DS_CONTEXT dsContext
;
1423 dsContext
.PrimitiveID
= pPrimId
[p
];
1424 dsContext
.pCpIn
= &hsContext
.pCPout
[p
];
1425 dsContext
.pDomainU
= (simdscalar
*)tsData
.pDomainPointsU
;
1426 dsContext
.pDomainV
= (simdscalar
*)tsData
.pDomainPointsV
;
1427 dsContext
.pOutputData
= gt_pTessellationThreadData
->pDSOutput
;
1428 dsContext
.outVertexAttribOffset
= tsState
.dsOutVtxAttribOffset
;
1429 #if USE_SIMD16_FRONTEND
1430 dsContext
.vectorStride
= RoundUpEven(requiredDSVectorInvocations
); // simd8 -> simd16
1432 dsContext
.vectorStride
= requiredDSVectorInvocations
;
1435 uint32_t dsInvocations
= 0;
1437 for (dsContext
.vectorOffset
= 0; dsContext
.vectorOffset
< requiredDSVectorInvocations
;
1438 ++dsContext
.vectorOffset
)
1440 dsContext
.mask
= GenerateMask(tsData
.NumDomainPoints
- dsInvocations
);
1442 RDTSC_BEGIN(FEDomainShader
, pDC
->drawId
);
1443 state
.pfnDsFunc(GetPrivateState(pDC
), pWorkerData
, &dsContext
);
1444 RDTSC_END(FEDomainShader
, 0);
1446 AR_EVENT(DSStats((HANDLE
)&dsContext
.stats
));
1448 dsInvocations
+= KNOB_SIMD_WIDTH
;
1450 UPDATE_STAT_FE(DsInvocations
, tsData
.NumDomainPoints
);
1452 #if USE_SIMD16_FRONTEND
1453 SWR_ASSERT(IsEven(dsContext
.vectorStride
)); // simd8 -> simd16
1458 #if USE_SIMD16_FRONTEND
1459 reinterpret_cast<const simd16scalar
*>(dsContext
.pOutputData
), // simd8 -> simd16
1460 dsContext
.vectorStride
/ 2, // simd8 -> simd16
1462 dsContext
.pOutputData
,
1463 dsContext
.vectorStride
,
1466 tsState
.numDsOutputAttribs
+ tsState
.dsOutVtxAttribOffset
,
1468 tsData
.NumPrimitives
,
1469 tsState
.postDSTopology
,
1470 NumVertsPerPrim(tsState
.postDSTopology
, false));
1472 while (tessPa
.HasWork())
1474 #if USE_SIMD16_FRONTEND
1475 const uint32_t numPrims
= tessPa
.NumPrims();
1476 const uint32_t numPrims_lo
= std::min
<uint32_t>(numPrims
, KNOB_SIMD_WIDTH
);
1477 const uint32_t numPrims_hi
=
1478 std::max
<uint32_t>(numPrims
, KNOB_SIMD_WIDTH
) - KNOB_SIMD_WIDTH
;
1480 const simd16scalari primID
= _simd16_set1_epi32(dsContext
.PrimitiveID
);
1481 const simdscalari primID_lo
= _simd16_extract_si(primID
, 0);
1482 const simdscalari primID_hi
= _simd16_extract_si(primID
, 1);
1485 if (HasGeometryShaderT::value
)
1487 #if USE_SIMD16_FRONTEND
1488 tessPa
.useAlternateOffset
= false;
1489 GeometryShaderStage
<HasStreamOutT
, HasRastT
>(
1490 pDC
, workerId
, tessPa
, pGsBuffers
, pSoPrimData
, numPrims_lo
, primID_lo
);
1494 tessPa
.useAlternateOffset
= true;
1495 GeometryShaderStage
<HasStreamOutT
, HasRastT
>(
1496 pDC
, workerId
, tessPa
, pGsBuffers
, pSoPrimData
, numPrims_hi
, primID_hi
);
1499 GeometryShaderStage
<HasStreamOutT
, HasRastT
>(
1505 _simd_set1_epi32(dsContext
.PrimitiveID
));
1510 if (HasStreamOutT::value
)
1512 #if ENABLE_AVX512_SIMD16
1513 tessPa
.useAlternateOffset
= false;
1515 StreamOut(pDC
, tessPa
, workerId
, pSoPrimData
, 0);
1518 if (HasRastT::value
)
1520 #if USE_SIMD16_FRONTEND
1521 simd16vector prim_simd16
[3]; // Only deal with triangles, lines, or points
1523 simdvector prim
[3]; // Only deal with triangles, lines, or points
1525 RDTSC_BEGIN(FEPAAssemble
, pDC
->drawId
);
1527 #if USE_SIMD16_FRONTEND
1528 tessPa
.Assemble(VERTEX_POSITION_SLOT
, prim_simd16
);
1530 tessPa
.Assemble(VERTEX_POSITION_SLOT
, prim
);
1532 RDTSC_END(FEPAAssemble
, 1);
1533 SWR_ASSERT(assemble
);
1535 SWR_ASSERT(pfnClipFunc
);
1536 #if USE_SIMD16_FRONTEND
1537 // Gather data from the SVG if provided.
1538 simd16scalari vViewportIdx
= SIMD16::setzero_si();
1539 simd16scalari vRtIdx
= SIMD16::setzero_si();
1540 SIMD16::Vec4 svgAttrib
[4];
1542 if (state
.backendState
.readViewportArrayIndex
||
1543 state
.backendState
.readRenderTargetArrayIndex
)
1545 tessPa
.Assemble(VERTEX_SGV_SLOT
, svgAttrib
);
1548 if (state
.backendState
.readViewportArrayIndex
)
1550 vViewportIdx
= SIMD16::castps_si(svgAttrib
[0][VERTEX_SGV_VAI_COMP
]);
1551 tessPa
.viewportArrayActive
= true;
1553 if (state
.backendState
.readRenderTargetArrayIndex
)
1555 vRtIdx
= SIMD16::castps_si(svgAttrib
[0][VERTEX_SGV_RTAI_COMP
]);
1556 tessPa
.rtArrayActive
= true;
1561 // OOB VPAI indices => forced to zero.
1562 vViewportIdx
= SIMD16::max_epi32(vViewportIdx
, SIMD16::setzero_si());
1563 simd16scalari vNumViewports
=
1564 SIMD16::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS
);
1565 simd16scalari vClearMask
= SIMD16::cmplt_epi32(vViewportIdx
, vNumViewports
);
1566 vViewportIdx
= SIMD16::and_si(vClearMask
, vViewportIdx
);
1568 tessPa
.useAlternateOffset
= false;
1579 // Gather data from the SGV if provided.
1580 simdscalari vViewportIdx
= SIMD::setzero_si();
1581 simdscalari vRtIdx
= SIMD::setzero_si();
1582 SIMD::Vec4 svgAttrib
[4];
1584 if (state
.backendState
.readViewportArrayIndex
||
1585 state
.backendState
.readRenderTargetArrayIndex
)
1587 tessPa
.Assemble(VERTEX_SGV_SLOT
, svgAttrib
);
1590 if (state
.backendState
.readViewportArrayIndex
)
1592 vViewportIdx
= SIMD::castps_si(svgAttrib
[0][VERTEX_SGV_VAI_COMP
]);
1594 // OOB VPAI indices => forced to zero.
1595 vViewportIdx
= SIMD::max_epi32(vViewportIdx
, SIMD::setzero_si());
1596 simdscalari vNumViewports
= SIMD::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS
);
1597 simdscalari vClearMask
= SIMD::cmplt_epi32(vViewportIdx
, vNumViewports
);
1598 vViewportIdx
= SIMD::and_si(vClearMask
, vViewportIdx
);
1599 tessPa
.viewportArrayActive
= true;
1601 if (state
.backendState
.readRenderTargetArrayIndex
)
1603 vRtIdx
= SIMD::castps_si(svgAttrib
[0][VERTEX_SGV_RTAI_COMP
]);
1604 tessPa
.rtArrayActive
= true;
1610 GenMask(tessPa
.NumPrims()),
1611 _simd_set1_epi32(dsContext
.PrimitiveID
),
1620 } // while (tessPa.HasWork())
1621 } // for (uint32_t p = 0; p < numPrims; ++p)
1623 #if USE_SIMD16_FRONTEND
1624 if (gt_pTessellationThreadData
->pDSOutput
!= nullptr)
1626 AlignedFree(gt_pTessellationThreadData
->pDSOutput
);
1627 gt_pTessellationThreadData
->pDSOutput
= nullptr;
1629 gt_pTessellationThreadData
->dsOutputAllocSize
= 0;
1632 TSDestroyCtx(tsCtx
);
1635 THREAD
PA_STATE::SIMDVERTEX
* gpVertexStore
= nullptr;
1636 THREAD
uint32_t gVertexStoreSize
= 0;
1638 //////////////////////////////////////////////////////////////////////////
1639 /// @brief FE handler for SwrDraw.
1640 /// @tparam IsIndexedT - Is indexed drawing enabled
1641 /// @tparam HasTessellationT - Is tessellation enabled
1642 /// @tparam HasGeometryShaderT::value - Is the geometry shader stage enabled
1643 /// @tparam HasStreamOutT - Is stream-out enabled
1644 /// @tparam HasRastT - Is rasterization enabled
1645 /// @param pContext - pointer to SWR context.
1646 /// @param pDC - pointer to draw context.
1647 /// @param workerId - thread's worker id.
1648 /// @param pUserData - Pointer to DRAW_WORK
1649 template <typename IsIndexedT
,
1650 typename IsCutIndexEnabledT
,
1651 typename HasTessellationT
,
1652 typename HasGeometryShaderT
,
1653 typename HasStreamOutT
,
1655 void ProcessDraw(SWR_CONTEXT
* pContext
, DRAW_CONTEXT
* pDC
, uint32_t workerId
, void* pUserData
)
1657 #if KNOB_ENABLE_TOSS_POINTS
1658 if (KNOB_TOSS_QUEUE_FE
)
1664 RDTSC_BEGIN(FEProcessDraw
, pDC
->drawId
);
1666 void* pWorkerData
= pContext
->threadPool
.pThreadData
[workerId
].pWorkerPrivateData
;
1668 DRAW_WORK
& work
= *(DRAW_WORK
*)pUserData
;
1669 const API_STATE
& state
= GetApiState(pDC
);
1671 uint32_t indexSize
= 0;
1672 uint32_t endVertex
= work
.numVerts
;
1674 gfxptr_t xpLastRequestedIndex
= 0;
1675 if (IsIndexedT::value
)
1680 indexSize
= sizeof(uint32_t);
1683 indexSize
= sizeof(uint16_t);
1686 indexSize
= sizeof(uint8_t);
1689 SWR_INVALID("Invalid work.type: %d", work
.type
);
1691 xpLastRequestedIndex
= work
.xpIB
+ endVertex
* indexSize
;
1695 // No cuts, prune partial primitives.
1696 endVertex
= GetNumVerts(state
.topology
, GetNumPrims(state
.topology
, work
.numVerts
));
1699 #if defined(KNOB_ENABLE_RDTSC) || defined(KNOB_ENABLE_AR)
1700 uint32_t numPrims
= GetNumPrims(state
.topology
, work
.numVerts
);
1703 GsBuffers gsBuffers
;
1704 if (HasGeometryShaderT::value
)
1706 #if USE_SIMD16_FRONTEND
1707 AllocateGsBuffers
<SIMD512
, KNOB_SIMD16_WIDTH
>(
1708 pDC
, state
, NumVertsPerPrim(state
.topology
, true), &gsBuffers
);
1710 AllocateGsBuffers
<SIMD256
, KNOB_SIMD_WIDTH
>(
1711 pDC
, state
, NumVertsPerPrim(state
.topology
, true), &gsBuffers
);
1715 if (HasTessellationT::value
)
1717 SWR_ASSERT(state
.tsState
.tsEnable
== true);
1718 SWR_ASSERT(state
.pfnHsFunc
!= nullptr);
1719 SWR_ASSERT(state
.pfnDsFunc
!= nullptr);
1721 AllocateTessellationData(pContext
);
1725 SWR_ASSERT(state
.tsState
.tsEnable
== false);
1726 SWR_ASSERT(state
.pfnHsFunc
== nullptr);
1727 SWR_ASSERT(state
.pfnDsFunc
== nullptr);
1730 // allocate space for streamout input prim data
1731 uint32_t* pSoPrimData
= nullptr;
1732 if (HasStreamOutT::value
)
1734 pSoPrimData
= (uint32_t*)pDC
->pArena
->AllocAligned(4096, 16);
1737 const uint32_t vertexCount
= NumVertsPerPrim(state
.topology
, true);
1738 #if USE_SIMD16_FRONTEND
1739 uint32_t simdVertexSizeBytes
= state
.frontendState
.vsVertexSize
* sizeof(simd16vector
);
1741 uint32_t simdVertexSizeBytes
= state
.frontendState
.vsVertexSize
* sizeof(simdvector
);
1744 SWR_ASSERT(vertexCount
<= MAX_NUM_VERTS_PER_PRIM
);
1746 // Compute storage requirements for vertex store
1747 // TODO: allocation needs to be rethought for better cut support
1748 uint32_t numVerts
= vertexCount
+ 2; // Need extra space for PA state machine
1749 uint32_t vertexStoreSize
= numVerts
* simdVertexSizeBytes
;
1751 // grow the vertex store for the PA as necessary
1752 if (gVertexStoreSize
< vertexStoreSize
)
1754 if (gpVertexStore
!= nullptr)
1756 AlignedFree(gpVertexStore
);
1757 gpVertexStore
= nullptr;
1760 SWR_ASSERT(gpVertexStore
== nullptr);
1762 gpVertexStore
= reinterpret_cast<PA_STATE::SIMDVERTEX
*>(AlignedMalloc(vertexStoreSize
, 64));
1763 gVertexStoreSize
= vertexStoreSize
;
1765 SWR_ASSERT(gpVertexStore
!= nullptr);
1768 // choose primitive assembler
1770 PA_FACTORY
<IsIndexedT
, IsCutIndexEnabledT
> paFactory(pDC
,
1775 state
.frontendState
.vsVertexSize
,
1776 GetNumVerts(state
.topology
, 1));
1777 PA_STATE
& pa
= paFactory
.GetPA();
1779 #if USE_SIMD16_FRONTEND
1780 #if USE_SIMD16_SHADERS
1786 SWR_VS_CONTEXT vsContext_lo
;
1787 SWR_VS_CONTEXT vsContext_hi
;
1789 #if USE_SIMD16_SHADERS
1790 vsContext_lo
.pVin
= reinterpret_cast<simdvertex
*>(&vin
);
1791 vsContext_hi
.pVin
= reinterpret_cast<simdvertex
*>(&vin
);
1793 vsContext_lo
.pVin
= &vin_lo
;
1794 vsContext_hi
.pVin
= &vin_hi
;
1796 vsContext_lo
.AlternateOffset
= 0;
1797 vsContext_hi
.AlternateOffset
= 1;
1799 SWR_FETCH_CONTEXT fetchInfo_lo
= {0};
1801 fetchInfo_lo
.pStreams
= &state
.vertexBuffers
[0];
1802 fetchInfo_lo
.StartInstance
= work
.startInstance
;
1803 fetchInfo_lo
.StartVertex
= 0;
1805 if (IsIndexedT::value
)
1807 fetchInfo_lo
.BaseVertex
= work
.baseVertex
;
1809 // if the entire index buffer isn't being consumed, set the last index
1810 // so that fetches < a SIMD wide will be masked off
1811 fetchInfo_lo
.xpLastIndex
= state
.indexBuffer
.xpIndices
+ state
.indexBuffer
.size
;
1812 if (xpLastRequestedIndex
< fetchInfo_lo
.xpLastIndex
)
1814 fetchInfo_lo
.xpLastIndex
= xpLastRequestedIndex
;
1819 fetchInfo_lo
.StartVertex
= work
.startVertex
;
1822 SWR_FETCH_CONTEXT fetchInfo_hi
= fetchInfo_lo
;
1824 const simd16scalari vScale
=
1825 _simd16_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
1827 for (uint32_t instanceNum
= 0; instanceNum
< work
.numInstances
; instanceNum
++)
1831 simd16scalari vIndex
;
1833 if (IsIndexedT::value
)
1835 fetchInfo_lo
.xpIndices
= work
.xpIB
;
1836 fetchInfo_hi
.xpIndices
=
1837 fetchInfo_lo
.xpIndices
+ KNOB_SIMD_WIDTH
* indexSize
; // 1/2 of KNOB_SIMD16_WIDTH
1841 vIndex
= _simd16_add_epi32(_simd16_set1_epi32(work
.startVertexID
), vScale
);
1843 fetchInfo_lo
.xpIndices
= pDC
->pContext
->pfnMakeGfxPtr(GetPrivateState(pDC
), &vIndex
);
1844 fetchInfo_hi
.xpIndices
= pDC
->pContext
->pfnMakeGfxPtr(
1845 GetPrivateState(pDC
),
1846 &vIndex
+ KNOB_SIMD_WIDTH
* sizeof(int32_t)); // 1/2 of KNOB_SIMD16_WIDTH
1849 fetchInfo_lo
.CurInstance
= instanceNum
;
1850 fetchInfo_hi
.CurInstance
= instanceNum
;
1852 vsContext_lo
.InstanceID
= instanceNum
;
1853 vsContext_hi
.InstanceID
= instanceNum
;
1855 while (pa
.HasWork())
1857 // GetNextVsOutput currently has the side effect of updating some PA state machine
1858 // state. So we need to keep this outside of (i < endVertex) check.
1860 simdmask
* pvCutIndices_lo
= nullptr;
1861 simdmask
* pvCutIndices_hi
= nullptr;
1863 if (IsIndexedT::value
)
1865 // simd16mask <=> simdmask[2]
1867 pvCutIndices_lo
= &reinterpret_cast<simdmask
*>(&pa
.GetNextVsIndices())[0];
1868 pvCutIndices_hi
= &reinterpret_cast<simdmask
*>(&pa
.GetNextVsIndices())[1];
1871 simd16vertex
& vout
= pa
.GetNextVsOutput();
1873 vsContext_lo
.pVout
= reinterpret_cast<simdvertex
*>(&vout
);
1874 vsContext_hi
.pVout
= reinterpret_cast<simdvertex
*>(&vout
);
1878 if (!IsIndexedT::value
)
1880 fetchInfo_lo
.xpLastIndex
= fetchInfo_lo
.xpIndices
;
1882 offset
= std::min(endVertex
- i
, (uint32_t)KNOB_SIMD16_WIDTH
);
1883 offset
*= 4; // convert from index to address
1884 #if USE_SIMD16_SHADERS
1885 fetchInfo_lo
.xpLastIndex
+= offset
;
1887 fetchInfo_lo
.xpLastIndex
+= std::min(offset
, (uint32_t)KNOB_SIMD_WIDTH
);
1889 std::min(offset
, (uint32_t)KNOB_SIMD16_WIDTH
) - KNOB_SIMD_WIDTH
;
1890 assert(offset
>= 0);
1891 fetchInfo_hi
.xpLastIndex
= fetchInfo_hi
.xpIndices
;
1892 fetchInfo_hi
.xpLastIndex
+= offset2
;
1895 // 1. Execute FS/VS for a single SIMD.
1896 RDTSC_BEGIN(FEFetchShader
, pDC
->drawId
);
1897 #if USE_SIMD16_SHADERS
1898 state
.pfnFetchFunc(GetPrivateState(pDC
), pWorkerData
, fetchInfo_lo
, vin
);
1900 state
.pfnFetchFunc(GetPrivateState(pDC
), pWorkerData
, fetchInfo_lo
, vin_lo
);
1902 if ((i
+ KNOB_SIMD_WIDTH
) < endVertex
) // 1/2 of KNOB_SIMD16_WIDTH
1904 state
.pfnFetchFunc(GetPrivateState(pDC
), pWorkerData
, fetchInfo_hi
, vin_hi
);
1907 RDTSC_END(FEFetchShader
, 0);
1909 // forward fetch generated vertex IDs to the vertex shader
1910 #if USE_SIMD16_SHADERS
1912 vsContext_lo
.VertexID16
=
1913 _simd16_insert_si(vsContext_lo
.VertexID16
, fetchInfo_lo
.VertexID
, 0);
1914 vsContext_lo
.VertexID16
=
1915 _simd16_insert_si(vsContext_lo
.VertexID16
, fetchInfo_lo
.VertexID2
, 1);
1917 vsContext_lo
.VertexID
= fetchInfo_lo
.VertexID
;
1918 vsContext_hi
.VertexID
= fetchInfo_lo
.VertexID2
;
1921 vsContext_lo
.VertexID
= fetchInfo_lo
.VertexID
;
1922 vsContext_hi
.VertexID
= fetchInfo_hi
.VertexID
;
1925 // Setup active mask for vertex shader.
1927 vsContext_lo
.mask16
= GenerateMask16(endVertex
- i
);
1929 vsContext_lo
.mask
= GenerateMask(endVertex
- i
);
1930 vsContext_hi
.mask
= GenerateMask(endVertex
- (i
+ KNOB_SIMD_WIDTH
));
1933 // forward cut mask to the PA
1934 if (IsIndexedT::value
)
1936 #if USE_SIMD16_SHADERS
1937 *pvCutIndices_lo
= _simd_movemask_ps(_simd_castsi_ps(fetchInfo_lo
.CutMask
));
1938 *pvCutIndices_hi
= _simd_movemask_ps(_simd_castsi_ps(fetchInfo_lo
.CutMask2
));
1940 *pvCutIndices_lo
= _simd_movemask_ps(_simd_castsi_ps(fetchInfo_lo
.CutMask
));
1941 *pvCutIndices_hi
= _simd_movemask_ps(_simd_castsi_ps(fetchInfo_hi
.CutMask
));
1945 UPDATE_STAT_FE(IaVertices
, GetNumInvocations(i
, endVertex
));
1947 #if KNOB_ENABLE_TOSS_POINTS
1948 if (!KNOB_TOSS_FETCH
)
1951 RDTSC_BEGIN(FEVertexShader
, pDC
->drawId
);
1953 state
.pfnVertexFunc(GetPrivateState(pDC
), pWorkerData
, &vsContext_lo
);
1954 AR_EVENT(VSStats((HANDLE
)&vsContext_lo
.stats
));
1956 state
.pfnVertexFunc(GetPrivateState(pDC
), pWorkerData
, &vsContext_lo
);
1957 AR_EVENT(VSStats((HANDLE
)&vsContext_lo
.stats
));
1959 if ((i
+ KNOB_SIMD_WIDTH
) < endVertex
) // 1/2 of KNOB_SIMD16_WIDTH
1961 state
.pfnVertexFunc(GetPrivateState(pDC
), pWorkerData
, &vsContext_hi
);
1962 AR_EVENT(VSStats((HANDLE
)&vsContext_hi
.stats
));
1965 RDTSC_END(FEVertexShader
, 0);
1967 UPDATE_STAT_FE(VsInvocations
, GetNumInvocations(i
, endVertex
));
1971 // 2. Assemble primitives given the last two SIMD.
1974 simd16vector prim_simd16
[MAX_NUM_VERTS_PER_PRIM
];
1976 RDTSC_START(FEPAAssemble
);
1977 bool assemble
= pa
.Assemble(VERTEX_POSITION_SLOT
, prim_simd16
);
1978 RDTSC_STOP(FEPAAssemble
, 1, 0);
1980 #if KNOB_ENABLE_TOSS_POINTS
1981 if (!KNOB_TOSS_FETCH
)
1984 #if KNOB_ENABLE_TOSS_POINTS
1990 UPDATE_STAT_FE(IaPrimitives
, pa
.NumPrims());
1992 const uint32_t numPrims
= pa
.NumPrims();
1993 const uint32_t numPrims_lo
=
1994 std::min
<uint32_t>(numPrims
, KNOB_SIMD_WIDTH
);
1995 const uint32_t numPrims_hi
=
1996 std::max
<uint32_t>(numPrims
, KNOB_SIMD_WIDTH
) - KNOB_SIMD_WIDTH
;
1998 const simd16scalari primID
= pa
.GetPrimID(work
.startPrimID
);
1999 const simdscalari primID_lo
= _simd16_extract_si(primID
, 0);
2000 const simdscalari primID_hi
= _simd16_extract_si(primID
, 1);
2002 if (HasTessellationT::value
)
2004 pa
.useAlternateOffset
= false;
2005 TessellationStages
<HasGeometryShaderT
, HasStreamOutT
, HasRastT
>(
2016 pa
.useAlternateOffset
= true;
2017 TessellationStages
<HasGeometryShaderT
, HasStreamOutT
, HasRastT
>(
2027 else if (HasGeometryShaderT::value
)
2029 pa
.useAlternateOffset
= false;
2030 GeometryShaderStage
<HasStreamOutT
, HasRastT
>(pDC
,
2040 pa
.useAlternateOffset
= true;
2041 GeometryShaderStage
<HasStreamOutT
, HasRastT
>(pDC
,
2052 // If streamout is enabled then stream vertices out to memory.
2053 if (HasStreamOutT::value
)
2055 pa
.useAlternateOffset
= false;
2056 StreamOut(pDC
, pa
, workerId
, pSoPrimData
, 0);
2059 if (HasRastT::value
)
2061 SWR_ASSERT(pDC
->pState
->pfnProcessPrims_simd16
);
2062 // Gather data from the SVG if provided.
2063 simd16scalari vpai
= SIMD16::setzero_si();
2064 simd16scalari rtai
= SIMD16::setzero_si();
2065 SIMD16::Vec4 svgAttrib
[4];
2067 if (state
.backendState
.readViewportArrayIndex
||
2068 state
.backendState
.readRenderTargetArrayIndex
)
2070 pa
.Assemble(VERTEX_SGV_SLOT
, svgAttrib
);
2073 if (state
.backendState
.readViewportArrayIndex
)
2075 vpai
= SIMD16::castps_si(svgAttrib
[0][VERTEX_SGV_VAI_COMP
]);
2076 pa
.viewportArrayActive
= true;
2078 if (state
.backendState
.readRenderTargetArrayIndex
)
2081 SIMD16::castps_si(svgAttrib
[0][VERTEX_SGV_RTAI_COMP
]);
2082 pa
.rtArrayActive
= true;
2086 // OOB VPAI indices => forced to zero.
2087 vpai
= SIMD16::max_epi32(vpai
, SIMD16::setzero_si());
2088 simd16scalari vNumViewports
=
2089 SIMD16::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS
);
2090 simd16scalari vClearMask
=
2091 SIMD16::cmplt_epi32(vpai
, vNumViewports
);
2092 vpai
= SIMD16::and_si(vClearMask
, vpai
);
2094 pa
.useAlternateOffset
= false;
2095 pDC
->pState
->pfnProcessPrims_simd16(pDC
,
2109 } while (pa
.NextPrim());
2111 if (IsIndexedT::value
)
2113 fetchInfo_lo
.xpIndices
= fetchInfo_lo
.xpIndices
+ KNOB_SIMD16_WIDTH
* indexSize
;
2114 fetchInfo_hi
.xpIndices
= fetchInfo_hi
.xpIndices
+ KNOB_SIMD16_WIDTH
* indexSize
;
2118 vIndex
= _simd16_add_epi32(vIndex
, _simd16_set1_epi32(KNOB_SIMD16_WIDTH
));
2121 i
+= KNOB_SIMD16_WIDTH
;
2128 SWR_VS_CONTEXT vsContext
;
2129 SWR_FETCH_CONTEXT fetchInfo
= {0};
2131 fetchInfo
.pStreams
= &state
.vertexBuffers
[0];
2132 fetchInfo
.StartInstance
= work
.startInstance
;
2133 fetchInfo
.StartVertex
= 0;
2135 if (IsIndexedT::value
)
2137 fetchInfo
.BaseVertex
= work
.baseVertex
;
2139 // if the entire index buffer isn't being consumed, set the last index
2140 // so that fetches < a SIMD wide will be masked off
2141 fetchInfo
.pLastIndex
=
2142 (const int32_t*)(((uint8_t*)state
.indexBuffer
.pIndices
) + state
.indexBuffer
.size
);
2143 if (xpLastRequestedIndex
< fetchInfo
.pLastIndex
)
2145 fetchInfo
.pLastIndex
= xpLastRequestedIndex
;
2150 fetchInfo
.StartVertex
= work
.startVertex
;
2153 const simdscalari vScale
= _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
2155 /// @todo: temporarily move instance loop in the FE to ensure SO ordering
2156 for (uint32_t instanceNum
= 0; instanceNum
< work
.numInstances
; instanceNum
++)
2161 if (IsIndexedT::value
)
2163 fetchInfo
.pIndices
= work
.pIB
;
2167 vIndex
= _simd_add_epi32(_simd_set1_epi32(work
.startVertexID
), vScale
);
2168 fetchInfo
.pIndices
= (const int32_t*)&vIndex
;
2171 fetchInfo
.CurInstance
= instanceNum
;
2172 vsContext
.InstanceID
= instanceNum
;
2174 while (pa
.HasWork())
2176 // GetNextVsOutput currently has the side effect of updating some PA state machine
2177 // state. So we need to keep this outside of (i < endVertex) check.
2178 simdmask
* pvCutIndices
= nullptr;
2179 if (IsIndexedT::value
)
2181 pvCutIndices
= &pa
.GetNextVsIndices();
2184 simdvertex
& vout
= pa
.GetNextVsOutput();
2185 vsContext
.pVin
= &vout
;
2186 vsContext
.pVout
= &vout
;
2190 // 1. Execute FS/VS for a single SIMD.
2191 RDTSC_BEGIN(FEFetchShader
, pDC
->drawId
);
2192 state
.pfnFetchFunc(GetPrivateState(pDC
), pWorkerData
, fetchInfo
, vout
);
2193 RDTSC_END(FEFetchShader
, 0);
2195 // forward fetch generated vertex IDs to the vertex shader
2196 vsContext
.VertexID
= fetchInfo
.VertexID
;
2198 // Setup active mask for vertex shader.
2199 vsContext
.mask
= GenerateMask(endVertex
- i
);
2201 // forward cut mask to the PA
2202 if (IsIndexedT::value
)
2204 *pvCutIndices
= _simd_movemask_ps(_simd_castsi_ps(fetchInfo
.CutMask
));
2207 UPDATE_STAT_FE(IaVertices
, GetNumInvocations(i
, endVertex
));
2209 #if KNOB_ENABLE_TOSS_POINTS
2210 if (!KNOB_TOSS_FETCH
)
2213 RDTSC_BEGIN(FEVertexShader
, pDC
->drawId
);
2214 state
.pfnVertexFunc(GetPrivateState(pDC
), pWorkerData
, &vsContext
);
2215 RDTSC_END(FEVertexShader
, 0);
2217 UPDATE_STAT_FE(VsInvocations
, GetNumInvocations(i
, endVertex
));
2218 AR_EVENT(VSStats((HANDLE
)&vsContext
.stats
));
2222 // 2. Assemble primitives given the last two SIMD.
2225 simdvector prim
[MAX_NUM_VERTS_PER_PRIM
];
2226 // PaAssemble returns false if there is not enough verts to assemble.
2227 RDTSC_BEGIN(FEPAAssemble
, pDC
->drawId
);
2228 bool assemble
= pa
.Assemble(VERTEX_POSITION_SLOT
, prim
);
2229 RDTSC_END(FEPAAssemble
, 1);
2231 #if KNOB_ENABLE_TOSS_POINTS
2232 if (!KNOB_TOSS_FETCH
)
2235 #if KNOB_ENABLE_TOSS_POINTS
2241 UPDATE_STAT_FE(IaPrimitives
, pa
.NumPrims());
2243 if (HasTessellationT::value
)
2245 TessellationStages
<HasGeometryShaderT
, HasStreamOutT
, HasRastT
>(
2251 pa
.GetPrimID(work
.startPrimID
));
2253 else if (HasGeometryShaderT::value
)
2255 GeometryShaderStage
<HasStreamOutT
, HasRastT
>(
2261 pa
.GetPrimID(work
.startPrimID
));
2265 // If streamout is enabled then stream vertices out to memory.
2266 if (HasStreamOutT::value
)
2268 StreamOut(pDC
, pa
, workerId
, pSoPrimData
, 0);
2271 if (HasRastT::value
)
2273 SWR_ASSERT(pDC
->pState
->pfnProcessPrims
);
2275 // Gather data from the SVG if provided.
2276 simdscalari vViewportIdx
= SIMD::setzero_si();
2277 simdscalari vRtIdx
= SIMD::setzero_si();
2278 SIMD::Vec4 svgAttrib
[4];
2280 if (state
.backendState
.readViewportArrayIndex
||
2281 state
.backendState
.readRenderTargetArrayIndex
)
2283 pa
.Assemble(VERTEX_SGV_SLOT
, svgAttrib
);
2286 if (state
.backendState
.readViewportArrayIndex
)
2289 SIMD::castps_si(svgAttrib
[0][VERTEX_SGV_VAI_COMP
]);
2291 // OOB VPAI indices => forced to zero.
2293 SIMD::max_epi32(vViewportIdx
, SIMD::setzero_si());
2294 simdscalari vNumViewports
=
2295 SIMD::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS
);
2296 simdscalari vClearMask
=
2297 SIMD::cmplt_epi32(vViewportIdx
, vNumViewports
);
2298 vViewportIdx
= SIMD::and_si(vClearMask
, vViewportIdx
);
2299 pa
.viewportArrayActive
= true;
2301 if (state
.backendState
.readRenderTargetArrayIndex
)
2304 SIMD::castps_si(svgAttrib
[0][VERTEX_SGV_RTAI_COMP
]);
2305 pa
.rtArrayActive
= true;
2308 pDC
->pState
->pfnProcessPrims(pDC
,
2312 GenMask(pa
.NumPrims()),
2313 pa
.GetPrimID(work
.startPrimID
),
2321 } while (pa
.NextPrim());
2323 if (IsIndexedT::value
)
2325 fetchInfo
.pIndices
=
2326 (int*)((uint8_t*)fetchInfo
.pIndices
+ KNOB_SIMD_WIDTH
* indexSize
);
2330 vIndex
= _simd_add_epi32(vIndex
, _simd_set1_epi32(KNOB_SIMD_WIDTH
));
2333 i
+= KNOB_SIMD_WIDTH
;
2340 RDTSC_END(FEProcessDraw
, numPrims
* work
.numInstances
);
2343 struct FEDrawChooser
2345 typedef PFN_FE_WORK_FUNC FuncType
;
2347 template <typename
... ArgsB
>
2348 static FuncType
GetFunc()
2350 return ProcessDraw
<ArgsB
...>;
2354 // Selector for correct templated Draw front-end function
2355 PFN_FE_WORK_FUNC
GetProcessDrawFunc(bool IsIndexed
,
2356 bool IsCutIndexEnabled
,
2357 bool HasTessellation
,
2358 bool HasGeometryShader
,
2360 bool HasRasterization
)
2362 return TemplateArgUnroller
<FEDrawChooser
>::GetFunc(IsIndexed
,