/****************************************************************************
 * Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * @brief Implementation for Frontend which handles vertex processing,
 *        primitive assembly, clipping, binning, etc.
 *
 ******************************************************************************/
#include "api.h"
#include "frontend.h"
#include "backend.h"
#include "context.h"
#include "rdtsc_core.h"
#include "utils.h"
#include "threads.h"
#include "pa.h"
#include "clip.h"
#include "tilemgr.h"
#include "tessellator.h"
#include <limits>
//////////////////////////////////////////////////////////////////////////
/// @brief FE handler for SwrSync.
/// @param pContext - pointer to SWR context.
/// @param pDC - pointer to draw context.
/// @param workerId - thread's worker id. Every thread has a unique id.
/// @param pUserData - Pointer to user data passed back to sync callback.
/// @todo This should go away when we switch this to use compute threading.
void ProcessSync(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t workerId, void* pUserData)
{
    BE_WORK work;
    work.type    = SYNC;
    work.pfnWork = ProcessSyncBE;

    MacroTileMgr* pTileMgr = pDC->pTileMgr;
    pTileMgr->enqueue(0, 0, &work);
}
//////////////////////////////////////////////////////////////////////////
/// @brief FE handler for SwrDestroyContext.
/// @param pContext - pointer to SWR context.
/// @param pDC - pointer to draw context.
/// @param workerId - thread's worker id. Every thread has a unique id.
/// @param pUserData - Pointer to user data passed back to sync callback.
void ProcessShutdown(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t workerId, void* pUserData)
{
    BE_WORK work;
    work.type    = SHUTDOWN;
    work.pfnWork = ProcessShutdownBE;

    MacroTileMgr* pTileMgr = pDC->pTileMgr;
    // Enqueue at least 1 work item for each worker thread
    // account for number of numa nodes
    uint32_t numNumaNodes = pContext->threadPool.numaMask + 1;

    for (uint32_t i = 0; i < pContext->threadPool.numThreads; ++i)
    {
        for (uint32_t n = 0; n < numNumaNodes; ++n)
        {
            pTileMgr->enqueue(i, n, &work);
        }
    }
}
//////////////////////////////////////////////////////////////////////////
/// @brief FE handler for SwrClearRenderTarget.
/// @param pContext - pointer to SWR context.
/// @param pDC - pointer to draw context.
/// @param workerId - thread's worker id. Every thread has a unique id.
/// @param pUserData - Pointer to user data passed back to clear callback.
/// @todo This should go away when we switch this to use compute threading.
void ProcessClear(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t workerId, void* pUserData)
{
    CLEAR_DESC*   pDesc    = (CLEAR_DESC*)pUserData;
    MacroTileMgr* pTileMgr = pDC->pTileMgr;

    // queue a clear to each macro tile
    // compute macro tile bounds for the specified rect
    uint32_t macroTileXMin = pDesc->rect.xmin / KNOB_MACROTILE_X_DIM;
    uint32_t macroTileXMax = (pDesc->rect.xmax - 1) / KNOB_MACROTILE_X_DIM;
    uint32_t macroTileYMin = pDesc->rect.ymin / KNOB_MACROTILE_Y_DIM;
    uint32_t macroTileYMax = (pDesc->rect.ymax - 1) / KNOB_MACROTILE_Y_DIM;

    BE_WORK work;
    work.type       = CLEAR;
    work.pfnWork    = ProcessClearBE;
    work.desc.clear = *pDesc;

    for (uint32_t y = macroTileYMin; y <= macroTileYMax; ++y)
    {
        for (uint32_t x = macroTileXMin; x <= macroTileXMax; ++x)
        {
            pTileMgr->enqueue(x, y, &work);
        }
    }
}
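// Worked example of the macro tile bounds math above (illustrative only, not part
// of the original sources): rect xmin/xmax are inclusive/exclusive pixel bounds, so
// with a hypothetical KNOB_MACROTILE_X_DIM of 64, a rect spanning x = [0, 100)
// gives macroTileXMin = 0 / 64 = 0 and macroTileXMax = (100 - 1) / 64 = 1, and the
// loops enqueue work to macro tile columns 0 and 1. Subtracting 1 from the
// exclusive xmax before dividing keeps an exactly tile-aligned rect (xmax == 64)
// from enqueueing to a macro tile it does not actually touch.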
//////////////////////////////////////////////////////////////////////////
/// @brief FE handler for SwrStoreTiles.
/// @param pContext - pointer to SWR context.
/// @param pDC - pointer to draw context.
/// @param workerId - thread's worker id. Every thread has a unique id.
/// @param pUserData - Pointer to user data passed back to callback.
/// @todo This should go away when we switch this to use compute threading.
void ProcessStoreTiles(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t workerId, void* pUserData)
{
    RDTSC_BEGIN(pContext->pBucketMgr, FEProcessStoreTiles, pDC->drawId);
    MacroTileMgr*     pTileMgr = pDC->pTileMgr;
    STORE_TILES_DESC* pDesc    = (STORE_TILES_DESC*)pUserData;

    // queue a store to each macro tile
    // compute macro tile bounds for the specified rect
    uint32_t macroTileXMin = pDesc->rect.xmin / KNOB_MACROTILE_X_DIM;
    uint32_t macroTileXMax = (pDesc->rect.xmax - 1) / KNOB_MACROTILE_X_DIM;
    uint32_t macroTileYMin = pDesc->rect.ymin / KNOB_MACROTILE_Y_DIM;
    uint32_t macroTileYMax = (pDesc->rect.ymax - 1) / KNOB_MACROTILE_Y_DIM;

    BE_WORK work;
    work.type            = STORETILES;
    work.pfnWork         = ProcessStoreTilesBE;
    work.desc.storeTiles = *pDesc;

    for (uint32_t y = macroTileYMin; y <= macroTileYMax; ++y)
    {
        for (uint32_t x = macroTileXMin; x <= macroTileXMax; ++x)
        {
            pTileMgr->enqueue(x, y, &work);
        }
    }

    RDTSC_END(pContext->pBucketMgr, FEProcessStoreTiles, 0);
}
//////////////////////////////////////////////////////////////////////////
/// @brief FE handler for SwrInvalidateTiles.
/// @param pContext - pointer to SWR context.
/// @param pDC - pointer to draw context.
/// @param workerId - thread's worker id. Every thread has a unique id.
/// @param pUserData - Pointer to user data passed back to callback.
/// @todo This should go away when we switch this to use compute threading.
void ProcessDiscardInvalidateTiles(SWR_CONTEXT* pContext,
                                   DRAW_CONTEXT* pDC,
                                   uint32_t workerId,
                                   void* pUserData)
{
    RDTSC_BEGIN(pContext->pBucketMgr, FEProcessInvalidateTiles, pDC->drawId);
    DISCARD_INVALIDATE_TILES_DESC* pDesc    = (DISCARD_INVALIDATE_TILES_DESC*)pUserData;
    MacroTileMgr*                  pTileMgr = pDC->pTileMgr;

    // compute macro tile bounds for the specified rect
    uint32_t macroTileXMin = (pDesc->rect.xmin + KNOB_MACROTILE_X_DIM - 1) / KNOB_MACROTILE_X_DIM;
    uint32_t macroTileXMax = (pDesc->rect.xmax / KNOB_MACROTILE_X_DIM) - 1;
    uint32_t macroTileYMin = (pDesc->rect.ymin + KNOB_MACROTILE_Y_DIM - 1) / KNOB_MACROTILE_Y_DIM;
    uint32_t macroTileYMax = (pDesc->rect.ymax / KNOB_MACROTILE_Y_DIM) - 1;

    if (pDesc->fullTilesOnly == false)
    {
        // include partial tiles
        macroTileXMin = pDesc->rect.xmin / KNOB_MACROTILE_X_DIM;
        macroTileXMax = (pDesc->rect.xmax - 1) / KNOB_MACROTILE_X_DIM;
        macroTileYMin = pDesc->rect.ymin / KNOB_MACROTILE_Y_DIM;
        macroTileYMax = (pDesc->rect.ymax - 1) / KNOB_MACROTILE_Y_DIM;
    }

    SWR_ASSERT(macroTileXMax <= KNOB_NUM_HOT_TILES_X);
    SWR_ASSERT(macroTileYMax <= KNOB_NUM_HOT_TILES_Y);

    macroTileXMax = std::min<int32_t>(macroTileXMax, KNOB_NUM_HOT_TILES_X);
    macroTileYMax = std::min<int32_t>(macroTileYMax, KNOB_NUM_HOT_TILES_Y);

    BE_WORK work;
    work.type    = DISCARDINVALIDATETILES;
    work.pfnWork = ProcessDiscardInvalidateTilesBE;
    work.desc.discardInvalidateTiles = *pDesc;

    for (uint32_t x = macroTileXMin; x <= macroTileXMax; ++x)
    {
        for (uint32_t y = macroTileYMin; y <= macroTileYMax; ++y)
        {
            pTileMgr->enqueue(x, y, &work);
        }
    }

    RDTSC_END(pContext->pBucketMgr, FEProcessInvalidateTiles, 0);
}
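// Illustrative contrast with the clear/store bounds above (not part of the
// original sources): with fullTilesOnly the min bounds round up and the max bounds
// round down, so for a hypothetical KNOB_MACROTILE_X_DIM of 64 a rect spanning
// x = [10, 200) yields macroTileXMin = (10 + 63) / 64 = 1 and
// macroTileXMax = (200 / 64) - 1 = 2, touching only tiles the rect covers
// completely; with fullTilesOnly == false the bounds re-expand to tiles 0..3 so
// partially covered tiles are included as well.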
//////////////////////////////////////////////////////////////////////////
/// @brief Computes the number of primitives given the number of verts.
/// @param mode - primitive topology for draw operation.
/// @param numPrims - number of vertices or indices for draw.
/// @todo Frontend needs to be refactored. This will go in appropriate place then.
uint32_t GetNumPrims(PRIMITIVE_TOPOLOGY mode, uint32_t numPrims)
{
    switch (mode)
    {
    case TOP_POINT_LIST:
        return numPrims;
    case TOP_TRIANGLE_LIST:
        return numPrims / 3;
    case TOP_TRIANGLE_STRIP:
        return numPrims < 3 ? 0 : numPrims - 2;
    case TOP_TRIANGLE_FAN:
        return numPrims < 3 ? 0 : numPrims - 2;
    case TOP_TRIANGLE_DISC:
        return numPrims < 2 ? 0 : numPrims - 1;
    case TOP_QUAD_LIST:
        return numPrims / 4;
    case TOP_QUAD_STRIP:
        return numPrims < 4 ? 0 : (numPrims - 2) / 2;
    case TOP_LINE_STRIP:
        return numPrims < 2 ? 0 : numPrims - 1;
    case TOP_LINE_LIST:
        return numPrims / 2;
    case TOP_LINE_LOOP:
        return numPrims;
    case TOP_RECT_LIST:
        return numPrims / 3;
    case TOP_LINE_LIST_ADJ:
        return numPrims / 4;
    case TOP_LISTSTRIP_ADJ:
        return numPrims < 3 ? 0 : numPrims - 3;
    case TOP_TRI_LIST_ADJ:
        return numPrims / 6;
    case TOP_TRI_STRIP_ADJ:
        return numPrims < 4 ? 0 : (numPrims / 2) - 2;

    case TOP_PATCHLIST_1:
    case TOP_PATCHLIST_2:
    case TOP_PATCHLIST_3:
    case TOP_PATCHLIST_4:
    case TOP_PATCHLIST_5:
    case TOP_PATCHLIST_6:
    case TOP_PATCHLIST_7:
    case TOP_PATCHLIST_8:
    case TOP_PATCHLIST_9:
    case TOP_PATCHLIST_10:
    case TOP_PATCHLIST_11:
    case TOP_PATCHLIST_12:
    case TOP_PATCHLIST_13:
    case TOP_PATCHLIST_14:
    case TOP_PATCHLIST_15:
    case TOP_PATCHLIST_16:
    case TOP_PATCHLIST_17:
    case TOP_PATCHLIST_18:
    case TOP_PATCHLIST_19:
    case TOP_PATCHLIST_20:
    case TOP_PATCHLIST_21:
    case TOP_PATCHLIST_22:
    case TOP_PATCHLIST_23:
    case TOP_PATCHLIST_24:
    case TOP_PATCHLIST_25:
    case TOP_PATCHLIST_26:
    case TOP_PATCHLIST_27:
    case TOP_PATCHLIST_28:
    case TOP_PATCHLIST_29:
    case TOP_PATCHLIST_30:
    case TOP_PATCHLIST_31:
    case TOP_PATCHLIST_32:
        return numPrims / (mode - TOP_PATCHLIST_BASE);

    case TOP_POLYGON:
    case TOP_POINT_LIST_BF:
    case TOP_LINE_STRIP_CONT:
    case TOP_LINE_STRIP_BF:
    case TOP_LINE_STRIP_CONT_BF:
    case TOP_TRIANGLE_FAN_NOSTIPPLE:
    case TOP_TRI_STRIP_REVERSE:
    case TOP_PATCHLIST_BASE:
    case TOP_UNKNOWN:
        SWR_INVALID("Unsupported topology: %d", mode);
        return 0;
    }

    return 0;
}
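// Illustrative usage (a sketch, not present in the original sources; the vertex
// counts below are arbitrary): strip/fan topologies clamp short draws to zero
// primitives, while patchlists divide by the per-patch control point count.
#if 0
uint32_t tris    = GetNumPrims(TOP_TRIANGLE_STRIP, 10); // 10 verts -> 8 triangles
uint32_t none    = GetNumPrims(TOP_TRIANGLE_STRIP, 2);  // too few verts -> 0
uint32_t patches = GetNumPrims(TOP_PATCHLIST_4, 12);    // 12 control points -> 3 patches
#endif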
//////////////////////////////////////////////////////////////////////////
/// @brief Computes the number of verts given the number of primitives.
/// @param mode - primitive topology for draw operation.
/// @param numPrims - number of primitives for draw.
uint32_t GetNumVerts(PRIMITIVE_TOPOLOGY mode, uint32_t numPrims)
{
    switch (mode)
    {
    case TOP_POINT_LIST:
        return numPrims;
    case TOP_TRIANGLE_LIST:
        return numPrims * 3;
    case TOP_TRIANGLE_STRIP:
        return numPrims ? numPrims + 2 : 0;
    case TOP_TRIANGLE_FAN:
        return numPrims ? numPrims + 2 : 0;
    case TOP_TRIANGLE_DISC:
        return numPrims ? numPrims + 1 : 0;
    case TOP_QUAD_LIST:
        return numPrims * 4;
    case TOP_QUAD_STRIP:
        return numPrims ? numPrims * 2 + 2 : 0;
    case TOP_LINE_STRIP:
        return numPrims ? numPrims + 1 : 0;
    case TOP_LINE_LIST:
        return numPrims * 2;
    case TOP_LINE_LOOP:
        return numPrims;
    case TOP_RECT_LIST:
        return numPrims * 3;
    case TOP_LINE_LIST_ADJ:
        return numPrims * 4;
    case TOP_LISTSTRIP_ADJ:
        return numPrims ? numPrims + 3 : 0;
    case TOP_TRI_LIST_ADJ:
        return numPrims * 6;
    case TOP_TRI_STRIP_ADJ:
        return numPrims ? (numPrims + 2) * 2 : 0;

    case TOP_PATCHLIST_1:
    case TOP_PATCHLIST_2:
    case TOP_PATCHLIST_3:
    case TOP_PATCHLIST_4:
    case TOP_PATCHLIST_5:
    case TOP_PATCHLIST_6:
    case TOP_PATCHLIST_7:
    case TOP_PATCHLIST_8:
    case TOP_PATCHLIST_9:
    case TOP_PATCHLIST_10:
    case TOP_PATCHLIST_11:
    case TOP_PATCHLIST_12:
    case TOP_PATCHLIST_13:
    case TOP_PATCHLIST_14:
    case TOP_PATCHLIST_15:
    case TOP_PATCHLIST_16:
    case TOP_PATCHLIST_17:
    case TOP_PATCHLIST_18:
    case TOP_PATCHLIST_19:
    case TOP_PATCHLIST_20:
    case TOP_PATCHLIST_21:
    case TOP_PATCHLIST_22:
    case TOP_PATCHLIST_23:
    case TOP_PATCHLIST_24:
    case TOP_PATCHLIST_25:
    case TOP_PATCHLIST_26:
    case TOP_PATCHLIST_27:
    case TOP_PATCHLIST_28:
    case TOP_PATCHLIST_29:
    case TOP_PATCHLIST_30:
    case TOP_PATCHLIST_31:
    case TOP_PATCHLIST_32:
        return numPrims * (mode - TOP_PATCHLIST_BASE);

    case TOP_POLYGON:
    case TOP_POINT_LIST_BF:
    case TOP_LINE_STRIP_CONT:
    case TOP_LINE_STRIP_BF:
    case TOP_LINE_STRIP_CONT_BF:
    case TOP_TRIANGLE_FAN_NOSTIPPLE:
    case TOP_TRI_STRIP_REVERSE:
    case TOP_PATCHLIST_BASE:
    case TOP_UNKNOWN:
        SWR_INVALID("Unsupported topology: %d", mode);
        return 0;
    }

    return 0;
}
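// Note (illustrative, not in the original sources): GetNumVerts is the inverse
// used to prune partial primitives on the non-indexed path in ProcessDraw below,
// e.g. GetNumVerts(TOP_TRIANGLE_LIST, GetNumPrims(TOP_TRIANGLE_LIST, 11)) == 9,
// dropping the two trailing vertices that cannot form a complete triangle.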
//////////////////////////////////////////////////////////////////////////
/// @brief Return number of verts per primitive.
/// @param topology - topology
/// @param includeAdjVerts - include adjacent verts in primitive vertices
uint32_t NumVertsPerPrim(PRIMITIVE_TOPOLOGY topology, bool includeAdjVerts)
{
    uint32_t numVerts = 0;
    switch (topology)
    {
    case TOP_POINT_LIST:
    case TOP_POINT_LIST_BF:
        numVerts = 1;
        break;
    case TOP_LINE_LIST:
    case TOP_LINE_STRIP:
    case TOP_LINE_LIST_ADJ:
    case TOP_LINE_LOOP:
    case TOP_LINE_STRIP_CONT:
    case TOP_LINE_STRIP_BF:
    case TOP_LISTSTRIP_ADJ:
        numVerts = 2;
        break;
    case TOP_TRIANGLE_LIST:
    case TOP_TRIANGLE_STRIP:
    case TOP_TRIANGLE_FAN:
    case TOP_TRI_LIST_ADJ:
    case TOP_TRI_STRIP_ADJ:
    case TOP_TRI_STRIP_REVERSE:
    case TOP_RECT_LIST:
        numVerts = 3;
        break;
    case TOP_QUAD_LIST:
    case TOP_QUAD_STRIP:
        numVerts = 4;
        break;
    case TOP_PATCHLIST_1:
    case TOP_PATCHLIST_2:
    case TOP_PATCHLIST_3:
    case TOP_PATCHLIST_4:
    case TOP_PATCHLIST_5:
    case TOP_PATCHLIST_6:
    case TOP_PATCHLIST_7:
    case TOP_PATCHLIST_8:
    case TOP_PATCHLIST_9:
    case TOP_PATCHLIST_10:
    case TOP_PATCHLIST_11:
    case TOP_PATCHLIST_12:
    case TOP_PATCHLIST_13:
    case TOP_PATCHLIST_14:
    case TOP_PATCHLIST_15:
    case TOP_PATCHLIST_16:
    case TOP_PATCHLIST_17:
    case TOP_PATCHLIST_18:
    case TOP_PATCHLIST_19:
    case TOP_PATCHLIST_20:
    case TOP_PATCHLIST_21:
    case TOP_PATCHLIST_22:
    case TOP_PATCHLIST_23:
    case TOP_PATCHLIST_24:
    case TOP_PATCHLIST_25:
    case TOP_PATCHLIST_26:
    case TOP_PATCHLIST_27:
    case TOP_PATCHLIST_28:
    case TOP_PATCHLIST_29:
    case TOP_PATCHLIST_30:
    case TOP_PATCHLIST_31:
    case TOP_PATCHLIST_32:
        numVerts = topology - TOP_PATCHLIST_BASE;
        break;
    default:
        SWR_INVALID("Unsupported topology: %d", topology);
        break;
    }

    if (includeAdjVerts)
    {
        switch (topology)
        {
        case TOP_LISTSTRIP_ADJ:
        case TOP_LINE_LIST_ADJ:
            numVerts = 4;
            break;
        case TOP_TRI_STRIP_ADJ:
        case TOP_TRI_LIST_ADJ:
            numVerts = 6;
            break;
        default:
            break;
        }
    }

    return numVerts;
}
//////////////////////////////////////////////////////////////////////////
/// @brief Generate mask from remaining work.
/// @param numItemsRemaining - Number of items being worked on by a SIMD.
static INLINE simdscalari GenerateMask(uint32_t numItemsRemaining)
{
    uint32_t numActive =
        (numItemsRemaining >= KNOB_SIMD_WIDTH) ? KNOB_SIMD_WIDTH : numItemsRemaining;
    uint32_t mask = (numActive > 0) ? ((1 << numActive) - 1) : 0;
    return _simd_castps_si(_simd_vmask_ps(mask));
}

static INLINE simd16scalari GenerateMask16(uint32_t numItemsRemaining)
{
    uint32_t numActive =
        (numItemsRemaining >= KNOB_SIMD16_WIDTH) ? KNOB_SIMD16_WIDTH : numItemsRemaining;
    uint32_t mask = (numActive > 0) ? ((1 << numActive) - 1) : 0;
    return _simd16_castps_si(_simd16_vmask_ps(mask));
}
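// Worked example (illustrative, not in the original sources): with a
// KNOB_SIMD_WIDTH of 8, GenerateMask(3) computes numActive = 3 and
// mask = (1 << 3) - 1 = 0b00000111, enabling only the low three lanes for a
// final, partially filled SIMD iteration; GenerateMask(11) clamps numActive to 8
// and enables all lanes.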
//////////////////////////////////////////////////////////////////////////
/// @brief StreamOut - Streams vertex data out to SO buffers.
///        Generally, we are only streaming out a SIMD's worth of triangles.
/// @param pDC - pointer to draw context.
/// @param workerId - thread's worker id. Every thread has a unique id.
/// @param numPrims - Number of prims to streamout (e.g. points, lines, tris)
static void StreamOut(
    DRAW_CONTEXT* pDC, PA_STATE& pa, uint32_t workerId, uint32_t* pPrimData, uint32_t streamIndex)
{
    RDTSC_BEGIN(pDC->pContext->pBucketMgr, FEStreamout, pDC->drawId);

    void* pWorkerData = pDC->pContext->threadPool.pThreadData[workerId].pWorkerPrivateData;

    const API_STATE& state = GetApiState(pDC);
    const SWR_STREAMOUT_STATE& soState = state.soState;

    uint32_t soVertsPerPrim = NumVertsPerPrim(pa.binTopology, false);

    // The pPrimData buffer is sparse in that we allocate memory for all 32 attributes for each
    // vertex.
    uint32_t primDataDwordVertexStride = (SWR_VTX_NUM_SLOTS * sizeof(float) * 4) / sizeof(uint32_t);

    SWR_STREAMOUT_CONTEXT soContext = {0};

    // Setup buffer state pointers.
    for (uint32_t i = 0; i < 4; ++i)
    {
        soContext.pBuffer[i] = &state.soBuffer[i];
    }

    uint32_t numPrims = pa.NumPrims();

    for (uint32_t primIndex = 0; primIndex < numPrims; ++primIndex)
    {
        unsigned long slot   = 0;
        uint64_t      soMask = soState.streamMasks[streamIndex];

        // Write all entries into primitive data buffer for SOS.
        while (_BitScanForward64(&slot, soMask))
        {
            simd4scalar attrib[MAX_NUM_VERTS_PER_PRIM]; // prim attribs (always 4 wide)
            uint32_t    paSlot = slot + soState.vertexAttribOffset[streamIndex];
            pa.AssembleSingle(paSlot, primIndex, attrib);

            // Attribute offset is relative offset from start of vertex.
            // Note that attributes start at slot 1 in the PA buffer. We need to write this
            // to prim data starting at slot 0. Which is why we do (slot - 1).
            // Also note: GL works slightly differently, and needs slot 0
            uint32_t primDataAttribOffset = slot * sizeof(float) * 4 / sizeof(uint32_t);

            // Store each vertex's attrib at appropriate locations in pPrimData buffer.
            for (uint32_t v = 0; v < soVertsPerPrim; ++v)
            {
                uint32_t* pPrimDataAttrib =
                    pPrimData + primDataAttribOffset + (v * primDataDwordVertexStride);

                _mm_store_ps((float*)pPrimDataAttrib, attrib[v]);
            }

            soMask &= ~(uint64_t(1) << slot);
        }

        // Update pPrimData pointer
        soContext.pPrimData = pPrimData;

        // Call SOS
        SWR_ASSERT(state.pfnSoFunc[streamIndex] != nullptr,
                   "Trying to execute uninitialized streamout jit function.");
        state.pfnSoFunc[streamIndex](GetPrivateState(pDC), pWorkerData, soContext);
    }

    // Update SO write offset. The driver provides memory for the update.
    for (uint32_t i = 0; i < 4; ++i)
    {
        if (state.soBuffer[i].pWriteOffset)
        {
            bool  nullTileAccessed = false;
            void* pWriteOffset     = pDC->pContext->pfnTranslateGfxptrForWrite(
                GetPrivateState(pDC), soContext.pBuffer[i]->pWriteOffset, &nullTileAccessed, pWorkerData);
            *((uint32_t*)pWriteOffset) = soContext.pBuffer[i]->streamOffset * sizeof(uint32_t);
        }

        if (state.soBuffer[i].soWriteEnable)
        {
            pDC->dynState.SoWriteOffset[i] = soContext.pBuffer[i]->streamOffset * sizeof(uint32_t);
            pDC->dynState.SoWriteOffsetDirty[i] = true;
        }
    }

    UPDATE_STAT_FE(SoPrimStorageNeeded[streamIndex], soContext.numPrimStorageNeeded);
    UPDATE_STAT_FE(SoNumPrimsWritten[streamIndex], soContext.numPrimsWritten);

    RDTSC_END(pDC->pContext->pBucketMgr, FEStreamout, 1);
}
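// Layout note (illustrative, not in the original sources): pPrimData is "sparse"
// in that every vertex reserves all SWR_VTX_NUM_SLOTS four-float attribute slots,
// so primDataDwordVertexStride is SWR_VTX_NUM_SLOTS * 4 dwords, and vertex v of a
// primitive stores attribute slot s at pPrimData[s * 4 + v * primDataDwordVertexStride].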
#if USE_SIMD16_FRONTEND
//////////////////////////////////////////////////////////////////////////
/// Is value an even number (a multiple of two)
///
template <typename T>
INLINE static bool IsEven(T value)
{
    return (value & 1) == 0;
}

//////////////////////////////////////////////////////////////////////////
/// Round up value to an even number (a multiple of two)
///
template <typename T>
INLINE static T RoundUpEven(T value)
{
    return (value + 1) & ~1;
}

//////////////////////////////////////////////////////////////////////////
/// Round down value to an even number (a multiple of two)
///
template <typename T>
INLINE static T RoundDownEven(T value)
{
    return value & ~1;
}
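// Worked examples (illustrative, not in the original sources):
// RoundUpEven(7) == (7 + 1) & ~1 == 8, RoundUpEven(8) == 8, and
// RoundDownEven(7) == 7 & ~1 == 6. The tessellation path below uses RoundUpEven
// to pad the simd8 DS output so it can be re-read as simd16 vectors.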
//////////////////////////////////////////////////////////////////////////
/// Pack pairs of simdvertexes into simd16vertexes, assume non-overlapping
///
/// vertexCount is in terms of the source simdvertexes and must be even
///
/// attribCount will limit the vector copies to those attribs specified
///
/// note: the stride between vertexes is determined by SWR_VTX_NUM_SLOTS
///
void PackPairsOfSimdVertexIntoSimd16Vertex(simd16vertex* vertex_simd16,
                                           const simdvertex* vertex,
                                           uint32_t vertexCount,
                                           uint32_t attribCount)
{
    SWR_ASSERT(vertex);
    SWR_ASSERT(vertex_simd16);
    SWR_ASSERT(attribCount <= SWR_VTX_NUM_SLOTS);

    simd16vertex temp;

    for (uint32_t i = 0; i < vertexCount; i += 2)
    {
        for (uint32_t j = 0; j < attribCount; j += 1)
        {
            for (uint32_t k = 0; k < 4; k += 1)
            {
                temp.attrib[j][k] =
                    _simd16_insert_ps(_simd16_setzero_ps(), vertex[i].attrib[j][k], 0);

                if ((i + 1) < vertexCount)
                {
                    temp.attrib[j][k] =
                        _simd16_insert_ps(temp.attrib[j][k], vertex[i + 1].attrib[j][k], 1);
                }
            }
        }

        for (uint32_t j = 0; j < attribCount; j += 1)
        {
            vertex_simd16[i >> 1].attrib[j] = temp.attrib[j];
        }
    }
}
#endif
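// Layout note (illustrative, not in the original sources): each outer iteration
// packs source simdvertex i into the low half (insert index 0) and simdvertex
// i + 1 into the high half (insert index 1) of simd16vertex i >> 1, one attribute
// component at a time; an odd trailing vertex leaves the high half zeroed from
// the _simd16_setzero_ps() seed.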
//////////////////////////////////////////////////////////////////////////
/// @brief Computes number of invocations. The current index represents
///        the start of the SIMD. The max index represents how many work
///        items are remaining. If there is less than a SIMD's worth of work
///        then return the remaining amount of work.
/// @param curIndex - The start index for the SIMD.
/// @param maxIndex - The last index for all work items.
static INLINE uint32_t GetNumInvocations(uint32_t curIndex, uint32_t maxIndex)
{
    uint32_t remainder = (maxIndex - curIndex);
#if USE_SIMD16_FRONTEND
    return (remainder >= KNOB_SIMD16_WIDTH) ? KNOB_SIMD16_WIDTH : remainder;
#else
    return (remainder >= KNOB_SIMD_WIDTH) ? KNOB_SIMD_WIDTH : remainder;
#endif
}
//////////////////////////////////////////////////////////////////////////
/// @brief Converts a streamId buffer to a cut buffer for the given stream id.
///        The geometry shader will loop over each active streamout buffer, assembling
///        primitives for the downstream stages. When multistream output is enabled,
///        the generated stream ID buffer from the GS needs to be converted to a cut
///        buffer for the primitive assembler.
/// @param stream - stream id to generate the cut buffer for
/// @param pStreamIdBase - pointer to the stream ID buffer
/// @param numEmittedVerts - Number of total verts emitted by the GS
/// @param pCutBuffer - output buffer to write cuts to
void ProcessStreamIdBuffer(uint32_t stream,
                           uint8_t* pStreamIdBase,
                           uint32_t numEmittedVerts,
                           uint8_t* pCutBuffer)
{
    SWR_ASSERT(stream < MAX_SO_STREAMS);

    uint32_t numInputBytes  = AlignUp(numEmittedVerts * 2, 8) / 8;
    uint32_t numOutputBytes = AlignUp(numEmittedVerts, 8) / 8;

    for (uint32_t b = 0; b < numOutputBytes; ++b)
    {
        uint8_t curInputByte = pStreamIdBase[2 * b];
        uint8_t outByte      = 0;
        for (uint32_t i = 0; i < 4; ++i)
        {
            if ((curInputByte & 0x3) != stream)
            {
                outByte |= (1 << i);
            }
            curInputByte >>= 2;
        }

        curInputByte = pStreamIdBase[2 * b + 1];
        for (uint32_t i = 0; i < 4; ++i)
        {
            if ((curInputByte & 0x3) != stream)
            {
                outByte |= (1 << (i + 4));
            }
            curInputByte >>= 2;
        }

        *pCutBuffer++ = outByte;
    }
}
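// Worked example (illustrative, not in the original sources): stream IDs occupy
// two bits per emitted vertex, so each output cut byte (one bit per vertex)
// consumes two input bytes. Any vertex whose two-bit stream ID differs from
// `stream` gets its cut bit set, which the downstream primitive assembler treats
// as a cut, skipping vertices that were emitted to other streams.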
// Buffers that are allocated if GS is enabled
struct GsBuffers
{
    uint8_t* pGsIn;
    uint8_t* pGsOut[KNOB_SIMD_WIDTH];
    uint8_t* pGsTransposed;
    void*    pStreamCutBuffer;
};
//////////////////////////////////////////////////////////////////////////
/// @brief Transposes GS output from SOA to AOS to feed the primitive assembler
/// @param pDst - Destination buffer in AOS form for the current SIMD width, fed into the primitive
///               assembler
/// @param pSrc - Buffer of vertices in SOA form written by the geometry shader
/// @param numVerts - Number of vertices outputted by the GS
/// @param numAttribs - Number of attributes per vertex
template <typename SIMD_T, uint32_t SimdWidth>
void TransposeSOAtoAOS(uint8_t* pDst, uint8_t* pSrc, uint32_t numVerts, uint32_t numAttribs)
{
    uint32_t srcVertexStride = numAttribs * sizeof(float) * 4;
    uint32_t dstVertexStride = numAttribs * sizeof(Float<SIMD_T>) * 4;

    OSALIGNSIMD16(uint32_t) gatherOffsets[SimdWidth];

    for (uint32_t i = 0; i < SimdWidth; ++i)
    {
        gatherOffsets[i] = srcVertexStride * i;
    }
    auto vGatherOffsets = SIMD_T::load_si((Integer<SIMD_T>*)&gatherOffsets[0]);

    uint32_t numSimd        = AlignUp(numVerts, SimdWidth) / SimdWidth;
    uint32_t remainingVerts = numVerts;

    for (uint32_t s = 0; s < numSimd; ++s)
    {
        uint8_t* pSrcBase = pSrc + s * srcVertexStride * SimdWidth;
        uint8_t* pDstBase = pDst + s * dstVertexStride;

        // Compute mask to prevent src overflow
        uint32_t mask = std::min(remainingVerts, SimdWidth);
        mask          = GenMask(mask);
        auto vMask    = SIMD_T::vmask_ps(mask);
        auto viMask   = SIMD_T::castps_si(vMask);

        for (uint32_t a = 0; a < numAttribs; ++a)
        {
            auto attribGatherX = SIMD_T::mask_i32gather_ps(
                SIMD_T::setzero_ps(), (const float*)pSrcBase, vGatherOffsets, vMask);
            auto attribGatherY = SIMD_T::mask_i32gather_ps(SIMD_T::setzero_ps(),
                                                           (const float*)(pSrcBase + sizeof(float)),
                                                           vGatherOffsets,
                                                           vMask);
            auto attribGatherZ =
                SIMD_T::mask_i32gather_ps(SIMD_T::setzero_ps(),
                                          (const float*)(pSrcBase + sizeof(float) * 2),
                                          vGatherOffsets,
                                          vMask);
            auto attribGatherW =
                SIMD_T::mask_i32gather_ps(SIMD_T::setzero_ps(),
                                          (const float*)(pSrcBase + sizeof(float) * 3),
                                          vGatherOffsets,
                                          vMask);

            SIMD_T::maskstore_ps((float*)pDstBase, viMask, attribGatherX);
            SIMD_T::maskstore_ps((float*)(pDstBase + sizeof(Float<SIMD_T>)), viMask, attribGatherY);
            SIMD_T::maskstore_ps(
                (float*)(pDstBase + sizeof(Float<SIMD_T>) * 2), viMask, attribGatherZ);
            SIMD_T::maskstore_ps(
                (float*)(pDstBase + sizeof(Float<SIMD_T>) * 3), viMask, attribGatherW);

            pSrcBase += sizeof(float) * 4;
            pDstBase += sizeof(Float<SIMD_T>) * 4;
        }
        remainingVerts -= SimdWidth;
    }
}
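// Stride note (illustrative, not in the original sources): the SOA source packs
// one vertex per srcVertexStride = numAttribs * 16 bytes, while the AOS
// destination interleaves SimdWidth vertices per attribute component, hence
// dstVertexStride = numAttribs * sizeof(Float<SIMD_T>) * 4. Each masked gather
// reads one component of the same attribute from SimdWidth consecutive source
// vertices, and the masked stores drop lanes past numVerts on the final batch.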
//////////////////////////////////////////////////////////////////////////
/// @brief Implements GS stage.
/// @param pDC - pointer to draw context.
/// @param workerId - thread's worker id. Every thread has a unique id.
/// @param pa - The primitive assembly object.
/// @param pGsBuffers - GS input/output buffers for this draw.
template <typename HasStreamOutT, typename HasRastT>
static void GeometryShaderStage(DRAW_CONTEXT* pDC,
                                uint32_t workerId,
                                PA_STATE& pa,
                                GsBuffers* pGsBuffers,
                                uint32_t* pSoPrimData,
#if USE_SIMD16_FRONTEND
                                uint32_t numPrims_simd8,
#endif
                                simdscalari const& primID)
{
    RDTSC_BEGIN(pDC->pContext->pBucketMgr, FEGeometryShader, pDC->drawId);

    void* pWorkerData = pDC->pContext->threadPool.pThreadData[workerId].pWorkerPrivateData;

    const API_STATE&    state  = GetApiState(pDC);
    const SWR_GS_STATE* pState = &state.gsState;
    SWR_GS_CONTEXT      gsContext;

    static uint8_t sNullBuffer[128] = {0};

    for (uint32_t i = 0; i < KNOB_SIMD_WIDTH; ++i)
    {
        gsContext.pStreams[i] = pGsBuffers->pGsOut[i];
    }
    gsContext.pVerts      = (simdvector*)pGsBuffers->pGsIn;
    gsContext.PrimitiveID = primID;

    uint32_t   numVertsPerPrim = NumVertsPerPrim(pa.binTopology, true);
    simdvector attrib[MAX_NUM_VERTS_PER_PRIM];

    // assemble all attributes for the input primitive
    gsContext.inputVertStride = pState->inputVertStride;
    for (uint32_t slot = 0; slot < pState->numInputAttribs; ++slot)
    {
        uint32_t attribOffset = slot + pState->vertexAttribOffset;
        pa.Assemble(attribOffset, attrib);

        for (uint32_t i = 0; i < numVertsPerPrim; ++i)
        {
            gsContext.pVerts[attribOffset + pState->inputVertStride * i] = attrib[i];
        }
    }

    // record valid prims from the frontend to avoid over binning the newly generated
    // prims from the GS
#if USE_SIMD16_FRONTEND
    uint32_t numInputPrims = numPrims_simd8;
#else
    uint32_t numInputPrims = pa.NumPrims();
#endif

    for (uint32_t instance = 0; instance < pState->instanceCount; ++instance)
    {
        gsContext.InstanceID = instance;
        gsContext.mask       = GenerateMask(numInputPrims);

        // execute the geometry shader
        state.pfnGsFunc(GetPrivateState(pDC), pWorkerData, &gsContext);
        AR_EVENT(GSStats((HANDLE)&gsContext.stats));

        for (uint32_t i = 0; i < KNOB_SIMD_WIDTH; ++i)
        {
            gsContext.pStreams[i] += pState->allocationSize;
        }
    }

    // set up new binner and state for the GS output topology
#if USE_SIMD16_FRONTEND
    PFN_PROCESS_PRIMS_SIMD16 pfnClipFunc = nullptr;
    if (HasRastT::value)
    {
        switch (pState->outputTopology)
        {
        case TOP_RECT_LIST:
            pfnClipFunc = ClipRectangles_simd16;
            break;
        case TOP_TRIANGLE_STRIP:
            pfnClipFunc = ClipTriangles_simd16;
            break;
        case TOP_LINE_STRIP:
            pfnClipFunc = ClipLines_simd16;
            break;
        case TOP_POINT_LIST:
            pfnClipFunc = ClipPoints_simd16;
            break;
        default:
            SWR_INVALID("Unexpected GS output topology: %d", pState->outputTopology);
        }
    }
#else
    PFN_PROCESS_PRIMS pfnClipFunc = nullptr;
    if (HasRastT::value)
    {
        switch (pState->outputTopology)
        {
        case TOP_RECT_LIST:
            pfnClipFunc = ClipRectangles;
            break;
        case TOP_TRIANGLE_STRIP:
            pfnClipFunc = ClipTriangles;
            break;
        case TOP_LINE_STRIP:
            pfnClipFunc = ClipLines;
            break;
        case TOP_POINT_LIST:
            pfnClipFunc = ClipPoints;
            break;
        default:
            SWR_INVALID("Unexpected GS output topology: %d", pState->outputTopology);
        }
    }
#endif

    // foreach input prim:
    // - setup a new PA based on the emitted verts for that prim
    // - loop over the new verts, calling PA to assemble each prim
    uint32_t* pPrimitiveId = (uint32_t*)&primID;

    uint32_t totalPrimsGenerated = 0;
    for (uint32_t inputPrim = 0; inputPrim < numInputPrims; ++inputPrim)
    {
        uint8_t* pInstanceBase = (uint8_t*)pGsBuffers->pGsOut[inputPrim];

        // Vertex count is either emitted by shader or static
        uint32_t vertexCount = 0;
        if (pState->staticVertexCount)
        {
            vertexCount = pState->staticVertexCount;
        }
        else
        {
            // If emitted in shader, it should be stored in the first dword of the output buffer
            vertexCount = *(uint32_t*)pInstanceBase;
        }

        for (uint32_t instance = 0; instance < pState->instanceCount; ++instance)
        {
            uint32_t numEmittedVerts = vertexCount;
            if (numEmittedVerts == 0)
            {
                continue;
            }

            uint8_t* pBase = pInstanceBase + instance * pState->allocationSize;
            uint8_t* pCutBase =
                pState->controlDataSize == 0 ? &sNullBuffer[0] : pBase + pState->controlDataOffset;
            uint8_t* pVertexBaseAOS = pBase + pState->outputVertexOffset;

#if USE_SIMD16_FRONTEND
            TransposeSOAtoAOS<SIMD512, KNOB_SIMD16_WIDTH>((uint8_t*)pGsBuffers->pGsTransposed,
                                                          pVertexBaseAOS,
                                                          vertexCount,
                                                          pState->outputVertexSize);
#else
            TransposeSOAtoAOS<SIMD256, KNOB_SIMD_WIDTH>((uint8_t*)pGsBuffers->pGsTransposed,
                                                        pVertexBaseAOS,
                                                        vertexCount,
                                                        pState->outputVertexSize);
#endif

            uint32_t numAttribs = state.feNumAttributes;

            for (uint32_t stream = 0; stream < MAX_SO_STREAMS; ++stream)
            {
                bool     processCutVerts = false;
                uint8_t* pCutBuffer      = pCutBase;

                // assign default stream ID, only relevant when GS is outputting a single stream
                uint32_t streamID = 0;
                if (pState->isSingleStream)
                {
                    processCutVerts = true;
                    streamID        = pState->singleStreamID;
                    if (streamID != stream)
                        continue;
                }
                else
                {
                    // early exit if this stream is not enabled for streamout
                    if (HasStreamOutT::value && !state.soState.streamEnable[stream])
                    {
                        continue;
                    }

                    // multi-stream output, need to translate StreamID buffer to a cut buffer
                    ProcessStreamIdBuffer(
                        stream, pCutBase, numEmittedVerts, (uint8_t*)pGsBuffers->pStreamCutBuffer);
                    pCutBuffer      = (uint8_t*)pGsBuffers->pStreamCutBuffer;
                    processCutVerts = false;
                }

#if USE_SIMD16_FRONTEND
                PA_STATE_CUT gsPa(pDC,
                                  (uint8_t*)pGsBuffers->pGsTransposed,
                                  numEmittedVerts,
                                  pState->outputVertexSize,
                                  reinterpret_cast<simd16mask*>(pCutBuffer),
                                  numEmittedVerts,
                                  numAttribs,
                                  pState->outputTopology,
                                  processCutVerts,
                                  pa.numVertsPerPrim);
#else
                PA_STATE_CUT gsPa(pDC,
                                  (uint8_t*)pGsBuffers->pGsTransposed,
                                  numEmittedVerts,
                                  pState->outputVertexSize,
                                  pCutBuffer,
                                  numEmittedVerts,
                                  numAttribs,
                                  pState->outputTopology,
                                  processCutVerts,
                                  pa.numVertsPerPrim);
#endif
                while (gsPa.GetNextStreamOutput())
                {
                    do
                    {
#if USE_SIMD16_FRONTEND
                        simd16vector attrib_simd16[3];

                        bool assemble = gsPa.Assemble(VERTEX_POSITION_SLOT, attrib_simd16);
#else
                        bool assemble = gsPa.Assemble(VERTEX_POSITION_SLOT, attrib);
#endif
                        if (assemble)
                        {
                            totalPrimsGenerated += gsPa.NumPrims();

                            if (HasStreamOutT::value)
                            {
#if ENABLE_AVX512_SIMD16
                                gsPa.useAlternateOffset = false;
#endif
                                StreamOut(pDC, gsPa, workerId, pSoPrimData, stream);
                            }

                            if (HasRastT::value && state.soState.streamToRasterizer == stream)
                            {
#if USE_SIMD16_FRONTEND
                                simd16scalari vPrimId = _simd16_set1_epi32(pPrimitiveId[inputPrim]);

                                // Gather data from the SGV if provided.
                                simd16scalari vViewportIdx = SIMD16::setzero_si();
                                simd16scalari vRtIdx       = SIMD16::setzero_si();
                                SIMD16::Vec4  svgAttrib[4];

                                if (state.backendState.readViewportArrayIndex ||
                                    state.backendState.readRenderTargetArrayIndex)
                                {
                                    gsPa.Assemble(VERTEX_SGV_SLOT, svgAttrib);
                                }

                                if (state.backendState.readViewportArrayIndex)
                                {
                                    vViewportIdx =
                                        SIMD16::castps_si(svgAttrib[0][VERTEX_SGV_VAI_COMP]);
                                    gsPa.viewportArrayActive = true;
                                }
                                if (state.backendState.readRenderTargetArrayIndex)
                                {
                                    vRtIdx = SIMD16::castps_si(svgAttrib[0][VERTEX_SGV_RTAI_COMP]);
                                    gsPa.rtArrayActive = true;
                                }

                                {
                                    // OOB VPAI indices => forced to zero.
                                    vViewportIdx =
                                        SIMD16::max_epi32(vViewportIdx, SIMD16::setzero_si());
                                    simd16scalari vNumViewports =
                                        SIMD16::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
                                    simd16scalari vClearMask =
                                        SIMD16::cmplt_epi32(vViewportIdx, vNumViewports);
                                    vViewportIdx = SIMD16::and_si(vClearMask, vViewportIdx);

                                    gsPa.useAlternateOffset = false;
                                    pfnClipFunc(pDC,
                                                gsPa,
                                                workerId,
                                                attrib_simd16,
                                                GenMask(gsPa.NumPrims()),
                                                vPrimId,
                                                vViewportIdx,
                                                vRtIdx);
                                }
#else
                                simdscalari vPrimId = _simd_set1_epi32(pPrimitiveId[inputPrim]);

                                // Gather data from the SGV if provided.
                                simdscalari vViewportIdx = SIMD::setzero_si();
                                simdscalari vRtIdx       = SIMD::setzero_si();
                                SIMD::Vec4  svgAttrib[4];

                                if (state.backendState.readViewportArrayIndex ||
                                    state.backendState.readRenderTargetArrayIndex)
                                {
                                    gsPa.Assemble(VERTEX_SGV_SLOT, svgAttrib);
                                }

                                if (state.backendState.readViewportArrayIndex)
                                {
                                    vViewportIdx =
                                        SIMD::castps_si(svgAttrib[0][VERTEX_SGV_VAI_COMP]);

                                    // OOB VPAI indices => forced to zero.
                                    vViewportIdx =
                                        SIMD::max_epi32(vViewportIdx, SIMD::setzero_si());
                                    simdscalari vNumViewports =
                                        SIMD::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
                                    simdscalari vClearMask =
                                        SIMD::cmplt_epi32(vViewportIdx, vNumViewports);
                                    vViewportIdx = SIMD::and_si(vClearMask, vViewportIdx);
                                    gsPa.viewportArrayActive = true;
                                }
                                if (state.backendState.readRenderTargetArrayIndex)
                                {
                                    vRtIdx = SIMD::castps_si(svgAttrib[0][VERTEX_SGV_RTAI_COMP]);
                                    gsPa.rtArrayActive = true;
                                }

                                pfnClipFunc(pDC,
                                            gsPa,
                                            workerId,
                                            attrib,
                                            GenMask(gsPa.NumPrims()),
                                            vPrimId,
                                            vViewportIdx,
                                            vRtIdx);
#endif
                            }
                        }
                    } while (gsPa.NextPrim());
                }
            }
        }
    }

    // update GS pipeline stats
    UPDATE_STAT_FE(GsInvocations, numInputPrims * pState->instanceCount);
    UPDATE_STAT_FE(GsPrimitives, totalPrimsGenerated);
    AR_EVENT(GSPrimInfo(numInputPrims, totalPrimsGenerated, numVertsPerPrim * numInputPrims));
    RDTSC_END(pDC->pContext->pBucketMgr, FEGeometryShader, 1);
}
//////////////////////////////////////////////////////////////////////////
/// @brief Allocate GS buffers
/// @param pDC - pointer to draw context.
/// @param state - API state
/// @param vertsPerPrim - number of verts per input primitive
/// @param pGsBuffers - GS buffer allocations to fill in
template <typename SIMD_T, uint32_t SIMD_WIDTH>
static INLINE void AllocateGsBuffers(DRAW_CONTEXT* pDC,
                                     const API_STATE& state,
                                     uint32_t vertsPerPrim,
                                     GsBuffers* pGsBuffers)
{
    auto pArena = pDC->pArena;
    SWR_ASSERT(pArena != nullptr);
    SWR_ASSERT(state.gsState.gsEnable);

    const SWR_GS_STATE& gsState = state.gsState;

    // Allocate storage for vertex inputs
    uint32_t vertexInBufferSize = gsState.inputVertStride * sizeof(simdvector) * vertsPerPrim;
    pGsBuffers->pGsIn = (uint8_t*)pArena->AllocAligned(vertexInBufferSize, 32);

    // Allocate arena space to hold GS output verts
    const uint32_t vertexBufferSize = gsState.instanceCount * gsState.allocationSize;

    for (uint32_t i = 0; i < KNOB_SIMD_WIDTH; ++i)
    {
        pGsBuffers->pGsOut[i] = (uint8_t*)pArena->AllocAligned(vertexBufferSize, 32);
    }

    // Allocate storage for transposed GS output
    uint32_t numSimdBatches = AlignUp(gsState.maxNumVerts, SIMD_WIDTH) / SIMD_WIDTH;
    uint32_t transposedBufferSize =
        numSimdBatches * gsState.outputVertexSize * sizeof(Vec4<SIMD_T>);
    pGsBuffers->pGsTransposed = (uint8_t*)pArena->AllocAligned(transposedBufferSize, 32);

    // Allocate storage to hold temporary stream->cut buffer, if necessary
    if (state.gsState.isSingleStream)
    {
        pGsBuffers->pStreamCutBuffer = nullptr;
    }
    else
    {
        pGsBuffers->pStreamCutBuffer =
            (uint8_t*)pArena->AllocAligned(AlignUp(gsState.maxNumVerts * 2, 32), 32);
    }
}
//////////////////////////////////////////////////////////////////////////
/// @brief Contains all data generated by the HS and passed to the
///        tessellator and DS.
struct TessellationThreadLocalData
{
    SWR_HS_CONTEXT hsContext;
    void*          pTxCtx;
    size_t         tsCtxSize;

    uint8_t* pHSOutput;
    size_t   hsOutputAllocSize;

    simdscalar* pDSOutput;
    size_t      dsOutputAllocSize;
};

THREAD TessellationThreadLocalData* gt_pTessellationThreadData = nullptr;

//////////////////////////////////////////////////////////////////////////
/// @brief Allocate tessellation data for this worker thread.
///
static void AllocateTessellationData(SWR_CONTEXT* pContext)
{
    /// @TODO - Don't use thread local storage. Use Worker local storage instead.
    if (gt_pTessellationThreadData == nullptr)
    {
        gt_pTessellationThreadData =
            (TessellationThreadLocalData*)AlignedMalloc(sizeof(TessellationThreadLocalData), 64);
        memset(gt_pTessellationThreadData, 0, sizeof(*gt_pTessellationThreadData));
    }
}
//////////////////////////////////////////////////////////////////////////
/// @brief Implements Tessellation Stages.
/// @param pDC - pointer to draw context.
/// @param workerId - thread's worker id. Every thread has a unique id.
/// @param pa - The primitive assembly object.
/// @param pGsBuffers - GS input/output buffers for this draw.
template <typename HasGeometryShaderT, typename HasStreamOutT, typename HasRastT>
static void TessellationStages(DRAW_CONTEXT* pDC,
                               uint32_t workerId,
                               PA_STATE& pa,
                               GsBuffers* pGsBuffers,
                               uint32_t* pSoPrimData,
#if USE_SIMD16_FRONTEND
                               uint32_t numPrims_simd8,
#endif
                               simdscalari const& primID)
{
    const API_STATE&    state   = GetApiState(pDC);
    const SWR_TS_STATE& tsState = state.tsState;
    void* pWorkerData = pDC->pContext->threadPool.pThreadData[workerId].pWorkerPrivateData;

    SWR_ASSERT(gt_pTessellationThreadData);

    HANDLE tsCtx = TSInitCtx(tsState.domain,
                             tsState.partitioning,
                             tsState.tsOutputTopology,
                             gt_pTessellationThreadData->pTxCtx,
                             gt_pTessellationThreadData->tsCtxSize);
    if (tsCtx == nullptr)
    {
        gt_pTessellationThreadData->pTxCtx =
            AlignedMalloc(gt_pTessellationThreadData->tsCtxSize, 64);
        tsCtx = TSInitCtx(tsState.domain,
                          tsState.partitioning,
                          tsState.tsOutputTopology,
                          gt_pTessellationThreadData->pTxCtx,
                          gt_pTessellationThreadData->tsCtxSize);
    }
    SWR_ASSERT(tsCtx);

#if USE_SIMD16_FRONTEND
    PFN_PROCESS_PRIMS_SIMD16 pfnClipFunc = nullptr;
    if (HasRastT::value)
    {
        switch (tsState.postDSTopology)
        {
        case TOP_TRIANGLE_LIST:
            pfnClipFunc = ClipTriangles_simd16;
            break;
        case TOP_LINE_LIST:
            pfnClipFunc = ClipLines_simd16;
            break;
        case TOP_POINT_LIST:
            pfnClipFunc = ClipPoints_simd16;
            break;
        default:
            SWR_INVALID("Unexpected DS output topology: %d", tsState.postDSTopology);
        }
    }
#else
    PFN_PROCESS_PRIMS pfnClipFunc = nullptr;
    if (HasRastT::value)
    {
        switch (tsState.postDSTopology)
        {
        case TOP_TRIANGLE_LIST:
            pfnClipFunc = ClipTriangles;
            break;
        case TOP_LINE_LIST:
            pfnClipFunc = ClipLines;
            break;
        case TOP_POINT_LIST:
            pfnClipFunc = ClipPoints;
            break;
        default:
            SWR_INVALID("Unexpected DS output topology: %d", tsState.postDSTopology);
        }
    }
#endif

    SWR_HS_CONTEXT& hsContext = gt_pTessellationThreadData->hsContext;
    hsContext.PrimitiveID = primID;
    hsContext.outputSize  = tsState.hsAllocationSize;

    uint32_t numVertsPerPrim = NumVertsPerPrim(pa.binTopology, false);
    // Max storage for one attribute for an entire simdprimitive
    simdvector simdattrib[MAX_NUM_VERTS_PER_PRIM];

    // assemble all attributes for the input primitives
    for (uint32_t slot = 0; slot < tsState.numHsInputAttribs; ++slot)
    {
        uint32_t attribSlot = tsState.srcVertexAttribOffset + slot;
        pa.Assemble(attribSlot, simdattrib);

        for (uint32_t i = 0; i < numVertsPerPrim; ++i)
        {
            hsContext.vert[i].attrib[tsState.vertexAttribOffset + slot] = simdattrib[i];
        }
    }

    // Allocate HS output storage
    uint32_t requiredAllocSize = KNOB_SIMD_WIDTH * tsState.hsAllocationSize;

    if (requiredAllocSize > gt_pTessellationThreadData->hsOutputAllocSize)
    {
        AlignedFree(gt_pTessellationThreadData->pHSOutput);
        gt_pTessellationThreadData->pHSOutput = (uint8_t*)AlignedMalloc(requiredAllocSize, 64);
        gt_pTessellationThreadData->hsOutputAllocSize = requiredAllocSize;
    }

    hsContext.pCPout = (ScalarPatch*)gt_pTessellationThreadData->pHSOutput;

#if defined(_DEBUG)
    //memset(hsContext.pCPout, 0x90, sizeof(ScalarPatch) * KNOB_SIMD_WIDTH);
#endif

#if USE_SIMD16_FRONTEND
    uint32_t numPrims = numPrims_simd8;
#else
    uint32_t numPrims = pa.NumPrims();
#endif
    hsContext.mask = GenerateMask(numPrims);

    // Run the HS
    RDTSC_BEGIN(pDC->pContext->pBucketMgr, FEHullShader, pDC->drawId);
    state.pfnHsFunc(GetPrivateState(pDC), pWorkerData, &hsContext);
    RDTSC_END(pDC->pContext->pBucketMgr, FEHullShader, 0);

    UPDATE_STAT_FE(HsInvocations, numPrims);
    AR_EVENT(HSStats((HANDLE)&hsContext.stats));

    const uint32_t* pPrimId = (const uint32_t*)&primID;

    for (uint32_t p = 0; p < numPrims; ++p)
    {
        ScalarPatch* pCPout =
            (ScalarPatch*)(gt_pTessellationThreadData->pHSOutput + tsState.hsAllocationSize * p);

        SWR_TESSELLATION_FACTORS tessFactors;
        tessFactors = hsContext.pCPout[p].tessFactors;

        // Run Tessellator
        SWR_TS_TESSELLATED_DATA tsData = {0};
        RDTSC_BEGIN(pDC->pContext->pBucketMgr, FETessellation, pDC->drawId);
        TSTessellate(tsCtx, tessFactors, tsData);
        AR_EVENT(TessPrimCount(1));
        RDTSC_END(pDC->pContext->pBucketMgr, FETessellation, 0);

        if (tsData.NumPrimitives == 0)
        {
            continue;
        }
        SWR_ASSERT(tsData.NumDomainPoints);

        // Allocate DS Output memory
        uint32_t requiredDSVectorInvocations =
            AlignUp(tsData.NumDomainPoints, KNOB_SIMD_WIDTH) / KNOB_SIMD_WIDTH;
#if USE_SIMD16_FRONTEND
        size_t requiredAllocSize = sizeof(simdvector) * RoundUpEven(requiredDSVectorInvocations) *
                                   tsState.dsAllocationSize; // simd8 -> simd16, padding
#else
        size_t requiredDSOutputVectors = requiredDSVectorInvocations * tsState.dsAllocationSize;
        size_t requiredAllocSize       = sizeof(simdvector) * requiredDSOutputVectors;
#endif
        if (requiredAllocSize > gt_pTessellationThreadData->dsOutputAllocSize)
        {
            AlignedFree(gt_pTessellationThreadData->pDSOutput);
            gt_pTessellationThreadData->pDSOutput =
                (simdscalar*)AlignedMalloc(requiredAllocSize, 64);
            gt_pTessellationThreadData->dsOutputAllocSize = requiredAllocSize;
        }
        SWR_ASSERT(gt_pTessellationThreadData->pDSOutput);
        SWR_ASSERT(gt_pTessellationThreadData->dsOutputAllocSize >= requiredAllocSize);

#if defined(_DEBUG)
        memset(gt_pTessellationThreadData->pDSOutput, 0x90, requiredAllocSize);
#endif

        // Run Domain Shader
        SWR_DS_CONTEXT dsContext;
        dsContext.PrimitiveID           = pPrimId[p];
        dsContext.pCpIn                 = pCPout;
        dsContext.pDomainU              = (simdscalar*)tsData.pDomainPointsU;
        dsContext.pDomainV              = (simdscalar*)tsData.pDomainPointsV;
        dsContext.pOutputData           = gt_pTessellationThreadData->pDSOutput;
        dsContext.outVertexAttribOffset = tsState.dsOutVtxAttribOffset;
#if USE_SIMD16_FRONTEND
        dsContext.vectorStride = RoundUpEven(requiredDSVectorInvocations); // simd8 -> simd16
#else
        dsContext.vectorStride = requiredDSVectorInvocations;
#endif

        uint32_t dsInvocations = 0;

        for (dsContext.vectorOffset = 0; dsContext.vectorOffset < requiredDSVectorInvocations;
             ++dsContext.vectorOffset)
        {
            dsContext.mask = GenerateMask(tsData.NumDomainPoints - dsInvocations);

            RDTSC_BEGIN(pDC->pContext->pBucketMgr, FEDomainShader, pDC->drawId);
            state.pfnDsFunc(GetPrivateState(pDC), pWorkerData, &dsContext);
            RDTSC_END(pDC->pContext->pBucketMgr, FEDomainShader, 0);

            AR_EVENT(DSStats((HANDLE)&dsContext.stats));

            dsInvocations += KNOB_SIMD_WIDTH;
        }
        UPDATE_STAT_FE(DsInvocations, tsData.NumDomainPoints);

#if USE_SIMD16_FRONTEND
        SWR_ASSERT(IsEven(dsContext.vectorStride)); // simd8 -> simd16

#endif
        PA_TESS tessPa(
            pDC,
#if USE_SIMD16_FRONTEND
            reinterpret_cast<const simd16scalar*>(dsContext.pOutputData), // simd8 -> simd16
            dsContext.vectorStride / 2,                                   // simd8 -> simd16
#else
            dsContext.pOutputData,
            dsContext.vectorStride,
#endif
            SWR_VTX_NUM_SLOTS,
            tsState.numDsOutputAttribs + tsState.dsOutVtxAttribOffset,
            tsData.ppIndices,
            tsData.NumPrimitives,
            tsState.postDSTopology,
            NumVertsPerPrim(tsState.postDSTopology, false));

        while (tessPa.HasWork())
        {
#if USE_SIMD16_FRONTEND
            const uint32_t numPrims    = tessPa.NumPrims();
            const uint32_t numPrims_lo = std::min<uint32_t>(numPrims, KNOB_SIMD_WIDTH);
            const uint32_t numPrims_hi =
                std::max<uint32_t>(numPrims, KNOB_SIMD_WIDTH) - KNOB_SIMD_WIDTH;

            const simd16scalari primID    = _simd16_set1_epi32(dsContext.PrimitiveID);
            const simdscalari   primID_lo = _simd16_extract_si(primID, 0);
            const simdscalari   primID_hi = _simd16_extract_si(primID, 1);

#endif
            if (HasGeometryShaderT::value)
            {
#if USE_SIMD16_FRONTEND
                tessPa.useAlternateOffset = false;
                GeometryShaderStage<HasStreamOutT, HasRastT>(
                    pDC, workerId, tessPa, pGsBuffers, pSoPrimData, numPrims_lo, primID_lo);

                if (numPrims_hi)
                {
                    tessPa.useAlternateOffset = true;
                    GeometryShaderStage<HasStreamOutT, HasRastT>(
                        pDC, workerId, tessPa, pGsBuffers, pSoPrimData, numPrims_hi, primID_hi);
                }
#else
                GeometryShaderStage<HasStreamOutT, HasRastT>(
                    pDC,
                    workerId,
                    tessPa,
                    pGsBuffers,
                    pSoPrimData,
                    _simd_set1_epi32(dsContext.PrimitiveID));
#endif
            }
            else
            {
                if (HasStreamOutT::value)
                {
#if ENABLE_AVX512_SIMD16
                    tessPa.useAlternateOffset = false;
#endif
                    StreamOut(pDC, tessPa, workerId, pSoPrimData, 0);
                }

                if (HasRastT::value)
                {
#if USE_SIMD16_FRONTEND
                    simd16vector prim_simd16[3]; // Only deal with triangles, lines, or points
#else
                    simdvector prim[3]; // Only deal with triangles, lines, or points
#endif
                    RDTSC_BEGIN(pDC->pContext->pBucketMgr, FEPAAssemble, pDC->drawId);
                    bool assemble =
#if USE_SIMD16_FRONTEND
                        tessPa.Assemble(VERTEX_POSITION_SLOT, prim_simd16);
#else
                        tessPa.Assemble(VERTEX_POSITION_SLOT, prim);
#endif
                    RDTSC_END(pDC->pContext->pBucketMgr, FEPAAssemble, 1);
                    SWR_ASSERT(assemble);

                    SWR_ASSERT(pfnClipFunc);
#if USE_SIMD16_FRONTEND
                    // Gather data from the SGV if provided.
                    simd16scalari vViewportIdx = SIMD16::setzero_si();
                    simd16scalari vRtIdx       = SIMD16::setzero_si();
                    SIMD16::Vec4  svgAttrib[4];

                    if (state.backendState.readViewportArrayIndex ||
                        state.backendState.readRenderTargetArrayIndex)
                    {
                        tessPa.Assemble(VERTEX_SGV_SLOT, svgAttrib);
                    }

                    if (state.backendState.readViewportArrayIndex)
                    {
                        vViewportIdx = SIMD16::castps_si(svgAttrib[0][VERTEX_SGV_VAI_COMP]);
                        tessPa.viewportArrayActive = true;
                    }
                    if (state.backendState.readRenderTargetArrayIndex)
                    {
                        vRtIdx = SIMD16::castps_si(svgAttrib[0][VERTEX_SGV_RTAI_COMP]);
                        tessPa.rtArrayActive = true;
                    }

                    {
                        // OOB VPAI indices => forced to zero.
                        vViewportIdx = SIMD16::max_epi32(vViewportIdx, SIMD16::setzero_si());
                        simd16scalari vNumViewports =
                            SIMD16::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
                        simd16scalari vClearMask = SIMD16::cmplt_epi32(vViewportIdx, vNumViewports);
                        vViewportIdx = SIMD16::and_si(vClearMask, vViewportIdx);

                        tessPa.useAlternateOffset = false;
                        pfnClipFunc(pDC,
                                    tessPa,
                                    workerId,
                                    prim_simd16,
                                    GenMask(tessPa.NumPrims()),
                                    _simd16_set1_epi32(dsContext.PrimitiveID),
                                    vViewportIdx,
                                    vRtIdx);
                    }
#else
                    // Gather data from the SGV if provided.
                    simdscalari vViewportIdx = SIMD::setzero_si();
                    simdscalari vRtIdx       = SIMD::setzero_si();
                    SIMD::Vec4  svgAttrib[4];

                    if (state.backendState.readViewportArrayIndex ||
                        state.backendState.readRenderTargetArrayIndex)
                    {
                        tessPa.Assemble(VERTEX_SGV_SLOT, svgAttrib);
                    }

                    if (state.backendState.readViewportArrayIndex)
                    {
                        vViewportIdx = SIMD::castps_si(svgAttrib[0][VERTEX_SGV_VAI_COMP]);

                        // OOB VPAI indices => forced to zero.
                        vViewportIdx = SIMD::max_epi32(vViewportIdx, SIMD::setzero_si());
                        simdscalari vNumViewports = SIMD::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
                        simdscalari vClearMask = SIMD::cmplt_epi32(vViewportIdx, vNumViewports);
                        vViewportIdx = SIMD::and_si(vClearMask, vViewportIdx);
                        tessPa.viewportArrayActive = true;
                    }
                    if (state.backendState.readRenderTargetArrayIndex)
                    {
                        vRtIdx = SIMD::castps_si(svgAttrib[0][VERTEX_SGV_RTAI_COMP]);
                        tessPa.rtArrayActive = true;
                    }

                    pfnClipFunc(pDC,
                                tessPa,
                                workerId,
                                prim,
                                GenMask(tessPa.NumPrims()),
                                _simd_set1_epi32(dsContext.PrimitiveID),
                                vViewportIdx,
                                vRtIdx);
#endif
                }
            }

            tessPa.NextPrim();

        } // while (tessPa.HasWork())
    }     // for (uint32_t p = 0; p < numPrims; ++p)

#if USE_SIMD16_FRONTEND
    if (gt_pTessellationThreadData->pDSOutput != nullptr)
    {
        AlignedFree(gt_pTessellationThreadData->pDSOutput);
        gt_pTessellationThreadData->pDSOutput = nullptr;
    }
    gt_pTessellationThreadData->dsOutputAllocSize = 0;

#endif
    TSDestroyCtx(tsCtx);
}
THREAD PA_STATE::SIMDVERTEX* gpVertexStore    = nullptr;
THREAD uint32_t              gVertexStoreSize = 0;
1644 //////////////////////////////////////////////////////////////////////////
1645 /// @brief FE handler for SwrDraw.
1646 /// @tparam IsIndexedT - Is indexed drawing enabled
1647 /// @tparam HasTessellationT - Is tessellation enabled
1648 /// @tparam HasGeometryShaderT::value - Is the geometry shader stage enabled
1649 /// @tparam HasStreamOutT - Is stream-out enabled
1650 /// @tparam HasRastT - Is rasterization enabled
1651 /// @param pContext - pointer to SWR context.
1652 /// @param pDC - pointer to draw context.
1653 /// @param workerId - thread's worker id.
1654 /// @param pUserData - Pointer to DRAW_WORK
1655 template <typename IsIndexedT
,
1656 typename IsCutIndexEnabledT
,
1657 typename HasTessellationT
,
1658 typename HasGeometryShaderT
,
1659 typename HasStreamOutT
,
1661 void ProcessDraw(SWR_CONTEXT
* pContext
, DRAW_CONTEXT
* pDC
, uint32_t workerId
, void* pUserData
)
1663 #if KNOB_ENABLE_TOSS_POINTS
1664 if (KNOB_TOSS_QUEUE_FE
)
1670 RDTSC_BEGIN(pContext
->pBucketMgr
, FEProcessDraw
, pDC
->drawId
);
1672 void* pWorkerData
= pContext
->threadPool
.pThreadData
[workerId
].pWorkerPrivateData
;
1674 DRAW_WORK
& work
= *(DRAW_WORK
*)pUserData
;
1675 const API_STATE
& state
= GetApiState(pDC
);
1677 uint32_t indexSize
= 0;
1678 uint32_t endVertex
= work
.numVerts
;
1680 gfxptr_t xpLastRequestedIndex
= 0;
1681 if (IsIndexedT::value
)
1686 indexSize
= sizeof(uint32_t);
1689 indexSize
= sizeof(uint16_t);
1692 indexSize
= sizeof(uint8_t);
1695 SWR_INVALID("Invalid work.type: %d", work
.type
);
1697 xpLastRequestedIndex
= work
.xpIB
+ endVertex
* indexSize
;
1701 // No cuts, prune partial primitives.
1702 endVertex
= GetNumVerts(state
.topology
, GetNumPrims(state
.topology
, work
.numVerts
));
1705 #if defined(KNOB_ENABLE_RDTSC) || defined(KNOB_ENABLE_AR)
1706 uint32_t numPrims
= GetNumPrims(state
.topology
, work
.numVerts
);
1709 GsBuffers gsBuffers
;
1710 if (HasGeometryShaderT::value
)
1712 #if USE_SIMD16_FRONTEND
1713 AllocateGsBuffers
<SIMD512
, KNOB_SIMD16_WIDTH
>(
1714 pDC
, state
, NumVertsPerPrim(state
.topology
, true), &gsBuffers
);
1716 AllocateGsBuffers
<SIMD256
, KNOB_SIMD_WIDTH
>(
1717 pDC
, state
, NumVertsPerPrim(state
.topology
, true), &gsBuffers
);
1721 if (HasTessellationT::value
)
1723 SWR_ASSERT(state
.tsState
.tsEnable
== true);
1724 SWR_ASSERT(state
.pfnHsFunc
!= nullptr);
1725 SWR_ASSERT(state
.pfnDsFunc
!= nullptr);
1727 AllocateTessellationData(pContext
);
1731 SWR_ASSERT(state
.tsState
.tsEnable
== false);
1732 SWR_ASSERT(state
.pfnHsFunc
== nullptr);
1733 SWR_ASSERT(state
.pfnDsFunc
== nullptr);
1736 // allocate space for streamout input prim data
1737 uint32_t* pSoPrimData
= nullptr;
1738 if (HasStreamOutT::value
)
1740 pSoPrimData
= (uint32_t*)pDC
->pArena
->AllocAligned(4096, 16);
1743 const uint32_t vertexCount
= NumVertsPerPrim(state
.topology
, true);
1744 #if USE_SIMD16_FRONTEND
1745 uint32_t simdVertexSizeBytes
= state
.frontendState
.vsVertexSize
* sizeof(simd16vector
);
1747 uint32_t simdVertexSizeBytes
= state
.frontendState
.vsVertexSize
* sizeof(simdvector
);
1750 SWR_ASSERT(vertexCount
<= MAX_NUM_VERTS_PER_PRIM
);
1752 // Compute storage requirements for vertex store
1753 // TODO: allocation needs to be rethought for better cut support
1754 uint32_t numVerts
= vertexCount
+ 2; // Need extra space for PA state machine
1755 uint32_t vertexStoreSize
= numVerts
* simdVertexSizeBytes
;
1757 // grow the vertex store for the PA as necessary
1758 if (gVertexStoreSize
< vertexStoreSize
)
1760 if (gpVertexStore
!= nullptr)
1762 AlignedFree(gpVertexStore
);
1763 gpVertexStore
= nullptr;
1766 SWR_ASSERT(gpVertexStore
== nullptr);
1768 gpVertexStore
= reinterpret_cast<PA_STATE::SIMDVERTEX
*>(AlignedMalloc(vertexStoreSize
, 64));
1769 gVertexStoreSize
= vertexStoreSize
;
1771 SWR_ASSERT(gpVertexStore
!= nullptr);
1774 // choose primitive assembler
1776 PA_FACTORY
<IsIndexedT
, IsCutIndexEnabledT
> paFactory(pDC
,
1781 state
.frontendState
.vsVertexSize
,
1782 GetNumVerts(state
.topology
, 1));
1783 PA_STATE
& pa
= paFactory
.GetPA();
1785 #if USE_SIMD16_FRONTEND
1786 #if USE_SIMD16_SHADERS
1792 SWR_VS_CONTEXT vsContext_lo
;
1793 SWR_VS_CONTEXT vsContext_hi
;
1795 #if USE_SIMD16_SHADERS
1796 vsContext_lo
.pVin
= reinterpret_cast<simdvertex
*>(&vin
);
1797 vsContext_hi
.pVin
= reinterpret_cast<simdvertex
*>(&vin
);
1799 vsContext_lo
.pVin
= &vin_lo
;
1800 vsContext_hi
.pVin
= &vin_hi
;
1802 vsContext_lo
.AlternateOffset
= 0;
1803 vsContext_hi
.AlternateOffset
= 1;
1805 SWR_FETCH_CONTEXT fetchInfo_lo
= {0};
1807 fetchInfo_lo
.pStreams
= &state
.vertexBuffers
[0];
1808 fetchInfo_lo
.StartInstance
= work
.startInstance
;
1809 fetchInfo_lo
.StartVertex
= 0;
1811 if (IsIndexedT::value
)
1813 fetchInfo_lo
.BaseVertex
= work
.baseVertex
;
1815 // if the entire index buffer isn't being consumed, set the last index
1816 // so that fetches < a SIMD wide will be masked off
1817 fetchInfo_lo
.xpLastIndex
= state
.indexBuffer
.xpIndices
+ state
.indexBuffer
.size
;
1818 if (xpLastRequestedIndex
< fetchInfo_lo
.xpLastIndex
)
1820 fetchInfo_lo
.xpLastIndex
= xpLastRequestedIndex
;
1825 fetchInfo_lo
.StartVertex
= work
.startVertex
;
1828 SWR_FETCH_CONTEXT fetchInfo_hi
= fetchInfo_lo
;
1830 const simd16scalari vScale
=
1831 _simd16_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
1833 for (uint32_t instanceNum
= 0; instanceNum
< work
.numInstances
; instanceNum
++)
1837 simd16scalari vIndex
;
1839 if (IsIndexedT::value
)
1841 fetchInfo_lo
.xpIndices
= work
.xpIB
;
1842 fetchInfo_hi
.xpIndices
=
1843 fetchInfo_lo
.xpIndices
+ KNOB_SIMD_WIDTH
* indexSize
; // 1/2 of KNOB_SIMD16_WIDTH
1847 vIndex
= _simd16_add_epi32(_simd16_set1_epi32(work
.startVertexID
), vScale
);
1849 fetchInfo_lo
.xpIndices
= pDC
->pContext
->pfnMakeGfxPtr(GetPrivateState(pDC
), &vIndex
);
1850 fetchInfo_hi
.xpIndices
= pDC
->pContext
->pfnMakeGfxPtr(
1851 GetPrivateState(pDC
),
1852 &vIndex
+ KNOB_SIMD_WIDTH
* sizeof(int32_t)); // 1/2 of KNOB_SIMD16_WIDTH
1855 fetchInfo_lo
.CurInstance
= instanceNum
;
1856 fetchInfo_hi
.CurInstance
= instanceNum
;
1858 vsContext_lo
.InstanceID
= instanceNum
;
1859 vsContext_hi
.InstanceID
= instanceNum
;
1861 while (pa
.HasWork())
1863 // GetNextVsOutput currently has the side effect of updating some PA state machine
1864 // state. So we need to keep this outside of (i < endVertex) check.
1866 simdmask
* pvCutIndices_lo
= nullptr;
1867 simdmask
* pvCutIndices_hi
= nullptr;
1869 if (IsIndexedT::value
)
1871 // simd16mask <=> simdmask[2]
1873 pvCutIndices_lo
= &reinterpret_cast<simdmask
*>(&pa
.GetNextVsIndices())[0];
1874 pvCutIndices_hi
= &reinterpret_cast<simdmask
*>(&pa
.GetNextVsIndices())[1];
1877 simd16vertex
& vout
= pa
.GetNextVsOutput();
1879 vsContext_lo
.pVout
= reinterpret_cast<simdvertex
*>(&vout
);
1880 vsContext_hi
.pVout
= reinterpret_cast<simdvertex
*>(&vout
);
1884 if (!IsIndexedT::value
)
1886 fetchInfo_lo
.xpLastIndex
= fetchInfo_lo
.xpIndices
;
1888 offset
= std::min(endVertex
- i
, (uint32_t)KNOB_SIMD16_WIDTH
);
1889 offset
*= 4; // convert from index to address
1890 #if USE_SIMD16_SHADERS
1891 fetchInfo_lo
.xpLastIndex
+= offset
;
1893 fetchInfo_lo
.xpLastIndex
+= std::min(offset
, (uint32_t)KNOB_SIMD_WIDTH
);
1895 std::min(offset
, (uint32_t)KNOB_SIMD16_WIDTH
) - KNOB_SIMD_WIDTH
;
1896 assert(offset
>= 0);
1897 fetchInfo_hi
.xpLastIndex
= fetchInfo_hi
.xpIndices
;
1898 fetchInfo_hi
.xpLastIndex
+= offset2
;
1901 // 1. Execute FS/VS for a single SIMD.
1902 RDTSC_BEGIN(pContext
->pBucketMgr
, FEFetchShader
, pDC
->drawId
);
1903 #if USE_SIMD16_SHADERS
1904 state
.pfnFetchFunc(GetPrivateState(pDC
), pWorkerData
, fetchInfo_lo
, vin
);
1906 state
.pfnFetchFunc(GetPrivateState(pDC
), pWorkerData
, fetchInfo_lo
, vin_lo
);
1908 if ((i
+ KNOB_SIMD_WIDTH
) < endVertex
) // 1/2 of KNOB_SIMD16_WIDTH
1910 state
.pfnFetchFunc(GetPrivateState(pDC
), pWorkerData
, fetchInfo_hi
, vin_hi
);
1913 RDTSC_END(pContext
->pBucketMgr
, FEFetchShader
, 0);
                // forward fetch-generated vertex IDs to the vertex shader
#if USE_SIMD16_SHADERS
#if USE_SIMD16_VS
                vsContext_lo.VertexID16 =
                    _simd16_insert_si(vsContext_lo.VertexID16, fetchInfo_lo.VertexID, 0);
                vsContext_lo.VertexID16 =
                    _simd16_insert_si(vsContext_lo.VertexID16, fetchInfo_lo.VertexID2, 1);
#else
                vsContext_lo.VertexID = fetchInfo_lo.VertexID;
                vsContext_hi.VertexID = fetchInfo_lo.VertexID2;
#endif
#else
                vsContext_lo.VertexID = fetchInfo_lo.VertexID;
                vsContext_hi.VertexID = fetchInfo_hi.VertexID;
#endif
                // Set up the active lane mask for the vertex shader.
#if USE_SIMD16_VS
                vsContext_lo.mask16 = GenerateMask16(endVertex - i);
#else
                vsContext_lo.mask = GenerateMask(endVertex - i);
                vsContext_hi.mask = GenerateMask(endVertex - (i + KNOB_SIMD_WIDTH));
#endif
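                // GenerateMask(n) presumably enables the low min(n, SIMD_WIDTH) lanes:
                // e.g. with 12 vertices left the lo mask enables all 8 lanes and the hi
                // mask enables lanes 0..3; with fewer than 8 left the hi shader is
                // skipped entirely by the (i + KNOB_SIMD_WIDTH) < endVertex guards.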
                // forward cut mask to the PA
                if (IsIndexedT::value)
                {
#if USE_SIMD16_SHADERS
                    *pvCutIndices_lo = _simd_movemask_ps(_simd_castsi_ps(fetchInfo_lo.CutMask));
                    *pvCutIndices_hi = _simd_movemask_ps(_simd_castsi_ps(fetchInfo_lo.CutMask2));
#else
                    *pvCutIndices_lo = _simd_movemask_ps(_simd_castsi_ps(fetchInfo_lo.CutMask));
                    *pvCutIndices_hi = _simd_movemask_ps(_simd_castsi_ps(fetchInfo_hi.CutMask));
#endif
                }
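                // movemask packs each 32-bit lane's sign bit into one bit, so every
                // simdmask holds an 8-bit "this lane is a cut index" flag set for the PA.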
                UPDATE_STAT_FE(IaVertices, GetNumInvocations(i, endVertex));

#if KNOB_ENABLE_TOSS_POINTS
                if (!KNOB_TOSS_FETCH)
#endif
                {
                    RDTSC_BEGIN(pContext->pBucketMgr, FEVertexShader, pDC->drawId);
#if USE_SIMD16_VS
                    state.pfnVertexFunc(GetPrivateState(pDC), pWorkerData, &vsContext_lo);
                    AR_EVENT(VSStats((HANDLE)&vsContext_lo.stats));
#else
                    state.pfnVertexFunc(GetPrivateState(pDC), pWorkerData, &vsContext_lo);
                    AR_EVENT(VSStats((HANDLE)&vsContext_lo.stats));

                    if ((i + KNOB_SIMD_WIDTH) < endVertex) // 1/2 of KNOB_SIMD16_WIDTH
                    {
                        state.pfnVertexFunc(GetPrivateState(pDC), pWorkerData, &vsContext_hi);
                        AR_EVENT(VSStats((HANDLE)&vsContext_hi.stats));
                    }
#endif
                    RDTSC_END(pContext->pBucketMgr, FEVertexShader, 0);

                    UPDATE_STAT_FE(VsInvocations, GetNumInvocations(i, endVertex));
                }
            }
            // 2. Assemble primitives from the last two SIMDs.
            do
            {
                simd16vector prim_simd16[MAX_NUM_VERTS_PER_PRIM];

                RDTSC_START(pContext->pBucketMgr, FEPAAssemble);
                bool assemble = pa.Assemble(VERTEX_POSITION_SLOT, prim_simd16);
                RDTSC_STOP(pContext->pBucketMgr, FEPAAssemble, 1, 0);

#if KNOB_ENABLE_TOSS_POINTS
                if (!KNOB_TOSS_FETCH)
#endif
                {
#if KNOB_ENABLE_TOSS_POINTS
                    if (!KNOB_TOSS_VS)
#endif
                    {
                        if (assemble)
                        {
                            UPDATE_STAT_FE(IaPrimitives, pa.NumPrims());

                            const uint32_t numPrims = pa.NumPrims();
                            const uint32_t numPrims_lo =
                                std::min<uint32_t>(numPrims, KNOB_SIMD_WIDTH);
                            const uint32_t numPrims_hi =
                                std::max<uint32_t>(numPrims, KNOB_SIMD_WIDTH) - KNOB_SIMD_WIDTH;

                            const simd16scalari primID    = pa.GetPrimID(work.startPrimID);
                            const simdscalari   primID_lo = _simd16_extract_si(primID, 0);
                            const simdscalari   primID_hi = _simd16_extract_si(primID, 1);
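                            // Split the up-to-16 assembled prims into two 8-wide batches:
                            // e.g. numPrims == 11 gives numPrims_lo == 8, numPrims_hi == 3,
                            // while numPrims == 6 gives 6 and 0 (the hi batch is skipped).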
                            if (HasTessellationT::value)
                            {
                                pa.useAlternateOffset = false;
                                TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>(
                                    pDC,
                                    workerId,
                                    pa,
                                    &gsBuffers,
                                    pSoPrimData,
                                    numPrims_lo,
                                    primID_lo);

                                if (numPrims_hi)
                                {
                                    pa.useAlternateOffset = true;
                                    TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>(
                                        pDC,
                                        workerId,
                                        pa,
                                        &gsBuffers,
                                        pSoPrimData,
                                        numPrims_hi,
                                        primID_hi);
                                }
                            }
                            else if (HasGeometryShaderT::value)
                            {
                                pa.useAlternateOffset = false;
                                GeometryShaderStage<HasStreamOutT, HasRastT>(pDC,
                                                                             workerId,
                                                                             pa,
                                                                             &gsBuffers,
                                                                             pSoPrimData,
                                                                             numPrims_lo,
                                                                             primID_lo);

                                if (numPrims_hi)
                                {
                                    pa.useAlternateOffset = true;
                                    GeometryShaderStage<HasStreamOutT, HasRastT>(pDC,
                                                                                 workerId,
                                                                                 pa,
                                                                                 &gsBuffers,
                                                                                 pSoPrimData,
                                                                                 numPrims_hi,
                                                                                 primID_hi);
                                }
                            }
                            else
                            {
                                // If streamout is enabled, stream vertices out to memory.
                                if (HasStreamOutT::value)
                                {
                                    pa.useAlternateOffset = false;
                                    StreamOut(pDC, pa, workerId, pSoPrimData, 0);
                                }
                                if (HasRastT::value)
                                {
                                    SWR_ASSERT(pDC->pState->pfnProcessPrims_simd16);

                                    // Gather data from the SGV if provided.
                                    simd16scalari vpai = SIMD16::setzero_si();
                                    simd16scalari rtai = SIMD16::setzero_si();
                                    SIMD16::Vec4  svgAttrib[4];

                                    if (state.backendState.readViewportArrayIndex ||
                                        state.backendState.readRenderTargetArrayIndex)
                                    {
                                        pa.Assemble(VERTEX_SGV_SLOT, svgAttrib);

                                        if (state.backendState.readViewportArrayIndex)
                                        {
                                            vpai = SIMD16::castps_si(
                                                svgAttrib[0][VERTEX_SGV_VAI_COMP]);
                                            pa.viewportArrayActive = true;
                                        }
                                        if (state.backendState.readRenderTargetArrayIndex)
                                        {
                                            rtai = SIMD16::castps_si(
                                                svgAttrib[0][VERTEX_SGV_RTAI_COMP]);
                                            pa.rtArrayActive = true;
                                        }
                                    }

                                    // OOB VPAI indices => forced to zero.
                                    vpai = SIMD16::max_epi32(vpai, SIMD16::setzero_si());
                                    simd16scalari vNumViewports =
                                        SIMD16::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
                                    simd16scalari vClearMask =
                                        SIMD16::cmplt_epi32(vpai, vNumViewports);
                                    vpai = SIMD16::and_si(vClearMask, vpai);
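                                    // The clamp works in two steps: max_epi32 forces
                                    // negative indices to 0, then any index that fails
                                    // cmplt against KNOB_NUM_VIEWPORTS_SCISSORS is
                                    // masked back to viewport 0.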
                                    pa.useAlternateOffset = false;
                                    pDC->pState->pfnProcessPrims_simd16(pDC,
                                                                        pa,
                                                                        workerId,
                                                                        &prim_simd16[0],
                                                                        GenMask(pa.NumPrims()),
                                                                        primID,
                                                                        vpai,
                                                                        rtai);
                                }
                            }
                        }
                    }
                }
            } while (pa.NextPrim());

            if (IsIndexedT::value)
            {
                fetchInfo_lo.xpIndices = fetchInfo_lo.xpIndices + KNOB_SIMD16_WIDTH * indexSize;
                fetchInfo_hi.xpIndices = fetchInfo_hi.xpIndices + KNOB_SIMD16_WIDTH * indexSize;
            }
            else
            {
                vIndex = _simd16_add_epi32(vIndex, _simd16_set1_epi32(KNOB_SIMD16_WIDTH));
            }

            i += KNOB_SIMD16_WIDTH;
        }

        pa.Reset();
    }
#else // !USE_SIMD16_FRONTEND
    SWR_VS_CONTEXT    vsContext;
    SWR_FETCH_CONTEXT fetchInfo = {0};
    fetchInfo.pStreams      = &state.vertexBuffers[0];
    fetchInfo.StartInstance = work.startInstance;
    fetchInfo.StartVertex   = 0;

    if (IsIndexedT::value)
    {
        fetchInfo.BaseVertex = work.baseVertex;

        // If the entire index buffer isn't being consumed, set the last index
        // so that fetches of less than a full SIMD width are masked off.
        fetchInfo.pLastIndex =
            (const int32_t*)(((uint8_t*)state.indexBuffer.pIndices) + state.indexBuffer.size);
        if (xpLastRequestedIndex < (gfxptr_t)fetchInfo.pLastIndex)
        {
            fetchInfo.pLastIndex = (const int32_t*)xpLastRequestedIndex;
        }
    }
    else
    {
        fetchInfo.StartVertex = work.startVertex;
    }
    const simdscalari vScale = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);

    /// @todo: temporarily move instance loop in the FE to ensure SO ordering
    for (uint32_t instanceNum = 0; instanceNum < work.numInstances; instanceNum++)
    {
        simdscalari vIndex;
        uint32_t    i = 0;

        if (IsIndexedT::value)
        {
            fetchInfo.pIndices = work.pIB;
        }
        else
        {
            vIndex             = _simd_add_epi32(_simd_set1_epi32(work.startVertexID), vScale);
            fetchInfo.pIndices = (const int32_t*)&vIndex;
        }

        fetchInfo.CurInstance = instanceNum;
        vsContext.InstanceID  = instanceNum;
        while (pa.HasWork())
        {
            // GetNextVsOutput currently has the side effect of updating some PA state
            // machine state, so it must stay outside of the (i < endVertex) check below.
            simdmask* pvCutIndices = nullptr;
            if (IsIndexedT::value)
            {
                pvCutIndices = &pa.GetNextVsIndices();
            }

            simdvertex& vout = pa.GetNextVsOutput();
            vsContext.pVin   = &vout;
            vsContext.pVout  = &vout;
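            // pVin and pVout alias the same buffer: the fetch shader fills vout with
            // fetched vertex data and the vertex shader then transforms it in place.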
            if (i < endVertex)
            {
                // 1. Execute FS/VS for a single SIMD.
                RDTSC_BEGIN(pContext->pBucketMgr, FEFetchShader, pDC->drawId);
                state.pfnFetchFunc(GetPrivateState(pDC), pWorkerData, fetchInfo, vout);
                RDTSC_END(pContext->pBucketMgr, FEFetchShader, 0);

                // forward fetch-generated vertex IDs to the vertex shader
                vsContext.VertexID = fetchInfo.VertexID;

                // Set up the active lane mask for the vertex shader.
                vsContext.mask = GenerateMask(endVertex - i);

                // forward cut mask to the PA
                if (IsIndexedT::value)
                {
                    *pvCutIndices = _simd_movemask_ps(_simd_castsi_ps(fetchInfo.CutMask));
                }

                UPDATE_STAT_FE(IaVertices, GetNumInvocations(i, endVertex));

#if KNOB_ENABLE_TOSS_POINTS
                if (!KNOB_TOSS_FETCH)
#endif
                {
                    RDTSC_BEGIN(pContext->pBucketMgr, FEVertexShader, pDC->drawId);
                    state.pfnVertexFunc(GetPrivateState(pDC), pWorkerData, &vsContext);
                    RDTSC_END(pContext->pBucketMgr, FEVertexShader, 0);

                    UPDATE_STAT_FE(VsInvocations, GetNumInvocations(i, endVertex));
                    AR_EVENT(VSStats((HANDLE)&vsContext.stats));
                }
            }
            // 2. Assemble primitives from the last two SIMDs.
            do
            {
                simdvector prim[MAX_NUM_VERTS_PER_PRIM];

                // PaAssemble returns false if there are not enough verts to assemble.
                RDTSC_BEGIN(pContext->pBucketMgr, FEPAAssemble, pDC->drawId);
                bool assemble = pa.Assemble(VERTEX_POSITION_SLOT, prim);
                RDTSC_END(pContext->pBucketMgr, FEPAAssemble, 1);

#if KNOB_ENABLE_TOSS_POINTS
                if (!KNOB_TOSS_FETCH)
#endif
                {
#if KNOB_ENABLE_TOSS_POINTS
                    if (!KNOB_TOSS_VS)
#endif
                    {
                        if (assemble)
                        {
                            UPDATE_STAT_FE(IaPrimitives, pa.NumPrims());

                            if (HasTessellationT::value)
                            {
                                TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>(
                                    pDC,
                                    workerId,
                                    pa,
                                    &gsBuffers,
                                    pSoPrimData,
                                    pa.GetPrimID(work.startPrimID));
                            }
                            else if (HasGeometryShaderT::value)
                            {
                                GeometryShaderStage<HasStreamOutT, HasRastT>(
                                    pDC,
                                    workerId,
                                    pa,
                                    &gsBuffers,
                                    pSoPrimData,
                                    pa.GetPrimID(work.startPrimID));
                            }
                            else
                            {
                                // If streamout is enabled, stream vertices out to memory.
                                if (HasStreamOutT::value)
                                {
                                    StreamOut(pDC, pa, workerId, pSoPrimData, 0);
                                }
                                if (HasRastT::value)
                                {
                                    SWR_ASSERT(pDC->pState->pfnProcessPrims);

                                    // Gather data from the SGV if provided.
                                    simdscalari vViewportIdx = SIMD::setzero_si();
                                    simdscalari vRtIdx       = SIMD::setzero_si();
                                    SIMD::Vec4  svgAttrib[4];

                                    if (state.backendState.readViewportArrayIndex ||
                                        state.backendState.readRenderTargetArrayIndex)
                                    {
                                        pa.Assemble(VERTEX_SGV_SLOT, svgAttrib);

                                        if (state.backendState.readViewportArrayIndex)
                                        {
                                            vViewportIdx =
                                                SIMD::castps_si(svgAttrib[0][VERTEX_SGV_VAI_COMP]);

                                            // OOB VPAI indices => forced to zero.
                                            vViewportIdx =
                                                SIMD::max_epi32(vViewportIdx, SIMD::setzero_si());
                                            simdscalari vNumViewports =
                                                SIMD::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
                                            simdscalari vClearMask =
                                                SIMD::cmplt_epi32(vViewportIdx, vNumViewports);
                                            vViewportIdx = SIMD::and_si(vClearMask, vViewportIdx);
                                            pa.viewportArrayActive = true;
                                        }
                                        if (state.backendState.readRenderTargetArrayIndex)
                                        {
                                            vRtIdx =
                                                SIMD::castps_si(svgAttrib[0][VERTEX_SGV_RTAI_COMP]);
                                            pa.rtArrayActive = true;
                                        }
                                    }

                                    pDC->pState->pfnProcessPrims(pDC,
                                                                 pa,
                                                                 workerId,
                                                                 prim,
                                                                 GenMask(pa.NumPrims()),
                                                                 pa.GetPrimID(work.startPrimID),
                                                                 vViewportIdx,
                                                                 vRtIdx);
                                }
                            }
                        }
                    }
                }
            } while (pa.NextPrim());
            if (IsIndexedT::value)
            {
                fetchInfo.pIndices =
                    (int*)((uint8_t*)fetchInfo.pIndices + KNOB_SIMD_WIDTH * indexSize);
            }
            else
            {
                vIndex = _simd_add_epi32(vIndex, _simd_set1_epi32(KNOB_SIMD_WIDTH));
            }

            i += KNOB_SIMD_WIDTH;
        }

        pa.Reset();
    }
#endif

    RDTSC_END(pContext->pBucketMgr, FEProcessDraw, numPrims * work.numInstances);
}

struct FEDrawChooser
{
    typedef PFN_FE_WORK_FUNC FuncType;

    template <typename... ArgsB>
    static FuncType GetFunc()
    {
        return ProcessDraw<ArgsB...>;
    }
};
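
// A sketch of the dispatch, assuming TemplateArgUnroller maps each runtime bool
// onto std::true_type / std::false_type template arguments: a call such as
//
//     GetProcessDrawFunc(true, false, false, false, false, true);
//
// would resolve to ProcessDraw<std::true_type, std::false_type, std::false_type,
// std::false_type, std::false_type, std::true_type>, i.e. an indexed,
// rasterized draw with no cut index, tessellation, GS, or streamout.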

// Selector for correct templated Draw front-end function
PFN_FE_WORK_FUNC GetProcessDrawFunc(bool IsIndexed,
                                    bool IsCutIndexEnabled,
                                    bool HasTessellation,
                                    bool HasGeometryShader,
                                    bool HasStreamOut,
                                    bool HasRasterization)
{
    return TemplateArgUnroller<FEDrawChooser>::GetFunc(IsIndexed,