/****************************************************************************
 * Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * @brief Implementation for Frontend which handles vertex processing,
 *        primitive assembly, clipping, binning, etc.
 *
 ******************************************************************************/
#include "rdtsc_core.h"
#include "tessellator.h"

//////////////////////////////////////////////////////////////////////////
/// @brief FE handler for SwrSync.
/// @param pContext - pointer to SWR context.
/// @param pDC - pointer to draw context.
/// @param workerId - thread's worker id. Each thread has a unique id.
/// @param pUserData - Pointer to user data passed back to sync callback.
/// @todo This should go away when we switch this to use compute threading.
void ProcessSync(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t workerId, void* pUserData)
{
    BE_WORK work;
    work.type    = SYNC;
    work.pfnWork = ProcessSyncBE;

    MacroTileMgr* pTileMgr = pDC->pTileMgr;
    pTileMgr->enqueue(0, 0, &work);
}

//////////////////////////////////////////////////////////////////////////
/// @brief FE handler for SwrDestroyContext.
/// @param pContext - pointer to SWR context.
/// @param pDC - pointer to draw context.
/// @param workerId - thread's worker id. Each thread has a unique id.
/// @param pUserData - Pointer to user data passed back to sync callback.
void ProcessShutdown(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t workerId, void* pUserData)
{
    BE_WORK work;
    work.type    = SHUTDOWN;
    work.pfnWork = ProcessShutdownBE;

    MacroTileMgr* pTileMgr = pDC->pTileMgr;
    // Enqueue at least 1 work item for each worker thread
    // account for number of numa nodes
    uint32_t numNumaNodes = pContext->threadPool.numaMask + 1;

    for (uint32_t i = 0; i < pContext->threadPool.numThreads; ++i)
    {
        for (uint32_t n = 0; n < numNumaNodes; ++n)
        {
            pTileMgr->enqueue(i, n, &work);
        }
    }
}

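// Fan-out example (numbers illustrative, not taken from this file): with
// threadPool.numThreads == 8 and threadPool.numaMask == 1 (two NUMA nodes),
// the nested loops above enqueue 8 * 2 == 16 shutdown work items, so every
// worker thread on every node sees at least one.
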
//////////////////////////////////////////////////////////////////////////
/// @brief FE handler for SwrClearRenderTarget.
/// @param pContext - pointer to SWR context.
/// @param pDC - pointer to draw context.
/// @param workerId - thread's worker id. Each thread has a unique id.
/// @param pUserData - Pointer to user data passed back to clear callback.
/// @todo This should go away when we switch this to use compute threading.
void ProcessClear(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t workerId, void* pUserData)
{
    CLEAR_DESC*   pDesc    = (CLEAR_DESC*)pUserData;
    MacroTileMgr* pTileMgr = pDC->pTileMgr;

    // queue a clear to each macro tile
    // compute macro tile bounds for the specified rect
    uint32_t macroTileXMin = pDesc->rect.xmin / KNOB_MACROTILE_X_DIM;
    uint32_t macroTileXMax = (pDesc->rect.xmax - 1) / KNOB_MACROTILE_X_DIM;
    uint32_t macroTileYMin = pDesc->rect.ymin / KNOB_MACROTILE_Y_DIM;
    uint32_t macroTileYMax = (pDesc->rect.ymax - 1) / KNOB_MACROTILE_Y_DIM;

    BE_WORK work;
    work.type       = CLEAR;
    work.pfnWork    = ProcessClearBE;
    work.desc.clear = *pDesc;

    for (uint32_t y = macroTileYMin; y <= macroTileYMax; ++y)
    {
        for (uint32_t x = macroTileXMin; x <= macroTileXMax; ++x)
        {
            pTileMgr->enqueue(x, y, &work);
        }
    }
}

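// Worked example of the bounds math above, assuming 32x32 macrotiles purely
// for illustration (the real dimensions come from KNOB_MACROTILE_X_DIM/Y_DIM):
// a clear rect of [8, 96) x [0, 32) yields macroTileXMin = 8 / 32 = 0,
// macroTileXMax = (96 - 1) / 32 = 2, and macroTileYMin = macroTileYMax = 0,
// so the clear is enqueued to macrotiles (0,0), (1,0), and (2,0). The
// "xmax - 1" keeps an exclusive right edge from spilling into an extra tile.
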
//////////////////////////////////////////////////////////////////////////
/// @brief FE handler for SwrStoreTiles.
/// @param pContext - pointer to SWR context.
/// @param pDC - pointer to draw context.
/// @param workerId - thread's worker id. Each thread has a unique id.
/// @param pUserData - Pointer to user data passed back to callback.
/// @todo This should go away when we switch this to use compute threading.
void ProcessStoreTiles(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t workerId, void* pUserData)
{
    RDTSC_BEGIN(pContext->pBucketMgr, FEProcessStoreTiles, pDC->drawId);
    MacroTileMgr*     pTileMgr = pDC->pTileMgr;
    STORE_TILES_DESC* pDesc    = (STORE_TILES_DESC*)pUserData;

    // queue a store to each macro tile
    // compute macro tile bounds for the specified rect
    uint32_t macroTileXMin = pDesc->rect.xmin / KNOB_MACROTILE_X_DIM;
    uint32_t macroTileXMax = (pDesc->rect.xmax - 1) / KNOB_MACROTILE_X_DIM;
    uint32_t macroTileYMin = pDesc->rect.ymin / KNOB_MACROTILE_Y_DIM;
    uint32_t macroTileYMax = (pDesc->rect.ymax - 1) / KNOB_MACROTILE_Y_DIM;

    BE_WORK work;
    work.type            = STORETILES;
    work.pfnWork         = ProcessStoreTilesBE;
    work.desc.storeTiles = *pDesc;

    for (uint32_t y = macroTileYMin; y <= macroTileYMax; ++y)
    {
        for (uint32_t x = macroTileXMin; x <= macroTileXMax; ++x)
        {
            pTileMgr->enqueue(x, y, &work);
        }
    }

    RDTSC_END(pContext->pBucketMgr, FEProcessStoreTiles, 0);
}

//////////////////////////////////////////////////////////////////////////
/// @brief FE handler for SwrInvalidateTiles.
/// @param pContext - pointer to SWR context.
/// @param pDC - pointer to draw context.
/// @param workerId - thread's worker id. Each thread has a unique id.
/// @param pUserData - Pointer to user data passed back to callback.
/// @todo This should go away when we switch this to use compute threading.
void ProcessDiscardInvalidateTiles(SWR_CONTEXT* pContext,
                                   DRAW_CONTEXT* pDC,
                                   uint32_t workerId,
                                   void* pUserData)
{
    RDTSC_BEGIN(pContext->pBucketMgr, FEProcessInvalidateTiles, pDC->drawId);
    DISCARD_INVALIDATE_TILES_DESC* pDesc    = (DISCARD_INVALIDATE_TILES_DESC*)pUserData;
    MacroTileMgr*                  pTileMgr = pDC->pTileMgr;

    // compute macro tile bounds for the specified rect
    uint32_t macroTileXMin = (pDesc->rect.xmin + KNOB_MACROTILE_X_DIM - 1) / KNOB_MACROTILE_X_DIM;
    uint32_t macroTileXMax = (pDesc->rect.xmax / KNOB_MACROTILE_X_DIM) - 1;
    uint32_t macroTileYMin = (pDesc->rect.ymin + KNOB_MACROTILE_Y_DIM - 1) / KNOB_MACROTILE_Y_DIM;
    uint32_t macroTileYMax = (pDesc->rect.ymax / KNOB_MACROTILE_Y_DIM) - 1;

    if (pDesc->fullTilesOnly == false)
    {
        // include partial tiles
        macroTileXMin = pDesc->rect.xmin / KNOB_MACROTILE_X_DIM;
        macroTileXMax = (pDesc->rect.xmax - 1) / KNOB_MACROTILE_X_DIM;
        macroTileYMin = pDesc->rect.ymin / KNOB_MACROTILE_Y_DIM;
        macroTileYMax = (pDesc->rect.ymax - 1) / KNOB_MACROTILE_Y_DIM;
    }

    SWR_ASSERT(macroTileXMax <= KNOB_NUM_HOT_TILES_X);
    SWR_ASSERT(macroTileYMax <= KNOB_NUM_HOT_TILES_Y);

    macroTileXMax = std::min<int32_t>(macroTileXMax, KNOB_NUM_HOT_TILES_X);
    macroTileYMax = std::min<int32_t>(macroTileYMax, KNOB_NUM_HOT_TILES_Y);

    BE_WORK work;
    work.type                        = DISCARDINVALIDATETILES;
    work.pfnWork                     = ProcessDiscardInvalidateTilesBE;
    work.desc.discardInvalidateTiles = *pDesc;

    for (uint32_t x = macroTileXMin; x <= macroTileXMax; ++x)
    {
        for (uint32_t y = macroTileYMin; y <= macroTileYMax; ++y)
        {
            pTileMgr->enqueue(x, y, &work);
        }
    }

    RDTSC_END(pContext->pBucketMgr, FEProcessInvalidateTiles, 0);
}

//////////////////////////////////////////////////////////////////////////
/// @brief Computes the number of primitives given the number of verts.
/// @param mode - primitive topology for draw operation.
/// @param numPrims - number of vertices or indices for draw.
/// @todo Frontend needs to be refactored. This will go in appropriate place then.
uint32_t GetNumPrims(PRIMITIVE_TOPOLOGY mode, uint32_t numPrims)
{
    switch (mode)
    {
    case TOP_POINT_LIST:
        return numPrims;
    case TOP_TRIANGLE_LIST:
        return numPrims / 3;
    case TOP_TRIANGLE_STRIP:
        return numPrims < 3 ? 0 : numPrims - 2;
    case TOP_TRIANGLE_FAN:
        return numPrims < 3 ? 0 : numPrims - 2;
    case TOP_TRIANGLE_DISC:
        return numPrims < 2 ? 0 : numPrims - 1;
    case TOP_QUAD_LIST:
        return numPrims / 4;
    case TOP_QUAD_STRIP:
        return numPrims < 4 ? 0 : (numPrims - 2) / 2;
    case TOP_LINE_STRIP:
        return numPrims < 2 ? 0 : numPrims - 1;
    case TOP_LINE_LIST:
        return numPrims / 2;
    case TOP_LINE_LOOP:
        return numPrims;
    case TOP_RECT_LIST:
        return numPrims / 3;
    case TOP_LINE_LIST_ADJ:
        return numPrims / 4;
    case TOP_LISTSTRIP_ADJ:
        return numPrims < 3 ? 0 : numPrims - 3;
    case TOP_TRI_LIST_ADJ:
        return numPrims / 6;
    case TOP_TRI_STRIP_ADJ:
        return numPrims < 4 ? 0 : (numPrims / 2) - 2;

    case TOP_PATCHLIST_1:
    case TOP_PATCHLIST_2:
    case TOP_PATCHLIST_3:
    case TOP_PATCHLIST_4:
    case TOP_PATCHLIST_5:
    case TOP_PATCHLIST_6:
    case TOP_PATCHLIST_7:
    case TOP_PATCHLIST_8:
    case TOP_PATCHLIST_9:
    case TOP_PATCHLIST_10:
    case TOP_PATCHLIST_11:
    case TOP_PATCHLIST_12:
    case TOP_PATCHLIST_13:
    case TOP_PATCHLIST_14:
    case TOP_PATCHLIST_15:
    case TOP_PATCHLIST_16:
    case TOP_PATCHLIST_17:
    case TOP_PATCHLIST_18:
    case TOP_PATCHLIST_19:
    case TOP_PATCHLIST_20:
    case TOP_PATCHLIST_21:
    case TOP_PATCHLIST_22:
    case TOP_PATCHLIST_23:
    case TOP_PATCHLIST_24:
    case TOP_PATCHLIST_25:
    case TOP_PATCHLIST_26:
    case TOP_PATCHLIST_27:
    case TOP_PATCHLIST_28:
    case TOP_PATCHLIST_29:
    case TOP_PATCHLIST_30:
    case TOP_PATCHLIST_31:
    case TOP_PATCHLIST_32:
        return numPrims / (mode - TOP_PATCHLIST_BASE);

    case TOP_POLYGON:
    case TOP_POINT_LIST_BF:
    case TOP_LINE_STRIP_CONT:
    case TOP_LINE_STRIP_BF:
    case TOP_LINE_STRIP_CONT_BF:
    case TOP_TRIANGLE_FAN_NOSTIPPLE:
    case TOP_TRI_STRIP_REVERSE:
    case TOP_PATCHLIST_BASE:
    case TOP_UNKNOWN:
        SWR_INVALID("Unsupported topology: %d", mode);
        break;
    }

    return 0;
}

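// Examples of the mapping above: TOP_TRIANGLE_LIST with 9 verts -> 9 / 3 = 3
// prims; TOP_TRIANGLE_STRIP with 10 verts -> 10 - 2 = 8 prims;
// TOP_TRI_STRIP_ADJ with 10 verts -> (10 / 2) - 2 = 3 prims; TOP_PATCHLIST_4
// with 12 verts -> 12 / 4 = 3 patches, since the patchlist enums are laid
// out so that (mode - TOP_PATCHLIST_BASE) is the control-point count.
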
//////////////////////////////////////////////////////////////////////////
/// @brief Computes the number of verts given the number of primitives.
/// @param mode - primitive topology for draw operation.
/// @param numPrims - number of primitives for draw.
uint32_t GetNumVerts(PRIMITIVE_TOPOLOGY mode, uint32_t numPrims)
{
    switch (mode)
    {
    case TOP_POINT_LIST:
        return numPrims;
    case TOP_TRIANGLE_LIST:
        return numPrims * 3;
    case TOP_TRIANGLE_STRIP:
        return numPrims ? numPrims + 2 : 0;
    case TOP_TRIANGLE_FAN:
        return numPrims ? numPrims + 2 : 0;
    case TOP_TRIANGLE_DISC:
        return numPrims ? numPrims + 1 : 0;
    case TOP_QUAD_LIST:
        return numPrims * 4;
    case TOP_QUAD_STRIP:
        return numPrims ? numPrims * 2 + 2 : 0;
    case TOP_LINE_STRIP:
        return numPrims ? numPrims + 1 : 0;
    case TOP_LINE_LIST:
        return numPrims * 2;
    case TOP_LINE_LOOP:
        return numPrims;
    case TOP_RECT_LIST:
        return numPrims * 3;
    case TOP_LINE_LIST_ADJ:
        return numPrims * 4;
    case TOP_LISTSTRIP_ADJ:
        return numPrims ? numPrims + 3 : 0;
    case TOP_TRI_LIST_ADJ:
        return numPrims * 6;
    case TOP_TRI_STRIP_ADJ:
        return numPrims ? (numPrims + 2) * 2 : 0;

    case TOP_PATCHLIST_1:
    case TOP_PATCHLIST_2:
    case TOP_PATCHLIST_3:
    case TOP_PATCHLIST_4:
    case TOP_PATCHLIST_5:
    case TOP_PATCHLIST_6:
    case TOP_PATCHLIST_7:
    case TOP_PATCHLIST_8:
    case TOP_PATCHLIST_9:
    case TOP_PATCHLIST_10:
    case TOP_PATCHLIST_11:
    case TOP_PATCHLIST_12:
    case TOP_PATCHLIST_13:
    case TOP_PATCHLIST_14:
    case TOP_PATCHLIST_15:
    case TOP_PATCHLIST_16:
    case TOP_PATCHLIST_17:
    case TOP_PATCHLIST_18:
    case TOP_PATCHLIST_19:
    case TOP_PATCHLIST_20:
    case TOP_PATCHLIST_21:
    case TOP_PATCHLIST_22:
    case TOP_PATCHLIST_23:
    case TOP_PATCHLIST_24:
    case TOP_PATCHLIST_25:
    case TOP_PATCHLIST_26:
    case TOP_PATCHLIST_27:
    case TOP_PATCHLIST_28:
    case TOP_PATCHLIST_29:
    case TOP_PATCHLIST_30:
    case TOP_PATCHLIST_31:
    case TOP_PATCHLIST_32:
        return numPrims * (mode - TOP_PATCHLIST_BASE);

    case TOP_POLYGON:
    case TOP_POINT_LIST_BF:
    case TOP_LINE_STRIP_CONT:
    case TOP_LINE_STRIP_BF:
    case TOP_LINE_STRIP_CONT_BF:
    case TOP_TRIANGLE_FAN_NOSTIPPLE:
    case TOP_TRI_STRIP_REVERSE:
    case TOP_PATCHLIST_BASE:
    case TOP_UNKNOWN:
        SWR_INVALID("Unsupported topology: %d", mode);
        break;
    }

    return 0;
}

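// GetNumVerts is the inverse of GetNumPrims for whole primitives, e.g. for
// TOP_TRIANGLE_STRIP: GetNumPrims(strip, 10) == 8 and GetNumVerts(strip, 8)
// == 8 + 2 == 10. The ternaries keep the degenerate numPrims == 0 case at
// 0 verts instead of returning just the strip/fan overhead vertices.
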
//////////////////////////////////////////////////////////////////////////
/// @brief Return number of verts per primitive.
/// @param topology - topology
/// @param includeAdjVerts - include adjacent verts in primitive vertices
uint32_t NumVertsPerPrim(PRIMITIVE_TOPOLOGY topology, bool includeAdjVerts)
{
    uint32_t numVerts = 0;
    switch (topology)
    {
    case TOP_POINT_LIST:
    case TOP_POINT_LIST_BF:
        numVerts = 1;
        break;
    case TOP_LINE_LIST:
    case TOP_LINE_STRIP:
    case TOP_LINE_LIST_ADJ:
    case TOP_LINE_LOOP:
    case TOP_LINE_STRIP_CONT:
    case TOP_LINE_STRIP_BF:
    case TOP_LISTSTRIP_ADJ:
        numVerts = 2;
        break;
    case TOP_TRIANGLE_LIST:
    case TOP_TRIANGLE_STRIP:
    case TOP_TRIANGLE_FAN:
    case TOP_TRI_LIST_ADJ:
    case TOP_TRI_STRIP_ADJ:
    case TOP_TRI_STRIP_REVERSE:
    case TOP_RECT_LIST:
        numVerts = 3;
        break;
    case TOP_QUAD_LIST:
    case TOP_QUAD_STRIP:
        numVerts = 4;
        break;
    case TOP_PATCHLIST_1:
    case TOP_PATCHLIST_2:
    case TOP_PATCHLIST_3:
    case TOP_PATCHLIST_4:
    case TOP_PATCHLIST_5:
    case TOP_PATCHLIST_6:
    case TOP_PATCHLIST_7:
    case TOP_PATCHLIST_8:
    case TOP_PATCHLIST_9:
    case TOP_PATCHLIST_10:
    case TOP_PATCHLIST_11:
    case TOP_PATCHLIST_12:
    case TOP_PATCHLIST_13:
    case TOP_PATCHLIST_14:
    case TOP_PATCHLIST_15:
    case TOP_PATCHLIST_16:
    case TOP_PATCHLIST_17:
    case TOP_PATCHLIST_18:
    case TOP_PATCHLIST_19:
    case TOP_PATCHLIST_20:
    case TOP_PATCHLIST_21:
    case TOP_PATCHLIST_22:
    case TOP_PATCHLIST_23:
    case TOP_PATCHLIST_24:
    case TOP_PATCHLIST_25:
    case TOP_PATCHLIST_26:
    case TOP_PATCHLIST_27:
    case TOP_PATCHLIST_28:
    case TOP_PATCHLIST_29:
    case TOP_PATCHLIST_30:
    case TOP_PATCHLIST_31:
    case TOP_PATCHLIST_32:
        numVerts = topology - TOP_PATCHLIST_BASE;
        break;
    default:
        SWR_INVALID("Unsupported topology: %d", topology);
        break;
    }

    if (includeAdjVerts)
    {
        switch (topology)
        {
        case TOP_LISTSTRIP_ADJ:
        case TOP_LINE_LIST_ADJ:
            numVerts = 4;
            break;
        case TOP_TRI_STRIP_ADJ:
        case TOP_TRI_LIST_ADJ:
            numVerts = 6;
            break;
        default:
            break;
        }
    }

    return numVerts;
}

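// Example: TOP_TRI_LIST_ADJ reports 3 verts per prim for plain assembly,
// but 6 when includeAdjVerts is true, since each triangle then carries an
// adjacent vertex per edge; likewise line adjacency widens 2 verts to 4.
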
//////////////////////////////////////////////////////////////////////////
/// @brief Generate mask from remaining work.
/// @param numItemsRemaining - Number of items being worked on by a SIMD.
static INLINE simdscalari GenerateMask(uint32_t numItemsRemaining)
{
    uint32_t numActive =
        (numItemsRemaining >= KNOB_SIMD_WIDTH) ? KNOB_SIMD_WIDTH : numItemsRemaining;
    uint32_t mask = (numActive > 0) ? ((1 << numActive) - 1) : 0;
    return _simd_castps_si(_simd_vmask_ps(mask));
}

static INLINE simd16scalari GenerateMask16(uint32_t numItemsRemaining)
{
    uint32_t numActive =
        (numItemsRemaining >= KNOB_SIMD16_WIDTH) ? KNOB_SIMD16_WIDTH : numItemsRemaining;
    uint32_t mask = (numActive > 0) ? ((1 << numActive) - 1) : 0;
    return _simd16_castps_si(_simd16_vmask_ps(mask));
}

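// Example: with 3 items remaining on a SIMD8 frontend, GenerateMask(3)
// computes numActive = 3 and mask = (1 << 3) - 1 = 0b0111, enabling only
// the three low lanes; GenerateMask(0) produces an all-zero (no-op) mask.
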
//////////////////////////////////////////////////////////////////////////
/// @brief StreamOut - Streams vertex data out to SO buffers.
///        Generally, we are only streaming out a SIMDs worth of triangles.
/// @param pDC - pointer to draw context.
/// @param pa - The primitive assembly object holding the prims to stream out.
/// @param workerId - thread's worker id. Each thread has a unique id.
/// @param pPrimData - scratch buffer receiving the assembled prim attributes.
/// @param streamIndex - index of the SO stream to write.
static void StreamOut(
    DRAW_CONTEXT* pDC, PA_STATE& pa, uint32_t workerId, uint32_t* pPrimData, uint32_t streamIndex)
{
    RDTSC_BEGIN(pDC->pContext->pBucketMgr, FEStreamout, pDC->drawId);

    void* pWorkerData = pDC->pContext->threadPool.pThreadData[workerId].pWorkerPrivateData;

    const API_STATE&           state   = GetApiState(pDC);
    const SWR_STREAMOUT_STATE& soState = state.soState;

    uint32_t soVertsPerPrim = NumVertsPerPrim(pa.binTopology, false);

    // The pPrimData buffer is sparse in that we allocate memory for all 32 attributes for each
    // vertex.
    uint32_t primDataDwordVertexStride = (SWR_VTX_NUM_SLOTS * sizeof(float) * 4) / sizeof(uint32_t);

    SWR_STREAMOUT_CONTEXT soContext = {0};

    // Setup buffer state pointers.
    for (uint32_t i = 0; i < 4; ++i)
    {
        soContext.pBuffer[i] = &state.soBuffer[i];
    }

    uint32_t numPrims = pa.NumPrims();

    for (uint32_t primIndex = 0; primIndex < numPrims; ++primIndex)
    {
        unsigned long slot   = 0;
        uint64_t      soMask = soState.streamMasks[streamIndex];

        // Write all entries into primitive data buffer for SOS.
        while (_BitScanForward64(&slot, soMask))
        {
            simd4scalar attrib[MAX_NUM_VERTS_PER_PRIM]; // prim attribs (always 4 wide)
            uint32_t    paSlot = slot + soState.vertexAttribOffset[streamIndex];
            pa.AssembleSingle(paSlot, primIndex, attrib);

            // Attribute offset is relative offset from start of vertex.
            // Note that attributes start at slot 1 in the PA buffer. We need to write this
            // to prim data starting at slot 0. Which is why we do (slot - 1).
            // Also note: GL works slightly differently, and needs slot 0
            uint32_t primDataAttribOffset = slot * sizeof(float) * 4 / sizeof(uint32_t);

            // Store each vertex's attrib at appropriate locations in pPrimData buffer.
            for (uint32_t v = 0; v < soVertsPerPrim; ++v)
            {
                uint32_t* pPrimDataAttrib =
                    pPrimData + primDataAttribOffset + (v * primDataDwordVertexStride);

                _mm_store_ps((float*)pPrimDataAttrib, attrib[v]);
            }

            soMask &= ~(uint64_t(1) << slot);
        }

        // Update pPrimData pointer
        soContext.pPrimData = pPrimData;

        // Call SOS
        SWR_ASSERT(state.pfnSoFunc[streamIndex] != nullptr,
                   "Trying to execute uninitialized streamout jit function.");
        state.pfnSoFunc[streamIndex](GetPrivateState(pDC), pWorkerData, soContext);
    }

    // Update SO write offset. The driver provides memory for the update.
    for (uint32_t i = 0; i < 4; ++i)
    {
        if (state.soBuffer[i].pWriteOffset)
        {
            bool  nullTileAccessed = false;
            void* pWriteOffset     = pDC->pContext->pfnTranslateGfxptrForWrite(
                GetPrivateState(pDC), soContext.pBuffer[i]->pWriteOffset, &nullTileAccessed, pWorkerData);
            *((uint32_t*)pWriteOffset) = soContext.pBuffer[i]->streamOffset * sizeof(uint32_t);
        }

        if (state.soBuffer[i].soWriteEnable)
        {
            pDC->dynState.SoWriteOffset[i] = soContext.pBuffer[i]->streamOffset * sizeof(uint32_t);
            pDC->dynState.SoWriteOffsetDirty[i] = true;
        }
    }

    UPDATE_STAT_FE(SoPrimStorageNeeded[streamIndex], soContext.numPrimStorageNeeded);
    UPDATE_STAT_FE(SoNumPrimsWritten[streamIndex], soContext.numPrimsWritten);

    RDTSC_END(pDC->pContext->pBucketMgr, FEStreamout, 1);
}

#if USE_SIMD16_FRONTEND
//////////////////////////////////////////////////////////////////////////
/// Is value an even number (a multiple of two)
///
template <typename T>
INLINE static bool IsEven(T value)
{
    return (value & 1) == 0;
}

//////////////////////////////////////////////////////////////////////////
/// Round up value to an even number (a multiple of two)
///
template <typename T>
INLINE static T RoundUpEven(T value)
{
    return (value + 1) & ~1;
}

//////////////////////////////////////////////////////////////////////////
/// Round down value to an even number (a multiple of two)
///
template <typename T>
INLINE static T RoundDownEven(T value)
{
    return value & ~1;
}

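// Examples: IsEven(6) == true; RoundUpEven(7) == 8 and RoundUpEven(8) == 8;
// RoundDownEven(7) == 6. These helpers let simd8 quantities be padded to
// whole pairs when packing into simd16 form below.
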
//////////////////////////////////////////////////////////////////////////
/// Pack pairs of simdvertexes into simd16vertexes, assume non-overlapping
///
/// vertexCount is in terms of the source simdvertexes and must be even
///
/// attribCount will limit the vector copies to those attribs specified
///
/// note: the stride between vertexes is determined by SWR_VTX_NUM_SLOTS
///
void PackPairsOfSimdVertexIntoSimd16Vertex(simd16vertex* vertex_simd16,
                                           const simdvertex* vertex,
                                           uint32_t vertexCount,
                                           uint32_t attribCount)
{
    SWR_ASSERT(vertex);
    SWR_ASSERT(vertex_simd16);
    SWR_ASSERT(attribCount <= SWR_VTX_NUM_SLOTS);

    simd16vertex temp;

    for (uint32_t i = 0; i < vertexCount; i += 2)
    {
        for (uint32_t j = 0; j < attribCount; j += 1)
        {
            for (uint32_t k = 0; k < 4; k += 1)
            {
                temp.attrib[j][k] =
                    _simd16_insert_ps(_simd16_setzero_ps(), vertex[i].attrib[j][k], 0);

                if ((i + 1) < vertexCount)
                {
                    temp.attrib[j][k] =
                        _simd16_insert_ps(temp.attrib[j][k], vertex[i + 1].attrib[j][k], 1);
                }
            }
        }

        for (uint32_t j = 0; j < attribCount; j += 1)
        {
            vertex_simd16[i >> 1].attrib[j] = temp.attrib[j];
        }
    }
}

#endif

//////////////////////////////////////////////////////////////////////////
/// @brief Computes number of invocations. The current index represents
///        the start of the SIMD. The max index represents how many work
///        items are remaining. If there is less than a SIMD's worth of work
///        then return the remaining amount of work.
/// @param curIndex - The start index for the SIMD.
/// @param maxIndex - The last index for all work items.
static INLINE uint32_t GetNumInvocations(uint32_t curIndex, uint32_t maxIndex)
{
    uint32_t remainder = (maxIndex - curIndex);
#if USE_SIMD16_FRONTEND
    return (remainder >= KNOB_SIMD16_WIDTH) ? KNOB_SIMD16_WIDTH : remainder;
#else
    return (remainder >= KNOB_SIMD_WIDTH) ? KNOB_SIMD_WIDTH : remainder;
#endif
}

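// Example: GetNumInvocations(24, 30) returns 30 - 24 = 6, so the final
// partial SIMD of a draw reports only its 6 live work items instead of a
// full SIMD width.
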
//////////////////////////////////////////////////////////////////////////
/// @brief Converts a streamId buffer to a cut buffer for the given stream id.
///        The geometry shader will loop over each active streamout buffer, assembling
///        primitives for the downstream stages. When multistream output is enabled,
///        the generated stream ID buffer from the GS needs to be converted to a cut
///        buffer for the primitive assembler.
/// @param stream - stream id to generate the cut buffer for
/// @param pStreamIdBase - pointer to the stream ID buffer
/// @param numEmittedVerts - Number of total verts emitted by the GS
/// @param pCutBuffer - output buffer to write cuts to
void ProcessStreamIdBuffer(uint32_t stream,
                           uint8_t* pStreamIdBase,
                           uint32_t numEmittedVerts,
                           uint8_t* pCutBuffer)
{
    SWR_ASSERT(stream < MAX_SO_STREAMS);

    uint32_t numInputBytes  = (numEmittedVerts * 2 + 7) / 8;
    uint32_t numOutputBytes = std::max(numInputBytes / 2, 1U);

    for (uint32_t b = 0; b < numOutputBytes; ++b)
    {
        uint8_t curInputByte = pStreamIdBase[2 * b];
        uint8_t outByte      = 0;
        for (uint32_t i = 0; i < 4; ++i)
        {
            if ((curInputByte & 0x3) != stream)
            {
                outByte |= (1 << i);
            }
            curInputByte >>= 2;
        }

        curInputByte = pStreamIdBase[2 * b + 1];
        for (uint32_t i = 0; i < 4; ++i)
        {
            if ((curInputByte & 0x3) != stream)
            {
                outByte |= (1 << (i + 4));
            }
            curInputByte >>= 2;
        }

        *pCutBuffer++ = outByte;
    }
}

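// Worked example for stream == 1: an input byte of 0b01000101 encodes the
// 2-bit stream IDs {1, 1, 0, 1} (LSB first). Only the third vertex belongs
// to another stream, so the low nibble of outByte becomes 0b0100 -- a cut
// bit telling the PA that this vertex is not part of stream 1's primitives.
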
// Buffers that are allocated if GS is enabled
struct GsBuffers
{
    uint8_t* pGsIn;
    uint8_t* pGsOut[KNOB_SIMD_WIDTH];
    uint8_t* pGsTransposed;
    void*    pStreamCutBuffer;
};

//////////////////////////////////////////////////////////////////////////
/// @brief Transposes GS output from SOA to AOS to feed the primitive assembler
/// @param pDst - Destination buffer in AOS form for the current SIMD width, fed into the primitive
///        assembler
/// @param pSrc - Buffer of vertices in SOA form written by the geometry shader
/// @param numVerts - Number of vertices outputted by the GS
/// @param numAttribs - Number of attributes per vertex
template <typename SIMD_T, uint32_t SimdWidth>
void TransposeSOAtoAOS(uint8_t* pDst, uint8_t* pSrc, uint32_t numVerts, uint32_t numAttribs)
{
    uint32_t srcVertexStride = numAttribs * sizeof(float) * 4;
    uint32_t dstVertexStride = numAttribs * sizeof(Float<SIMD_T>) * 4;

    OSALIGNSIMD16(uint32_t) gatherOffsets[SimdWidth];

    for (uint32_t i = 0; i < SimdWidth; ++i)
    {
        gatherOffsets[i] = srcVertexStride * i;
    }
    auto vGatherOffsets = SIMD_T::load_si((Integer<SIMD_T>*)&gatherOffsets[0]);

    uint32_t numSimd        = AlignUp(numVerts, SimdWidth) / SimdWidth;
    uint32_t remainingVerts = numVerts;

    for (uint32_t s = 0; s < numSimd; ++s)
    {
        uint8_t* pSrcBase = pSrc + s * srcVertexStride * SimdWidth;
        uint8_t* pDstBase = pDst + s * dstVertexStride;

        // Compute mask to prevent src overflow
        uint32_t mask = std::min(remainingVerts, SimdWidth);
        mask          = GenMask(mask);
        auto vMask    = SIMD_T::vmask_ps(mask);
        auto viMask   = SIMD_T::castps_si(vMask);

        for (uint32_t a = 0; a < numAttribs; ++a)
        {
            auto attribGatherX = SIMD_T::mask_i32gather_ps(
                SIMD_T::setzero_ps(), (const float*)pSrcBase, vGatherOffsets, vMask);
            auto attribGatherY = SIMD_T::mask_i32gather_ps(SIMD_T::setzero_ps(),
                                                           (const float*)(pSrcBase + sizeof(float)),
                                                           vGatherOffsets,
                                                           vMask);
            auto attribGatherZ =
                SIMD_T::mask_i32gather_ps(SIMD_T::setzero_ps(),
                                          (const float*)(pSrcBase + sizeof(float) * 2),
                                          vGatherOffsets,
                                          vMask);
            auto attribGatherW =
                SIMD_T::mask_i32gather_ps(SIMD_T::setzero_ps(),
                                          (const float*)(pSrcBase + sizeof(float) * 3),
                                          vGatherOffsets,
                                          vMask);

            SIMD_T::maskstore_ps((float*)pDstBase, viMask, attribGatherX);
            SIMD_T::maskstore_ps((float*)(pDstBase + sizeof(Float<SIMD_T>)), viMask, attribGatherY);
            SIMD_T::maskstore_ps(
                (float*)(pDstBase + sizeof(Float<SIMD_T>) * 2), viMask, attribGatherZ);
            SIMD_T::maskstore_ps(
                (float*)(pDstBase + sizeof(Float<SIMD_T>) * 3), viMask, attribGatherW);

            pSrcBase += sizeof(float) * 4;
            pDstBase += sizeof(Float<SIMD_T>) * 4;
        }
        remainingVerts -= SimdWidth;
    }
}

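// Example of the stride math above: with numAttribs == 2, srcVertexStride =
// 2 * sizeof(float) * 4 = 32 bytes, so gatherOffsets = {0, 32, 64, ...} and
// each mask_i32gather_ps collects the same float component from up to
// SimdWidth consecutive source vertices into one SIMD register before the
// masked stores write it out at the wider destination stride.
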
//////////////////////////////////////////////////////////////////////////
/// @brief Implements GS stage.
/// @param pDC - pointer to draw context.
/// @param workerId - thread's worker id. Each thread has a unique id.
/// @param pa - The primitive assembly object.
/// @param pGsBuffers - GS input/output buffer allocations for this draw.
template <typename HasStreamOutT, typename HasRastT>
static void GeometryShaderStage(DRAW_CONTEXT* pDC,
                                uint32_t workerId,
                                PA_STATE& pa,
                                GsBuffers* pGsBuffers,
                                uint32_t* pSoPrimData,
#if USE_SIMD16_FRONTEND
                                uint32_t numPrims_simd8,
#endif
                                simdscalari const& primID)
{
    RDTSC_BEGIN(pDC->pContext->pBucketMgr, FEGeometryShader, pDC->drawId);

    void* pWorkerData = pDC->pContext->threadPool.pThreadData[workerId].pWorkerPrivateData;

    const API_STATE&    state  = GetApiState(pDC);
    const SWR_GS_STATE* pState = &state.gsState;
    SWR_GS_CONTEXT      gsContext;

    static uint8_t sNullBuffer[128] = {0};

    for (uint32_t i = 0; i < KNOB_SIMD_WIDTH; ++i)
    {
        gsContext.pStreams[i] = pGsBuffers->pGsOut[i];
    }
    gsContext.pVerts      = (simdvector*)pGsBuffers->pGsIn;
    gsContext.PrimitiveID = primID;

    uint32_t   numVertsPerPrim = NumVertsPerPrim(pa.binTopology, true);
    simdvector attrib[MAX_NUM_VERTS_PER_PRIM];

    // assemble all attributes for the input primitive
    gsContext.inputVertStride = pState->inputVertStride;
    for (uint32_t slot = 0; slot < pState->numInputAttribs; ++slot)
    {
        uint32_t srcAttribSlot = pState->srcVertexAttribOffset + slot;
        uint32_t attribSlot    = pState->vertexAttribOffset + slot;
        pa.Assemble(srcAttribSlot, attrib);

        for (uint32_t i = 0; i < numVertsPerPrim; ++i)
        {
            gsContext.pVerts[attribSlot + pState->inputVertStride * i] = attrib[i];
        }
    }

    // assemble position
    pa.Assemble(VERTEX_POSITION_SLOT, attrib);
    for (uint32_t i = 0; i < numVertsPerPrim; ++i)
    {
        gsContext.pVerts[VERTEX_POSITION_SLOT + pState->inputVertStride * i] = attrib[i];
    }

    // record valid prims from the frontend to avoid over binning the newly generated
    // prims from the GS
#if USE_SIMD16_FRONTEND
    uint32_t numInputPrims = numPrims_simd8;
#else
    uint32_t numInputPrims = pa.NumPrims();
#endif

    for (uint32_t instance = 0; instance < pState->instanceCount; ++instance)
    {
        gsContext.InstanceID = instance;
        gsContext.mask       = GenerateMask(numInputPrims);

        // execute the geometry shader
        state.pfnGsFunc(GetPrivateState(pDC), pWorkerData, &gsContext);
        AR_EVENT(GSStats((HANDLE)&gsContext.stats));

        for (uint32_t i = 0; i < KNOB_SIMD_WIDTH; ++i)
        {
            gsContext.pStreams[i] += pState->allocationSize;
        }
    }

    // set up new binner and state for the GS output topology
#if USE_SIMD16_FRONTEND
    PFN_PROCESS_PRIMS_SIMD16 pfnClipFunc = nullptr;
    if (HasRastT::value)
    {
        switch (pState->outputTopology)
        {
        case TOP_RECT_LIST:
            pfnClipFunc = ClipRectangles_simd16;
            break;
        case TOP_TRIANGLE_STRIP:
            pfnClipFunc = ClipTriangles_simd16;
            break;
        case TOP_LINE_STRIP:
            pfnClipFunc = ClipLines_simd16;
            break;
        case TOP_POINT_LIST:
            pfnClipFunc = ClipPoints_simd16;
            break;
        default:
            SWR_INVALID("Unexpected GS output topology: %d", pState->outputTopology);
        }
    }
#else
    PFN_PROCESS_PRIMS pfnClipFunc = nullptr;
    if (HasRastT::value)
    {
        switch (pState->outputTopology)
        {
        case TOP_RECT_LIST:
            pfnClipFunc = ClipRectangles;
            break;
        case TOP_TRIANGLE_STRIP:
            pfnClipFunc = ClipTriangles;
            break;
        case TOP_LINE_STRIP:
            pfnClipFunc = ClipLines;
            break;
        case TOP_POINT_LIST:
            pfnClipFunc = ClipPoints;
            break;
        default:
            SWR_INVALID("Unexpected GS output topology: %d", pState->outputTopology);
        }
    }
#endif

    // foreach input prim:
    // - setup a new PA based on the emitted verts for that prim
    // - loop over the new verts, calling PA to assemble each prim
    uint32_t* pPrimitiveId = (uint32_t*)&primID;

    uint32_t totalPrimsGenerated = 0;
    for (uint32_t inputPrim = 0; inputPrim < numInputPrims; ++inputPrim)
    {
        uint8_t* pInstanceBase = (uint8_t*)pGsBuffers->pGsOut[inputPrim];

        // Vertex count is either emitted by shader or static
        uint32_t vertexCount = 0;
        if (pState->staticVertexCount)
        {
            vertexCount = pState->staticVertexCount;
        }
        else
        {
            // If emitted in shader, it should be stored in the first dword of the output buffer
            vertexCount = *(uint32_t*)pInstanceBase;
        }

        for (uint32_t instance = 0; instance < pState->instanceCount; ++instance)
        {
            uint32_t numEmittedVerts = vertexCount;
            if (numEmittedVerts == 0)
            {
                continue;
            }

            uint8_t* pBase = pInstanceBase + instance * pState->allocationSize;
            uint8_t* pCutBase =
                pState->controlDataSize == 0 ? &sNullBuffer[0] : pBase + pState->controlDataOffset;
            uint8_t* pVertexBaseAOS = pBase + pState->outputVertexOffset;

#if USE_SIMD16_FRONTEND
            TransposeSOAtoAOS<SIMD512, KNOB_SIMD16_WIDTH>((uint8_t*)pGsBuffers->pGsTransposed,
                                                          pVertexBaseAOS,
                                                          vertexCount,
                                                          pState->outputVertexSize);
#else
            TransposeSOAtoAOS<SIMD256, KNOB_SIMD_WIDTH>((uint8_t*)pGsBuffers->pGsTransposed,
                                                        pVertexBaseAOS,
                                                        vertexCount,
                                                        pState->outputVertexSize);
#endif

            uint32_t numAttribs = state.feNumAttributes;

            for (uint32_t stream = 0; stream < MAX_SO_STREAMS; ++stream)
            {
                bool     processCutVerts = false;
                uint8_t* pCutBuffer      = pCutBase;

                // assign default stream ID, only relevant when GS is outputting a single stream
                uint32_t streamID = 0;
                if (pState->isSingleStream)
                {
                    processCutVerts = true;
                    streamID        = pState->singleStreamID;
                    if (streamID != stream)
                    {
                        continue;
                    }
                }
                else
                {
                    // early exit if this stream is not enabled for streamout
                    if (HasStreamOutT::value && !state.soState.streamEnable[stream])
                    {
                        continue;
                    }

                    // multi-stream output, need to translate StreamID buffer to a cut buffer
                    ProcessStreamIdBuffer(
                        stream, pCutBase, numEmittedVerts, (uint8_t*)pGsBuffers->pStreamCutBuffer);
                    pCutBuffer      = (uint8_t*)pGsBuffers->pStreamCutBuffer;
                    processCutVerts = false;
                }

#if USE_SIMD16_FRONTEND
                PA_STATE_CUT gsPa(pDC,
                                  (uint8_t*)pGsBuffers->pGsTransposed,
                                  numEmittedVerts,
                                  pState->outputVertexSize,
                                  reinterpret_cast<simd16mask*>(pCutBuffer),
                                  numEmittedVerts,
                                  numAttribs,
                                  pState->outputTopology,
                                  processCutVerts,
                                  pa.numVertsPerPrim);
#else
                PA_STATE_CUT gsPa(pDC,
                                  (uint8_t*)pGsBuffers->pGsTransposed,
                                  numEmittedVerts,
                                  pState->outputVertexSize,
                                  reinterpret_cast<simdmask*>(pCutBuffer),
                                  numEmittedVerts,
                                  numAttribs,
                                  pState->outputTopology,
                                  processCutVerts,
                                  pa.numVertsPerPrim);
#endif
                while (gsPa.GetNextStreamOutput())
                {
                    do
                    {
#if USE_SIMD16_FRONTEND
                        simd16vector attrib_simd16[3];

                        bool assemble = gsPa.Assemble(VERTEX_POSITION_SLOT, attrib_simd16);
#else
                        bool assemble = gsPa.Assemble(VERTEX_POSITION_SLOT, attrib);
#endif
                        if (assemble)
                        {
                            totalPrimsGenerated += gsPa.NumPrims();

                            if (HasStreamOutT::value)
                            {
#if ENABLE_AVX512_SIMD16
                                gsPa.useAlternateOffset = false;
#endif
                                StreamOut(pDC, gsPa, workerId, pSoPrimData, stream);
                            }

                            if (HasRastT::value && state.soState.streamToRasterizer == stream)
                            {
#if USE_SIMD16_FRONTEND
                                simd16scalari vPrimId = _simd16_set1_epi32(pPrimitiveId[inputPrim]);

                                // Gather data from the SGV if provided.
                                simd16scalari vViewportIdx = SIMD16::setzero_si();
                                simd16scalari vRtIdx       = SIMD16::setzero_si();
                                SIMD16::Vec4  svgAttrib[4];

                                if (state.backendState.readViewportArrayIndex ||
                                    state.backendState.readRenderTargetArrayIndex)
                                {
                                    gsPa.Assemble(VERTEX_SGV_SLOT, svgAttrib);
                                }

                                if (state.backendState.readViewportArrayIndex)
                                {
                                    vViewportIdx =
                                        SIMD16::castps_si(svgAttrib[0][VERTEX_SGV_VAI_COMP]);
                                    gsPa.viewportArrayActive = true;
                                }
                                if (state.backendState.readRenderTargetArrayIndex)
                                {
                                    vRtIdx = SIMD16::castps_si(svgAttrib[0][VERTEX_SGV_RTAI_COMP]);
                                    gsPa.rtArrayActive = true;
                                }

                                {
                                    // OOB VPAI indices => forced to zero.
                                    vViewportIdx =
                                        SIMD16::max_epi32(vViewportIdx, SIMD16::setzero_si());
                                    simd16scalari vNumViewports =
                                        SIMD16::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
                                    simd16scalari vClearMask =
                                        SIMD16::cmplt_epi32(vViewportIdx, vNumViewports);
                                    vViewportIdx = SIMD16::and_si(vClearMask, vViewportIdx);

                                    gsPa.useAlternateOffset = false;
                                    pfnClipFunc(pDC,
                                                gsPa,
                                                workerId,
                                                attrib_simd16,
                                                GenMask(gsPa.NumPrims()),
                                                vPrimId,
                                                vViewportIdx,
                                                vRtIdx);
                                }
#else
                                simdscalari vPrimId = _simd_set1_epi32(pPrimitiveId[inputPrim]);

                                // Gather data from the SGV if provided.
                                simdscalari vViewportIdx = SIMD::setzero_si();
                                simdscalari vRtIdx       = SIMD::setzero_si();
                                SIMD::Vec4  svgAttrib[4];

                                if (state.backendState.readViewportArrayIndex ||
                                    state.backendState.readRenderTargetArrayIndex)
                                {
                                    gsPa.Assemble(VERTEX_SGV_SLOT, svgAttrib);
                                }

                                if (state.backendState.readViewportArrayIndex)
                                {
                                    vViewportIdx =
                                        SIMD::castps_si(svgAttrib[0][VERTEX_SGV_VAI_COMP]);

                                    // OOB VPAI indices => forced to zero.
                                    vViewportIdx =
                                        SIMD::max_epi32(vViewportIdx, SIMD::setzero_si());
                                    simdscalari vNumViewports =
                                        SIMD::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
                                    simdscalari vClearMask =
                                        SIMD::cmplt_epi32(vViewportIdx, vNumViewports);
                                    vViewportIdx = SIMD::and_si(vClearMask, vViewportIdx);
                                    gsPa.viewportArrayActive = true;
                                }
                                if (state.backendState.readRenderTargetArrayIndex)
                                {
                                    vRtIdx = SIMD::castps_si(svgAttrib[0][VERTEX_SGV_RTAI_COMP]);
                                    gsPa.rtArrayActive = true;
                                }

                                pfnClipFunc(pDC,
                                            gsPa,
                                            workerId,
                                            attrib,
                                            GenMask(gsPa.NumPrims()),
                                            vPrimId,
                                            vViewportIdx,
                                            vRtIdx);
#endif
                            }
                        }
                    } while (gsPa.NextPrim());
                }
            }
        }
    }

    // update GS pipeline stats
    UPDATE_STAT_FE(GsInvocations, numInputPrims * pState->instanceCount);
    UPDATE_STAT_FE(GsPrimitives, totalPrimsGenerated);
    AR_EVENT(GSPrimInfo(numInputPrims, totalPrimsGenerated, numVertsPerPrim * numInputPrims));
    RDTSC_END(pDC->pContext->pBucketMgr, FEGeometryShader, 1);
}

//////////////////////////////////////////////////////////////////////////
/// @brief Allocate GS buffers
/// @param pDC - pointer to draw context.
/// @param state - API state
/// @param vertsPerPrim - number of vertices per input primitive
/// @param pGsBuffers - GS buffer allocations to fill in
template <typename SIMD_T, uint32_t SIMD_WIDTH>
static INLINE void AllocateGsBuffers(DRAW_CONTEXT* pDC,
                                     const API_STATE& state,
                                     uint32_t vertsPerPrim,
                                     GsBuffers* pGsBuffers)
{
    auto pArena = pDC->pArena;
    SWR_ASSERT(pArena != nullptr);
    SWR_ASSERT(state.gsState.gsEnable);

    const SWR_GS_STATE& gsState = state.gsState;

    // Allocate storage for vertex inputs
    uint32_t vertexInBufferSize = gsState.inputVertStride * sizeof(simdvector) * vertsPerPrim;
    pGsBuffers->pGsIn = (uint8_t*)pArena->AllocAligned(vertexInBufferSize, 32);

    // Allocate arena space to hold GS output verts
    const uint32_t vertexBufferSize = gsState.instanceCount * gsState.allocationSize;

    for (uint32_t i = 0; i < KNOB_SIMD_WIDTH; ++i)
    {
        pGsBuffers->pGsOut[i] = (uint8_t*)pArena->AllocAligned(vertexBufferSize, 32);
    }

    // Allocate storage for transposed GS output
    uint32_t numSimdBatches = AlignUp(gsState.maxNumVerts, SIMD_WIDTH) / SIMD_WIDTH;
    uint32_t transposedBufferSize =
        numSimdBatches * gsState.outputVertexSize * sizeof(Vec4<SIMD_T>);
    pGsBuffers->pGsTransposed = (uint8_t*)pArena->AllocAligned(transposedBufferSize, 32);

    // Allocate storage to hold temporary stream->cut buffer, if necessary
    if (state.gsState.isSingleStream)
    {
        pGsBuffers->pStreamCutBuffer = nullptr;
    }
    else
    {
        pGsBuffers->pStreamCutBuffer =
            (uint8_t*)pArena->AllocAligned(AlignUp(gsState.maxNumVerts * 2, 32), 32);
    }
}

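// Sizing example (numbers illustrative): with gsState.maxNumVerts == 12 and
// a simd16 frontend (SIMD_WIDTH == 16), numSimdBatches = AlignUp(12, 16) /
// 16 = 1, so the transposed buffer holds one SIMD batch of outputVertexSize
// attribute vectors. The stream-cut scratch buffer is only needed for
// multi-stream GS output; single-stream output is consumed directly.
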
//////////////////////////////////////////////////////////////////////////
/// @brief Contains all data generated by the HS and passed to the
///        tessellator and DS.
struct TessellationThreadLocalData
{
    SWR_HS_CONTEXT hsContext;
    void*          pTxCtx;
    size_t         tsCtxSize;

    uint8_t* pHSOutput;
    size_t   hsOutputAllocSize;

    simdscalar* pDSOutput;
    size_t      dsOutputAllocSize;
};

THREAD TessellationThreadLocalData* gt_pTessellationThreadData = nullptr;

//////////////////////////////////////////////////////////////////////////
/// @brief Allocate tessellation data for this worker thread.
static void AllocateTessellationData(SWR_CONTEXT* pContext)
{
    /// @TODO - Don't use thread local storage. Use Worker local storage instead.
    if (gt_pTessellationThreadData == nullptr)
    {
        gt_pTessellationThreadData =
            (TessellationThreadLocalData*)AlignedMalloc(sizeof(TessellationThreadLocalData), 64);
        memset(gt_pTessellationThreadData, 0, sizeof(*gt_pTessellationThreadData));
    }
}

//////////////////////////////////////////////////////////////////////////
/// @brief Implements Tessellation Stages.
/// @param pDC - pointer to draw context.
/// @param workerId - thread's worker id. Each thread has a unique id.
/// @param pa - The primitive assembly object.
/// @param pGsBuffers - GS buffer allocations, used if the GS stage follows.
template <typename HasGeometryShaderT, typename HasStreamOutT, typename HasRastT>
static void TessellationStages(DRAW_CONTEXT* pDC,
                               uint32_t workerId,
                               PA_STATE& pa,
                               GsBuffers* pGsBuffers,
                               uint32_t* pSoPrimData,
#if USE_SIMD16_FRONTEND
                               uint32_t numPrims_simd8,
#endif
                               simdscalari const& primID)
{
    const API_STATE&    state   = GetApiState(pDC);
    const SWR_TS_STATE& tsState = state.tsState;
    void* pWorkerData = pDC->pContext->threadPool.pThreadData[workerId].pWorkerPrivateData;

    SWR_ASSERT(gt_pTessellationThreadData);

    HANDLE tsCtx = TSInitCtx(tsState.domain,
                             tsState.partitioning,
                             tsState.tsOutputTopology,
                             gt_pTessellationThreadData->pTxCtx,
                             gt_pTessellationThreadData->tsCtxSize);
    if (tsCtx == nullptr)
    {
        gt_pTessellationThreadData->pTxCtx =
            AlignedMalloc(gt_pTessellationThreadData->tsCtxSize, 64);
        tsCtx = TSInitCtx(tsState.domain,
                          tsState.partitioning,
                          tsState.tsOutputTopology,
                          gt_pTessellationThreadData->pTxCtx,
                          gt_pTessellationThreadData->tsCtxSize);
    }
    SWR_ASSERT(tsCtx);

#if USE_SIMD16_FRONTEND
    PFN_PROCESS_PRIMS_SIMD16 pfnClipFunc = nullptr;
    if (HasRastT::value)
    {
        switch (tsState.postDSTopology)
        {
        case TOP_TRIANGLE_LIST:
            pfnClipFunc = ClipTriangles_simd16;
            break;
        case TOP_LINE_LIST:
            pfnClipFunc = ClipLines_simd16;
            break;
        case TOP_POINT_LIST:
            pfnClipFunc = ClipPoints_simd16;
            break;
        default:
            SWR_INVALID("Unexpected DS output topology: %d", tsState.postDSTopology);
        }
    }
#else
    PFN_PROCESS_PRIMS pfnClipFunc = nullptr;
    if (HasRastT::value)
    {
        switch (tsState.postDSTopology)
        {
        case TOP_TRIANGLE_LIST:
            pfnClipFunc = ClipTriangles;
            break;
        case TOP_LINE_LIST:
            pfnClipFunc = ClipLines;
            break;
        case TOP_POINT_LIST:
            pfnClipFunc = ClipPoints;
            break;
        default:
            SWR_INVALID("Unexpected DS output topology: %d", tsState.postDSTopology);
        }
    }
#endif

    SWR_HS_CONTEXT& hsContext = gt_pTessellationThreadData->hsContext;
    hsContext.PrimitiveID = primID;
    hsContext.outputSize  = tsState.hsAllocationSize;

    uint32_t numVertsPerPrim = NumVertsPerPrim(pa.binTopology, false);
    // Max storage for one attribute for an entire simdprimitive
    simdvector simdattrib[MAX_NUM_VERTS_PER_PRIM];

    // assemble all attributes for the input primitives
    for (uint32_t slot = 0; slot < tsState.numHsInputAttribs; ++slot)
    {
        uint32_t attribSlot = tsState.srcVertexAttribOffset + slot;
        pa.Assemble(attribSlot, simdattrib);

        for (uint32_t i = 0; i < numVertsPerPrim; ++i)
        {
            hsContext.vert[i].attrib[tsState.vertexAttribOffset + slot] = simdattrib[i];
        }
    }

    // Allocate HS output storage
    uint32_t requiredAllocSize = KNOB_SIMD_WIDTH * tsState.hsAllocationSize;

    if (requiredAllocSize > gt_pTessellationThreadData->hsOutputAllocSize)
    {
        AlignedFree(gt_pTessellationThreadData->pHSOutput);
        gt_pTessellationThreadData->pHSOutput = (uint8_t*)AlignedMalloc(requiredAllocSize, 64);
        gt_pTessellationThreadData->hsOutputAllocSize = requiredAllocSize;
    }

    hsContext.pCPout = (ScalarPatch*)gt_pTessellationThreadData->pHSOutput;

#if defined(_DEBUG)
    // memset(hsContext.pCPout, 0x90, sizeof(ScalarPatch) * KNOB_SIMD_WIDTH);
#endif

#if USE_SIMD16_FRONTEND
    uint32_t numPrims = numPrims_simd8;
#else
    uint32_t numPrims = pa.NumPrims();
#endif
    hsContext.mask = GenerateMask(numPrims);

    // Run the HS
    RDTSC_BEGIN(pDC->pContext->pBucketMgr, FEHullShader, pDC->drawId);
    state.pfnHsFunc(GetPrivateState(pDC), pWorkerData, &hsContext);
    RDTSC_END(pDC->pContext->pBucketMgr, FEHullShader, 0);

    UPDATE_STAT_FE(HsInvocations, numPrims);
    AR_EVENT(HSStats((HANDLE)&hsContext.stats));

    const uint32_t* pPrimId = (const uint32_t*)&primID;

    for (uint32_t p = 0; p < numPrims; ++p)
    {
        ScalarPatch* pCPout =
            (ScalarPatch*)(gt_pTessellationThreadData->pHSOutput + tsState.hsAllocationSize * p);

        SWR_TESSELLATION_FACTORS tessFactors;
        tessFactors = hsContext.pCPout[p].tessFactors;

        // Run Tessellator
        SWR_TS_TESSELLATED_DATA tsData = {0};
        RDTSC_BEGIN(pDC->pContext->pBucketMgr, FETessellation, pDC->drawId);
        TSTessellate(tsCtx, tessFactors, tsData);
        AR_EVENT(TessPrimCount(1));
        RDTSC_END(pDC->pContext->pBucketMgr, FETessellation, 0);

        if (tsData.NumPrimitives == 0)
        {
            continue;
        }
        SWR_ASSERT(tsData.NumDomainPoints);

        // Allocate DS Output memory
        uint32_t requiredDSVectorInvocations =
            AlignUp(tsData.NumDomainPoints, KNOB_SIMD_WIDTH) / KNOB_SIMD_WIDTH;
#if USE_SIMD16_FRONTEND
        size_t requiredAllocSize = sizeof(simdvector) * RoundUpEven(requiredDSVectorInvocations) *
                                   tsState.dsAllocationSize; // simd8 -> simd16, padding
#else
        size_t requiredDSOutputVectors = requiredDSVectorInvocations * tsState.dsAllocationSize;
        size_t requiredAllocSize       = sizeof(simdvector) * requiredDSOutputVectors;
#endif
        if (requiredAllocSize > gt_pTessellationThreadData->dsOutputAllocSize)
        {
            AlignedFree(gt_pTessellationThreadData->pDSOutput);
            gt_pTessellationThreadData->pDSOutput =
                (simdscalar*)AlignedMalloc(requiredAllocSize, 64);
            gt_pTessellationThreadData->dsOutputAllocSize = requiredAllocSize;
        }
        SWR_ASSERT(gt_pTessellationThreadData->pDSOutput);
        SWR_ASSERT(gt_pTessellationThreadData->dsOutputAllocSize >= requiredAllocSize);

#if defined(_DEBUG)
        memset(gt_pTessellationThreadData->pDSOutput, 0x90, requiredAllocSize);
#endif

        // Run Domain Shader
        SWR_DS_CONTEXT dsContext;
        dsContext.PrimitiveID           = pPrimId[p];
        dsContext.pCpIn                 = pCPout;
        dsContext.pDomainU              = (simdscalar*)tsData.pDomainPointsU;
        dsContext.pDomainV              = (simdscalar*)tsData.pDomainPointsV;
        dsContext.pOutputData           = gt_pTessellationThreadData->pDSOutput;
        dsContext.outVertexAttribOffset = tsState.dsOutVtxAttribOffset;
#if USE_SIMD16_FRONTEND
        dsContext.vectorStride = RoundUpEven(requiredDSVectorInvocations); // simd8 -> simd16
#else
        dsContext.vectorStride = requiredDSVectorInvocations;
#endif

        uint32_t dsInvocations = 0;

        for (dsContext.vectorOffset = 0; dsContext.vectorOffset < requiredDSVectorInvocations;
             ++dsContext.vectorOffset)
        {
            dsContext.mask = GenerateMask(tsData.NumDomainPoints - dsInvocations);

            RDTSC_BEGIN(pDC->pContext->pBucketMgr, FEDomainShader, pDC->drawId);
            state.pfnDsFunc(GetPrivateState(pDC), pWorkerData, &dsContext);
            RDTSC_END(pDC->pContext->pBucketMgr, FEDomainShader, 0);

            AR_EVENT(DSStats((HANDLE)&dsContext.stats));

            dsInvocations += KNOB_SIMD_WIDTH;
        }
        UPDATE_STAT_FE(DsInvocations, tsData.NumDomainPoints);

#if USE_SIMD16_FRONTEND
        SWR_ASSERT(IsEven(dsContext.vectorStride)); // simd8 -> simd16

#endif
        PA_TESS tessPa(
            pDC,
#if USE_SIMD16_FRONTEND
            reinterpret_cast<const simd16scalar*>(dsContext.pOutputData), // simd8 -> simd16
            dsContext.vectorStride / 2,                                   // simd8 -> simd16
#else
            dsContext.pOutputData,
            dsContext.vectorStride,
#endif
            SWR_VTX_NUM_SLOTS,
            tsState.numDsOutputAttribs + tsState.dsOutVtxAttribOffset,
            tsData.ppIndices,
            tsData.NumPrimitives,
            tsState.postDSTopology,
            NumVertsPerPrim(tsState.postDSTopology, false));

        while (tessPa.HasWork())
        {
#if USE_SIMD16_FRONTEND
            const uint32_t numPrims    = tessPa.NumPrims();
            const uint32_t numPrims_lo = std::min<uint32_t>(numPrims, KNOB_SIMD_WIDTH);
            const uint32_t numPrims_hi =
                std::max<uint32_t>(numPrims, KNOB_SIMD_WIDTH) - KNOB_SIMD_WIDTH;

            const simd16scalari primID    = _simd16_set1_epi32(dsContext.PrimitiveID);
            const simdscalari   primID_lo = _simd16_extract_si(primID, 0);
            const simdscalari   primID_hi = _simd16_extract_si(primID, 1);

#endif
            if (HasGeometryShaderT::value)
            {
#if USE_SIMD16_FRONTEND
                tessPa.useAlternateOffset = false;
                GeometryShaderStage<HasStreamOutT, HasRastT>(
                    pDC, workerId, tessPa, pGsBuffers, pSoPrimData, numPrims_lo, primID_lo);

                if (numPrims_hi)
                {
                    tessPa.useAlternateOffset = true;
                    GeometryShaderStage<HasStreamOutT, HasRastT>(
                        pDC, workerId, tessPa, pGsBuffers, pSoPrimData, numPrims_hi, primID_hi);
                }
#else
                GeometryShaderStage<HasStreamOutT, HasRastT>(
                    pDC,
                    workerId,
                    tessPa,
                    pGsBuffers,
                    pSoPrimData,
                    _simd_set1_epi32(dsContext.PrimitiveID));
#endif
            }
            else
            {
                if (HasStreamOutT::value)
                {
#if ENABLE_AVX512_SIMD16
                    tessPa.useAlternateOffset = false;
#endif
                    StreamOut(pDC, tessPa, workerId, pSoPrimData, 0);
                }

                if (HasRastT::value)
                {
#if USE_SIMD16_FRONTEND
                    simd16vector prim_simd16[3]; // Only deal with triangles, lines, or points
#else
                    simdvector prim[3]; // Only deal with triangles, lines, or points
#endif
                    RDTSC_BEGIN(pDC->pContext->pBucketMgr, FEPAAssemble, pDC->drawId);
                    bool assemble =
#if USE_SIMD16_FRONTEND
                        tessPa.Assemble(VERTEX_POSITION_SLOT, prim_simd16);
#else
                        tessPa.Assemble(VERTEX_POSITION_SLOT, prim);
#endif
                    RDTSC_END(pDC->pContext->pBucketMgr, FEPAAssemble, 1);
                    SWR_ASSERT(assemble);

                    SWR_ASSERT(pfnClipFunc);
#if USE_SIMD16_FRONTEND
                    // Gather data from the SGV if provided.
                    simd16scalari vViewportIdx = SIMD16::setzero_si();
                    simd16scalari vRtIdx       = SIMD16::setzero_si();
                    SIMD16::Vec4  svgAttrib[4];

                    if (state.backendState.readViewportArrayIndex ||
                        state.backendState.readRenderTargetArrayIndex)
                    {
                        tessPa.Assemble(VERTEX_SGV_SLOT, svgAttrib);
                    }

                    if (state.backendState.readViewportArrayIndex)
                    {
                        vViewportIdx = SIMD16::castps_si(svgAttrib[0][VERTEX_SGV_VAI_COMP]);
                        tessPa.viewportArrayActive = true;
                    }
                    if (state.backendState.readRenderTargetArrayIndex)
                    {
                        vRtIdx = SIMD16::castps_si(svgAttrib[0][VERTEX_SGV_RTAI_COMP]);
                        tessPa.rtArrayActive = true;
                    }

                    {
                        // OOB VPAI indices => forced to zero.
                        vViewportIdx = SIMD16::max_epi32(vViewportIdx, SIMD16::setzero_si());
                        simd16scalari vNumViewports =
                            SIMD16::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
                        simd16scalari vClearMask = SIMD16::cmplt_epi32(vViewportIdx, vNumViewports);
                        vViewportIdx = SIMD16::and_si(vClearMask, vViewportIdx);

                        tessPa.useAlternateOffset = false;
                        pfnClipFunc(pDC,
                                    tessPa,
                                    workerId,
                                    prim_simd16,
                                    GenMask(tessPa.NumPrims()),
                                    _simd16_set1_epi32(dsContext.PrimitiveID),
                                    vViewportIdx,
                                    vRtIdx);
                    }
#else
                    // Gather data from the SGV if provided.
                    simdscalari vViewportIdx = SIMD::setzero_si();
                    simdscalari vRtIdx       = SIMD::setzero_si();
                    SIMD::Vec4  svgAttrib[4];

                    if (state.backendState.readViewportArrayIndex ||
                        state.backendState.readRenderTargetArrayIndex)
                    {
                        tessPa.Assemble(VERTEX_SGV_SLOT, svgAttrib);
                    }

                    if (state.backendState.readViewportArrayIndex)
                    {
                        vViewportIdx = SIMD::castps_si(svgAttrib[0][VERTEX_SGV_VAI_COMP]);

                        // OOB VPAI indices => forced to zero.
                        vViewportIdx = SIMD::max_epi32(vViewportIdx, SIMD::setzero_si());
                        simdscalari vNumViewports = SIMD::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
                        simdscalari vClearMask    = SIMD::cmplt_epi32(vViewportIdx, vNumViewports);
                        vViewportIdx = SIMD::and_si(vClearMask, vViewportIdx);
                        tessPa.viewportArrayActive = true;
                    }
                    if (state.backendState.readRenderTargetArrayIndex)
                    {
                        vRtIdx = SIMD::castps_si(svgAttrib[0][VERTEX_SGV_RTAI_COMP]);
                        tessPa.rtArrayActive = true;
                    }

                    pfnClipFunc(pDC,
                                tessPa,
                                workerId,
                                prim,
                                GenMask(tessPa.NumPrims()),
                                _simd_set1_epi32(dsContext.PrimitiveID),
                                vViewportIdx,
                                vRtIdx);
#endif
                }
            }

            tessPa.NextPrim();

        } // while (tessPa.HasWork())
    }     // for (uint32_t p = 0; p < numPrims; ++p)

#if USE_SIMD16_FRONTEND
    if (gt_pTessellationThreadData->pDSOutput != nullptr)
    {
        AlignedFree(gt_pTessellationThreadData->pDSOutput);
        gt_pTessellationThreadData->pDSOutput = nullptr;
    }
    gt_pTessellationThreadData->dsOutputAllocSize = 0;

#endif
    TSDestroyCtx(tsCtx);
}

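// Example of the DS sizing above, assuming KNOB_SIMD_WIDTH == 8 for
// illustration: a patch tessellated into 50 domain points needs
// AlignUp(50, 8) / 8 = 7 vector invocations; the simd16 frontend rounds the
// stride up to 8 (RoundUpEven) so two simd8 vectors can always be paired
// into one simd16 vector without overrunning the allocation.
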
THREAD PA_STATE::SIMDVERTEX* gpVertexStore    = nullptr;
THREAD uint32_t              gVertexStoreSize = 0;

1652 //////////////////////////////////////////////////////////////////////////
1653 /// @brief FE handler for SwrDraw.
1654 /// @tparam IsIndexedT - Is indexed drawing enabled
1655 /// @tparam HasTessellationT - Is tessellation enabled
1656 /// @tparam HasGeometryShaderT::value - Is the geometry shader stage enabled
1657 /// @tparam HasStreamOutT - Is stream-out enabled
1658 /// @tparam HasRastT - Is rasterization enabled
1659 /// @param pContext - pointer to SWR context.
1660 /// @param pDC - pointer to draw context.
1661 /// @param workerId - thread's worker id.
1662 /// @param pUserData - Pointer to DRAW_WORK
1663 template <typename IsIndexedT
,
1664 typename IsCutIndexEnabledT
,
1665 typename HasTessellationT
,
1666 typename HasGeometryShaderT
,
1667 typename HasStreamOutT
,
1669 void ProcessDraw(SWR_CONTEXT
* pContext
, DRAW_CONTEXT
* pDC
, uint32_t workerId
, void* pUserData
)
1671 #if KNOB_ENABLE_TOSS_POINTS
1672 if (KNOB_TOSS_QUEUE_FE
)
1678 RDTSC_BEGIN(pContext
->pBucketMgr
, FEProcessDraw
, pDC
->drawId
);
1680 void* pWorkerData
= pContext
->threadPool
.pThreadData
[workerId
].pWorkerPrivateData
;
1682 DRAW_WORK
& work
= *(DRAW_WORK
*)pUserData
;
1683 const API_STATE
& state
= GetApiState(pDC
);
1685 uint32_t indexSize
= 0;
1686 uint32_t endVertex
= work
.numVerts
;
1688 gfxptr_t xpLastRequestedIndex
= 0;
1689 if (IsIndexedT::value
)
1694 indexSize
= sizeof(uint32_t);
1697 indexSize
= sizeof(uint16_t);
1700 indexSize
= sizeof(uint8_t);
1703 SWR_INVALID("Invalid work.type: %d", work
.type
);
1705 xpLastRequestedIndex
= work
.xpIB
+ endVertex
* indexSize
;
1709 // No cuts, prune partial primitives.
1710 endVertex
= GetNumVerts(state
.topology
, GetNumPrims(state
.topology
, work
.numVerts
));
1713 #if defined(KNOB_ENABLE_RDTSC) || defined(KNOB_ENABLE_AR)
1714 uint32_t numPrims
= GetNumPrims(state
.topology
, work
.numVerts
);
1717 GsBuffers gsBuffers
;
1718 if (HasGeometryShaderT::value
)
1720 #if USE_SIMD16_FRONTEND
1721 AllocateGsBuffers
<SIMD512
, KNOB_SIMD16_WIDTH
>(
1722 pDC
, state
, NumVertsPerPrim(state
.topology
, true), &gsBuffers
);
1724 AllocateGsBuffers
<SIMD256
, KNOB_SIMD_WIDTH
>(
1725 pDC
, state
, NumVertsPerPrim(state
.topology
, true), &gsBuffers
);
1729 if (HasTessellationT::value
)
1731 SWR_ASSERT(state
.tsState
.tsEnable
== true);
1732 SWR_ASSERT(state
.pfnHsFunc
!= nullptr);
1733 SWR_ASSERT(state
.pfnDsFunc
!= nullptr);
1735 AllocateTessellationData(pContext
);
1739 SWR_ASSERT(state
.tsState
.tsEnable
== false);
1740 SWR_ASSERT(state
.pfnHsFunc
== nullptr);
1741 SWR_ASSERT(state
.pfnDsFunc
== nullptr);
1744 // allocate space for streamout input prim data
1745 uint32_t* pSoPrimData
= nullptr;
1746 if (HasStreamOutT::value
)
1748 pSoPrimData
= (uint32_t*)pDC
->pArena
->AllocAligned(4096, 16);
1751 const uint32_t vertexCount
= NumVertsPerPrim(state
.topology
, true);
1752 #if USE_SIMD16_FRONTEND
1753 uint32_t simdVertexSizeBytes
= state
.frontendState
.vsVertexSize
* sizeof(simd16vector
);
1755 uint32_t simdVertexSizeBytes
= state
.frontendState
.vsVertexSize
* sizeof(simdvector
);
1758 SWR_ASSERT(vertexCount
<= MAX_NUM_VERTS_PER_PRIM
);
1760 // Compute storage requirements for vertex store
1761 // TODO: allocation needs to be rethought for better cut support
1762 uint32_t numVerts
= vertexCount
+ 2; // Need extra space for PA state machine
1763 uint32_t vertexStoreSize
= numVerts
* simdVertexSizeBytes
;
1765 // grow the vertex store for the PA as necessary
1766 if (gVertexStoreSize
< vertexStoreSize
)
1768 if (gpVertexStore
!= nullptr)
1770 AlignedFree(gpVertexStore
);
1771 gpVertexStore
= nullptr;
1774 SWR_ASSERT(gpVertexStore
== nullptr);
1776 gpVertexStore
= reinterpret_cast<PA_STATE::SIMDVERTEX
*>(AlignedMalloc(vertexStoreSize
, 64));
1777 gVertexStoreSize
= vertexStoreSize
;
1779 SWR_ASSERT(gpVertexStore
!= nullptr);
1782 // choose primitive assembler
1784 PA_FACTORY
<IsIndexedT
, IsCutIndexEnabledT
> paFactory(pDC
,
1789 state
.frontendState
.vsVertexSize
,
1790 GetNumVerts(state
.topology
, 1));
1791 PA_STATE
& pa
= paFactory
.GetPA();
1793 #if USE_SIMD16_FRONTEND
1794 #if USE_SIMD16_SHADERS
1800 SWR_VS_CONTEXT vsContext_lo
;
1801 SWR_VS_CONTEXT vsContext_hi
;
1803 #if USE_SIMD16_SHADERS
1804 vsContext_lo
.pVin
= reinterpret_cast<simdvertex
*>(&vin
);
1805 vsContext_hi
.pVin
= reinterpret_cast<simdvertex
*>(&vin
);
1807 vsContext_lo
.pVin
= &vin_lo
;
1808 vsContext_hi
.pVin
= &vin_hi
;
1810 vsContext_lo
.AlternateOffset
= 0;
1811 vsContext_hi
.AlternateOffset
= 1;
1813 SWR_FETCH_CONTEXT fetchInfo_lo
= {0};
1815 fetchInfo_lo
.pStreams
= &state
.vertexBuffers
[0];
1816 fetchInfo_lo
.StartInstance
= work
.startInstance
;
1817 fetchInfo_lo
.StartVertex
= 0;
1819 if (IsIndexedT::value
)
1821 fetchInfo_lo
.BaseVertex
= work
.baseVertex
;
1823 // if the entire index buffer isn't being consumed, set the last index
1824 // so that fetches < a SIMD wide will be masked off
1825 fetchInfo_lo
.xpLastIndex
= state
.indexBuffer
.xpIndices
+ state
.indexBuffer
.size
;
1826 if (xpLastRequestedIndex
< fetchInfo_lo
.xpLastIndex
)
1828 fetchInfo_lo
.xpLastIndex
= xpLastRequestedIndex
;
1833 fetchInfo_lo
.StartVertex
= work
.startVertex
;
1836 SWR_FETCH_CONTEXT fetchInfo_hi
= fetchInfo_lo
;
1838 const simd16scalari vScale
=
1839 _simd16_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
1841 for (uint32_t instanceNum
= 0; instanceNum
< work
.numInstances
; instanceNum
++)
1845 simd16scalari vIndex
;
1847 if (IsIndexedT::value
)
1849 fetchInfo_lo
.xpIndices
= work
.xpIB
;
1850 fetchInfo_hi
.xpIndices
=
1851 fetchInfo_lo
.xpIndices
+ KNOB_SIMD_WIDTH
* indexSize
; // 1/2 of KNOB_SIMD16_WIDTH
1855 vIndex
= _simd16_add_epi32(_simd16_set1_epi32(work
.startVertexID
), vScale
);
1857 fetchInfo_lo
.xpIndices
= pDC
->pContext
->pfnMakeGfxPtr(GetPrivateState(pDC
), &vIndex
);
1858 fetchInfo_hi
.xpIndices
= pDC
->pContext
->pfnMakeGfxPtr(
1859 GetPrivateState(pDC
),
1860 &vIndex
+ KNOB_SIMD_WIDTH
* sizeof(int32_t)); // 1/2 of KNOB_SIMD16_WIDTH
1863 fetchInfo_lo
.CurInstance
= instanceNum
;
1864 fetchInfo_hi
.CurInstance
= instanceNum
;
1866 vsContext_lo
.InstanceID
= instanceNum
;
1867 vsContext_hi
.InstanceID
= instanceNum
;
1869 while (pa
.HasWork())
1871 // GetNextVsOutput currently has the side effect of updating some PA state machine
1872 // state. So we need to keep this outside of (i < endVertex) check.
1874 simdmask
* pvCutIndices_lo
= nullptr;
1875 simdmask
* pvCutIndices_hi
= nullptr;
1877 if (IsIndexedT::value
)
1879 // simd16mask <=> simdmask[2]
1881 pvCutIndices_lo
= &reinterpret_cast<simdmask
*>(&pa
.GetNextVsIndices())[0];
1882 pvCutIndices_hi
= &reinterpret_cast<simdmask
*>(&pa
.GetNextVsIndices())[1];
1885 simd16vertex
& vout
= pa
.GetNextVsOutput();
1887 vsContext_lo
.pVout
= reinterpret_cast<simdvertex
*>(&vout
);
1888 vsContext_hi
.pVout
= reinterpret_cast<simdvertex
*>(&vout
);
1892 if (!IsIndexedT::value
)
1894 fetchInfo_lo
.xpLastIndex
= fetchInfo_lo
.xpIndices
;
1896 offset
= std::min(endVertex
- i
, (uint32_t)KNOB_SIMD16_WIDTH
);
1897 offset
*= 4; // convert from index to address
1898 #if USE_SIMD16_SHADERS
1899 fetchInfo_lo
.xpLastIndex
+= offset
;
1901 fetchInfo_lo
.xpLastIndex
+= std::min(offset
, (uint32_t)KNOB_SIMD_WIDTH
);
1903 std::min(offset
, (uint32_t)KNOB_SIMD16_WIDTH
) - KNOB_SIMD_WIDTH
;
1904 assert(offset
>= 0);
1905 fetchInfo_hi
.xpLastIndex
= fetchInfo_hi
.xpIndices
;
1906 fetchInfo_hi
.xpLastIndex
+= offset2
;
1909 // 1. Execute FS/VS for a single SIMD.
1910 RDTSC_BEGIN(pContext
->pBucketMgr
, FEFetchShader
, pDC
->drawId
);
1911 #if USE_SIMD16_SHADERS
1912 state
.pfnFetchFunc(GetPrivateState(pDC
), pWorkerData
, fetchInfo_lo
, vin
);
1914 state
.pfnFetchFunc(GetPrivateState(pDC
), pWorkerData
, fetchInfo_lo
, vin_lo
);
1916 if ((i
+ KNOB_SIMD_WIDTH
) < endVertex
) // 1/2 of KNOB_SIMD16_WIDTH
1918 state
.pfnFetchFunc(GetPrivateState(pDC
), pWorkerData
, fetchInfo_hi
, vin_hi
);
1921 RDTSC_END(pContext
->pBucketMgr
, FEFetchShader
, 0);
                // forward fetch generated vertex IDs to the vertex shader
#if USE_SIMD16_SHADERS
#if USE_SIMD16_VS
                vsContext_lo.VertexID16 =
                    _simd16_insert_si(vsContext_lo.VertexID16, fetchInfo_lo.VertexID, 0);
                vsContext_lo.VertexID16 =
                    _simd16_insert_si(vsContext_lo.VertexID16, fetchInfo_lo.VertexID2, 1);
#else
                vsContext_lo.VertexID = fetchInfo_lo.VertexID;
                vsContext_hi.VertexID = fetchInfo_lo.VertexID2;
#endif
#else
                vsContext_lo.VertexID = fetchInfo_lo.VertexID;
                vsContext_hi.VertexID = fetchInfo_hi.VertexID;
#endif
                // Setup active mask for vertex shader.
#if USE_SIMD16_VS
                vsContext_lo.mask16 = GenerateMask16(endVertex - i);
#else
                vsContext_lo.mask = GenerateMask(endVertex - i);
                vsContext_hi.mask = GenerateMask(endVertex - (i + KNOB_SIMD_WIDTH));
#endif
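                // The hi mask is only consumed when (i + KNOB_SIMD_WIDTH) < endVertex,
                // mirroring the guard around the hi-half invocations above and below.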
                // forward cut mask to the PA
                if (IsIndexedT::value)
                {
#if USE_SIMD16_SHADERS
                    *pvCutIndices_lo = _simd_movemask_ps(_simd_castsi_ps(fetchInfo_lo.CutMask));
                    *pvCutIndices_hi = _simd_movemask_ps(_simd_castsi_ps(fetchInfo_lo.CutMask2));
#else
                    *pvCutIndices_lo = _simd_movemask_ps(_simd_castsi_ps(fetchInfo_lo.CutMask));
                    *pvCutIndices_hi = _simd_movemask_ps(_simd_castsi_ps(fetchInfo_hi.CutMask));
#endif
                }

                UPDATE_STAT_FE(IaVertices, GetNumInvocations(i, endVertex));
#if KNOB_ENABLE_TOSS_POINTS
                if (!KNOB_TOSS_FETCH)
#endif
                {
                    RDTSC_BEGIN(pContext->pBucketMgr, FEVertexShader, pDC->drawId);
#if USE_SIMD16_VS
                    state.pfnVertexFunc(GetPrivateState(pDC), pWorkerData, &vsContext_lo);
                    AR_EVENT(VSStats((HANDLE)&vsContext_lo.stats));
#else
                    state.pfnVertexFunc(GetPrivateState(pDC), pWorkerData, &vsContext_lo);
                    AR_EVENT(VSStats((HANDLE)&vsContext_lo.stats));

                    if ((i + KNOB_SIMD_WIDTH) < endVertex) // 1/2 of KNOB_SIMD16_WIDTH
                    {
                        state.pfnVertexFunc(GetPrivateState(pDC), pWorkerData, &vsContext_hi);
                        AR_EVENT(VSStats((HANDLE)&vsContext_hi.stats));
                    }
#endif
                    RDTSC_END(pContext->pBucketMgr, FEVertexShader, 0);

                    UPDATE_STAT_FE(VsInvocations, GetNumInvocations(i, endVertex));
                }
            }
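            // Drain the PA: keep assembling primitives from the shaded vertices
            // until NextPrim() reports this SIMD's worth of work is exhausted.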
            // 2. Assemble primitives given the last two SIMD.
            do
            {
                simd16vector prim_simd16[MAX_NUM_VERTS_PER_PRIM];

                RDTSC_START(pContext->pBucketMgr, FEPAAssemble);
                bool assemble = pa.Assemble(VERTEX_POSITION_SLOT, prim_simd16);
                RDTSC_STOP(pContext->pBucketMgr, FEPAAssemble, 1, 0);

#if KNOB_ENABLE_TOSS_POINTS
                if (!KNOB_TOSS_FETCH)
#endif
                {
#if KNOB_ENABLE_TOSS_POINTS
                    if (!KNOB_TOSS_IA)
#endif
                    {
                        UPDATE_STAT_FE(IaPrimitives, pa.NumPrims());

                        if (assemble)
                        {
                            const uint32_t numPrims = pa.NumPrims();
                            const uint32_t numPrims_lo =
                                std::min<uint32_t>(numPrims, KNOB_SIMD_WIDTH);
                            const uint32_t numPrims_hi =
                                std::max<uint32_t>(numPrims, KNOB_SIMD_WIDTH) - KNOB_SIMD_WIDTH;

                            const simd16scalari primID    = pa.GetPrimID(work.startPrimID);
                            const simdscalari   primID_lo = _simd16_extract_si(primID, 0);
                            const simdscalari   primID_hi = _simd16_extract_si(primID, 1);
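                            // The SIMD8 tessellation/GS stages below consume the simd16
                            // primitive set one half at a time, using the lo/hi counts
                            // and primitive IDs computed above. (gsBuffers and
                            // pSoPrimData are assumed to be the GS scratch and streamout
                            // buffers set up earlier in ProcessDraw.)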
                            if (HasTessellationT::value)
                            {
                                pa.useAlternateOffset = false;
                                TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>(
                                    pDC, workerId, pa, &gsBuffers, pSoPrimData, numPrims_lo, primID_lo);

                                if (numPrims_hi)
                                {
                                    pa.useAlternateOffset = true;
                                    TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>(
                                        pDC, workerId, pa, &gsBuffers, pSoPrimData, numPrims_hi, primID_hi);
                                }
                            }
                            else if (HasGeometryShaderT::value)
                            {
                                pa.useAlternateOffset = false;
                                GeometryShaderStage<HasStreamOutT, HasRastT>(
                                    pDC, workerId, pa, &gsBuffers, pSoPrimData, numPrims_lo, primID_lo);

                                if (numPrims_hi)
                                {
                                    pa.useAlternateOffset = true;
                                    GeometryShaderStage<HasStreamOutT, HasRastT>(
                                        pDC, workerId, pa, &gsBuffers, pSoPrimData, numPrims_hi, primID_hi);
                                }
                            }
                            // If streamout is enabled then stream vertices out to memory.
                            if (HasStreamOutT::value)
                            {
                                pa.useAlternateOffset = false;
                                StreamOut(pDC, pa, workerId, pSoPrimData, 0);
                            }
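                            // useAlternateOffset selects which simd8 half of the simd16
                            // PA output the 8-wide consumers read; StreamOut and the
                            // simd16 binner path start from the lo half.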
                            if (HasRastT::value)
                            {
                                SWR_ASSERT(pDC->pState->pfnProcessPrims_simd16);

                                // Gather data from the SVG if provided.
                                simd16scalari vpai = SIMD16::setzero_si();
                                simd16scalari rtai = SIMD16::setzero_si();
                                SIMD16::Vec4  svgAttrib[4];

                                if (state.backendState.readViewportArrayIndex ||
                                    state.backendState.readRenderTargetArrayIndex)
                                {
                                    pa.Assemble(VERTEX_SGV_SLOT, svgAttrib);
                                }

                                if (state.backendState.readViewportArrayIndex)
                                {
                                    vpai = SIMD16::castps_si(svgAttrib[0][VERTEX_SGV_VAI_COMP]);
                                    pa.viewportArrayActive = true;
                                }
                                if (state.backendState.readRenderTargetArrayIndex)
                                {
                                    rtai = SIMD16::castps_si(svgAttrib[0][VERTEX_SGV_RTAI_COMP]);
                                    pa.rtArrayActive = true;
                                }

                                // OOB VPAI indices => forced to zero.
                                vpai = SIMD16::max_epi32(vpai, SIMD16::setzero_si());
                                simd16scalari vNumViewports =
                                    SIMD16::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
                                simd16scalari vClearMask =
                                    SIMD16::cmplt_epi32(vpai, vNumViewports);
                                vpai = SIMD16::and_si(vClearMask, vpai);
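                                // Net effect of the clamp: max() removes negative
                                // indices, and the cmplt/and pair zeroes any lane whose
                                // index is >= KNOB_NUM_VIEWPORTS_SCISSORS.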
                                pa.useAlternateOffset = false;
                                pDC->pState->pfnProcessPrims_simd16(pDC,
                                                                    workerId,
                                                                    pa,
                                                                    &prim_simd16[0],
                                                                    GenMask(numPrims),
                                                                    primID,
                                                                    vpai,
                                                                    rtai);
                            }
                        }
                    }
                }
            } while (pa.NextPrim());

            if (IsIndexedT::value)
            {
                fetchInfo_lo.xpIndices = fetchInfo_lo.xpIndices + KNOB_SIMD16_WIDTH * indexSize;
                fetchInfo_hi.xpIndices = fetchInfo_hi.xpIndices + KNOB_SIMD16_WIDTH * indexSize;
            }
            else
            {
                vIndex = _simd16_add_epi32(vIndex, _simd16_set1_epi32(KNOB_SIMD16_WIDTH));
            }

            i += KNOB_SIMD16_WIDTH;
        }
    }

#else // USE_SIMD16_FRONTEND
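    // 8-wide fallback: without the simd16 front end, the same fetch -> vertex
    // shade -> assemble -> bin pipeline runs KNOB_SIMD_WIDTH vertices at a time
    // through a single fetch/VS context.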
    SWR_VS_CONTEXT    vsContext;
    SWR_FETCH_CONTEXT fetchInfo = {0};

    fetchInfo.pStreams      = &state.vertexBuffers[0];
    fetchInfo.StartInstance = work.startInstance;
    fetchInfo.StartVertex   = 0;
    if (IsIndexedT::value)
    {
        fetchInfo.BaseVertex = work.baseVertex;

        // if the entire index buffer isn't being consumed, set the last index
        // so that fetches < a SIMD wide will be masked off
        fetchInfo.pLastIndex =
            (const int32_t*)(((uint8_t*)state.indexBuffer.pIndices) + state.indexBuffer.size);
        if (xpLastRequestedIndex < fetchInfo.pLastIndex)
        {
            fetchInfo.pLastIndex = xpLastRequestedIndex;
        }
    }
    else
    {
        fetchInfo.StartVertex = work.startVertex;
    }
    const simdscalari vScale = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);

    /// @todo: temporarily move instance loop in the FE to ensure SO ordering
    for (uint32_t instanceNum = 0; instanceNum < work.numInstances; instanceNum++)
    {
        simdscalari vIndex;
        uint32_t    i = 0;

        if (IsIndexedT::value)
        {
            fetchInfo.pIndices = work.pIB;
        }
        else
        {
            vIndex             = _simd_add_epi32(_simd_set1_epi32(work.startVertexID), vScale);
            fetchInfo.pIndices = (const int32_t*)&vIndex;
        }

        fetchInfo.CurInstance = instanceNum;
        vsContext.InstanceID  = instanceNum;
        while (pa.HasWork())
        {
            // GetNextVsOutput currently has the side effect of updating some PA state machine
            // state. So we need to keep this outside of (i < endVertex) check.
            simdmask* pvCutIndices = nullptr;
            if (IsIndexedT::value)
            {
                pvCutIndices = &pa.GetNextVsIndices();
            }

            simdvertex& vout = pa.GetNextVsOutput();
            vsContext.pVin   = &vout;
            vsContext.pVout  = &vout;
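            // pVin and pVout alias the same simdvertex: the fetch shader writes
            // vout below, and the vertex shader then transforms it in place.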
            if (i < endVertex)
            {
                // 1. Execute FS/VS for a single SIMD.
                RDTSC_BEGIN(pContext->pBucketMgr, FEFetchShader, pDC->drawId);
                state.pfnFetchFunc(GetPrivateState(pDC), pWorkerData, fetchInfo, vout);
                RDTSC_END(pContext->pBucketMgr, FEFetchShader, 0);

                // forward fetch generated vertex IDs to the vertex shader
                vsContext.VertexID = fetchInfo.VertexID;

                // Setup active mask for vertex shader.
                vsContext.mask = GenerateMask(endVertex - i);

                // forward cut mask to the PA
                if (IsIndexedT::value)
                {
                    *pvCutIndices = _simd_movemask_ps(_simd_castsi_ps(fetchInfo.CutMask));
                }

                UPDATE_STAT_FE(IaVertices, GetNumInvocations(i, endVertex));

#if KNOB_ENABLE_TOSS_POINTS
                if (!KNOB_TOSS_FETCH)
#endif
                {
                    RDTSC_BEGIN(pContext->pBucketMgr, FEVertexShader, pDC->drawId);
                    state.pfnVertexFunc(GetPrivateState(pDC), pWorkerData, &vsContext);
                    RDTSC_END(pContext->pBucketMgr, FEVertexShader, 0);

                    UPDATE_STAT_FE(VsInvocations, GetNumInvocations(i, endVertex));
                    AR_EVENT(VSStats((HANDLE)&vsContext.stats));
                }
            }
            // 2. Assemble primitives given the last two SIMD.
            do
            {
                simdvector prim[MAX_NUM_VERTS_PER_PRIM];
                // PaAssemble returns false if there is not enough verts to assemble.
                RDTSC_BEGIN(pContext->pBucketMgr, FEPAAssemble, pDC->drawId);
                bool assemble = pa.Assemble(VERTEX_POSITION_SLOT, prim);
                RDTSC_END(pContext->pBucketMgr, FEPAAssemble, 1);

#if KNOB_ENABLE_TOSS_POINTS
                if (!KNOB_TOSS_FETCH)
#endif
                {
#if KNOB_ENABLE_TOSS_POINTS
                    if (!KNOB_TOSS_IA)
#endif
                    {
                        UPDATE_STAT_FE(IaPrimitives, pa.NumPrims());

                        if (assemble)
                        {
                            if (HasTessellationT::value)
                            {
                                TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>(
                                    pDC,
                                    workerId,
                                    pa,
                                    &gsBuffers,
                                    pSoPrimData,
                                    pa.GetPrimID(work.startPrimID));
                            }
                            else if (HasGeometryShaderT::value)
                            {
                                GeometryShaderStage<HasStreamOutT, HasRastT>(
                                    pDC,
                                    workerId,
                                    pa,
                                    &gsBuffers,
                                    pSoPrimData,
                                    pa.GetPrimID(work.startPrimID));
                            }
                            // If streamout is enabled then stream vertices out to memory.
                            if (HasStreamOutT::value)
                            {
                                StreamOut(pDC, pa, workerId, pSoPrimData, 0);
                            }
                            if (HasRastT::value)
                            {
                                SWR_ASSERT(pDC->pState->pfnProcessPrims);

                                // Gather data from the SVG if provided.
                                simdscalari vViewportIdx = SIMD::setzero_si();
                                simdscalari vRtIdx       = SIMD::setzero_si();
                                SIMD::Vec4  svgAttrib[4];

                                if (state.backendState.readViewportArrayIndex ||
                                    state.backendState.readRenderTargetArrayIndex)
                                {
                                    pa.Assemble(VERTEX_SGV_SLOT, svgAttrib);
                                }

                                if (state.backendState.readViewportArrayIndex)
                                {
                                    vViewportIdx =
                                        SIMD::castps_si(svgAttrib[0][VERTEX_SGV_VAI_COMP]);

                                    // OOB VPAI indices => forced to zero.
                                    vViewportIdx =
                                        SIMD::max_epi32(vViewportIdx, SIMD::setzero_si());
                                    simdscalari vNumViewports =
                                        SIMD::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS);
                                    simdscalari vClearMask =
                                        SIMD::cmplt_epi32(vViewportIdx, vNumViewports);
                                    vViewportIdx = SIMD::and_si(vClearMask, vViewportIdx);
                                    pa.viewportArrayActive = true;
                                }
                                if (state.backendState.readRenderTargetArrayIndex)
                                {
                                    vRtIdx = SIMD::castps_si(svgAttrib[0][VERTEX_SGV_RTAI_COMP]);
                                    pa.rtArrayActive = true;
                                }
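                                // Hand the assembled primitives to the binner front end;
                                // pfnProcessPrims clips and bins them to macrotiles for
                                // the backend.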
                                pDC->pState->pfnProcessPrims(pDC,
                                                             workerId,
                                                             pa,
                                                             &prim[0],
                                                             GenMask(pa.NumPrims()),
                                                             pa.GetPrimID(work.startPrimID),
                                                             vViewportIdx,
                                                             vRtIdx);
                            }
                        }
                    }
                }
            } while (pa.NextPrim());
            if (IsIndexedT::value)
            {
                fetchInfo.pIndices =
                    (int*)((uint8_t*)fetchInfo.pIndices + KNOB_SIMD_WIDTH * indexSize);
            }
            else
            {
                vIndex = _simd_add_epi32(vIndex, _simd_set1_epi32(KNOB_SIMD_WIDTH));
            }

            i += KNOB_SIMD_WIDTH;
        }
    }
#endif // USE_SIMD16_FRONTEND

    RDTSC_END(pContext->pBucketMgr, FEProcessDraw, numPrims * work.numInstances);
}
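//////////////////////////////////////////////////////////////////////////
/// @brief Helper for GetProcessDrawFunc; TemplateArgUnroller turns the
///        runtime bools into template arguments and calls GetFunc to pick
///        the matching ProcessDraw instantiation.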
struct FEDrawChooser
{
    typedef PFN_FE_WORK_FUNC FuncType;

    template <typename... ArgsB>
    static FuncType GetFunc()
    {
        return ProcessDraw<ArgsB...>;
    }
};
// Selector for correct templated Draw front-end function
PFN_FE_WORK_FUNC GetProcessDrawFunc(bool IsIndexed,
                                    bool IsCutIndexEnabled,
                                    bool HasTessellation,
                                    bool HasGeometryShader,
                                    bool HasStreamOut,
                                    bool HasRasterization)
{
    return TemplateArgUnroller<FEDrawChooser>::GetFunc(IsIndexed,
                                                       IsCutIndexEnabled,
                                                       HasTessellation,
                                                       HasGeometryShader,
                                                       HasStreamOut,
                                                       HasRasterization);
}