1 /****************************************************************************
2 * Copyright (C) 2014-2018 Intel Corporation. All Rights Reserved.
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
25 * @brief Implementation for Frontend which handles vertex processing,
26 * primitive assembly, clipping, binning, etc.
28 ******************************************************************************/
34 #include "rdtsc_core.h"
40 #include "tessellator.h"
44 //////////////////////////////////////////////////////////////////////////
45 /// @brief FE handler for SwrSync.
46 /// @param pContext - pointer to SWR context.
47 /// @param pDC - pointer to draw context.
48 /// @param workerId - thread's worker id. Even thread has a unique id.
49 /// @param pUserData - Pointer to user data passed back to sync callback.
50 /// @todo This should go away when we switch this to use compute threading.
51 void ProcessSync(SWR_CONTEXT
* pContext
, DRAW_CONTEXT
* pDC
, uint32_t workerId
, void* pUserData
)
55 work
.pfnWork
= ProcessSyncBE
;
57 MacroTileMgr
* pTileMgr
= pDC
->pTileMgr
;
58 pTileMgr
->enqueue(0, 0, &work
);
61 //////////////////////////////////////////////////////////////////////////
62 /// @brief FE handler for SwrDestroyContext.
63 /// @param pContext - pointer to SWR context.
64 /// @param pDC - pointer to draw context.
65 /// @param workerId - thread's worker id. Even thread has a unique id.
66 /// @param pUserData - Pointer to user data passed back to sync callback.
67 void ProcessShutdown(SWR_CONTEXT
* pContext
, DRAW_CONTEXT
* pDC
, uint32_t workerId
, void* pUserData
)
71 work
.pfnWork
= ProcessShutdownBE
;
73 MacroTileMgr
* pTileMgr
= pDC
->pTileMgr
;
74 // Enqueue at least 1 work item for each worker thread
75 // account for number of numa nodes
76 uint32_t numNumaNodes
= pContext
->threadPool
.numaMask
+ 1;
78 for (uint32_t i
= 0; i
< pContext
->threadPool
.numThreads
; ++i
)
80 for (uint32_t n
= 0; n
< numNumaNodes
; ++n
)
82 pTileMgr
->enqueue(i
, n
, &work
);
87 //////////////////////////////////////////////////////////////////////////
88 /// @brief FE handler for SwrClearRenderTarget.
89 /// @param pContext - pointer to SWR context.
90 /// @param pDC - pointer to draw context.
91 /// @param workerId - thread's worker id. Even thread has a unique id.
92 /// @param pUserData - Pointer to user data passed back to clear callback.
93 /// @todo This should go away when we switch this to use compute threading.
94 void ProcessClear(SWR_CONTEXT
* pContext
, DRAW_CONTEXT
* pDC
, uint32_t workerId
, void* pUserData
)
96 CLEAR_DESC
* pDesc
= (CLEAR_DESC
*)pUserData
;
97 MacroTileMgr
* pTileMgr
= pDC
->pTileMgr
;
99 // queue a clear to each macro tile
100 // compute macro tile bounds for the specified rect
101 uint32_t macroTileXMin
= pDesc
->rect
.xmin
/ KNOB_MACROTILE_X_DIM
;
102 uint32_t macroTileXMax
= (pDesc
->rect
.xmax
- 1) / KNOB_MACROTILE_X_DIM
;
103 uint32_t macroTileYMin
= pDesc
->rect
.ymin
/ KNOB_MACROTILE_Y_DIM
;
104 uint32_t macroTileYMax
= (pDesc
->rect
.ymax
- 1) / KNOB_MACROTILE_Y_DIM
;
108 work
.pfnWork
= ProcessClearBE
;
109 work
.desc
.clear
= *pDesc
;
111 for (uint32_t y
= macroTileYMin
; y
<= macroTileYMax
; ++y
)
113 for (uint32_t x
= macroTileXMin
; x
<= macroTileXMax
; ++x
)
115 pTileMgr
->enqueue(x
, y
, &work
);
120 //////////////////////////////////////////////////////////////////////////
121 /// @brief FE handler for SwrStoreTiles.
122 /// @param pContext - pointer to SWR context.
123 /// @param pDC - pointer to draw context.
124 /// @param workerId - thread's worker id. Even thread has a unique id.
125 /// @param pUserData - Pointer to user data passed back to callback.
126 /// @todo This should go away when we switch this to use compute threading.
127 void ProcessStoreTiles(SWR_CONTEXT
* pContext
, DRAW_CONTEXT
* pDC
, uint32_t workerId
, void* pUserData
)
129 RDTSC_BEGIN(pContext
->pBucketMgr
, FEProcessStoreTiles
, pDC
->drawId
);
130 MacroTileMgr
* pTileMgr
= pDC
->pTileMgr
;
131 STORE_TILES_DESC
* pDesc
= (STORE_TILES_DESC
*)pUserData
;
133 // queue a store to each macro tile
134 // compute macro tile bounds for the specified rect
135 uint32_t macroTileXMin
= pDesc
->rect
.xmin
/ KNOB_MACROTILE_X_DIM
;
136 uint32_t macroTileXMax
= (pDesc
->rect
.xmax
- 1) / KNOB_MACROTILE_X_DIM
;
137 uint32_t macroTileYMin
= pDesc
->rect
.ymin
/ KNOB_MACROTILE_Y_DIM
;
138 uint32_t macroTileYMax
= (pDesc
->rect
.ymax
- 1) / KNOB_MACROTILE_Y_DIM
;
142 work
.type
= STORETILES
;
143 work
.pfnWork
= ProcessStoreTilesBE
;
144 work
.desc
.storeTiles
= *pDesc
;
146 for (uint32_t y
= macroTileYMin
; y
<= macroTileYMax
; ++y
)
148 for (uint32_t x
= macroTileXMin
; x
<= macroTileXMax
; ++x
)
150 pTileMgr
->enqueue(x
, y
, &work
);
154 RDTSC_END(pContext
->pBucketMgr
, FEProcessStoreTiles
, 0);
157 //////////////////////////////////////////////////////////////////////////
158 /// @brief FE handler for SwrInvalidateTiles.
159 /// @param pContext - pointer to SWR context.
160 /// @param pDC - pointer to draw context.
161 /// @param workerId - thread's worker id. Even thread has a unique id.
162 /// @param pUserData - Pointer to user data passed back to callback.
163 /// @todo This should go away when we switch this to use compute threading.
164 void ProcessDiscardInvalidateTiles(SWR_CONTEXT
* pContext
,
169 RDTSC_BEGIN(pContext
->pBucketMgr
, FEProcessInvalidateTiles
, pDC
->drawId
);
170 DISCARD_INVALIDATE_TILES_DESC
* pDesc
= (DISCARD_INVALIDATE_TILES_DESC
*)pUserData
;
171 MacroTileMgr
* pTileMgr
= pDC
->pTileMgr
;
173 // compute macro tile bounds for the specified rect
174 uint32_t macroTileXMin
= (pDesc
->rect
.xmin
+ KNOB_MACROTILE_X_DIM
- 1) / KNOB_MACROTILE_X_DIM
;
175 uint32_t macroTileXMax
= (pDesc
->rect
.xmax
/ KNOB_MACROTILE_X_DIM
) - 1;
176 uint32_t macroTileYMin
= (pDesc
->rect
.ymin
+ KNOB_MACROTILE_Y_DIM
- 1) / KNOB_MACROTILE_Y_DIM
;
177 uint32_t macroTileYMax
= (pDesc
->rect
.ymax
/ KNOB_MACROTILE_Y_DIM
) - 1;
179 if (pDesc
->fullTilesOnly
== false)
181 // include partial tiles
182 macroTileXMin
= pDesc
->rect
.xmin
/ KNOB_MACROTILE_X_DIM
;
183 macroTileXMax
= (pDesc
->rect
.xmax
- 1) / KNOB_MACROTILE_X_DIM
;
184 macroTileYMin
= pDesc
->rect
.ymin
/ KNOB_MACROTILE_Y_DIM
;
185 macroTileYMax
= (pDesc
->rect
.ymax
- 1) / KNOB_MACROTILE_Y_DIM
;
188 SWR_ASSERT(macroTileXMax
<= KNOB_NUM_HOT_TILES_X
);
189 SWR_ASSERT(macroTileYMax
<= KNOB_NUM_HOT_TILES_Y
);
191 macroTileXMax
= std::min
<int32_t>(macroTileXMax
, KNOB_NUM_HOT_TILES_X
);
192 macroTileYMax
= std::min
<int32_t>(macroTileYMax
, KNOB_NUM_HOT_TILES_Y
);
196 work
.type
= DISCARDINVALIDATETILES
;
197 work
.pfnWork
= ProcessDiscardInvalidateTilesBE
;
198 work
.desc
.discardInvalidateTiles
= *pDesc
;
200 for (uint32_t x
= macroTileXMin
; x
<= macroTileXMax
; ++x
)
202 for (uint32_t y
= macroTileYMin
; y
<= macroTileYMax
; ++y
)
204 pTileMgr
->enqueue(x
, y
, &work
);
208 RDTSC_END(pContext
->pBucketMgr
, FEProcessInvalidateTiles
, 0);
211 //////////////////////////////////////////////////////////////////////////
212 /// @brief Computes the number of primitives given the number of verts.
213 /// @param mode - primitive topology for draw operation.
214 /// @param numPrims - number of vertices or indices for draw.
215 /// @todo Frontend needs to be refactored. This will go in appropriate place then.
216 uint32_t GetNumPrims(PRIMITIVE_TOPOLOGY mode
, uint32_t numPrims
)
222 case TOP_TRIANGLE_LIST
:
224 case TOP_TRIANGLE_STRIP
:
225 return numPrims
< 3 ? 0 : numPrims
- 2;
226 case TOP_TRIANGLE_FAN
:
227 return numPrims
< 3 ? 0 : numPrims
- 2;
228 case TOP_TRIANGLE_DISC
:
229 return numPrims
< 2 ? 0 : numPrims
- 1;
233 return numPrims
< 4 ? 0 : (numPrims
- 2) / 2;
235 return numPrims
< 2 ? 0 : numPrims
- 1;
242 case TOP_LINE_LIST_ADJ
:
244 case TOP_LISTSTRIP_ADJ
:
245 return numPrims
< 3 ? 0 : numPrims
- 3;
246 case TOP_TRI_LIST_ADJ
:
248 case TOP_TRI_STRIP_ADJ
:
249 return numPrims
< 4 ? 0 : (numPrims
/ 2) - 2;
251 case TOP_PATCHLIST_1
:
252 case TOP_PATCHLIST_2
:
253 case TOP_PATCHLIST_3
:
254 case TOP_PATCHLIST_4
:
255 case TOP_PATCHLIST_5
:
256 case TOP_PATCHLIST_6
:
257 case TOP_PATCHLIST_7
:
258 case TOP_PATCHLIST_8
:
259 case TOP_PATCHLIST_9
:
260 case TOP_PATCHLIST_10
:
261 case TOP_PATCHLIST_11
:
262 case TOP_PATCHLIST_12
:
263 case TOP_PATCHLIST_13
:
264 case TOP_PATCHLIST_14
:
265 case TOP_PATCHLIST_15
:
266 case TOP_PATCHLIST_16
:
267 case TOP_PATCHLIST_17
:
268 case TOP_PATCHLIST_18
:
269 case TOP_PATCHLIST_19
:
270 case TOP_PATCHLIST_20
:
271 case TOP_PATCHLIST_21
:
272 case TOP_PATCHLIST_22
:
273 case TOP_PATCHLIST_23
:
274 case TOP_PATCHLIST_24
:
275 case TOP_PATCHLIST_25
:
276 case TOP_PATCHLIST_26
:
277 case TOP_PATCHLIST_27
:
278 case TOP_PATCHLIST_28
:
279 case TOP_PATCHLIST_29
:
280 case TOP_PATCHLIST_30
:
281 case TOP_PATCHLIST_31
:
282 case TOP_PATCHLIST_32
:
283 return numPrims
/ (mode
- TOP_PATCHLIST_BASE
);
286 case TOP_POINT_LIST_BF
:
287 case TOP_LINE_STRIP_CONT
:
288 case TOP_LINE_STRIP_BF
:
289 case TOP_LINE_STRIP_CONT_BF
:
290 case TOP_TRIANGLE_FAN_NOSTIPPLE
:
291 case TOP_TRI_STRIP_REVERSE
:
292 case TOP_PATCHLIST_BASE
:
294 SWR_INVALID("Unsupported topology: %d", mode
);
301 //////////////////////////////////////////////////////////////////////////
302 /// @brief Computes the number of verts given the number of primitives.
303 /// @param mode - primitive topology for draw operation.
304 /// @param numPrims - number of primitives for draw.
305 uint32_t GetNumVerts(PRIMITIVE_TOPOLOGY mode
, uint32_t numPrims
)
311 case TOP_TRIANGLE_LIST
:
313 case TOP_TRIANGLE_STRIP
:
314 return numPrims
? numPrims
+ 2 : 0;
315 case TOP_TRIANGLE_FAN
:
316 return numPrims
? numPrims
+ 2 : 0;
317 case TOP_TRIANGLE_DISC
:
318 return numPrims
? numPrims
+ 1 : 0;
322 return numPrims
? numPrims
* 2 + 2 : 0;
324 return numPrims
? numPrims
+ 1 : 0;
331 case TOP_LINE_LIST_ADJ
:
333 case TOP_LISTSTRIP_ADJ
:
334 return numPrims
? numPrims
+ 3 : 0;
335 case TOP_TRI_LIST_ADJ
:
337 case TOP_TRI_STRIP_ADJ
:
338 return numPrims
? (numPrims
+ 2) * 2 : 0;
340 case TOP_PATCHLIST_1
:
341 case TOP_PATCHLIST_2
:
342 case TOP_PATCHLIST_3
:
343 case TOP_PATCHLIST_4
:
344 case TOP_PATCHLIST_5
:
345 case TOP_PATCHLIST_6
:
346 case TOP_PATCHLIST_7
:
347 case TOP_PATCHLIST_8
:
348 case TOP_PATCHLIST_9
:
349 case TOP_PATCHLIST_10
:
350 case TOP_PATCHLIST_11
:
351 case TOP_PATCHLIST_12
:
352 case TOP_PATCHLIST_13
:
353 case TOP_PATCHLIST_14
:
354 case TOP_PATCHLIST_15
:
355 case TOP_PATCHLIST_16
:
356 case TOP_PATCHLIST_17
:
357 case TOP_PATCHLIST_18
:
358 case TOP_PATCHLIST_19
:
359 case TOP_PATCHLIST_20
:
360 case TOP_PATCHLIST_21
:
361 case TOP_PATCHLIST_22
:
362 case TOP_PATCHLIST_23
:
363 case TOP_PATCHLIST_24
:
364 case TOP_PATCHLIST_25
:
365 case TOP_PATCHLIST_26
:
366 case TOP_PATCHLIST_27
:
367 case TOP_PATCHLIST_28
:
368 case TOP_PATCHLIST_29
:
369 case TOP_PATCHLIST_30
:
370 case TOP_PATCHLIST_31
:
371 case TOP_PATCHLIST_32
:
372 return numPrims
* (mode
- TOP_PATCHLIST_BASE
);
375 case TOP_POINT_LIST_BF
:
376 case TOP_LINE_STRIP_CONT
:
377 case TOP_LINE_STRIP_BF
:
378 case TOP_LINE_STRIP_CONT_BF
:
379 case TOP_TRIANGLE_FAN_NOSTIPPLE
:
380 case TOP_TRI_STRIP_REVERSE
:
381 case TOP_PATCHLIST_BASE
:
383 SWR_INVALID("Unsupported topology: %d", mode
);
390 //////////////////////////////////////////////////////////////////////////
391 /// @brief Return number of verts per primitive.
392 /// @param topology - topology
393 /// @param includeAdjVerts - include adjacent verts in primitive vertices
394 uint32_t NumVertsPerPrim(PRIMITIVE_TOPOLOGY topology
, bool includeAdjVerts
)
396 uint32_t numVerts
= 0;
400 case TOP_POINT_LIST_BF
:
405 case TOP_LINE_LIST_ADJ
:
407 case TOP_LINE_STRIP_CONT
:
408 case TOP_LINE_STRIP_BF
:
409 case TOP_LISTSTRIP_ADJ
:
412 case TOP_TRIANGLE_LIST
:
413 case TOP_TRIANGLE_STRIP
:
414 case TOP_TRIANGLE_FAN
:
415 case TOP_TRI_LIST_ADJ
:
416 case TOP_TRI_STRIP_ADJ
:
417 case TOP_TRI_STRIP_REVERSE
:
425 case TOP_PATCHLIST_1
:
426 case TOP_PATCHLIST_2
:
427 case TOP_PATCHLIST_3
:
428 case TOP_PATCHLIST_4
:
429 case TOP_PATCHLIST_5
:
430 case TOP_PATCHLIST_6
:
431 case TOP_PATCHLIST_7
:
432 case TOP_PATCHLIST_8
:
433 case TOP_PATCHLIST_9
:
434 case TOP_PATCHLIST_10
:
435 case TOP_PATCHLIST_11
:
436 case TOP_PATCHLIST_12
:
437 case TOP_PATCHLIST_13
:
438 case TOP_PATCHLIST_14
:
439 case TOP_PATCHLIST_15
:
440 case TOP_PATCHLIST_16
:
441 case TOP_PATCHLIST_17
:
442 case TOP_PATCHLIST_18
:
443 case TOP_PATCHLIST_19
:
444 case TOP_PATCHLIST_20
:
445 case TOP_PATCHLIST_21
:
446 case TOP_PATCHLIST_22
:
447 case TOP_PATCHLIST_23
:
448 case TOP_PATCHLIST_24
:
449 case TOP_PATCHLIST_25
:
450 case TOP_PATCHLIST_26
:
451 case TOP_PATCHLIST_27
:
452 case TOP_PATCHLIST_28
:
453 case TOP_PATCHLIST_29
:
454 case TOP_PATCHLIST_30
:
455 case TOP_PATCHLIST_31
:
456 case TOP_PATCHLIST_32
:
457 numVerts
= topology
- TOP_PATCHLIST_BASE
;
460 SWR_INVALID("Unsupported topology: %d", topology
);
468 case TOP_LISTSTRIP_ADJ
:
469 case TOP_LINE_LIST_ADJ
:
472 case TOP_TRI_STRIP_ADJ
:
473 case TOP_TRI_LIST_ADJ
:
484 //////////////////////////////////////////////////////////////////////////
485 /// @brief Generate mask from remaining work.
486 /// @param numWorkItems - Number of items being worked on by a SIMD.
487 static INLINE simdscalari
GenerateMask(uint32_t numItemsRemaining
)
490 (numItemsRemaining
>= KNOB_SIMD_WIDTH
) ? KNOB_SIMD_WIDTH
: numItemsRemaining
;
491 uint32_t mask
= (numActive
> 0) ? ((1 << numActive
) - 1) : 0;
492 return _simd_castps_si(_simd_vmask_ps(mask
));
495 static INLINE simd16scalari
GenerateMask16(uint32_t numItemsRemaining
)
498 (numItemsRemaining
>= KNOB_SIMD16_WIDTH
) ? KNOB_SIMD16_WIDTH
: numItemsRemaining
;
499 uint32_t mask
= (numActive
> 0) ? ((1 << numActive
) - 1) : 0;
500 return _simd16_castps_si(_simd16_vmask_ps(mask
));
503 //////////////////////////////////////////////////////////////////////////
504 /// @brief StreamOut - Streams vertex data out to SO buffers.
505 /// Generally, we are only streaming out a SIMDs worth of triangles.
506 /// @param pDC - pointer to draw context.
507 /// @param workerId - thread's worker id. Even thread has a unique id.
508 /// @param numPrims - Number of prims to streamout (e.g. points, lines, tris)
509 static void StreamOut(
510 DRAW_CONTEXT
* pDC
, PA_STATE
& pa
, uint32_t workerId
, uint32_t* pPrimData
, uint32_t streamIndex
)
512 RDTSC_BEGIN(pDC
->pContext
->pBucketMgr
, FEStreamout
, pDC
->drawId
);
514 void* pWorkerData
= pDC
->pContext
->threadPool
.pThreadData
[workerId
].pWorkerPrivateData
;
516 const API_STATE
& state
= GetApiState(pDC
);
517 const SWR_STREAMOUT_STATE
& soState
= state
.soState
;
519 uint32_t soVertsPerPrim
= NumVertsPerPrim(pa
.binTopology
, false);
521 // The pPrimData buffer is sparse in that we allocate memory for all 32 attributes for each
523 uint32_t primDataDwordVertexStride
= (SWR_VTX_NUM_SLOTS
* sizeof(float) * 4) / sizeof(uint32_t);
525 SWR_STREAMOUT_CONTEXT soContext
= {0};
527 // Setup buffer state pointers.
528 for (uint32_t i
= 0; i
< 4; ++i
)
530 soContext
.pBuffer
[i
] = &state
.soBuffer
[i
];
533 uint32_t numPrims
= pa
.NumPrims();
535 for (uint32_t primIndex
= 0; primIndex
< numPrims
; ++primIndex
)
538 uint64_t soMask
= soState
.streamMasks
[streamIndex
];
540 // Write all entries into primitive data buffer for SOS.
541 while (_BitScanForward64(&slot
, soMask
))
543 simd4scalar attrib
[MAX_NUM_VERTS_PER_PRIM
]; // prim attribs (always 4 wide)
544 uint32_t paSlot
= slot
+ soState
.vertexAttribOffset
[streamIndex
];
545 pa
.AssembleSingle(paSlot
, primIndex
, attrib
);
547 // Attribute offset is relative offset from start of vertex.
548 // Note that attributes start at slot 1 in the PA buffer. We need to write this
549 // to prim data starting at slot 0. Which is why we do (slot - 1).
550 // Also note: GL works slightly differently, and needs slot 0
551 uint32_t primDataAttribOffset
= slot
* sizeof(float) * 4 / sizeof(uint32_t);
553 // Store each vertex's attrib at appropriate locations in pPrimData buffer.
554 for (uint32_t v
= 0; v
< soVertsPerPrim
; ++v
)
556 uint32_t* pPrimDataAttrib
=
557 pPrimData
+ primDataAttribOffset
+ (v
* primDataDwordVertexStride
);
559 _mm_store_ps((float*)pPrimDataAttrib
, attrib
[v
]);
562 soMask
&= ~(uint64_t(1) << slot
);
565 // Update pPrimData pointer
566 soContext
.pPrimData
= pPrimData
;
569 SWR_ASSERT(state
.pfnSoFunc
[streamIndex
] != nullptr,
570 "Trying to execute uninitialized streamout jit function.");
571 state
.pfnSoFunc
[streamIndex
](GetPrivateState(pDC
), pWorkerData
, soContext
);
574 // Update SO write offset. The driver provides memory for the update.
575 for (uint32_t i
= 0; i
< 4; ++i
)
577 if (state
.soBuffer
[i
].pWriteOffset
)
579 bool nullTileAccessed
= false;
580 void* pWriteOffset
= pDC
->pContext
->pfnTranslateGfxptrForWrite(
581 GetPrivateState(pDC
), soContext
.pBuffer
[i
]->pWriteOffset
, &nullTileAccessed
, pWorkerData
);
582 *((uint32_t*)pWriteOffset
) = soContext
.pBuffer
[i
]->streamOffset
* sizeof(uint32_t);
585 if (state
.soBuffer
[i
].soWriteEnable
)
587 pDC
->dynState
.SoWriteOffset
[i
] = soContext
.pBuffer
[i
]->streamOffset
* sizeof(uint32_t);
588 pDC
->dynState
.SoWriteOffsetDirty
[i
] = true;
592 pDC
->dynState
.soPrims
+= soContext
.numPrimsWritten
;
594 UPDATE_STAT_FE(SoPrimStorageNeeded
[streamIndex
], soContext
.numPrimStorageNeeded
);
595 UPDATE_STAT_FE(SoNumPrimsWritten
[streamIndex
], soContext
.numPrimsWritten
);
597 RDTSC_END(pDC
->pContext
->pBucketMgr
, FEStreamout
, 1);
600 #if USE_SIMD16_FRONTEND
601 //////////////////////////////////////////////////////////////////////////
602 /// Is value an even number (a multiple of two)
604 template <typename T
>
605 INLINE
static bool IsEven(T value
)
607 return (value
& 1) == 0;
610 //////////////////////////////////////////////////////////////////////////
611 /// Round up value to an even number (a multiple of two)
613 template <typename T
>
614 INLINE
static T
RoundUpEven(T value
)
616 return (value
+ 1) & ~1;
619 //////////////////////////////////////////////////////////////////////////
620 /// Round down value to an even number (a multiple of two)
622 template <typename T
>
623 INLINE
static T
RoundDownEven(T value
)
628 //////////////////////////////////////////////////////////////////////////
629 /// Pack pairs of simdvertexes into simd16vertexes, assume non-overlapping
631 /// vertexCount is in terms of the source simdvertexes and must be even
633 /// attribCount will limit the vector copies to those attribs specified
635 /// note: the stride between vertexes is determinded by SWR_VTX_NUM_SLOTS
637 void PackPairsOfSimdVertexIntoSimd16Vertex(simd16vertex
* vertex_simd16
,
638 const simdvertex
* vertex
,
639 uint32_t vertexCount
,
640 uint32_t attribCount
)
643 SWR_ASSERT(vertex_simd16
);
644 SWR_ASSERT(attribCount
<= SWR_VTX_NUM_SLOTS
);
648 for (uint32_t i
= 0; i
< vertexCount
; i
+= 2)
650 for (uint32_t j
= 0; j
< attribCount
; j
+= 1)
652 for (uint32_t k
= 0; k
< 4; k
+= 1)
655 _simd16_insert_ps(_simd16_setzero_ps(), vertex
[i
].attrib
[j
][k
], 0);
657 if ((i
+ 1) < vertexCount
)
660 _simd16_insert_ps(temp
.attrib
[j
][k
], vertex
[i
+ 1].attrib
[j
][k
], 1);
665 for (uint32_t j
= 0; j
< attribCount
; j
+= 1)
667 vertex_simd16
[i
>> 1].attrib
[j
] = temp
.attrib
[j
];
673 //////////////////////////////////////////////////////////////////////////
674 /// @brief Computes number of invocations. The current index represents
675 /// the start of the SIMD. The max index represents how much work
676 /// items are remaining. If there is less then a SIMD's xmin of work
677 /// then return the remaining amount of work.
678 /// @param curIndex - The start index for the SIMD.
679 /// @param maxIndex - The last index for all work items.
680 static INLINE
uint32_t GetNumInvocations(uint32_t curIndex
, uint32_t maxIndex
)
682 uint32_t remainder
= (maxIndex
- curIndex
);
683 #if USE_SIMD16_FRONTEND
684 return (remainder
>= KNOB_SIMD16_WIDTH
) ? KNOB_SIMD16_WIDTH
: remainder
;
686 return (remainder
>= KNOB_SIMD_WIDTH
) ? KNOB_SIMD_WIDTH
: remainder
;
690 //////////////////////////////////////////////////////////////////////////
691 /// @brief Converts a streamId buffer to a cut buffer for the given stream id.
692 /// The geometry shader will loop over each active streamout buffer, assembling
693 /// primitives for the downstream stages. When multistream output is enabled,
694 /// the generated stream ID buffer from the GS needs to be converted to a cut
695 /// buffer for the primitive assembler.
696 /// @param stream - stream id to generate the cut buffer for
697 /// @param pStreamIdBase - pointer to the stream ID buffer
698 /// @param numEmittedVerts - Number of total verts emitted by the GS
699 /// @param pCutBuffer - output buffer to write cuts to
700 void ProcessStreamIdBuffer(uint32_t stream
,
701 uint8_t* pStreamIdBase
,
702 uint32_t numEmittedVerts
,
705 SWR_ASSERT(stream
< MAX_SO_STREAMS
);
707 uint32_t numOutputBytes
= AlignUp(numEmittedVerts
, 8) / 8;
709 for (uint32_t b
= 0; b
< numOutputBytes
; ++b
)
711 uint8_t curInputByte
= pStreamIdBase
[2 * b
];
713 for (uint32_t i
= 0; i
< 4; ++i
)
715 if ((curInputByte
& 0x3) != stream
)
722 curInputByte
= pStreamIdBase
[2 * b
+ 1];
723 for (uint32_t i
= 0; i
< 4; ++i
)
725 if ((curInputByte
& 0x3) != stream
)
727 outByte
|= (1 << (i
+ 4));
732 *pCutBuffer
++ = outByte
;
736 // Buffers that are allocated if GS is enabled
740 uint8_t* pGsOut
[KNOB_SIMD_WIDTH
];
741 uint8_t* pGsTransposed
;
742 void* pStreamCutBuffer
;
745 //////////////////////////////////////////////////////////////////////////
746 /// @brief Transposes GS output from SOA to AOS to feed the primitive assembler
747 /// @param pDst - Destination buffer in AOS form for the current SIMD width, fed into the primitive
749 /// @param pSrc - Buffer of vertices in SOA form written by the geometry shader
750 /// @param numVerts - Number of vertices outputted by the GS
751 /// @param numAttribs - Number of attributes per vertex
752 template <typename SIMD_T
, uint32_t SimdWidth
>
753 void TransposeSOAtoAOS(uint8_t* pDst
, uint8_t* pSrc
, uint32_t numVerts
, uint32_t numAttribs
)
755 uint32_t srcVertexStride
= numAttribs
* sizeof(float) * 4;
756 uint32_t dstVertexStride
= numAttribs
* sizeof(Float
<SIMD_T
>) * 4;
758 OSALIGNSIMD16(uint32_t) gatherOffsets
[SimdWidth
];
760 for (uint32_t i
= 0; i
< SimdWidth
; ++i
)
762 gatherOffsets
[i
] = srcVertexStride
* i
;
764 auto vGatherOffsets
= SIMD_T::load_si((Integer
<SIMD_T
>*)&gatherOffsets
[0]);
766 uint32_t numSimd
= AlignUp(numVerts
, SimdWidth
) / SimdWidth
;
767 uint32_t remainingVerts
= numVerts
;
769 for (uint32_t s
= 0; s
< numSimd
; ++s
)
771 uint8_t* pSrcBase
= pSrc
+ s
* srcVertexStride
* SimdWidth
;
772 uint8_t* pDstBase
= pDst
+ s
* dstVertexStride
;
774 // Compute mask to prevent src overflow
775 uint32_t mask
= std::min(remainingVerts
, SimdWidth
);
776 mask
= GenMask(mask
);
777 auto vMask
= SIMD_T::vmask_ps(mask
);
778 auto viMask
= SIMD_T::castps_si(vMask
);
780 for (uint32_t a
= 0; a
< numAttribs
; ++a
)
782 auto attribGatherX
= SIMD_T::mask_i32gather_ps(
783 SIMD_T::setzero_ps(), (const float*)pSrcBase
, vGatherOffsets
, vMask
);
784 auto attribGatherY
= SIMD_T::mask_i32gather_ps(SIMD_T::setzero_ps(),
785 (const float*)(pSrcBase
+ sizeof(float)),
789 SIMD_T::mask_i32gather_ps(SIMD_T::setzero_ps(),
790 (const float*)(pSrcBase
+ sizeof(float) * 2),
794 SIMD_T::mask_i32gather_ps(SIMD_T::setzero_ps(),
795 (const float*)(pSrcBase
+ sizeof(float) * 3),
799 SIMD_T::maskstore_ps((float*)pDstBase
, viMask
, attribGatherX
);
800 SIMD_T::maskstore_ps((float*)(pDstBase
+ sizeof(Float
<SIMD_T
>)), viMask
, attribGatherY
);
801 SIMD_T::maskstore_ps(
802 (float*)(pDstBase
+ sizeof(Float
<SIMD_T
>) * 2), viMask
, attribGatherZ
);
803 SIMD_T::maskstore_ps(
804 (float*)(pDstBase
+ sizeof(Float
<SIMD_T
>) * 3), viMask
, attribGatherW
);
806 pSrcBase
+= sizeof(float) * 4;
807 pDstBase
+= sizeof(Float
<SIMD_T
>) * 4;
809 remainingVerts
-= SimdWidth
;
814 //////////////////////////////////////////////////////////////////////////
815 /// @brief Implements GS stage.
816 /// @param pDC - pointer to draw context.
817 /// @param workerId - thread's worker id. Even thread has a unique id.
818 /// @param pa - The primitive assembly object.
819 /// @param pGsOut - output stream for GS
820 template <typename HasStreamOutT
, typename HasRastT
>
821 static void GeometryShaderStage(DRAW_CONTEXT
* pDC
,
824 GsBuffers
* pGsBuffers
,
825 uint32_t* pSoPrimData
,
826 #if USE_SIMD16_FRONTEND
827 uint32_t numPrims_simd8
,
829 simdscalari
const& primID
)
831 RDTSC_BEGIN(pDC
->pContext
->pBucketMgr
, FEGeometryShader
, pDC
->drawId
);
833 void* pWorkerData
= pDC
->pContext
->threadPool
.pThreadData
[workerId
].pWorkerPrivateData
;
835 const API_STATE
& state
= GetApiState(pDC
);
836 const SWR_GS_STATE
* pState
= &state
.gsState
;
837 SWR_GS_CONTEXT gsContext
;
839 static uint8_t sNullBuffer
[128] = {0};
841 for (uint32_t i
= 0; i
< KNOB_SIMD_WIDTH
; ++i
)
843 gsContext
.pStreams
[i
] = pGsBuffers
->pGsOut
[i
];
845 gsContext
.pVerts
= (simdvector
*)pGsBuffers
->pGsIn
;
846 gsContext
.PrimitiveID
= primID
;
848 uint32_t numVertsPerPrim
= NumVertsPerPrim(pa
.binTopology
, true);
849 simdvector attrib
[MAX_NUM_VERTS_PER_PRIM
];
851 // assemble all attributes for the input primitive
852 gsContext
.inputVertStride
= pState
->inputVertStride
;
853 for (uint32_t slot
= 0; slot
< pState
->numInputAttribs
; ++slot
)
855 uint32_t attribOffset
= slot
+ pState
->vertexAttribOffset
;
856 pa
.Assemble(attribOffset
, attrib
);
858 for (uint32_t i
= 0; i
< numVertsPerPrim
; ++i
)
860 gsContext
.pVerts
[attribOffset
+ pState
->inputVertStride
* i
] = attrib
[i
];
864 // record valid prims from the frontend to avoid over binning the newly generated
866 #if USE_SIMD16_FRONTEND
867 uint32_t numInputPrims
= numPrims_simd8
;
869 uint32_t numInputPrims
= pa
.NumPrims();
872 for (uint32_t instance
= 0; instance
< pState
->instanceCount
; ++instance
)
874 gsContext
.InstanceID
= instance
;
875 gsContext
.mask
= GenerateMask(numInputPrims
);
877 // execute the geometry shader
878 state
.pfnGsFunc(GetPrivateState(pDC
), pWorkerData
, &gsContext
);
879 AR_EVENT(GSStats((HANDLE
)&gsContext
.stats
));
881 for (uint32_t i
= 0; i
< KNOB_SIMD_WIDTH
; ++i
)
883 gsContext
.pStreams
[i
] += pState
->allocationSize
;
887 // set up new binner and state for the GS output topology
888 #if USE_SIMD16_FRONTEND
889 PFN_PROCESS_PRIMS_SIMD16 pfnClipFunc
= nullptr;
892 switch (pState
->outputTopology
)
895 pfnClipFunc
= ClipRectangles_simd16
;
897 case TOP_TRIANGLE_STRIP
:
898 pfnClipFunc
= ClipTriangles_simd16
;
901 pfnClipFunc
= ClipLines_simd16
;
904 pfnClipFunc
= ClipPoints_simd16
;
907 SWR_INVALID("Unexpected GS output topology: %d", pState
->outputTopology
);
912 PFN_PROCESS_PRIMS pfnClipFunc
= nullptr;
915 switch (pState
->outputTopology
)
918 pfnClipFunc
= ClipRectangles
;
920 case TOP_TRIANGLE_STRIP
:
921 pfnClipFunc
= ClipTriangles
;
924 pfnClipFunc
= ClipLines
;
927 pfnClipFunc
= ClipPoints
;
930 SWR_INVALID("Unexpected GS output topology: %d", pState
->outputTopology
);
935 // foreach input prim:
936 // - setup a new PA based on the emitted verts for that prim
937 // - loop over the new verts, calling PA to assemble each prim
938 uint32_t* pPrimitiveId
= (uint32_t*)&primID
;
940 uint32_t totalPrimsGenerated
= 0;
941 for (uint32_t inputPrim
= 0; inputPrim
< numInputPrims
; ++inputPrim
)
943 uint8_t* pInstanceBase
= (uint8_t*)pGsBuffers
->pGsOut
[inputPrim
];
945 // Vertex count is either emitted by shader or static
946 uint32_t vertexCount
= 0;
947 if (pState
->staticVertexCount
)
949 vertexCount
= pState
->staticVertexCount
;
953 // If emitted in shader, it should be the stored in the first dword of the output buffer
954 vertexCount
= *(uint32_t*)pInstanceBase
;
957 for (uint32_t instance
= 0; instance
< pState
->instanceCount
; ++instance
)
959 uint32_t numEmittedVerts
= vertexCount
;
960 if (numEmittedVerts
== 0)
965 uint8_t* pBase
= pInstanceBase
+ instance
* pState
->allocationSize
;
967 pState
->controlDataSize
== 0 ? &sNullBuffer
[0] : pBase
+ pState
->controlDataOffset
;
968 uint8_t* pVertexBaseAOS
= pBase
+ pState
->outputVertexOffset
;
970 #if USE_SIMD16_FRONTEND
971 TransposeSOAtoAOS
<SIMD512
, KNOB_SIMD16_WIDTH
>((uint8_t*)pGsBuffers
->pGsTransposed
,
974 pState
->outputVertexSize
);
976 TransposeSOAtoAOS
<SIMD256
, KNOB_SIMD_WIDTH
>((uint8_t*)pGsBuffers
->pGsTransposed
,
979 pState
->outputVertexSize
);
982 uint32_t numAttribs
= state
.feNumAttributes
;
984 for (uint32_t stream
= 0; stream
< MAX_SO_STREAMS
; ++stream
)
986 bool processCutVerts
= false;
987 uint8_t* pCutBuffer
= pCutBase
;
989 // assign default stream ID, only relevant when GS is outputting a single stream
990 uint32_t streamID
= 0;
991 if (pState
->isSingleStream
)
993 processCutVerts
= true;
994 streamID
= pState
->singleStreamID
;
995 if (streamID
!= stream
)
1000 // early exit if this stream is not enabled for streamout
1001 if (HasStreamOutT::value
&& !state
.soState
.streamEnable
[stream
])
1006 // multi-stream output, need to translate StreamID buffer to a cut buffer
1007 ProcessStreamIdBuffer(
1008 stream
, pCutBase
, numEmittedVerts
, (uint8_t*)pGsBuffers
->pStreamCutBuffer
);
1009 pCutBuffer
= (uint8_t*)pGsBuffers
->pStreamCutBuffer
;
1010 processCutVerts
= false;
1013 #if USE_SIMD16_FRONTEND
1014 PA_STATE_CUT
gsPa(pDC
,
1015 (uint8_t*)pGsBuffers
->pGsTransposed
,
1017 pState
->outputVertexSize
,
1018 reinterpret_cast<simd16mask
*>(pCutBuffer
),
1021 pState
->outputTopology
,
1023 pa
.numVertsPerPrim
);
1026 PA_STATE_CUT
gsPa(pDC
,
1027 (uint8_t*)pGsBuffers
->pGsTransposed
,
1029 pState
->outputVertexSize
,
1033 pState
->outputTopology
,
1035 pa
.numVertsPerPrim
);
1038 while (gsPa
.GetNextStreamOutput())
1042 #if USE_SIMD16_FRONTEND
1043 simd16vector attrib_simd16
[3];
1045 bool assemble
= gsPa
.Assemble(VERTEX_POSITION_SLOT
, attrib_simd16
);
1048 bool assemble
= gsPa
.Assemble(VERTEX_POSITION_SLOT
, attrib
);
1053 totalPrimsGenerated
+= gsPa
.NumPrims();
1055 if (HasStreamOutT::value
)
1057 #if ENABLE_AVX512_SIMD16
1058 gsPa
.useAlternateOffset
= false;
1060 StreamOut(pDC
, gsPa
, workerId
, pSoPrimData
, stream
);
1063 if (HasRastT::value
&& state
.soState
.streamToRasterizer
== stream
)
1065 #if USE_SIMD16_FRONTEND
1066 simd16scalari vPrimId
= _simd16_set1_epi32(pPrimitiveId
[inputPrim
]);
1068 // Gather data from the SVG if provided.
1069 simd16scalari vViewportIdx
= SIMD16::setzero_si();
1070 simd16scalari vRtIdx
= SIMD16::setzero_si();
1071 SIMD16::Vec4 svgAttrib
[4];
1073 if (state
.backendState
.readViewportArrayIndex
||
1074 state
.backendState
.readRenderTargetArrayIndex
)
1076 gsPa
.Assemble(VERTEX_SGV_SLOT
, svgAttrib
);
1079 if (state
.backendState
.readViewportArrayIndex
)
1082 SIMD16::castps_si(svgAttrib
[0][VERTEX_SGV_VAI_COMP
]);
1083 gsPa
.viewportArrayActive
= true;
1085 if (state
.backendState
.readRenderTargetArrayIndex
)
1087 vRtIdx
= SIMD16::castps_si(svgAttrib
[0][VERTEX_SGV_RTAI_COMP
]);
1088 gsPa
.rtArrayActive
= true;
1092 // OOB VPAI indices => forced to zero.
1094 SIMD16::max_epi32(vViewportIdx
, SIMD16::setzero_si());
1095 simd16scalari vNumViewports
=
1096 SIMD16::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS
);
1097 simd16scalari vClearMask
=
1098 SIMD16::cmplt_epi32(vViewportIdx
, vNumViewports
);
1099 vViewportIdx
= SIMD16::and_si(vClearMask
, vViewportIdx
);
1101 gsPa
.useAlternateOffset
= false;
1106 GenMask(gsPa
.NumPrims()),
1112 simdscalari vPrimId
= _simd_set1_epi32(pPrimitiveId
[inputPrim
]);
1114 // Gather data from the SVG if provided.
1115 simdscalari vViewportIdx
= SIMD::setzero_si();
1116 simdscalari vRtIdx
= SIMD::setzero_si();
1117 SIMD::Vec4 svgAttrib
[4];
1119 if (state
.backendState
.readViewportArrayIndex
||
1120 state
.backendState
.readRenderTargetArrayIndex
)
1122 gsPa
.Assemble(VERTEX_SGV_SLOT
, svgAttrib
);
1125 if (state
.backendState
.readViewportArrayIndex
)
1128 SIMD::castps_si(svgAttrib
[0][VERTEX_SGV_VAI_COMP
]);
1130 // OOB VPAI indices => forced to zero.
1132 SIMD::max_epi32(vViewportIdx
, SIMD::setzero_si());
1133 simdscalari vNumViewports
=
1134 SIMD::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS
);
1135 simdscalari vClearMask
=
1136 SIMD::cmplt_epi32(vViewportIdx
, vNumViewports
);
1137 vViewportIdx
= SIMD::and_si(vClearMask
, vViewportIdx
);
1138 gsPa
.viewportArrayActive
= true;
1140 if (state
.backendState
.readRenderTargetArrayIndex
)
1142 vRtIdx
= SIMD::castps_si(svgAttrib
[0][VERTEX_SGV_RTAI_COMP
]);
1143 gsPa
.rtArrayActive
= true;
1150 GenMask(gsPa
.NumPrims()),
1157 } while (gsPa
.NextPrim());
1163 // update GS pipeline stats
1164 UPDATE_STAT_FE(GsInvocations
, numInputPrims
* pState
->instanceCount
);
1165 UPDATE_STAT_FE(GsPrimitives
, totalPrimsGenerated
);
1166 AR_EVENT(GSPrimInfo(numInputPrims
, totalPrimsGenerated
, numVertsPerPrim
* numInputPrims
));
1167 RDTSC_END(pDC
->pContext
->pBucketMgr
, FEGeometryShader
, 1);
1170 //////////////////////////////////////////////////////////////////////////
1171 /// @brief Allocate GS buffers
1172 /// @param pDC - pointer to draw context.
1173 /// @param state - API state
1174 /// @param ppGsOut - pointer to GS output buffer allocation
1175 /// @param ppCutBuffer - pointer to GS output cut buffer allocation
1176 template <typename SIMD_T
, uint32_t SIMD_WIDTH
>
1177 static INLINE
void AllocateGsBuffers(DRAW_CONTEXT
* pDC
,
1178 const API_STATE
& state
,
1179 uint32_t vertsPerPrim
,
1180 GsBuffers
* pGsBuffers
)
1182 auto pArena
= pDC
->pArena
;
1183 SWR_ASSERT(pArena
!= nullptr);
1184 SWR_ASSERT(state
.gsState
.gsEnable
);
1186 const SWR_GS_STATE
& gsState
= state
.gsState
;
1188 // Allocate storage for vertex inputs
1189 uint32_t vertexInBufferSize
= gsState
.inputVertStride
* sizeof(simdvector
) * vertsPerPrim
;
1190 pGsBuffers
->pGsIn
= (uint8_t*)pArena
->AllocAligned(vertexInBufferSize
, 32);
1192 // Allocate arena space to hold GS output verts
1193 const uint32_t vertexBufferSize
= gsState
.instanceCount
* gsState
.allocationSize
;
1195 for (uint32_t i
= 0; i
< KNOB_SIMD_WIDTH
; ++i
)
1197 pGsBuffers
->pGsOut
[i
] = (uint8_t*)pArena
->AllocAligned(vertexBufferSize
, 32);
1200 // Allocate storage for transposed GS output
1201 uint32_t numSimdBatches
= AlignUp(gsState
.maxNumVerts
, SIMD_WIDTH
) / SIMD_WIDTH
;
1202 uint32_t transposedBufferSize
=
1203 numSimdBatches
* gsState
.outputVertexSize
* sizeof(Vec4
<SIMD_T
>);
1204 pGsBuffers
->pGsTransposed
= (uint8_t*)pArena
->AllocAligned(transposedBufferSize
, 32);
1206 // Allocate storage to hold temporary stream->cut buffer, if necessary
1207 if (state
.gsState
.isSingleStream
)
1209 pGsBuffers
->pStreamCutBuffer
= nullptr;
1213 pGsBuffers
->pStreamCutBuffer
=
1214 (uint8_t*)pArena
->AllocAligned(AlignUp(gsState
.maxNumVerts
* 2, 32), 32);
1218 //////////////////////////////////////////////////////////////////////////
1219 /// @brief Contains all data generated by the HS and passed to the
1220 /// tessellator and DS.
1221 struct TessellationThreadLocalData
1223 SWR_HS_CONTEXT hsContext
;
1228 size_t hsOutputAllocSize
;
1230 simdscalar
* pDSOutput
;
1231 size_t dsOutputAllocSize
;
1234 THREAD TessellationThreadLocalData
* gt_pTessellationThreadData
= nullptr;
1236 //////////////////////////////////////////////////////////////////////////
1237 /// @brief Allocate tessellation data for this worker thread.
1239 static void AllocateTessellationData(SWR_CONTEXT
* pContext
)
1241 /// @TODO - Don't use thread local storage. Use Worker local storage instead.
1242 if (gt_pTessellationThreadData
== nullptr)
1244 gt_pTessellationThreadData
=
1245 (TessellationThreadLocalData
*)AlignedMalloc(sizeof(TessellationThreadLocalData
), 64);
1246 memset((void*)gt_pTessellationThreadData
, 0, sizeof(*gt_pTessellationThreadData
));
1250 //////////////////////////////////////////////////////////////////////////
1251 /// @brief Implements Tessellation Stages.
1252 /// @param pDC - pointer to draw context.
1253 /// @param workerId - thread's worker id. Even thread has a unique id.
1254 /// @param pa - The primitive assembly object.
1255 /// @param pGsOut - output stream for GS
1256 template <typename HasGeometryShaderT
, typename HasStreamOutT
, typename HasRastT
>
1257 static void TessellationStages(DRAW_CONTEXT
* pDC
,
1260 GsBuffers
* pGsBuffers
,
1261 uint32_t* pSoPrimData
,
1262 #if USE_SIMD16_FRONTEND
1263 uint32_t numPrims_simd8
,
1265 simdscalari
const& primID
)
1267 const API_STATE
& state
= GetApiState(pDC
);
1268 const SWR_TS_STATE
& tsState
= state
.tsState
;
1269 void* pWorkerData
= pDC
->pContext
->threadPool
.pThreadData
[workerId
].pWorkerPrivateData
;
1271 SWR_ASSERT(gt_pTessellationThreadData
);
1273 HANDLE tsCtx
= TSInitCtx(tsState
.domain
,
1274 tsState
.partitioning
,
1275 tsState
.tsOutputTopology
,
1276 gt_pTessellationThreadData
->pTxCtx
,
1277 gt_pTessellationThreadData
->tsCtxSize
);
1278 if (tsCtx
== nullptr)
1280 gt_pTessellationThreadData
->pTxCtx
=
1281 AlignedMalloc(gt_pTessellationThreadData
->tsCtxSize
, 64);
1282 tsCtx
= TSInitCtx(tsState
.domain
,
1283 tsState
.partitioning
,
1284 tsState
.tsOutputTopology
,
1285 gt_pTessellationThreadData
->pTxCtx
,
1286 gt_pTessellationThreadData
->tsCtxSize
);
1290 #if USE_SIMD16_FRONTEND
1291 PFN_PROCESS_PRIMS_SIMD16 pfnClipFunc
= nullptr;
1292 if (HasRastT::value
)
1294 switch (tsState
.postDSTopology
)
1296 case TOP_TRIANGLE_LIST
:
1297 pfnClipFunc
= ClipTriangles_simd16
;
1300 pfnClipFunc
= ClipLines_simd16
;
1302 case TOP_POINT_LIST
:
1303 pfnClipFunc
= ClipPoints_simd16
;
1306 SWR_INVALID("Unexpected DS output topology: %d", tsState
.postDSTopology
);
1311 PFN_PROCESS_PRIMS pfnClipFunc
= nullptr;
1312 if (HasRastT::value
)
1314 switch (tsState
.postDSTopology
)
1316 case TOP_TRIANGLE_LIST
:
1317 pfnClipFunc
= ClipTriangles
;
1320 pfnClipFunc
= ClipLines
;
1322 case TOP_POINT_LIST
:
1323 pfnClipFunc
= ClipPoints
;
1326 SWR_INVALID("Unexpected DS output topology: %d", tsState
.postDSTopology
);
1331 SWR_HS_CONTEXT
& hsContext
= gt_pTessellationThreadData
->hsContext
;
1332 hsContext
.PrimitiveID
= primID
;
1333 hsContext
.outputSize
= tsState
.hsAllocationSize
;
1335 uint32_t numVertsPerPrim
= NumVertsPerPrim(pa
.binTopology
, false);
1336 // Max storage for one attribute for an entire simdprimitive
1337 simdvector simdattrib
[MAX_NUM_VERTS_PER_PRIM
];
1339 // Assemble position separately
1340 // TESS_TODO: this could be avoided - fix it
1341 pa
.Assemble(VERTEX_POSITION_SLOT
, simdattrib
);
1342 for (uint32_t i
= 0; i
< numVertsPerPrim
; ++i
) {
1343 hsContext
.vert
[i
].attrib
[VERTEX_POSITION_SLOT
] = simdattrib
[i
];
1346 // assemble all attributes for the input primitives
1347 for (uint32_t slot
= 0; slot
< tsState
.numHsInputAttribs
; ++slot
)
1349 uint32_t attribSlot
= tsState
.srcVertexAttribOffset
+ slot
;
1350 pa
.Assemble(attribSlot
, simdattrib
);
1352 for (uint32_t i
= 0; i
< numVertsPerPrim
; ++i
)
1354 hsContext
.vert
[i
].attrib
[tsState
.vertexAttribOffset
+ slot
] = simdattrib
[i
];
1358 // Allocate HS output storage
1359 uint32_t requiredAllocSize
= KNOB_SIMD_WIDTH
* tsState
.hsAllocationSize
;
1361 if (requiredAllocSize
> gt_pTessellationThreadData
->hsOutputAllocSize
)
1363 AlignedFree(gt_pTessellationThreadData
->pHSOutput
);
1364 gt_pTessellationThreadData
->pHSOutput
= (uint8_t*)AlignedMalloc(requiredAllocSize
, 64);
1365 gt_pTessellationThreadData
->hsOutputAllocSize
= requiredAllocSize
;
1368 hsContext
.pCPout
= (ScalarPatch
*)gt_pTessellationThreadData
->pHSOutput
;
1371 //memset(hsContext.pCPout, 0x90, sizeof(ScalarPatch) * KNOB_SIMD_WIDTH);
1373 memset(hsContext
.pCPout
, 0x90, sizeof(ScalarPatch
) * KNOB_SIMD_WIDTH
);
1375 #if USE_SIMD16_FRONTEND
1376 uint32_t numPrims
= numPrims_simd8
;
1378 uint32_t numPrims
= pa
.NumPrims();
1380 hsContext
.mask
= GenerateMask(numPrims
);
1383 RDTSC_BEGIN(pDC
->pContext
->pBucketMgr
, FEHullShader
, pDC
->drawId
);
1384 state
.pfnHsFunc(GetPrivateState(pDC
), pWorkerData
, &hsContext
);
1385 RDTSC_END(pDC
->pContext
->pBucketMgr
, FEHullShader
, 0);
1387 UPDATE_STAT_FE(HsInvocations
, numPrims
);
1388 AR_EVENT(HSStats((HANDLE
)&hsContext
.stats
));
1390 const uint32_t* pPrimId
= (const uint32_t*)&primID
;
1392 for (uint32_t p
= 0; p
< numPrims
; ++p
)
1394 ScalarPatch
* pCPout
= (ScalarPatch
*)(gt_pTessellationThreadData
->pHSOutput
+ tsState
.hsAllocationSize
* p
);
1396 SWR_TESSELLATION_FACTORS tessFactors
;
1397 tessFactors
= hsContext
.pCPout
[p
].tessFactors
;
1400 SWR_TS_TESSELLATED_DATA tsData
= {0};
1401 RDTSC_BEGIN(pDC
->pContext
->pBucketMgr
, FETessellation
, pDC
->drawId
);
1402 TSTessellate(tsCtx
, tessFactors
, tsData
);
1403 AR_EVENT(TessPrimCount(1));
1404 RDTSC_END(pDC
->pContext
->pBucketMgr
, FETessellation
, 0);
1406 if (tsData
.NumPrimitives
== 0)
1410 SWR_ASSERT(tsData
.NumDomainPoints
);
1412 // Allocate DS Output memory
1413 uint32_t requiredDSVectorInvocations
=
1414 AlignUp(tsData
.NumDomainPoints
, KNOB_SIMD_WIDTH
) / KNOB_SIMD_WIDTH
;
1415 #if USE_SIMD16_FRONTEND
1416 size_t requiredAllocSize
= sizeof(simdvector
) * RoundUpEven(requiredDSVectorInvocations
) *
1417 tsState
.dsAllocationSize
; // simd8 -> simd16, padding
1419 size_t requiredDSOutputVectors
= requiredDSVectorInvocations
* tsState
.dsAllocationSize
;
1420 size_t requiredAllocSize
= sizeof(simdvector
) * requiredDSOutputVectors
;
1422 if (requiredAllocSize
> gt_pTessellationThreadData
->dsOutputAllocSize
)
1424 AlignedFree(gt_pTessellationThreadData
->pDSOutput
);
1425 gt_pTessellationThreadData
->pDSOutput
=
1426 (simdscalar
*)AlignedMalloc(requiredAllocSize
, 64);
1427 gt_pTessellationThreadData
->dsOutputAllocSize
= requiredAllocSize
;
1429 SWR_ASSERT(gt_pTessellationThreadData
->pDSOutput
);
1430 SWR_ASSERT(gt_pTessellationThreadData
->dsOutputAllocSize
>= requiredAllocSize
);
1433 memset(gt_pTessellationThreadData
->pDSOutput
, 0x90, requiredAllocSize
);
1436 // Run Domain Shader
1437 SWR_DS_CONTEXT dsContext
;
1438 dsContext
.PrimitiveID
= pPrimId
[p
];
1439 dsContext
.pCpIn
= pCPout
;
1440 dsContext
.pDomainU
= (simdscalar
*)tsData
.pDomainPointsU
;
1441 dsContext
.pDomainV
= (simdscalar
*)tsData
.pDomainPointsV
;
1442 dsContext
.pOutputData
= gt_pTessellationThreadData
->pDSOutput
;
1443 dsContext
.outVertexAttribOffset
= tsState
.dsOutVtxAttribOffset
;
1444 #if USE_SIMD16_FRONTEND
1445 dsContext
.vectorStride
= RoundUpEven(requiredDSVectorInvocations
); // simd8 -> simd16
1447 dsContext
.vectorStride
= requiredDSVectorInvocations
;
1450 uint32_t dsInvocations
= 0;
1452 for (dsContext
.vectorOffset
= 0; dsContext
.vectorOffset
< requiredDSVectorInvocations
;
1453 ++dsContext
.vectorOffset
)
1455 dsContext
.mask
= GenerateMask(tsData
.NumDomainPoints
- dsInvocations
);
1457 RDTSC_BEGIN(pDC
->pContext
->pBucketMgr
, FEDomainShader
, pDC
->drawId
);
1458 state
.pfnDsFunc(GetPrivateState(pDC
), pWorkerData
, &dsContext
);
1459 RDTSC_END(pDC
->pContext
->pBucketMgr
, FEDomainShader
, 0);
1461 AR_EVENT(DSStats((HANDLE
)&dsContext
.stats
));
1463 dsInvocations
+= KNOB_SIMD_WIDTH
;
1465 UPDATE_STAT_FE(DsInvocations
, tsData
.NumDomainPoints
);
1467 #if USE_SIMD16_FRONTEND
1468 SWR_ASSERT(IsEven(dsContext
.vectorStride
)); // simd8 -> simd16
1473 #if USE_SIMD16_FRONTEND
1474 reinterpret_cast<const simd16scalar
*>(dsContext
.pOutputData
), // simd8 -> simd16
1475 dsContext
.vectorStride
/ 2, // simd8 -> simd16
1477 dsContext
.pOutputData
,
1478 dsContext
.vectorStride
,
1481 tsState
.numDsOutputAttribs
+ tsState
.dsOutVtxAttribOffset
,
1483 tsData
.NumPrimitives
,
1484 tsState
.postDSTopology
,
1485 NumVertsPerPrim(tsState
.postDSTopology
, false));
1487 while (tessPa
.HasWork())
1489 #if USE_SIMD16_FRONTEND
1490 const uint32_t numPrims
= tessPa
.NumPrims();
1491 const uint32_t numPrims_lo
= std::min
<uint32_t>(numPrims
, KNOB_SIMD_WIDTH
);
1492 const uint32_t numPrims_hi
=
1493 std::max
<uint32_t>(numPrims
, KNOB_SIMD_WIDTH
) - KNOB_SIMD_WIDTH
;
1495 const simd16scalari primID
= _simd16_set1_epi32(dsContext
.PrimitiveID
);
1496 const simdscalari primID_lo
= _simd16_extract_si(primID
, 0);
1497 const simdscalari primID_hi
= _simd16_extract_si(primID
, 1);
1500 if (HasGeometryShaderT::value
)
1502 #if USE_SIMD16_FRONTEND
1503 tessPa
.useAlternateOffset
= false;
1504 GeometryShaderStage
<HasStreamOutT
, HasRastT
>(
1505 pDC
, workerId
, tessPa
, pGsBuffers
, pSoPrimData
, numPrims_lo
, primID_lo
);
1509 tessPa
.useAlternateOffset
= true;
1510 GeometryShaderStage
<HasStreamOutT
, HasRastT
>(
1511 pDC
, workerId
, tessPa
, pGsBuffers
, pSoPrimData
, numPrims_hi
, primID_hi
);
1514 GeometryShaderStage
<HasStreamOutT
, HasRastT
>(
1520 _simd_set1_epi32(dsContext
.PrimitiveID
));
1525 if (HasStreamOutT::value
)
1527 #if ENABLE_AVX512_SIMD16
1528 tessPa
.useAlternateOffset
= false;
1530 StreamOut(pDC
, tessPa
, workerId
, pSoPrimData
, 0);
1533 if (HasRastT::value
)
1535 #if USE_SIMD16_FRONTEND
1536 simd16vector prim_simd16
[3]; // Only deal with triangles, lines, or points
1538 simdvector prim
[3]; // Only deal with triangles, lines, or points
1540 RDTSC_BEGIN(pDC
->pContext
->pBucketMgr
, FEPAAssemble
, pDC
->drawId
);
1542 #if USE_SIMD16_FRONTEND
1543 tessPa
.Assemble(VERTEX_POSITION_SLOT
, prim_simd16
);
1545 tessPa
.Assemble(VERTEX_POSITION_SLOT
, prim
);
1547 RDTSC_END(pDC
->pContext
->pBucketMgr
, FEPAAssemble
, 1);
1548 SWR_ASSERT(assemble
);
1550 SWR_ASSERT(pfnClipFunc
);
1551 #if USE_SIMD16_FRONTEND
1552 // Gather data from the SVG if provided.
1553 simd16scalari vViewportIdx
= SIMD16::setzero_si();
1554 simd16scalari vRtIdx
= SIMD16::setzero_si();
1555 SIMD16::Vec4 svgAttrib
[4] = {SIMD16::setzero_ps()};
1557 if (state
.backendState
.readViewportArrayIndex
||
1558 state
.backendState
.readRenderTargetArrayIndex
)
1560 tessPa
.Assemble(VERTEX_SGV_SLOT
, svgAttrib
);
1563 if (state
.backendState
.readViewportArrayIndex
)
1565 vViewportIdx
= SIMD16::castps_si(svgAttrib
[0][VERTEX_SGV_VAI_COMP
]);
1566 tessPa
.viewportArrayActive
= true;
1568 if (state
.backendState
.readRenderTargetArrayIndex
)
1570 vRtIdx
= SIMD16::castps_si(svgAttrib
[0][VERTEX_SGV_RTAI_COMP
]);
1571 tessPa
.rtArrayActive
= true;
1576 // OOB VPAI indices => forced to zero.
1577 vViewportIdx
= SIMD16::max_epi32(vViewportIdx
, SIMD16::setzero_si());
1578 simd16scalari vNumViewports
=
1579 SIMD16::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS
);
1580 simd16scalari vClearMask
= SIMD16::cmplt_epi32(vViewportIdx
, vNumViewports
);
1581 vViewportIdx
= SIMD16::and_si(vClearMask
, vViewportIdx
);
1583 tessPa
.useAlternateOffset
= false;
1594 // Gather data from the SGV if provided.
1595 simdscalari vViewportIdx
= SIMD::setzero_si();
1596 simdscalari vRtIdx
= SIMD::setzero_si();
1597 SIMD::Vec4 svgAttrib
[4];
1599 if (state
.backendState
.readViewportArrayIndex
||
1600 state
.backendState
.readRenderTargetArrayIndex
)
1602 tessPa
.Assemble(VERTEX_SGV_SLOT
, svgAttrib
);
1605 if (state
.backendState
.readViewportArrayIndex
)
1607 vViewportIdx
= SIMD::castps_si(svgAttrib
[0][VERTEX_SGV_VAI_COMP
]);
1609 // OOB VPAI indices => forced to zero.
1610 vViewportIdx
= SIMD::max_epi32(vViewportIdx
, SIMD::setzero_si());
1611 simdscalari vNumViewports
= SIMD::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS
);
1612 simdscalari vClearMask
= SIMD::cmplt_epi32(vViewportIdx
, vNumViewports
);
1613 vViewportIdx
= SIMD::and_si(vClearMask
, vViewportIdx
);
1614 tessPa
.viewportArrayActive
= true;
1616 if (state
.backendState
.readRenderTargetArrayIndex
)
1618 vRtIdx
= SIMD::castps_si(svgAttrib
[0][VERTEX_SGV_RTAI_COMP
]);
1619 tessPa
.rtArrayActive
= true;
1625 GenMask(tessPa
.NumPrims()),
1626 _simd_set1_epi32(dsContext
.PrimitiveID
),
1635 } // while (tessPa.HasWork())
1636 } // for (uint32_t p = 0; p < numPrims; ++p)
1638 #if USE_SIMD16_FRONTEND
1639 if (gt_pTessellationThreadData
->pDSOutput
!= nullptr)
1641 AlignedFree(gt_pTessellationThreadData
->pDSOutput
);
1642 gt_pTessellationThreadData
->pDSOutput
= nullptr;
1644 gt_pTessellationThreadData
->dsOutputAllocSize
= 0;
1647 TSDestroyCtx(tsCtx
);
1650 THREAD
PA_STATE::SIMDVERTEX
* gpVertexStore
= nullptr;
1651 THREAD
uint32_t gVertexStoreSize
= 0;
1653 //////////////////////////////////////////////////////////////////////////
1654 /// @brief FE handler for SwrDraw.
1655 /// @tparam IsIndexedT - Is indexed drawing enabled
1656 /// @tparam HasTessellationT - Is tessellation enabled
1657 /// @tparam HasGeometryShaderT::value - Is the geometry shader stage enabled
1658 /// @tparam HasStreamOutT - Is stream-out enabled
1659 /// @tparam HasRastT - Is rasterization enabled
1660 /// @param pContext - pointer to SWR context.
1661 /// @param pDC - pointer to draw context.
1662 /// @param workerId - thread's worker id.
1663 /// @param pUserData - Pointer to DRAW_WORK
1664 template <typename IsIndexedT
,
1665 typename IsCutIndexEnabledT
,
1666 typename HasTessellationT
,
1667 typename HasGeometryShaderT
,
1668 typename HasStreamOutT
,
1670 void ProcessDraw(SWR_CONTEXT
* pContext
, DRAW_CONTEXT
* pDC
, uint32_t workerId
, void* pUserData
)
1672 #if KNOB_ENABLE_TOSS_POINTS
1673 if (KNOB_TOSS_QUEUE_FE
)
1679 RDTSC_BEGIN(pContext
->pBucketMgr
, FEProcessDraw
, pDC
->drawId
);
1681 void* pWorkerData
= pContext
->threadPool
.pThreadData
[workerId
].pWorkerPrivateData
;
1683 DRAW_WORK
& work
= *(DRAW_WORK
*)pUserData
;
1684 const API_STATE
& state
= GetApiState(pDC
);
1686 uint32_t indexSize
= 0;
1687 uint32_t endVertex
= work
.numVerts
;
1689 gfxptr_t xpLastRequestedIndex
= 0;
1690 if (IsIndexedT::value
)
1695 indexSize
= sizeof(uint32_t);
1698 indexSize
= sizeof(uint16_t);
1701 indexSize
= sizeof(uint8_t);
1704 SWR_INVALID("Invalid work.type: %d", work
.type
);
1706 xpLastRequestedIndex
= work
.xpIB
+ endVertex
* indexSize
;
1710 // No cuts, prune partial primitives.
1711 endVertex
= GetNumVerts(state
.topology
, GetNumPrims(state
.topology
, work
.numVerts
));
1714 #if defined(KNOB_ENABLE_RDTSC) || defined(KNOB_ENABLE_AR)
1715 uint32_t numPrims
= GetNumPrims(state
.topology
, work
.numVerts
);
1718 GsBuffers gsBuffers
;
1719 if (HasGeometryShaderT::value
)
1721 #if USE_SIMD16_FRONTEND
1722 AllocateGsBuffers
<SIMD512
, KNOB_SIMD16_WIDTH
>(
1723 pDC
, state
, NumVertsPerPrim(state
.topology
, true), &gsBuffers
);
1725 AllocateGsBuffers
<SIMD256
, KNOB_SIMD_WIDTH
>(
1726 pDC
, state
, NumVertsPerPrim(state
.topology
, true), &gsBuffers
);
1730 if (HasTessellationT::value
)
1732 SWR_ASSERT(state
.tsState
.tsEnable
== true);
1733 SWR_ASSERT(state
.pfnHsFunc
!= nullptr);
1734 SWR_ASSERT(state
.pfnDsFunc
!= nullptr);
1736 AllocateTessellationData(pContext
);
1740 SWR_ASSERT(state
.tsState
.tsEnable
== false);
1741 SWR_ASSERT(state
.pfnHsFunc
== nullptr);
1742 SWR_ASSERT(state
.pfnDsFunc
== nullptr);
1745 // allocate space for streamout input prim data
1746 uint32_t* pSoPrimData
= nullptr;
1747 if (HasStreamOutT::value
)
1749 pSoPrimData
= (uint32_t*)pDC
->pArena
->AllocAligned(4096, 16);
1752 const uint32_t vertexCount
= NumVertsPerPrim(state
.topology
, true);
1753 #if USE_SIMD16_FRONTEND
1754 uint32_t simdVertexSizeBytes
= state
.frontendState
.vsVertexSize
* sizeof(simd16vector
);
1756 uint32_t simdVertexSizeBytes
= state
.frontendState
.vsVertexSize
* sizeof(simdvector
);
1759 SWR_ASSERT(vertexCount
<= MAX_NUM_VERTS_PER_PRIM
);
1761 // Compute storage requirements for vertex store
1762 // TODO: allocation needs to be rethought for better cut support
1763 uint32_t numVerts
= vertexCount
+ 2; // Need extra space for PA state machine
1764 uint32_t vertexStoreSize
= numVerts
* simdVertexSizeBytes
;
1766 // grow the vertex store for the PA as necessary
1767 if (gVertexStoreSize
< vertexStoreSize
)
1769 if (gpVertexStore
!= nullptr)
1771 AlignedFree(gpVertexStore
);
1772 gpVertexStore
= nullptr;
1775 SWR_ASSERT(gpVertexStore
== nullptr);
1777 gpVertexStore
= reinterpret_cast<PA_STATE::SIMDVERTEX
*>(AlignedMalloc(vertexStoreSize
, 64));
1778 gVertexStoreSize
= vertexStoreSize
;
1780 SWR_ASSERT(gpVertexStore
!= nullptr);
1783 // choose primitive assembler
1785 PA_FACTORY
<IsIndexedT
, IsCutIndexEnabledT
> paFactory(pDC
,
1790 state
.frontendState
.vsVertexSize
,
1791 GetNumVerts(state
.topology
, 1));
1792 PA_STATE
& pa
= paFactory
.GetPA();
1794 #if USE_SIMD16_FRONTEND
1795 #if USE_SIMD16_SHADERS
1801 SWR_VS_CONTEXT vsContext_lo
;
1802 SWR_VS_CONTEXT vsContext_hi
;
1804 #if USE_SIMD16_SHADERS
1805 vsContext_lo
.pVin
= reinterpret_cast<simdvertex
*>(&vin
);
1806 vsContext_hi
.pVin
= reinterpret_cast<simdvertex
*>(&vin
);
1808 vsContext_lo
.pVin
= &vin_lo
;
1809 vsContext_hi
.pVin
= &vin_hi
;
1811 vsContext_lo
.AlternateOffset
= 0;
1812 vsContext_hi
.AlternateOffset
= 1;
1814 SWR_FETCH_CONTEXT fetchInfo_lo
= {0};
1816 fetchInfo_lo
.pStreams
= &state
.vertexBuffers
[0];
1817 fetchInfo_lo
.StartInstance
= work
.startInstance
;
1818 fetchInfo_lo
.StartVertex
= 0;
1820 if (IsIndexedT::value
)
1822 fetchInfo_lo
.BaseVertex
= work
.baseVertex
;
1824 // if the entire index buffer isn't being consumed, set the last index
1825 // so that fetches < a SIMD wide will be masked off
1826 fetchInfo_lo
.xpLastIndex
= state
.indexBuffer
.xpIndices
+ state
.indexBuffer
.size
;
1827 if (xpLastRequestedIndex
< fetchInfo_lo
.xpLastIndex
)
1829 fetchInfo_lo
.xpLastIndex
= xpLastRequestedIndex
;
1834 fetchInfo_lo
.StartVertex
= work
.startVertex
;
1837 SWR_FETCH_CONTEXT fetchInfo_hi
= fetchInfo_lo
;
1839 const simd16scalari vScale
=
1840 _simd16_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
1842 for (uint32_t instanceNum
= 0; instanceNum
< work
.numInstances
; instanceNum
++)
1846 simd16scalari vIndex
;
1848 if (IsIndexedT::value
)
1850 fetchInfo_lo
.xpIndices
= work
.xpIB
;
1851 fetchInfo_hi
.xpIndices
=
1852 fetchInfo_lo
.xpIndices
+ KNOB_SIMD_WIDTH
* indexSize
; // 1/2 of KNOB_SIMD16_WIDTH
1856 vIndex
= _simd16_add_epi32(_simd16_set1_epi32(work
.startVertexID
), vScale
);
1858 fetchInfo_lo
.xpIndices
= pDC
->pContext
->pfnMakeGfxPtr(GetPrivateState(pDC
), &vIndex
);
1859 fetchInfo_hi
.xpIndices
= pDC
->pContext
->pfnMakeGfxPtr(
1860 GetPrivateState(pDC
),
1861 &vIndex
+ KNOB_SIMD_WIDTH
* sizeof(int32_t)); // 1/2 of KNOB_SIMD16_WIDTH
1864 fetchInfo_lo
.CurInstance
= instanceNum
;
1865 fetchInfo_hi
.CurInstance
= instanceNum
;
1867 vsContext_lo
.InstanceID
= instanceNum
;
1868 vsContext_hi
.InstanceID
= instanceNum
;
1870 while (pa
.HasWork())
1872 // GetNextVsOutput currently has the side effect of updating some PA state machine
1873 // state. So we need to keep this outside of (i < endVertex) check.
1875 simdmask
* pvCutIndices_lo
= nullptr;
1876 simdmask
* pvCutIndices_hi
= nullptr;
1878 if (IsIndexedT::value
)
1880 // simd16mask <=> simdmask[2]
1882 pvCutIndices_lo
= &reinterpret_cast<simdmask
*>(&pa
.GetNextVsIndices())[0];
1883 pvCutIndices_hi
= &reinterpret_cast<simdmask
*>(&pa
.GetNextVsIndices())[1];
1886 simd16vertex
& vout
= pa
.GetNextVsOutput();
1888 vsContext_lo
.pVout
= reinterpret_cast<simdvertex
*>(&vout
);
1889 vsContext_hi
.pVout
= reinterpret_cast<simdvertex
*>(&vout
);
1893 if (!IsIndexedT::value
)
1895 fetchInfo_lo
.xpLastIndex
= fetchInfo_lo
.xpIndices
;
1897 offset
= std::min(endVertex
- i
, (uint32_t)KNOB_SIMD16_WIDTH
);
1898 offset
*= 4; // convert from index to address
1899 #if USE_SIMD16_SHADERS
1900 fetchInfo_lo
.xpLastIndex
+= offset
;
1902 fetchInfo_lo
.xpLastIndex
+= std::min(offset
, (uint32_t)KNOB_SIMD_WIDTH
);
1904 std::min(offset
, (uint32_t)KNOB_SIMD16_WIDTH
) - KNOB_SIMD_WIDTH
;
1905 assert(offset
>= 0);
1906 fetchInfo_hi
.xpLastIndex
= fetchInfo_hi
.xpIndices
;
1907 fetchInfo_hi
.xpLastIndex
+= offset2
;
1910 // 1. Execute FS/VS for a single SIMD.
1911 RDTSC_BEGIN(pContext
->pBucketMgr
, FEFetchShader
, pDC
->drawId
);
1912 #if USE_SIMD16_SHADERS
1913 state
.pfnFetchFunc(GetPrivateState(pDC
), pWorkerData
, fetchInfo_lo
, vin
);
1915 state
.pfnFetchFunc(GetPrivateState(pDC
), pWorkerData
, fetchInfo_lo
, vin_lo
);
1917 if ((i
+ KNOB_SIMD_WIDTH
) < endVertex
) // 1/2 of KNOB_SIMD16_WIDTH
1919 state
.pfnFetchFunc(GetPrivateState(pDC
), pWorkerData
, fetchInfo_hi
, vin_hi
);
1922 RDTSC_END(pContext
->pBucketMgr
, FEFetchShader
, 0);
1924 // forward fetch generated vertex IDs to the vertex shader
1925 #if USE_SIMD16_SHADERS
1927 vsContext_lo
.VertexID16
=
1928 _simd16_insert_si(vsContext_lo
.VertexID16
, fetchInfo_lo
.VertexID
, 0);
1929 vsContext_lo
.VertexID16
=
1930 _simd16_insert_si(vsContext_lo
.VertexID16
, fetchInfo_lo
.VertexID2
, 1);
1932 vsContext_lo
.VertexID
= fetchInfo_lo
.VertexID
;
1933 vsContext_hi
.VertexID
= fetchInfo_lo
.VertexID2
;
1936 vsContext_lo
.VertexID
= fetchInfo_lo
.VertexID
;
1937 vsContext_hi
.VertexID
= fetchInfo_hi
.VertexID
;
1940 // Setup active mask for vertex shader.
1942 vsContext_lo
.mask16
= GenerateMask16(endVertex
- i
);
1944 vsContext_lo
.mask
= GenerateMask(endVertex
- i
);
1945 vsContext_hi
.mask
= GenerateMask(endVertex
- (i
+ KNOB_SIMD_WIDTH
));
1948 // forward cut mask to the PA
1949 if (IsIndexedT::value
)
1951 #if USE_SIMD16_SHADERS
1952 *pvCutIndices_lo
= _simd_movemask_ps(_simd_castsi_ps(fetchInfo_lo
.CutMask
));
1953 *pvCutIndices_hi
= _simd_movemask_ps(_simd_castsi_ps(fetchInfo_lo
.CutMask2
));
1955 *pvCutIndices_lo
= _simd_movemask_ps(_simd_castsi_ps(fetchInfo_lo
.CutMask
));
1956 *pvCutIndices_hi
= _simd_movemask_ps(_simd_castsi_ps(fetchInfo_hi
.CutMask
));
1960 UPDATE_STAT_FE(IaVertices
, GetNumInvocations(i
, endVertex
));
1962 #if KNOB_ENABLE_TOSS_POINTS
1963 if (!KNOB_TOSS_FETCH
)
1966 RDTSC_BEGIN(pContext
->pBucketMgr
, FEVertexShader
, pDC
->drawId
);
1968 state
.pfnVertexFunc(GetPrivateState(pDC
), pWorkerData
, &vsContext_lo
);
1969 AR_EVENT(VSStats((HANDLE
)&vsContext_lo
.stats
));
1971 state
.pfnVertexFunc(GetPrivateState(pDC
), pWorkerData
, &vsContext_lo
);
1972 AR_EVENT(VSStats((HANDLE
)&vsContext_lo
.stats
));
1974 if ((i
+ KNOB_SIMD_WIDTH
) < endVertex
) // 1/2 of KNOB_SIMD16_WIDTH
1976 state
.pfnVertexFunc(GetPrivateState(pDC
), pWorkerData
, &vsContext_hi
);
1977 AR_EVENT(VSStats((HANDLE
)&vsContext_hi
.stats
));
1980 RDTSC_END(pContext
->pBucketMgr
, FEVertexShader
, 0);
1982 UPDATE_STAT_FE(VsInvocations
, GetNumInvocations(i
, endVertex
));
1986 // 2. Assemble primitives given the last two SIMD.
1989 simd16vector prim_simd16
[MAX_NUM_VERTS_PER_PRIM
];
1991 RDTSC_START(pContext
->pBucketMgr
, FEPAAssemble
);
1992 bool assemble
= pa
.Assemble(VERTEX_POSITION_SLOT
, prim_simd16
);
1993 RDTSC_STOP(pContext
->pBucketMgr
, FEPAAssemble
, 1, 0);
1995 #if KNOB_ENABLE_TOSS_POINTS
1996 if (!KNOB_TOSS_FETCH
)
1999 #if KNOB_ENABLE_TOSS_POINTS
2005 UPDATE_STAT_FE(IaPrimitives
, pa
.NumPrims());
2007 const uint32_t numPrims
= pa
.NumPrims();
2008 const uint32_t numPrims_lo
=
2009 std::min
<uint32_t>(numPrims
, KNOB_SIMD_WIDTH
);
2010 const uint32_t numPrims_hi
=
2011 std::max
<uint32_t>(numPrims
, KNOB_SIMD_WIDTH
) - KNOB_SIMD_WIDTH
;
2013 const simd16scalari primID
= pa
.GetPrimID(work
.startPrimID
);
2014 const simdscalari primID_lo
= _simd16_extract_si(primID
, 0);
2015 const simdscalari primID_hi
= _simd16_extract_si(primID
, 1);
2017 if (HasTessellationT::value
)
2019 pa
.useAlternateOffset
= false;
2020 TessellationStages
<HasGeometryShaderT
, HasStreamOutT
, HasRastT
>(
2031 pa
.useAlternateOffset
= true;
2032 TessellationStages
<HasGeometryShaderT
, HasStreamOutT
, HasRastT
>(
2042 else if (HasGeometryShaderT::value
)
2044 pa
.useAlternateOffset
= false;
2045 GeometryShaderStage
<HasStreamOutT
, HasRastT
>(pDC
,
2055 pa
.useAlternateOffset
= true;
2056 GeometryShaderStage
<HasStreamOutT
, HasRastT
>(pDC
,
2067 // If streamout is enabled then stream vertices out to memory.
2068 if (HasStreamOutT::value
)
2070 pa
.useAlternateOffset
= false;
2071 StreamOut(pDC
, pa
, workerId
, pSoPrimData
, 0);
2074 if (HasRastT::value
)
2076 SWR_ASSERT(pDC
->pState
->pfnProcessPrims_simd16
);
2077 // Gather data from the SVG if provided.
2078 simd16scalari vpai
= SIMD16::setzero_si();
2079 simd16scalari rtai
= SIMD16::setzero_si();
2080 SIMD16::Vec4 svgAttrib
[4];
2082 if (state
.backendState
.readViewportArrayIndex
||
2083 state
.backendState
.readRenderTargetArrayIndex
)
2085 pa
.Assemble(VERTEX_SGV_SLOT
, svgAttrib
);
2088 if (state
.backendState
.readViewportArrayIndex
)
2090 vpai
= SIMD16::castps_si(svgAttrib
[0][VERTEX_SGV_VAI_COMP
]);
2091 pa
.viewportArrayActive
= true;
2093 if (state
.backendState
.readRenderTargetArrayIndex
)
2096 SIMD16::castps_si(svgAttrib
[0][VERTEX_SGV_RTAI_COMP
]);
2097 pa
.rtArrayActive
= true;
2101 // OOB VPAI indices => forced to zero.
2102 vpai
= SIMD16::max_epi32(vpai
, SIMD16::setzero_si());
2103 simd16scalari vNumViewports
=
2104 SIMD16::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS
);
2105 simd16scalari vClearMask
=
2106 SIMD16::cmplt_epi32(vpai
, vNumViewports
);
2107 vpai
= SIMD16::and_si(vClearMask
, vpai
);
2109 pa
.useAlternateOffset
= false;
2110 pDC
->pState
->pfnProcessPrims_simd16(pDC
,
2124 } while (pa
.NextPrim());
2126 if (IsIndexedT::value
)
2128 fetchInfo_lo
.xpIndices
= fetchInfo_lo
.xpIndices
+ KNOB_SIMD16_WIDTH
* indexSize
;
2129 fetchInfo_hi
.xpIndices
= fetchInfo_hi
.xpIndices
+ KNOB_SIMD16_WIDTH
* indexSize
;
2133 vIndex
= _simd16_add_epi32(vIndex
, _simd16_set1_epi32(KNOB_SIMD16_WIDTH
));
2136 i
+= KNOB_SIMD16_WIDTH
;
2143 SWR_VS_CONTEXT vsContext
;
2144 SWR_FETCH_CONTEXT fetchInfo
= {0};
2146 fetchInfo
.pStreams
= &state
.vertexBuffers
[0];
2147 fetchInfo
.StartInstance
= work
.startInstance
;
2148 fetchInfo
.StartVertex
= 0;
2150 if (IsIndexedT::value
)
2152 fetchInfo
.BaseVertex
= work
.baseVertex
;
2154 // if the entire index buffer isn't being consumed, set the last index
2155 // so that fetches < a SIMD wide will be masked off
2156 fetchInfo
.pLastIndex
=
2157 (const int32_t*)(((uint8_t*)state
.indexBuffer
.pIndices
) + state
.indexBuffer
.size
);
2158 if (xpLastRequestedIndex
< fetchInfo
.pLastIndex
)
2160 fetchInfo
.pLastIndex
= xpLastRequestedIndex
;
2165 fetchInfo
.StartVertex
= work
.startVertex
;
2168 const simdscalari vScale
= _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
2170 /// @todo: temporarily move instance loop in the FE to ensure SO ordering
2171 for (uint32_t instanceNum
= 0; instanceNum
< work
.numInstances
; instanceNum
++)
2176 if (IsIndexedT::value
)
2178 fetchInfo
.pIndices
= work
.pIB
;
2182 vIndex
= _simd_add_epi32(_simd_set1_epi32(work
.startVertexID
), vScale
);
2183 fetchInfo
.pIndices
= (const int32_t*)&vIndex
;
2186 fetchInfo
.CurInstance
= instanceNum
;
2187 vsContext
.InstanceID
= instanceNum
;
2189 while (pa
.HasWork())
2191 // GetNextVsOutput currently has the side effect of updating some PA state machine
2192 // state. So we need to keep this outside of (i < endVertex) check.
2193 simdmask
* pvCutIndices
= nullptr;
2194 if (IsIndexedT::value
)
2196 pvCutIndices
= &pa
.GetNextVsIndices();
2199 simdvertex
& vout
= pa
.GetNextVsOutput();
2200 vsContext
.pVin
= &vout
;
2201 vsContext
.pVout
= &vout
;
2205 // 1. Execute FS/VS for a single SIMD.
2206 RDTSC_BEGIN(pContext
->pBucketMgr
, FEFetchShader
, pDC
->drawId
);
2207 state
.pfnFetchFunc(GetPrivateState(pDC
), pWorkerData
, fetchInfo
, vout
);
2208 RDTSC_END(pContext
->pBucketMgr
, FEFetchShader
, 0);
2210 // forward fetch generated vertex IDs to the vertex shader
2211 vsContext
.VertexID
= fetchInfo
.VertexID
;
2213 // Setup active mask for vertex shader.
2214 vsContext
.mask
= GenerateMask(endVertex
- i
);
2216 // forward cut mask to the PA
2217 if (IsIndexedT::value
)
2219 *pvCutIndices
= _simd_movemask_ps(_simd_castsi_ps(fetchInfo
.CutMask
));
2222 UPDATE_STAT_FE(IaVertices
, GetNumInvocations(i
, endVertex
));
2224 #if KNOB_ENABLE_TOSS_POINTS
2225 if (!KNOB_TOSS_FETCH
)
2228 RDTSC_BEGIN(pContext
->pBucketMgr
, FEVertexShader
, pDC
->drawId
);
2229 state
.pfnVertexFunc(GetPrivateState(pDC
), pWorkerData
, &vsContext
);
2230 RDTSC_END(pContext
->pBucketMgr
, FEVertexShader
, 0);
2232 UPDATE_STAT_FE(VsInvocations
, GetNumInvocations(i
, endVertex
));
2233 AR_EVENT(VSStats((HANDLE
)&vsContext
.stats
));
2237 // 2. Assemble primitives given the last two SIMD.
2240 simdvector prim
[MAX_NUM_VERTS_PER_PRIM
];
2241 // PaAssemble returns false if there is not enough verts to assemble.
2242 RDTSC_BEGIN(pContext
->pBucketMgr
, FEPAAssemble
, pDC
->drawId
);
2243 bool assemble
= pa
.Assemble(VERTEX_POSITION_SLOT
, prim
);
2244 RDTSC_END(pContext
->pBucketMgr
, FEPAAssemble
, 1);
2246 #if KNOB_ENABLE_TOSS_POINTS
2247 if (!KNOB_TOSS_FETCH
)
2250 #if KNOB_ENABLE_TOSS_POINTS
2256 UPDATE_STAT_FE(IaPrimitives
, pa
.NumPrims());
2258 if (HasTessellationT::value
)
2260 TessellationStages
<HasGeometryShaderT
, HasStreamOutT
, HasRastT
>(
2266 pa
.GetPrimID(work
.startPrimID
));
2268 else if (HasGeometryShaderT::value
)
2270 GeometryShaderStage
<HasStreamOutT
, HasRastT
>(
2276 pa
.GetPrimID(work
.startPrimID
));
2280 // If streamout is enabled then stream vertices out to memory.
2281 if (HasStreamOutT::value
)
2283 StreamOut(pDC
, pa
, workerId
, pSoPrimData
, 0);
2286 if (HasRastT::value
)
2288 SWR_ASSERT(pDC
->pState
->pfnProcessPrims
);
2290 // Gather data from the SVG if provided.
2291 simdscalari vViewportIdx
= SIMD::setzero_si();
2292 simdscalari vRtIdx
= SIMD::setzero_si();
2293 SIMD::Vec4 svgAttrib
[4];
2295 if (state
.backendState
.readViewportArrayIndex
||
2296 state
.backendState
.readRenderTargetArrayIndex
)
2298 pa
.Assemble(VERTEX_SGV_SLOT
, svgAttrib
);
2301 if (state
.backendState
.readViewportArrayIndex
)
2304 SIMD::castps_si(svgAttrib
[0][VERTEX_SGV_VAI_COMP
]);
2306 // OOB VPAI indices => forced to zero.
2308 SIMD::max_epi32(vViewportIdx
, SIMD::setzero_si());
2309 simdscalari vNumViewports
=
2310 SIMD::set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS
);
2311 simdscalari vClearMask
=
2312 SIMD::cmplt_epi32(vViewportIdx
, vNumViewports
);
2313 vViewportIdx
= SIMD::and_si(vClearMask
, vViewportIdx
);
2314 pa
.viewportArrayActive
= true;
2316 if (state
.backendState
.readRenderTargetArrayIndex
)
2319 SIMD::castps_si(svgAttrib
[0][VERTEX_SGV_RTAI_COMP
]);
2320 pa
.rtArrayActive
= true;
2323 pDC
->pState
->pfnProcessPrims(pDC
,
2327 GenMask(pa
.NumPrims()),
2328 pa
.GetPrimID(work
.startPrimID
),
2336 } while (pa
.NextPrim());
2338 if (IsIndexedT::value
)
2340 fetchInfo
.pIndices
=
2341 (int*)((uint8_t*)fetchInfo
.pIndices
+ KNOB_SIMD_WIDTH
* indexSize
);
2345 vIndex
= _simd_add_epi32(vIndex
, _simd_set1_epi32(KNOB_SIMD_WIDTH
));
2348 i
+= KNOB_SIMD_WIDTH
;
2355 RDTSC_END(pContext
->pBucketMgr
, FEProcessDraw
, numPrims
* work
.numInstances
);
2358 struct FEDrawChooser
2360 typedef PFN_FE_WORK_FUNC FuncType
;
2362 template <typename
... ArgsB
>
2363 static FuncType
GetFunc()
2365 return ProcessDraw
<ArgsB
...>;
2369 // Selector for correct templated Draw front-end function
2370 PFN_FE_WORK_FUNC
GetProcessDrawFunc(bool IsIndexed
,
2371 bool IsCutIndexEnabled
,
2372 bool HasTessellation
,
2373 bool HasGeometryShader
,
2375 bool HasRasterization
)
2377 return TemplateArgUnroller
<FEDrawChooser
>::GetFunc(IsIndexed
,