/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @brief Implementation for Frontend which handles vertex processing,
*        primitive assembly, clipping, binning, etc.
*
******************************************************************************/
34 #include "rdtsc_core.h"
35 #include "rasterizer.h"
36 #include "conservativeRast.h"
42 #include "tessellator.h"
45 //////////////////////////////////////////////////////////////////////////
46 /// @brief Helper macro to generate a bitmask
47 static INLINE
uint32_t GenMask(uint32_t numBits
)
49 SWR_ASSERT(numBits
<= (sizeof(uint32_t) * 8), "Too many bits (%d) for %s", numBits
, __FUNCTION__
);
50 return ((1U << numBits
) - 1);
//////////////////////////////////////////////////////////////////////////
/// @brief Offsets added to post-viewport vertex positions based on
///        the API-selected pixel sample location (indexed by
///        SWR_PIXEL_LOCATION enum value).
static const simdscalar g_pixelOffsets[SWR_PIXEL_LOCATION_UL + 1] =
{
    _simd_set1_ps(0.0f), // SWR_PIXEL_LOCATION_CENTER
    _simd_set1_ps(0.5f), // SWR_PIXEL_LOCATION_UL
};
62 //////////////////////////////////////////////////////////////////////////
63 /// @brief FE handler for SwrSync.
64 /// @param pContext - pointer to SWR context.
65 /// @param pDC - pointer to draw context.
66 /// @param workerId - thread's worker id. Even thread has a unique id.
67 /// @param pUserData - Pointer to user data passed back to sync callback.
68 /// @todo This should go away when we switch this to use compute threading.
70 SWR_CONTEXT
*pContext
,
77 work
.pfnWork
= ProcessSyncBE
;
79 MacroTileMgr
*pTileMgr
= pDC
->pTileMgr
;
80 pTileMgr
->enqueue(0, 0, &work
);
83 //////////////////////////////////////////////////////////////////////////
84 /// @brief FE handler for SwrClearRenderTarget.
85 /// @param pContext - pointer to SWR context.
86 /// @param pDC - pointer to draw context.
87 /// @param workerId - thread's worker id. Even thread has a unique id.
88 /// @param pUserData - Pointer to user data passed back to clear callback.
89 /// @todo This should go away when we switch this to use compute threading.
91 SWR_CONTEXT
*pContext
,
96 CLEAR_DESC
*pDesc
= (CLEAR_DESC
*)pUserData
;
97 MacroTileMgr
*pTileMgr
= pDC
->pTileMgr
;
99 // queue a clear to each macro tile
100 // compute macro tile bounds for the specified rect
101 uint32_t macroTileXMin
= pDesc
->rect
.xmin
/ KNOB_MACROTILE_X_DIM
;
102 uint32_t macroTileXMax
= (pDesc
->rect
.xmax
- 1) / KNOB_MACROTILE_X_DIM
;
103 uint32_t macroTileYMin
= pDesc
->rect
.ymin
/ KNOB_MACROTILE_Y_DIM
;
104 uint32_t macroTileYMax
= (pDesc
->rect
.ymax
- 1) / KNOB_MACROTILE_Y_DIM
;
108 work
.pfnWork
= ProcessClearBE
;
109 work
.desc
.clear
= *pDesc
;
111 for (uint32_t y
= macroTileYMin
; y
<= macroTileYMax
; ++y
)
113 for (uint32_t x
= macroTileXMin
; x
<= macroTileXMax
; ++x
)
115 pTileMgr
->enqueue(x
, y
, &work
);
120 //////////////////////////////////////////////////////////////////////////
121 /// @brief FE handler for SwrStoreTiles.
122 /// @param pContext - pointer to SWR context.
123 /// @param pDC - pointer to draw context.
124 /// @param workerId - thread's worker id. Even thread has a unique id.
125 /// @param pUserData - Pointer to user data passed back to callback.
126 /// @todo This should go away when we switch this to use compute threading.
127 void ProcessStoreTiles(
128 SWR_CONTEXT
*pContext
,
133 AR_BEGIN(FEProcessStoreTiles
, pDC
->drawId
);
134 MacroTileMgr
*pTileMgr
= pDC
->pTileMgr
;
135 STORE_TILES_DESC
* pDesc
= (STORE_TILES_DESC
*)pUserData
;
137 // queue a store to each macro tile
138 // compute macro tile bounds for the specified rect
139 uint32_t macroTileXMin
= pDesc
->rect
.xmin
/ KNOB_MACROTILE_X_DIM
;
140 uint32_t macroTileXMax
= (pDesc
->rect
.xmax
- 1) / KNOB_MACROTILE_X_DIM
;
141 uint32_t macroTileYMin
= pDesc
->rect
.ymin
/ KNOB_MACROTILE_Y_DIM
;
142 uint32_t macroTileYMax
= (pDesc
->rect
.ymax
- 1) / KNOB_MACROTILE_Y_DIM
;
146 work
.type
= STORETILES
;
147 work
.pfnWork
= ProcessStoreTileBE
;
148 work
.desc
.storeTiles
= *pDesc
;
150 for (uint32_t y
= macroTileYMin
; y
<= macroTileYMax
; ++y
)
152 for (uint32_t x
= macroTileXMin
; x
<= macroTileXMax
; ++x
)
154 pTileMgr
->enqueue(x
, y
, &work
);
158 AR_END(FEProcessStoreTiles
, 0);
161 //////////////////////////////////////////////////////////////////////////
162 /// @brief FE handler for SwrInvalidateTiles.
163 /// @param pContext - pointer to SWR context.
164 /// @param pDC - pointer to draw context.
165 /// @param workerId - thread's worker id. Even thread has a unique id.
166 /// @param pUserData - Pointer to user data passed back to callback.
167 /// @todo This should go away when we switch this to use compute threading.
168 void ProcessDiscardInvalidateTiles(
169 SWR_CONTEXT
*pContext
,
174 AR_BEGIN(FEProcessInvalidateTiles
, pDC
->drawId
);
175 DISCARD_INVALIDATE_TILES_DESC
*pDesc
= (DISCARD_INVALIDATE_TILES_DESC
*)pUserData
;
176 MacroTileMgr
*pTileMgr
= pDC
->pTileMgr
;
178 // compute macro tile bounds for the specified rect
179 uint32_t macroTileXMin
= (pDesc
->rect
.xmin
+ KNOB_MACROTILE_X_DIM
- 1) / KNOB_MACROTILE_X_DIM
;
180 uint32_t macroTileXMax
= (pDesc
->rect
.xmax
/ KNOB_MACROTILE_X_DIM
) - 1;
181 uint32_t macroTileYMin
= (pDesc
->rect
.ymin
+ KNOB_MACROTILE_Y_DIM
- 1) / KNOB_MACROTILE_Y_DIM
;
182 uint32_t macroTileYMax
= (pDesc
->rect
.ymax
/ KNOB_MACROTILE_Y_DIM
) - 1;
184 if (pDesc
->fullTilesOnly
== false)
186 // include partial tiles
187 macroTileXMin
= pDesc
->rect
.xmin
/ KNOB_MACROTILE_X_DIM
;
188 macroTileXMax
= (pDesc
->rect
.xmax
- 1) / KNOB_MACROTILE_X_DIM
;
189 macroTileYMin
= pDesc
->rect
.ymin
/ KNOB_MACROTILE_Y_DIM
;
190 macroTileYMax
= (pDesc
->rect
.ymax
- 1) / KNOB_MACROTILE_Y_DIM
;
193 SWR_ASSERT(macroTileXMax
<= KNOB_NUM_HOT_TILES_X
);
194 SWR_ASSERT(macroTileYMax
<= KNOB_NUM_HOT_TILES_Y
);
196 macroTileXMax
= std::min
<int32_t>(macroTileXMax
, KNOB_NUM_HOT_TILES_X
);
197 macroTileYMax
= std::min
<int32_t>(macroTileYMax
, KNOB_NUM_HOT_TILES_Y
);
201 work
.type
= DISCARDINVALIDATETILES
;
202 work
.pfnWork
= ProcessDiscardInvalidateTilesBE
;
203 work
.desc
.discardInvalidateTiles
= *pDesc
;
205 for (uint32_t x
= macroTileXMin
; x
<= macroTileXMax
; ++x
)
207 for (uint32_t y
= macroTileYMin
; y
<= macroTileYMax
; ++y
)
209 pTileMgr
->enqueue(x
, y
, &work
);
213 AR_END(FEProcessInvalidateTiles
, 0);
216 //////////////////////////////////////////////////////////////////////////
217 /// @brief Computes the number of primitives given the number of verts.
218 /// @param mode - primitive topology for draw operation.
219 /// @param numPrims - number of vertices or indices for draw.
220 /// @todo Frontend needs to be refactored. This will go in appropriate place then.
221 uint32_t GetNumPrims(
222 PRIMITIVE_TOPOLOGY mode
,
227 case TOP_POINT_LIST
: return numPrims
;
228 case TOP_TRIANGLE_LIST
: return numPrims
/ 3;
229 case TOP_TRIANGLE_STRIP
: return numPrims
< 3 ? 0 : numPrims
- 2;
230 case TOP_TRIANGLE_FAN
: return numPrims
< 3 ? 0 : numPrims
- 2;
231 case TOP_TRIANGLE_DISC
: return numPrims
< 2 ? 0 : numPrims
- 1;
232 case TOP_QUAD_LIST
: return numPrims
/ 4;
233 case TOP_QUAD_STRIP
: return numPrims
< 4 ? 0 : (numPrims
- 2) / 2;
234 case TOP_LINE_STRIP
: return numPrims
< 2 ? 0 : numPrims
- 1;
235 case TOP_LINE_LIST
: return numPrims
/ 2;
236 case TOP_LINE_LOOP
: return numPrims
;
237 case TOP_RECT_LIST
: return numPrims
/ 3;
238 case TOP_LINE_LIST_ADJ
: return numPrims
/ 4;
239 case TOP_LISTSTRIP_ADJ
: return numPrims
< 3 ? 0 : numPrims
- 3;
240 case TOP_TRI_LIST_ADJ
: return numPrims
/ 6;
241 case TOP_TRI_STRIP_ADJ
: return numPrims
< 4 ? 0 : (numPrims
/ 2) - 2;
243 case TOP_PATCHLIST_1
:
244 case TOP_PATCHLIST_2
:
245 case TOP_PATCHLIST_3
:
246 case TOP_PATCHLIST_4
:
247 case TOP_PATCHLIST_5
:
248 case TOP_PATCHLIST_6
:
249 case TOP_PATCHLIST_7
:
250 case TOP_PATCHLIST_8
:
251 case TOP_PATCHLIST_9
:
252 case TOP_PATCHLIST_10
:
253 case TOP_PATCHLIST_11
:
254 case TOP_PATCHLIST_12
:
255 case TOP_PATCHLIST_13
:
256 case TOP_PATCHLIST_14
:
257 case TOP_PATCHLIST_15
:
258 case TOP_PATCHLIST_16
:
259 case TOP_PATCHLIST_17
:
260 case TOP_PATCHLIST_18
:
261 case TOP_PATCHLIST_19
:
262 case TOP_PATCHLIST_20
:
263 case TOP_PATCHLIST_21
:
264 case TOP_PATCHLIST_22
:
265 case TOP_PATCHLIST_23
:
266 case TOP_PATCHLIST_24
:
267 case TOP_PATCHLIST_25
:
268 case TOP_PATCHLIST_26
:
269 case TOP_PATCHLIST_27
:
270 case TOP_PATCHLIST_28
:
271 case TOP_PATCHLIST_29
:
272 case TOP_PATCHLIST_30
:
273 case TOP_PATCHLIST_31
:
274 case TOP_PATCHLIST_32
:
275 return numPrims
/ (mode
- TOP_PATCHLIST_BASE
);
278 case TOP_POINT_LIST_BF
:
279 case TOP_LINE_STRIP_CONT
:
280 case TOP_LINE_STRIP_BF
:
281 case TOP_LINE_STRIP_CONT_BF
:
282 case TOP_TRIANGLE_FAN_NOSTIPPLE
:
283 case TOP_TRI_STRIP_REVERSE
:
284 case TOP_PATCHLIST_BASE
:
286 SWR_ASSERT(false, "Unsupported topology: %d", mode
);
293 //////////////////////////////////////////////////////////////////////////
294 /// @brief Computes the number of verts given the number of primitives.
295 /// @param mode - primitive topology for draw operation.
296 /// @param numPrims - number of primitives for draw.
297 uint32_t GetNumVerts(
298 PRIMITIVE_TOPOLOGY mode
,
303 case TOP_POINT_LIST
: return numPrims
;
304 case TOP_TRIANGLE_LIST
: return numPrims
* 3;
305 case TOP_TRIANGLE_STRIP
: return numPrims
? numPrims
+ 2 : 0;
306 case TOP_TRIANGLE_FAN
: return numPrims
? numPrims
+ 2 : 0;
307 case TOP_TRIANGLE_DISC
: return numPrims
? numPrims
+ 1 : 0;
308 case TOP_QUAD_LIST
: return numPrims
* 4;
309 case TOP_QUAD_STRIP
: return numPrims
? numPrims
* 2 + 2 : 0;
310 case TOP_LINE_STRIP
: return numPrims
? numPrims
+ 1 : 0;
311 case TOP_LINE_LIST
: return numPrims
* 2;
312 case TOP_LINE_LOOP
: return numPrims
;
313 case TOP_RECT_LIST
: return numPrims
* 3;
314 case TOP_LINE_LIST_ADJ
: return numPrims
* 4;
315 case TOP_LISTSTRIP_ADJ
: return numPrims
? numPrims
+ 3 : 0;
316 case TOP_TRI_LIST_ADJ
: return numPrims
* 6;
317 case TOP_TRI_STRIP_ADJ
: return numPrims
? (numPrims
+ 2) * 2 : 0;
319 case TOP_PATCHLIST_1
:
320 case TOP_PATCHLIST_2
:
321 case TOP_PATCHLIST_3
:
322 case TOP_PATCHLIST_4
:
323 case TOP_PATCHLIST_5
:
324 case TOP_PATCHLIST_6
:
325 case TOP_PATCHLIST_7
:
326 case TOP_PATCHLIST_8
:
327 case TOP_PATCHLIST_9
:
328 case TOP_PATCHLIST_10
:
329 case TOP_PATCHLIST_11
:
330 case TOP_PATCHLIST_12
:
331 case TOP_PATCHLIST_13
:
332 case TOP_PATCHLIST_14
:
333 case TOP_PATCHLIST_15
:
334 case TOP_PATCHLIST_16
:
335 case TOP_PATCHLIST_17
:
336 case TOP_PATCHLIST_18
:
337 case TOP_PATCHLIST_19
:
338 case TOP_PATCHLIST_20
:
339 case TOP_PATCHLIST_21
:
340 case TOP_PATCHLIST_22
:
341 case TOP_PATCHLIST_23
:
342 case TOP_PATCHLIST_24
:
343 case TOP_PATCHLIST_25
:
344 case TOP_PATCHLIST_26
:
345 case TOP_PATCHLIST_27
:
346 case TOP_PATCHLIST_28
:
347 case TOP_PATCHLIST_29
:
348 case TOP_PATCHLIST_30
:
349 case TOP_PATCHLIST_31
:
350 case TOP_PATCHLIST_32
:
351 return numPrims
* (mode
- TOP_PATCHLIST_BASE
);
354 case TOP_POINT_LIST_BF
:
355 case TOP_LINE_STRIP_CONT
:
356 case TOP_LINE_STRIP_BF
:
357 case TOP_LINE_STRIP_CONT_BF
:
358 case TOP_TRIANGLE_FAN_NOSTIPPLE
:
359 case TOP_TRI_STRIP_REVERSE
:
360 case TOP_PATCHLIST_BASE
:
362 SWR_ASSERT(false, "Unsupported topology: %d", mode
);
369 //////////////////////////////////////////////////////////////////////////
370 /// @brief Return number of verts per primitive.
371 /// @param topology - topology
372 /// @param includeAdjVerts - include adjacent verts in primitive vertices
373 INLINE
uint32_t NumVertsPerPrim(PRIMITIVE_TOPOLOGY topology
, bool includeAdjVerts
)
375 uint32_t numVerts
= 0;
379 case TOP_POINT_LIST_BF
:
384 case TOP_LINE_LIST_ADJ
:
386 case TOP_LINE_STRIP_CONT
:
387 case TOP_LINE_STRIP_BF
:
388 case TOP_LISTSTRIP_ADJ
:
391 case TOP_TRIANGLE_LIST
:
392 case TOP_TRIANGLE_STRIP
:
393 case TOP_TRIANGLE_FAN
:
394 case TOP_TRI_LIST_ADJ
:
395 case TOP_TRI_STRIP_ADJ
:
396 case TOP_TRI_STRIP_REVERSE
:
404 case TOP_PATCHLIST_1
:
405 case TOP_PATCHLIST_2
:
406 case TOP_PATCHLIST_3
:
407 case TOP_PATCHLIST_4
:
408 case TOP_PATCHLIST_5
:
409 case TOP_PATCHLIST_6
:
410 case TOP_PATCHLIST_7
:
411 case TOP_PATCHLIST_8
:
412 case TOP_PATCHLIST_9
:
413 case TOP_PATCHLIST_10
:
414 case TOP_PATCHLIST_11
:
415 case TOP_PATCHLIST_12
:
416 case TOP_PATCHLIST_13
:
417 case TOP_PATCHLIST_14
:
418 case TOP_PATCHLIST_15
:
419 case TOP_PATCHLIST_16
:
420 case TOP_PATCHLIST_17
:
421 case TOP_PATCHLIST_18
:
422 case TOP_PATCHLIST_19
:
423 case TOP_PATCHLIST_20
:
424 case TOP_PATCHLIST_21
:
425 case TOP_PATCHLIST_22
:
426 case TOP_PATCHLIST_23
:
427 case TOP_PATCHLIST_24
:
428 case TOP_PATCHLIST_25
:
429 case TOP_PATCHLIST_26
:
430 case TOP_PATCHLIST_27
:
431 case TOP_PATCHLIST_28
:
432 case TOP_PATCHLIST_29
:
433 case TOP_PATCHLIST_30
:
434 case TOP_PATCHLIST_31
:
435 case TOP_PATCHLIST_32
:
436 numVerts
= topology
- TOP_PATCHLIST_BASE
;
439 SWR_ASSERT(false, "Unsupported topology: %d", topology
);
447 case TOP_LISTSTRIP_ADJ
:
448 case TOP_LINE_LIST_ADJ
: numVerts
= 4; break;
449 case TOP_TRI_STRIP_ADJ
:
450 case TOP_TRI_LIST_ADJ
: numVerts
= 6; break;
458 //////////////////////////////////////////////////////////////////////////
459 /// @brief Generate mask from remaining work.
460 /// @param numWorkItems - Number of items being worked on by a SIMD.
461 static INLINE simdscalari
GenerateMask(uint32_t numItemsRemaining
)
463 uint32_t numActive
= (numItemsRemaining
>= KNOB_SIMD_WIDTH
) ? KNOB_SIMD_WIDTH
: numItemsRemaining
;
464 uint32_t mask
= (numActive
> 0) ? ((1 << numActive
) - 1) : 0;
465 return _simd_castps_si(vMask(mask
));
469 //////////////////////////////////////////////////////////////////////////
470 /// @brief Gather scissor rect data based on per-prim viewport indices.
471 /// @param pScissorsInFixedPoint - array of scissor rects in 16.8 fixed point.
472 /// @param pViewportIndex - array of per-primitive vewport indexes.
473 /// @param scisXmin - output vector of per-prmitive scissor rect Xmin data.
474 /// @param scisYmin - output vector of per-prmitive scissor rect Ymin data.
475 /// @param scisXmax - output vector of per-prmitive scissor rect Xmax data.
476 /// @param scisYmax - output vector of per-prmitive scissor rect Ymax data.
478 /// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer.
479 template<size_t SimdWidth
>
480 struct GatherScissors
482 static void Gather(const SWR_RECT
* pScissorsInFixedPoint
, const uint32_t* pViewportIndex
,
483 simdscalari
&scisXmin
, simdscalari
&scisYmin
,
484 simdscalari
&scisXmax
, simdscalari
&scisYmax
)
486 SWR_ASSERT(0, "Unhandled Simd Width in Scissor Rect Gather");
491 struct GatherScissors
<8>
493 static void Gather(const SWR_RECT
* pScissorsInFixedPoint
, const uint32_t* pViewportIndex
,
494 simdscalari
&scisXmin
, simdscalari
&scisYmin
,
495 simdscalari
&scisXmax
, simdscalari
&scisYmax
)
497 scisXmin
= _simd_set_epi32(pScissorsInFixedPoint
[pViewportIndex
[0]].xmin
,
498 pScissorsInFixedPoint
[pViewportIndex
[1]].xmin
,
499 pScissorsInFixedPoint
[pViewportIndex
[2]].xmin
,
500 pScissorsInFixedPoint
[pViewportIndex
[3]].xmin
,
501 pScissorsInFixedPoint
[pViewportIndex
[4]].xmin
,
502 pScissorsInFixedPoint
[pViewportIndex
[5]].xmin
,
503 pScissorsInFixedPoint
[pViewportIndex
[6]].xmin
,
504 pScissorsInFixedPoint
[pViewportIndex
[7]].xmin
);
505 scisYmin
= _simd_set_epi32(pScissorsInFixedPoint
[pViewportIndex
[0]].ymin
,
506 pScissorsInFixedPoint
[pViewportIndex
[1]].ymin
,
507 pScissorsInFixedPoint
[pViewportIndex
[2]].ymin
,
508 pScissorsInFixedPoint
[pViewportIndex
[3]].ymin
,
509 pScissorsInFixedPoint
[pViewportIndex
[4]].ymin
,
510 pScissorsInFixedPoint
[pViewportIndex
[5]].ymin
,
511 pScissorsInFixedPoint
[pViewportIndex
[6]].ymin
,
512 pScissorsInFixedPoint
[pViewportIndex
[7]].ymin
);
513 scisXmax
= _simd_set_epi32(pScissorsInFixedPoint
[pViewportIndex
[0]].xmax
,
514 pScissorsInFixedPoint
[pViewportIndex
[1]].xmax
,
515 pScissorsInFixedPoint
[pViewportIndex
[2]].xmax
,
516 pScissorsInFixedPoint
[pViewportIndex
[3]].xmax
,
517 pScissorsInFixedPoint
[pViewportIndex
[4]].xmax
,
518 pScissorsInFixedPoint
[pViewportIndex
[5]].xmax
,
519 pScissorsInFixedPoint
[pViewportIndex
[6]].xmax
,
520 pScissorsInFixedPoint
[pViewportIndex
[7]].xmax
);
521 scisYmax
= _simd_set_epi32(pScissorsInFixedPoint
[pViewportIndex
[0]].ymax
,
522 pScissorsInFixedPoint
[pViewportIndex
[1]].ymax
,
523 pScissorsInFixedPoint
[pViewportIndex
[2]].ymax
,
524 pScissorsInFixedPoint
[pViewportIndex
[3]].ymax
,
525 pScissorsInFixedPoint
[pViewportIndex
[4]].ymax
,
526 pScissorsInFixedPoint
[pViewportIndex
[5]].ymax
,
527 pScissorsInFixedPoint
[pViewportIndex
[6]].ymax
,
528 pScissorsInFixedPoint
[pViewportIndex
[7]].ymax
);
532 //////////////////////////////////////////////////////////////////////////
533 /// @brief StreamOut - Streams vertex data out to SO buffers.
534 /// Generally, we are only streaming out a SIMDs worth of triangles.
535 /// @param pDC - pointer to draw context.
536 /// @param workerId - thread's worker id. Even thread has a unique id.
537 /// @param numPrims - Number of prims to streamout (e.g. points, lines, tris)
538 static void StreamOut(
543 uint32_t streamIndex
)
545 SWR_CONTEXT
*pContext
= pDC
->pContext
;
547 AR_BEGIN(FEStreamout
, pDC
->drawId
);
549 const API_STATE
& state
= GetApiState(pDC
);
550 const SWR_STREAMOUT_STATE
&soState
= state
.soState
;
552 uint32_t soVertsPerPrim
= NumVertsPerPrim(pa
.binTopology
, false);
554 // The pPrimData buffer is sparse in that we allocate memory for all 32 attributes for each vertex.
555 uint32_t primDataDwordVertexStride
= (KNOB_NUM_ATTRIBUTES
* sizeof(float) * 4) / sizeof(uint32_t);
557 SWR_STREAMOUT_CONTEXT soContext
= { 0 };
559 // Setup buffer state pointers.
560 for (uint32_t i
= 0; i
< 4; ++i
)
562 soContext
.pBuffer
[i
] = &state
.soBuffer
[i
];
565 uint32_t numPrims
= pa
.NumPrims();
566 for (uint32_t primIndex
= 0; primIndex
< numPrims
; ++primIndex
)
569 uint32_t soMask
= soState
.streamMasks
[streamIndex
];
571 // Write all entries into primitive data buffer for SOS.
572 while (_BitScanForward(&slot
, soMask
))
574 __m128 attrib
[MAX_NUM_VERTS_PER_PRIM
]; // prim attribs (always 4 wide)
575 uint32_t paSlot
= slot
+ VERTEX_ATTRIB_START_SLOT
;
576 pa
.AssembleSingle(paSlot
, primIndex
, attrib
);
578 // Attribute offset is relative offset from start of vertex.
579 // Note that attributes start at slot 1 in the PA buffer. We need to write this
580 // to prim data starting at slot 0. Which is why we do (slot - 1).
581 // Also note: GL works slightly differently, and needs slot 0
582 uint32_t primDataAttribOffset
= slot
* sizeof(float) * 4 / sizeof(uint32_t);
584 // Store each vertex's attrib at appropriate locations in pPrimData buffer.
585 for (uint32_t v
= 0; v
< soVertsPerPrim
; ++v
)
587 uint32_t* pPrimDataAttrib
= pPrimData
+ primDataAttribOffset
+ (v
* primDataDwordVertexStride
);
589 _mm_store_ps((float*)pPrimDataAttrib
, attrib
[v
]);
591 soMask
&= ~(1 << slot
);
594 // Update pPrimData pointer
595 soContext
.pPrimData
= pPrimData
;
598 SWR_ASSERT(state
.pfnSoFunc
[streamIndex
] != nullptr, "Trying to execute uninitialized streamout jit function.");
599 state
.pfnSoFunc
[streamIndex
](soContext
);
602 // Update SO write offset. The driver provides memory for the update.
603 for (uint32_t i
= 0; i
< 4; ++i
)
605 if (state
.soBuffer
[i
].pWriteOffset
)
607 *state
.soBuffer
[i
].pWriteOffset
= soContext
.pBuffer
[i
]->streamOffset
* sizeof(uint32_t);
610 if (state
.soBuffer
[i
].soWriteEnable
)
612 pDC
->dynState
.SoWriteOffset
[i
] = soContext
.pBuffer
[i
]->streamOffset
* sizeof(uint32_t);
613 pDC
->dynState
.SoWriteOffsetDirty
[i
] = true;
617 UPDATE_STAT_FE(SoPrimStorageNeeded
[streamIndex
], soContext
.numPrimStorageNeeded
);
618 UPDATE_STAT_FE(SoNumPrimsWritten
[streamIndex
], soContext
.numPrimsWritten
);
620 AR_END(FEStreamout
, 1);
623 //////////////////////////////////////////////////////////////////////////
624 /// @brief Computes number of invocations. The current index represents
625 /// the start of the SIMD. The max index represents how much work
626 /// items are remaining. If there is less then a SIMD's xmin of work
627 /// then return the remaining amount of work.
628 /// @param curIndex - The start index for the SIMD.
629 /// @param maxIndex - The last index for all work items.
630 static INLINE
uint32_t GetNumInvocations(
634 uint32_t remainder
= (maxIndex
- curIndex
);
635 return (remainder
>= KNOB_SIMD_WIDTH
) ? KNOB_SIMD_WIDTH
: remainder
;
638 //////////////////////////////////////////////////////////////////////////
639 /// @brief Converts a streamId buffer to a cut buffer for the given stream id.
640 /// The geometry shader will loop over each active streamout buffer, assembling
641 /// primitives for the downstream stages. When multistream output is enabled,
642 /// the generated stream ID buffer from the GS needs to be converted to a cut
643 /// buffer for the primitive assembler.
644 /// @param stream - stream id to generate the cut buffer for
645 /// @param pStreamIdBase - pointer to the stream ID buffer
646 /// @param numEmittedVerts - Number of total verts emitted by the GS
647 /// @param pCutBuffer - output buffer to write cuts to
648 void ProcessStreamIdBuffer(uint32_t stream
, uint8_t* pStreamIdBase
, uint32_t numEmittedVerts
, uint8_t *pCutBuffer
)
650 SWR_ASSERT(stream
< MAX_SO_STREAMS
);
652 uint32_t numInputBytes
= (numEmittedVerts
* 2 + 7) / 8;
653 uint32_t numOutputBytes
= std::max(numInputBytes
/ 2, 1U);
655 for (uint32_t b
= 0; b
< numOutputBytes
; ++b
)
657 uint8_t curInputByte
= pStreamIdBase
[2*b
];
659 for (uint32_t i
= 0; i
< 4; ++i
)
661 if ((curInputByte
& 0x3) != stream
)
668 curInputByte
= pStreamIdBase
[2 * b
+ 1];
669 for (uint32_t i
= 0; i
< 4; ++i
)
671 if ((curInputByte
& 0x3) != stream
)
673 outByte
|= (1 << (i
+ 4));
678 *pCutBuffer
++ = outByte
;
// Per-worker GS execution context (THREAD presumably expands to
// thread-local storage — confirm against the project's OS/knob headers),
// populated and consumed by GeometryShaderStage below.
THREAD SWR_GS_CONTEXT tlsGsContext;
684 //////////////////////////////////////////////////////////////////////////
685 /// @brief Implements GS stage.
686 /// @param pDC - pointer to draw context.
687 /// @param workerId - thread's worker id. Even thread has a unique id.
688 /// @param pa - The primitive assembly object.
689 /// @param pGsOut - output stream for GS
691 typename HasStreamOutT
,
693 static void GeometryShaderStage(
699 void* pStreamCutBuffer
,
700 uint32_t* pSoPrimData
,
703 SWR_CONTEXT
*pContext
= pDC
->pContext
;
705 AR_BEGIN(FEGeometryShader
, pDC
->drawId
);
707 const API_STATE
& state
= GetApiState(pDC
);
708 const SWR_GS_STATE
* pState
= &state
.gsState
;
710 SWR_ASSERT(pGsOut
!= nullptr, "GS output buffer should be initialized");
711 SWR_ASSERT(pCutBuffer
!= nullptr, "GS output cut buffer should be initialized");
713 tlsGsContext
.pStream
= (uint8_t*)pGsOut
;
714 tlsGsContext
.pCutOrStreamIdBuffer
= (uint8_t*)pCutBuffer
;
715 tlsGsContext
.PrimitiveID
= primID
;
717 uint32_t numVertsPerPrim
= NumVertsPerPrim(pa
.binTopology
, true);
718 simdvector attrib
[MAX_ATTRIBUTES
];
720 // assemble all attributes for the input primitive
721 for (uint32_t slot
= 0; slot
< pState
->numInputAttribs
; ++slot
)
723 uint32_t attribSlot
= VERTEX_ATTRIB_START_SLOT
+ slot
;
724 pa
.Assemble(attribSlot
, attrib
);
726 for (uint32_t i
= 0; i
< numVertsPerPrim
; ++i
)
728 tlsGsContext
.vert
[i
].attrib
[attribSlot
] = attrib
[i
];
733 pa
.Assemble(VERTEX_POSITION_SLOT
, attrib
);
734 for (uint32_t i
= 0; i
< numVertsPerPrim
; ++i
)
736 tlsGsContext
.vert
[i
].attrib
[VERTEX_POSITION_SLOT
] = attrib
[i
];
739 const uint32_t vertexStride
= sizeof(simdvertex
);
740 const uint32_t numSimdBatches
= (state
.gsState
.maxNumVerts
+ KNOB_SIMD_WIDTH
- 1) / KNOB_SIMD_WIDTH
;
741 const uint32_t inputPrimStride
= numSimdBatches
* vertexStride
;
742 const uint32_t instanceStride
= inputPrimStride
* KNOB_SIMD_WIDTH
;
743 uint32_t cutPrimStride
;
744 uint32_t cutInstanceStride
;
746 if (pState
->isSingleStream
)
748 cutPrimStride
= (state
.gsState
.maxNumVerts
+ 7) / 8;
749 cutInstanceStride
= cutPrimStride
* KNOB_SIMD_WIDTH
;
753 cutPrimStride
= AlignUp(state
.gsState
.maxNumVerts
* 2 / 8, 4);
754 cutInstanceStride
= cutPrimStride
* KNOB_SIMD_WIDTH
;
757 // record valid prims from the frontend to avoid over binning the newly generated
759 uint32_t numInputPrims
= pa
.NumPrims();
761 for (uint32_t instance
= 0; instance
< pState
->instanceCount
; ++instance
)
763 tlsGsContext
.InstanceID
= instance
;
764 tlsGsContext
.mask
= GenerateMask(numInputPrims
);
766 // execute the geometry shader
767 state
.pfnGsFunc(GetPrivateState(pDC
), &tlsGsContext
);
769 tlsGsContext
.pStream
+= instanceStride
;
770 tlsGsContext
.pCutOrStreamIdBuffer
+= cutInstanceStride
;
773 // set up new binner and state for the GS output topology
774 PFN_PROCESS_PRIMS pfnClipFunc
= nullptr;
777 switch (pState
->outputTopology
)
779 case TOP_TRIANGLE_STRIP
: pfnClipFunc
= ClipTriangles
; break;
780 case TOP_LINE_STRIP
: pfnClipFunc
= ClipLines
; break;
781 case TOP_POINT_LIST
: pfnClipFunc
= ClipPoints
; break;
782 default: SWR_ASSERT(false, "Unexpected GS output topology: %d", pState
->outputTopology
);
786 // foreach input prim:
787 // - setup a new PA based on the emitted verts for that prim
788 // - loop over the new verts, calling PA to assemble each prim
789 uint32_t* pVertexCount
= (uint32_t*)&tlsGsContext
.vertexCount
;
790 uint32_t* pPrimitiveId
= (uint32_t*)&primID
;
792 uint32_t totalPrimsGenerated
= 0;
793 for (uint32_t inputPrim
= 0; inputPrim
< numInputPrims
; ++inputPrim
)
795 uint8_t* pInstanceBase
= (uint8_t*)pGsOut
+ inputPrim
* inputPrimStride
;
796 uint8_t* pCutBufferBase
= (uint8_t*)pCutBuffer
+ inputPrim
* cutPrimStride
;
797 for (uint32_t instance
= 0; instance
< pState
->instanceCount
; ++instance
)
799 uint32_t numEmittedVerts
= pVertexCount
[inputPrim
];
800 if (numEmittedVerts
== 0)
805 uint8_t* pBase
= pInstanceBase
+ instance
* instanceStride
;
806 uint8_t* pCutBase
= pCutBufferBase
+ instance
* cutInstanceStride
;
808 uint32_t numAttribs
= state
.feNumAttributes
;
810 for (uint32_t stream
= 0; stream
< MAX_SO_STREAMS
; ++stream
)
812 bool processCutVerts
= false;
814 uint8_t* pCutBuffer
= pCutBase
;
816 // assign default stream ID, only relevant when GS is outputting a single stream
817 uint32_t streamID
= 0;
818 if (pState
->isSingleStream
)
820 processCutVerts
= true;
821 streamID
= pState
->singleStreamID
;
822 if (streamID
!= stream
) continue;
826 // early exit if this stream is not enabled for streamout
827 if (HasStreamOutT::value
&& !state
.soState
.streamEnable
[stream
])
832 // multi-stream output, need to translate StreamID buffer to a cut buffer
833 ProcessStreamIdBuffer(stream
, pCutBase
, numEmittedVerts
, (uint8_t*)pStreamCutBuffer
);
834 pCutBuffer
= (uint8_t*)pStreamCutBuffer
;
835 processCutVerts
= false;
838 PA_STATE_CUT
gsPa(pDC
, pBase
, numEmittedVerts
, pCutBuffer
, numEmittedVerts
, numAttribs
, pState
->outputTopology
, processCutVerts
);
840 while (gsPa
.GetNextStreamOutput())
844 bool assemble
= gsPa
.Assemble(VERTEX_POSITION_SLOT
, attrib
);
848 totalPrimsGenerated
+= gsPa
.NumPrims();
850 if (HasStreamOutT::value
)
852 StreamOut(pDC
, gsPa
, workerId
, pSoPrimData
, stream
);
855 if (HasRastT::value
&& state
.soState
.streamToRasterizer
== stream
)
858 // pull primitiveID from the GS output if available
859 if (state
.gsState
.emitsPrimitiveID
)
861 simdvector primIdAttrib
[3];
862 gsPa
.Assemble(VERTEX_PRIMID_SLOT
, primIdAttrib
);
863 vPrimId
= _simd_castps_si(primIdAttrib
[0].x
);
867 vPrimId
= _simd_set1_epi32(pPrimitiveId
[inputPrim
]);
870 // use viewport array index if GS declares it as an output attribute. Otherwise use index 0.
871 simdscalari vViewPortIdx
;
872 if (state
.gsState
.emitsViewportArrayIndex
)
874 simdvector vpiAttrib
[3];
875 gsPa
.Assemble(VERTEX_VIEWPORT_ARRAY_INDEX_SLOT
, vpiAttrib
);
877 // OOB indices => forced to zero.
878 simdscalari vNumViewports
= _simd_set1_epi32(KNOB_NUM_VIEWPORTS_SCISSORS
);
879 simdscalari vClearMask
= _simd_cmplt_epi32(_simd_castps_si(vpiAttrib
[0].x
), vNumViewports
);
880 vpiAttrib
[0].x
= _simd_and_ps(_simd_castsi_ps(vClearMask
), vpiAttrib
[0].x
);
882 vViewPortIdx
= _simd_castps_si(vpiAttrib
[0].x
);
886 vViewPortIdx
= _simd_set1_epi32(0);
889 pfnClipFunc(pDC
, gsPa
, workerId
, attrib
, GenMask(gsPa
.NumPrims()), vPrimId
, vViewPortIdx
);
892 } while (gsPa
.NextPrim());
898 // update GS pipeline stats
899 UPDATE_STAT_FE(GsInvocations
, numInputPrims
* pState
->instanceCount
);
900 UPDATE_STAT_FE(GsPrimitives
, totalPrimsGenerated
);
902 AR_END(FEGeometryShader
, 1);
905 //////////////////////////////////////////////////////////////////////////
906 /// @brief Allocate GS buffers
907 /// @param pDC - pointer to draw context.
908 /// @param state - API state
909 /// @param ppGsOut - pointer to GS output buffer allocation
910 /// @param ppCutBuffer - pointer to GS output cut buffer allocation
911 static INLINE
void AllocateGsBuffers(DRAW_CONTEXT
* pDC
, const API_STATE
& state
, void** ppGsOut
, void** ppCutBuffer
,
912 void **ppStreamCutBuffer
)
914 auto pArena
= pDC
->pArena
;
915 SWR_ASSERT(pArena
!= nullptr);
916 SWR_ASSERT(state
.gsState
.gsEnable
);
917 // allocate arena space to hold GS output verts
918 // @todo pack attribs
919 // @todo support multiple streams
920 const uint32_t vertexStride
= sizeof(simdvertex
);
921 const uint32_t numSimdBatches
= (state
.gsState
.maxNumVerts
+ KNOB_SIMD_WIDTH
- 1) / KNOB_SIMD_WIDTH
;
922 uint32_t size
= state
.gsState
.instanceCount
* numSimdBatches
* vertexStride
* KNOB_SIMD_WIDTH
;
923 *ppGsOut
= pArena
->AllocAligned(size
, KNOB_SIMD_WIDTH
* sizeof(float));
925 const uint32_t cutPrimStride
= (state
.gsState
.maxNumVerts
+ 7) / 8;
926 const uint32_t streamIdPrimStride
= AlignUp(state
.gsState
.maxNumVerts
* 2 / 8, 4);
927 const uint32_t cutBufferSize
= cutPrimStride
* state
.gsState
.instanceCount
* KNOB_SIMD_WIDTH
;
928 const uint32_t streamIdSize
= streamIdPrimStride
* state
.gsState
.instanceCount
* KNOB_SIMD_WIDTH
;
930 // allocate arena space to hold cut or streamid buffer, which is essentially a bitfield sized to the
931 // maximum vertex output as defined by the GS state, per SIMD lane, per GS instance
933 // allocate space for temporary per-stream cut buffer if multi-stream is enabled
934 if (state
.gsState
.isSingleStream
)
936 *ppCutBuffer
= pArena
->AllocAligned(cutBufferSize
, KNOB_SIMD_WIDTH
* sizeof(float));
937 *ppStreamCutBuffer
= nullptr;
941 *ppCutBuffer
= pArena
->AllocAligned(streamIdSize
, KNOB_SIMD_WIDTH
* sizeof(float));
942 *ppStreamCutBuffer
= pArena
->AllocAligned(cutBufferSize
, KNOB_SIMD_WIDTH
* sizeof(float));
947 //////////////////////////////////////////////////////////////////////////
948 /// @brief Contains all data generated by the HS and passed to the
949 /// tessellator and DS.
950 struct TessellationThreadLocalData
952 SWR_HS_CONTEXT hsContext
;
953 ScalarPatch patchData
[KNOB_SIMD_WIDTH
];
957 simdscalar
* pDSOutput
;
958 size_t numDSOutputVectors
;
961 THREAD TessellationThreadLocalData
* gt_pTessellationThreadData
= nullptr;
963 //////////////////////////////////////////////////////////////////////////
964 /// @brief Allocate tessellation data for this worker thread.
966 static void AllocateTessellationData(SWR_CONTEXT
* pContext
)
968 /// @TODO - Don't use thread local storage. Use Worker local storage instead.
969 if (gt_pTessellationThreadData
== nullptr)
971 gt_pTessellationThreadData
= (TessellationThreadLocalData
*)
972 AlignedMalloc(sizeof(TessellationThreadLocalData
), 64);
973 memset(gt_pTessellationThreadData
, 0, sizeof(*gt_pTessellationThreadData
));
977 //////////////////////////////////////////////////////////////////////////
978 /// @brief Implements Tessellation Stages.
979 /// @param pDC - pointer to draw context.
980 /// @param workerId - thread's worker id. Even thread has a unique id.
981 /// @param pa - The primitive assembly object.
982 /// @param pGsOut - output stream for GS
984 typename HasGeometryShaderT
,
985 typename HasStreamOutT
,
987 static void TessellationStages(
993 void* pCutStreamBuffer
,
994 uint32_t* pSoPrimData
,
997 SWR_CONTEXT
*pContext
= pDC
->pContext
;
998 const API_STATE
& state
= GetApiState(pDC
);
999 const SWR_TS_STATE
& tsState
= state
.tsState
;
1001 SWR_ASSERT(gt_pTessellationThreadData
);
1003 HANDLE tsCtx
= TSInitCtx(
1005 tsState
.partitioning
,
1006 tsState
.tsOutputTopology
,
1007 gt_pTessellationThreadData
->pTxCtx
,
1008 gt_pTessellationThreadData
->tsCtxSize
);
1009 if (tsCtx
== nullptr)
1011 gt_pTessellationThreadData
->pTxCtx
= AlignedMalloc(gt_pTessellationThreadData
->tsCtxSize
, 64);
1014 tsState
.partitioning
,
1015 tsState
.tsOutputTopology
,
1016 gt_pTessellationThreadData
->pTxCtx
,
1017 gt_pTessellationThreadData
->tsCtxSize
);
1021 PFN_PROCESS_PRIMS pfnClipFunc
= nullptr;
1022 if (HasRastT::value
)
1024 switch (tsState
.postDSTopology
)
1026 case TOP_TRIANGLE_LIST
: pfnClipFunc
= ClipTriangles
; break;
1027 case TOP_LINE_LIST
: pfnClipFunc
= ClipLines
; break;
1028 case TOP_POINT_LIST
: pfnClipFunc
= ClipPoints
; break;
1029 default: SWR_ASSERT(false, "Unexpected DS output topology: %d", tsState
.postDSTopology
);
1033 SWR_HS_CONTEXT
& hsContext
= gt_pTessellationThreadData
->hsContext
;
1034 hsContext
.pCPout
= gt_pTessellationThreadData
->patchData
;
1035 hsContext
.PrimitiveID
= primID
;
1037 uint32_t numVertsPerPrim
= NumVertsPerPrim(pa
.binTopology
, false);
1038 // Max storage for one attribute for an entire simdprimitive
1039 simdvector simdattrib
[MAX_NUM_VERTS_PER_PRIM
];
1041 // assemble all attributes for the input primitives
1042 for (uint32_t slot
= 0; slot
< tsState
.numHsInputAttribs
; ++slot
)
1044 uint32_t attribSlot
= VERTEX_ATTRIB_START_SLOT
+ slot
;
1045 pa
.Assemble(attribSlot
, simdattrib
);
1047 for (uint32_t i
= 0; i
< numVertsPerPrim
; ++i
)
1049 hsContext
.vert
[i
].attrib
[attribSlot
] = simdattrib
[i
];
1054 memset(hsContext
.pCPout
, 0x90, sizeof(ScalarPatch
) * KNOB_SIMD_WIDTH
);
1057 uint32_t numPrims
= pa
.NumPrims();
1058 hsContext
.mask
= GenerateMask(numPrims
);
1061 AR_BEGIN(FEHullShader
, pDC
->drawId
);
1062 state
.pfnHsFunc(GetPrivateState(pDC
), &hsContext
);
1063 AR_END(FEHullShader
, 0);
1065 UPDATE_STAT_FE(HsInvocations
, numPrims
);
1067 const uint32_t* pPrimId
= (const uint32_t*)&primID
;
1069 for (uint32_t p
= 0; p
< numPrims
; ++p
)
1072 SWR_TS_TESSELLATED_DATA tsData
= { 0 };
1073 AR_BEGIN(FETessellation
, pDC
->drawId
);
1074 TSTessellate(tsCtx
, hsContext
.pCPout
[p
].tessFactors
, tsData
);
1075 AR_END(FETessellation
, 0);
1077 if (tsData
.NumPrimitives
== 0)
1081 SWR_ASSERT(tsData
.NumDomainPoints
);
1083 // Allocate DS Output memory
1084 uint32_t requiredDSVectorInvocations
= AlignUp(tsData
.NumDomainPoints
, KNOB_SIMD_WIDTH
) / KNOB_SIMD_WIDTH
;
1085 size_t requiredDSOutputVectors
= requiredDSVectorInvocations
* tsState
.numDsOutputAttribs
;
1086 size_t requiredAllocSize
= sizeof(simdvector
) * requiredDSOutputVectors
;
1087 if (requiredDSOutputVectors
> gt_pTessellationThreadData
->numDSOutputVectors
)
1089 AlignedFree(gt_pTessellationThreadData
->pDSOutput
);
1090 gt_pTessellationThreadData
->pDSOutput
= (simdscalar
*)AlignedMalloc(requiredAllocSize
, 64);
1091 gt_pTessellationThreadData
->numDSOutputVectors
= requiredDSOutputVectors
;
1093 SWR_ASSERT(gt_pTessellationThreadData
->pDSOutput
);
1094 SWR_ASSERT(gt_pTessellationThreadData
->numDSOutputVectors
>= requiredDSOutputVectors
);
1097 memset(gt_pTessellationThreadData
->pDSOutput
, 0x90, requiredAllocSize
);
1100 // Run Domain Shader
1101 SWR_DS_CONTEXT dsContext
;
1102 dsContext
.PrimitiveID
= pPrimId
[p
];
1103 dsContext
.pCpIn
= &hsContext
.pCPout
[p
];
1104 dsContext
.pDomainU
= (simdscalar
*)tsData
.pDomainPointsU
;
1105 dsContext
.pDomainV
= (simdscalar
*)tsData
.pDomainPointsV
;
1106 dsContext
.pOutputData
= gt_pTessellationThreadData
->pDSOutput
;
1107 dsContext
.vectorStride
= requiredDSVectorInvocations
;
1109 uint32_t dsInvocations
= 0;
1111 for (dsContext
.vectorOffset
= 0; dsContext
.vectorOffset
< requiredDSVectorInvocations
; ++dsContext
.vectorOffset
)
1113 dsContext
.mask
= GenerateMask(tsData
.NumDomainPoints
- dsInvocations
);
1115 AR_BEGIN(FEDomainShader
, pDC
->drawId
);
1116 state
.pfnDsFunc(GetPrivateState(pDC
), &dsContext
);
1117 AR_END(FEDomainShader
, 0);
1119 dsInvocations
+= KNOB_SIMD_WIDTH
;
1121 UPDATE_STAT_FE(DsInvocations
, tsData
.NumDomainPoints
);
1125 dsContext
.pOutputData
,
1126 dsContext
.vectorStride
,
1127 tsState
.numDsOutputAttribs
,
1129 tsData
.NumPrimitives
,
1130 tsState
.postDSTopology
);
1132 while (tessPa
.HasWork())
1134 if (HasGeometryShaderT::value
)
1136 GeometryShaderStage
<HasStreamOutT
, HasRastT
>(
1137 pDC
, workerId
, tessPa
, pGsOut
, pCutBuffer
, pCutStreamBuffer
, pSoPrimData
,
1138 _simd_set1_epi32(dsContext
.PrimitiveID
));
1142 if (HasStreamOutT::value
)
1144 StreamOut(pDC
, tessPa
, workerId
, pSoPrimData
, 0);
1147 if (HasRastT::value
)
1149 simdvector prim
[3]; // Only deal with triangles, lines, or points
1150 AR_BEGIN(FEPAAssemble
, pDC
->drawId
);
1151 #if SWR_ENABLE_ASSERTS
1154 tessPa
.Assemble(VERTEX_POSITION_SLOT
, prim
);
1155 AR_END(FEPAAssemble
, 1);
1156 SWR_ASSERT(assemble
);
1158 SWR_ASSERT(pfnClipFunc
);
1159 pfnClipFunc(pDC
, tessPa
, workerId
, prim
,
1160 GenMask(tessPa
.NumPrims()), _simd_set1_epi32(dsContext
.PrimitiveID
), _simd_set1_epi32(0));
1166 } // while (tessPa.HasWork())
1167 } // for (uint32_t p = 0; p < numPrims; ++p)
1169 TSDestroyCtx(tsCtx
);
1172 //////////////////////////////////////////////////////////////////////////
1173 /// @brief FE handler for SwrDraw.
1174 /// @tparam IsIndexedT - Is indexed drawing enabled
1175 /// @tparam HasTessellationT - Is tessellation enabled
1176 /// @tparam HasGeometryShaderT::value - Is the geometry shader stage enabled
1177 /// @tparam HasStreamOutT - Is stream-out enabled
1178 /// @tparam HasRastT - Is rasterization enabled
1179 /// @param pContext - pointer to SWR context.
1180 /// @param pDC - pointer to draw context.
1181 /// @param workerId - thread's worker id.
1182 /// @param pUserData - Pointer to DRAW_WORK
1184 typename IsIndexedT
,
1185 typename IsCutIndexEnabledT
,
1186 typename HasTessellationT
,
1187 typename HasGeometryShaderT
,
1188 typename HasStreamOutT
,
1191 SWR_CONTEXT
*pContext
,
1197 #if KNOB_ENABLE_TOSS_POINTS
1198 if (KNOB_TOSS_QUEUE_FE
)
1204 AR_BEGIN(FEProcessDraw
, pDC
->drawId
);
1206 DRAW_WORK
& work
= *(DRAW_WORK
*)pUserData
;
1207 const API_STATE
& state
= GetApiState(pDC
);
1208 __m256i vScale
= _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
1209 SWR_VS_CONTEXT vsContext
;
1213 uint32_t endVertex
= work
.numVerts
;
1215 const int32_t* pLastRequestedIndex
= nullptr;
1216 if (IsIndexedT::value
)
1221 indexSize
= sizeof(uint32_t);
1222 pLastRequestedIndex
= &(work
.pIB
[endVertex
]);
1225 indexSize
= sizeof(uint16_t);
1226 // nasty address offset to last index
1227 pLastRequestedIndex
= (int32_t*)(&(((uint16_t*)work
.pIB
)[endVertex
]));
1230 indexSize
= sizeof(uint8_t);
1231 // nasty address offset to last index
1232 pLastRequestedIndex
= (int32_t*)(&(((uint8_t*)work
.pIB
)[endVertex
]));
1240 // No cuts, prune partial primitives.
1241 endVertex
= GetNumVerts(state
.topology
, GetNumPrims(state
.topology
, work
.numVerts
));
1244 SWR_FETCH_CONTEXT fetchInfo
= { 0 };
1245 fetchInfo
.pStreams
= &state
.vertexBuffers
[0];
1246 fetchInfo
.StartInstance
= work
.startInstance
;
1247 fetchInfo
.StartVertex
= 0;
1249 vsContext
.pVin
= &vin
;
1251 if (IsIndexedT::value
)
1253 fetchInfo
.BaseVertex
= work
.baseVertex
;
1255 // if the entire index buffer isn't being consumed, set the last index
1256 // so that fetches < a SIMD wide will be masked off
1257 fetchInfo
.pLastIndex
= (const int32_t*)(((uint8_t*)state
.indexBuffer
.pIndices
) + state
.indexBuffer
.size
);
1258 if (pLastRequestedIndex
< fetchInfo
.pLastIndex
)
1260 fetchInfo
.pLastIndex
= pLastRequestedIndex
;
1265 fetchInfo
.StartVertex
= work
.startVertex
;
1268 #ifdef KNOB_ENABLE_RDTSC
1269 uint32_t numPrims
= GetNumPrims(state
.topology
, work
.numVerts
);
1272 void* pGsOut
= nullptr;
1273 void* pCutBuffer
= nullptr;
1274 void* pStreamCutBuffer
= nullptr;
1275 if (HasGeometryShaderT::value
)
1277 AllocateGsBuffers(pDC
, state
, &pGsOut
, &pCutBuffer
, &pStreamCutBuffer
);
1280 if (HasTessellationT::value
)
1282 SWR_ASSERT(state
.tsState
.tsEnable
== true);
1283 SWR_ASSERT(state
.pfnHsFunc
!= nullptr);
1284 SWR_ASSERT(state
.pfnDsFunc
!= nullptr);
1286 AllocateTessellationData(pContext
);
1290 SWR_ASSERT(state
.tsState
.tsEnable
== false);
1291 SWR_ASSERT(state
.pfnHsFunc
== nullptr);
1292 SWR_ASSERT(state
.pfnDsFunc
== nullptr);
1295 // allocate space for streamout input prim data
1296 uint32_t* pSoPrimData
= nullptr;
1297 if (HasStreamOutT::value
)
1299 pSoPrimData
= (uint32_t*)pDC
->pArena
->AllocAligned(4096, 16);
1302 // choose primitive assembler
1303 PA_FACTORY
<IsIndexedT
, IsCutIndexEnabledT
> paFactory(pDC
, state
.topology
, work
.numVerts
);
1304 PA_STATE
& pa
= paFactory
.GetPA();
1306 /// @todo: temporarily move instance loop in the FE to ensure SO ordering
1307 for (uint32_t instanceNum
= 0; instanceNum
< work
.numInstances
; instanceNum
++)
1312 if (IsIndexedT::value
)
1314 fetchInfo
.pIndices
= work
.pIB
;
1318 vIndex
= _simd_add_epi32(_simd_set1_epi32(work
.startVertexID
), vScale
);
1319 fetchInfo
.pIndices
= (const int32_t*)&vIndex
;
1322 fetchInfo
.CurInstance
= instanceNum
;
1323 vsContext
.InstanceID
= instanceNum
;
1325 while (pa
.HasWork())
1327 // PaGetNextVsOutput currently has the side effect of updating some PA state machine state.
1328 // So we need to keep this outside of (i < endVertex) check.
1329 simdmask
* pvCutIndices
= nullptr;
1330 if (IsIndexedT::value
)
1332 pvCutIndices
= &pa
.GetNextVsIndices();
1335 simdvertex
& vout
= pa
.GetNextVsOutput();
1336 vsContext
.pVout
= &vout
;
1341 // 1. Execute FS/VS for a single SIMD.
1342 AR_BEGIN(FEFetchShader
, pDC
->drawId
);
1343 state
.pfnFetchFunc(fetchInfo
, vin
);
1344 AR_END(FEFetchShader
, 0);
1346 // forward fetch generated vertex IDs to the vertex shader
1347 vsContext
.VertexID
= fetchInfo
.VertexID
;
1349 // Setup active mask for vertex shader.
1350 vsContext
.mask
= GenerateMask(endVertex
- i
);
1352 // forward cut mask to the PA
1353 if (IsIndexedT::value
)
1355 *pvCutIndices
= _simd_movemask_ps(_simd_castsi_ps(fetchInfo
.CutMask
));
1358 UPDATE_STAT_FE(IaVertices
, GetNumInvocations(i
, endVertex
));
1360 #if KNOB_ENABLE_TOSS_POINTS
1361 if (!KNOB_TOSS_FETCH
)
1364 AR_BEGIN(FEVertexShader
, pDC
->drawId
);
1365 state
.pfnVertexFunc(GetPrivateState(pDC
), &vsContext
);
1366 AR_END(FEVertexShader
, 0);
1368 UPDATE_STAT_FE(VsInvocations
, GetNumInvocations(i
, endVertex
));
1372 // 2. Assemble primitives given the last two SIMD.
1375 simdvector prim
[MAX_NUM_VERTS_PER_PRIM
];
1376 // PaAssemble returns false if there is not enough verts to assemble.
1377 AR_BEGIN(FEPAAssemble
, pDC
->drawId
);
1378 bool assemble
= pa
.Assemble(VERTEX_POSITION_SLOT
, prim
);
1379 AR_END(FEPAAssemble
, 1);
1381 #if KNOB_ENABLE_TOSS_POINTS
1382 if (!KNOB_TOSS_FETCH
)
1385 #if KNOB_ENABLE_TOSS_POINTS
1391 UPDATE_STAT_FE(IaPrimitives
, pa
.NumPrims());
1393 if (HasTessellationT::value
)
1395 TessellationStages
<HasGeometryShaderT
, HasStreamOutT
, HasRastT
>(
1396 pDC
, workerId
, pa
, pGsOut
, pCutBuffer
, pStreamCutBuffer
, pSoPrimData
, pa
.GetPrimID(work
.startPrimID
));
1398 else if (HasGeometryShaderT::value
)
1400 GeometryShaderStage
<HasStreamOutT
, HasRastT
>(
1401 pDC
, workerId
, pa
, pGsOut
, pCutBuffer
, pStreamCutBuffer
, pSoPrimData
, pa
.GetPrimID(work
.startPrimID
));
1405 // If streamout is enabled then stream vertices out to memory.
1406 if (HasStreamOutT::value
)
1408 StreamOut(pDC
, pa
, workerId
, pSoPrimData
, 0);
1411 if (HasRastT::value
)
1413 SWR_ASSERT(pDC
->pState
->pfnProcessPrims
);
1414 pDC
->pState
->pfnProcessPrims(pDC
, pa
, workerId
, prim
,
1415 GenMask(pa
.NumPrims()), pa
.GetPrimID(work
.startPrimID
), _simd_set1_epi32(0));
1421 } while (pa
.NextPrim());
1423 i
+= KNOB_SIMD_WIDTH
;
1424 if (IsIndexedT::value
)
1426 fetchInfo
.pIndices
= (int*)((uint8_t*)fetchInfo
.pIndices
+ KNOB_SIMD_WIDTH
* indexSize
);
1430 vIndex
= _simd_add_epi32(vIndex
, _simd_set1_epi32(KNOB_SIMD_WIDTH
));
1436 AR_END(FEProcessDraw
, numPrims
* work
.numInstances
);
1439 struct FEDrawChooser
1441 typedef PFN_FE_WORK_FUNC FuncType
;
1443 template <typename
... ArgsB
>
1444 static FuncType
GetFunc()
1446 return ProcessDraw
<ArgsB
...>;
1451 // Selector for correct templated Draw front-end function
1452 PFN_FE_WORK_FUNC
GetProcessDrawFunc(
1454 bool IsCutIndexEnabled
,
1455 bool HasTessellation
,
1456 bool HasGeometryShader
,
1458 bool HasRasterization
)
1460 return TemplateArgUnroller
<FEDrawChooser
>::GetFunc(IsIndexed
, IsCutIndexEnabled
, HasTessellation
, HasGeometryShader
, HasStreamOut
, HasRasterization
);
1463 //////////////////////////////////////////////////////////////////////////
1464 /// @brief Processes attributes for the backend based on linkage mask and
1465 /// linkage map. Essentially just doing an SOA->AOS conversion and pack.
1466 /// @param pDC - Draw context
1467 /// @param pa - Primitive Assembly state
1468 /// @param linkageMask - Specifies which VS outputs are routed to PS.
1469 /// @param pLinkageMap - maps VS attribute slot to PS slot
1470 /// @param triIndex - Triangle to process attributes for
1471 /// @param pBuffer - Output result
1472 template<typename NumVertsT
, typename IsSwizzledT
, typename HasConstantInterpT
, typename IsDegenerate
>
1473 INLINE
void ProcessAttributes(
1480 static_assert(NumVertsT::value
> 0 && NumVertsT::value
<= 3, "Invalid value for NumVertsT");
1481 const SWR_BACKEND_STATE
& backendState
= pDC
->pState
->state
.backendState
;
1482 // Conservative Rasterization requires degenerate tris to have constant attribute interpolation
1483 LONG constantInterpMask
= IsDegenerate::value
? 0xFFFFFFFF : backendState
.constantInterpolationMask
;
1484 const uint32_t provokingVertex
= pDC
->pState
->state
.frontendState
.topologyProvokingVertex
;
1485 const PRIMITIVE_TOPOLOGY topo
= pDC
->pState
->state
.topology
;
1487 static const float constTable
[3][4] = {
1488 {0.0f
, 0.0f
, 0.0f
, 0.0f
},
1489 {0.0f
, 0.0f
, 0.0f
, 1.0f
},
1490 {1.0f
, 1.0f
, 1.0f
, 1.0f
}
1493 for (uint32_t i
= 0; i
< backendState
.numAttributes
; ++i
)
1496 if (IsSwizzledT::value
)
1498 SWR_ATTRIB_SWIZZLE attribSwizzle
= backendState
.swizzleMap
[i
];
1499 inputSlot
= VERTEX_ATTRIB_START_SLOT
+ attribSwizzle
.sourceAttrib
;
1504 inputSlot
= VERTEX_ATTRIB_START_SLOT
+ i
;
1507 __m128 attrib
[3]; // triangle attribs (always 4 wide)
1508 float* pAttribStart
= pBuffer
;
1510 if (HasConstantInterpT::value
|| IsDegenerate::value
)
1512 if (_bittest(&constantInterpMask
, i
))
1515 uint32_t adjustedTriIndex
;
1516 static const uint32_t tristripProvokingVertex
[] = { 0, 2, 1 };
1517 static const int32_t quadProvokingTri
[2][4] = { {0, 0, 0, 1}, {0, -1, 0, 0} };
1518 static const uint32_t quadProvokingVertex
[2][4] = { {0, 1, 2, 2}, {0, 1, 1, 2} };
1519 static const int32_t qstripProvokingTri
[2][4] = { {0, 0, 0, 1}, {-1, 0, 0, 0} };
1520 static const uint32_t qstripProvokingVertex
[2][4] = { {0, 1, 2, 1}, {0, 0, 2, 1} };
1524 adjustedTriIndex
= triIndex
+ quadProvokingTri
[triIndex
& 1][provokingVertex
];
1525 vid
= quadProvokingVertex
[triIndex
& 1][provokingVertex
];
1527 case TOP_QUAD_STRIP
:
1528 adjustedTriIndex
= triIndex
+ qstripProvokingTri
[triIndex
& 1][provokingVertex
];
1529 vid
= qstripProvokingVertex
[triIndex
& 1][provokingVertex
];
1531 case TOP_TRIANGLE_STRIP
:
1532 adjustedTriIndex
= triIndex
;
1533 vid
= (triIndex
& 1)
1534 ? tristripProvokingVertex
[provokingVertex
]
1538 adjustedTriIndex
= triIndex
;
1539 vid
= provokingVertex
;
1543 pa
.AssembleSingle(inputSlot
, adjustedTriIndex
, attrib
);
1545 for (uint32_t i
= 0; i
< NumVertsT::value
; ++i
)
1547 _mm_store_ps(pBuffer
, attrib
[vid
]);
1553 pa
.AssembleSingle(inputSlot
, triIndex
, attrib
);
1555 for (uint32_t i
= 0; i
< NumVertsT::value
; ++i
)
1557 _mm_store_ps(pBuffer
, attrib
[i
]);
1564 pa
.AssembleSingle(inputSlot
, triIndex
, attrib
);
1566 for (uint32_t i
= 0; i
< NumVertsT::value
; ++i
)
1568 _mm_store_ps(pBuffer
, attrib
[i
]);
1573 // pad out the attrib buffer to 3 verts to ensure the triangle
1574 // interpolation code in the pixel shader works correctly for the
1575 // 3 topologies - point, line, tri. This effectively zeros out the
1576 // effect of the missing vertices in the triangle interpolation.
1577 for (uint32_t v
= NumVertsT::value
; v
< 3; ++v
)
1579 _mm_store_ps(pBuffer
, attrib
[NumVertsT::value
- 1]);
1583 // check for constant source overrides
1584 if (IsSwizzledT::value
)
1586 uint32_t mask
= backendState
.swizzleMap
[i
].componentOverrideMask
;
1590 while (_BitScanForward(&comp
, mask
))
1592 mask
&= ~(1 << comp
);
1594 float constantValue
= 0.0f
;
1595 switch ((SWR_CONSTANT_SOURCE
)backendState
.swizzleMap
[i
].constantSource
)
1597 case SWR_CONSTANT_SOURCE_CONST_0000
:
1598 case SWR_CONSTANT_SOURCE_CONST_0001_FLOAT
:
1599 case SWR_CONSTANT_SOURCE_CONST_1111_FLOAT
:
1600 constantValue
= constTable
[backendState
.swizzleMap
[i
].constantSource
][comp
];
1602 case SWR_CONSTANT_SOURCE_PRIM_ID
:
1603 constantValue
= *(float*)&primId
;
1607 // apply constant value to all 3 vertices
1608 for (uint32_t v
= 0; v
< 3; ++v
)
1610 pAttribStart
[comp
+ v
* 4] = constantValue
;
1619 typedef void(*PFN_PROCESS_ATTRIBUTES
)(DRAW_CONTEXT
*, PA_STATE
&, uint32_t, uint32_t, float*);
1621 struct ProcessAttributesChooser
1623 typedef PFN_PROCESS_ATTRIBUTES FuncType
;
1625 template <typename
... ArgsB
>
1626 static FuncType
GetFunc()
1628 return ProcessAttributes
<ArgsB
...>;
1632 PFN_PROCESS_ATTRIBUTES
GetProcessAttributesFunc(uint32_t NumVerts
, bool IsSwizzled
, bool HasConstantInterp
, bool IsDegenerate
= false)
1634 return TemplateArgUnroller
<ProcessAttributesChooser
>::GetFunc(IntArg
<1, 3>{NumVerts
}, IsSwizzled
, HasConstantInterp
, IsDegenerate
);
1637 //////////////////////////////////////////////////////////////////////////
1638 /// @brief Processes enabled user clip distances. Loads the active clip
1639 /// distances from the PA, sets up barycentric equations, and
1640 /// stores the results to the output buffer
1641 /// @param pa - Primitive Assembly state
1642 /// @param primIndex - primitive index to process
1643 /// @param clipDistMask - mask of enabled clip distances
1644 /// @param pUserClipBuffer - buffer to store results
1645 template<uint32_t NumVerts
>
1646 void ProcessUserClipDist(PA_STATE
& pa
, uint32_t primIndex
, uint8_t clipDistMask
, float* pUserClipBuffer
)
1649 while (_BitScanForward(&clipDist
, clipDistMask
))
1651 clipDistMask
&= ~(1 << clipDist
);
1652 uint32_t clipSlot
= clipDist
>> 2;
1653 uint32_t clipComp
= clipDist
& 0x3;
1654 uint32_t clipAttribSlot
= clipSlot
== 0 ?
1655 VERTEX_CLIPCULL_DIST_LO_SLOT
: VERTEX_CLIPCULL_DIST_HI_SLOT
;
1657 __m128 primClipDist
[3];
1658 pa
.AssembleSingle(clipAttribSlot
, primIndex
, primClipDist
);
1660 float vertClipDist
[NumVerts
];
1661 for (uint32_t e
= 0; e
< NumVerts
; ++e
)
1663 OSALIGNSIMD(float) aVertClipDist
[4];
1664 _mm_store_ps(aVertClipDist
, primClipDist
[e
]);
1665 vertClipDist
[e
] = aVertClipDist
[clipComp
];
1668 // setup plane equations for barycentric interpolation in the backend
1669 float baryCoeff
[NumVerts
];
1670 for (uint32_t e
= 0; e
< NumVerts
- 1; ++e
)
1672 baryCoeff
[e
] = vertClipDist
[e
] - vertClipDist
[NumVerts
- 1];
1674 baryCoeff
[NumVerts
- 1] = vertClipDist
[NumVerts
- 1];
1676 for (uint32_t e
= 0; e
< NumVerts
; ++e
)
1678 *(pUserClipBuffer
++) = baryCoeff
[e
];
1683 //////////////////////////////////////////////////////////////////////////
1684 /// @brief Convert the X,Y coords of a triangle to the requested Fixed
1685 /// Point precision from FP32.
1686 template <typename PT
= FixedPointTraits
<Fixed_16_8
>>
1687 INLINE simdscalari
fpToFixedPointVertical(const simdscalar vIn
)
1689 simdscalar vFixed
= _simd_mul_ps(vIn
, _simd_set1_ps(PT::ScaleT::value
));
1690 return _simd_cvtps_epi32(vFixed
);
1693 //////////////////////////////////////////////////////////////////////////
1694 /// @brief Helper function to set the X,Y coords of a triangle to the
1695 /// requested Fixed Point precision from FP32.
1696 /// @param tri: simdvector[3] of FP triangle verts
1697 /// @param vXi: fixed point X coords of tri verts
1698 /// @param vYi: fixed point Y coords of tri verts
1699 INLINE
static void FPToFixedPoint(const simdvector
* const tri
, simdscalari (&vXi
)[3], simdscalari (&vYi
)[3])
1701 vXi
[0] = fpToFixedPointVertical(tri
[0].x
);
1702 vYi
[0] = fpToFixedPointVertical(tri
[0].y
);
1703 vXi
[1] = fpToFixedPointVertical(tri
[1].x
);
1704 vYi
[1] = fpToFixedPointVertical(tri
[1].y
);
1705 vXi
[2] = fpToFixedPointVertical(tri
[2].x
);
1706 vYi
[2] = fpToFixedPointVertical(tri
[2].y
);
1709 //////////////////////////////////////////////////////////////////////////
1710 /// @brief Calculate bounding box for current triangle
1711 /// @tparam CT: ConservativeRastFETraits type
1712 /// @param vX: fixed point X position for triangle verts
1713 /// @param vY: fixed point Y position for triangle verts
1714 /// @param bbox: fixed point bbox
1715 /// *Note*: expects vX, vY to be in the correct precision for the type
1716 /// of rasterization. This avoids unnecessary FP->fixed conversions.
1717 template <typename CT
>
1718 INLINE
void calcBoundingBoxIntVertical(const simdvector
* const tri
, simdscalari (&vX
)[3], simdscalari (&vY
)[3], simdBBox
&bbox
)
1720 simdscalari vMinX
= vX
[0];
1721 vMinX
= _simd_min_epi32(vMinX
, vX
[1]);
1722 vMinX
= _simd_min_epi32(vMinX
, vX
[2]);
1724 simdscalari vMaxX
= vX
[0];
1725 vMaxX
= _simd_max_epi32(vMaxX
, vX
[1]);
1726 vMaxX
= _simd_max_epi32(vMaxX
, vX
[2]);
1728 simdscalari vMinY
= vY
[0];
1729 vMinY
= _simd_min_epi32(vMinY
, vY
[1]);
1730 vMinY
= _simd_min_epi32(vMinY
, vY
[2]);
1732 simdscalari vMaxY
= vY
[0];
1733 vMaxY
= _simd_max_epi32(vMaxY
, vY
[1]);
1734 vMaxY
= _simd_max_epi32(vMaxY
, vY
[2]);
1742 //////////////////////////////////////////////////////////////////////////
1743 /// @brief FEConservativeRastT specialization of calcBoundingBoxIntVertical
1744 /// Offsets BBox for conservative rast
1746 INLINE
void calcBoundingBoxIntVertical
<FEConservativeRastT
>(const simdvector
* const tri
, simdscalari (&vX
)[3], simdscalari (&vY
)[3], simdBBox
&bbox
)
1748 // FE conservative rast traits
1749 typedef FEConservativeRastT CT
;
1751 simdscalari vMinX
= vX
[0];
1752 vMinX
= _simd_min_epi32(vMinX
, vX
[1]);
1753 vMinX
= _simd_min_epi32(vMinX
, vX
[2]);
1755 simdscalari vMaxX
= vX
[0];
1756 vMaxX
= _simd_max_epi32(vMaxX
, vX
[1]);
1757 vMaxX
= _simd_max_epi32(vMaxX
, vX
[2]);
1759 simdscalari vMinY
= vY
[0];
1760 vMinY
= _simd_min_epi32(vMinY
, vY
[1]);
1761 vMinY
= _simd_min_epi32(vMinY
, vY
[2]);
1763 simdscalari vMaxY
= vY
[0];
1764 vMaxY
= _simd_max_epi32(vMaxY
, vY
[1]);
1765 vMaxY
= _simd_max_epi32(vMaxY
, vY
[2]);
1767 /// Bounding box needs to be expanded by 1/512 before snapping to 16.8 for conservative rasterization
1768 /// expand bbox by 1/256; coverage will be correctly handled in the rasterizer.
1769 bbox
.xmin
= _simd_sub_epi32(vMinX
, _simd_set1_epi32(CT::BoundingBoxOffsetT::value
));
1770 bbox
.xmax
= _simd_add_epi32(vMaxX
, _simd_set1_epi32(CT::BoundingBoxOffsetT::value
));
1771 bbox
.ymin
= _simd_sub_epi32(vMinY
, _simd_set1_epi32(CT::BoundingBoxOffsetT::value
));
1772 bbox
.ymax
= _simd_add_epi32(vMaxY
, _simd_set1_epi32(CT::BoundingBoxOffsetT::value
));
1775 //////////////////////////////////////////////////////////////////////////
1776 /// @brief Bin triangle primitives to macro tiles. Performs setup, clipping
1777 /// culling, viewport transform, etc.
1778 /// @param pDC - pointer to draw context.
1779 /// @param pa - The primitive assembly object.
1780 /// @param workerId - thread's worker id. Even thread has a unique id.
1781 /// @param tri - Contains triangle position data for SIMDs worth of triangles.
1782 /// @param primID - Primitive ID for each triangle.
1783 /// @param viewportIdx - viewport array index for each triangle.
1784 /// @tparam CT - ConservativeRastFETraits
1785 template <typename CT
>
1793 simdscalari viewportIdx
)
1795 SWR_CONTEXT
*pContext
= pDC
->pContext
;
1797 AR_BEGIN(FEBinTriangles
, pDC
->drawId
);
1799 const API_STATE
& state
= GetApiState(pDC
);
1800 const SWR_RASTSTATE
& rastState
= state
.rastState
;
1801 const SWR_FRONTEND_STATE
& feState
= state
.frontendState
;
1802 const SWR_GS_STATE
& gsState
= state
.gsState
;
1803 MacroTileMgr
*pTileMgr
= pDC
->pTileMgr
;
1806 simdscalar vRecipW0
= _simd_set1_ps(1.0f
);
1807 simdscalar vRecipW1
= _simd_set1_ps(1.0f
);
1808 simdscalar vRecipW2
= _simd_set1_ps(1.0f
);
1810 if (feState
.vpTransformDisable
)
1812 // RHW is passed in directly when VP transform is disabled
1813 vRecipW0
= tri
[0].v
[3];
1814 vRecipW1
= tri
[1].v
[3];
1815 vRecipW2
= tri
[2].v
[3];
1819 // Perspective divide
1820 vRecipW0
= _simd_div_ps(_simd_set1_ps(1.0f
), tri
[0].w
);
1821 vRecipW1
= _simd_div_ps(_simd_set1_ps(1.0f
), tri
[1].w
);
1822 vRecipW2
= _simd_div_ps(_simd_set1_ps(1.0f
), tri
[2].w
);
1824 tri
[0].v
[0] = _simd_mul_ps(tri
[0].v
[0], vRecipW0
);
1825 tri
[1].v
[0] = _simd_mul_ps(tri
[1].v
[0], vRecipW1
);
1826 tri
[2].v
[0] = _simd_mul_ps(tri
[2].v
[0], vRecipW2
);
1828 tri
[0].v
[1] = _simd_mul_ps(tri
[0].v
[1], vRecipW0
);
1829 tri
[1].v
[1] = _simd_mul_ps(tri
[1].v
[1], vRecipW1
);
1830 tri
[2].v
[1] = _simd_mul_ps(tri
[2].v
[1], vRecipW2
);
1832 tri
[0].v
[2] = _simd_mul_ps(tri
[0].v
[2], vRecipW0
);
1833 tri
[1].v
[2] = _simd_mul_ps(tri
[1].v
[2], vRecipW1
);
1834 tri
[2].v
[2] = _simd_mul_ps(tri
[2].v
[2], vRecipW2
);
1836 // Viewport transform to screen space coords
1837 if (state
.gsState
.emitsViewportArrayIndex
)
1839 viewportTransform
<3>(tri
, state
.vpMatrices
, viewportIdx
);
1843 viewportTransform
<3>(tri
, state
.vpMatrices
);
1847 // Adjust for pixel center location
1848 simdscalar offset
= g_pixelOffsets
[rastState
.pixelLocation
];
1849 tri
[0].x
= _simd_add_ps(tri
[0].x
, offset
);
1850 tri
[0].y
= _simd_add_ps(tri
[0].y
, offset
);
1852 tri
[1].x
= _simd_add_ps(tri
[1].x
, offset
);
1853 tri
[1].y
= _simd_add_ps(tri
[1].y
, offset
);
1855 tri
[2].x
= _simd_add_ps(tri
[2].x
, offset
);
1856 tri
[2].y
= _simd_add_ps(tri
[2].y
, offset
);
1858 simdscalari vXi
[3], vYi
[3];
1859 // Set vXi, vYi to required fixed point precision
1860 FPToFixedPoint(tri
, vXi
, vYi
);
1863 simdscalari vAi
[3], vBi
[3];
1864 triangleSetupABIntVertical(vXi
, vYi
, vAi
, vBi
);
1867 simdscalari vDet
[2];
1868 calcDeterminantIntVertical(vAi
, vBi
, vDet
);
1871 int maskLo
= _simd_movemask_pd(_simd_castsi_pd(_simd_cmpeq_epi64(vDet
[0], _simd_setzero_si())));
1872 int maskHi
= _simd_movemask_pd(_simd_castsi_pd(_simd_cmpeq_epi64(vDet
[1], _simd_setzero_si())));
1874 int cullZeroAreaMask
= maskLo
| (maskHi
<< (KNOB_SIMD_WIDTH
/ 2));
1876 uint32_t origTriMask
= triMask
;
1877 // don't cull degenerate triangles if we're conservatively rasterizing
1878 if(!CT::IsConservativeT::value
)
1880 triMask
&= ~cullZeroAreaMask
;
1883 // determine front winding tris
1885 // CCW det <= 0; 0 area triangles are marked as backfacing, which is required behavior for conservative rast
1886 maskLo
= _simd_movemask_pd(_simd_castsi_pd(_simd_cmpgt_epi64(vDet
[0], _simd_setzero_si())));
1887 maskHi
= _simd_movemask_pd(_simd_castsi_pd(_simd_cmpgt_epi64(vDet
[1], _simd_setzero_si())));
1888 int cwTriMask
= maskLo
| (maskHi
<< (KNOB_SIMD_WIDTH
/2) );
1890 uint32_t frontWindingTris
;
1891 if (rastState
.frontWinding
== SWR_FRONTWINDING_CW
)
1893 frontWindingTris
= cwTriMask
;
1897 frontWindingTris
= ~cwTriMask
;
1902 switch ((SWR_CULLMODE
)rastState
.cullMode
)
1904 case SWR_CULLMODE_BOTH
: cullTris
= 0xffffffff; break;
1905 case SWR_CULLMODE_NONE
: cullTris
= 0x0; break;
1906 case SWR_CULLMODE_FRONT
: cullTris
= frontWindingTris
; break;
1907 // 0 area triangles are marked as backfacing, which is required behavior for conservative rast
1908 case SWR_CULLMODE_BACK
: cullTris
= ~frontWindingTris
; break;
1909 default: SWR_ASSERT(false, "Invalid cull mode: %d", rastState
.cullMode
); cullTris
= 0x0; break;
1912 triMask
&= ~cullTris
;
1914 if (origTriMask
^ triMask
)
1916 RDTSC_EVENT(FECullZeroAreaAndBackface
, _mm_popcnt_u32(origTriMask
^ triMask
), 0);
1919 /// Note: these variable initializations must stay above any 'goto endBenTriangles'
1920 // compute per tri backface
1921 uint32_t frontFaceMask
= frontWindingTris
;
1922 uint32_t *pPrimID
= (uint32_t *)&primID
;
1923 const uint32_t *pViewportIndex
= (uint32_t *)&viewportIdx
;
1925 // for center sample pattern, all samples are at pixel center; calculate coverage
1926 // once at center and broadcast the results in the backend
1927 const SWR_MULTISAMPLE_COUNT sampleCount
= (rastState
.samplePattern
== SWR_MSAA_STANDARD_PATTERN
) ? rastState
.sampleCount
: SWR_MULTISAMPLE_1X
;
1928 uint32_t edgeEnable
;
1929 PFN_WORK_FUNC pfnWork
;
1930 if(CT::IsConservativeT::value
)
1932 // determine which edges of the degenerate tri, if any, are valid to rasterize.
1933 // used to call the appropriate templated rasterizer function
1934 if(cullZeroAreaMask
> 0)
1937 simdscalari x0x1Mask
= _simd_cmpeq_epi32(vXi
[0], vXi
[1]);
1938 simdscalari y0y1Mask
= _simd_cmpeq_epi32(vYi
[0], vYi
[1]);
1939 uint32_t e0Mask
= _simd_movemask_ps(_simd_castsi_ps(_simd_and_si(x0x1Mask
, y0y1Mask
)));
1942 simdscalari x1x2Mask
= _simd_cmpeq_epi32(vXi
[1], vXi
[2]);
1943 simdscalari y1y2Mask
= _simd_cmpeq_epi32(vYi
[1], vYi
[2]);
1944 uint32_t e1Mask
= _simd_movemask_ps(_simd_castsi_ps(_simd_and_si(x1x2Mask
, y1y2Mask
)));
1947 // if v0 == v1 & v1 == v2, v0 == v2
1948 uint32_t e2Mask
= e0Mask
& e1Mask
;
1949 SWR_ASSERT(KNOB_SIMD_WIDTH
== 8, "Need to update degenerate mask code for avx512");
1951 // edge order: e0 = v0v1, e1 = v1v2, e2 = v0v2
1952 // 32 bit binary: 0000 0000 0010 0100 1001 0010 0100 1001
1953 e0Mask
= pdep_u32(e0Mask
, 0x00249249);
1954 // 32 bit binary: 0000 0000 0100 1001 0010 0100 1001 0010
1955 e1Mask
= pdep_u32(e1Mask
, 0x00492492);
1956 // 32 bit binary: 0000 0000 1001 0010 0100 1001 0010 0100
1957 e2Mask
= pdep_u32(e2Mask
, 0x00924924);
1959 edgeEnable
= (0x00FFFFFF & (~(e0Mask
| e1Mask
| e2Mask
)));
1963 edgeEnable
= 0x00FFFFFF;
1968 // degenerate triangles won't be sent to rasterizer; just enable all edges
1969 pfnWork
= GetRasterizerFunc(sampleCount
, (rastState
.conservativeRast
> 0),
1970 (SWR_INPUT_COVERAGE
)pDC
->pState
->state
.psState
.inputCoverage
, ALL_EDGES_VALID
,
1971 (state
.scissorsTileAligned
== false));
1976 goto endBinTriangles
;
1979 // Calc bounding box of triangles
1981 calcBoundingBoxIntVertical
<CT
>(tri
, vXi
, vYi
, bbox
);
1983 // determine if triangle falls between pixel centers and discard
1984 // only discard for non-MSAA case and when conservative rast is disabled
1985 // (xmin + 127) & ~255
1986 // (xmax + 128) & ~255
1987 if(rastState
.sampleCount
== SWR_MULTISAMPLE_1X
&& (!CT::IsConservativeT::value
))
1989 origTriMask
= triMask
;
1993 simdscalari xmin
= _simd_add_epi32(bbox
.xmin
, _simd_set1_epi32(127));
1994 xmin
= _simd_and_si(xmin
, _simd_set1_epi32(~255));
1995 simdscalari xmax
= _simd_add_epi32(bbox
.xmax
, _simd_set1_epi32(128));
1996 xmax
= _simd_and_si(xmax
, _simd_set1_epi32(~255));
1998 simdscalari vMaskH
= _simd_cmpeq_epi32(xmin
, xmax
);
2000 simdscalari ymin
= _simd_add_epi32(bbox
.ymin
, _simd_set1_epi32(127));
2001 ymin
= _simd_and_si(ymin
, _simd_set1_epi32(~255));
2002 simdscalari ymax
= _simd_add_epi32(bbox
.ymax
, _simd_set1_epi32(128));
2003 ymax
= _simd_and_si(ymax
, _simd_set1_epi32(~255));
2005 simdscalari vMaskV
= _simd_cmpeq_epi32(ymin
, ymax
);
2006 vMaskV
= _simd_or_si(vMaskH
, vMaskV
);
2007 cullCenterMask
= _simd_movemask_ps(_simd_castsi_ps(vMaskV
));
2010 triMask
&= ~cullCenterMask
;
2012 if(origTriMask
^ triMask
)
2014 RDTSC_EVENT(FECullBetweenCenters
, _mm_popcnt_u32(origTriMask
^ triMask
), 0);
2018 // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
2019 // Gather the AOS effective scissor rects based on the per-prim VP index.
2020 /// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer.
2021 simdscalari scisXmin
, scisYmin
, scisXmax
, scisYmax
;
2022 if (state
.gsState
.emitsViewportArrayIndex
)
2024 GatherScissors
<KNOB_SIMD_WIDTH
>::Gather(&state
.scissorsInFixedPoint
[0], pViewportIndex
,
2025 scisXmin
, scisYmin
, scisXmax
, scisYmax
);
2027 else // broadcast fast path for non-VPAI case.
2029 scisXmin
= _simd_set1_epi32(state
.scissorsInFixedPoint
[0].xmin
);
2030 scisYmin
= _simd_set1_epi32(state
.scissorsInFixedPoint
[0].ymin
);
2031 scisXmax
= _simd_set1_epi32(state
.scissorsInFixedPoint
[0].xmax
);
2032 scisYmax
= _simd_set1_epi32(state
.scissorsInFixedPoint
[0].ymax
);
2035 bbox
.xmin
= _simd_max_epi32(bbox
.xmin
, scisXmin
);
2036 bbox
.ymin
= _simd_max_epi32(bbox
.ymin
, scisYmin
);
2037 bbox
.xmax
= _simd_min_epi32(_simd_sub_epi32(bbox
.xmax
, _simd_set1_epi32(1)), scisXmax
);
2038 bbox
.ymax
= _simd_min_epi32(_simd_sub_epi32(bbox
.ymax
, _simd_set1_epi32(1)), scisYmax
);
2040 if(CT::IsConservativeT::value
)
2042 // in the case where a degenerate triangle is on a scissor edge, we need to make sure the primitive bbox has
2043 // some area. Bump the xmax/ymax edges out
2044 simdscalari topEqualsBottom
= _simd_cmpeq_epi32(bbox
.ymin
, bbox
.ymax
);
2045 bbox
.ymax
= _simd_blendv_epi32(bbox
.ymax
, _simd_add_epi32(bbox
.ymax
, _simd_set1_epi32(1)), topEqualsBottom
);
2046 simdscalari leftEqualsRight
= _simd_cmpeq_epi32(bbox
.xmin
, bbox
.xmax
);
2047 bbox
.xmax
= _simd_blendv_epi32(bbox
.xmax
, _simd_add_epi32(bbox
.xmax
, _simd_set1_epi32(1)), leftEqualsRight
);
2050 // Cull tris completely outside scissor
2052 simdscalari maskOutsideScissorX
= _simd_cmpgt_epi32(bbox
.xmin
, bbox
.xmax
);
2053 simdscalari maskOutsideScissorY
= _simd_cmpgt_epi32(bbox
.ymin
, bbox
.ymax
);
2054 simdscalari maskOutsideScissorXY
= _simd_or_si(maskOutsideScissorX
, maskOutsideScissorY
);
2055 uint32_t maskOutsideScissor
= _simd_movemask_ps(_simd_castsi_ps(maskOutsideScissorXY
));
2056 triMask
= triMask
& ~maskOutsideScissor
;
2061 goto endBinTriangles
;
2064 // Convert triangle bbox to macrotile units.
2065 bbox
.xmin
= _simd_srai_epi32(bbox
.xmin
, KNOB_MACROTILE_X_DIM_FIXED_SHIFT
);
2066 bbox
.ymin
= _simd_srai_epi32(bbox
.ymin
, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT
);
2067 bbox
.xmax
= _simd_srai_epi32(bbox
.xmax
, KNOB_MACROTILE_X_DIM_FIXED_SHIFT
);
2068 bbox
.ymax
= _simd_srai_epi32(bbox
.ymax
, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT
);
2070 OSALIGNSIMD(uint32_t) aMTLeft
[KNOB_SIMD_WIDTH
], aMTRight
[KNOB_SIMD_WIDTH
], aMTTop
[KNOB_SIMD_WIDTH
], aMTBottom
[KNOB_SIMD_WIDTH
];
2071 _simd_store_si((simdscalari
*)aMTLeft
, bbox
.xmin
);
2072 _simd_store_si((simdscalari
*)aMTRight
, bbox
.xmax
);
2073 _simd_store_si((simdscalari
*)aMTTop
, bbox
.ymin
);
2074 _simd_store_si((simdscalari
*)aMTBottom
, bbox
.ymax
);
2076 // transpose verts needed for backend
2077 /// @todo modify BE to take non-transformed verts
2078 __m128 vHorizX
[8], vHorizY
[8], vHorizZ
[8], vHorizW
[8];
2079 vTranspose3x8(vHorizX
, tri
[0].x
, tri
[1].x
, tri
[2].x
);
2080 vTranspose3x8(vHorizY
, tri
[0].y
, tri
[1].y
, tri
[2].y
);
2081 vTranspose3x8(vHorizZ
, tri
[0].z
, tri
[1].z
, tri
[2].z
);
2082 vTranspose3x8(vHorizW
, vRecipW0
, vRecipW1
, vRecipW2
);
2084 // store render target array index
2085 OSALIGNSIMD(uint32_t) aRTAI
[KNOB_SIMD_WIDTH
];
2086 if (gsState
.gsEnable
&& gsState
.emitsRenderTargetArrayIndex
)
2088 simdvector vRtai
[3];
2089 pa
.Assemble(VERTEX_RTAI_SLOT
, vRtai
);
2091 vRtaii
= _simd_castps_si(vRtai
[0].x
);
2092 _simd_store_si((simdscalari
*)aRTAI
, vRtaii
);
2096 _simd_store_si((simdscalari
*)aRTAI
, _simd_setzero_si());
2099 // scan remaining valid triangles and bin each separately
2100 while (_BitScanForward(&triIndex
, triMask
))
2102 uint32_t linkageCount
= state
.backendState
.numAttributes
;
2103 uint32_t numScalarAttribs
= linkageCount
* 4;
2109 if(CT::IsConservativeT::value
)
2111 // only rasterize valid edges if we have a degenerate primitive
2112 int32_t triEdgeEnable
= (edgeEnable
>> (triIndex
* 3)) & ALL_EDGES_VALID
;
2113 work
.pfnWork
= GetRasterizerFunc(sampleCount
, (rastState
.conservativeRast
> 0),
2114 (SWR_INPUT_COVERAGE
)pDC
->pState
->state
.psState
.inputCoverage
, triEdgeEnable
,
2115 (state
.scissorsTileAligned
== false));
2117 // Degenerate triangles are required to be constant interpolated
2118 isDegenerate
= (triEdgeEnable
!= ALL_EDGES_VALID
) ? true : false;
2122 isDegenerate
= false;
2123 work
.pfnWork
= pfnWork
;
2126 // Select attribute processor
2127 PFN_PROCESS_ATTRIBUTES pfnProcessAttribs
= GetProcessAttributesFunc(3,
2128 state
.backendState
.swizzleEnable
, state
.backendState
.constantInterpolationMask
, isDegenerate
);
2130 TRIANGLE_WORK_DESC
&desc
= work
.desc
.tri
;
2132 desc
.triFlags
.frontFacing
= state
.forceFront
? 1 : ((frontFaceMask
>> triIndex
) & 1);
2133 desc
.triFlags
.primID
= pPrimID
[triIndex
];
2134 desc
.triFlags
.renderTargetArrayIndex
= aRTAI
[triIndex
];
2135 desc
.triFlags
.viewportIndex
= pViewportIndex
[triIndex
];
2137 auto pArena
= pDC
->pArena
;
2138 SWR_ASSERT(pArena
!= nullptr);
2140 // store active attribs
2141 float *pAttribs
= (float*)pArena
->AllocAligned(numScalarAttribs
* 3 * sizeof(float), 16);
2142 desc
.pAttribs
= pAttribs
;
2143 desc
.numAttribs
= linkageCount
;
2144 pfnProcessAttribs(pDC
, pa
, triIndex
, pPrimID
[triIndex
], desc
.pAttribs
);
2146 // store triangle vertex data
2147 desc
.pTriBuffer
= (float*)pArena
->AllocAligned(4 * 4 * sizeof(float), 16);
2149 _mm_store_ps(&desc
.pTriBuffer
[0], vHorizX
[triIndex
]);
2150 _mm_store_ps(&desc
.pTriBuffer
[4], vHorizY
[triIndex
]);
2151 _mm_store_ps(&desc
.pTriBuffer
[8], vHorizZ
[triIndex
]);
2152 _mm_store_ps(&desc
.pTriBuffer
[12], vHorizW
[triIndex
]);
2154 // store user clip distances
2155 if (rastState
.clipDistanceMask
)
2157 uint32_t numClipDist
= _mm_popcnt_u32(rastState
.clipDistanceMask
);
2158 desc
.pUserClipBuffer
= (float*)pArena
->Alloc(numClipDist
* 3 * sizeof(float));
2159 ProcessUserClipDist
<3>(pa
, triIndex
, rastState
.clipDistanceMask
, desc
.pUserClipBuffer
);
2162 for (uint32_t y
= aMTTop
[triIndex
]; y
<= aMTBottom
[triIndex
]; ++y
)
2164 for (uint32_t x
= aMTLeft
[triIndex
]; x
<= aMTRight
[triIndex
]; ++x
)
2166 #if KNOB_ENABLE_TOSS_POINTS
2167 if (!KNOB_TOSS_SETUP_TRIS
)
2170 pTileMgr
->enqueue(x
, y
, &work
);
2174 triMask
&= ~(1 << triIndex
);
2178 AR_END(FEBinTriangles
, 1);
2181 struct FEBinTrianglesChooser
2183 typedef PFN_PROCESS_PRIMS FuncType
;
2185 template <typename
... ArgsB
>
2186 static FuncType
GetFunc()
2188 return BinTriangles
<ConservativeRastFETraits
<ArgsB
...>>;
2192 // Selector for correct templated BinTrinagles function
2193 PFN_PROCESS_PRIMS
GetBinTrianglesFunc(bool IsConservative
)
2195 return TemplateArgUnroller
<FEBinTrianglesChooser
>::GetFunc(IsConservative
);
2198 //////////////////////////////////////////////////////////////////////////
2199 /// @brief Bin SIMD points to the backend. Only supports point size of 1
2200 /// @param pDC - pointer to draw context.
2201 /// @param pa - The primitive assembly object.
2202 /// @param workerId - thread's worker id. Each thread has a unique id.
2203 /// @param tri - Contains point position data for SIMDs worth of points.
2204 /// @param primID - Primitive ID for each point.
2212 simdscalari viewportIdx
)
2214 SWR_CONTEXT
*pContext
= pDC
->pContext
;
2216 AR_BEGIN(FEBinPoints
, pDC
->drawId
);
2218 simdvector
& primVerts
= prim
[0];
2220 const API_STATE
& state
= GetApiState(pDC
);
2221 const SWR_FRONTEND_STATE
& feState
= state
.frontendState
;
2222 const SWR_GS_STATE
& gsState
= state
.gsState
;
2223 const SWR_RASTSTATE
& rastState
= state
.rastState
;
2224 const uint32_t *pViewportIndex
= (uint32_t *)&viewportIdx
;
2226 // Select attribute processor
2227 PFN_PROCESS_ATTRIBUTES pfnProcessAttribs
= GetProcessAttributesFunc(1,
2228 state
.backendState
.swizzleEnable
, state
.backendState
.constantInterpolationMask
);
2230 if (!feState
.vpTransformDisable
)
2232 // perspective divide
2233 simdscalar vRecipW0
= _simd_div_ps(_simd_set1_ps(1.0f
), primVerts
.w
);
2234 primVerts
.x
= _simd_mul_ps(primVerts
.x
, vRecipW0
);
2235 primVerts
.y
= _simd_mul_ps(primVerts
.y
, vRecipW0
);
2236 primVerts
.z
= _simd_mul_ps(primVerts
.z
, vRecipW0
);
2238 // viewport transform to screen coords
2239 if (state
.gsState
.emitsViewportArrayIndex
)
2241 viewportTransform
<1>(&primVerts
, state
.vpMatrices
, viewportIdx
);
2245 viewportTransform
<1>(&primVerts
, state
.vpMatrices
);
2249 // adjust for pixel center location
2250 simdscalar offset
= g_pixelOffsets
[rastState
.pixelLocation
];
2251 primVerts
.x
= _simd_add_ps(primVerts
.x
, offset
);
2252 primVerts
.y
= _simd_add_ps(primVerts
.y
, offset
);
2254 // convert to fixed point
2255 simdscalari vXi
, vYi
;
2256 vXi
= fpToFixedPointVertical(primVerts
.x
);
2257 vYi
= fpToFixedPointVertical(primVerts
.y
);
2259 if (CanUseSimplePoints(pDC
))
2261 // adjust for ymin-xmin rule
2262 vXi
= _simd_sub_epi32(vXi
, _simd_set1_epi32(1));
2263 vYi
= _simd_sub_epi32(vYi
, _simd_set1_epi32(1));
2265 // cull points off the ymin-xmin edge of the viewport
2266 primMask
&= ~_simd_movemask_ps(_simd_castsi_ps(vXi
));
2267 primMask
&= ~_simd_movemask_ps(_simd_castsi_ps(vYi
));
2269 // compute macro tile coordinates
2270 simdscalari macroX
= _simd_srai_epi32(vXi
, KNOB_MACROTILE_X_DIM_FIXED_SHIFT
);
2271 simdscalari macroY
= _simd_srai_epi32(vYi
, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT
);
2273 OSALIGNSIMD(uint32_t) aMacroX
[KNOB_SIMD_WIDTH
], aMacroY
[KNOB_SIMD_WIDTH
];
2274 _simd_store_si((simdscalari
*)aMacroX
, macroX
);
2275 _simd_store_si((simdscalari
*)aMacroY
, macroY
);
2277 // compute raster tile coordinates
2278 simdscalari rasterX
= _simd_srai_epi32(vXi
, KNOB_TILE_X_DIM_SHIFT
+ FIXED_POINT_SHIFT
);
2279 simdscalari rasterY
= _simd_srai_epi32(vYi
, KNOB_TILE_Y_DIM_SHIFT
+ FIXED_POINT_SHIFT
);
2281 // compute raster tile relative x,y for coverage mask
2282 simdscalari tileAlignedX
= _simd_slli_epi32(rasterX
, KNOB_TILE_X_DIM_SHIFT
);
2283 simdscalari tileAlignedY
= _simd_slli_epi32(rasterY
, KNOB_TILE_Y_DIM_SHIFT
);
2285 simdscalari tileRelativeX
= _simd_sub_epi32(_simd_srai_epi32(vXi
, FIXED_POINT_SHIFT
), tileAlignedX
);
2286 simdscalari tileRelativeY
= _simd_sub_epi32(_simd_srai_epi32(vYi
, FIXED_POINT_SHIFT
), tileAlignedY
);
2288 OSALIGNSIMD(uint32_t) aTileRelativeX
[KNOB_SIMD_WIDTH
];
2289 OSALIGNSIMD(uint32_t) aTileRelativeY
[KNOB_SIMD_WIDTH
];
2290 _simd_store_si((simdscalari
*)aTileRelativeX
, tileRelativeX
);
2291 _simd_store_si((simdscalari
*)aTileRelativeY
, tileRelativeY
);
2293 OSALIGNSIMD(uint32_t) aTileAlignedX
[KNOB_SIMD_WIDTH
];
2294 OSALIGNSIMD(uint32_t) aTileAlignedY
[KNOB_SIMD_WIDTH
];
2295 _simd_store_si((simdscalari
*)aTileAlignedX
, tileAlignedX
);
2296 _simd_store_si((simdscalari
*)aTileAlignedY
, tileAlignedY
);
2298 OSALIGNSIMD(float) aZ
[KNOB_SIMD_WIDTH
];
2299 _simd_store_ps((float*)aZ
, primVerts
.z
);
2301 // store render target array index
2302 OSALIGNSIMD(uint32_t) aRTAI
[KNOB_SIMD_WIDTH
];
2303 if (gsState
.gsEnable
&& gsState
.emitsRenderTargetArrayIndex
)
2306 pa
.Assemble(VERTEX_RTAI_SLOT
, &vRtai
);
2307 simdscalari vRtaii
= _simd_castps_si(vRtai
.x
);
2308 _simd_store_si((simdscalari
*)aRTAI
, vRtaii
);
2312 _simd_store_si((simdscalari
*)aRTAI
, _simd_setzero_si());
2315 uint32_t *pPrimID
= (uint32_t *)&primID
;
2316 DWORD primIndex
= 0;
2318 const SWR_BACKEND_STATE
& backendState
= pDC
->pState
->state
.backendState
;
2320 // scan remaining valid triangles and bin each separately
2321 while (_BitScanForward(&primIndex
, primMask
))
2323 uint32_t linkageCount
= backendState
.numAttributes
;
2324 uint32_t numScalarAttribs
= linkageCount
* 4;
2329 TRIANGLE_WORK_DESC
&desc
= work
.desc
.tri
;
2331 // points are always front facing
2332 desc
.triFlags
.frontFacing
= 1;
2333 desc
.triFlags
.primID
= pPrimID
[primIndex
];
2334 desc
.triFlags
.renderTargetArrayIndex
= aRTAI
[primIndex
];
2335 desc
.triFlags
.viewportIndex
= pViewportIndex
[primIndex
];
2337 work
.pfnWork
= RasterizeSimplePoint
;
2339 auto pArena
= pDC
->pArena
;
2340 SWR_ASSERT(pArena
!= nullptr);
2343 float *pAttribs
= (float*)pArena
->AllocAligned(3 * numScalarAttribs
* sizeof(float), 16);
2344 desc
.pAttribs
= pAttribs
;
2345 desc
.numAttribs
= linkageCount
;
2347 pfnProcessAttribs(pDC
, pa
, primIndex
, pPrimID
[primIndex
], pAttribs
);
2349 // store raster tile aligned x, y, perspective correct z
2350 float *pTriBuffer
= (float*)pArena
->AllocAligned(4 * sizeof(float), 16);
2351 desc
.pTriBuffer
= pTriBuffer
;
2352 *(uint32_t*)pTriBuffer
++ = aTileAlignedX
[primIndex
];
2353 *(uint32_t*)pTriBuffer
++ = aTileAlignedY
[primIndex
];
2354 *pTriBuffer
= aZ
[primIndex
];
2356 uint32_t tX
= aTileRelativeX
[primIndex
];
2357 uint32_t tY
= aTileRelativeY
[primIndex
];
2359 // pack the relative x,y into the coverageMask, the rasterizer will
2360 // generate the true coverage mask from it
2361 work
.desc
.tri
.triFlags
.coverageMask
= tX
| (tY
<< 4);
2364 MacroTileMgr
*pTileMgr
= pDC
->pTileMgr
;
2365 #if KNOB_ENABLE_TOSS_POINTS
2366 if (!KNOB_TOSS_SETUP_TRIS
)
2369 pTileMgr
->enqueue(aMacroX
[primIndex
], aMacroY
[primIndex
], &work
);
2371 primMask
&= ~(1 << primIndex
);
2376 // non simple points need to be potentially binned to multiple macro tiles
2377 simdscalar vPointSize
;
2378 if (rastState
.pointParam
)
2381 pa
.Assemble(VERTEX_POINT_SIZE_SLOT
, size
);
2382 vPointSize
= size
[0].x
;
2386 vPointSize
= _simd_set1_ps(rastState
.pointSize
);
2389 // bloat point to bbox
2391 bbox
.xmin
= bbox
.xmax
= vXi
;
2392 bbox
.ymin
= bbox
.ymax
= vYi
;
2394 simdscalar vHalfWidth
= _simd_mul_ps(vPointSize
, _simd_set1_ps(0.5f
));
2395 simdscalari vHalfWidthi
= fpToFixedPointVertical(vHalfWidth
);
2396 bbox
.xmin
= _simd_sub_epi32(bbox
.xmin
, vHalfWidthi
);
2397 bbox
.xmax
= _simd_add_epi32(bbox
.xmax
, vHalfWidthi
);
2398 bbox
.ymin
= _simd_sub_epi32(bbox
.ymin
, vHalfWidthi
);
2399 bbox
.ymax
= _simd_add_epi32(bbox
.ymax
, vHalfWidthi
);
2401 // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
2402 // Gather the AOS effective scissor rects based on the per-prim VP index.
2403 /// @todo: Look at speeding this up -- weigh against corresponding costs in rasterizer.
2404 simdscalari scisXmin
, scisYmin
, scisXmax
, scisYmax
;
2405 if (state
.gsState
.emitsViewportArrayIndex
)
2407 GatherScissors
<KNOB_SIMD_WIDTH
>::Gather(&state
.scissorsInFixedPoint
[0], pViewportIndex
,
2408 scisXmin
, scisYmin
, scisXmax
, scisYmax
);
2410 else // broadcast fast path for non-VPAI case.
2412 scisXmin
= _simd_set1_epi32(state
.scissorsInFixedPoint
[0].xmin
);
2413 scisYmin
= _simd_set1_epi32(state
.scissorsInFixedPoint
[0].ymin
);
2414 scisXmax
= _simd_set1_epi32(state
.scissorsInFixedPoint
[0].xmax
);
2415 scisYmax
= _simd_set1_epi32(state
.scissorsInFixedPoint
[0].ymax
);
2418 bbox
.xmin
= _simd_max_epi32(bbox
.xmin
, scisXmin
);
2419 bbox
.ymin
= _simd_max_epi32(bbox
.ymin
, scisYmin
);
2420 bbox
.xmax
= _simd_min_epi32(_simd_sub_epi32(bbox
.xmax
, _simd_set1_epi32(1)), scisXmax
);
2421 bbox
.ymax
= _simd_min_epi32(_simd_sub_epi32(bbox
.ymax
, _simd_set1_epi32(1)), scisYmax
);
2423 // Cull bloated points completely outside scissor
2424 simdscalari maskOutsideScissorX
= _simd_cmpgt_epi32(bbox
.xmin
, bbox
.xmax
);
2425 simdscalari maskOutsideScissorY
= _simd_cmpgt_epi32(bbox
.ymin
, bbox
.ymax
);
2426 simdscalari maskOutsideScissorXY
= _simd_or_si(maskOutsideScissorX
, maskOutsideScissorY
);
2427 uint32_t maskOutsideScissor
= _simd_movemask_ps(_simd_castsi_ps(maskOutsideScissorXY
));
2428 primMask
= primMask
& ~maskOutsideScissor
;
2430 // Convert bbox to macrotile units.
2431 bbox
.xmin
= _simd_srai_epi32(bbox
.xmin
, KNOB_MACROTILE_X_DIM_FIXED_SHIFT
);
2432 bbox
.ymin
= _simd_srai_epi32(bbox
.ymin
, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT
);
2433 bbox
.xmax
= _simd_srai_epi32(bbox
.xmax
, KNOB_MACROTILE_X_DIM_FIXED_SHIFT
);
2434 bbox
.ymax
= _simd_srai_epi32(bbox
.ymax
, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT
);
2436 OSALIGNSIMD(uint32_t) aMTLeft
[KNOB_SIMD_WIDTH
], aMTRight
[KNOB_SIMD_WIDTH
], aMTTop
[KNOB_SIMD_WIDTH
], aMTBottom
[KNOB_SIMD_WIDTH
];
2437 _simd_store_si((simdscalari
*)aMTLeft
, bbox
.xmin
);
2438 _simd_store_si((simdscalari
*)aMTRight
, bbox
.xmax
);
2439 _simd_store_si((simdscalari
*)aMTTop
, bbox
.ymin
);
2440 _simd_store_si((simdscalari
*)aMTBottom
, bbox
.ymax
);
2442 // store render target array index
2443 OSALIGNSIMD(uint32_t) aRTAI
[KNOB_SIMD_WIDTH
];
2444 if (gsState
.gsEnable
&& gsState
.emitsRenderTargetArrayIndex
)
2446 simdvector vRtai
[2];
2447 pa
.Assemble(VERTEX_RTAI_SLOT
, vRtai
);
2448 simdscalari vRtaii
= _simd_castps_si(vRtai
[0].x
);
2449 _simd_store_si((simdscalari
*)aRTAI
, vRtaii
);
2453 _simd_store_si((simdscalari
*)aRTAI
, _simd_setzero_si());
2456 OSALIGNSIMD(float) aPointSize
[KNOB_SIMD_WIDTH
];
2457 _simd_store_ps((float*)aPointSize
, vPointSize
);
2459 uint32_t *pPrimID
= (uint32_t *)&primID
;
2461 OSALIGNSIMD(float) aPrimVertsX
[KNOB_SIMD_WIDTH
];
2462 OSALIGNSIMD(float) aPrimVertsY
[KNOB_SIMD_WIDTH
];
2463 OSALIGNSIMD(float) aPrimVertsZ
[KNOB_SIMD_WIDTH
];
2465 _simd_store_ps((float*)aPrimVertsX
, primVerts
.x
);
2466 _simd_store_ps((float*)aPrimVertsY
, primVerts
.y
);
2467 _simd_store_ps((float*)aPrimVertsZ
, primVerts
.z
);
2469 // scan remaining valid prims and bin each separately
2470 const SWR_BACKEND_STATE
& backendState
= state
.backendState
;
2472 while (_BitScanForward(&primIndex
, primMask
))
2474 uint32_t linkageCount
= backendState
.numAttributes
;
2475 uint32_t numScalarAttribs
= linkageCount
* 4;
2480 TRIANGLE_WORK_DESC
&desc
= work
.desc
.tri
;
2482 desc
.triFlags
.frontFacing
= 1;
2483 desc
.triFlags
.primID
= pPrimID
[primIndex
];
2484 desc
.triFlags
.pointSize
= aPointSize
[primIndex
];
2485 desc
.triFlags
.renderTargetArrayIndex
= aRTAI
[primIndex
];
2486 desc
.triFlags
.viewportIndex
= pViewportIndex
[primIndex
];
2488 work
.pfnWork
= RasterizeTriPoint
;
2490 auto pArena
= pDC
->pArena
;
2491 SWR_ASSERT(pArena
!= nullptr);
2493 // store active attribs
2494 desc
.pAttribs
= (float*)pArena
->AllocAligned(numScalarAttribs
* 3 * sizeof(float), 16);
2495 desc
.numAttribs
= linkageCount
;
2496 pfnProcessAttribs(pDC
, pa
, primIndex
, pPrimID
[primIndex
], desc
.pAttribs
);
2498 // store point vertex data
2499 float *pTriBuffer
= (float*)pArena
->AllocAligned(4 * sizeof(float), 16);
2500 desc
.pTriBuffer
= pTriBuffer
;
2501 *pTriBuffer
++ = aPrimVertsX
[primIndex
];
2502 *pTriBuffer
++ = aPrimVertsY
[primIndex
];
2503 *pTriBuffer
= aPrimVertsZ
[primIndex
];
2505 // store user clip distances
2506 if (rastState
.clipDistanceMask
)
2508 uint32_t numClipDist
= _mm_popcnt_u32(rastState
.clipDistanceMask
);
2509 desc
.pUserClipBuffer
= (float*)pArena
->Alloc(numClipDist
* 2 * sizeof(float));
2510 ProcessUserClipDist
<2>(pa
, primIndex
, rastState
.clipDistanceMask
, desc
.pUserClipBuffer
);
2513 MacroTileMgr
*pTileMgr
= pDC
->pTileMgr
;
2514 for (uint32_t y
= aMTTop
[primIndex
]; y
<= aMTBottom
[primIndex
]; ++y
)
2516 for (uint32_t x
= aMTLeft
[primIndex
]; x
<= aMTRight
[primIndex
]; ++x
)
2518 #if KNOB_ENABLE_TOSS_POINTS
2519 if (!KNOB_TOSS_SETUP_TRIS
)
2522 pTileMgr
->enqueue(x
, y
, &work
);
2527 primMask
&= ~(1 << primIndex
);
2531 AR_END(FEBinPoints
, 1);
2534 //////////////////////////////////////////////////////////////////////////
2535 /// @brief Bin SIMD lines to the backend.
2536 /// @param pDC - pointer to draw context.
2537 /// @param pa - The primitive assembly object.
2538 /// @param workerId - thread's worker id. Each thread has a unique id.
2539 /// @param tri - Contains line position data for SIMDs worth of points.
2540 /// @param primID - Primitive ID for each line.
2541 /// @param viewportIdx - Viewport Array Index for each line.
2549 simdscalari viewportIdx
)
2551 SWR_CONTEXT
*pContext
= pDC
->pContext
;
2553 AR_BEGIN(FEBinLines
, pDC
->drawId
);
2555 const API_STATE
& state
= GetApiState(pDC
);
2556 const SWR_RASTSTATE
& rastState
= state
.rastState
;
2557 const SWR_FRONTEND_STATE
& feState
= state
.frontendState
;
2558 const SWR_GS_STATE
& gsState
= state
.gsState
;
2560 // Select attribute processor
2561 PFN_PROCESS_ATTRIBUTES pfnProcessAttribs
= GetProcessAttributesFunc(2,
2562 state
.backendState
.swizzleEnable
, state
.backendState
.constantInterpolationMask
);
2564 simdscalar vRecipW0
= _simd_set1_ps(1.0f
);
2565 simdscalar vRecipW1
= _simd_set1_ps(1.0f
);
2567 if (!feState
.vpTransformDisable
)
2569 // perspective divide
2570 vRecipW0
= _simd_div_ps(_simd_set1_ps(1.0f
), prim
[0].w
);
2571 vRecipW1
= _simd_div_ps(_simd_set1_ps(1.0f
), prim
[1].w
);
2573 prim
[0].v
[0] = _simd_mul_ps(prim
[0].v
[0], vRecipW0
);
2574 prim
[1].v
[0] = _simd_mul_ps(prim
[1].v
[0], vRecipW1
);
2576 prim
[0].v
[1] = _simd_mul_ps(prim
[0].v
[1], vRecipW0
);
2577 prim
[1].v
[1] = _simd_mul_ps(prim
[1].v
[1], vRecipW1
);
2579 prim
[0].v
[2] = _simd_mul_ps(prim
[0].v
[2], vRecipW0
);
2580 prim
[1].v
[2] = _simd_mul_ps(prim
[1].v
[2], vRecipW1
);
2582 // viewport transform to screen coords
2583 if (state
.gsState
.emitsViewportArrayIndex
)
2585 viewportTransform
<2>(prim
, state
.vpMatrices
, viewportIdx
);
2589 viewportTransform
<2>(prim
, state
.vpMatrices
);
2593 // adjust for pixel center location
2594 simdscalar offset
= g_pixelOffsets
[rastState
.pixelLocation
];
2595 prim
[0].x
= _simd_add_ps(prim
[0].x
, offset
);
2596 prim
[0].y
= _simd_add_ps(prim
[0].y
, offset
);
2598 prim
[1].x
= _simd_add_ps(prim
[1].x
, offset
);
2599 prim
[1].y
= _simd_add_ps(prim
[1].y
, offset
);
2601 // convert to fixed point
2602 simdscalari vXi
[2], vYi
[2];
2603 vXi
[0] = fpToFixedPointVertical(prim
[0].x
);
2604 vYi
[0] = fpToFixedPointVertical(prim
[0].y
);
2605 vXi
[1] = fpToFixedPointVertical(prim
[1].x
);
2606 vYi
[1] = fpToFixedPointVertical(prim
[1].y
);
2608 // compute x-major vs y-major mask
2609 simdscalari xLength
= _simd_abs_epi32(_simd_sub_epi32(vXi
[0], vXi
[1]));
2610 simdscalari yLength
= _simd_abs_epi32(_simd_sub_epi32(vYi
[0], vYi
[1]));
2611 simdscalar vYmajorMask
= _simd_castsi_ps(_simd_cmpgt_epi32(yLength
, xLength
));
2612 uint32_t yMajorMask
= _simd_movemask_ps(vYmajorMask
);
2614 // cull zero-length lines
2615 simdscalari vZeroLengthMask
= _simd_cmpeq_epi32(xLength
, _simd_setzero_si());
2616 vZeroLengthMask
= _simd_and_si(vZeroLengthMask
, _simd_cmpeq_epi32(yLength
, _simd_setzero_si()));
2618 primMask
&= ~_simd_movemask_ps(_simd_castsi_ps(vZeroLengthMask
));
2620 uint32_t *pPrimID
= (uint32_t *)&primID
;
2621 const uint32_t *pViewportIndex
= (uint32_t *)&viewportIdx
;
2623 simdscalar vUnused
= _simd_setzero_ps();
2625 // Calc bounding box of lines
2627 bbox
.xmin
= _simd_min_epi32(vXi
[0], vXi
[1]);
2628 bbox
.xmax
= _simd_max_epi32(vXi
[0], vXi
[1]);
2629 bbox
.ymin
= _simd_min_epi32(vYi
[0], vYi
[1]);
2630 bbox
.ymax
= _simd_max_epi32(vYi
[0], vYi
[1]);
2632 // bloat bbox by line width along minor axis
2633 simdscalar vHalfWidth
= _simd_set1_ps(rastState
.lineWidth
/ 2.0f
);
2634 simdscalari vHalfWidthi
= fpToFixedPointVertical(vHalfWidth
);
2636 bloatBox
.xmin
= _simd_sub_epi32(bbox
.xmin
, vHalfWidthi
);
2637 bloatBox
.xmax
= _simd_add_epi32(bbox
.xmax
, vHalfWidthi
);
2638 bloatBox
.ymin
= _simd_sub_epi32(bbox
.ymin
, vHalfWidthi
);
2639 bloatBox
.ymax
= _simd_add_epi32(bbox
.ymax
, vHalfWidthi
);
2641 bbox
.xmin
= _simd_blendv_epi32(bbox
.xmin
, bloatBox
.xmin
, vYmajorMask
);
2642 bbox
.xmax
= _simd_blendv_epi32(bbox
.xmax
, bloatBox
.xmax
, vYmajorMask
);
2643 bbox
.ymin
= _simd_blendv_epi32(bloatBox
.ymin
, bbox
.ymin
, vYmajorMask
);
2644 bbox
.ymax
= _simd_blendv_epi32(bloatBox
.ymax
, bbox
.ymax
, vYmajorMask
);
2646 // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
2647 simdscalari scisXmin
, scisYmin
, scisXmax
, scisYmax
;
2648 if (state
.gsState
.emitsViewportArrayIndex
)
2650 GatherScissors
<KNOB_SIMD_WIDTH
>::Gather(&state
.scissorsInFixedPoint
[0], pViewportIndex
,
2651 scisXmin
, scisYmin
, scisXmax
, scisYmax
);
2653 else // broadcast fast path for non-VPAI case.
2655 scisXmin
= _simd_set1_epi32(state
.scissorsInFixedPoint
[0].xmin
);
2656 scisYmin
= _simd_set1_epi32(state
.scissorsInFixedPoint
[0].ymin
);
2657 scisXmax
= _simd_set1_epi32(state
.scissorsInFixedPoint
[0].xmax
);
2658 scisYmax
= _simd_set1_epi32(state
.scissorsInFixedPoint
[0].ymax
);
2661 bbox
.xmin
= _simd_max_epi32(bbox
.xmin
, scisXmin
);
2662 bbox
.ymin
= _simd_max_epi32(bbox
.ymin
, scisYmin
);
2663 bbox
.xmax
= _simd_min_epi32(_simd_sub_epi32(bbox
.xmax
, _simd_set1_epi32(1)), scisXmax
);
2664 bbox
.ymax
= _simd_min_epi32(_simd_sub_epi32(bbox
.ymax
, _simd_set1_epi32(1)), scisYmax
);
2666 // Cull prims completely outside scissor
2668 simdscalari maskOutsideScissorX
= _simd_cmpgt_epi32(bbox
.xmin
, bbox
.xmax
);
2669 simdscalari maskOutsideScissorY
= _simd_cmpgt_epi32(bbox
.ymin
, bbox
.ymax
);
2670 simdscalari maskOutsideScissorXY
= _simd_or_si(maskOutsideScissorX
, maskOutsideScissorY
);
2671 uint32_t maskOutsideScissor
= _simd_movemask_ps(_simd_castsi_ps(maskOutsideScissorXY
));
2672 primMask
= primMask
& ~maskOutsideScissor
;
2680 // Convert triangle bbox to macrotile units.
2681 bbox
.xmin
= _simd_srai_epi32(bbox
.xmin
, KNOB_MACROTILE_X_DIM_FIXED_SHIFT
);
2682 bbox
.ymin
= _simd_srai_epi32(bbox
.ymin
, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT
);
2683 bbox
.xmax
= _simd_srai_epi32(bbox
.xmax
, KNOB_MACROTILE_X_DIM_FIXED_SHIFT
);
2684 bbox
.ymax
= _simd_srai_epi32(bbox
.ymax
, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT
);
2686 OSALIGNSIMD(uint32_t) aMTLeft
[KNOB_SIMD_WIDTH
], aMTRight
[KNOB_SIMD_WIDTH
], aMTTop
[KNOB_SIMD_WIDTH
], aMTBottom
[KNOB_SIMD_WIDTH
];
2687 _simd_store_si((simdscalari
*)aMTLeft
, bbox
.xmin
);
2688 _simd_store_si((simdscalari
*)aMTRight
, bbox
.xmax
);
2689 _simd_store_si((simdscalari
*)aMTTop
, bbox
.ymin
);
2690 _simd_store_si((simdscalari
*)aMTBottom
, bbox
.ymax
);
2692 // transpose verts needed for backend
2693 /// @todo modify BE to take non-transformed verts
2694 __m128 vHorizX
[8], vHorizY
[8], vHorizZ
[8], vHorizW
[8];
2695 vTranspose3x8(vHorizX
, prim
[0].x
, prim
[1].x
, vUnused
);
2696 vTranspose3x8(vHorizY
, prim
[0].y
, prim
[1].y
, vUnused
);
2697 vTranspose3x8(vHorizZ
, prim
[0].z
, prim
[1].z
, vUnused
);
2698 vTranspose3x8(vHorizW
, vRecipW0
, vRecipW1
, vUnused
);
2700 // store render target array index
2701 OSALIGNSIMD(uint32_t) aRTAI
[KNOB_SIMD_WIDTH
];
2702 if (gsState
.gsEnable
&& gsState
.emitsRenderTargetArrayIndex
)
2704 simdvector vRtai
[2];
2705 pa
.Assemble(VERTEX_RTAI_SLOT
, vRtai
);
2706 simdscalari vRtaii
= _simd_castps_si(vRtai
[0].x
);
2707 _simd_store_si((simdscalari
*)aRTAI
, vRtaii
);
2711 _simd_store_si((simdscalari
*)aRTAI
, _simd_setzero_si());
2714 // scan remaining valid prims and bin each separately
2716 while (_BitScanForward(&primIndex
, primMask
))
2718 uint32_t linkageCount
= state
.backendState
.numAttributes
;
2719 uint32_t numScalarAttribs
= linkageCount
* 4;
2724 TRIANGLE_WORK_DESC
&desc
= work
.desc
.tri
;
2726 desc
.triFlags
.frontFacing
= 1;
2727 desc
.triFlags
.primID
= pPrimID
[primIndex
];
2728 desc
.triFlags
.yMajor
= (yMajorMask
>> primIndex
) & 1;
2729 desc
.triFlags
.renderTargetArrayIndex
= aRTAI
[primIndex
];
2730 desc
.triFlags
.viewportIndex
= pViewportIndex
[primIndex
];
2732 work
.pfnWork
= RasterizeLine
;
2734 auto pArena
= pDC
->pArena
;
2735 SWR_ASSERT(pArena
!= nullptr);
2737 // store active attribs
2738 desc
.pAttribs
= (float*)pArena
->AllocAligned(numScalarAttribs
* 3 * sizeof(float), 16);
2739 desc
.numAttribs
= linkageCount
;
2740 pfnProcessAttribs(pDC
, pa
, primIndex
, pPrimID
[primIndex
], desc
.pAttribs
);
2742 // store line vertex data
2743 desc
.pTriBuffer
= (float*)pArena
->AllocAligned(4 * 4 * sizeof(float), 16);
2744 _mm_store_ps(&desc
.pTriBuffer
[0], vHorizX
[primIndex
]);
2745 _mm_store_ps(&desc
.pTriBuffer
[4], vHorizY
[primIndex
]);
2746 _mm_store_ps(&desc
.pTriBuffer
[8], vHorizZ
[primIndex
]);
2747 _mm_store_ps(&desc
.pTriBuffer
[12], vHorizW
[primIndex
]);
2749 // store user clip distances
2750 if (rastState
.clipDistanceMask
)
2752 uint32_t numClipDist
= _mm_popcnt_u32(rastState
.clipDistanceMask
);
2753 desc
.pUserClipBuffer
= (float*)pArena
->Alloc(numClipDist
* 2 * sizeof(float));
2754 ProcessUserClipDist
<2>(pa
, primIndex
, rastState
.clipDistanceMask
, desc
.pUserClipBuffer
);
2757 MacroTileMgr
*pTileMgr
= pDC
->pTileMgr
;
2758 for (uint32_t y
= aMTTop
[primIndex
]; y
<= aMTBottom
[primIndex
]; ++y
)
2760 for (uint32_t x
= aMTLeft
[primIndex
]; x
<= aMTRight
[primIndex
]; ++x
)
2762 #if KNOB_ENABLE_TOSS_POINTS
2763 if (!KNOB_TOSS_SETUP_TRIS
)
2766 pTileMgr
->enqueue(x
, y
, &work
);
2771 primMask
&= ~(1 << primIndex
);
2776 AR_END(FEBinLines
, 1);