1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
25 * @brief Implementation for Frontend which handles vertex processing,
26 * primitive assembly, clipping, binning, etc.
28 ******************************************************************************/
34 #include "rdtsc_core.h"
35 #include "rasterizer.h"
36 #include "conservativeRast.h"
42 #include "tessellator.h"
45 //////////////////////////////////////////////////////////////////////////
46 /// @brief Helper macro to generate a bitmask
47 static INLINE
uint32_t GenMask(uint32_t numBits
)
49 SWR_ASSERT(numBits
<= (sizeof(uint32_t) * 8), "Too many bits (%d) for %s", numBits
, __FUNCTION__
);
50 return ((1U << numBits
) - 1);
53 //////////////////////////////////////////////////////////////////////////
54 /// @brief Offsets added to post-viewport vertex positions based on
/// the pixel location in use (the rest of the original sentence is not
/// visible here). The table has SWR_PIXEL_LOCATION_UL + 1 entries, one per
/// pixel location value, each built with _simd_set1_ps:
/// 0.0 for SWR_PIXEL_LOCATION_CENTER and 0.5 for SWR_PIXEL_LOCATION_UL.
56 static const simdscalar g_pixelOffsets
[SWR_PIXEL_LOCATION_UL
+ 1] =
58 _simd_set1_ps(0.0f
), // SWR_PIXEL_LOCATION_CENTER
59 _simd_set1_ps(0.5f
), // SWR_PIXEL_LOCATION_UL
62 //////////////////////////////////////////////////////////////////////////
63 /// @brief FE handler for SwrSync. Sets up a work item whose handler is
/// ProcessSyncBE and enqueues it once, at tile coordinates (0, 0), on the
/// draw context's macrotile manager.
64 /// @param pContext - pointer to SWR context.
65 /// @param pDC - pointer to draw context.
66 /// @param workerId - thread's worker id. Each thread has a unique id.
67 /// @param pUserData - Pointer to user data passed back to sync callback.
68 /// @todo This should go away when we switch this to use compute threading.
70 SWR_CONTEXT
*pContext
,
// The queued work item is processed by ProcessSyncBE.
77 work
.pfnWork
= ProcessSyncBE
;
79 MacroTileMgr
*pTileMgr
= pDC
->pTileMgr
;
// A sync needs only a single work item; it is always queued at (0, 0).
80 pTileMgr
->enqueue(0, 0, &work
);
83 //////////////////////////////////////////////////////////////////////////
84 /// @brief FE handler for SwrClearRenderTarget. Copies the clear descriptor
/// into a work item handled by ProcessClearBE and enqueues that work item on
/// every macrotile inside the scissor-derived macrotile bounds.
85 /// @param pContext - pointer to SWR context.
86 /// @param pDC - pointer to draw context.
87 /// @param workerId - thread's worker id. Each thread has a unique id.
88 /// @param pUserData - Pointer to user data passed back to clear callback.
///                    It is cast to CLEAR_DESC* here.
89 /// @todo This should go away when we switch this to use compute threading.
91 SWR_CONTEXT
*pContext
,
96 CLEAR_DESC
*pClear
= (CLEAR_DESC
*)pUserData
;
97 MacroTileMgr
*pTileMgr
= pDC
->pTileMgr
;
99 const API_STATE
& state
= GetApiState(pDC
);
101 // queue a clear to each macro tile
102 // compute macro tile bounds for the current scissor/viewport
// Each bound is the fixed-point scissor edge divided by the fixed-point
// macrotile dimension, giving a macrotile index.
103 uint32_t macroTileLeft
= state
.scissorInFixedPoint
.left
/ KNOB_MACROTILE_X_DIM_FIXED
;
104 uint32_t macroTileRight
= state
.scissorInFixedPoint
.right
/ KNOB_MACROTILE_X_DIM_FIXED
;
105 uint32_t macroTileTop
= state
.scissorInFixedPoint
.top
/ KNOB_MACROTILE_Y_DIM_FIXED
;
106 uint32_t macroTileBottom
= state
.scissorInFixedPoint
.bottom
/ KNOB_MACROTILE_Y_DIM_FIXED
;
// The same work item (handler plus a copy of the clear descriptor) is
// reused for every enqueue below.
110 work
.pfnWork
= ProcessClearBE
;
111 work
.desc
.clear
= *pClear
;
// Both loops use <=, so the right and bottom macrotiles are included.
113 for (uint32_t y
= macroTileTop
; y
<= macroTileBottom
; ++y
)
115 for (uint32_t x
= macroTileLeft
; x
<= macroTileRight
; ++x
)
117 pTileMgr
->enqueue(x
, y
, &work
);
122 //////////////////////////////////////////////////////////////////////////
123 /// @brief FE handler for SwrStoreTiles. Copies the store descriptor into a
/// STORETILES work item handled by ProcessStoreTileBE and enqueues it on
/// every macrotile from (0, 0) up to the extent of viewport 0.
124 /// @param pContext - pointer to SWR context.
125 /// @param pDC - pointer to draw context.
126 /// @param workerId - thread's worker id. Each thread has a unique id.
127 /// @param pUserData - Pointer to user data passed back to callback.
///                    It is cast to STORE_TILES_DESC* here.
128 /// @todo This should go away when we switch this to use compute threading.
129 void ProcessStoreTiles(
130 SWR_CONTEXT
*pContext
,
135 RDTSC_START(FEProcessStoreTiles
);
136 STORE_TILES_DESC
*pStore
= (STORE_TILES_DESC
*)pUserData
;
137 MacroTileMgr
*pTileMgr
= pDC
->pTileMgr
;
139 const API_STATE
& state
= GetApiState(pDC
);
141 // queue a store to each macro tile
142 // compute macro tile bounds for the current render target
143 const uint32_t macroWidth
= KNOB_MACROTILE_X_DIM
;
144 const uint32_t macroHeight
= KNOB_MACROTILE_Y_DIM
;
// Tile counts are (viewport origin + viewport size) rounded up to a whole
// number of macrotiles; only viewport 0 is consulted.
146 uint32_t numMacroTilesX
= ((uint32_t)state
.vp
[0].width
+ (uint32_t)state
.vp
[0].x
+ (macroWidth
- 1)) / macroWidth
;
147 uint32_t numMacroTilesY
= ((uint32_t)state
.vp
[0].height
+ (uint32_t)state
.vp
[0].y
+ (macroHeight
- 1)) / macroHeight
;
// One work item is filled in once and enqueued for every tile below.
151 work
.type
= STORETILES
;
152 work
.pfnWork
= ProcessStoreTileBE
;
153 work
.desc
.storeTiles
= *pStore
;
// Iteration always starts at tile 0 in both dimensions, not at the
// viewport origin.
155 for (uint32_t x
= 0; x
< numMacroTilesX
; ++x
)
157 for (uint32_t y
= 0; y
< numMacroTilesY
; ++y
)
159 pTileMgr
->enqueue(x
, y
, &work
);
163 RDTSC_STOP(FEProcessStoreTiles
, 0, pDC
->drawId
);
166 //////////////////////////////////////////////////////////////////////////
167 /// @brief FE handler for SwrInvalidateTiles. Computes a range of macrotiles
/// from a rectangle and enqueues a DISCARDINVALIDATETILES work item, handled
/// by ProcessDiscardInvalidateTilesBE, on each macrotile in that range.
168 /// @param pContext - pointer to SWR context.
169 /// @param pDC - pointer to draw context.
170 /// @param workerId - thread's worker id. Each thread has a unique id.
171 /// @param pUserData - Pointer to user data passed back to callback.
///                    It is cast to DISCARD_INVALIDATE_TILES_DESC* here.
172 /// @todo This should go away when we switch this to use compute threading.
173 void ProcessDiscardInvalidateTiles(
174 SWR_CONTEXT
*pContext
,
179 RDTSC_START(FEProcessInvalidateTiles
);
180 DISCARD_INVALIDATE_TILES_DESC
*pInv
= (DISCARD_INVALIDATE_TILES_DESC
*)pUserData
;
181 MacroTileMgr
*pTileMgr
= pDC
->pTileMgr
;
// OR-ing the four edges tests whether any of them is non-zero.
// NOTE(review): the body of this branch is not visible here; presumably it
// takes the rect from the descriptor - confirm.
185 if (pInv
->rect
.top
| pInv
->rect
.bottom
| pInv
->rect
.right
| pInv
->rect
.left
)
192 // Use viewport dimensions
193 const API_STATE
& state
= GetApiState(pDC
);
// The rect spans viewport 0 from its origin to origin + size.
195 rect
.left
= (uint32_t)state
.vp
[0].x
;
196 rect
.right
= (uint32_t)(state
.vp
[0].x
+ state
.vp
[0].width
);
197 rect
.top
= (uint32_t)state
.vp
[0].y
;
198 rect
.bottom
= (uint32_t)(state
.vp
[0].y
+ state
.vp
[0].height
);
201 // queue a discard/invalidate to each macro tile
202 // compute macro tile bounds for the selected rect
203 uint32_t macroWidth
= KNOB_MACROTILE_X_DIM
;
204 uint32_t macroHeight
= KNOB_MACROTILE_Y_DIM
;
206 // Setup region assuming full tiles
// Start indices round up and end indices round down, so only macrotiles
// lying entirely inside the rect are selected.
207 uint32_t macroTileStartX
= (rect
.left
+ (macroWidth
- 1)) / macroWidth
;
208 uint32_t macroTileStartY
= (rect
.top
+ (macroHeight
- 1)) / macroHeight
;
210 uint32_t macroTileEndX
= rect
.right
/ macroWidth
;
211 uint32_t macroTileEndY
= rect
.bottom
/ macroHeight
;
213 if (pInv
->fullTilesOnly
== false)
215 // include partial tiles
// Rounding is reversed here: start rounds down and end rounds up, so
// macrotiles that only partly overlap the rect are selected too.
216 macroTileStartX
= rect
.left
/ macroWidth
;
217 macroTileStartY
= rect
.top
/ macroHeight
;
219 macroTileEndX
= (rect
.right
+ macroWidth
- 1) / macroWidth
;
220 macroTileEndY
= (rect
.bottom
+ macroHeight
- 1) / macroHeight
;
// The end indices are asserted against the hot tile limits and then clamped
// to them with std::min, so the clamp still applies if the assert does not
// stop execution.
223 SWR_ASSERT(macroTileEndX
<= KNOB_NUM_HOT_TILES_X
);
224 SWR_ASSERT(macroTileEndY
<= KNOB_NUM_HOT_TILES_Y
);
226 macroTileEndX
= std::min
<uint32_t>(macroTileEndX
, KNOB_NUM_HOT_TILES_X
);
227 macroTileEndY
= std::min
<uint32_t>(macroTileEndY
, KNOB_NUM_HOT_TILES_Y
);
// One work item carrying a copy of the descriptor is reused for all tiles.
231 work
.type
= DISCARDINVALIDATETILES
;
232 work
.pfnWork
= ProcessDiscardInvalidateTilesBE
;
233 work
.desc
.discardInvalidateTiles
= *pInv
;
// End indices are exclusive (strict <).
235 for (uint32_t x
= macroTileStartX
; x
< macroTileEndX
; ++x
)
237 for (uint32_t y
= macroTileStartY
; y
< macroTileEndY
; ++y
)
239 pTileMgr
->enqueue(x
, y
, &work
);
243 RDTSC_STOP(FEProcessInvalidateTiles
, 0, pDC
->drawId
);
246 //////////////////////////////////////////////////////////////////////////
247 /// @brief Computes the number of primitives given the number of verts.
/// List topologies divide the count by their verts per primitive. Strip, fan
/// and disc topologies return 0 when there are too few verts for one
/// primitive. Patch lists divide by (mode - TOP_PATCHLIST_BASE).
248 /// @param mode - primitive topology for draw operation.
249 /// @param numPrims - number of vertices or indices for draw (despite its
///                   name, this is the input count, not a primitive count).
250 /// @todo Frontend needs to be refactored. This will go in appropriate place then.
251 uint32_t GetNumPrims(
252 PRIMITIVE_TOPOLOGY mode
,
257 case TOP_POINT_LIST
: return numPrims
;
258 case TOP_TRIANGLE_LIST
: return numPrims
/ 3;
259 case TOP_TRIANGLE_STRIP
: return numPrims
< 3 ? 0 : numPrims
- 2;
260 case TOP_TRIANGLE_FAN
: return numPrims
< 3 ? 0 : numPrims
- 2;
261 case TOP_TRIANGLE_DISC
: return numPrims
< 2 ? 0 : numPrims
- 1;
262 case TOP_QUAD_LIST
: return numPrims
/ 4;
263 case TOP_QUAD_STRIP
: return numPrims
< 4 ? 0 : (numPrims
- 2) / 2;
264 case TOP_LINE_STRIP
: return numPrims
< 2 ? 0 : numPrims
- 1;
265 case TOP_LINE_LIST
: return numPrims
/ 2;
// A line loop yields as many primitives as it has verts; there is no
// minimum-count check here.
266 case TOP_LINE_LOOP
: return numPrims
;
267 case TOP_RECT_LIST
: return numPrims
/ 3;
268 case TOP_LINE_LIST_ADJ
: return numPrims
/ 4;
269 case TOP_LISTSTRIP_ADJ
: return numPrims
< 3 ? 0 : numPrims
- 3;
270 case TOP_TRI_LIST_ADJ
: return numPrims
/ 6;
271 case TOP_TRI_STRIP_ADJ
: return numPrims
< 4 ? 0 : (numPrims
/ 2) - 2;
// All patch list topologies share one formula: the divisor is the
// topology's distance from TOP_PATCHLIST_BASE (presumably the control
// point count encoded in the enum name - confirm against the enum).
273 case TOP_PATCHLIST_1
:
274 case TOP_PATCHLIST_2
:
275 case TOP_PATCHLIST_3
:
276 case TOP_PATCHLIST_4
:
277 case TOP_PATCHLIST_5
:
278 case TOP_PATCHLIST_6
:
279 case TOP_PATCHLIST_7
:
280 case TOP_PATCHLIST_8
:
281 case TOP_PATCHLIST_9
:
282 case TOP_PATCHLIST_10
:
283 case TOP_PATCHLIST_11
:
284 case TOP_PATCHLIST_12
:
285 case TOP_PATCHLIST_13
:
286 case TOP_PATCHLIST_14
:
287 case TOP_PATCHLIST_15
:
288 case TOP_PATCHLIST_16
:
289 case TOP_PATCHLIST_17
:
290 case TOP_PATCHLIST_18
:
291 case TOP_PATCHLIST_19
:
292 case TOP_PATCHLIST_20
:
293 case TOP_PATCHLIST_21
:
294 case TOP_PATCHLIST_22
:
295 case TOP_PATCHLIST_23
:
296 case TOP_PATCHLIST_24
:
297 case TOP_PATCHLIST_25
:
298 case TOP_PATCHLIST_26
:
299 case TOP_PATCHLIST_27
:
300 case TOP_PATCHLIST_28
:
301 case TOP_PATCHLIST_29
:
302 case TOP_PATCHLIST_30
:
303 case TOP_PATCHLIST_31
:
304 case TOP_PATCHLIST_32
:
305 return numPrims
/ (mode
- TOP_PATCHLIST_BASE
);
// The remaining topologies are not supported and fall through to the
// assert below.
308 case TOP_POINT_LIST_BF
:
309 case TOP_LINE_STRIP_CONT
:
310 case TOP_LINE_STRIP_BF
:
311 case TOP_LINE_STRIP_CONT_BF
:
312 case TOP_TRIANGLE_FAN_NOSTIPPLE
:
313 case TOP_TRI_STRIP_REVERSE
:
314 case TOP_PATCHLIST_BASE
:
316 SWR_ASSERT(false, "Unsupported topology: %d", mode
);
323 //////////////////////////////////////////////////////////////////////////
324 /// @brief Computes the number of verts given the number of primitives.
/// List topologies multiply the primitive count by their verts per
/// primitive. Strip, fan and disc topologies add their fixed extra verts
/// only when numPrims is non-zero, otherwise return 0. Patch lists multiply
/// by (mode - TOP_PATCHLIST_BASE).
325 /// @param mode - primitive topology for draw operation.
326 /// @param numPrims - number of primitives for draw.
327 uint32_t GetNumVerts(
328 PRIMITIVE_TOPOLOGY mode
,
333 case TOP_POINT_LIST
: return numPrims
;
334 case TOP_TRIANGLE_LIST
: return numPrims
* 3;
335 case TOP_TRIANGLE_STRIP
: return numPrims
? numPrims
+ 2 : 0;
336 case TOP_TRIANGLE_FAN
: return numPrims
? numPrims
+ 2 : 0;
337 case TOP_TRIANGLE_DISC
: return numPrims
? numPrims
+ 1 : 0;
338 case TOP_QUAD_LIST
: return numPrims
* 4;
339 case TOP_QUAD_STRIP
: return numPrims
? numPrims
* 2 + 2 : 0;
340 case TOP_LINE_STRIP
: return numPrims
? numPrims
+ 1 : 0;
341 case TOP_LINE_LIST
: return numPrims
* 2;
// A line loop uses one vert per primitive.
342 case TOP_LINE_LOOP
: return numPrims
;
343 case TOP_RECT_LIST
: return numPrims
* 3;
344 case TOP_LINE_LIST_ADJ
: return numPrims
* 4;
345 case TOP_LISTSTRIP_ADJ
: return numPrims
? numPrims
+ 3 : 0;
346 case TOP_TRI_LIST_ADJ
: return numPrims
* 6;
347 case TOP_TRI_STRIP_ADJ
: return numPrims
? (numPrims
+ 2) * 2 : 0;
// All patch list topologies share one formula: the multiplier is the
// topology's distance from TOP_PATCHLIST_BASE (presumably the control
// point count encoded in the enum name - confirm against the enum).
349 case TOP_PATCHLIST_1
:
350 case TOP_PATCHLIST_2
:
351 case TOP_PATCHLIST_3
:
352 case TOP_PATCHLIST_4
:
353 case TOP_PATCHLIST_5
:
354 case TOP_PATCHLIST_6
:
355 case TOP_PATCHLIST_7
:
356 case TOP_PATCHLIST_8
:
357 case TOP_PATCHLIST_9
:
358 case TOP_PATCHLIST_10
:
359 case TOP_PATCHLIST_11
:
360 case TOP_PATCHLIST_12
:
361 case TOP_PATCHLIST_13
:
362 case TOP_PATCHLIST_14
:
363 case TOP_PATCHLIST_15
:
364 case TOP_PATCHLIST_16
:
365 case TOP_PATCHLIST_17
:
366 case TOP_PATCHLIST_18
:
367 case TOP_PATCHLIST_19
:
368 case TOP_PATCHLIST_20
:
369 case TOP_PATCHLIST_21
:
370 case TOP_PATCHLIST_22
:
371 case TOP_PATCHLIST_23
:
372 case TOP_PATCHLIST_24
:
373 case TOP_PATCHLIST_25
:
374 case TOP_PATCHLIST_26
:
375 case TOP_PATCHLIST_27
:
376 case TOP_PATCHLIST_28
:
377 case TOP_PATCHLIST_29
:
378 case TOP_PATCHLIST_30
:
379 case TOP_PATCHLIST_31
:
380 case TOP_PATCHLIST_32
:
381 return numPrims
* (mode
- TOP_PATCHLIST_BASE
);
// The remaining topologies are not supported and fall through to the
// assert below.
384 case TOP_POINT_LIST_BF
:
385 case TOP_LINE_STRIP_CONT
:
386 case TOP_LINE_STRIP_BF
:
387 case TOP_LINE_STRIP_CONT_BF
:
388 case TOP_TRIANGLE_FAN_NOSTIPPLE
:
389 case TOP_TRI_STRIP_REVERSE
:
390 case TOP_PATCHLIST_BASE
:
392 SWR_ASSERT(false, "Unsupported topology: %d", mode
);
399 //////////////////////////////////////////////////////////////////////////
400 /// @brief Return number of verts per primitive.
/// The count starts at 0 and is chosen by topology. Patch lists use
/// (topology - TOP_PATCHLIST_BASE). A second group of cases assigns 4 verts
/// to the line adjacency topologies and 6 to the triangle adjacency ones.
401 /// @param topology - topology
402 /// @param includeAdjVerts - include adjacent verts in primitive vertices
403 INLINE
uint32_t NumVertsPerPrim(PRIMITIVE_TOPOLOGY topology
, bool includeAdjVerts
)
405 uint32_t numVerts
= 0;
// NOTE(review): the statements that assign numVerts for the point, line and
// triangle groups below are not visible here; only the case labels are.
409 case TOP_POINT_LIST_BF
:
414 case TOP_LINE_LIST_ADJ
:
416 case TOP_LINE_STRIP_CONT
:
417 case TOP_LINE_STRIP_BF
:
418 case TOP_LISTSTRIP_ADJ
:
421 case TOP_TRIANGLE_LIST
:
422 case TOP_TRIANGLE_STRIP
:
423 case TOP_TRIANGLE_FAN
:
424 case TOP_TRI_LIST_ADJ
:
425 case TOP_TRI_STRIP_ADJ
:
426 case TOP_TRI_STRIP_REVERSE
:
// Patch lists: the vert count is the topology's distance from
// TOP_PATCHLIST_BASE.
434 case TOP_PATCHLIST_1
:
435 case TOP_PATCHLIST_2
:
436 case TOP_PATCHLIST_3
:
437 case TOP_PATCHLIST_4
:
438 case TOP_PATCHLIST_5
:
439 case TOP_PATCHLIST_6
:
440 case TOP_PATCHLIST_7
:
441 case TOP_PATCHLIST_8
:
442 case TOP_PATCHLIST_9
:
443 case TOP_PATCHLIST_10
:
444 case TOP_PATCHLIST_11
:
445 case TOP_PATCHLIST_12
:
446 case TOP_PATCHLIST_13
:
447 case TOP_PATCHLIST_14
:
448 case TOP_PATCHLIST_15
:
449 case TOP_PATCHLIST_16
:
450 case TOP_PATCHLIST_17
:
451 case TOP_PATCHLIST_18
:
452 case TOP_PATCHLIST_19
:
453 case TOP_PATCHLIST_20
:
454 case TOP_PATCHLIST_21
:
455 case TOP_PATCHLIST_22
:
456 case TOP_PATCHLIST_23
:
457 case TOP_PATCHLIST_24
:
458 case TOP_PATCHLIST_25
:
459 case TOP_PATCHLIST_26
:
460 case TOP_PATCHLIST_27
:
461 case TOP_PATCHLIST_28
:
462 case TOP_PATCHLIST_29
:
463 case TOP_PATCHLIST_30
:
464 case TOP_PATCHLIST_31
:
465 case TOP_PATCHLIST_32
:
466 numVerts
= topology
- TOP_PATCHLIST_BASE
;
469 SWR_ASSERT(false, "Unsupported topology: %d", topology
);
// Adjacency topologies get their full vert counts here (4 for lines, 6 for
// triangles). NOTE(review): presumably this group only runs when
// includeAdjVerts is true - the enclosing condition is not visible; confirm.
477 case TOP_LISTSTRIP_ADJ
:
478 case TOP_LINE_LIST_ADJ
: numVerts
= 4; break;
479 case TOP_TRI_STRIP_ADJ
:
480 case TOP_TRI_LIST_ADJ
: numVerts
= 6; break;
488 //////////////////////////////////////////////////////////////////////////
489 /// @brief Generate mask from remaining work.
490 /// @param numWorkItems - Number of items being worked on by a SIMD.
491 static INLINE simdscalari
GenerateMask(uint32_t numItemsRemaining
)
493 uint32_t numActive
= (numItemsRemaining
>= KNOB_SIMD_WIDTH
) ? KNOB_SIMD_WIDTH
: numItemsRemaining
;
494 uint32_t mask
= (numActive
> 0) ? ((1 << numActive
) - 1) : 0;
495 return _simd_castps_si(vMask(mask
));
498 //////////////////////////////////////////////////////////////////////////
499 /// @brief StreamOut - Streams vertex data out to SO buffers.
500 /// Generally, we are only streaming out a SIMDs worth of triangles.
/// For each primitive reported by the PA, every attribute selected by the
/// stream's mask is assembled and copied into pPrimData, then the stream's
/// SO function is called. Afterwards the SO write offsets are published.
501 /// @param pDC - pointer to draw context.
502 /// @param workerId - thread's worker id. Each thread has a unique id.
503 /// @param numPrims - not a parameter: the prim count is read from pa.NumPrims().
/// @param streamIndex - selects the stream mask, the SO function and the
///                      stat slots that are updated.
504 static void StreamOut(
509 uint32_t streamIndex
)
511 RDTSC_START(FEStreamout
);
513 SWR_CONTEXT
* pContext
= pDC
->pContext
;
515 const API_STATE
& state
= GetApiState(pDC
);
516 const SWR_STREAMOUT_STATE
&soState
= state
.soState
;
// Adjacent verts are excluded (second argument is false).
518 uint32_t soVertsPerPrim
= NumVertsPerPrim(pa
.binTopology
, false);
520 // The pPrimData buffer is sparse in that we allocate memory for all 32 attributes for each vertex.
// Stride between consecutive vertices in pPrimData, in dwords: every
// attribute slot holds 4 floats.
521 uint32_t primDataDwordVertexStride
= (KNOB_NUM_ATTRIBUTES
* sizeof(float) * 4) / sizeof(uint32_t);
523 SWR_STREAMOUT_CONTEXT soContext
= { 0 };
525 // Setup buffer state pointers.
526 for (uint32_t i
= 0; i
< 4; ++i
)
528 soContext
.pBuffer
[i
] = &state
.soBuffer
[i
];
531 uint32_t numPrims
= pa
.NumPrims();
532 for (uint32_t primIndex
= 0; primIndex
< numPrims
; ++primIndex
)
// A fresh copy of the stream mask is consumed bit by bit for each prim.
535 uint32_t soMask
= soState
.streamMasks
[streamIndex
];
537 // Write all entries into primitive data buffer for SOS.
538 while (_BitScanForward(&slot
, soMask
))
540 __m128 attrib
[MAX_NUM_VERTS_PER_PRIM
]; // prim attribs (always 4 wide)
541 uint32_t paSlot
= slot
+ VERTEX_ATTRIB_START_SLOT
;
542 pa
.AssembleSingle(paSlot
, primIndex
, attrib
);
544 // Attribute offset is relative offset from start of vertex.
545 // Note that attributes start at VERTEX_ATTRIB_START_SLOT in the PA buffer
546 // (see paSlot above), while prim data is written starting at slot 0, so 'slot' is used directly.
547 // Also note: GL works slightly differently, and needs slot 0
548 uint32_t primDataAttribOffset
= slot
* sizeof(float) * 4 / sizeof(uint32_t);
550 // Store each vertex's attrib at appropriate locations in pPrimData buffer.
551 for (uint32_t v
= 0; v
< soVertsPerPrim
; ++v
)
553 uint32_t* pPrimDataAttrib
= pPrimData
+ primDataAttribOffset
+ (v
* primDataDwordVertexStride
);
// _mm_store_ps is the aligned store; NOTE(review): this assumes the
// destination is 16-byte aligned - confirm against the allocation.
555 _mm_store_ps((float*)pPrimDataAttrib
, attrib
[v
]);
// Clear the bit just handled so the scan moves on to the next set bit.
557 soMask
&= ~(1 << slot
);
560 // Update pPrimData pointer
561 soContext
.pPrimData
= pPrimData
;
564 SWR_ASSERT(state
.pfnSoFunc
[streamIndex
] != nullptr, "Trying to execute uninitialized streamout jit function.");
565 state
.pfnSoFunc
[streamIndex
](soContext
);
568 // Update SO write offset. The driver provides memory for the update.
569 for (uint32_t i
= 0; i
< 4; ++i
)
571 if (state
.soBuffer
[i
].pWriteOffset
)
// streamOffset is scaled by sizeof(uint32_t) (presumably dwords to bytes
// - confirm the unit of streamOffset).
573 *state
.soBuffer
[i
].pWriteOffset
= soContext
.pBuffer
[i
]->streamOffset
* sizeof(uint32_t);
576 if (state
.soBuffer
[i
].soWriteEnable
)
// The same scaled offset is recorded in the draw's dynamic state and
// flagged dirty.
578 pDC
->dynState
.SoWriteOffset
[i
] = soContext
.pBuffer
[i
]->streamOffset
* sizeof(uint32_t);
579 pDC
->dynState
.SoWriteOffsetDirty
[i
] = true;
583 UPDATE_STAT(SoPrimStorageNeeded
[streamIndex
], soContext
.numPrimStorageNeeded
);
584 UPDATE_STAT(SoNumPrimsWritten
[streamIndex
], soContext
.numPrimsWritten
);
586 RDTSC_STOP(FEStreamout
, 1, 0);
589 //////////////////////////////////////////////////////////////////////////
590 /// @brief Computes number of invocations. The current index represents
591 /// the start of the SIMD. The max index represents how much work
592 /// items are remaining. If there is less then a SIMD's left of work
593 /// then return the remaining amount of work.
594 /// @param curIndex - The start index for the SIMD.
595 /// @param maxIndex - The last index for all work items.
596 static INLINE
uint32_t GetNumInvocations(
600 uint32_t remainder
= (maxIndex
- curIndex
);
601 return (remainder
>= KNOB_SIMD_WIDTH
) ? KNOB_SIMD_WIDTH
: remainder
;
604 //////////////////////////////////////////////////////////////////////////
605 /// @brief Converts a streamId buffer to a cut buffer for the given stream id.
606 /// The geometry shader will loop over each active streamout buffer, assembling
607 /// primitives for the downstream stages. When multistream output is enabled,
608 /// the generated stream ID buffer from the GS needs to be converted to a cut
609 /// buffer for the primitive assembler.
610 /// @param stream - stream id to generate the cut buffer for
611 /// @param pStreamIdBase - pointer to the stream ID buffer
612 /// @param numEmittedVerts - Number of total verts emitted by the GS
613 /// @param pCutBuffer - output buffer to write cuts to
614 void ProcessStreamIdBuffer(uint32_t stream
, uint8_t* pStreamIdBase
, uint32_t numEmittedVerts
, uint8_t *pCutBuffer
)
616 SWR_ASSERT(stream
< MAX_SO_STREAMS
);
618 uint32_t numInputBytes
= (numEmittedVerts
* 2 + 7) / 8;
619 uint32_t numOutputBytes
= std::max(numInputBytes
/ 2, 1U);
621 for (uint32_t b
= 0; b
< numOutputBytes
; ++b
)
623 uint8_t curInputByte
= pStreamIdBase
[2*b
];
625 for (uint32_t i
= 0; i
< 4; ++i
)
627 if ((curInputByte
& 0x3) != stream
)
634 curInputByte
= pStreamIdBase
[2 * b
+ 1];
635 for (uint32_t i
= 0; i
< 4; ++i
)
637 if ((curInputByte
& 0x3) != stream
)
639 outByte
|= (1 << (i
+ 4));
644 *pCutBuffer
++ = outByte
;
// GS context shared by GeometryShaderStage calls; declared with the THREAD
// qualifier (presumably thread-local storage - confirm the macro definition).
648 THREAD SWR_GS_CONTEXT tlsGsContext
;
650 //////////////////////////////////////////////////////////////////////////
651 /// @brief Implements GS stage.
/// Assembles the input primitive's attributes into tlsGsContext, runs the GS
/// function once per instance, then walks the GS output per input prim,
/// instance and stream, assembling the emitted primitives and passing them
/// to stream-out and/or the clipper.
652 /// @param pDC - pointer to draw context.
653 /// @param workerId - thread's worker id. Each thread has a unique id.
654 /// @param pa - The primitive assembly object.
655 /// @param pGsOut - output stream for GS
/// @param pStreamCutBuffer - scratch cut buffer filled by
///        ProcessStreamIdBuffer on the multi-stream path.
/// @param pSoPrimData - buffer handed to StreamOut.
657 typename HasStreamOutT
,
659 static void GeometryShaderStage(
665 void* pStreamCutBuffer
,
666 uint32_t* pSoPrimData
,
669 RDTSC_START(FEGeometryShader
);
671 SWR_CONTEXT
* pContext
= pDC
->pContext
;
673 const API_STATE
& state
= GetApiState(pDC
);
674 const SWR_GS_STATE
* pState
= &state
.gsState
;
676 SWR_ASSERT(pGsOut
!= nullptr, "GS output buffer should be initialized");
677 SWR_ASSERT(pCutBuffer
!= nullptr, "GS output cut buffer should be initialized");
// Point the GS context at the output buffers and the incoming primitive id.
679 tlsGsContext
.pStream
= (uint8_t*)pGsOut
;
680 tlsGsContext
.pCutOrStreamIdBuffer
= (uint8_t*)pCutBuffer
;
681 tlsGsContext
.PrimitiveID
= primID
;
// Adjacent verts are included (second argument is true).
683 uint32_t numVertsPerPrim
= NumVertsPerPrim(pa
.binTopology
, true);
684 simdvector attrib
[MAX_ATTRIBUTES
];
686 // assemble all attributes for the input primitive
687 for (uint32_t slot
= 0; slot
< pState
->numInputAttribs
; ++slot
)
689 uint32_t attribSlot
= VERTEX_ATTRIB_START_SLOT
+ slot
;
690 pa
.Assemble(attribSlot
, attrib
);
692 for (uint32_t i
= 0; i
< numVertsPerPrim
; ++i
)
694 tlsGsContext
.vert
[i
].attrib
[attribSlot
] = attrib
[i
];
// The position slot is assembled separately from the attribute slots above.
699 pa
.Assemble(VERTEX_POSITION_SLOT
, attrib
);
700 for (uint32_t i
= 0; i
< numVertsPerPrim
; ++i
)
702 tlsGsContext
.vert
[i
].attrib
[VERTEX_POSITION_SLOT
] = attrib
[i
];
// Output layout, in bytes: one simdvertex per SIMD batch of emitted verts,
// maxNumVerts rounded up to whole batches per input prim, and
// KNOB_SIMD_WIDTH input prims per instance.
705 const uint32_t vertexStride
= sizeof(simdvertex
);
706 const uint32_t numSimdBatches
= (state
.gsState
.maxNumVerts
+ KNOB_SIMD_WIDTH
- 1) / KNOB_SIMD_WIDTH
;
707 const uint32_t inputPrimStride
= numSimdBatches
* vertexStride
;
708 const uint32_t instanceStride
= inputPrimStride
* KNOB_SIMD_WIDTH
;
709 uint32_t cutPrimStride
;
710 uint32_t cutInstanceStride
;
712 if (pState
->isSingleStream
)
// Single stream: a cut buffer with 1 bit per vertex, rounded up to bytes.
714 cutPrimStride
= (state
.gsState
.maxNumVerts
+ 7) / 8;
715 cutInstanceStride
= cutPrimStride
* KNOB_SIMD_WIDTH
;
// Otherwise: a stream id buffer with 2 bits per vertex, aligned up to 4.
// NOTE(review): maxNumVerts * 2 / 8 rounds down before AlignUp, unlike the
// single-stream stride above - confirm this matches what the GS writes.
719 cutPrimStride
= AlignUp(state
.gsState
.maxNumVerts
* 2 / 8, 4);
720 cutInstanceStride
= cutPrimStride
* KNOB_SIMD_WIDTH
;
723 // record valid prims from the frontend to avoid over binning the newly generated
725 uint32_t numInputPrims
= pa
.NumPrims();
727 for (uint32_t instance
= 0; instance
< pState
->instanceCount
; ++instance
)
729 tlsGsContext
.InstanceID
= instance
;
730 tlsGsContext
.mask
= GenerateMask(numInputPrims
);
732 // execute the geometry shader
733 state
.pfnGsFunc(GetPrivateState(pDC
), &tlsGsContext
);
// Advance the output pointers so the next instance writes its own region.
735 tlsGsContext
.pStream
+= instanceStride
;
736 tlsGsContext
.pCutOrStreamIdBuffer
+= cutInstanceStride
;
739 // set up new binner and state for the GS output topology
740 PFN_PROCESS_PRIMS pfnClipFunc
= nullptr;
// Only triangle strip, line strip and point list outputs have a clip
// function; anything else asserts.
743 switch (pState
->outputTopology
)
745 case TOP_TRIANGLE_STRIP
: pfnClipFunc
= ClipTriangles
; break;
746 case TOP_LINE_STRIP
: pfnClipFunc
= ClipLines
; break;
747 case TOP_POINT_LIST
: pfnClipFunc
= ClipPoints
; break;
748 default: SWR_ASSERT(false, "Unexpected GS output topology: %d", pState
->outputTopology
);
752 // foreach input prim:
753 // - setup a new PA based on the emitted verts for that prim
754 // - loop over the new verts, calling PA to assemble each prim
// The SIMD vertex-count and primitive-id values are read back per input
// prim through uint32_t pointers.
755 uint32_t* pVertexCount
= (uint32_t*)&tlsGsContext
.vertexCount
;
756 uint32_t* pPrimitiveId
= (uint32_t*)&primID
;
758 uint32_t totalPrimsGenerated
= 0;
759 for (uint32_t inputPrim
= 0; inputPrim
< numInputPrims
; ++inputPrim
)
761 uint8_t* pInstanceBase
= (uint8_t*)pGsOut
+ inputPrim
* inputPrimStride
;
762 uint8_t* pCutBufferBase
= (uint8_t*)pCutBuffer
+ inputPrim
* cutPrimStride
;
763 for (uint32_t instance
= 0; instance
< pState
->instanceCount
; ++instance
)
// NOTE(review): the count is indexed by inputPrim only, so every instance
// uses the value left in tlsGsContext.vertexCount by the last pfnGsFunc
// call above - confirm this is intended.
765 uint32_t numEmittedVerts
= pVertexCount
[inputPrim
];
766 if (numEmittedVerts
== 0)
771 uint8_t* pBase
= pInstanceBase
+ instance
* instanceStride
;
772 uint8_t* pCutBase
= pCutBufferBase
+ instance
* cutInstanceStride
;
774 uint32_t numAttribs
= state
.feNumAttributes
;
776 for (uint32_t stream
= 0; stream
< MAX_SO_STREAMS
; ++stream
)
778 bool processCutVerts
= false;
// NOTE: this local shadows the function-level pCutBuffer used above.
780 uint8_t* pCutBuffer
= pCutBase
;
782 // assign default stream ID, only relevant when GS is outputting a single stream
783 uint32_t streamID
= 0;
784 if (pState
->isSingleStream
)
// Single-stream GS: only the configured stream is processed, using the cut
// buffer as written by the GS.
786 processCutVerts
= true;
787 streamID
= pState
->singleStreamID
;
788 if (streamID
!= stream
) continue;
792 // early exit if this stream is not enabled for streamout
793 if (HasStreamOutT::value
&& !state
.soState
.streamEnable
[stream
])
798 // multi-stream output, need to translate StreamID buffer to a cut buffer
799 ProcessStreamIdBuffer(stream
, pCutBase
, numEmittedVerts
, (uint8_t*)pStreamCutBuffer
);
800 pCutBuffer
= (uint8_t*)pStreamCutBuffer
;
801 processCutVerts
= false;
// A dedicated cut-aware PA walks this prim's emitted verts.
804 PA_STATE_CUT
gsPa(pDC
, pBase
, numEmittedVerts
, pCutBuffer
, numEmittedVerts
, numAttribs
, pState
->outputTopology
, processCutVerts
);
806 while (gsPa
.GetNextStreamOutput())
810 bool assemble
= gsPa
.Assemble(VERTEX_POSITION_SLOT
, attrib
);
814 totalPrimsGenerated
+= gsPa
.NumPrims();
816 if (HasStreamOutT::value
)
818 StreamOut(pDC
, gsPa
, workerId
, pSoPrimData
, stream
);
// Only the stream selected by streamToRasterizer goes on to the clipper.
821 if (HasRastT::value
&& state
.soState
.streamToRasterizer
== stream
)
824 // pull primitiveID from the GS output if available
825 if (state
.gsState
.emitsPrimitiveID
)
827 simdvector primIdAttrib
[3];
828 gsPa
.Assemble(VERTEX_PRIMID_SLOT
, primIdAttrib
);
829 vPrimId
= _simd_castps_si(primIdAttrib
[0].x
);
// Otherwise the input prim's id is set in every lane.
833 vPrimId
= _simd_set1_epi32(pPrimitiveId
[inputPrim
]);
836 pfnClipFunc(pDC
, gsPa
, workerId
, attrib
, GenMask(gsPa
.NumPrims()), vPrimId
);
839 } while (gsPa
.NextPrim());
845 // update GS pipeline stats
846 UPDATE_STAT(GsInvocations
, numInputPrims
* pState
->instanceCount
);
847 UPDATE_STAT(GsPrimitives
, totalPrimsGenerated
);
849 RDTSC_STOP(FEGeometryShader
, 1, 0);
852 //////////////////////////////////////////////////////////////////////////
853 /// @brief Allocate GS buffers
/// All buffers come from the draw context's arena and are aligned to
/// KNOB_SIMD_WIDTH * sizeof(float) bytes.
854 /// @param pDC - pointer to draw context.
855 /// @param state - API state
856 /// @param ppGsOut - pointer to GS output buffer allocation
857 /// @param ppCutBuffer - pointer to GS output cut buffer allocation
///        (sized as a stream id buffer when the GS is not single-stream)
/// @param ppStreamCutBuffer - set to nullptr for a single-stream GS,
///        otherwise receives a cut buffer allocation
858 static INLINE
void AllocateGsBuffers(DRAW_CONTEXT
* pDC
, const API_STATE
& state
, void** ppGsOut
, void** ppCutBuffer
,
859 void **ppStreamCutBuffer
)
861 auto pArena
= pDC
->pArena
;
862 SWR_ASSERT(pArena
!= nullptr);
863 SWR_ASSERT(state
.gsState
.gsEnable
);
864 // allocate arena space to hold GS output verts
865 // @todo pack attribs
866 // @todo support multiple streams
// Size = instances * SIMD batches per prim * sizeof(simdvertex) * KNOB_SIMD_WIDTH.
867 const uint32_t vertexStride
= sizeof(simdvertex
);
868 const uint32_t numSimdBatches
= (state
.gsState
.maxNumVerts
+ KNOB_SIMD_WIDTH
- 1) / KNOB_SIMD_WIDTH
;
869 uint32_t size
= state
.gsState
.instanceCount
* numSimdBatches
* vertexStride
* KNOB_SIMD_WIDTH
;
870 *ppGsOut
= pArena
->AllocAligned(size
, KNOB_SIMD_WIDTH
* sizeof(float));
// Cut buffer: 1 bit per vertex rounded up to bytes. Stream id buffer:
// 2 bits per vertex, aligned up to 4 bytes. Both are scaled by the
// instance count and KNOB_SIMD_WIDTH.
872 const uint32_t cutPrimStride
= (state
.gsState
.maxNumVerts
+ 7) / 8;
873 const uint32_t streamIdPrimStride
= AlignUp(state
.gsState
.maxNumVerts
* 2 / 8, 4);
874 const uint32_t cutBufferSize
= cutPrimStride
* state
.gsState
.instanceCount
* KNOB_SIMD_WIDTH
;
875 const uint32_t streamIdSize
= streamIdPrimStride
* state
.gsState
.instanceCount
* KNOB_SIMD_WIDTH
;
877 // allocate arena space to hold cut or streamid buffer, which is essentially a bitfield sized to the
878 // maximum vertex output as defined by the GS state, per SIMD lane, per GS instance
880 // allocate space for temporary per-stream cut buffer if multi-stream is enabled
881 if (state
.gsState
.isSingleStream
)
// Single stream: only the cut buffer is needed.
883 *ppCutBuffer
= pArena
->AllocAligned(cutBufferSize
, KNOB_SIMD_WIDTH
* sizeof(float));
884 *ppStreamCutBuffer
= nullptr;
// Otherwise: a stream id buffer plus a separate cut buffer.
888 *ppCutBuffer
= pArena
->AllocAligned(streamIdSize
, KNOB_SIMD_WIDTH
* sizeof(float));
889 *ppStreamCutBuffer
= pArena
->AllocAligned(cutBufferSize
, KNOB_SIMD_WIDTH
* sizeof(float));
894 //////////////////////////////////////////////////////////////////////////
895 /// @brief Contains all data generated by the HS and passed to the
896 /// tessellator and DS.
/// TessellationStages() also keeps the DS output buffer here so it can be
/// reused from one patch to the next instead of being reallocated each time.
897 struct TessellationThreadLocalData
// context handed to state.pfnHsFunc in TessellationStages()
899 SWR_HS_CONTEXT hsContext
;
// HS control point output storage; hsContext.pCPout is pointed at this array
// and it is indexed per input primitive of the current SIMD
900 ScalarPatch patchData
[KNOB_SIMD_WIDTH
];
// DS output buffer; grown on demand in TessellationStages()
904 simdscalar
* pDSOutput
;
// capacity of pDSOutput, counted in simdvector-sized units
905 size_t numDSOutputVectors
;
// Tessellation scratch data used by TessellationStages(). Allocated lazily by
// AllocateTessellationData(); nullptr until then.
// NOTE(review): THREAD is presumably a thread-local storage qualifier (the TODO in
// AllocateTessellationData() talks about thread local storage) - confirm.
908 THREAD TessellationThreadLocalData
* gt_pTessellationThreadData
= nullptr;
910 //////////////////////////////////////////////////////////////////////////
911 /// @brief Allocate tessellation data for this worker thread.
913 static void AllocateTessellationData(SWR_CONTEXT
* pContext
)
915 /// @TODO - Don't use thread local storage. Use Worker local storage instead.
916 if (gt_pTessellationThreadData
== nullptr)
918 gt_pTessellationThreadData
= (TessellationThreadLocalData
*)
919 AlignedMalloc(sizeof(TessellationThreadLocalData
), 64);
920 memset(gt_pTessellationThreadData
, 0, sizeof(*gt_pTessellationThreadData
));
924 //////////////////////////////////////////////////////////////////////////
925 /// @brief Implements Tessellation Stages.
/// Assembles the HS input attributes from the PA, runs the hull shader once
/// for the SIMD of input primitives, then for each input primitive runs the
/// tessellator and the domain shader and feeds the resulting primitives to
/// the geometry shader stage, or to stream-out and/or the clipper.
926 /// @param pDC - pointer to draw context.
927 /// @param workerId - thread's worker id. Every thread has a unique id.
928 /// @param pa - The primitive assembly object.
929 /// @param pGsOut - output stream for GS
/// @param pCutStreamBuffer - forwarded unchanged to GeometryShaderStage
/// @param pSoPrimData - forwarded to GeometryShaderStage / StreamOut
/// primID (used below) holds one primitive ID per input primitive; element p
/// is handed to the domain shader for primitive p.
931 typename HasGeometryShaderT
,
932 typename HasStreamOutT
,
934 static void TessellationStages(
940 void* pCutStreamBuffer
,
941 uint32_t* pSoPrimData
,
944 const API_STATE
& state
= GetApiState(pDC
);
945 const SWR_TS_STATE
& tsState
= state
.tsState
;
946 SWR_CONTEXT
*pContext
= pDC
->pContext
; // Needed for UPDATE_STATS macro
// the per-thread data must have been set up by AllocateTessellationData()
948 SWR_ASSERT(gt_pTessellationThreadData
);
// Try to initialize the tessellator context in the cached memory first.
// NOTE(review): the nullptr path below allocates tsCtxSize bytes and retries,
// so TSInitCtx presumably reports the required size through tsCtxSize - confirm.
950 HANDLE tsCtx
= TSInitCtx(
952 tsState
.partitioning
,
953 tsState
.tsOutputTopology
,
954 gt_pTessellationThreadData
->pTxCtx
,
955 gt_pTessellationThreadData
->tsCtxSize
);
956 if (tsCtx
== nullptr)
958 gt_pTessellationThreadData
->pTxCtx
= AlignedMalloc(gt_pTessellationThreadData
->tsCtxSize
, 64);
961 tsState
.partitioning
,
962 tsState
.tsOutputTopology
,
963 gt_pTessellationThreadData
->pTxCtx
,
964 gt_pTessellationThreadData
->tsCtxSize
);
// clip function is selected from the post-DS topology
968 PFN_PROCESS_PRIMS pfnClipFunc
= nullptr;
971 switch (tsState
.postDSTopology
)
973 case TOP_TRIANGLE_LIST
: pfnClipFunc
= ClipTriangles
; break;
974 case TOP_LINE_LIST
: pfnClipFunc
= ClipLines
; break;
975 case TOP_POINT_LIST
: pfnClipFunc
= ClipPoints
; break;
976 default: SWR_ASSERT(false, "Unexpected DS output topology: %d", tsState
.postDSTopology
);
// HS writes its control point output into the per-thread patchData array
980 SWR_HS_CONTEXT
& hsContext
= gt_pTessellationThreadData
->hsContext
;
981 hsContext
.pCPout
= gt_pTessellationThreadData
->patchData
;
982 hsContext
.PrimitiveID
= primID
;
984 uint32_t numVertsPerPrim
= NumVertsPerPrim(pa
.binTopology
, false);
985 // Max storage for one attribute for an entire simdprimitive
986 simdvector simdattrib
[MAX_NUM_VERTS_PER_PRIM
];
988 // assemble all attributes for the input primitives
989 for (uint32_t slot
= 0; slot
< tsState
.numHsInputAttribs
; ++slot
)
991 uint32_t attribSlot
= VERTEX_ATTRIB_START_SLOT
+ slot
;
992 pa
.Assemble(attribSlot
, simdattrib
);
// copy this attribute for every vertex of the primitive into the HS input
994 for (uint32_t i
= 0; i
< numVertsPerPrim
; ++i
)
996 hsContext
.vert
[i
].attrib
[attribSlot
] = simdattrib
[i
];
// fill the HS output with a recognizable 0x90 byte pattern
// NOTE(review): looks like a debug aid; any enclosing #if is not visible here - confirm
1001 memset(hsContext
.pCPout
, 0x90, sizeof(ScalarPatch
) * KNOB_SIMD_WIDTH
);
1004 uint32_t numPrims
= pa
.NumPrims();
1005 hsContext
.mask
= GenerateMask(numPrims
);
// run the hull shader once for the whole SIMD of input primitives
1008 RDTSC_START(FEHullShader
);
1009 state
.pfnHsFunc(GetPrivateState(pDC
), &hsContext
);
1010 RDTSC_STOP(FEHullShader
, 0, 0);
1012 UPDATE_STAT(HsInvocations
, numPrims
);
// view the SIMD primitive ID register as an array of scalars
1014 const uint32_t* pPrimId
= (const uint32_t*)&primID
;
// tessellator and DS run one input primitive at a time
1016 for (uint32_t p
= 0; p
< numPrims
; ++p
)
1019 SWR_TS_TESSELLATED_DATA tsData
= { 0 };
1020 RDTSC_START(FETessellation
);
1021 TSTessellate(tsCtx
, hsContext
.pCPout
[p
].tessFactors
, tsData
);
1022 RDTSC_STOP(FETessellation
, 0, 0);
1024 if (tsData
.NumPrimitives
== 0)
1028 SWR_ASSERT(tsData
.NumDomainPoints
);
1030 // Allocate DS Output memory
// one DS invocation per KNOB_SIMD_WIDTH domain points, rounded up
1031 uint32_t requiredDSVectorInvocations
= AlignUp(tsData
.NumDomainPoints
, KNOB_SIMD_WIDTH
) / KNOB_SIMD_WIDTH
;
1032 size_t requiredDSOutputVectors
= requiredDSVectorInvocations
* tsState
.numDsOutputAttribs
;
1033 size_t requiredAllocSize
= sizeof(simdvector
) * requiredDSOutputVectors
;
// grow the cached buffer only when this patch needs more than is already there;
// the old contents are not preserved
1034 if (requiredDSOutputVectors
> gt_pTessellationThreadData
->numDSOutputVectors
)
1036 AlignedFree(gt_pTessellationThreadData
->pDSOutput
);
1037 gt_pTessellationThreadData
->pDSOutput
= (simdscalar
*)AlignedMalloc(requiredAllocSize
, 64);
1038 gt_pTessellationThreadData
->numDSOutputVectors
= requiredDSOutputVectors
;
1040 SWR_ASSERT(gt_pTessellationThreadData
->pDSOutput
);
1041 SWR_ASSERT(gt_pTessellationThreadData
->numDSOutputVectors
>= requiredDSOutputVectors
);
// same 0x90 fill pattern as for the HS output above
1044 memset(gt_pTessellationThreadData
->pDSOutput
, 0x90, requiredAllocSize
);
1047 // Run Domain Shader
1048 SWR_DS_CONTEXT dsContext
;
1049 dsContext
.PrimitiveID
= pPrimId
[p
];
1050 dsContext
.pCpIn
= &hsContext
.pCPout
[p
];
1051 dsContext
.pDomainU
= (simdscalar
*)tsData
.pDomainPointsU
;
1052 dsContext
.pDomainV
= (simdscalar
*)tsData
.pDomainPointsV
;
1053 dsContext
.pOutputData
= gt_pTessellationThreadData
->pDSOutput
;
1054 dsContext
.vectorStride
= requiredDSVectorInvocations
;
// number of domain points already covered by earlier DS invocations
1056 uint32_t dsInvocations
= 0;
1058 for (dsContext
.vectorOffset
= 0; dsContext
.vectorOffset
< requiredDSVectorInvocations
; ++dsContext
.vectorOffset
)
// mask is built from the number of domain points still to be shaded
1060 dsContext
.mask
= GenerateMask(tsData
.NumDomainPoints
- dsInvocations
);
1062 RDTSC_START(FEDomainShader
);
1063 state
.pfnDsFunc(GetPrivateState(pDC
), &dsContext
);
1064 RDTSC_STOP(FEDomainShader
, 0, 0);
1066 dsInvocations
+= KNOB_SIMD_WIDTH
;
1068 UPDATE_STAT(DsInvocations
, tsData
.NumDomainPoints
);
// arguments describing the DS output for the primitive assembler (tessPa) used below
1072 dsContext
.pOutputData
,
1073 dsContext
.vectorStride
,
1074 tsState
.numDsOutputAttribs
,
1076 tsData
.NumPrimitives
,
1077 tsState
.postDSTopology
);
1079 while (tessPa
.HasWork())
// with a GS enabled the tessellated primitives are handed to the GS stage
1081 if (HasGeometryShaderT::value
)
1083 GeometryShaderStage
<HasStreamOutT
, HasRastT
>(
1084 pDC
, workerId
, tessPa
, pGsOut
, pCutBuffer
, pCutStreamBuffer
, pSoPrimData
,
1085 _simd_set1_epi32(dsContext
.PrimitiveID
));
// stream-out of the DS output always uses stream 0 here
1089 if (HasStreamOutT::value
)
1091 StreamOut(pDC
, tessPa
, workerId
, pSoPrimData
, 0);
1094 if (HasRastT::value
)
1096 simdvector prim
[3]; // Only deal with triangles, lines, or points
1097 RDTSC_START(FEPAAssemble
);
1098 #if SWR_ENABLE_ASSERTS
1101 tessPa
.Assemble(VERTEX_POSITION_SLOT
, prim
);
1102 RDTSC_STOP(FEPAAssemble
, 1, 0);
1103 SWR_ASSERT(assemble
);
// every primitive of this patch carries the patch's primitive ID
1105 SWR_ASSERT(pfnClipFunc
);
1106 pfnClipFunc(pDC
, tessPa
, workerId
, prim
,
1107 GenMask(tessPa
.NumPrims()), _simd_set1_epi32(dsContext
.PrimitiveID
));
1113 } // while (tessPa.HasWork())
1114 } // for (uint32_t p = 0; p < numPrims; ++p)
1116 TSDestroyCtx(tsCtx
);
1119 //////////////////////////////////////////////////////////////////////////
1120 /// @brief FE handler for SwrDraw.
/// For each instance, fetches and vertex-shades one SIMD of vertices at a
/// time, assembles primitives, and routes them to the tessellation stages,
/// the geometry shader stage, or stream-out and/or pfnProcessPrims.
1121 /// @tparam IsIndexedT - Is indexed drawing enabled
/// @tparam IsCutIndexEnabledT - passed with IsIndexedT to PA_FACTORY to choose the primitive assembler
1122 /// @tparam HasTessellationT - Is tessellation enabled
1123 /// @tparam HasGeometryShaderT - Is the geometry shader stage enabled
1124 /// @tparam HasStreamOutT - Is stream-out enabled
1125 /// @tparam HasRastT - Is rasterization enabled
1126 /// @param pContext - pointer to SWR context.
1127 /// @param pDC - pointer to draw context.
1128 /// @param workerId - thread's worker id.
1129 /// @param pUserData - Pointer to DRAW_WORK
1131 typename IsIndexedT
,
1132 typename IsCutIndexEnabledT
,
1133 typename HasTessellationT
,
1134 typename HasGeometryShaderT
,
1135 typename HasStreamOutT
,
1138 SWR_CONTEXT
*pContext
,
1144 #if KNOB_ENABLE_TOSS_POINTS
1145 if (KNOB_TOSS_QUEUE_FE
)
1151 RDTSC_START(FEProcessDraw
);
1153 DRAW_WORK
& work
= *(DRAW_WORK
*)pUserData
;
1154 const API_STATE
& state
= GetApiState(pDC
);
// per-lane offsets 0..7 (_mm256_set_epi32 lists elements from highest to lowest);
// this constant is written for exactly 8 lanes
1155 __m256i vScale
= _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
1156 SWR_VS_CONTEXT vsContext
;
1160 uint32_t endVertex
= work
.numVerts
;
// address one past the last index this draw requests, computed per index size;
// used further down to clamp fetchInfo.pLastIndex
1162 const int32_t* pLastRequestedIndex
= nullptr;
1163 if (IsIndexedT::value
)
1168 indexSize
= sizeof(uint32_t);
1169 pLastRequestedIndex
= &(work
.pIB
[endVertex
]);
1172 indexSize
= sizeof(uint16_t);
1173 // nasty address offset to last index
1174 pLastRequestedIndex
= (int32_t*)(&(((uint16_t*)work
.pIB
)[endVertex
]));
1177 indexSize
= sizeof(uint8_t);
1178 // nasty address offset to last index
1179 pLastRequestedIndex
= (int32_t*)(&(((uint8_t*)work
.pIB
)[endVertex
]));
1187 // No cuts, prune partial primitives.
1188 endVertex
= GetNumVerts(state
.topology
, GetNumPrims(state
.topology
, work
.numVerts
));
1191 SWR_FETCH_CONTEXT fetchInfo
= { 0 };
1192 fetchInfo
.pStreams
= &state
.vertexBuffers
[0];
1193 fetchInfo
.StartInstance
= work
.startInstance
;
1194 fetchInfo
.StartVertex
= 0;
// the fetch shader writes 'vin', which the vertex shader then reads
1196 vsContext
.pVin
= &vin
;
1198 if (IsIndexedT::value
)
1200 fetchInfo
.BaseVertex
= work
.baseVertex
;
1202 // if the entire index buffer isn't being consumed, set the last index
1203 // so that fetches < a SIMD wide will be masked off
1204 fetchInfo
.pLastIndex
= (const int32_t*)(((uint8_t*)state
.indexBuffer
.pIndices
) + state
.indexBuffer
.size
);
1205 if (pLastRequestedIndex
< fetchInfo
.pLastIndex
)
1207 fetchInfo
.pLastIndex
= pLastRequestedIndex
;
1212 fetchInfo
.StartVertex
= work
.startVertex
;
// numPrims is only consumed by the RDTSC_STOP at the end of this function
1215 #ifdef KNOB_ENABLE_RDTSC
1216 uint32_t numPrims
= GetNumPrims(state
.topology
, work
.numVerts
);
// GS scratch buffers stay nullptr unless the GS stage is compiled in
1219 void* pGsOut
= nullptr;
1220 void* pCutBuffer
= nullptr;
1221 void* pStreamCutBuffer
= nullptr;
1222 if (HasGeometryShaderT::value
)
1224 AllocateGsBuffers(pDC
, state
, &pGsOut
, &pCutBuffer
, &pStreamCutBuffer
);
// the template selection and the API state must agree about tessellation
1227 if (HasTessellationT::value
)
1229 SWR_ASSERT(state
.tsState
.tsEnable
== true);
1230 SWR_ASSERT(state
.pfnHsFunc
!= nullptr);
1231 SWR_ASSERT(state
.pfnDsFunc
!= nullptr);
1233 AllocateTessellationData(pContext
);
1237 SWR_ASSERT(state
.tsState
.tsEnable
== false);
1238 SWR_ASSERT(state
.pfnHsFunc
== nullptr);
1239 SWR_ASSERT(state
.pfnDsFunc
== nullptr);
1242 // allocate space for streamout input prim data
1243 uint32_t* pSoPrimData
= nullptr;
1244 if (HasStreamOutT::value
)
1246 pSoPrimData
= (uint32_t*)pDC
->pArena
->AllocAligned(4096, 16);
1249 // choose primitive assembler
1250 PA_FACTORY
<IsIndexedT
, IsCutIndexEnabledT
> paFactory(pDC
, state
.topology
, work
.numVerts
);
1251 PA_STATE
& pa
= paFactory
.GetPA();
1253 /// @todo: temporarily move instance loop in the FE to ensure SO ordering
1254 for (uint32_t instanceNum
= 0; instanceNum
< work
.numInstances
; instanceNum
++)
// indexed draws start again at the beginning of the index buffer for each instance
1259 if (IsIndexedT::value
)
1261 fetchInfo
.pIndices
= work
.pIB
;
// non-indexed draws synthesize sequential indices: startVertexID + {0..7}
1265 vIndex
= _simd_add_epi32(_simd_set1_epi32(work
.startVertexID
), vScale
);
1266 fetchInfo
.pIndices
= (const int32_t*)&vIndex
;
1269 fetchInfo
.CurInstance
= instanceNum
;
1270 vsContext
.InstanceID
= instanceNum
;
1272 while (pa
.HasWork())
1274 // PaGetNextVsOutput currently has the side effect of updating some PA state machine state.
1275 // So we need to keep this outside of (i < endVertex) check.
1276 simdmask
* pvCutIndices
= nullptr;
1277 if (IsIndexedT::value
)
1279 pvCutIndices
= &pa
.GetNextVsIndices();
// the vertex shader writes straight into the PA's next output slot
1282 simdvertex
& vout
= pa
.GetNextVsOutput();
1283 vsContext
.pVout
= &vout
;
1288 // 1. Execute FS/VS for a single SIMD.
1289 RDTSC_START(FEFetchShader
);
1290 state
.pfnFetchFunc(fetchInfo
, vin
);
1291 RDTSC_STOP(FEFetchShader
, 0, 0);
1293 // forward fetch generated vertex IDs to the vertex shader
1294 vsContext
.VertexID
= fetchInfo
.VertexID
;
1296 // Setup active mask for vertex shader.
1297 vsContext
.mask
= GenerateMask(endVertex
- i
);
1299 // forward cut mask to the PA
1300 if (IsIndexedT::value
)
1302 *pvCutIndices
= _simd_movemask_ps(_simd_castsi_ps(fetchInfo
.CutMask
));
1305 UPDATE_STAT(IaVertices
, GetNumInvocations(i
, endVertex
));
1307 #if KNOB_ENABLE_TOSS_POINTS
1308 if (!KNOB_TOSS_FETCH
)
1311 RDTSC_START(FEVertexShader
);
1312 state
.pfnVertexFunc(GetPrivateState(pDC
), &vsContext
);
1313 RDTSC_STOP(FEVertexShader
, 0, 0);
1315 UPDATE_STAT(VsInvocations
, GetNumInvocations(i
, endVertex
));
1319 // 2. Assemble primitives given the last two SIMD.
1322 simdvector prim
[MAX_NUM_VERTS_PER_PRIM
];
1323 // PaAssemble returns false if there is not enough verts to assemble.
1324 RDTSC_START(FEPAAssemble
);
1325 bool assemble
= pa
.Assemble(VERTEX_POSITION_SLOT
, prim
);
1326 RDTSC_STOP(FEPAAssemble
, 1, 0);
1328 #if KNOB_ENABLE_TOSS_POINTS
1329 if (!KNOB_TOSS_FETCH
)
1332 #if KNOB_ENABLE_TOSS_POINTS
1338 UPDATE_STAT(IaPrimitives
, pa
.NumPrims());
// routing: tessellation takes priority, then the GS, otherwise stream-out / rasterization
1340 if (HasTessellationT::value
)
1342 TessellationStages
<HasGeometryShaderT
, HasStreamOutT
, HasRastT
>(
1343 pDC
, workerId
, pa
, pGsOut
, pCutBuffer
, pStreamCutBuffer
, pSoPrimData
, pa
.GetPrimID(work
.startPrimID
));
1345 else if (HasGeometryShaderT::value
)
1347 GeometryShaderStage
<HasStreamOutT
, HasRastT
>(
1348 pDC
, workerId
, pa
, pGsOut
, pCutBuffer
, pStreamCutBuffer
, pSoPrimData
, pa
.GetPrimID(work
.startPrimID
));
1352 // If streamout is enabled then stream vertices out to memory.
1353 if (HasStreamOutT::value
)
1355 StreamOut(pDC
, pa
, workerId
, pSoPrimData
, 0);
1358 if (HasRastT::value
)
1360 SWR_ASSERT(pDC
->pState
->pfnProcessPrims
);
1361 pDC
->pState
->pfnProcessPrims(pDC
, pa
, workerId
, prim
,
1362 GenMask(pa
.NumPrims()), pa
.GetPrimID(work
.startPrimID
));
1368 } while (pa
.NextPrim());
// advance to the next SIMD of vertices: step the index pointer by
// KNOB_SIMD_WIDTH indices, or bump the synthesized indices
1370 i
+= KNOB_SIMD_WIDTH
;
1371 if (IsIndexedT::value
)
1373 fetchInfo
.pIndices
= (int*)((uint8_t*)fetchInfo
.pIndices
+ KNOB_SIMD_WIDTH
* indexSize
);
1377 vIndex
= _simd_add_epi32(vIndex
, _simd_set1_epi32(KNOB_SIMD_WIDTH
));
1383 RDTSC_STOP(FEProcessDraw
, numPrims
* work
.numInstances
, pDC
->drawId
);
1386 struct FEDrawChooser
1388 typedef PFN_FE_WORK_FUNC FuncType
;
1390 template <typename
... ArgsB
>
1391 static FuncType
GetFunc()
1393 return ProcessDraw
<ArgsB
...>;
1398 // Selector for correct templated Draw front-end function
1399 PFN_FE_WORK_FUNC
GetProcessDrawFunc(
1401 bool IsCutIndexEnabled
,
1402 bool HasTessellation
,
1403 bool HasGeometryShader
,
1405 bool HasRasterization
)
1407 return TemplateArgUnroller
<FEDrawChooser
>::GetFunc(IsIndexed
, IsCutIndexEnabled
, HasTessellation
, HasGeometryShader
, HasStreamOut
, HasRasterization
);
1410 //////////////////////////////////////////////////////////////////////////
1411 /// @brief Processes attributes for the backend based on linkage mask and
1412 /// linkage map. Essentially just doing an SOA->AOS conversion and pack.
/// For every backend attribute, assembles that attribute for one primitive,
/// stores it per vertex, pads to 3 vertices, and (when swizzling is enabled)
/// applies constant component overrides.
1413 /// @param pDC - Draw context
1414 /// @param pa - Primitive Assembly state
1415 /// @param linkageMask - NOTE(review): not referenced in the visible body; looks stale - confirm
1416 /// @param pLinkageMap - NOTE(review): not referenced in the visible body; slot routing comes from backendState.swizzleMap
1417 /// @param triIndex - Triangle to process attributes for
/// @param primId - primitive ID; its bits are used as the constant for SWR_CONSTANT_SOURCE_PRIM_ID
1418 /// @param pBuffer - Output result
1419 template<typename NumVertsT
, typename IsSwizzledT
, typename HasConstantInterpT
, typename IsDegenerate
>
1420 INLINE
void ProcessAttributes(
1427 static_assert(NumVertsT::value
> 0 && NumVertsT::value
<= 3, "Invalid value for NumVertsT");
1428 const SWR_BACKEND_STATE
& backendState
= pDC
->pState
->state
.backendState
;
1429 // Conservative Rasterization requires degenerate tris to have constant attribute interpolation
1430 LONG constantInterpMask
= IsDegenerate::value
? 0xFFFFFFFF : backendState
.constantInterpolationMask
;
1431 const uint32_t provokingVertex
= pDC
->pState
->state
.frontendState
.topologyProvokingVertex
;
1432 const PRIMITIVE_TOPOLOGY topo
= pDC
->pState
->state
.topology
;
// constant override values; indexed as [constantSource][component] below
1434 static const float constTable
[3][4] = {
1435 {0.0f
, 0.0f
, 0.0f
, 0.0f
},
1436 {0.0f
, 0.0f
, 0.0f
, 1.0f
},
1437 {1.0f
, 1.0f
, 1.0f
, 1.0f
}
1440 for (uint32_t i
= 0; i
< backendState
.numAttributes
; ++i
)
// swizzled: the source slot comes from the swizzle map; otherwise slots map 1:1
1443 if (IsSwizzledT::value
)
1445 SWR_ATTRIB_SWIZZLE attribSwizzle
= backendState
.swizzleMap
[i
];
1446 inputSlot
= VERTEX_ATTRIB_START_SLOT
+ attribSwizzle
.sourceAttrib
;
1451 inputSlot
= VERTEX_ATTRIB_START_SLOT
+ i
;
1454 __m128 attrib
[3]; // triangle attribs (always 4 wide)
// remember where this attribute starts so the constant overrides below can patch it
1455 float* pAttribStart
= pBuffer
;
1457 if (HasConstantInterpT::value
|| IsDegenerate::value
)
// bit i of the mask marks attribute i as constant-interpolated
1459 if (_bittest(&constantInterpMask
, i
))
// lookup tables used by the topology cases below to pick the primitive
// (adjustedTriIndex) and the vertex within it (vid) to replicate
1462 uint32_t adjustedTriIndex
;
1463 static const uint32_t tristripProvokingVertex
[] = { 0, 2, 1 };
1464 static const int32_t quadProvokingTri
[2][4] = { {0, 0, 0, 1}, {0, -1, 0, 0} };
1465 static const uint32_t quadProvokingVertex
[2][4] = { {0, 1, 2, 2}, {0, 1, 1, 2} };
1466 static const int32_t qstripProvokingTri
[2][4] = { {0, 0, 0, 1}, {-1, 0, 0, 0} };
1467 static const uint32_t qstripProvokingVertex
[2][4] = { {0, 1, 2, 1}, {0, 0, 2, 1} };
// tables are indexed by triangle parity (triIndex & 1) and the provoking vertex
1471 adjustedTriIndex
= triIndex
+ quadProvokingTri
[triIndex
& 1][provokingVertex
];
1472 vid
= quadProvokingVertex
[triIndex
& 1][provokingVertex
];
1474 case TOP_QUAD_STRIP
:
1475 adjustedTriIndex
= triIndex
+ qstripProvokingTri
[triIndex
& 1][provokingVertex
];
1476 vid
= qstripProvokingVertex
[triIndex
& 1][provokingVertex
];
1478 case TOP_TRIANGLE_STRIP
:
1479 adjustedTriIndex
= triIndex
;
1480 vid
= (triIndex
& 1)
1481 ? tristripProvokingVertex
[provokingVertex
]
1485 adjustedTriIndex
= triIndex
;
1486 vid
= provokingVertex
;
1490 pa
.AssembleSingle(inputSlot
, adjustedTriIndex
, attrib
);
// constant interpolation: the same (provoking) vertex value is stored for every vertex
// NOTE: this inner 'i' shadows the attribute index 'i' of the enclosing loop
1492 for (uint32_t i
= 0; i
< NumVertsT::value
; ++i
)
1494 _mm_store_ps(pBuffer
, attrib
[vid
]);
1500 pa
.AssembleSingle(inputSlot
, triIndex
, attrib
);
// NOTE: this inner 'i' shadows the attribute index 'i' of the enclosing loop
1502 for (uint32_t i
= 0; i
< NumVertsT::value
; ++i
)
1504 _mm_store_ps(pBuffer
, attrib
[i
]);
1511 pa
.AssembleSingle(inputSlot
, triIndex
, attrib
);
// NOTE: this inner 'i' shadows the attribute index 'i' of the enclosing loop
1513 for (uint32_t i
= 0; i
< NumVertsT::value
; ++i
)
1515 _mm_store_ps(pBuffer
, attrib
[i
]);
1520 // pad out the attrib buffer to 3 verts to ensure the triangle
1521 // interpolation code in the pixel shader works correctly for the
1522 // 3 topologies - point, line, tri. This effectively zeros out the
1523 // effect of the missing vertices in the triangle interpolation.
1524 for (uint32_t v
= NumVertsT::value
; v
< 3; ++v
)
1526 _mm_store_ps(pBuffer
, attrib
[NumVertsT::value
- 1]);
1530 // check for constant source overrides
1531 if (IsSwizzledT::value
)
1533 uint32_t mask
= backendState
.swizzleMap
[i
].componentOverrideMask
;
// visit each set bit of the override mask, lowest first, clearing it as we go
1537 while (_BitScanForward(&comp
, mask
))
1539 mask
&= ~(1 << comp
);
1541 float constantValue
= 0.0f
;
1542 switch ((SWR_CONSTANT_SOURCE
)backendState
.swizzleMap
[i
].constantSource
)
1544 case SWR_CONSTANT_SOURCE_CONST_0000
:
1545 case SWR_CONSTANT_SOURCE_CONST_0001_FLOAT
:
1546 case SWR_CONSTANT_SOURCE_CONST_1111_FLOAT
:
1547 constantValue
= constTable
[backendState
.swizzleMap
[i
].constantSource
][comp
];
1549 case SWR_CONSTANT_SOURCE_PRIM_ID
:
// the integer primitive ID is passed through bit-for-bit, not converted to float
1550 constantValue
= *(float*)&primId
;
1554 // apply constant value to all 3 vertices
1555 for (uint32_t v
= 0; v
< 3; ++v
)
// each vertex occupies 4 floats, so component 'comp' of vertex v is at comp + v * 4
1557 pAttribStart
[comp
+ v
* 4] = constantValue
;
// Function pointer type matching the ProcessAttributes instantiations returned
// by ProcessAttributesChooser::GetFunc().
1566 typedef void(*PFN_PROCESS_ATTRIBUTES
)(DRAW_CONTEXT
*, PA_STATE
&, uint32_t, uint32_t, float*);
1568 struct ProcessAttributesChooser
1570 typedef PFN_PROCESS_ATTRIBUTES FuncType
;
1572 template <typename
... ArgsB
>
1573 static FuncType
GetFunc()
1575 return ProcessAttributes
<ArgsB
...>;
1579 PFN_PROCESS_ATTRIBUTES
GetProcessAttributesFunc(uint32_t NumVerts
, bool IsSwizzled
, bool HasConstantInterp
, bool IsDegenerate
= false)
1581 return TemplateArgUnroller
<ProcessAttributesChooser
>::GetFunc(IntArg
<1, 3>{NumVerts
}, IsSwizzled
, HasConstantInterp
, IsDegenerate
);
1584 //////////////////////////////////////////////////////////////////////////
1585 /// @brief Processes enabled user clip distances. Loads the active clip
1586 /// distances from the PA, sets up barycentric equations, and
1587 /// stores the results to the output buffer
/// For each enabled distance, NumVerts floats are appended to the buffer.
/// @tparam NumVerts - number of vertices per primitive
1588 /// @param pa - Primitive Assembly state
1589 /// @param primIndex - primitive index to process
1590 /// @param clipDistMask - mask of enabled clip distances
1591 /// @param pUserClipBuffer - buffer to store results
1592 template<uint32_t NumVerts
>
1593 void ProcessUserClipDist(PA_STATE
& pa
, uint32_t primIndex
, uint8_t clipDistMask
, float* pUserClipBuffer
)
// walk the enabled distances from the lowest set bit upward, clearing each bit once handled
1596 while (_BitScanForward(&clipDist
, clipDistMask
))
1598 clipDistMask
&= ~(1 << clipDist
);
// four distances per attribute slot: distances 0-3 use the LO slot, any higher
// one the HI slot; the low two bits pick the component within the slot
1599 uint32_t clipSlot
= clipDist
>> 2;
1600 uint32_t clipComp
= clipDist
& 0x3;
1601 uint32_t clipAttribSlot
= clipSlot
== 0 ?
1602 VERTEX_CLIPCULL_DIST_LO_SLOT
: VERTEX_CLIPCULL_DIST_HI_SLOT
;
1604 __m128 primClipDist
[3];
1605 pa
.AssembleSingle(clipAttribSlot
, primIndex
, primClipDist
);
// pull the selected component out of each vertex's 4-wide clip distance vector
1607 float vertClipDist
[NumVerts
];
1608 for (uint32_t e
= 0; e
< NumVerts
; ++e
)
1610 OSALIGNSIMD(float) aVertClipDist
[4];
1611 _mm_store_ps(aVertClipDist
, primClipDist
[e
]);
1612 vertClipDist
[e
] = aVertClipDist
[clipComp
];
1615 // setup plane equations for barycentric interpolation in the backend
// all entries but the last are differences against the last vertex;
// the last entry is the last vertex's distance itself
1616 float baryCoeff
[NumVerts
];
1617 for (uint32_t e
= 0; e
< NumVerts
- 1; ++e
)
1619 baryCoeff
[e
] = vertClipDist
[e
] - vertClipDist
[NumVerts
- 1];
1621 baryCoeff
[NumVerts
- 1] = vertClipDist
[NumVerts
- 1];
// append this distance's coefficients to the output buffer
1623 for (uint32_t e
= 0; e
< NumVerts
; ++e
)
1625 *(pUserClipBuffer
++) = baryCoeff
[e
];
1630 //////////////////////////////////////////////////////////////////////////
1631 /// @brief Convert the X,Y coords of a triangle to the requested Fixed
1632 /// Point precision from FP32.
1633 template <typename PT
= FixedPointTraits
<Fixed_16_8
>>
1634 INLINE simdscalari
fpToFixedPointVertical(const simdscalar vIn
)
1636 simdscalar vFixed
= _simd_mul_ps(vIn
, _simd_set1_ps(PT::ScaleT::value
));
1637 return _simd_cvtps_epi32(vFixed
);
1640 //////////////////////////////////////////////////////////////////////////
1641 /// @brief Helper function to set the X,Y coords of a triangle to the
1642 /// requested Fixed Point precision from FP32.
1643 /// @param tri: simdvector[3] of FP triangle verts
1644 /// @param vXi: fixed point X coords of tri verts
1645 /// @param vYi: fixed point Y coords of tri verts
1646 INLINE
static void FPToFixedPoint(const simdvector
* const tri
, simdscalari (&vXi
)[3], simdscalari (&vYi
)[3])
1648 vXi
[0] = fpToFixedPointVertical(tri
[0].x
);
1649 vYi
[0] = fpToFixedPointVertical(tri
[0].y
);
1650 vXi
[1] = fpToFixedPointVertical(tri
[1].x
);
1651 vYi
[1] = fpToFixedPointVertical(tri
[1].y
);
1652 vXi
[2] = fpToFixedPointVertical(tri
[2].x
);
1653 vYi
[2] = fpToFixedPointVertical(tri
[2].y
);
1656 //////////////////////////////////////////////////////////////////////////
1657 /// @brief Calculate bounding box for current triangle
/// Takes the per-lane min and max of the three vertices' X and Y coords.
1658 /// @tparam CT: ConservativeRastFETraits type
/// @param tri: triangle verts; not read by the code visible here
1659 /// @param vX: fixed point X position for triangle verts
1660 /// @param vY: fixed point Y position for triangle verts
1661 /// @param bbox: fixed point bbox
1662 /// *Note*: expects vX, vY to be in the correct precision for the type
1663 /// of rasterization. This avoids unnecessary FP->fixed conversions.
1664 template <typename CT
>
1665 INLINE
void calcBoundingBoxIntVertical(const simdvector
* const tri
, simdscalari (&vX
)[3], simdscalari (&vY
)[3], simdBBox
&bbox
)
// min X over the three verts
1667 simdscalari vMinX
= vX
[0];
1668 vMinX
= _simd_min_epi32(vMinX
, vX
[1]);
1669 vMinX
= _simd_min_epi32(vMinX
, vX
[2]);
// max X over the three verts
1671 simdscalari vMaxX
= vX
[0];
1672 vMaxX
= _simd_max_epi32(vMaxX
, vX
[1]);
1673 vMaxX
= _simd_max_epi32(vMaxX
, vX
[2]);
// min Y over the three verts
1675 simdscalari vMinY
= vY
[0];
1676 vMinY
= _simd_min_epi32(vMinY
, vY
[1]);
1677 vMinY
= _simd_min_epi32(vMinY
, vY
[2]);
// max Y over the three verts
1679 simdscalari vMaxY
= vY
[0];
1680 vMaxY
= _simd_max_epi32(vMaxY
, vY
[1]);
1681 vMaxY
= _simd_max_epi32(vMaxY
, vY
[2]);
// NOTE(review): only the bbox.bottom store is visible in this chunk; left/right/top are
// presumably set from vMinX/vMaxX/vMinY on lines not shown here - confirm
1686 bbox
.bottom
= vMaxY
;
1689 //////////////////////////////////////////////////////////////////////////
1690 /// @brief FEConservativeRastT specialization of calcBoundingBoxIntVertical
1691 /// Offsets BBox for conservative rast
1693 INLINE
void calcBoundingBoxIntVertical
<FEConservativeRastT
>(const simdvector
* const tri
, simdscalari (&vX
)[3], simdscalari (&vY
)[3], simdBBox
&bbox
)
1695 // FE conservative rast traits
1696 typedef FEConservativeRastT CT
;
1698 simdscalari vMinX
= vX
[0];
1699 vMinX
= _simd_min_epi32(vMinX
, vX
[1]);
1700 vMinX
= _simd_min_epi32(vMinX
, vX
[2]);
1702 simdscalari vMaxX
= vX
[0];
1703 vMaxX
= _simd_max_epi32(vMaxX
, vX
[1]);
1704 vMaxX
= _simd_max_epi32(vMaxX
, vX
[2]);
1706 simdscalari vMinY
= vY
[0];
1707 vMinY
= _simd_min_epi32(vMinY
, vY
[1]);
1708 vMinY
= _simd_min_epi32(vMinY
, vY
[2]);
1710 simdscalari vMaxY
= vY
[0];
1711 vMaxY
= _simd_max_epi32(vMaxY
, vY
[1]);
1712 vMaxY
= _simd_max_epi32(vMaxY
, vY
[2]);
1714 /// Bounding box needs to be expanded by 1/512 before snapping to 16.8 for conservative rasterization
1715 /// expand bbox by 1/256; coverage will be correctly handled in the rasterizer.
1716 bbox
.left
= _simd_sub_epi32(vMinX
, _simd_set1_epi32(CT::BoundingBoxOffsetT::value
));
1717 bbox
.right
= _simd_add_epi32(vMaxX
, _simd_set1_epi32(CT::BoundingBoxOffsetT::value
));
1718 bbox
.top
= _simd_sub_epi32(vMinY
, _simd_set1_epi32(CT::BoundingBoxOffsetT::value
));
1719 bbox
.bottom
= _simd_add_epi32(vMaxY
, _simd_set1_epi32(CT::BoundingBoxOffsetT::value
));
1722 //////////////////////////////////////////////////////////////////////////
1723 /// @brief Bin triangle primitives to macro tiles. Performs setup, clipping
1724 /// culling, viewport transform, etc.
1725 /// @param pDC - pointer to draw context.
1726 /// @param pa - The primitive assembly object.
1727 /// @param workerId - thread's worker id. Even thread has a unique id.
1728 /// @param tri - Contains triangle position data for SIMDs worth of triangles.
1729 /// @param primID - Primitive ID for each triangle.
1730 /// @tparam CT - ConservativeRastFETraits
1731 template <typename CT
>
1740 RDTSC_START(FEBinTriangles
);
1742 const API_STATE
& state
= GetApiState(pDC
);
1743 const SWR_RASTSTATE
& rastState
= state
.rastState
;
1744 const SWR_FRONTEND_STATE
& feState
= state
.frontendState
;
1745 const SWR_GS_STATE
& gsState
= state
.gsState
;
1746 MacroTileMgr
*pTileMgr
= pDC
->pTileMgr
;
1749 simdscalar vRecipW0
= _simd_set1_ps(1.0f
);
1750 simdscalar vRecipW1
= _simd_set1_ps(1.0f
);
1751 simdscalar vRecipW2
= _simd_set1_ps(1.0f
);
1753 if (!feState
.vpTransformDisable
)
1755 // perspective divide
1756 vRecipW0
= _simd_div_ps(_simd_set1_ps(1.0f
), tri
[0].w
);
1757 vRecipW1
= _simd_div_ps(_simd_set1_ps(1.0f
), tri
[1].w
);
1758 vRecipW2
= _simd_div_ps(_simd_set1_ps(1.0f
), tri
[2].w
);
1760 tri
[0].v
[0] = _simd_mul_ps(tri
[0].v
[0], vRecipW0
);
1761 tri
[1].v
[0] = _simd_mul_ps(tri
[1].v
[0], vRecipW1
);
1762 tri
[2].v
[0] = _simd_mul_ps(tri
[2].v
[0], vRecipW2
);
1764 tri
[0].v
[1] = _simd_mul_ps(tri
[0].v
[1], vRecipW0
);
1765 tri
[1].v
[1] = _simd_mul_ps(tri
[1].v
[1], vRecipW1
);
1766 tri
[2].v
[1] = _simd_mul_ps(tri
[2].v
[1], vRecipW2
);
1768 tri
[0].v
[2] = _simd_mul_ps(tri
[0].v
[2], vRecipW0
);
1769 tri
[1].v
[2] = _simd_mul_ps(tri
[1].v
[2], vRecipW1
);
1770 tri
[2].v
[2] = _simd_mul_ps(tri
[2].v
[2], vRecipW2
);
1772 // viewport transform to screen coords
1773 viewportTransform
<3>(tri
, state
.vpMatrices
);
1776 // adjust for pixel center location
1777 simdscalar offset
= g_pixelOffsets
[rastState
.pixelLocation
];
1778 tri
[0].x
= _simd_add_ps(tri
[0].x
, offset
);
1779 tri
[0].y
= _simd_add_ps(tri
[0].y
, offset
);
1781 tri
[1].x
= _simd_add_ps(tri
[1].x
, offset
);
1782 tri
[1].y
= _simd_add_ps(tri
[1].y
, offset
);
1784 tri
[2].x
= _simd_add_ps(tri
[2].x
, offset
);
1785 tri
[2].y
= _simd_add_ps(tri
[2].y
, offset
);
1787 simdscalari vXi
[3], vYi
[3];
1788 // Set vXi, vYi to required fixed point precision
1789 FPToFixedPoint(tri
, vXi
, vYi
);
1792 simdscalari vAi
[3], vBi
[3];
1793 triangleSetupABIntVertical(vXi
, vYi
, vAi
, vBi
);
1796 simdscalari vDet
[2];
1797 calcDeterminantIntVertical(vAi
, vBi
, vDet
);
1800 int maskLo
= _simd_movemask_pd(_simd_castsi_pd(_simd_cmpeq_epi64(vDet
[0], _simd_setzero_si())));
1801 int maskHi
= _simd_movemask_pd(_simd_castsi_pd(_simd_cmpeq_epi64(vDet
[1], _simd_setzero_si())));
1803 int cullZeroAreaMask
= maskLo
| (maskHi
<< (KNOB_SIMD_WIDTH
/ 2));
1805 uint32_t origTriMask
= triMask
;
1806 // don't cull degenerate triangles if we're conservatively rasterizing
1807 if(!CT::IsConservativeT::value
)
1809 triMask
&= ~cullZeroAreaMask
;
1812 // determine front winding tris
1814 // CCW det <= 0; 0 area triangles are marked as backfacing, which is required behavior for conservative rast
1815 maskLo
= _simd_movemask_pd(_simd_castsi_pd(_simd_cmpgt_epi64(vDet
[0], _simd_setzero_si())));
1816 maskHi
= _simd_movemask_pd(_simd_castsi_pd(_simd_cmpgt_epi64(vDet
[1], _simd_setzero_si())));
1817 int cwTriMask
= maskLo
| (maskHi
<< (KNOB_SIMD_WIDTH
/2) );
1819 uint32_t frontWindingTris
;
1820 if (rastState
.frontWinding
== SWR_FRONTWINDING_CW
)
1822 frontWindingTris
= cwTriMask
;
1826 frontWindingTris
= ~cwTriMask
;
1831 switch ((SWR_CULLMODE
)rastState
.cullMode
)
1833 case SWR_CULLMODE_BOTH
: cullTris
= 0xffffffff; break;
1834 case SWR_CULLMODE_NONE
: cullTris
= 0x0; break;
1835 case SWR_CULLMODE_FRONT
: cullTris
= frontWindingTris
; break;
1836 // 0 area triangles are marked as backfacing, which is required behavior for conservative rast
1837 case SWR_CULLMODE_BACK
: cullTris
= ~frontWindingTris
; break;
1838 default: SWR_ASSERT(false, "Invalid cull mode: %d", rastState
.cullMode
); cullTris
= 0x0; break;
1841 triMask
&= ~cullTris
;
1843 if (origTriMask
^ triMask
)
1845 RDTSC_EVENT(FECullZeroAreaAndBackface
, _mm_popcnt_u32(origTriMask
^ triMask
), 0);
1848 /// Note: these variable initializations must stay above any 'goto endBenTriangles'
1849 // compute per tri backface
1850 uint32_t frontFaceMask
= frontWindingTris
;
1851 uint32_t *pPrimID
= (uint32_t *)&primID
;
1853 // for center sample pattern, all samples are at pixel center; calculate coverage
1854 // once at center and broadcast the results in the backend
1855 const SWR_MULTISAMPLE_COUNT sampleCount
= (rastState
.samplePattern
== SWR_MSAA_STANDARD_PATTERN
) ? rastState
.sampleCount
: SWR_MULTISAMPLE_1X
;
1856 uint32_t edgeEnable
;
1857 PFN_WORK_FUNC pfnWork
;
1858 if(CT::IsConservativeT::value
)
1860 // determine which edges of the degenerate tri, if any, are valid to rasterize.
1861 // used to call the appropriate templated rasterizer function
1862 if(cullZeroAreaMask
> 0)
1865 simdscalari x0x1Mask
= _simd_cmpeq_epi32(vXi
[0], vXi
[1]);
1866 simdscalari y0y1Mask
= _simd_cmpeq_epi32(vYi
[0], vYi
[1]);
1867 uint32_t e0Mask
= _simd_movemask_ps(_simd_castsi_ps(_simd_and_si(x0x1Mask
, y0y1Mask
)));
1870 simdscalari x1x2Mask
= _simd_cmpeq_epi32(vXi
[1], vXi
[2]);
1871 simdscalari y1y2Mask
= _simd_cmpeq_epi32(vYi
[1], vYi
[2]);
1872 uint32_t e1Mask
= _simd_movemask_ps(_simd_castsi_ps(_simd_and_si(x1x2Mask
, y1y2Mask
)));
1875 // if v0 == v1 & v1 == v2, v0 == v2
1876 uint32_t e2Mask
= e0Mask
& e1Mask
;
1877 SWR_ASSERT(KNOB_SIMD_WIDTH
== 8, "Need to update degenerate mask code for avx512");
1879 // edge order: e0 = v0v1, e1 = v1v2, e2 = v0v2
1880 // 32 bit binary: 0000 0000 0010 0100 1001 0010 0100 1001
1881 e0Mask
= pdep_u32(e0Mask
, 0x00249249);
1882 // 32 bit binary: 0000 0000 0100 1001 0010 0100 1001 0010
1883 e1Mask
= pdep_u32(e1Mask
, 0x00492492);
1884 // 32 bit binary: 0000 0000 1001 0010 0100 1001 0010 0100
1885 e2Mask
= pdep_u32(e2Mask
, 0x00924924);
1887 edgeEnable
= (0x00FFFFFF & (~(e0Mask
| e1Mask
| e2Mask
)));
1891 edgeEnable
= 0x00FFFFFF;
1896 // degenerate triangles won't be sent to rasterizer; just enable all edges
1897 pfnWork
= GetRasterizerFunc(sampleCount
, (rastState
.conservativeRast
> 0),
1898 (SWR_INPUT_COVERAGE
)pDC
->pState
->state
.psState
.inputCoverage
, ALL_EDGES_VALID
,
1899 (rastState
.scissorEnable
> 0));
1904 goto endBinTriangles
;
1907 // Calc bounding box of triangles
1909 calcBoundingBoxIntVertical
<CT
>(tri
, vXi
, vYi
, bbox
);
1911 // determine if triangle falls between pixel centers and discard
1912 // only discard for non-MSAA case and when conservative rast is disabled
1913 // (left + 127) & ~255
1914 // (right + 128) & ~255
1915 if(rastState
.sampleCount
== SWR_MULTISAMPLE_1X
&& (!CT::IsConservativeT::value
))
1917 origTriMask
= triMask
;
1921 simdscalari left
= _simd_add_epi32(bbox
.left
, _simd_set1_epi32(127));
1922 left
= _simd_and_si(left
, _simd_set1_epi32(~255));
1923 simdscalari right
= _simd_add_epi32(bbox
.right
, _simd_set1_epi32(128));
1924 right
= _simd_and_si(right
, _simd_set1_epi32(~255));
1926 simdscalari vMaskH
= _simd_cmpeq_epi32(left
, right
);
1928 simdscalari top
= _simd_add_epi32(bbox
.top
, _simd_set1_epi32(127));
1929 top
= _simd_and_si(top
, _simd_set1_epi32(~255));
1930 simdscalari bottom
= _simd_add_epi32(bbox
.bottom
, _simd_set1_epi32(128));
1931 bottom
= _simd_and_si(bottom
, _simd_set1_epi32(~255));
1933 simdscalari vMaskV
= _simd_cmpeq_epi32(top
, bottom
);
1934 vMaskV
= _simd_or_si(vMaskH
, vMaskV
);
1935 cullCenterMask
= _simd_movemask_ps(_simd_castsi_ps(vMaskV
));
1938 triMask
&= ~cullCenterMask
;
1940 if(origTriMask
^ triMask
)
1942 RDTSC_EVENT(FECullBetweenCenters
, _mm_popcnt_u32(origTriMask
^ triMask
), 0);
1946 // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since right/bottom edge is exclusive.
1947 bbox
.left
= _simd_max_epi32(bbox
.left
, _simd_set1_epi32(state
.scissorInFixedPoint
.left
));
1948 bbox
.top
= _simd_max_epi32(bbox
.top
, _simd_set1_epi32(state
.scissorInFixedPoint
.top
));
1949 bbox
.right
= _simd_min_epi32(_simd_sub_epi32(bbox
.right
, _simd_set1_epi32(1)), _simd_set1_epi32(state
.scissorInFixedPoint
.right
));
1950 bbox
.bottom
= _simd_min_epi32(_simd_sub_epi32(bbox
.bottom
, _simd_set1_epi32(1)), _simd_set1_epi32(state
.scissorInFixedPoint
.bottom
));
1952 if(CT::IsConservativeT::value
)
1954 // in the case where a degenerate triangle is on a scissor edge, we need to make sure the primitive bbox has
1955 // some area. Bump the right/bottom edges out
1956 simdscalari topEqualsBottom
= _simd_cmpeq_epi32(bbox
.top
, bbox
.bottom
);
1957 bbox
.bottom
= _simd_blendv_epi32(bbox
.bottom
, _simd_add_epi32(bbox
.bottom
, _simd_set1_epi32(1)), topEqualsBottom
);
1958 simdscalari leftEqualsRight
= _simd_cmpeq_epi32(bbox
.left
, bbox
.right
);
1959 bbox
.right
= _simd_blendv_epi32(bbox
.right
, _simd_add_epi32(bbox
.right
, _simd_set1_epi32(1)), leftEqualsRight
);
1962 // Cull tris completely outside scissor
1964 simdscalari maskOutsideScissorX
= _simd_cmpgt_epi32(bbox
.left
, bbox
.right
);
1965 simdscalari maskOutsideScissorY
= _simd_cmpgt_epi32(bbox
.top
, bbox
.bottom
);
1966 simdscalari maskOutsideScissorXY
= _simd_or_si(maskOutsideScissorX
, maskOutsideScissorY
);
1967 uint32_t maskOutsideScissor
= _simd_movemask_ps(_simd_castsi_ps(maskOutsideScissorXY
));
1968 triMask
= triMask
& ~maskOutsideScissor
;
1973 goto endBinTriangles
;
1976 // Convert triangle bbox to macrotile units.
1977 bbox
.left
= _simd_srai_epi32(bbox
.left
, KNOB_MACROTILE_X_DIM_FIXED_SHIFT
);
1978 bbox
.top
= _simd_srai_epi32(bbox
.top
, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT
);
1979 bbox
.right
= _simd_srai_epi32(bbox
.right
, KNOB_MACROTILE_X_DIM_FIXED_SHIFT
);
1980 bbox
.bottom
= _simd_srai_epi32(bbox
.bottom
, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT
);
1982 OSALIGNSIMD(uint32_t) aMTLeft
[KNOB_SIMD_WIDTH
], aMTRight
[KNOB_SIMD_WIDTH
], aMTTop
[KNOB_SIMD_WIDTH
], aMTBottom
[KNOB_SIMD_WIDTH
];
1983 _simd_store_si((simdscalari
*)aMTLeft
, bbox
.left
);
1984 _simd_store_si((simdscalari
*)aMTRight
, bbox
.right
);
1985 _simd_store_si((simdscalari
*)aMTTop
, bbox
.top
);
1986 _simd_store_si((simdscalari
*)aMTBottom
, bbox
.bottom
);
1988 // transpose verts needed for backend
1989 /// @todo modify BE to take non-transformed verts
1990 __m128 vHorizX
[8], vHorizY
[8], vHorizZ
[8], vHorizW
[8];
1991 vTranspose3x8(vHorizX
, tri
[0].x
, tri
[1].x
, tri
[2].x
);
1992 vTranspose3x8(vHorizY
, tri
[0].y
, tri
[1].y
, tri
[2].y
);
1993 vTranspose3x8(vHorizZ
, tri
[0].z
, tri
[1].z
, tri
[2].z
);
1994 vTranspose3x8(vHorizW
, vRecipW0
, vRecipW1
, vRecipW2
);
1996 // store render target array index
1997 OSALIGNSIMD(uint32_t) aRTAI
[KNOB_SIMD_WIDTH
];
1998 if (gsState
.gsEnable
&& gsState
.emitsRenderTargetArrayIndex
)
2000 simdvector vRtai
[3];
2001 pa
.Assemble(VERTEX_RTAI_SLOT
, vRtai
);
2003 vRtaii
= _simd_castps_si(vRtai
[0].x
);
2004 _simd_store_si((simdscalari
*)aRTAI
, vRtaii
);
2008 _simd_store_si((simdscalari
*)aRTAI
, _simd_setzero_si());
2011 // scan remaining valid triangles and bin each separately
2012 while (_BitScanForward(&triIndex
, triMask
))
2014 uint32_t linkageCount
= state
.backendState
.numAttributes
;
2015 uint32_t numScalarAttribs
= linkageCount
* 4;
2021 if(CT::IsConservativeT::value
)
2023 // only rasterize valid edges if we have a degenerate primitive
2024 int32_t triEdgeEnable
= (edgeEnable
>> (triIndex
* 3)) & ALL_EDGES_VALID
;
2025 work
.pfnWork
= GetRasterizerFunc(sampleCount
, (rastState
.conservativeRast
> 0),
2026 (SWR_INPUT_COVERAGE
)pDC
->pState
->state
.psState
.inputCoverage
, triEdgeEnable
,
2027 (rastState
.scissorEnable
> 0));
2029 // Degenerate triangles are required to be constant interpolated
2030 isDegenerate
= (triEdgeEnable
!= ALL_EDGES_VALID
) ? true : false;
2034 isDegenerate
= false;
2035 work
.pfnWork
= pfnWork
;
2038 // Select attribute processor
2039 PFN_PROCESS_ATTRIBUTES pfnProcessAttribs
= GetProcessAttributesFunc(3,
2040 state
.backendState
.swizzleEnable
, state
.backendState
.constantInterpolationMask
, isDegenerate
);
2042 TRIANGLE_WORK_DESC
&desc
= work
.desc
.tri
;
2044 desc
.triFlags
.frontFacing
= state
.forceFront
? 1 : ((frontFaceMask
>> triIndex
) & 1);
2045 desc
.triFlags
.primID
= pPrimID
[triIndex
];
2046 desc
.triFlags
.renderTargetArrayIndex
= aRTAI
[triIndex
];
2048 auto pArena
= pDC
->pArena
;
2049 SWR_ASSERT(pArena
!= nullptr);
2051 // store active attribs
2052 float *pAttribs
= (float*)pArena
->AllocAligned(numScalarAttribs
* 3 * sizeof(float), 16);
2053 desc
.pAttribs
= pAttribs
;
2054 desc
.numAttribs
= linkageCount
;
2055 pfnProcessAttribs(pDC
, pa
, triIndex
, pPrimID
[triIndex
], desc
.pAttribs
);
2057 // store triangle vertex data
2058 desc
.pTriBuffer
= (float*)pArena
->AllocAligned(4 * 4 * sizeof(float), 16);
2060 _mm_store_ps(&desc
.pTriBuffer
[0], vHorizX
[triIndex
]);
2061 _mm_store_ps(&desc
.pTriBuffer
[4], vHorizY
[triIndex
]);
2062 _mm_store_ps(&desc
.pTriBuffer
[8], vHorizZ
[triIndex
]);
2063 _mm_store_ps(&desc
.pTriBuffer
[12], vHorizW
[triIndex
]);
2065 // store user clip distances
2066 if (rastState
.clipDistanceMask
)
2068 uint32_t numClipDist
= _mm_popcnt_u32(rastState
.clipDistanceMask
);
2069 desc
.pUserClipBuffer
= (float*)pArena
->Alloc(numClipDist
* 3 * sizeof(float));
2070 ProcessUserClipDist
<3>(pa
, triIndex
, rastState
.clipDistanceMask
, desc
.pUserClipBuffer
);
2073 for (uint32_t y
= aMTTop
[triIndex
]; y
<= aMTBottom
[triIndex
]; ++y
)
2075 for (uint32_t x
= aMTLeft
[triIndex
]; x
<= aMTRight
[triIndex
]; ++x
)
2077 #if KNOB_ENABLE_TOSS_POINTS
2078 if (!KNOB_TOSS_SETUP_TRIS
)
2081 pTileMgr
->enqueue(x
, y
, &work
);
2085 triMask
&= ~(1 << triIndex
);
2089 RDTSC_STOP(FEBinTriangles
, 1, 0);
2092 struct FEBinTrianglesChooser
2094 typedef PFN_PROCESS_PRIMS FuncType
;
2096 template <typename
... ArgsB
>
2097 static FuncType
GetFunc()
2099 return BinTriangles
<ConservativeRastFETraits
<ArgsB
...>>;
2103 // Selector for correct templated BinTrinagles function
2104 PFN_PROCESS_PRIMS
GetBinTrianglesFunc(bool IsConservative
)
2106 return TemplateArgUnroller
<FEBinTrianglesChooser
>::GetFunc(IsConservative
);
2109 //////////////////////////////////////////////////////////////////////////
2110 /// @brief Bin SIMD points to the backend. Points accepted by CanUseSimplePoints are binned to a single macrotile; all other points are bloated by their point size and binned to every macrotile their bbox touches.
2111 /// @param pDC - pointer to draw context.
2112 /// @param pa - The primitive assembly object.
2113 /// @param workerId - thread's worker id. Every thread has a unique id.
2114 /// @param prim - Contains point position data for a SIMD's worth of points.
2115 /// @param primID - Primitive ID for each point.
2124 RDTSC_START(FEBinPoints
);
2126 simdvector
& primVerts
= prim
[0];
2128 const API_STATE
& state
= GetApiState(pDC
);
2129 const SWR_FRONTEND_STATE
& feState
= state
.frontendState
;
2130 const SWR_GS_STATE
& gsState
= state
.gsState
;
2131 const SWR_RASTSTATE
& rastState
= state
.rastState
;
2133 // Select attribute processor
2134 PFN_PROCESS_ATTRIBUTES pfnProcessAttribs
= GetProcessAttributesFunc(1,
2135 state
.backendState
.swizzleEnable
, state
.backendState
.constantInterpolationMask
);
2137 if (!feState
.vpTransformDisable
)
2139 // perspective divide
2140 simdscalar vRecipW0
= _simd_div_ps(_simd_set1_ps(1.0f
), primVerts
.w
);
2141 primVerts
.x
= _simd_mul_ps(primVerts
.x
, vRecipW0
);
2142 primVerts
.y
= _simd_mul_ps(primVerts
.y
, vRecipW0
);
2143 primVerts
.z
= _simd_mul_ps(primVerts
.z
, vRecipW0
);
2145 // viewport transform to screen coords
2146 viewportTransform
<1>(&primVerts
, state
.vpMatrices
);
2149 // adjust for pixel center location
2150 simdscalar offset
= g_pixelOffsets
[rastState
.pixelLocation
];
2151 primVerts
.x
= _simd_add_ps(primVerts
.x
, offset
);
2152 primVerts
.y
= _simd_add_ps(primVerts
.y
, offset
);
2154 // convert to fixed point
2155 simdscalari vXi
, vYi
;
2156 vXi
= fpToFixedPointVertical(primVerts
.x
);
2157 vYi
= fpToFixedPointVertical(primVerts
.y
);
2159 if (CanUseSimplePoints(pDC
))
2161 // adjust for top-left rule
2162 vXi
= _simd_sub_epi32(vXi
, _simd_set1_epi32(1));
2163 vYi
= _simd_sub_epi32(vYi
, _simd_set1_epi32(1));
2165 // cull points off the top-left edge of the viewport
2166 primMask
&= ~_simd_movemask_ps(_simd_castsi_ps(vXi
));
2167 primMask
&= ~_simd_movemask_ps(_simd_castsi_ps(vYi
));
2169 // compute macro tile coordinates
2170 simdscalari macroX
= _simd_srai_epi32(vXi
, KNOB_MACROTILE_X_DIM_FIXED_SHIFT
);
2171 simdscalari macroY
= _simd_srai_epi32(vYi
, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT
);
2173 OSALIGNSIMD(uint32_t) aMacroX
[KNOB_SIMD_WIDTH
], aMacroY
[KNOB_SIMD_WIDTH
];
2174 _simd_store_si((simdscalari
*)aMacroX
, macroX
);
2175 _simd_store_si((simdscalari
*)aMacroY
, macroY
);
2177 // compute raster tile coordinates
2178 simdscalari rasterX
= _simd_srai_epi32(vXi
, KNOB_TILE_X_DIM_SHIFT
+ FIXED_POINT_SHIFT
);
2179 simdscalari rasterY
= _simd_srai_epi32(vYi
, KNOB_TILE_Y_DIM_SHIFT
+ FIXED_POINT_SHIFT
);
2181 // compute raster tile relative x,y for coverage mask
2182 simdscalari tileAlignedX
= _simd_slli_epi32(rasterX
, KNOB_TILE_X_DIM_SHIFT
);
2183 simdscalari tileAlignedY
= _simd_slli_epi32(rasterY
, KNOB_TILE_Y_DIM_SHIFT
);
2185 simdscalari tileRelativeX
= _simd_sub_epi32(_simd_srai_epi32(vXi
, FIXED_POINT_SHIFT
), tileAlignedX
);
2186 simdscalari tileRelativeY
= _simd_sub_epi32(_simd_srai_epi32(vYi
, FIXED_POINT_SHIFT
), tileAlignedY
);
2188 OSALIGNSIMD(uint32_t) aTileRelativeX
[KNOB_SIMD_WIDTH
];
2189 OSALIGNSIMD(uint32_t) aTileRelativeY
[KNOB_SIMD_WIDTH
];
2190 _simd_store_si((simdscalari
*)aTileRelativeX
, tileRelativeX
);
2191 _simd_store_si((simdscalari
*)aTileRelativeY
, tileRelativeY
);
2193 OSALIGNSIMD(uint32_t) aTileAlignedX
[KNOB_SIMD_WIDTH
];
2194 OSALIGNSIMD(uint32_t) aTileAlignedY
[KNOB_SIMD_WIDTH
];
2195 _simd_store_si((simdscalari
*)aTileAlignedX
, tileAlignedX
);
2196 _simd_store_si((simdscalari
*)aTileAlignedY
, tileAlignedY
);
2198 OSALIGNSIMD(float) aZ
[KNOB_SIMD_WIDTH
];
2199 _simd_store_ps((float*)aZ
, primVerts
.z
);
2201 // store render target array index
2202 OSALIGNSIMD(uint32_t) aRTAI
[KNOB_SIMD_WIDTH
];
2203 if (gsState
.gsEnable
&& gsState
.emitsRenderTargetArrayIndex
)
2206 pa
.Assemble(VERTEX_RTAI_SLOT
, &vRtai
);
2207 simdscalari vRtaii
= _simd_castps_si(vRtai
.x
);
2208 _simd_store_si((simdscalari
*)aRTAI
, vRtaii
);
2212 _simd_store_si((simdscalari
*)aRTAI
, _simd_setzero_si());
2215 uint32_t *pPrimID
= (uint32_t *)&primID
;
2216 DWORD primIndex
= 0;
2218 const SWR_BACKEND_STATE
& backendState
= pDC
->pState
->state
.backendState
;
2220 // scan remaining valid points and bin each separately
2221 while (_BitScanForward(&primIndex
, primMask
))
2223 uint32_t linkageCount
= backendState
.numAttributes
;
2224 uint32_t numScalarAttribs
= linkageCount
* 4;
2229 TRIANGLE_WORK_DESC
&desc
= work
.desc
.tri
;
2231 // points are always front facing
2232 desc
.triFlags
.frontFacing
= 1;
2233 desc
.triFlags
.primID
= pPrimID
[primIndex
];
2234 desc
.triFlags
.renderTargetArrayIndex
= aRTAI
[primIndex
];
2236 work
.pfnWork
= RasterizeSimplePoint
;
2238 auto pArena
= pDC
->pArena
;
2239 SWR_ASSERT(pArena
!= nullptr);
2242 float *pAttribs
= (float*)pArena
->AllocAligned(3 * numScalarAttribs
* sizeof(float), 16);
2243 desc
.pAttribs
= pAttribs
;
2244 desc
.numAttribs
= linkageCount
;
2246 pfnProcessAttribs(pDC
, pa
, primIndex
, pPrimID
[primIndex
], pAttribs
);
2248 // store raster tile aligned x, y, perspective correct z
2249 float *pTriBuffer
= (float*)pArena
->AllocAligned(4 * sizeof(float), 16);
2250 desc
.pTriBuffer
= pTriBuffer
;
2251 *(uint32_t*)pTriBuffer
++ = aTileAlignedX
[primIndex
];
2252 *(uint32_t*)pTriBuffer
++ = aTileAlignedY
[primIndex
];
2253 *pTriBuffer
= aZ
[primIndex
];
2255 uint32_t tX
= aTileRelativeX
[primIndex
];
2256 uint32_t tY
= aTileRelativeY
[primIndex
];
2258 // pack the relative x,y into the coverageMask, the rasterizer will
2259 // generate the true coverage mask from it
2260 work
.desc
.tri
.triFlags
.coverageMask
= tX
| (tY
<< 4);
2263 MacroTileMgr
*pTileMgr
= pDC
->pTileMgr
;
2264 #if KNOB_ENABLE_TOSS_POINTS
2265 if (!KNOB_TOSS_SETUP_TRIS
)
2268 pTileMgr
->enqueue(aMacroX
[primIndex
], aMacroY
[primIndex
], &work
);
2270 primMask
&= ~(1 << primIndex
);
2275 // non simple points need to be potentially binned to multiple macro tiles
2276 simdscalar vPointSize
;
2277 if (rastState
.pointParam
)
2280 pa
.Assemble(VERTEX_POINT_SIZE_SLOT
, size
);
2281 vPointSize
= size
[0].x
;
2285 vPointSize
= _simd_set1_ps(rastState
.pointSize
);
2288 // bloat point to bbox
2290 bbox
.left
= bbox
.right
= vXi
;
2291 bbox
.top
= bbox
.bottom
= vYi
;
2293 simdscalar vHalfWidth
= _simd_mul_ps(vPointSize
, _simd_set1_ps(0.5f
));
2294 simdscalari vHalfWidthi
= fpToFixedPointVertical(vHalfWidth
);
2295 bbox
.left
= _simd_sub_epi32(bbox
.left
, vHalfWidthi
);
2296 bbox
.right
= _simd_add_epi32(bbox
.right
, vHalfWidthi
);
2297 bbox
.top
= _simd_sub_epi32(bbox
.top
, vHalfWidthi
);
2298 bbox
.bottom
= _simd_add_epi32(bbox
.bottom
, vHalfWidthi
);
2300 // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since right/bottom edge is exclusive.
2301 bbox
.left
= _simd_max_epi32(bbox
.left
, _simd_set1_epi32(state
.scissorInFixedPoint
.left
));
2302 bbox
.top
= _simd_max_epi32(bbox
.top
, _simd_set1_epi32(state
.scissorInFixedPoint
.top
));
2303 bbox
.right
= _simd_min_epi32(_simd_sub_epi32(bbox
.right
, _simd_set1_epi32(1)), _simd_set1_epi32(state
.scissorInFixedPoint
.right
));
2304 bbox
.bottom
= _simd_min_epi32(_simd_sub_epi32(bbox
.bottom
, _simd_set1_epi32(1)), _simd_set1_epi32(state
.scissorInFixedPoint
.bottom
));
2306 // Cull bloated points completely outside scissor
2307 simdscalari maskOutsideScissorX
= _simd_cmpgt_epi32(bbox
.left
, bbox
.right
);
2308 simdscalari maskOutsideScissorY
= _simd_cmpgt_epi32(bbox
.top
, bbox
.bottom
);
2309 simdscalari maskOutsideScissorXY
= _simd_or_si(maskOutsideScissorX
, maskOutsideScissorY
);
2310 uint32_t maskOutsideScissor
= _simd_movemask_ps(_simd_castsi_ps(maskOutsideScissorXY
));
2311 primMask
= primMask
& ~maskOutsideScissor
;
2313 // Convert bbox to macrotile units.
2314 bbox
.left
= _simd_srai_epi32(bbox
.left
, KNOB_MACROTILE_X_DIM_FIXED_SHIFT
);
2315 bbox
.top
= _simd_srai_epi32(bbox
.top
, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT
);
2316 bbox
.right
= _simd_srai_epi32(bbox
.right
, KNOB_MACROTILE_X_DIM_FIXED_SHIFT
);
2317 bbox
.bottom
= _simd_srai_epi32(bbox
.bottom
, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT
);
2319 OSALIGNSIMD(uint32_t) aMTLeft
[KNOB_SIMD_WIDTH
], aMTRight
[KNOB_SIMD_WIDTH
], aMTTop
[KNOB_SIMD_WIDTH
], aMTBottom
[KNOB_SIMD_WIDTH
];
2320 _simd_store_si((simdscalari
*)aMTLeft
, bbox
.left
);
2321 _simd_store_si((simdscalari
*)aMTRight
, bbox
.right
);
2322 _simd_store_si((simdscalari
*)aMTTop
, bbox
.top
);
2323 _simd_store_si((simdscalari
*)aMTBottom
, bbox
.bottom
);
2325 // store render target array index
2326 OSALIGNSIMD(uint32_t) aRTAI
[KNOB_SIMD_WIDTH
];
2327 if (gsState
.gsEnable
&& gsState
.emitsRenderTargetArrayIndex
)
2329 simdvector vRtai
[2];
2330 pa
.Assemble(VERTEX_RTAI_SLOT
, vRtai
);
2331 simdscalari vRtaii
= _simd_castps_si(vRtai
[0].x
);
2332 _simd_store_si((simdscalari
*)aRTAI
, vRtaii
);
2336 _simd_store_si((simdscalari
*)aRTAI
, _simd_setzero_si());
2339 OSALIGNSIMD(float) aPointSize
[KNOB_SIMD_WIDTH
];
2340 _simd_store_ps((float*)aPointSize
, vPointSize
);
2342 uint32_t *pPrimID
= (uint32_t *)&primID
;
2344 OSALIGNSIMD(float) aPrimVertsX
[KNOB_SIMD_WIDTH
];
2345 OSALIGNSIMD(float) aPrimVertsY
[KNOB_SIMD_WIDTH
];
2346 OSALIGNSIMD(float) aPrimVertsZ
[KNOB_SIMD_WIDTH
];
2348 _simd_store_ps((float*)aPrimVertsX
, primVerts
.x
);
2349 _simd_store_ps((float*)aPrimVertsY
, primVerts
.y
);
2350 _simd_store_ps((float*)aPrimVertsZ
, primVerts
.z
);
2352 // scan remaining valid prims and bin each separately
2353 const SWR_BACKEND_STATE
& backendState
= state
.backendState
;
2355 while (_BitScanForward(&primIndex
, primMask
))
2357 uint32_t linkageCount
= backendState
.numAttributes
;
2358 uint32_t numScalarAttribs
= linkageCount
* 4;
2363 TRIANGLE_WORK_DESC
&desc
= work
.desc
.tri
;
2365 desc
.triFlags
.frontFacing
= 1;
2366 desc
.triFlags
.primID
= pPrimID
[primIndex
];
2367 desc
.triFlags
.pointSize
= aPointSize
[primIndex
];
2368 desc
.triFlags
.renderTargetArrayIndex
= aRTAI
[primIndex
];
2370 work
.pfnWork
= RasterizeTriPoint
;
2372 auto pArena
= pDC
->pArena
;
2373 SWR_ASSERT(pArena
!= nullptr);
2375 // store active attribs
2376 desc
.pAttribs
= (float*)pArena
->AllocAligned(numScalarAttribs
* 3 * sizeof(float), 16);
2377 desc
.numAttribs
= linkageCount
;
2378 pfnProcessAttribs(pDC
, pa
, primIndex
, pPrimID
[primIndex
], desc
.pAttribs
);
2380 // store point vertex data
2381 float *pTriBuffer
= (float*)pArena
->AllocAligned(4 * sizeof(float), 16);
2382 desc
.pTriBuffer
= pTriBuffer
;
2383 *pTriBuffer
++ = aPrimVertsX
[primIndex
];
2384 *pTriBuffer
++ = aPrimVertsY
[primIndex
];
2385 *pTriBuffer
= aPrimVertsZ
[primIndex
];
2387 // store user clip distances
2388 if (rastState
.clipDistanceMask
)
2390 uint32_t numClipDist
= _mm_popcnt_u32(rastState
.clipDistanceMask
);
2391 desc
.pUserClipBuffer
= (float*)pArena
->Alloc(numClipDist
* 2 * sizeof(float));
2392 ProcessUserClipDist
<2>(pa
, primIndex
, rastState
.clipDistanceMask
, desc
.pUserClipBuffer
);
2395 MacroTileMgr
*pTileMgr
= pDC
->pTileMgr
;
2396 for (uint32_t y
= aMTTop
[primIndex
]; y
<= aMTBottom
[primIndex
]; ++y
)
2398 for (uint32_t x
= aMTLeft
[primIndex
]; x
<= aMTRight
[primIndex
]; ++x
)
2400 #if KNOB_ENABLE_TOSS_POINTS
2401 if (!KNOB_TOSS_SETUP_TRIS
)
2404 pTileMgr
->enqueue(x
, y
, &work
);
2409 primMask
&= ~(1 << primIndex
);
2416 RDTSC_STOP(FEBinPoints
, 1, 0);
2419 //////////////////////////////////////////////////////////////////////////
2420 /// @brief Bin SIMD lines to the backend.
2421 /// @param pDC - pointer to draw context.
2422 /// @param pa - The primitive assembly object.
2423 /// @param workerId - thread's worker id. Every thread has a unique id.
2424 /// @param prim - Contains line position data for a SIMD's worth of lines.
2425 /// @param primID - Primitive ID for each line.
2434 RDTSC_START(FEBinLines
);
2436 const API_STATE
& state
= GetApiState(pDC
);
2437 const SWR_RASTSTATE
& rastState
= state
.rastState
;
2438 const SWR_FRONTEND_STATE
& feState
= state
.frontendState
;
2439 const SWR_GS_STATE
& gsState
= state
.gsState
;
2441 // Select attribute processor
2442 PFN_PROCESS_ATTRIBUTES pfnProcessAttribs
= GetProcessAttributesFunc(2,
2443 state
.backendState
.swizzleEnable
, state
.backendState
.constantInterpolationMask
);
2445 simdscalar vRecipW0
= _simd_set1_ps(1.0f
);
2446 simdscalar vRecipW1
= _simd_set1_ps(1.0f
);
2448 if (!feState
.vpTransformDisable
)
2450 // perspective divide
2451 vRecipW0
= _simd_div_ps(_simd_set1_ps(1.0f
), prim
[0].w
);
2452 vRecipW1
= _simd_div_ps(_simd_set1_ps(1.0f
), prim
[1].w
);
2454 prim
[0].v
[0] = _simd_mul_ps(prim
[0].v
[0], vRecipW0
);
2455 prim
[1].v
[0] = _simd_mul_ps(prim
[1].v
[0], vRecipW1
);
2457 prim
[0].v
[1] = _simd_mul_ps(prim
[0].v
[1], vRecipW0
);
2458 prim
[1].v
[1] = _simd_mul_ps(prim
[1].v
[1], vRecipW1
);
2460 prim
[0].v
[2] = _simd_mul_ps(prim
[0].v
[2], vRecipW0
);
2461 prim
[1].v
[2] = _simd_mul_ps(prim
[1].v
[2], vRecipW1
);
2463 // viewport transform to screen coords
2464 viewportTransform
<2>(prim
, state
.vpMatrices
);
2467 // adjust for pixel center location
2468 simdscalar offset
= g_pixelOffsets
[rastState
.pixelLocation
];
2469 prim
[0].x
= _simd_add_ps(prim
[0].x
, offset
);
2470 prim
[0].y
= _simd_add_ps(prim
[0].y
, offset
);
2472 prim
[1].x
= _simd_add_ps(prim
[1].x
, offset
);
2473 prim
[1].y
= _simd_add_ps(prim
[1].y
, offset
);
2475 // convert to fixed point
2476 simdscalari vXi
[2], vYi
[2];
2477 vXi
[0] = fpToFixedPointVertical(prim
[0].x
);
2478 vYi
[0] = fpToFixedPointVertical(prim
[0].y
);
2479 vXi
[1] = fpToFixedPointVertical(prim
[1].x
);
2480 vYi
[1] = fpToFixedPointVertical(prim
[1].y
);
2482 // compute x-major vs y-major mask
2483 simdscalari xLength
= _simd_abs_epi32(_simd_sub_epi32(vXi
[0], vXi
[1]));
2484 simdscalari yLength
= _simd_abs_epi32(_simd_sub_epi32(vYi
[0], vYi
[1]));
2485 simdscalar vYmajorMask
= _simd_castsi_ps(_simd_cmpgt_epi32(yLength
, xLength
));
2486 uint32_t yMajorMask
= _simd_movemask_ps(vYmajorMask
);
2488 // cull zero-length lines
2489 simdscalari vZeroLengthMask
= _simd_cmpeq_epi32(xLength
, _simd_setzero_si());
2490 vZeroLengthMask
= _simd_and_si(vZeroLengthMask
, _simd_cmpeq_epi32(yLength
, _simd_setzero_si()));
2492 primMask
&= ~_simd_movemask_ps(_simd_castsi_ps(vZeroLengthMask
));
2494 uint32_t *pPrimID
= (uint32_t *)&primID
;
2496 simdscalar vUnused
= _simd_setzero_ps();
2498 // Calc bounding box of lines
2500 bbox
.left
= _simd_min_epi32(vXi
[0], vXi
[1]);
2501 bbox
.right
= _simd_max_epi32(vXi
[0], vXi
[1]);
2502 bbox
.top
= _simd_min_epi32(vYi
[0], vYi
[1]);
2503 bbox
.bottom
= _simd_max_epi32(vYi
[0], vYi
[1]);
2505 // bloat bbox by line width along minor axis
2506 simdscalar vHalfWidth
= _simd_set1_ps(rastState
.lineWidth
/ 2.0f
);
2507 simdscalari vHalfWidthi
= fpToFixedPointVertical(vHalfWidth
);
2509 bloatBox
.left
= _simd_sub_epi32(bbox
.left
, vHalfWidthi
);
2510 bloatBox
.right
= _simd_add_epi32(bbox
.right
, vHalfWidthi
);
2511 bloatBox
.top
= _simd_sub_epi32(bbox
.top
, vHalfWidthi
);
2512 bloatBox
.bottom
= _simd_add_epi32(bbox
.bottom
, vHalfWidthi
);
2514 bbox
.left
= _simd_blendv_epi32(bbox
.left
, bloatBox
.left
, vYmajorMask
);
2515 bbox
.right
= _simd_blendv_epi32(bbox
.right
, bloatBox
.right
, vYmajorMask
);
2516 bbox
.top
= _simd_blendv_epi32(bloatBox
.top
, bbox
.top
, vYmajorMask
);
2517 bbox
.bottom
= _simd_blendv_epi32(bloatBox
.bottom
, bbox
.bottom
, vYmajorMask
);
2519 // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since right/bottom edge is exclusive.
2520 bbox
.left
= _simd_max_epi32(bbox
.left
, _simd_set1_epi32(state
.scissorInFixedPoint
.left
));
2521 bbox
.top
= _simd_max_epi32(bbox
.top
, _simd_set1_epi32(state
.scissorInFixedPoint
.top
));
2522 bbox
.right
= _simd_min_epi32(_simd_sub_epi32(bbox
.right
, _simd_set1_epi32(1)), _simd_set1_epi32(state
.scissorInFixedPoint
.right
));
2523 bbox
.bottom
= _simd_min_epi32(_simd_sub_epi32(bbox
.bottom
, _simd_set1_epi32(1)), _simd_set1_epi32(state
.scissorInFixedPoint
.bottom
));
2525 // Cull prims completely outside scissor
2527 simdscalari maskOutsideScissorX
= _simd_cmpgt_epi32(bbox
.left
, bbox
.right
);
2528 simdscalari maskOutsideScissorY
= _simd_cmpgt_epi32(bbox
.top
, bbox
.bottom
);
2529 simdscalari maskOutsideScissorXY
= _simd_or_si(maskOutsideScissorX
, maskOutsideScissorY
);
2530 uint32_t maskOutsideScissor
= _simd_movemask_ps(_simd_castsi_ps(maskOutsideScissorXY
));
2531 primMask
= primMask
& ~maskOutsideScissor
;
2539 // Convert line bbox to macrotile units.
2540 bbox
.left
= _simd_srai_epi32(bbox
.left
, KNOB_MACROTILE_X_DIM_FIXED_SHIFT
);
2541 bbox
.top
= _simd_srai_epi32(bbox
.top
, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT
);
2542 bbox
.right
= _simd_srai_epi32(bbox
.right
, KNOB_MACROTILE_X_DIM_FIXED_SHIFT
);
2543 bbox
.bottom
= _simd_srai_epi32(bbox
.bottom
, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT
);
2545 OSALIGNSIMD(uint32_t) aMTLeft
[KNOB_SIMD_WIDTH
], aMTRight
[KNOB_SIMD_WIDTH
], aMTTop
[KNOB_SIMD_WIDTH
], aMTBottom
[KNOB_SIMD_WIDTH
];
2546 _simd_store_si((simdscalari
*)aMTLeft
, bbox
.left
);
2547 _simd_store_si((simdscalari
*)aMTRight
, bbox
.right
);
2548 _simd_store_si((simdscalari
*)aMTTop
, bbox
.top
);
2549 _simd_store_si((simdscalari
*)aMTBottom
, bbox
.bottom
);
2551 // transpose verts needed for backend
2552 /// @todo modify BE to take non-transformed verts
2553 __m128 vHorizX
[8], vHorizY
[8], vHorizZ
[8], vHorizW
[8];
2554 vTranspose3x8(vHorizX
, prim
[0].x
, prim
[1].x
, vUnused
);
2555 vTranspose3x8(vHorizY
, prim
[0].y
, prim
[1].y
, vUnused
);
2556 vTranspose3x8(vHorizZ
, prim
[0].z
, prim
[1].z
, vUnused
);
2557 vTranspose3x8(vHorizW
, vRecipW0
, vRecipW1
, vUnused
);
2559 // store render target array index
2560 OSALIGNSIMD(uint32_t) aRTAI
[KNOB_SIMD_WIDTH
];
2561 if (gsState
.gsEnable
&& gsState
.emitsRenderTargetArrayIndex
)
2563 simdvector vRtai
[2];
2564 pa
.Assemble(VERTEX_RTAI_SLOT
, vRtai
);
2565 simdscalari vRtaii
= _simd_castps_si(vRtai
[0].x
);
2566 _simd_store_si((simdscalari
*)aRTAI
, vRtaii
);
2570 _simd_store_si((simdscalari
*)aRTAI
, _simd_setzero_si());
2573 // scan remaining valid prims and bin each separately
2575 while (_BitScanForward(&primIndex
, primMask
))
2577 uint32_t linkageCount
= state
.backendState
.numAttributes
;
2578 uint32_t numScalarAttribs
= linkageCount
* 4;
2583 TRIANGLE_WORK_DESC
&desc
= work
.desc
.tri
;
2585 desc
.triFlags
.frontFacing
= 1;
2586 desc
.triFlags
.primID
= pPrimID
[primIndex
];
2587 desc
.triFlags
.yMajor
= (yMajorMask
>> primIndex
) & 1;
2588 desc
.triFlags
.renderTargetArrayIndex
= aRTAI
[primIndex
];
2590 work
.pfnWork
= RasterizeLine
;
2592 auto pArena
= pDC
->pArena
;
2593 SWR_ASSERT(pArena
!= nullptr);
2595 // store active attribs
2596 desc
.pAttribs
= (float*)pArena
->AllocAligned(numScalarAttribs
* 3 * sizeof(float), 16);
2597 desc
.numAttribs
= linkageCount
;
2598 pfnProcessAttribs(pDC
, pa
, primIndex
, pPrimID
[primIndex
], desc
.pAttribs
);
2600 // store line vertex data
2601 desc
.pTriBuffer
= (float*)pArena
->AllocAligned(4 * 4 * sizeof(float), 16);
2602 _mm_store_ps(&desc
.pTriBuffer
[0], vHorizX
[primIndex
]);
2603 _mm_store_ps(&desc
.pTriBuffer
[4], vHorizY
[primIndex
]);
2604 _mm_store_ps(&desc
.pTriBuffer
[8], vHorizZ
[primIndex
]);
2605 _mm_store_ps(&desc
.pTriBuffer
[12], vHorizW
[primIndex
]);
2607 // store user clip distances
2608 if (rastState
.clipDistanceMask
)
2610 uint32_t numClipDist
= _mm_popcnt_u32(rastState
.clipDistanceMask
);
2611 desc
.pUserClipBuffer
= (float*)pArena
->Alloc(numClipDist
* 2 * sizeof(float));
2612 ProcessUserClipDist
<2>(pa
, primIndex
, rastState
.clipDistanceMask
, desc
.pUserClipBuffer
);
2615 MacroTileMgr
*pTileMgr
= pDC
->pTileMgr
;
2616 for (uint32_t y
= aMTTop
[primIndex
]; y
<= aMTBottom
[primIndex
]; ++y
)
2618 for (uint32_t x
= aMTLeft
[primIndex
]; x
<= aMTRight
[primIndex
]; ++x
)
2620 #if KNOB_ENABLE_TOSS_POINTS
2621 if (!KNOB_TOSS_SETUP_TRIS
)
2624 pTileMgr
->enqueue(x
, y
, &work
);
2629 primMask
&= ~(1 << primIndex
);
2634 RDTSC_STOP(FEBinLines
, 1, 0);