1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
25 * @brief Implementation for Frontend which handles vertex processing,
26 * primitive assembly, clipping, binning, etc.
28 ******************************************************************************/
34 #include "rdtsc_core.h"
35 #include "rasterizer.h"
41 #include "tessellator.h"
44 //////////////////////////////////////////////////////////////////////////
45 /// @brief Helper macro to generate a bitmask
46 static INLINE
uint32_t GenMask(uint32_t numBits
)
48 SWR_ASSERT(numBits
<= (sizeof(uint32_t) * 8), "Too many bits (%d) for %s", numBits
, __FUNCTION__
);
49 return ((1U << numBits
) - 1);
//////////////////////////////////////////////////////////////////////////
/// @brief Offsets added to post-viewport vertex positions based on
///        the API-selected pixel sample location, indexed by
///        SWR_PIXEL_LOCATION enum value.
static const simdscalar g_pixelOffsets[SWR_PIXEL_LOCATION_UL + 1] =
{
    _simd_set1_ps(0.0f), // SWR_PIXEL_LOCATION_CENTER
    _simd_set1_ps(0.5f), // SWR_PIXEL_LOCATION_UL
};
61 //////////////////////////////////////////////////////////////////////////
62 /// @brief FE handler for SwrSync.
63 /// @param pContext - pointer to SWR context.
64 /// @param pDC - pointer to draw context.
65 /// @param workerId - thread's worker id. Even thread has a unique id.
66 /// @param pUserData - Pointer to user data passed back to sync callback.
67 /// @todo This should go away when we switch this to use compute threading.
69 SWR_CONTEXT
*pContext
,
74 SYNC_DESC
*pSync
= (SYNC_DESC
*)pUserData
;
77 work
.pfnWork
= ProcessSyncBE
;
78 work
.desc
.sync
= *pSync
;
80 MacroTileMgr
*pTileMgr
= pDC
->pTileMgr
;
81 pTileMgr
->enqueue(0, 0, &work
);
84 //////////////////////////////////////////////////////////////////////////
85 /// @brief FE handler for SwrGetStats.
86 /// @param pContext - pointer to SWR context.
87 /// @param pDC - pointer to draw context.
88 /// @param workerId - thread's worker id. Even thread has a unique id.
89 /// @param pUserData - Pointer to user data passed back to stats callback.
90 /// @todo This should go away when we switch this to use compute threading.
91 void ProcessQueryStats(
92 SWR_CONTEXT
*pContext
,
97 QUERY_DESC
*pQueryStats
= (QUERY_DESC
*)pUserData
;
99 work
.type
= QUERYSTATS
;
100 work
.pfnWork
= ProcessQueryStatsBE
;
101 work
.desc
.queryStats
= *pQueryStats
;
103 MacroTileMgr
*pTileMgr
= pDC
->pTileMgr
;
104 pTileMgr
->enqueue(0, 0, &work
);
107 //////////////////////////////////////////////////////////////////////////
108 /// @brief FE handler for SwrClearRenderTarget.
109 /// @param pContext - pointer to SWR context.
110 /// @param pDC - pointer to draw context.
111 /// @param workerId - thread's worker id. Even thread has a unique id.
112 /// @param pUserData - Pointer to user data passed back to clear callback.
113 /// @todo This should go away when we switch this to use compute threading.
115 SWR_CONTEXT
*pContext
,
120 CLEAR_DESC
*pClear
= (CLEAR_DESC
*)pUserData
;
121 MacroTileMgr
*pTileMgr
= pDC
->pTileMgr
;
123 const API_STATE
& state
= GetApiState(pDC
);
125 // queue a clear to each macro tile
126 // compute macro tile bounds for the current scissor/viewport
127 uint32_t macroTileLeft
= state
.scissorInFixedPoint
.left
/ KNOB_MACROTILE_X_DIM_FIXED
;
128 uint32_t macroTileRight
= state
.scissorInFixedPoint
.right
/ KNOB_MACROTILE_X_DIM_FIXED
;
129 uint32_t macroTileTop
= state
.scissorInFixedPoint
.top
/ KNOB_MACROTILE_Y_DIM_FIXED
;
130 uint32_t macroTileBottom
= state
.scissorInFixedPoint
.bottom
/ KNOB_MACROTILE_Y_DIM_FIXED
;
134 work
.pfnWork
= ProcessClearBE
;
135 work
.desc
.clear
= *pClear
;
137 for (uint32_t y
= macroTileTop
; y
<= macroTileBottom
; ++y
)
139 for (uint32_t x
= macroTileLeft
; x
<= macroTileRight
; ++x
)
141 pTileMgr
->enqueue(x
, y
, &work
);
146 //////////////////////////////////////////////////////////////////////////
147 /// @brief FE handler for SwrStoreTiles.
148 /// @param pContext - pointer to SWR context.
149 /// @param pDC - pointer to draw context.
150 /// @param workerId - thread's worker id. Even thread has a unique id.
151 /// @param pUserData - Pointer to user data passed back to callback.
152 /// @todo This should go away when we switch this to use compute threading.
153 void ProcessStoreTiles(
154 SWR_CONTEXT
*pContext
,
159 RDTSC_START(FEProcessStoreTiles
);
160 STORE_TILES_DESC
*pStore
= (STORE_TILES_DESC
*)pUserData
;
161 MacroTileMgr
*pTileMgr
= pDC
->pTileMgr
;
163 const API_STATE
& state
= GetApiState(pDC
);
165 // queue a store to each macro tile
166 // compute macro tile bounds for the current render target
167 const uint32_t macroWidth
= KNOB_MACROTILE_X_DIM
;
168 const uint32_t macroHeight
= KNOB_MACROTILE_Y_DIM
;
170 uint32_t numMacroTilesX
= ((uint32_t)state
.vp
[0].width
+ (uint32_t)state
.vp
[0].x
+ (macroWidth
- 1)) / macroWidth
;
171 uint32_t numMacroTilesY
= ((uint32_t)state
.vp
[0].height
+ (uint32_t)state
.vp
[0].y
+ (macroHeight
- 1)) / macroHeight
;
175 work
.type
= STORETILES
;
176 work
.pfnWork
= ProcessStoreTileBE
;
177 work
.desc
.storeTiles
= *pStore
;
179 for (uint32_t x
= 0; x
< numMacroTilesX
; ++x
)
181 for (uint32_t y
= 0; y
< numMacroTilesY
; ++y
)
183 pTileMgr
->enqueue(x
, y
, &work
);
187 RDTSC_STOP(FEProcessStoreTiles
, 0, pDC
->drawId
);
190 //////////////////////////////////////////////////////////////////////////
191 /// @brief FE handler for SwrInvalidateTiles.
192 /// @param pContext - pointer to SWR context.
193 /// @param pDC - pointer to draw context.
194 /// @param workerId - thread's worker id. Even thread has a unique id.
195 /// @param pUserData - Pointer to user data passed back to callback.
196 /// @todo This should go away when we switch this to use compute threading.
197 void ProcessDiscardInvalidateTiles(
198 SWR_CONTEXT
*pContext
,
203 RDTSC_START(FEProcessInvalidateTiles
);
204 DISCARD_INVALIDATE_TILES_DESC
*pInv
= (DISCARD_INVALIDATE_TILES_DESC
*)pUserData
;
205 MacroTileMgr
*pTileMgr
= pDC
->pTileMgr
;
209 if (pInv
->rect
.top
| pInv
->rect
.bottom
| pInv
->rect
.right
| pInv
->rect
.left
)
216 // Use viewport dimensions
217 const API_STATE
& state
= GetApiState(pDC
);
219 rect
.left
= (uint32_t)state
.vp
[0].x
;
220 rect
.right
= (uint32_t)(state
.vp
[0].x
+ state
.vp
[0].width
);
221 rect
.top
= (uint32_t)state
.vp
[0].y
;
222 rect
.bottom
= (uint32_t)(state
.vp
[0].y
+ state
.vp
[0].height
);
225 // queue a store to each macro tile
226 // compute macro tile bounds for the current render target
227 uint32_t macroWidth
= KNOB_MACROTILE_X_DIM
;
228 uint32_t macroHeight
= KNOB_MACROTILE_Y_DIM
;
230 // Setup region assuming full tiles
231 uint32_t macroTileStartX
= (rect
.left
+ (macroWidth
- 1)) / macroWidth
;
232 uint32_t macroTileStartY
= (rect
.top
+ (macroHeight
- 1)) / macroHeight
;
234 uint32_t macroTileEndX
= rect
.right
/ macroWidth
;
235 uint32_t macroTileEndY
= rect
.bottom
/ macroHeight
;
237 if (pInv
->fullTilesOnly
== false)
239 // include partial tiles
240 macroTileStartX
= rect
.left
/ macroWidth
;
241 macroTileStartY
= rect
.top
/ macroHeight
;
243 macroTileEndX
= (rect
.right
+ macroWidth
- 1) / macroWidth
;
244 macroTileEndY
= (rect
.bottom
+ macroHeight
- 1) / macroHeight
;
247 SWR_ASSERT(macroTileEndX
<= KNOB_NUM_HOT_TILES_X
);
248 SWR_ASSERT(macroTileEndY
<= KNOB_NUM_HOT_TILES_Y
);
250 macroTileEndX
= std::min
<uint32_t>(macroTileEndX
, KNOB_NUM_HOT_TILES_X
);
251 macroTileEndY
= std::min
<uint32_t>(macroTileEndY
, KNOB_NUM_HOT_TILES_Y
);
255 work
.type
= DISCARDINVALIDATETILES
;
256 work
.pfnWork
= ProcessDiscardInvalidateTilesBE
;
257 work
.desc
.discardInvalidateTiles
= *pInv
;
259 for (uint32_t x
= macroTileStartX
; x
< macroTileEndX
; ++x
)
261 for (uint32_t y
= macroTileStartY
; y
< macroTileEndY
; ++y
)
263 pTileMgr
->enqueue(x
, y
, &work
);
267 RDTSC_STOP(FEProcessInvalidateTiles
, 0, pDC
->drawId
);
270 //////////////////////////////////////////////////////////////////////////
271 /// @brief Computes the number of primitives given the number of verts.
272 /// @param mode - primitive topology for draw operation.
273 /// @param numPrims - number of vertices or indices for draw.
274 /// @todo Frontend needs to be refactored. This will go in appropriate place then.
275 uint32_t GetNumPrims(
276 PRIMITIVE_TOPOLOGY mode
,
281 case TOP_POINT_LIST
: return numPrims
;
282 case TOP_TRIANGLE_LIST
: return numPrims
/ 3;
283 case TOP_TRIANGLE_STRIP
: return numPrims
< 3 ? 0 : numPrims
- 2;
284 case TOP_TRIANGLE_FAN
: return numPrims
< 3 ? 0 : numPrims
- 2;
285 case TOP_TRIANGLE_DISC
: return numPrims
< 2 ? 0 : numPrims
- 1;
286 case TOP_QUAD_LIST
: return numPrims
/ 4;
287 case TOP_QUAD_STRIP
: return numPrims
< 4 ? 0 : (numPrims
- 2) / 2;
288 case TOP_LINE_STRIP
: return numPrims
< 2 ? 0 : numPrims
- 1;
289 case TOP_LINE_LIST
: return numPrims
/ 2;
290 case TOP_LINE_LOOP
: return numPrims
;
291 case TOP_RECT_LIST
: return numPrims
/ 3;
292 case TOP_LINE_LIST_ADJ
: return numPrims
/ 4;
293 case TOP_LISTSTRIP_ADJ
: return numPrims
< 3 ? 0 : numPrims
- 3;
294 case TOP_TRI_LIST_ADJ
: return numPrims
/ 6;
295 case TOP_TRI_STRIP_ADJ
: return numPrims
< 4 ? 0 : (numPrims
/ 2) - 2;
297 case TOP_PATCHLIST_1
:
298 case TOP_PATCHLIST_2
:
299 case TOP_PATCHLIST_3
:
300 case TOP_PATCHLIST_4
:
301 case TOP_PATCHLIST_5
:
302 case TOP_PATCHLIST_6
:
303 case TOP_PATCHLIST_7
:
304 case TOP_PATCHLIST_8
:
305 case TOP_PATCHLIST_9
:
306 case TOP_PATCHLIST_10
:
307 case TOP_PATCHLIST_11
:
308 case TOP_PATCHLIST_12
:
309 case TOP_PATCHLIST_13
:
310 case TOP_PATCHLIST_14
:
311 case TOP_PATCHLIST_15
:
312 case TOP_PATCHLIST_16
:
313 case TOP_PATCHLIST_17
:
314 case TOP_PATCHLIST_18
:
315 case TOP_PATCHLIST_19
:
316 case TOP_PATCHLIST_20
:
317 case TOP_PATCHLIST_21
:
318 case TOP_PATCHLIST_22
:
319 case TOP_PATCHLIST_23
:
320 case TOP_PATCHLIST_24
:
321 case TOP_PATCHLIST_25
:
322 case TOP_PATCHLIST_26
:
323 case TOP_PATCHLIST_27
:
324 case TOP_PATCHLIST_28
:
325 case TOP_PATCHLIST_29
:
326 case TOP_PATCHLIST_30
:
327 case TOP_PATCHLIST_31
:
328 case TOP_PATCHLIST_32
:
329 return numPrims
/ (mode
- TOP_PATCHLIST_BASE
);
332 case TOP_POINT_LIST_BF
:
333 case TOP_LINE_STRIP_CONT
:
334 case TOP_LINE_STRIP_BF
:
335 case TOP_LINE_STRIP_CONT_BF
:
336 case TOP_TRIANGLE_FAN_NOSTIPPLE
:
337 case TOP_TRI_STRIP_REVERSE
:
338 case TOP_PATCHLIST_BASE
:
340 SWR_ASSERT(false, "Unsupported topology: %d", mode
);
347 //////////////////////////////////////////////////////////////////////////
348 /// @brief Computes the number of verts given the number of primitives.
349 /// @param mode - primitive topology for draw operation.
350 /// @param numPrims - number of primitives for draw.
351 uint32_t GetNumVerts(
352 PRIMITIVE_TOPOLOGY mode
,
357 case TOP_POINT_LIST
: return numPrims
;
358 case TOP_TRIANGLE_LIST
: return numPrims
* 3;
359 case TOP_TRIANGLE_STRIP
: return numPrims
? numPrims
+ 2 : 0;
360 case TOP_TRIANGLE_FAN
: return numPrims
? numPrims
+ 2 : 0;
361 case TOP_TRIANGLE_DISC
: return numPrims
? numPrims
+ 1 : 0;
362 case TOP_QUAD_LIST
: return numPrims
* 4;
363 case TOP_QUAD_STRIP
: return numPrims
? numPrims
* 2 + 2 : 0;
364 case TOP_LINE_STRIP
: return numPrims
? numPrims
+ 1 : 0;
365 case TOP_LINE_LIST
: return numPrims
* 2;
366 case TOP_LINE_LOOP
: return numPrims
;
367 case TOP_RECT_LIST
: return numPrims
* 3;
368 case TOP_LINE_LIST_ADJ
: return numPrims
* 4;
369 case TOP_LISTSTRIP_ADJ
: return numPrims
? numPrims
+ 3 : 0;
370 case TOP_TRI_LIST_ADJ
: return numPrims
* 6;
371 case TOP_TRI_STRIP_ADJ
: return numPrims
? (numPrims
+ 2) * 2 : 0;
373 case TOP_PATCHLIST_1
:
374 case TOP_PATCHLIST_2
:
375 case TOP_PATCHLIST_3
:
376 case TOP_PATCHLIST_4
:
377 case TOP_PATCHLIST_5
:
378 case TOP_PATCHLIST_6
:
379 case TOP_PATCHLIST_7
:
380 case TOP_PATCHLIST_8
:
381 case TOP_PATCHLIST_9
:
382 case TOP_PATCHLIST_10
:
383 case TOP_PATCHLIST_11
:
384 case TOP_PATCHLIST_12
:
385 case TOP_PATCHLIST_13
:
386 case TOP_PATCHLIST_14
:
387 case TOP_PATCHLIST_15
:
388 case TOP_PATCHLIST_16
:
389 case TOP_PATCHLIST_17
:
390 case TOP_PATCHLIST_18
:
391 case TOP_PATCHLIST_19
:
392 case TOP_PATCHLIST_20
:
393 case TOP_PATCHLIST_21
:
394 case TOP_PATCHLIST_22
:
395 case TOP_PATCHLIST_23
:
396 case TOP_PATCHLIST_24
:
397 case TOP_PATCHLIST_25
:
398 case TOP_PATCHLIST_26
:
399 case TOP_PATCHLIST_27
:
400 case TOP_PATCHLIST_28
:
401 case TOP_PATCHLIST_29
:
402 case TOP_PATCHLIST_30
:
403 case TOP_PATCHLIST_31
:
404 case TOP_PATCHLIST_32
:
405 return numPrims
* (mode
- TOP_PATCHLIST_BASE
);
408 case TOP_POINT_LIST_BF
:
409 case TOP_LINE_STRIP_CONT
:
410 case TOP_LINE_STRIP_BF
:
411 case TOP_LINE_STRIP_CONT_BF
:
412 case TOP_TRIANGLE_FAN_NOSTIPPLE
:
413 case TOP_TRI_STRIP_REVERSE
:
414 case TOP_PATCHLIST_BASE
:
416 SWR_ASSERT(false, "Unsupported topology: %d", mode
);
423 //////////////////////////////////////////////////////////////////////////
424 /// @brief Return number of verts per primitive.
425 /// @param topology - topology
426 /// @param includeAdjVerts - include adjacent verts in primitive vertices
427 INLINE
uint32_t NumVertsPerPrim(PRIMITIVE_TOPOLOGY topology
, bool includeAdjVerts
)
429 uint32_t numVerts
= 0;
433 case TOP_POINT_LIST_BF
:
438 case TOP_LINE_LIST_ADJ
:
440 case TOP_LINE_STRIP_CONT
:
441 case TOP_LINE_STRIP_BF
:
442 case TOP_LISTSTRIP_ADJ
:
445 case TOP_TRIANGLE_LIST
:
446 case TOP_TRIANGLE_STRIP
:
447 case TOP_TRIANGLE_FAN
:
448 case TOP_TRI_LIST_ADJ
:
449 case TOP_TRI_STRIP_ADJ
:
450 case TOP_TRI_STRIP_REVERSE
:
458 case TOP_PATCHLIST_1
:
459 case TOP_PATCHLIST_2
:
460 case TOP_PATCHLIST_3
:
461 case TOP_PATCHLIST_4
:
462 case TOP_PATCHLIST_5
:
463 case TOP_PATCHLIST_6
:
464 case TOP_PATCHLIST_7
:
465 case TOP_PATCHLIST_8
:
466 case TOP_PATCHLIST_9
:
467 case TOP_PATCHLIST_10
:
468 case TOP_PATCHLIST_11
:
469 case TOP_PATCHLIST_12
:
470 case TOP_PATCHLIST_13
:
471 case TOP_PATCHLIST_14
:
472 case TOP_PATCHLIST_15
:
473 case TOP_PATCHLIST_16
:
474 case TOP_PATCHLIST_17
:
475 case TOP_PATCHLIST_18
:
476 case TOP_PATCHLIST_19
:
477 case TOP_PATCHLIST_20
:
478 case TOP_PATCHLIST_21
:
479 case TOP_PATCHLIST_22
:
480 case TOP_PATCHLIST_23
:
481 case TOP_PATCHLIST_24
:
482 case TOP_PATCHLIST_25
:
483 case TOP_PATCHLIST_26
:
484 case TOP_PATCHLIST_27
:
485 case TOP_PATCHLIST_28
:
486 case TOP_PATCHLIST_29
:
487 case TOP_PATCHLIST_30
:
488 case TOP_PATCHLIST_31
:
489 case TOP_PATCHLIST_32
:
490 numVerts
= topology
- TOP_PATCHLIST_BASE
;
493 SWR_ASSERT(false, "Unsupported topology: %d", topology
);
501 case TOP_LISTSTRIP_ADJ
:
502 case TOP_LINE_LIST_ADJ
: numVerts
= 4; break;
503 case TOP_TRI_STRIP_ADJ
:
504 case TOP_TRI_LIST_ADJ
: numVerts
= 6; break;
512 //////////////////////////////////////////////////////////////////////////
513 /// @brief Generate mask from remaining work.
514 /// @param numWorkItems - Number of items being worked on by a SIMD.
515 static INLINE simdscalari
GenerateMask(uint32_t numItemsRemaining
)
517 uint32_t numActive
= (numItemsRemaining
>= KNOB_SIMD_WIDTH
) ? KNOB_SIMD_WIDTH
: numItemsRemaining
;
518 uint32_t mask
= (numActive
> 0) ? ((1 << numActive
) - 1) : 0;
519 return _simd_castps_si(vMask(mask
));
522 //////////////////////////////////////////////////////////////////////////
523 /// @brief StreamOut - Streams vertex data out to SO buffers.
524 /// Generally, we are only streaming out a SIMDs worth of triangles.
525 /// @param pDC - pointer to draw context.
526 /// @param workerId - thread's worker id. Even thread has a unique id.
527 /// @param numPrims - Number of prims to streamout (e.g. points, lines, tris)
528 static void StreamOut(
533 uint32_t streamIndex
)
535 RDTSC_START(FEStreamout
);
537 SWR_CONTEXT
* pContext
= pDC
->pContext
;
539 const API_STATE
& state
= GetApiState(pDC
);
540 const SWR_STREAMOUT_STATE
&soState
= state
.soState
;
542 uint32_t soVertsPerPrim
= NumVertsPerPrim(pa
.binTopology
, false);
544 // The pPrimData buffer is sparse in that we allocate memory for all 32 attributes for each vertex.
545 uint32_t primDataDwordVertexStride
= (KNOB_NUM_ATTRIBUTES
* sizeof(float) * 4) / sizeof(uint32_t);
547 SWR_STREAMOUT_CONTEXT soContext
= { 0 };
549 // Setup buffer state pointers.
550 for (uint32_t i
= 0; i
< 4; ++i
)
552 soContext
.pBuffer
[i
] = &state
.soBuffer
[i
];
555 uint32_t numPrims
= pa
.NumPrims();
556 for (uint32_t primIndex
= 0; primIndex
< numPrims
; ++primIndex
)
559 uint32_t soMask
= soState
.streamMasks
[streamIndex
];
561 // Write all entries into primitive data buffer for SOS.
562 while (_BitScanForward(&slot
, soMask
))
564 __m128 attrib
[MAX_NUM_VERTS_PER_PRIM
]; // prim attribs (always 4 wide)
565 uint32_t paSlot
= slot
+ VERTEX_ATTRIB_START_SLOT
;
566 pa
.AssembleSingle(paSlot
, primIndex
, attrib
);
568 // Attribute offset is relative offset from start of vertex.
569 // Note that attributes start at slot 1 in the PA buffer. We need to write this
570 // to prim data starting at slot 0. Which is why we do (slot - 1).
571 // Also note: GL works slightly differently, and needs slot 0
572 uint32_t primDataAttribOffset
= slot
* sizeof(float) * 4 / sizeof(uint32_t);
574 // Store each vertex's attrib at appropriate locations in pPrimData buffer.
575 for (uint32_t v
= 0; v
< soVertsPerPrim
; ++v
)
577 uint32_t* pPrimDataAttrib
= pPrimData
+ primDataAttribOffset
+ (v
* primDataDwordVertexStride
);
579 _mm_store_ps((float*)pPrimDataAttrib
, attrib
[v
]);
581 soMask
&= ~(1 << slot
);
584 // Update pPrimData pointer
585 soContext
.pPrimData
= pPrimData
;
588 SWR_ASSERT(state
.pfnSoFunc
[streamIndex
] != nullptr, "Trying to execute uninitialized streamout jit function.");
589 state
.pfnSoFunc
[streamIndex
](soContext
);
592 // Update SO write offset. The driver provides memory for the update.
593 for (uint32_t i
= 0; i
< 4; ++i
)
595 if (state
.soBuffer
[i
].pWriteOffset
)
597 *state
.soBuffer
[i
].pWriteOffset
= soContext
.pBuffer
[i
]->streamOffset
* sizeof(uint32_t);
599 // The SOS increments the existing write offset. So we don't want to increment
600 // the SoWriteOffset stat using an absolute offset instead of relative.
601 SET_STAT(SoWriteOffset
[i
], soContext
.pBuffer
[i
]->streamOffset
);
605 UPDATE_STAT(SoPrimStorageNeeded
[streamIndex
], soContext
.numPrimStorageNeeded
);
606 UPDATE_STAT(SoNumPrimsWritten
[streamIndex
], soContext
.numPrimsWritten
);
608 RDTSC_STOP(FEStreamout
, 1, 0);
611 //////////////////////////////////////////////////////////////////////////
612 /// @brief Computes number of invocations. The current index represents
613 /// the start of the SIMD. The max index represents how much work
614 /// items are remaining. If there is less then a SIMD's left of work
615 /// then return the remaining amount of work.
616 /// @param curIndex - The start index for the SIMD.
617 /// @param maxIndex - The last index for all work items.
618 static INLINE
uint32_t GetNumInvocations(
622 uint32_t remainder
= (maxIndex
- curIndex
);
623 return (remainder
>= KNOB_SIMD_WIDTH
) ? KNOB_SIMD_WIDTH
: remainder
;
626 //////////////////////////////////////////////////////////////////////////
627 /// @brief Converts a streamId buffer to a cut buffer for the given stream id.
628 /// The geometry shader will loop over each active streamout buffer, assembling
629 /// primitives for the downstream stages. When multistream output is enabled,
630 /// the generated stream ID buffer from the GS needs to be converted to a cut
631 /// buffer for the primitive assembler.
632 /// @param stream - stream id to generate the cut buffer for
633 /// @param pStreamIdBase - pointer to the stream ID buffer
634 /// @param numEmittedVerts - Number of total verts emitted by the GS
635 /// @param pCutBuffer - output buffer to write cuts to
636 void ProcessStreamIdBuffer(uint32_t stream
, uint8_t* pStreamIdBase
, uint32_t numEmittedVerts
, uint8_t *pCutBuffer
)
638 SWR_ASSERT(stream
< MAX_SO_STREAMS
);
640 uint32_t numInputBytes
= (numEmittedVerts
* 2 + 7) / 8;
641 uint32_t numOutputBytes
= std::max(numInputBytes
/ 2, 1U);
643 for (uint32_t b
= 0; b
< numOutputBytes
; ++b
)
645 uint8_t curInputByte
= pStreamIdBase
[2*b
];
647 for (uint32_t i
= 0; i
< 4; ++i
)
649 if ((curInputByte
& 0x3) != stream
)
656 curInputByte
= pStreamIdBase
[2 * b
+ 1];
657 for (uint32_t i
= 0; i
< 4; ++i
)
659 if ((curInputByte
& 0x3) != stream
)
661 outByte
|= (1 << (i
+ 4));
666 *pCutBuffer
++ = outByte
;
// Per-thread GS execution context, reused across GeometryShaderStage calls
// to avoid re-allocating the (large) context on every invocation.
THREAD SWR_GS_CONTEXT tlsGsContext;
672 //////////////////////////////////////////////////////////////////////////
673 /// @brief Implements GS stage.
674 /// @param pDC - pointer to draw context.
675 /// @param workerId - thread's worker id. Even thread has a unique id.
676 /// @param pa - The primitive assembly object.
677 /// @param pGsOut - output stream for GS
679 typename HasStreamOutT
,
681 static void GeometryShaderStage(
687 void* pStreamCutBuffer
,
688 uint32_t* pSoPrimData
,
691 RDTSC_START(FEGeometryShader
);
693 SWR_CONTEXT
* pContext
= pDC
->pContext
;
695 const API_STATE
& state
= GetApiState(pDC
);
696 const SWR_GS_STATE
* pState
= &state
.gsState
;
698 SWR_ASSERT(pGsOut
!= nullptr, "GS output buffer should be initialized");
699 SWR_ASSERT(pCutBuffer
!= nullptr, "GS output cut buffer should be initialized");
701 tlsGsContext
.pStream
= (uint8_t*)pGsOut
;
702 tlsGsContext
.pCutOrStreamIdBuffer
= (uint8_t*)pCutBuffer
;
703 tlsGsContext
.PrimitiveID
= primID
;
705 uint32_t numVertsPerPrim
= NumVertsPerPrim(pa
.binTopology
, true);
706 simdvector attrib
[MAX_ATTRIBUTES
];
708 // assemble all attributes for the input primitive
709 for (uint32_t slot
= 0; slot
< pState
->numInputAttribs
; ++slot
)
711 uint32_t attribSlot
= VERTEX_ATTRIB_START_SLOT
+ slot
;
712 pa
.Assemble(attribSlot
, attrib
);
714 for (uint32_t i
= 0; i
< numVertsPerPrim
; ++i
)
716 tlsGsContext
.vert
[i
].attrib
[attribSlot
] = attrib
[i
];
721 pa
.Assemble(VERTEX_POSITION_SLOT
, attrib
);
722 for (uint32_t i
= 0; i
< numVertsPerPrim
; ++i
)
724 tlsGsContext
.vert
[i
].attrib
[VERTEX_POSITION_SLOT
] = attrib
[i
];
727 const uint32_t vertexStride
= sizeof(simdvertex
);
728 const uint32_t numSimdBatches
= (state
.gsState
.maxNumVerts
+ KNOB_SIMD_WIDTH
- 1) / KNOB_SIMD_WIDTH
;
729 const uint32_t inputPrimStride
= numSimdBatches
* vertexStride
;
730 const uint32_t instanceStride
= inputPrimStride
* KNOB_SIMD_WIDTH
;
731 uint32_t cutPrimStride
;
732 uint32_t cutInstanceStride
;
734 if (pState
->isSingleStream
)
736 cutPrimStride
= (state
.gsState
.maxNumVerts
+ 7) / 8;
737 cutInstanceStride
= cutPrimStride
* KNOB_SIMD_WIDTH
;
741 cutPrimStride
= AlignUp(state
.gsState
.maxNumVerts
* 2 / 8, 4);
742 cutInstanceStride
= cutPrimStride
* KNOB_SIMD_WIDTH
;
745 // record valid prims from the frontend to avoid over binning the newly generated
747 uint32_t numInputPrims
= pa
.NumPrims();
749 for (uint32_t instance
= 0; instance
< pState
->instanceCount
; ++instance
)
751 tlsGsContext
.InstanceID
= instance
;
752 tlsGsContext
.mask
= GenerateMask(numInputPrims
);
754 // execute the geometry shader
755 state
.pfnGsFunc(GetPrivateState(pDC
), &tlsGsContext
);
757 tlsGsContext
.pStream
+= instanceStride
;
758 tlsGsContext
.pCutOrStreamIdBuffer
+= cutInstanceStride
;
761 // set up new binner and state for the GS output topology
762 PFN_PROCESS_PRIMS pfnClipFunc
= nullptr;
765 switch (pState
->outputTopology
)
767 case TOP_TRIANGLE_STRIP
: pfnClipFunc
= ClipTriangles
; break;
768 case TOP_LINE_STRIP
: pfnClipFunc
= ClipLines
; break;
769 case TOP_POINT_LIST
: pfnClipFunc
= ClipPoints
; break;
770 default: SWR_ASSERT(false, "Unexpected GS output topology: %d", pState
->outputTopology
);
774 // foreach input prim:
775 // - setup a new PA based on the emitted verts for that prim
776 // - loop over the new verts, calling PA to assemble each prim
777 uint32_t* pVertexCount
= (uint32_t*)&tlsGsContext
.vertexCount
;
778 uint32_t* pPrimitiveId
= (uint32_t*)&primID
;
780 uint32_t totalPrimsGenerated
= 0;
781 for (uint32_t inputPrim
= 0; inputPrim
< numInputPrims
; ++inputPrim
)
783 uint8_t* pInstanceBase
= (uint8_t*)pGsOut
+ inputPrim
* inputPrimStride
;
784 uint8_t* pCutBufferBase
= (uint8_t*)pCutBuffer
+ inputPrim
* cutPrimStride
;
785 for (uint32_t instance
= 0; instance
< pState
->instanceCount
; ++instance
)
787 uint32_t numEmittedVerts
= pVertexCount
[inputPrim
];
788 if (numEmittedVerts
== 0)
793 uint8_t* pBase
= pInstanceBase
+ instance
* instanceStride
;
794 uint8_t* pCutBase
= pCutBufferBase
+ instance
* cutInstanceStride
;
797 if (_BitScanReverse(&numAttribs
, state
.feAttribMask
))
806 for (uint32_t stream
= 0; stream
< MAX_SO_STREAMS
; ++stream
)
808 bool processCutVerts
= false;
810 uint8_t* pCutBuffer
= pCutBase
;
812 // assign default stream ID, only relevant when GS is outputting a single stream
813 uint32_t streamID
= 0;
814 if (pState
->isSingleStream
)
816 processCutVerts
= true;
817 streamID
= pState
->singleStreamID
;
818 if (streamID
!= stream
) continue;
822 // early exit if this stream is not enabled for streamout
823 if (HasStreamOutT::value
&& !state
.soState
.streamEnable
[stream
])
828 // multi-stream output, need to translate StreamID buffer to a cut buffer
829 ProcessStreamIdBuffer(stream
, pCutBase
, numEmittedVerts
, (uint8_t*)pStreamCutBuffer
);
830 pCutBuffer
= (uint8_t*)pStreamCutBuffer
;
831 processCutVerts
= false;
834 PA_STATE_CUT
gsPa(pDC
, pBase
, numEmittedVerts
, pCutBuffer
, numEmittedVerts
, numAttribs
, pState
->outputTopology
, processCutVerts
);
836 while (gsPa
.GetNextStreamOutput())
840 bool assemble
= gsPa
.Assemble(VERTEX_POSITION_SLOT
, attrib
);
844 totalPrimsGenerated
+= gsPa
.NumPrims();
846 if (HasStreamOutT::value
)
848 StreamOut(pDC
, gsPa
, workerId
, pSoPrimData
, stream
);
851 if (HasRastT::value
&& state
.soState
.streamToRasterizer
== stream
)
854 // pull primitiveID from the GS output if available
855 if (state
.gsState
.emitsPrimitiveID
)
857 simdvector primIdAttrib
[3];
858 gsPa
.Assemble(VERTEX_PRIMID_SLOT
, primIdAttrib
);
859 vPrimId
= _simd_castps_si(primIdAttrib
[0].x
);
863 vPrimId
= _simd_set1_epi32(pPrimitiveId
[inputPrim
]);
866 pfnClipFunc(pDC
, gsPa
, workerId
, attrib
, GenMask(gsPa
.NumPrims()), vPrimId
);
869 } while (gsPa
.NextPrim());
875 // update GS pipeline stats
876 UPDATE_STAT(GsInvocations
, numInputPrims
* pState
->instanceCount
);
877 UPDATE_STAT(GsPrimitives
, totalPrimsGenerated
);
879 RDTSC_STOP(FEGeometryShader
, 1, 0);
882 //////////////////////////////////////////////////////////////////////////
883 /// @brief Allocate GS buffers
884 /// @param pDC - pointer to draw context.
885 /// @param state - API state
886 /// @param ppGsOut - pointer to GS output buffer allocation
887 /// @param ppCutBuffer - pointer to GS output cut buffer allocation
888 static INLINE
void AllocateGsBuffers(DRAW_CONTEXT
* pDC
, const API_STATE
& state
, void** ppGsOut
, void** ppCutBuffer
,
889 void **ppStreamCutBuffer
)
891 auto pArena
= pDC
->pArena
;
892 SWR_ASSERT(pArena
!= nullptr);
893 SWR_ASSERT(state
.gsState
.gsEnable
);
894 // allocate arena space to hold GS output verts
895 // @todo pack attribs
896 // @todo support multiple streams
897 const uint32_t vertexStride
= sizeof(simdvertex
);
898 const uint32_t numSimdBatches
= (state
.gsState
.maxNumVerts
+ KNOB_SIMD_WIDTH
- 1) / KNOB_SIMD_WIDTH
;
899 uint32_t size
= state
.gsState
.instanceCount
* numSimdBatches
* vertexStride
* KNOB_SIMD_WIDTH
;
900 *ppGsOut
= pArena
->AllocAligned(size
, KNOB_SIMD_WIDTH
* sizeof(float));
902 const uint32_t cutPrimStride
= (state
.gsState
.maxNumVerts
+ 7) / 8;
903 const uint32_t streamIdPrimStride
= AlignUp(state
.gsState
.maxNumVerts
* 2 / 8, 4);
904 const uint32_t cutBufferSize
= cutPrimStride
* state
.gsState
.instanceCount
* KNOB_SIMD_WIDTH
;
905 const uint32_t streamIdSize
= streamIdPrimStride
* state
.gsState
.instanceCount
* KNOB_SIMD_WIDTH
;
907 // allocate arena space to hold cut or streamid buffer, which is essentially a bitfield sized to the
908 // maximum vertex output as defined by the GS state, per SIMD lane, per GS instance
910 // allocate space for temporary per-stream cut buffer if multi-stream is enabled
911 if (state
.gsState
.isSingleStream
)
913 *ppCutBuffer
= pArena
->AllocAligned(cutBufferSize
, KNOB_SIMD_WIDTH
* sizeof(float));
914 *ppStreamCutBuffer
= nullptr;
918 *ppCutBuffer
= pArena
->AllocAligned(streamIdSize
, KNOB_SIMD_WIDTH
* sizeof(float));
919 *ppStreamCutBuffer
= pArena
->AllocAligned(cutBufferSize
, KNOB_SIMD_WIDTH
* sizeof(float));
//////////////////////////////////////////////////////////////////////////
/// @brief Contains all data generated by the HS and passed to the
///        tessellator and DS.
/// NOTE(review): extraction of this chunk dropped some original lines; members
/// referenced elsewhere in the file (e.g. pTxCtx / tsCtxSize used by
/// TessellationStages) are not visible here — confirm against the full file.
struct TessellationThreadLocalData
    SWR_HS_CONTEXT hsContext;                 // hull shader execution context for one SIMD of patches
    ScalarPatch patchData[KNOB_SIMD_WIDTH];   // HS control-point output, one patch per SIMD lane

    // Scratch output buffer for the domain shader; grown on demand by
    // TessellationStages when a patch needs more vectors than current capacity.
    simdscalar* pDSOutput;
    size_t numDSOutputVectors;                // current capacity of pDSOutput, in simd vectors

// Per-worker-thread tessellation scratch data, lazily allocated by
// AllocateTessellationData. THREAD marks thread-local storage.
THREAD TessellationThreadLocalData* gt_pTessellationThreadData = nullptr;
940 //////////////////////////////////////////////////////////////////////////
941 /// @brief Allocate tessellation data for this worker thread.
943 static void AllocateTessellationData(SWR_CONTEXT
* pContext
)
945 /// @TODO - Don't use thread local storage. Use Worker local storage instead.
946 if (gt_pTessellationThreadData
== nullptr)
948 gt_pTessellationThreadData
= (TessellationThreadLocalData
*)
949 _aligned_malloc(sizeof(TessellationThreadLocalData
), 64);
950 memset(gt_pTessellationThreadData
, 0, sizeof(*gt_pTessellationThreadData
));
//////////////////////////////////////////////////////////////////////////
/// @brief Implements Tessellation Stages.
/// @param pDC - pointer to draw context.
/// @param workerId - thread's worker id. Each thread has a unique id.
/// @param pa - The primitive assembly object.
/// @param pGsOut - output stream for GS
/// NOTE(review): this chunk elides several original lines (braces, part of the
/// parameter list, the `continue` on empty patches, the PA_TESS tessPa
/// construction head, and the TSInitCtx retry call line). Code tokens below
/// are preserved exactly as visible; confirm elisions against the full file.
typename HasGeometryShaderT,
typename HasStreamOutT,
static void TessellationStages(
void* pCutStreamBuffer,
uint32_t* pSoPrimData,
const API_STATE& state = GetApiState(pDC);
const SWR_TS_STATE& tsState = state.tsState;
SWR_CONTEXT *pContext = pDC->pContext; // Needed for UPDATE_STATS macro
SWR_ASSERT(gt_pTessellationThreadData);
// Initialize the fixed-function tessellator with the thread-local context
// buffer; a null return means the buffer is too small.
HANDLE tsCtx = TSInitCtx(
    tsState.partitioning,
    tsState.tsOutputTopology,
    gt_pTessellationThreadData->pTxCtx,
    gt_pTessellationThreadData->tsCtxSize);
if (tsCtx == nullptr)
// Grow the thread-local tessellator context and retry the init
// (retry call head elided in this chunk).
gt_pTessellationThreadData->pTxCtx = _aligned_malloc(gt_pTessellationThreadData->tsCtxSize, 64);
    tsState.partitioning,
    tsState.tsOutputTopology,
    gt_pTessellationThreadData->pTxCtx,
    gt_pTessellationThreadData->tsCtxSize);
// Select the clipper entry point matching the topology the DS will emit.
PFN_PROCESS_PRIMS pfnClipFunc = nullptr;
switch (tsState.postDSTopology)
case TOP_TRIANGLE_LIST: pfnClipFunc = ClipTriangles; break;
case TOP_LINE_LIST:     pfnClipFunc = ClipLines; break;
case TOP_POINT_LIST:    pfnClipFunc = ClipPoints; break;
default: SWR_ASSERT(false, "Unexpected DS output topology: %d", tsState.postDSTopology);
// Bind the thread-local HS context: control-point output goes to patchData.
SWR_HS_CONTEXT& hsContext = gt_pTessellationThreadData->hsContext;
hsContext.pCPout = gt_pTessellationThreadData->patchData;
hsContext.PrimitiveID = primID;
uint32_t numVertsPerPrim = NumVertsPerPrim(pa.binTopology, false);
// Max storage for one attribute for an entire simdprimitive
simdvector simdattrib[MAX_NUM_VERTS_PER_PRIM];
// assemble all attributes for the input primitives
for (uint32_t slot = 0; slot < tsState.numHsInputAttribs; ++slot)
uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + slot;
pa.Assemble(attribSlot, simdattrib);
for (uint32_t i = 0; i < numVertsPerPrim; ++i)
hsContext.vert[i].attrib[attribSlot] = simdattrib[i];
// Poison the HS output with 0x90 so stale data is obvious when debugging.
memset(hsContext.pCPout, 0x90, sizeof(ScalarPatch) * KNOB_SIMD_WIDTH);
uint32_t numPrims = pa.NumPrims();
hsContext.mask = GenerateMask(numPrims);
// Run the hull shader for the whole SIMD of patches.
RDTSC_START(FEHullShader);
state.pfnHsFunc(GetPrivateState(pDC), &hsContext);
RDTSC_STOP(FEHullShader, 0, 0);
UPDATE_STAT(HsInvocations, numPrims);
const uint32_t* pPrimId = (const uint32_t*)&primID;
// Tessellate each patch of the SIMD independently (tessellator is scalar).
for (uint32_t p = 0; p < numPrims; ++p)
SWR_TS_TESSELLATED_DATA tsData = { 0 };
RDTSC_START(FETessellation);
TSTessellate(tsCtx, hsContext.pCPout[p].tessFactors, tsData);
RDTSC_STOP(FETessellation, 0, 0);
// Fully culled patch: nothing to run the DS on (skip elided in chunk).
if (tsData.NumPrimitives == 0)
SWR_ASSERT(tsData.NumDomainPoints);
// Allocate DS Output memory: grow the thread-local scratch buffer only
// when this patch needs more vectors than the current capacity.
uint32_t requiredDSVectorInvocations = AlignUp(tsData.NumDomainPoints, KNOB_SIMD_WIDTH) / KNOB_SIMD_WIDTH;
size_t requiredDSOutputVectors = requiredDSVectorInvocations * tsState.numDsOutputAttribs;
size_t requiredAllocSize = sizeof(simdvector) * requiredDSOutputVectors;
if (requiredDSOutputVectors > gt_pTessellationThreadData->numDSOutputVectors)
_aligned_free(gt_pTessellationThreadData->pDSOutput);
gt_pTessellationThreadData->pDSOutput = (simdscalar*)_aligned_malloc(requiredAllocSize, 64);
gt_pTessellationThreadData->numDSOutputVectors = requiredDSOutputVectors;
SWR_ASSERT(gt_pTessellationThreadData->pDSOutput);
SWR_ASSERT(gt_pTessellationThreadData->numDSOutputVectors >= requiredDSOutputVectors);
// Debug poison of DS output (enclosing debug guard elided in chunk).
memset(gt_pTessellationThreadData->pDSOutput, 0x90, requiredAllocSize);
// Run Domain Shader
SWR_DS_CONTEXT dsContext;
dsContext.PrimitiveID = pPrimId[p];
dsContext.pCpIn = &hsContext.pCPout[p];
dsContext.pDomainU = (simdscalar*)tsData.pDomainPointsU;
dsContext.pDomainV = (simdscalar*)tsData.pDomainPointsV;
dsContext.pOutputData = gt_pTessellationThreadData->pDSOutput;
dsContext.vectorStride = requiredDSVectorInvocations;
uint32_t dsInvocations = 0;
// One DS dispatch per SIMD of domain points; the final iteration's mask
// disables lanes past NumDomainPoints.
for (dsContext.vectorOffset = 0; dsContext.vectorOffset < requiredDSVectorInvocations; ++dsContext.vectorOffset)
dsContext.mask = GenerateMask(tsData.NumDomainPoints - dsInvocations);
RDTSC_START(FEDomainShader);
state.pfnDsFunc(GetPrivateState(pDC), &dsContext);
RDTSC_STOP(FEDomainShader, 0, 0);
dsInvocations += KNOB_SIMD_WIDTH;
UPDATE_STAT(DsInvocations, tsData.NumDomainPoints);
// Feed DS output into the tessellation primitive assembler
// (PA_TESS tessPa(...) construction head elided in this chunk).
dsContext.pOutputData,
dsContext.vectorStride,
tsState.numDsOutputAttribs,
tsData.NumPrimitives,
tsState.postDSTopology);
// Drain the tessellated primitives through the remaining FE stages.
while (tessPa.HasWork())
if (HasGeometryShaderT::value)
GeometryShaderStage<HasStreamOutT, HasRastT>(
pDC, workerId, tessPa, pGsOut, pCutBuffer, pCutStreamBuffer, pSoPrimData,
_simd_set1_epi32(dsContext.PrimitiveID));
if (HasStreamOutT::value)
StreamOut(pDC, tessPa, workerId, pSoPrimData, 0);
if (HasRastT::value)
simdvector prim[3]; // Only deal with triangles, lines, or points
RDTSC_START(FEPAAssemble);
#if SWR_ENABLE_ASSERTS
// ("bool assemble =" capture under the assert guard elided in chunk)
tessPa.Assemble(VERTEX_POSITION_SLOT, prim);
RDTSC_STOP(FEPAAssemble, 1, 0);
SWR_ASSERT(assemble);
SWR_ASSERT(pfnClipFunc);
pfnClipFunc(pDC, tessPa, workerId, prim,
GenMask(tessPa.NumPrims()), _simd_set1_epi32(dsContext.PrimitiveID));
} // while (tessPa.HasWork())
} // for (uint32_t p = 0; p < numPrims; ++p)
TSDestroyCtx(tsCtx);
//////////////////////////////////////////////////////////////////////////
/// @brief FE handler for SwrDraw.
/// @tparam IsIndexedT - Is indexed drawing enabled
/// @tparam HasTessellationT - Is tessellation enabled
/// @tparam HasGeometryShaderT::value - Is the geometry shader stage enabled
/// @tparam HasStreamOutT - Is stream-out enabled
/// @tparam HasRastT - Is rasterization enabled
/// @param pContext - pointer to SWR context.
/// @param pDC - pointer to draw context.
/// @param workerId - thread's worker id.
/// @param pUserData - Pointer to DRAW_WORK
/// NOTE(review): this chunk elides some original lines (the `void ProcessDraw(`
/// head, `do {`, braces, and declarations such as `vin`, `i`, `indexSize`,
/// `vIndex`, and the index-type switch heads). Code tokens below are preserved
/// exactly as visible; confirm elisions against the full file.
typename IsIndexedT,
typename HasTessellationT,
typename HasGeometryShaderT,
typename HasStreamOutT,
SWR_CONTEXT *pContext,
#if KNOB_ENABLE_TOSS_POINTS
// Early-out knob for perf triage: drop draws before any FE work.
if (KNOB_TOSS_QUEUE_FE)
RDTSC_START(FEProcessDraw);
DRAW_WORK& work = *(DRAW_WORK*)pUserData;
const API_STATE& state = GetApiState(pDC);
// Lane offsets 0..7 used to synthesize per-lane vertex ids for non-indexed draws.
__m256i vScale = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
SWR_VS_CONTEXT vsContext;
uint32_t endVertex = work.numVerts;
const int32_t* pLastRequestedIndex = nullptr;
// For indexed draws, compute the byte address one past the last index the
// draw will read, per index type (switch heads elided in chunk).
if (IsIndexedT::value)
indexSize = sizeof(uint32_t);
pLastRequestedIndex = &(work.pIB[endVertex]);
indexSize = sizeof(uint16_t);
// nasty address offset to last index
pLastRequestedIndex = (int32_t*)(&(((uint16_t*)work.pIB)[endVertex]));
indexSize = sizeof(uint8_t);
// nasty address offset to last index
pLastRequestedIndex = (int32_t*)(&(((uint8_t*)work.pIB)[endVertex]));
// No cuts, prune partial primitives.
endVertex = GetNumVerts(state.topology, GetNumPrims(state.topology, work.numVerts));
SWR_FETCH_CONTEXT fetchInfo = { 0 };
fetchInfo.pStreams = &state.vertexBuffers[0];
fetchInfo.StartInstance = work.startInstance;
fetchInfo.StartVertex = 0;
vsContext.pVin = &vin;
if (IsIndexedT::value)
fetchInfo.BaseVertex = work.baseVertex;
// if the entire index buffer isn't being consumed, set the last index
// so that fetches < a SIMD wide will be masked off
fetchInfo.pLastIndex = (const int32_t*)(((uint8_t*)state.indexBuffer.pIndices) + state.indexBuffer.size);
if (pLastRequestedIndex < fetchInfo.pLastIndex)
fetchInfo.pLastIndex = pLastRequestedIndex;
fetchInfo.StartVertex = work.startVertex;
#ifdef KNOB_ENABLE_RDTSC
uint32_t numPrims = GetNumPrims(state.topology, work.numVerts);
// Per-draw scratch for the optional GS stage.
void* pGsOut = nullptr;
void* pCutBuffer = nullptr;
void* pStreamCutBuffer = nullptr;
if (HasGeometryShaderT::value)
AllocateGsBuffers(pDC, state, &pGsOut, &pCutBuffer, &pStreamCutBuffer);
// Sanity-check the HS/DS function pointers against the tessellation toggle.
if (HasTessellationT::value)
SWR_ASSERT(state.tsState.tsEnable == true);
SWR_ASSERT(state.pfnHsFunc != nullptr);
SWR_ASSERT(state.pfnDsFunc != nullptr);
AllocateTessellationData(pContext);
SWR_ASSERT(state.tsState.tsEnable == false);
SWR_ASSERT(state.pfnHsFunc == nullptr);
SWR_ASSERT(state.pfnDsFunc == nullptr);
// allocate space for streamout input prim data
uint32_t* pSoPrimData = nullptr;
if (HasStreamOutT::value)
pSoPrimData = (uint32_t*)pDC->pArena->AllocAligned(4096, 16);
// Capture current SO write offsets for the stats harness.
for (uint32_t i = 0; i < 4; ++i)
SET_STAT(SoWriteOffset[i], state.soBuffer[i].streamOffset);
// choose primitive assembler
PA_FACTORY<IsIndexedT> paFactory(pDC, state.topology, work.numVerts);
PA_STATE& pa = paFactory.GetPA();
/// @todo: temporarily move instance loop in the FE to ensure SO ordering
for (uint32_t instanceNum = 0; instanceNum < work.numInstances; instanceNum++)
// Indexed draws fetch through the IB; non-indexed draws synthesize
// sequential ids from startVertexID + lane offsets.
if (IsIndexedT::value)
fetchInfo.pIndices = work.pIB;
vIndex = _simd_add_epi32(_simd_set1_epi32(work.startVertexID), vScale);
fetchInfo.pIndices = (const int32_t*)&vIndex;
fetchInfo.CurInstance = instanceNum;
vsContext.InstanceID = instanceNum;
while (pa.HasWork())
// PaGetNextVsOutput currently has the side effect of updating some PA state machine state.
// So we need to keep this outside of (i < endVertex) check.
simdmask* pvCutIndices = nullptr;
if (IsIndexedT::value)
pvCutIndices = &pa.GetNextVsIndices();
simdvertex& vout = pa.GetNextVsOutput();
vsContext.pVout = &vout;
// 1. Execute FS/VS for a single SIMD.
RDTSC_START(FEFetchShader);
state.pfnFetchFunc(fetchInfo, vin);
RDTSC_STOP(FEFetchShader, 0, 0);
// forward fetch generated vertex IDs to the vertex shader
vsContext.VertexID = fetchInfo.VertexID;
// Setup active mask for vertex shader.
vsContext.mask = GenerateMask(endVertex - i);
// forward cut mask to the PA
if (IsIndexedT::value)
*pvCutIndices = _simd_movemask_ps(_simd_castsi_ps(fetchInfo.CutMask));
UPDATE_STAT(IaVertices, GetNumInvocations(i, endVertex));
#if KNOB_ENABLE_TOSS_POINTS
if (!KNOB_TOSS_FETCH)
RDTSC_START(FEVertexShader);
state.pfnVertexFunc(GetPrivateState(pDC), &vsContext);
RDTSC_STOP(FEVertexShader, 0, 0);
UPDATE_STAT(VsInvocations, GetNumInvocations(i, endVertex));
// 2. Assemble primitives given the last two SIMD.
// (do { head of the per-prim loop elided in this chunk)
simdvector prim[MAX_NUM_VERTS_PER_PRIM];
// PaAssemble returns false if there is not enough verts to assemble.
RDTSC_START(FEPAAssemble);
bool assemble = pa.Assemble(VERTEX_POSITION_SLOT, prim);
RDTSC_STOP(FEPAAssemble, 1, 0);
#if KNOB_ENABLE_TOSS_POINTS
if (!KNOB_TOSS_FETCH)
#if KNOB_ENABLE_TOSS_POINTS
UPDATE_STAT(IaPrimitives, pa.NumPrims());
// Route assembled prims to tessellation, GS, SO, and/or rasterization.
if (HasTessellationT::value)
TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>(
pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, pa.GetPrimID(work.startPrimID));
else if (HasGeometryShaderT::value)
GeometryShaderStage<HasStreamOutT, HasRastT>(
pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, pa.GetPrimID(work.startPrimID));
// If streamout is enabled then stream vertices out to memory.
if (HasStreamOutT::value)
StreamOut(pDC, pa, workerId, pSoPrimData, 0);
if (HasRastT::value)
SWR_ASSERT(pDC->pState->pfnProcessPrims);
pDC->pState->pfnProcessPrims(pDC, pa, workerId, prim,
GenMask(pa.NumPrims()), pa.GetPrimID(work.startPrimID));
} while (pa.NextPrim());
// Advance to the next SIMD of input vertices.
i += KNOB_SIMD_WIDTH;
if (IsIndexedT::value)
fetchInfo.pIndices = (int*)((uint8_t*)fetchInfo.pIndices + KNOB_SIMD_WIDTH * indexSize);
vIndex = _simd_add_epi32(vIndex, _simd_set1_epi32(KNOB_SIMD_WIDTH));
RDTSC_STOP(FEProcessDraw, numPrims * work.numInstances, pDC->drawId);
// Adapter used with TemplateArgUnroller to turn a set of runtime booleans
// into the matching compile-time ProcessDraw instantiation.
struct FEDrawChooser
    typedef PFN_FE_WORK_FUNC FuncType;   // signature of the returned FE worker

    // Returns the ProcessDraw instantiation for the given template arg pack.
    template <typename... ArgsB>
    static FuncType GetFunc()
        return ProcessDraw<ArgsB...>;
// Selector for correct templated Draw front-end function
// Maps the runtime pipeline configuration (indexed draw, tessellation, GS,
// stream-out, rasterization) onto the matching ProcessDraw instantiation via
// TemplateArgUnroller<FEDrawChooser>.
// NOTE(review): the parameter lines for IsIndexed and HasStreamOut are elided
// from this chunk; both are referenced in the call below.
PFN_FE_WORK_FUNC GetProcessDrawFunc(
bool HasTessellation,
bool HasGeometryShader,
bool HasRasterization)
return TemplateArgUnroller<FEDrawChooser>::GetFunc(IsIndexed, HasTessellation, HasGeometryShader, HasStreamOut, HasRasterization);
//////////////////////////////////////////////////////////////////////////
/// @brief Processes attributes for the backend based on linkage mask and
///        linkage map. Essentially just doing an SOA->AOS conversion and pack.
/// @param pDC - Draw context
/// @param pa - Primitive Assembly state
/// @param linkageMask - Specifies which VS outputs are routed to PS.
/// @param pLinkageMap - maps VS attribute slot to PS slot
/// @param triIndex - Triangle to process attributes for
/// @param pBuffer - Output result
/// NOTE(review): this chunk elides some original lines (the pDC/pa/triIndex/
/// pBuffer parameter lines, the `slot` declaration, `++mapIdx`, and the
/// `pBuffer += 4` advances after each store). Tokens below preserved as-is.
template<uint32_t NumVerts>
INLINE void ProcessAttributes(
uint32_t linkageMask,
const uint8_t* pLinkageMap,
uint32_t mapIdx = 0;
// Per-attribute flags: constant-interpolated attributes replicate the
// provoking vertex value to all verts instead of per-vertex values.
LONG constantInterpMask = pDC->pState->state.backendState.constantInterpolationMask;
const uint32_t provokingVertex = pDC->pState->state.frontendState.topologyProvokingVertex;
// Walk each enabled bit of the linkage mask (one per routed attribute).
while (_BitScanForward(&slot, linkageMask))
linkageMask &= ~(1 << slot); // done with this bit.
// compute absolute slot in vertex attrib array
uint32_t inputSlot = VERTEX_ATTRIB_START_SLOT + pLinkageMap[mapIdx];
__m128 attrib[3]; // triangle attribs (always 4 wide)
pa.AssembleSingle(inputSlot, triIndex, attrib);
if (_bittest(&constantInterpMask, mapIdx))
// Constant interpolation: replicate the provoking vertex's value.
for (uint32_t i = 0; i < NumVerts; ++i)
_mm_store_ps(pBuffer, attrib[provokingVertex]);
// Normal interpolation: pack each vertex's value in order.
for (uint32_t i = 0; i < NumVerts; ++i)
_mm_store_ps(pBuffer, attrib[i]);
// pad out the attrib buffer to 3 verts to ensure the triangle
// interpolation code in the pixel shader works correctly for the
// 3 topologies - point, line, tri. This effectively zeros out the
// effect of the missing vertices in the triangle interpolation.
for (uint32_t i = NumVerts; i < 3; ++i)
_mm_store_ps(pBuffer, attrib[NumVerts - 1]);
//////////////////////////////////////////////////////////////////////////
/// @brief Processes enabled user clip distances. Loads the active clip
///        distances from the PA, sets up barycentric equations, and
///        stores the results to the output buffer
/// @param pa - Primitive Assembly state
/// @param primIndex - primitive index to process
/// @param clipDistMask - mask of enabled clip distances
/// @param pUserClipBuffer - buffer to store results
/// NOTE(review): the declaration of `clipDist` (the _BitScanForward out
/// parameter) is elided from this chunk; tokens below preserved as-is.
template<uint32_t NumVerts>
void ProcessUserClipDist(PA_STATE& pa, uint32_t primIndex, uint8_t clipDistMask, float* pUserClipBuffer)
// Walk each enabled clip distance bit.
while (_BitScanForward(&clipDist, clipDistMask))
clipDistMask &= ~(1 << clipDist);
// Clip distances are packed 4 per attribute slot: slot selects LO/HI
// vector, comp selects the component within it.
uint32_t clipSlot = clipDist >> 2;
uint32_t clipComp = clipDist & 0x3;
uint32_t clipAttribSlot = clipSlot == 0 ?
    VERTEX_CLIPCULL_DIST_LO_SLOT : VERTEX_CLIPCULL_DIST_HI_SLOT;
__m128 primClipDist[3];
pa.AssembleSingle(clipAttribSlot, primIndex, primClipDist);
// Extract the scalar clip distance for each vertex of the primitive.
float vertClipDist[NumVerts];
for (uint32_t e = 0; e < NumVerts; ++e)
OSALIGNSIMD(float) aVertClipDist[4];
_mm_store_ps(aVertClipDist, primClipDist[e]);
vertClipDist[e] = aVertClipDist[clipComp];
// setup plane equations for barycentric interpolation in the backend:
// differences relative to the last vertex, plus the last vertex's value.
float baryCoeff[NumVerts];
for (uint32_t e = 0; e < NumVerts - 1; ++e)
baryCoeff[e] = vertClipDist[e] - vertClipDist[NumVerts - 1];
baryCoeff[NumVerts - 1] = vertClipDist[NumVerts - 1];
// Emit NumVerts coefficients per enabled clip distance.
for (uint32_t e = 0; e < NumVerts; ++e)
*(pUserClipBuffer++) = baryCoeff[e];
//////////////////////////////////////////////////////////////////////////
/// @brief Bin triangle primitives to macro tiles. Performs setup, clipping
///        culling, viewport transform, etc.
/// @param pDC - pointer to draw context.
/// @param pa - The primitive assembly object.
/// @param workerId - thread's worker id. Each thread has a unique id.
/// @param tri - Contains triangle position data for SIMDs worth of triangles.
/// @param primID - Primitive ID for each triangle.
/// NOTE(review): this chunk elides the function signature itself plus some
/// interior lines (braces, `else`, declarations such as `bbox`, `triIndex`,
/// `cullTris`, `cullCenterMask`, `vRtaii`, the BE_WORK `work` setup, the
/// `endBinTriangles:` label, and the guards before the `goto`s). Code tokens
/// below are preserved exactly as visible.
RDTSC_START(FEBinTriangles);
const API_STATE& state = GetApiState(pDC);
const SWR_RASTSTATE& rastState = state.rastState;
const SWR_FRONTEND_STATE& feState = state.frontendState;
const SWR_GS_STATE& gsState = state.gsState;
// Simple wireframe mode for debugging purposes only
// Default 1/w of 1.0 covers the vpTransformDisable path.
simdscalar vRecipW0 = _simd_set1_ps(1.0f);
simdscalar vRecipW1 = _simd_set1_ps(1.0f);
simdscalar vRecipW2 = _simd_set1_ps(1.0f);
if (!feState.vpTransformDisable)
// perspective divide
vRecipW0 = _simd_div_ps(_simd_set1_ps(1.0f), tri[0].w);
vRecipW1 = _simd_div_ps(_simd_set1_ps(1.0f), tri[1].w);
vRecipW2 = _simd_div_ps(_simd_set1_ps(1.0f), tri[2].w);
// Scale x, y, z by 1/w for all three verts.
tri[0].v[0] = _simd_mul_ps(tri[0].v[0], vRecipW0);
tri[1].v[0] = _simd_mul_ps(tri[1].v[0], vRecipW1);
tri[2].v[0] = _simd_mul_ps(tri[2].v[0], vRecipW2);
tri[0].v[1] = _simd_mul_ps(tri[0].v[1], vRecipW0);
tri[1].v[1] = _simd_mul_ps(tri[1].v[1], vRecipW1);
tri[2].v[1] = _simd_mul_ps(tri[2].v[1], vRecipW2);
tri[0].v[2] = _simd_mul_ps(tri[0].v[2], vRecipW0);
tri[1].v[2] = _simd_mul_ps(tri[1].v[2], vRecipW1);
tri[2].v[2] = _simd_mul_ps(tri[2].v[2], vRecipW2);
// viewport transform to screen coords
viewportTransform<3>(tri, state.vpMatrix[0]);
// adjust for pixel center location
simdscalar offset = g_pixelOffsets[rastState.pixelLocation];
tri[0].x = _simd_add_ps(tri[0].x, offset);
tri[0].y = _simd_add_ps(tri[0].y, offset);
tri[1].x = _simd_add_ps(tri[1].x, offset);
tri[1].y = _simd_add_ps(tri[1].y, offset);
tri[2].x = _simd_add_ps(tri[2].x, offset);
tri[2].y = _simd_add_ps(tri[2].y, offset);
// convert to fixed point
simdscalari vXi[3], vYi[3];
vXi[0] = fpToFixedPointVertical(tri[0].x);
vYi[0] = fpToFixedPointVertical(tri[0].y);
vXi[1] = fpToFixedPointVertical(tri[1].x);
vYi[1] = fpToFixedPointVertical(tri[1].y);
vXi[2] = fpToFixedPointVertical(tri[2].x);
vYi[2] = fpToFixedPointVertical(tri[2].y);
// Edge equation (A,B) setup and 2x-area determinant per triangle.
simdscalari vAi[3], vBi[3];
triangleSetupABIntVertical(vXi, vYi, vAi, vBi);
simdscalari vDet[2];
calcDeterminantIntVertical(vAi, vBi, vDet);
// Cull zero-area (degenerate) triangles: determinant == 0. The 64-bit
// determinant lanes are split across two registers; merge the movemasks.
int maskLo = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpeq_epi64(vDet[0], _simd_setzero_si())));
int maskHi = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpeq_epi64(vDet[1], _simd_setzero_si())));
int cullZeroAreaMask = maskLo | ((maskHi << KNOB_SIMD_WIDTH / 2));
uint32_t origTriMask = triMask;
triMask &= ~cullZeroAreaMask;
// determine front winding tris (determinant > 0 means CW in screen space)
maskLo = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpgt_epi64(vDet[0], _simd_setzero_si())));
maskHi = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpgt_epi64(vDet[1], _simd_setzero_si())));
int cwTriMask = maskLo | (maskHi << (KNOB_SIMD_WIDTH/2) );
uint32_t frontWindingTris;
if (rastState.frontWinding == SWR_FRONTWINDING_CW)
frontWindingTris = cwTriMask;
frontWindingTris = ~cwTriMask;
// Apply the API cull mode against the front-winding mask.
switch ((SWR_CULLMODE)rastState.cullMode)
case SWR_CULLMODE_BOTH:  cullTris = 0xffffffff; break;
case SWR_CULLMODE_NONE:  cullTris = 0x0; break;
case SWR_CULLMODE_FRONT: cullTris = frontWindingTris; break;
case SWR_CULLMODE_BACK:  cullTris = ~frontWindingTris; break;
default: SWR_ASSERT(false, "Invalid cull mode: %d", rastState.cullMode); cullTris = 0x0; break;
triMask &= ~cullTris;
if (origTriMask ^ triMask)
RDTSC_EVENT(FECullZeroAreaAndBackface, _mm_popcnt_u32(origTriMask ^ triMask), 0);
// compute per tri backface
uint32_t frontFaceMask = frontWindingTris;
uint32_t *pPrimID = (uint32_t *)&primID;
// (guard for "all tris culled" elided) — skip straight to cleanup.
goto endBinTriangles;
// Calc bounding box of triangles
calcBoundingBoxIntVertical(vXi, vYi, bbox);
// determine if triangle falls between pixel centers and discard
// only discard for non-MSAA case
// (left + 127) & ~255
// (right + 128) & ~255
if(rastState.sampleCount == SWR_MULTISAMPLE_1X)
origTriMask = triMask;
// Round left up and right down to the nearest pixel-center column; if
// they land on the same column the tri covers no centers horizontally.
simdscalari left = _simd_add_epi32(bbox.left, _simd_set1_epi32(127));
left = _simd_and_si(left, _simd_set1_epi32(~255));
simdscalari right = _simd_add_epi32(bbox.right, _simd_set1_epi32(128));
right = _simd_and_si(right, _simd_set1_epi32(~255));
simdscalari vMaskH = _simd_cmpeq_epi32(left, right);
// Same test vertically.
simdscalari top = _simd_add_epi32(bbox.top, _simd_set1_epi32(127));
top = _simd_and_si(top, _simd_set1_epi32(~255));
simdscalari bottom = _simd_add_epi32(bbox.bottom, _simd_set1_epi32(128));
bottom = _simd_and_si(bottom, _simd_set1_epi32(~255));
simdscalari vMaskV = _simd_cmpeq_epi32(top, bottom);
vMaskV = _simd_or_si(vMaskH, vMaskV);
cullCenterMask = _simd_movemask_ps(_simd_castsi_ps(vMaskV));
triMask &= ~cullCenterMask;
if(origTriMask ^ triMask)
RDTSC_EVENT(FECullBetweenCenters, _mm_popcnt_u32(origTriMask ^ triMask), 0);
// Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since right/bottom edge is exclusive.
bbox.left = _simd_max_epi32(bbox.left, _simd_set1_epi32(state.scissorInFixedPoint.left));
bbox.top = _simd_max_epi32(bbox.top, _simd_set1_epi32(state.scissorInFixedPoint.top));
bbox.right = _simd_min_epi32(_simd_sub_epi32(bbox.right, _simd_set1_epi32(1)), _simd_set1_epi32(state.scissorInFixedPoint.right));
bbox.bottom = _simd_min_epi32(_simd_sub_epi32(bbox.bottom, _simd_set1_epi32(1)), _simd_set1_epi32(state.scissorInFixedPoint.bottom));
// Cull tris completely outside scissor
simdscalari maskOutsideScissorX = _simd_cmpgt_epi32(bbox.left, bbox.right);
simdscalari maskOutsideScissorY = _simd_cmpgt_epi32(bbox.top, bbox.bottom);
simdscalari maskOutsideScissorXY = _simd_or_si(maskOutsideScissorX, maskOutsideScissorY);
uint32_t maskOutsideScissor = _simd_movemask_ps(_simd_castsi_ps(maskOutsideScissorXY));
triMask = triMask & ~maskOutsideScissor;
// (guard for "nothing survived scissor" elided)
goto endBinTriangles;
// Convert triangle bbox to macrotile units.
bbox.left = _simd_srai_epi32(bbox.left, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
bbox.top = _simd_srai_epi32(bbox.top, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
bbox.right = _simd_srai_epi32(bbox.right, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
bbox.bottom = _simd_srai_epi32(bbox.bottom, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
// Spill per-tri macrotile extents to scalar-accessible arrays.
OSALIGNSIMD(uint32_t) aMTLeft[KNOB_SIMD_WIDTH], aMTRight[KNOB_SIMD_WIDTH], aMTTop[KNOB_SIMD_WIDTH], aMTBottom[KNOB_SIMD_WIDTH];
_simd_store_si((simdscalari*)aMTLeft, bbox.left);
_simd_store_si((simdscalari*)aMTRight, bbox.right);
_simd_store_si((simdscalari*)aMTTop, bbox.top);
_simd_store_si((simdscalari*)aMTBottom, bbox.bottom);
// transpose verts needed for backend
/// @todo modify BE to take non-transformed verts
__m128 vHorizX[8], vHorizY[8], vHorizZ[8], vHorizW[8];
vTranspose3x8(vHorizX, tri[0].x, tri[1].x, tri[2].x);
vTranspose3x8(vHorizY, tri[0].y, tri[1].y, tri[2].y);
vTranspose3x8(vHorizZ, tri[0].z, tri[1].z, tri[2].z);
vTranspose3x8(vHorizW, vRecipW0, vRecipW1, vRecipW2);
// store render target array index
OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH];
if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex)
simdvector vRtai[3];
pa.Assemble(VERTEX_RTAI_SLOT, vRtai);
vRtaii = _simd_castps_si(vRtai[0].x);
_simd_store_si((simdscalari*)aRTAI, vRtaii);
// Default RTAI of 0 when the GS doesn't provide one (else branch elided).
_simd_store_si((simdscalari*)aRTAI, _simd_setzero_si());
// scan remaining valid triangles and bin each separately
while (_BitScanForward(&triIndex, triMask))
uint32_t linkageCount = state.linkageCount;
uint32_t linkageMask = state.linkageMask;
uint32_t numScalarAttribs = linkageCount * 4;
// Fill out the backend work descriptor for this triangle
// (BE_WORK `work` declaration/type setup elided in this chunk).
TRIANGLE_WORK_DESC &desc = work.desc.tri;
desc.triFlags.frontFacing = state.forceFront ? 1 : ((frontFaceMask >> triIndex) & 1);
desc.triFlags.primID = pPrimID[triIndex];
desc.triFlags.renderTargetArrayIndex = aRTAI[triIndex];
if(rastState.samplePattern == SWR_MSAA_STANDARD_PATTERN)
work.pfnWork = gRasterizerTable[rastState.scissorEnable][rastState.sampleCount];
// for center sample pattern, all samples are at pixel center; calculate coverage
// once at center and broadcast the results in the backend
work.pfnWork = gRasterizerTable[rastState.scissorEnable][SWR_MULTISAMPLE_1X];
auto pArena = pDC->pArena;
SWR_ASSERT(pArena != nullptr);
// store active attribs
float *pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
desc.pAttribs = pAttribs;
desc.numAttribs = linkageCount;
ProcessAttributes<3>(pDC, pa, linkageMask, state.linkageMap, triIndex, desc.pAttribs);
// store triangle vertex data (x, y, z, 1/w — 4 floats per component set)
desc.pTriBuffer = (float*)pArena->AllocAligned(4 * 4 * sizeof(float), 16);
_mm_store_ps(&desc.pTriBuffer[0], vHorizX[triIndex]);
_mm_store_ps(&desc.pTriBuffer[4], vHorizY[triIndex]);
_mm_store_ps(&desc.pTriBuffer[8], vHorizZ[triIndex]);
_mm_store_ps(&desc.pTriBuffer[12], vHorizW[triIndex]);
// store user clip distances
if (rastState.clipDistanceMask)
uint32_t numClipDist = _mm_popcnt_u32(rastState.clipDistanceMask);
desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 3 * sizeof(float));
ProcessUserClipDist<3>(pa, triIndex, rastState.clipDistanceMask, desc.pUserClipBuffer);
// Enqueue the triangle into every macrotile its bbox touches.
MacroTileMgr *pTileMgr = pDC->pTileMgr;
for (uint32_t y = aMTTop[triIndex]; y <= aMTBottom[triIndex]; ++y)
for (uint32_t x = aMTLeft[triIndex]; x <= aMTRight[triIndex]; ++x)
#if KNOB_ENABLE_TOSS_POINTS
if (!KNOB_TOSS_SETUP_TRIS)
pTileMgr->enqueue(x, y, &work);
// Retire this triangle from the pending mask.
triMask &= ~(1 << triIndex);
RDTSC_STOP(FEBinTriangles, 1, 0);
1860 //////////////////////////////////////////////////////////////////////////
1861 /// @brief Bin SIMD points to the backend. Only supports point size of 1
1862 /// @param pDC - pointer to draw context.
1863 /// @param pa - The primitive assembly object.
1864 /// @param workerId - thread's worker id. Each thread has a unique id.
1865 /// @param tri - Contains point position data for SIMDs worth of points.
1866 /// @param primID - Primitive ID for each point.
1875 RDTSC_START(FEBinPoints
);
1877 simdvector
& primVerts
= prim
[0];
1879 const API_STATE
& state
= GetApiState(pDC
);
1880 const SWR_FRONTEND_STATE
& feState
= state
.frontendState
;
1881 const SWR_GS_STATE
& gsState
= state
.gsState
;
1882 const SWR_RASTSTATE
& rastState
= state
.rastState
;
1884 if (!feState
.vpTransformDisable
)
1886 // perspective divide
1887 simdscalar vRecipW0
= _simd_div_ps(_simd_set1_ps(1.0f
), primVerts
.w
);
1888 primVerts
.x
= _simd_mul_ps(primVerts
.x
, vRecipW0
);
1889 primVerts
.y
= _simd_mul_ps(primVerts
.y
, vRecipW0
);
1890 primVerts
.z
= _simd_mul_ps(primVerts
.z
, vRecipW0
);
1892 // viewport transform to screen coords
1893 viewportTransform
<1>(&primVerts
, state
.vpMatrix
[0]);
1896 // adjust for pixel center location
1897 simdscalar offset
= g_pixelOffsets
[rastState
.pixelLocation
];
1898 primVerts
.x
= _simd_add_ps(primVerts
.x
, offset
);
1899 primVerts
.y
= _simd_add_ps(primVerts
.y
, offset
);
1901 // convert to fixed point
1902 simdscalari vXi
, vYi
;
1903 vXi
= fpToFixedPointVertical(primVerts
.x
);
1904 vYi
= fpToFixedPointVertical(primVerts
.y
);
1906 if (CanUseSimplePoints(pDC
))
1908 // adjust for top-left rule
1909 vXi
= _simd_sub_epi32(vXi
, _simd_set1_epi32(1));
1910 vYi
= _simd_sub_epi32(vYi
, _simd_set1_epi32(1));
1912 // cull points off the top-left edge of the viewport
1913 primMask
&= ~_simd_movemask_ps(_simd_castsi_ps(vXi
));
1914 primMask
&= ~_simd_movemask_ps(_simd_castsi_ps(vYi
));
1916 // compute macro tile coordinates
1917 simdscalari macroX
= _simd_srai_epi32(vXi
, KNOB_MACROTILE_X_DIM_FIXED_SHIFT
);
1918 simdscalari macroY
= _simd_srai_epi32(vYi
, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT
);
1920 OSALIGNSIMD(uint32_t) aMacroX
[KNOB_SIMD_WIDTH
], aMacroY
[KNOB_SIMD_WIDTH
];
1921 _simd_store_si((simdscalari
*)aMacroX
, macroX
);
1922 _simd_store_si((simdscalari
*)aMacroY
, macroY
);
1924 // compute raster tile coordinates
1925 simdscalari rasterX
= _simd_srai_epi32(vXi
, KNOB_TILE_X_DIM_SHIFT
+ FIXED_POINT_SHIFT
);
1926 simdscalari rasterY
= _simd_srai_epi32(vYi
, KNOB_TILE_Y_DIM_SHIFT
+ FIXED_POINT_SHIFT
);
1928 // compute raster tile relative x,y for coverage mask
1929 simdscalari tileAlignedX
= _simd_slli_epi32(rasterX
, KNOB_TILE_X_DIM_SHIFT
);
1930 simdscalari tileAlignedY
= _simd_slli_epi32(rasterY
, KNOB_TILE_Y_DIM_SHIFT
);
1932 simdscalari tileRelativeX
= _simd_sub_epi32(_simd_srai_epi32(vXi
, FIXED_POINT_SHIFT
), tileAlignedX
);
1933 simdscalari tileRelativeY
= _simd_sub_epi32(_simd_srai_epi32(vYi
, FIXED_POINT_SHIFT
), tileAlignedY
);
1935 OSALIGNSIMD(uint32_t) aTileRelativeX
[KNOB_SIMD_WIDTH
];
1936 OSALIGNSIMD(uint32_t) aTileRelativeY
[KNOB_SIMD_WIDTH
];
1937 _simd_store_si((simdscalari
*)aTileRelativeX
, tileRelativeX
);
1938 _simd_store_si((simdscalari
*)aTileRelativeY
, tileRelativeY
);
1940 OSALIGNSIMD(uint32_t) aTileAlignedX
[KNOB_SIMD_WIDTH
];
1941 OSALIGNSIMD(uint32_t) aTileAlignedY
[KNOB_SIMD_WIDTH
];
1942 _simd_store_si((simdscalari
*)aTileAlignedX
, tileAlignedX
);
1943 _simd_store_si((simdscalari
*)aTileAlignedY
, tileAlignedY
);
1945 OSALIGNSIMD(float) aZ
[KNOB_SIMD_WIDTH
];
1946 _simd_store_ps((float*)aZ
, primVerts
.z
);
1948 // store render target array index
1949 OSALIGNSIMD(uint32_t) aRTAI
[KNOB_SIMD_WIDTH
];
1950 if (gsState
.gsEnable
&& gsState
.emitsRenderTargetArrayIndex
)
1953 pa
.Assemble(VERTEX_RTAI_SLOT
, &vRtai
);
1954 simdscalari vRtaii
= _simd_castps_si(vRtai
.x
);
1955 _simd_store_si((simdscalari
*)aRTAI
, vRtaii
);
1959 _simd_store_si((simdscalari
*)aRTAI
, _simd_setzero_si());
1962 uint32_t *pPrimID
= (uint32_t *)&primID
;
1963 DWORD primIndex
= 0;
1964 // scan remaining valid points and bin each separately
1965 while (_BitScanForward(&primIndex
, primMask
))
1967 uint32_t linkageCount
= state
.linkageCount
;
1968 uint32_t linkageMask
= state
.linkageMask
;
1970 uint32_t numScalarAttribs
= linkageCount
* 4;
1975 TRIANGLE_WORK_DESC
&desc
= work
.desc
.tri
;
1977 // points are always front facing
1978 desc
.triFlags
.frontFacing
= 1;
1979 desc
.triFlags
.primID
= pPrimID
[primIndex
];
1980 desc
.triFlags
.renderTargetArrayIndex
= aRTAI
[primIndex
];
1982 work
.pfnWork
= RasterizeSimplePoint
;
1984 auto pArena
= pDC
->pArena
;
1985 SWR_ASSERT(pArena
!= nullptr);
1988 float *pAttribs
= (float*)pArena
->AllocAligned(3 * numScalarAttribs
* sizeof(float), 16);
1989 desc
.pAttribs
= pAttribs
;
1990 desc
.numAttribs
= linkageCount
;
1992 ProcessAttributes
<1>(pDC
, pa
, linkageMask
, state
.linkageMap
, primIndex
, pAttribs
);
1994 // store raster tile aligned x, y, perspective correct z
1995 float *pTriBuffer
= (float*)pArena
->AllocAligned(4 * sizeof(float), 16);
1996 desc
.pTriBuffer
= pTriBuffer
;
1997 *(uint32_t*)pTriBuffer
++ = aTileAlignedX
[primIndex
];
1998 *(uint32_t*)pTriBuffer
++ = aTileAlignedY
[primIndex
];
1999 *pTriBuffer
= aZ
[primIndex
];
2001 uint32_t tX
= aTileRelativeX
[primIndex
];
2002 uint32_t tY
= aTileRelativeY
[primIndex
];
2004 // pack the relative x,y into the coverageMask, the rasterizer will
2005 // generate the true coverage mask from it
2006 work
.desc
.tri
.triFlags
.coverageMask
= tX
| (tY
<< 4);
2009 MacroTileMgr
*pTileMgr
= pDC
->pTileMgr
;
2010 #if KNOB_ENABLE_TOSS_POINTS
2011 if (!KNOB_TOSS_SETUP_TRIS
)
2014 pTileMgr
->enqueue(aMacroX
[primIndex
], aMacroY
[primIndex
], &work
);
2016 primMask
&= ~(1 << primIndex
);
2021 // non simple points need to be potentially binned to multiple macro tiles
2022 simdscalar vPointSize
;
2023 if (rastState
.pointParam
)
2026 pa
.Assemble(VERTEX_POINT_SIZE_SLOT
, size
);
2027 vPointSize
= size
[0].x
;
2031 vPointSize
= _simd_set1_ps(rastState
.pointSize
);
2034 // bloat point to bbox
2036 bbox
.left
= bbox
.right
= vXi
;
2037 bbox
.top
= bbox
.bottom
= vYi
;
2039 simdscalar vHalfWidth
= _simd_mul_ps(vPointSize
, _simd_set1_ps(0.5f
));
2040 simdscalari vHalfWidthi
= fpToFixedPointVertical(vHalfWidth
);
2041 bbox
.left
= _simd_sub_epi32(bbox
.left
, vHalfWidthi
);
2042 bbox
.right
= _simd_add_epi32(bbox
.right
, vHalfWidthi
);
2043 bbox
.top
= _simd_sub_epi32(bbox
.top
, vHalfWidthi
);
2044 bbox
.bottom
= _simd_add_epi32(bbox
.bottom
, vHalfWidthi
);
2046 // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since right/bottom edge is exclusive.
2047 bbox
.left
= _simd_max_epi32(bbox
.left
, _simd_set1_epi32(state
.scissorInFixedPoint
.left
));
2048 bbox
.top
= _simd_max_epi32(bbox
.top
, _simd_set1_epi32(state
.scissorInFixedPoint
.top
));
2049 bbox
.right
= _simd_min_epi32(_simd_sub_epi32(bbox
.right
, _simd_set1_epi32(1)), _simd_set1_epi32(state
.scissorInFixedPoint
.right
));
2050 bbox
.bottom
= _simd_min_epi32(_simd_sub_epi32(bbox
.bottom
, _simd_set1_epi32(1)), _simd_set1_epi32(state
.scissorInFixedPoint
.bottom
));
2052 // Cull bloated points completely outside scissor
2053 simdscalari maskOutsideScissorX
= _simd_cmpgt_epi32(bbox
.left
, bbox
.right
);
2054 simdscalari maskOutsideScissorY
= _simd_cmpgt_epi32(bbox
.top
, bbox
.bottom
);
2055 simdscalari maskOutsideScissorXY
= _simd_or_si(maskOutsideScissorX
, maskOutsideScissorY
);
2056 uint32_t maskOutsideScissor
= _simd_movemask_ps(_simd_castsi_ps(maskOutsideScissorXY
));
2057 primMask
= primMask
& ~maskOutsideScissor
;
2059 // Convert bbox to macrotile units.
2060 bbox
.left
= _simd_srai_epi32(bbox
.left
, KNOB_MACROTILE_X_DIM_FIXED_SHIFT
);
2061 bbox
.top
= _simd_srai_epi32(bbox
.top
, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT
);
2062 bbox
.right
= _simd_srai_epi32(bbox
.right
, KNOB_MACROTILE_X_DIM_FIXED_SHIFT
);
2063 bbox
.bottom
= _simd_srai_epi32(bbox
.bottom
, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT
);
2065 OSALIGNSIMD(uint32_t) aMTLeft
[KNOB_SIMD_WIDTH
], aMTRight
[KNOB_SIMD_WIDTH
], aMTTop
[KNOB_SIMD_WIDTH
], aMTBottom
[KNOB_SIMD_WIDTH
];
2066 _simd_store_si((simdscalari
*)aMTLeft
, bbox
.left
);
2067 _simd_store_si((simdscalari
*)aMTRight
, bbox
.right
);
2068 _simd_store_si((simdscalari
*)aMTTop
, bbox
.top
);
2069 _simd_store_si((simdscalari
*)aMTBottom
, bbox
.bottom
);
2071 // store render target array index
2072 OSALIGNSIMD(uint32_t) aRTAI
[KNOB_SIMD_WIDTH
];
2073 if (gsState
.gsEnable
&& gsState
.emitsRenderTargetArrayIndex
)
2075 simdvector vRtai
[2];
2076 pa
.Assemble(VERTEX_RTAI_SLOT
, vRtai
);
2077 simdscalari vRtaii
= _simd_castps_si(vRtai
[0].x
);
2078 _simd_store_si((simdscalari
*)aRTAI
, vRtaii
);
2082 _simd_store_si((simdscalari
*)aRTAI
, _simd_setzero_si());
2085 OSALIGNSIMD(float) aPointSize
[KNOB_SIMD_WIDTH
];
2086 _simd_store_ps((float*)aPointSize
, vPointSize
);
2088 uint32_t *pPrimID
= (uint32_t *)&primID
;
2090 OSALIGNSIMD(float) aPrimVertsX
[KNOB_SIMD_WIDTH
];
2091 OSALIGNSIMD(float) aPrimVertsY
[KNOB_SIMD_WIDTH
];
2092 OSALIGNSIMD(float) aPrimVertsZ
[KNOB_SIMD_WIDTH
];
2094 _simd_store_ps((float*)aPrimVertsX
, primVerts
.x
);
2095 _simd_store_ps((float*)aPrimVertsY
, primVerts
.y
);
2096 _simd_store_ps((float*)aPrimVertsZ
, primVerts
.z
);
2098 // scan remaining valid prims and bin each separately
2100 while (_BitScanForward(&primIndex
, primMask
))
2102 uint32_t linkageCount
= state
.linkageCount
;
2103 uint32_t linkageMask
= state
.linkageMask
;
2104 uint32_t numScalarAttribs
= linkageCount
* 4;
2109 TRIANGLE_WORK_DESC
&desc
= work
.desc
.tri
;
2111 desc
.triFlags
.frontFacing
= 1;
2112 desc
.triFlags
.primID
= pPrimID
[primIndex
];
2113 desc
.triFlags
.pointSize
= aPointSize
[primIndex
];
2114 desc
.triFlags
.renderTargetArrayIndex
= aRTAI
[primIndex
];
2116 work
.pfnWork
= RasterizeTriPoint
;
2118 auto pArena
= pDC
->pArena
;
2119 SWR_ASSERT(pArena
!= nullptr);
2121 // store active attribs
2122 desc
.pAttribs
= (float*)pArena
->AllocAligned(numScalarAttribs
* 3 * sizeof(float), 16);
2123 desc
.numAttribs
= linkageCount
;
2124 ProcessAttributes
<1>(pDC
, pa
, linkageMask
, state
.linkageMap
, primIndex
, desc
.pAttribs
);
2126 // store point vertex data
2127 float *pTriBuffer
= (float*)pArena
->AllocAligned(4 * sizeof(float), 16);
2128 desc
.pTriBuffer
= pTriBuffer
;
2129 *pTriBuffer
++ = aPrimVertsX
[primIndex
];
2130 *pTriBuffer
++ = aPrimVertsY
[primIndex
];
2131 *pTriBuffer
= aPrimVertsZ
[primIndex
];
2133 // store user clip distances
2134 if (rastState
.clipDistanceMask
)
2136 uint32_t numClipDist
= _mm_popcnt_u32(rastState
.clipDistanceMask
);
2137 desc
.pUserClipBuffer
= (float*)pArena
->Alloc(numClipDist
* 2 * sizeof(float));
2138 ProcessUserClipDist
<2>(pa
, primIndex
, rastState
.clipDistanceMask
, desc
.pUserClipBuffer
);
2141 MacroTileMgr
*pTileMgr
= pDC
->pTileMgr
;
2142 for (uint32_t y
= aMTTop
[primIndex
]; y
<= aMTBottom
[primIndex
]; ++y
)
2144 for (uint32_t x
= aMTLeft
[primIndex
]; x
<= aMTRight
[primIndex
]; ++x
)
2146 #if KNOB_ENABLE_TOSS_POINTS
2147 if (!KNOB_TOSS_SETUP_TRIS
)
2150 pTileMgr
->enqueue(x
, y
, &work
);
2155 primMask
&= ~(1 << primIndex
);
2162 RDTSC_STOP(FEBinPoints
, 1, 0);
2165 //////////////////////////////////////////////////////////////////////////
2166 /// @brief Bin SIMD lines to the backend.
2167 /// @param pDC - pointer to draw context.
2168 /// @param pa - The primitive assembly object.
2169 /// @param workerId - thread's worker id. Each thread has a unique id.
2170 /// @param prim - Contains line position data for SIMDs worth of lines.
2171 /// @param primID - Primitive ID for each line.
2180 RDTSC_START(FEBinLines
);
2182 const API_STATE
& state
= GetApiState(pDC
);
2183 const SWR_RASTSTATE
& rastState
= state
.rastState
;
2184 const SWR_FRONTEND_STATE
& feState
= state
.frontendState
;
2185 const SWR_GS_STATE
& gsState
= state
.gsState
;
2187 simdscalar vRecipW0
= _simd_set1_ps(1.0f
);
2188 simdscalar vRecipW1
= _simd_set1_ps(1.0f
);
2190 if (!feState
.vpTransformDisable
)
2192 // perspective divide
2193 vRecipW0
= _simd_div_ps(_simd_set1_ps(1.0f
), prim
[0].w
);
2194 vRecipW1
= _simd_div_ps(_simd_set1_ps(1.0f
), prim
[1].w
);
2196 prim
[0].v
[0] = _simd_mul_ps(prim
[0].v
[0], vRecipW0
);
2197 prim
[1].v
[0] = _simd_mul_ps(prim
[1].v
[0], vRecipW1
);
2199 prim
[0].v
[1] = _simd_mul_ps(prim
[0].v
[1], vRecipW0
);
2200 prim
[1].v
[1] = _simd_mul_ps(prim
[1].v
[1], vRecipW1
);
2202 prim
[0].v
[2] = _simd_mul_ps(prim
[0].v
[2], vRecipW0
);
2203 prim
[1].v
[2] = _simd_mul_ps(prim
[1].v
[2], vRecipW1
);
2205 // viewport transform to screen coords
2206 viewportTransform
<2>(prim
, state
.vpMatrix
[0]);
2209 // adjust for pixel center location
2210 simdscalar offset
= g_pixelOffsets
[rastState
.pixelLocation
];
2211 prim
[0].x
= _simd_add_ps(prim
[0].x
, offset
);
2212 prim
[0].y
= _simd_add_ps(prim
[0].y
, offset
);
2214 prim
[1].x
= _simd_add_ps(prim
[1].x
, offset
);
2215 prim
[1].y
= _simd_add_ps(prim
[1].y
, offset
);
2217 // convert to fixed point
2218 simdscalari vXi
[2], vYi
[2];
2219 vXi
[0] = fpToFixedPointVertical(prim
[0].x
);
2220 vYi
[0] = fpToFixedPointVertical(prim
[0].y
);
2221 vXi
[1] = fpToFixedPointVertical(prim
[1].x
);
2222 vYi
[1] = fpToFixedPointVertical(prim
[1].y
);
2224 // compute x-major vs y-major mask
2225 simdscalari xLength
= _simd_abs_epi32(_simd_sub_epi32(vXi
[0], vXi
[1]));
2226 simdscalari yLength
= _simd_abs_epi32(_simd_sub_epi32(vYi
[0], vYi
[1]));
2227 simdscalar vYmajorMask
= _simd_castsi_ps(_simd_cmpgt_epi32(yLength
, xLength
));
2228 uint32_t yMajorMask
= _simd_movemask_ps(vYmajorMask
);
2230 // cull zero-length lines
2231 simdscalari vZeroLengthMask
= _simd_cmpeq_epi32(xLength
, _simd_setzero_si());
2232 vZeroLengthMask
= _simd_and_si(vZeroLengthMask
, _simd_cmpeq_epi32(yLength
, _simd_setzero_si()));
2234 primMask
&= ~_simd_movemask_ps(_simd_castsi_ps(vZeroLengthMask
));
2236 uint32_t *pPrimID
= (uint32_t *)&primID
;
2238 simdscalar vUnused
= _simd_setzero_ps();
2240 // Calc bounding box of lines
2242 bbox
.left
= _simd_min_epi32(vXi
[0], vXi
[1]);
2243 bbox
.right
= _simd_max_epi32(vXi
[0], vXi
[1]);
2244 bbox
.top
= _simd_min_epi32(vYi
[0], vYi
[1]);
2245 bbox
.bottom
= _simd_max_epi32(vYi
[0], vYi
[1]);
2247 // bloat bbox by line width along minor axis
2248 simdscalar vHalfWidth
= _simd_set1_ps(rastState
.lineWidth
/ 2.0f
);
2249 simdscalari vHalfWidthi
= fpToFixedPointVertical(vHalfWidth
);
2251 bloatBox
.left
= _simd_sub_epi32(bbox
.left
, vHalfWidthi
);
2252 bloatBox
.right
= _simd_add_epi32(bbox
.right
, vHalfWidthi
);
2253 bloatBox
.top
= _simd_sub_epi32(bbox
.top
, vHalfWidthi
);
2254 bloatBox
.bottom
= _simd_add_epi32(bbox
.bottom
, vHalfWidthi
);
2256 bbox
.left
= _simd_blendv_epi32(bbox
.left
, bloatBox
.left
, vYmajorMask
);
2257 bbox
.right
= _simd_blendv_epi32(bbox
.right
, bloatBox
.right
, vYmajorMask
);
2258 bbox
.top
= _simd_blendv_epi32(bloatBox
.top
, bbox
.top
, vYmajorMask
);
2259 bbox
.bottom
= _simd_blendv_epi32(bloatBox
.bottom
, bbox
.bottom
, vYmajorMask
);
2261 // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since right/bottom edge is exclusive.
2262 bbox
.left
= _simd_max_epi32(bbox
.left
, _simd_set1_epi32(state
.scissorInFixedPoint
.left
));
2263 bbox
.top
= _simd_max_epi32(bbox
.top
, _simd_set1_epi32(state
.scissorInFixedPoint
.top
));
2264 bbox
.right
= _simd_min_epi32(_simd_sub_epi32(bbox
.right
, _simd_set1_epi32(1)), _simd_set1_epi32(state
.scissorInFixedPoint
.right
));
2265 bbox
.bottom
= _simd_min_epi32(_simd_sub_epi32(bbox
.bottom
, _simd_set1_epi32(1)), _simd_set1_epi32(state
.scissorInFixedPoint
.bottom
));
2267 // Cull prims completely outside scissor
2269 simdscalari maskOutsideScissorX
= _simd_cmpgt_epi32(bbox
.left
, bbox
.right
);
2270 simdscalari maskOutsideScissorY
= _simd_cmpgt_epi32(bbox
.top
, bbox
.bottom
);
2271 simdscalari maskOutsideScissorXY
= _simd_or_si(maskOutsideScissorX
, maskOutsideScissorY
);
2272 uint32_t maskOutsideScissor
= _simd_movemask_ps(_simd_castsi_ps(maskOutsideScissorXY
));
2273 primMask
= primMask
& ~maskOutsideScissor
;
2281 // Convert line bbox to macrotile units.
2282 bbox
.left
= _simd_srai_epi32(bbox
.left
, KNOB_MACROTILE_X_DIM_FIXED_SHIFT
);
2283 bbox
.top
= _simd_srai_epi32(bbox
.top
, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT
);
2284 bbox
.right
= _simd_srai_epi32(bbox
.right
, KNOB_MACROTILE_X_DIM_FIXED_SHIFT
);
2285 bbox
.bottom
= _simd_srai_epi32(bbox
.bottom
, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT
);
2287 OSALIGNSIMD(uint32_t) aMTLeft
[KNOB_SIMD_WIDTH
], aMTRight
[KNOB_SIMD_WIDTH
], aMTTop
[KNOB_SIMD_WIDTH
], aMTBottom
[KNOB_SIMD_WIDTH
];
2288 _simd_store_si((simdscalari
*)aMTLeft
, bbox
.left
);
2289 _simd_store_si((simdscalari
*)aMTRight
, bbox
.right
);
2290 _simd_store_si((simdscalari
*)aMTTop
, bbox
.top
);
2291 _simd_store_si((simdscalari
*)aMTBottom
, bbox
.bottom
);
2293 // transpose verts needed for backend
2294 /// @todo modify BE to take non-transformed verts
2295 __m128 vHorizX
[8], vHorizY
[8], vHorizZ
[8], vHorizW
[8];
2296 vTranspose3x8(vHorizX
, prim
[0].x
, prim
[1].x
, vUnused
);
2297 vTranspose3x8(vHorizY
, prim
[0].y
, prim
[1].y
, vUnused
);
2298 vTranspose3x8(vHorizZ
, prim
[0].z
, prim
[1].z
, vUnused
);
2299 vTranspose3x8(vHorizW
, vRecipW0
, vRecipW1
, vUnused
);
2301 // store render target array index
2302 OSALIGNSIMD(uint32_t) aRTAI
[KNOB_SIMD_WIDTH
];
2303 if (gsState
.gsEnable
&& gsState
.emitsRenderTargetArrayIndex
)
2305 simdvector vRtai
[2];
2306 pa
.Assemble(VERTEX_RTAI_SLOT
, vRtai
);
2307 simdscalari vRtaii
= _simd_castps_si(vRtai
[0].x
);
2308 _simd_store_si((simdscalari
*)aRTAI
, vRtaii
);
2312 _simd_store_si((simdscalari
*)aRTAI
, _simd_setzero_si());
2315 // scan remaining valid prims and bin each separately
2317 while (_BitScanForward(&primIndex
, primMask
))
2319 uint32_t linkageCount
= state
.linkageCount
;
2320 uint32_t linkageMask
= state
.linkageMask
;
2321 uint32_t numScalarAttribs
= linkageCount
* 4;
2326 TRIANGLE_WORK_DESC
&desc
= work
.desc
.tri
;
2328 desc
.triFlags
.frontFacing
= 1;
2329 desc
.triFlags
.primID
= pPrimID
[primIndex
];
2330 desc
.triFlags
.yMajor
= (yMajorMask
>> primIndex
) & 1;
2331 desc
.triFlags
.renderTargetArrayIndex
= aRTAI
[primIndex
];
2333 work
.pfnWork
= RasterizeLine
;
2335 auto pArena
= pDC
->pArena
;
2336 SWR_ASSERT(pArena
!= nullptr);
2338 // store active attribs
2339 desc
.pAttribs
= (float*)pArena
->AllocAligned(numScalarAttribs
* 3 * sizeof(float), 16);
2340 desc
.numAttribs
= linkageCount
;
2341 ProcessAttributes
<2>(pDC
, pa
, linkageMask
, state
.linkageMap
, primIndex
, desc
.pAttribs
);
2343 // store line vertex data
2344 desc
.pTriBuffer
= (float*)pArena
->AllocAligned(4 * 4 * sizeof(float), 16);
2345 _mm_store_ps(&desc
.pTriBuffer
[0], vHorizX
[primIndex
]);
2346 _mm_store_ps(&desc
.pTriBuffer
[4], vHorizY
[primIndex
]);
2347 _mm_store_ps(&desc
.pTriBuffer
[8], vHorizZ
[primIndex
]);
2348 _mm_store_ps(&desc
.pTriBuffer
[12], vHorizW
[primIndex
]);
2350 // store user clip distances
2351 if (rastState
.clipDistanceMask
)
2353 uint32_t numClipDist
= _mm_popcnt_u32(rastState
.clipDistanceMask
);
2354 desc
.pUserClipBuffer
= (float*)pArena
->Alloc(numClipDist
* 2 * sizeof(float));
2355 ProcessUserClipDist
<2>(pa
, primIndex
, rastState
.clipDistanceMask
, desc
.pUserClipBuffer
);
2358 MacroTileMgr
*pTileMgr
= pDC
->pTileMgr
;
2359 for (uint32_t y
= aMTTop
[primIndex
]; y
<= aMTBottom
[primIndex
]; ++y
)
2361 for (uint32_t x
= aMTLeft
[primIndex
]; x
<= aMTRight
[primIndex
]; ++x
)
2363 #if KNOB_ENABLE_TOSS_POINTS
2364 if (!KNOB_TOSS_SETUP_TRIS
)
2367 pTileMgr
->enqueue(x
, y
, &work
);
2372 primMask
&= ~(1 << primIndex
);
2377 RDTSC_STOP(FEBinLines
, 1, 0);