/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* @brief Backend handles rasterization, pixel shading and output merger
*        operations.
*
******************************************************************************/
#include <smmintrin.h>

#include "depthstencil.h"
#include "memory/tilingtraits.h"
#include "core/multisample.h"
40 typedef void(*PFN_CLEAR_TILES
)(DRAW_CONTEXT
*, SWR_RENDERTARGET_ATTACHMENT rt
, uint32_t, DWORD
[4], const SWR_RECT
& rect
);
41 static PFN_CLEAR_TILES sClearTilesTable
[NUM_SWR_FORMATS
];
43 //////////////////////////////////////////////////////////////////////////
44 /// @brief Process compute work.
45 /// @param pDC - pointer to draw context (dispatch).
46 /// @param workerId - The unique worker ID that is assigned to this thread.
47 /// @param threadGroupId - the linear index for the thread group within the dispatch.
48 void ProcessComputeBE(DRAW_CONTEXT
* pDC
, uint32_t workerId
, uint32_t threadGroupId
, void*& pSpillFillBuffer
)
50 SWR_CONTEXT
*pContext
= pDC
->pContext
;
52 AR_BEGIN(BEDispatch
, pDC
->drawId
);
54 const COMPUTE_DESC
* pTaskData
= (COMPUTE_DESC
*)pDC
->pDispatch
->GetTasksData();
55 SWR_ASSERT(pTaskData
!= nullptr);
57 // Ensure spill fill memory has been allocated.
58 size_t spillFillSize
= pDC
->pState
->state
.totalSpillFillSize
;
59 if (spillFillSize
&& pSpillFillBuffer
== nullptr)
61 pSpillFillBuffer
= pDC
->pArena
->AllocAlignedSync(spillFillSize
, KNOB_SIMD_BYTES
);
64 const API_STATE
& state
= GetApiState(pDC
);
66 SWR_CS_CONTEXT csContext
{ 0 };
67 csContext
.tileCounter
= threadGroupId
;
68 csContext
.dispatchDims
[0] = pTaskData
->threadGroupCountX
;
69 csContext
.dispatchDims
[1] = pTaskData
->threadGroupCountY
;
70 csContext
.dispatchDims
[2] = pTaskData
->threadGroupCountZ
;
71 csContext
.pTGSM
= pContext
->ppScratch
[workerId
];
72 csContext
.pSpillFillBuffer
= (uint8_t*)pSpillFillBuffer
;
74 state
.pfnCsFunc(GetPrivateState(pDC
), &csContext
);
76 UPDATE_STAT_BE(CsInvocations
, state
.totalThreadsInGroup
);
78 AR_END(BEDispatch
, 1);
81 //////////////////////////////////////////////////////////////////////////
82 /// @brief Process shutdown.
83 /// @param pDC - pointer to draw context (dispatch).
84 /// @param workerId - The unique worker ID that is assigned to this thread.
85 /// @param threadGroupId - the linear index for the thread group within the dispatch.
86 void ProcessShutdownBE(DRAW_CONTEXT
*pDC
, uint32_t workerId
, uint32_t macroTile
, void *pUserData
)
91 void ProcessSyncBE(DRAW_CONTEXT
*pDC
, uint32_t workerId
, uint32_t macroTile
, void *pUserData
)
94 MacroTileMgr::getTileIndices(macroTile
, x
, y
);
95 SWR_ASSERT(x
== 0 && y
== 0);
98 template<SWR_FORMAT format
>
99 void ClearRasterTile(uint8_t *pTileBuffer
, simdvector
&value
)
101 auto lambda
= [&](int32_t comp
)
103 FormatTraits
<format
>::storeSOA(comp
, pTileBuffer
, value
.v
[comp
]);
105 pTileBuffer
+= (KNOB_SIMD_WIDTH
* FormatTraits
<format
>::GetBPC(comp
) / 8);
108 const uint32_t numIter
= (KNOB_TILE_Y_DIM
/ SIMD_TILE_Y_DIM
) * (KNOB_TILE_X_DIM
/ SIMD_TILE_X_DIM
);
110 for (uint32_t i
= 0; i
< numIter
; ++i
)
112 UnrollerL
<0, FormatTraits
<format
>::numComps
, 1>::step(lambda
);
#if USE_8x2_TILE_BACKEND
//////////////////////////////////////////////////////////////////////////
/// @brief SIMD16 variant of ClearRasterTile: fill one raster tile (single
///        sample) with a packed clear value.
/// @tparam format - hot tile format; selects store routine and component sizes.
/// @param pTileBuffer - start of the raster tile's storage.
/// @param value - clear value, one SIMD16 register per component.
template<SWR_FORMAT format>
void ClearRasterTile(uint8_t *pTileBuffer, simd16vector &value)
{
    // Stores one component for a SIMD16 tile, then advances the write pointer
    // by that component's footprint (SIMD16 width * bits per component / 8).
    auto lambda = [&](int32_t comp)
    {
        FormatTraits<format>::storeSOA(comp, pTileBuffer, value.v[comp]);

        pTileBuffer += (KNOB_SIMD16_WIDTH * FormatTraits<format>::GetBPC(comp) / 8);
    };

    // Number of SIMD16 tiles in a raster tile.
    const uint32_t numIter = (KNOB_TILE_Y_DIM / SIMD16_TILE_Y_DIM) * (KNOB_TILE_X_DIM / SIMD16_TILE_X_DIM);

    for (uint32_t i = 0; i < numIter; ++i)
    {
        UnrollerL<0, FormatTraits<format>::numComps, 1>::step(lambda);
    }
}

#endif
136 template<SWR_FORMAT format
>
137 INLINE
void ClearMacroTile(DRAW_CONTEXT
*pDC
, SWR_RENDERTARGET_ATTACHMENT rt
, uint32_t macroTile
, DWORD clear
[4], const SWR_RECT
& rect
)
139 // convert clear color to hottile format
140 // clear color is in RGBA float/uint32
141 #if USE_8x2_TILE_BACKEND
143 for (uint32_t comp
= 0; comp
< FormatTraits
<format
>::numComps
; ++comp
)
146 vComp
= _simd16_load1_ps((const float*)&clear
[comp
]);
147 if (FormatTraits
<format
>::isNormalized(comp
))
149 vComp
= _simd16_mul_ps(vComp
, _simd16_set1_ps(FormatTraits
<format
>::fromFloat(comp
)));
150 vComp
= _simd16_castsi_ps(_simd16_cvtps_epi32(vComp
));
152 vComp
= FormatTraits
<format
>::pack(comp
, vComp
);
153 vClear
.v
[FormatTraits
<format
>::swizzle(comp
)] = vComp
;
158 for (uint32_t comp
= 0; comp
< FormatTraits
<format
>::numComps
; ++comp
)
161 vComp
= _simd_load1_ps((const float*)&clear
[comp
]);
162 if (FormatTraits
<format
>::isNormalized(comp
))
164 vComp
= _simd_mul_ps(vComp
, _simd_set1_ps(FormatTraits
<format
>::fromFloat(comp
)));
165 vComp
= _simd_castsi_ps(_simd_cvtps_epi32(vComp
));
167 vComp
= FormatTraits
<format
>::pack(comp
, vComp
);
168 vClear
.v
[FormatTraits
<format
>::swizzle(comp
)] = vComp
;
172 uint32_t tileX
, tileY
;
173 MacroTileMgr::getTileIndices(macroTile
, tileX
, tileY
);
175 // Init to full macrotile
178 KNOB_MACROTILE_X_DIM
* int32_t(tileX
),
179 KNOB_MACROTILE_Y_DIM
* int32_t(tileY
),
180 KNOB_MACROTILE_X_DIM
* int32_t(tileX
+ 1),
181 KNOB_MACROTILE_Y_DIM
* int32_t(tileY
+ 1),
184 // intersect with clear rect
187 // translate to local hottile origin
188 clearTile
.Translate(-int32_t(tileX
) * KNOB_MACROTILE_X_DIM
, -int32_t(tileY
) * KNOB_MACROTILE_Y_DIM
);
190 // Make maximums inclusive (needed for convert to raster tiles)
194 // convert to raster tiles
195 clearTile
.ymin
>>= (KNOB_TILE_Y_DIM_SHIFT
);
196 clearTile
.ymax
>>= (KNOB_TILE_Y_DIM_SHIFT
);
197 clearTile
.xmin
>>= (KNOB_TILE_X_DIM_SHIFT
);
198 clearTile
.xmax
>>= (KNOB_TILE_X_DIM_SHIFT
);
200 const int32_t numSamples
= GetNumSamples(pDC
->pState
->state
.rastState
.sampleCount
);
201 // compute steps between raster tile samples / raster tiles / macro tile rows
202 const uint32_t rasterTileSampleStep
= KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<format
>::bpp
/ 8;
203 const uint32_t rasterTileStep
= (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* (FormatTraits
<format
>::bpp
/ 8)) * numSamples
;
204 const uint32_t macroTileRowStep
= (KNOB_MACROTILE_X_DIM
/ KNOB_TILE_X_DIM
) * rasterTileStep
;
205 const uint32_t pitch
= (FormatTraits
<format
>::bpp
* KNOB_MACROTILE_X_DIM
/ 8);
207 HOTTILE
*pHotTile
= pDC
->pContext
->pHotTileMgr
->GetHotTile(pDC
->pContext
, pDC
, macroTile
, rt
, true, numSamples
);
208 uint32_t rasterTileStartOffset
= (ComputeTileOffset2D
< TilingTraits
<SWR_TILE_SWRZ
, FormatTraits
<format
>::bpp
> >(pitch
, clearTile
.xmin
, clearTile
.ymin
)) * numSamples
;
209 uint8_t* pRasterTileRow
= pHotTile
->pBuffer
+ rasterTileStartOffset
; //(ComputeTileOffset2D< TilingTraits<SWR_TILE_SWRZ, FormatTraits<format>::bpp > >(pitch, x, y)) * numSamples;
211 // loop over all raster tiles in the current hot tile
212 for (int32_t y
= clearTile
.ymin
; y
<= clearTile
.ymax
; ++y
)
214 uint8_t* pRasterTile
= pRasterTileRow
;
215 for (int32_t x
= clearTile
.xmin
; x
<= clearTile
.xmax
; ++x
)
217 for( int32_t sampleNum
= 0; sampleNum
< numSamples
; sampleNum
++)
219 ClearRasterTile
<format
>(pRasterTile
, vClear
);
220 pRasterTile
+= rasterTileSampleStep
;
223 pRasterTileRow
+= macroTileRowStep
;
226 pHotTile
->state
= HOTTILE_DIRTY
;
230 void ProcessClearBE(DRAW_CONTEXT
*pDC
, uint32_t workerId
, uint32_t macroTile
, void *pUserData
)
232 SWR_CONTEXT
*pContext
= pDC
->pContext
;
236 CLEAR_DESC
*pClear
= (CLEAR_DESC
*)pUserData
;
237 SWR_MULTISAMPLE_COUNT sampleCount
= pDC
->pState
->state
.rastState
.sampleCount
;
238 uint32_t numSamples
= GetNumSamples(sampleCount
);
240 SWR_ASSERT(pClear
->flags
.bits
!= 0); // shouldn't be here without a reason.
242 AR_BEGIN(BEClear
, pDC
->drawId
);
244 if (pClear
->flags
.mask
& SWR_CLEAR_COLOR
)
246 HOTTILE
*pHotTile
= pContext
->pHotTileMgr
->GetHotTile(pContext
, pDC
, macroTile
, SWR_ATTACHMENT_COLOR0
, true, numSamples
);
247 // All we want to do here is to mark the hot tile as being in a "needs clear" state.
248 pHotTile
->clearData
[0] = *(DWORD
*)&(pClear
->clearRTColor
[0]);
249 pHotTile
->clearData
[1] = *(DWORD
*)&(pClear
->clearRTColor
[1]);
250 pHotTile
->clearData
[2] = *(DWORD
*)&(pClear
->clearRTColor
[2]);
251 pHotTile
->clearData
[3] = *(DWORD
*)&(pClear
->clearRTColor
[3]);
252 pHotTile
->state
= HOTTILE_CLEAR
;
255 if (pClear
->flags
.mask
& SWR_CLEAR_DEPTH
)
257 HOTTILE
*pHotTile
= pContext
->pHotTileMgr
->GetHotTile(pContext
, pDC
, macroTile
, SWR_ATTACHMENT_DEPTH
, true, numSamples
);
258 pHotTile
->clearData
[0] = *(DWORD
*)&pClear
->clearDepth
;
259 pHotTile
->state
= HOTTILE_CLEAR
;
262 if (pClear
->flags
.mask
& SWR_CLEAR_STENCIL
)
264 HOTTILE
*pHotTile
= pContext
->pHotTileMgr
->GetHotTile(pContext
, pDC
, macroTile
, SWR_ATTACHMENT_STENCIL
, true, numSamples
);
266 pHotTile
->clearData
[0] = *(DWORD
*)&pClear
->clearStencil
;
267 pHotTile
->state
= HOTTILE_CLEAR
;
275 CLEAR_DESC
*pClear
= (CLEAR_DESC
*)pUserData
;
276 AR_BEGIN(BEClear
, pDC
->drawId
);
278 if (pClear
->flags
.mask
& SWR_CLEAR_COLOR
)
280 /// @todo clear data should come in as RGBA32_FLOAT
283 clearFloat
[0] = ((uint8_t*)(&pClear
->clearRTColor
))[0] / 255.0f
;
284 clearFloat
[1] = ((uint8_t*)(&pClear
->clearRTColor
))[1] / 255.0f
;
285 clearFloat
[2] = ((uint8_t*)(&pClear
->clearRTColor
))[2] / 255.0f
;
286 clearFloat
[3] = ((uint8_t*)(&pClear
->clearRTColor
))[3] / 255.0f
;
287 clearData
[0] = *(DWORD
*)&clearFloat
[0];
288 clearData
[1] = *(DWORD
*)&clearFloat
[1];
289 clearData
[2] = *(DWORD
*)&clearFloat
[2];
290 clearData
[3] = *(DWORD
*)&clearFloat
[3];
292 PFN_CLEAR_TILES pfnClearTiles
= sClearTilesTable
[KNOB_COLOR_HOT_TILE_FORMAT
];
293 SWR_ASSERT(pfnClearTiles
!= nullptr);
295 pfnClearTiles(pDC
, SWR_ATTACHMENT_COLOR0
, macroTile
, clearData
, pClear
->rect
);
298 if (pClear
->flags
.mask
& SWR_CLEAR_DEPTH
)
301 clearData
[0] = *(DWORD
*)&pClear
->clearDepth
;
302 PFN_CLEAR_TILES pfnClearTiles
= sClearTilesTable
[KNOB_DEPTH_HOT_TILE_FORMAT
];
303 SWR_ASSERT(pfnClearTiles
!= nullptr);
305 pfnClearTiles(pDC
, SWR_ATTACHMENT_DEPTH
, macroTile
, clearData
, pClear
->rect
);
308 if (pClear
->flags
.mask
& SWR_CLEAR_STENCIL
)
310 uint32_t value
= pClear
->clearStencil
;
312 clearData
[0] = *(DWORD
*)&value
;
313 PFN_CLEAR_TILES pfnClearTiles
= sClearTilesTable
[KNOB_STENCIL_HOT_TILE_FORMAT
];
315 pfnClearTiles(pDC
, SWR_ATTACHMENT_STENCIL
, macroTile
, clearData
, pClear
->rect
);
322 void ProcessStoreTileBE(DRAW_CONTEXT
*pDC
, uint32_t workerId
, uint32_t macroTile
, STORE_TILES_DESC
* pDesc
,
323 SWR_RENDERTARGET_ATTACHMENT attachment
)
325 SWR_CONTEXT
*pContext
= pDC
->pContext
;
327 AR_BEGIN(BEStoreTiles
, pDC
->drawId
);
329 SWR_FORMAT srcFormat
;
332 case SWR_ATTACHMENT_COLOR0
:
333 case SWR_ATTACHMENT_COLOR1
:
334 case SWR_ATTACHMENT_COLOR2
:
335 case SWR_ATTACHMENT_COLOR3
:
336 case SWR_ATTACHMENT_COLOR4
:
337 case SWR_ATTACHMENT_COLOR5
:
338 case SWR_ATTACHMENT_COLOR6
:
339 case SWR_ATTACHMENT_COLOR7
: srcFormat
= KNOB_COLOR_HOT_TILE_FORMAT
; break;
340 case SWR_ATTACHMENT_DEPTH
: srcFormat
= KNOB_DEPTH_HOT_TILE_FORMAT
; break;
341 case SWR_ATTACHMENT_STENCIL
: srcFormat
= KNOB_STENCIL_HOT_TILE_FORMAT
; break;
342 default: SWR_ASSERT(false, "Unknown attachment: %d", attachment
); srcFormat
= KNOB_COLOR_HOT_TILE_FORMAT
; break;
346 MacroTileMgr::getTileIndices(macroTile
, x
, y
);
348 // Only need to store the hottile if it's been rendered to...
349 HOTTILE
*pHotTile
= pContext
->pHotTileMgr
->GetHotTile(pContext
, pDC
, macroTile
, attachment
, false);
352 // clear if clear is pending (i.e., not rendered to), then mark as dirty for store.
353 if (pHotTile
->state
== HOTTILE_CLEAR
)
355 PFN_CLEAR_TILES pfnClearTiles
= sClearTilesTable
[srcFormat
];
356 SWR_ASSERT(pfnClearTiles
!= nullptr);
358 pfnClearTiles(pDC
, attachment
, macroTile
, pHotTile
->clearData
, pDesc
->rect
);
361 if (pHotTile
->state
== HOTTILE_DIRTY
|| pDesc
->postStoreTileState
== (SWR_TILE_STATE
)HOTTILE_DIRTY
)
363 int32_t destX
= KNOB_MACROTILE_X_DIM
* x
;
364 int32_t destY
= KNOB_MACROTILE_Y_DIM
* y
;
366 pContext
->pfnStoreTile(GetPrivateState(pDC
), srcFormat
,
367 attachment
, destX
, destY
, pHotTile
->renderTargetArrayIndex
, pHotTile
->pBuffer
);
371 if (pHotTile
->state
== HOTTILE_DIRTY
|| pHotTile
->state
== HOTTILE_RESOLVED
)
373 pHotTile
->state
= (HOTTILE_STATE
)pDesc
->postStoreTileState
;
376 AR_END(BEStoreTiles
, 1);
379 void ProcessStoreTilesBE(DRAW_CONTEXT
*pDC
, uint32_t workerId
, uint32_t macroTile
, void *pData
)
381 STORE_TILES_DESC
*pDesc
= (STORE_TILES_DESC
*)pData
;
383 unsigned long rt
= 0;
384 uint32_t mask
= pDesc
->attachmentMask
;
385 while (_BitScanForward(&rt
, mask
))
388 ProcessStoreTileBE(pDC
, workerId
, macroTile
, pDesc
, (SWR_RENDERTARGET_ATTACHMENT
)rt
);
392 void ProcessDiscardInvalidateTilesBE(DRAW_CONTEXT
*pDC
, uint32_t workerId
, uint32_t macroTile
, void *pData
)
394 DISCARD_INVALIDATE_TILES_DESC
*pDesc
= (DISCARD_INVALIDATE_TILES_DESC
*)pData
;
395 SWR_CONTEXT
*pContext
= pDC
->pContext
;
397 const int32_t numSamples
= GetNumSamples(pDC
->pState
->state
.rastState
.sampleCount
);
399 for (uint32_t i
= 0; i
< SWR_NUM_ATTACHMENTS
; ++i
)
401 if (pDesc
->attachmentMask
& (1 << i
))
403 HOTTILE
*pHotTile
= pContext
->pHotTileMgr
->GetHotTileNoLoad(
404 pContext
, pDC
, macroTile
, (SWR_RENDERTARGET_ATTACHMENT
)i
, pDesc
->createNewTiles
, numSamples
);
407 pHotTile
->state
= (HOTTILE_STATE
)pDesc
->newTileState
;
413 #if KNOB_SIMD_WIDTH == 8
414 const __m256 vCenterOffsetsX
= {0.5, 1.5, 0.5, 1.5, 2.5, 3.5, 2.5, 3.5};
415 const __m256 vCenterOffsetsY
= {0.5, 0.5, 1.5, 1.5, 0.5, 0.5, 1.5, 1.5};
416 const __m256 vULOffsetsX
= {0.0, 1.0, 0.0, 1.0, 2.0, 3.0, 2.0, 3.0};
417 const __m256 vULOffsetsY
= {0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0};
419 #error Unsupported vector width
422 simdmask
ComputeUserClipMask(uint8_t clipMask
, float* pUserClipBuffer
, simdscalar vI
, simdscalar vJ
)
424 simdscalar vClipMask
= _simd_setzero_ps();
425 uint32_t numClipDistance
= _mm_popcnt_u32(clipMask
);
427 for (uint32_t i
= 0; i
< numClipDistance
; ++i
)
429 // pull triangle clip distance values from clip buffer
430 simdscalar vA
= _simd_broadcast_ss(pUserClipBuffer
++);
431 simdscalar vB
= _simd_broadcast_ss(pUserClipBuffer
++);
432 simdscalar vC
= _simd_broadcast_ss(pUserClipBuffer
++);
435 simdscalar vInterp
= vplaneps(vA
, vB
, vC
, vI
, vJ
);
437 // clip if interpolated clip distance is < 0 || NAN
438 simdscalar vCull
= _simd_cmp_ps(_simd_setzero_ps(), vInterp
, _CMP_NLE_UQ
);
440 vClipMask
= _simd_or_ps(vClipMask
, vCull
);
443 return _simd_movemask_ps(vClipMask
);
447 void BackendSingleSample(DRAW_CONTEXT
*pDC
, uint32_t workerId
, uint32_t x
, uint32_t y
, SWR_TRIANGLE_DESC
&work
, RenderOutputBuffers
&renderBuffers
)
449 SWR_CONTEXT
*pContext
= pDC
->pContext
;
451 AR_BEGIN(BESingleSampleBackend
, pDC
->drawId
);
452 AR_BEGIN(BESetup
, pDC
->drawId
);
454 const API_STATE
&state
= GetApiState(pDC
);
456 BarycentricCoeffs coeffs
;
457 SetupBarycentricCoeffs(&coeffs
, work
);
459 uint8_t *pColorBuffer
[SWR_NUM_RENDERTARGETS
], *pDepthBuffer
, *pStencilBuffer
;
460 SetupRenderBuffers(pColorBuffer
, &pDepthBuffer
, &pStencilBuffer
, state
.psState
.numRenderTargets
, renderBuffers
);
462 SWR_PS_CONTEXT psContext
;
463 SetupPixelShaderContext
<T
>(&psContext
, work
);
467 psContext
.vY
.UL
= _simd_add_ps(vULOffsetsY
, _simd_set1_ps(static_cast<float>(y
)));
468 psContext
.vY
.center
= _simd_add_ps(vCenterOffsetsY
, _simd_set1_ps(static_cast<float>(y
)));
470 const simdscalar dy
= _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM
));
472 for (uint32_t yy
= y
; yy
< y
+ KNOB_TILE_Y_DIM
; yy
+= SIMD_TILE_Y_DIM
)
474 psContext
.vX
.UL
= _simd_add_ps(vULOffsetsX
, _simd_set1_ps(static_cast<float>(x
)));
475 psContext
.vX
.center
= _simd_add_ps(vCenterOffsetsX
, _simd_set1_ps(static_cast<float>(x
)));
477 const simdscalar dx
= _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM
));
479 for (uint32_t xx
= x
; xx
< x
+ KNOB_TILE_X_DIM
; xx
+= SIMD_TILE_X_DIM
)
481 #if USE_8x2_TILE_BACKEND
482 const bool useAlternateOffset
= ((xx
& SIMD_TILE_X_DIM
) != 0);
485 simdmask coverageMask
= work
.coverageMask
[0] & MASK
;
489 if (state
.depthHottileEnable
&& state
.depthBoundsState
.depthBoundsTestEnable
)
491 static_assert(KNOB_DEPTH_HOT_TILE_FORMAT
== R32_FLOAT
, "Unsupported depth hot tile format");
493 const simdscalar z
= _simd_load_ps(reinterpret_cast<const float *>(pDepthBuffer
));
495 const float minz
= state
.depthBoundsState
.depthBoundsTestMinValue
;
496 const float maxz
= state
.depthBoundsState
.depthBoundsTestMaxValue
;
498 coverageMask
&= CalcDepthBoundsAcceptMask(z
, minz
, maxz
);
501 if (T::InputCoverage
!= SWR_INPUT_COVERAGE_NONE
)
503 const uint64_t* pCoverageMask
= (T::InputCoverage
== SWR_INPUT_COVERAGE_INNER_CONSERVATIVE
) ? &work
.innerCoverageMask
: &work
.coverageMask
[0];
505 generateInputCoverage
<T
, T::InputCoverage
>(pCoverageMask
, psContext
.inputMask
, state
.blendState
.sampleMask
);
508 AR_BEGIN(BEBarycentric
, pDC
->drawId
);
510 CalcPixelBarycentrics(coeffs
, psContext
);
512 CalcCentroid
<T
, true>(&psContext
, coeffs
, work
.coverageMask
, state
.blendState
.sampleMask
);
514 // interpolate and quantize z
515 psContext
.vZ
= vplaneps(coeffs
.vZa
, coeffs
.vZb
, coeffs
.vZc
, psContext
.vI
.center
, psContext
.vJ
.center
);
516 psContext
.vZ
= state
.pfnQuantizeDepth(psContext
.vZ
);
518 AR_END(BEBarycentric
, 1);
520 // interpolate user clip distance if available
521 if (state
.rastState
.clipDistanceMask
)
523 coverageMask
&= ~ComputeUserClipMask(state
.rastState
.clipDistanceMask
, work
.pUserClipBuffer
, psContext
.vI
.center
, psContext
.vJ
.center
);
526 simdscalar vCoverageMask
= vMask(coverageMask
);
527 simdscalar depthPassMask
= vCoverageMask
;
528 simdscalar stencilPassMask
= vCoverageMask
;
533 AR_BEGIN(BEEarlyDepthTest
, pDC
->drawId
);
534 depthPassMask
= DepthStencilTest(&state
, work
.triFlags
.frontFacing
, work
.triFlags
.viewportIndex
,
535 psContext
.vZ
, pDepthBuffer
, vCoverageMask
, pStencilBuffer
, &stencilPassMask
);
536 AR_EVENT(EarlyDepthStencilInfoSingleSample(_simd_movemask_ps(depthPassMask
), _simd_movemask_ps(vCoverageMask
), _simd_movemask_ps(stencilPassMask
)));
537 AR_END(BEEarlyDepthTest
, 0);
539 // early-exit if no pixels passed depth or earlyZ is forced on
540 if (state
.psState
.forceEarlyZ
|| !_simd_movemask_ps(depthPassMask
))
542 DepthStencilWrite(&state
.vp
[work
.triFlags
.viewportIndex
], &state
.depthStencilState
, work
.triFlags
.frontFacing
, psContext
.vZ
,
543 pDepthBuffer
, depthPassMask
, vCoverageMask
, pStencilBuffer
, stencilPassMask
);
545 if (!_simd_movemask_ps(depthPassMask
))
552 psContext
.sampleIndex
= 0;
553 psContext
.activeMask
= _simd_castps_si(vCoverageMask
);
555 // execute pixel shader
556 AR_BEGIN(BEPixelShader
, pDC
->drawId
);
557 UPDATE_STAT_BE(PsInvocations
, _mm_popcnt_u32(_simd_movemask_ps(vCoverageMask
)));
558 state
.psState
.pfnPixelShader(GetPrivateState(pDC
), &psContext
);
559 AR_END(BEPixelShader
, 0);
561 vCoverageMask
= _simd_castsi_ps(psContext
.activeMask
);
566 AR_BEGIN(BELateDepthTest
, pDC
->drawId
);
567 depthPassMask
= DepthStencilTest(&state
, work
.triFlags
.frontFacing
, work
.triFlags
.viewportIndex
,
568 psContext
.vZ
, pDepthBuffer
, vCoverageMask
, pStencilBuffer
, &stencilPassMask
);
569 AR_EVENT(LateDepthStencilInfoSingleSample(_simd_movemask_ps(depthPassMask
), _simd_movemask_ps(vCoverageMask
), _simd_movemask_ps(stencilPassMask
)));
570 AR_END(BELateDepthTest
, 0);
572 if (!_simd_movemask_ps(depthPassMask
))
574 // need to call depth/stencil write for stencil write
575 DepthStencilWrite(&state
.vp
[work
.triFlags
.viewportIndex
], &state
.depthStencilState
, work
.triFlags
.frontFacing
, psContext
.vZ
,
576 pDepthBuffer
, depthPassMask
, vCoverageMask
, pStencilBuffer
, stencilPassMask
);
581 uint32_t statMask
= _simd_movemask_ps(depthPassMask
);
582 uint32_t statCount
= _mm_popcnt_u32(statMask
);
583 UPDATE_STAT_BE(DepthPassCount
, statCount
);
586 AR_BEGIN(BEOutputMerger
, pDC
->drawId
);
587 #if USE_8x2_TILE_BACKEND
588 OutputMerger(psContext
, pColorBuffer
, 0, &state
.blendState
, state
.pfnBlendFunc
, vCoverageMask
, depthPassMask
, state
.psState
.numRenderTargets
, useAlternateOffset
);
590 OutputMerger(psContext
, pColorBuffer
, 0, &state
.blendState
, state
.pfnBlendFunc
, vCoverageMask
, depthPassMask
, state
.psState
.numRenderTargets
);
593 // do final depth write after all pixel kills
594 if (!state
.psState
.forceEarlyZ
)
596 DepthStencilWrite(&state
.vp
[work
.triFlags
.viewportIndex
], &state
.depthStencilState
, work
.triFlags
.frontFacing
, psContext
.vZ
,
597 pDepthBuffer
, depthPassMask
, vCoverageMask
, pStencilBuffer
, stencilPassMask
);
599 AR_END(BEOutputMerger
, 0);
603 AR_BEGIN(BEEndTile
, pDC
->drawId
);
605 work
.coverageMask
[0] >>= (SIMD_TILE_Y_DIM
* SIMD_TILE_X_DIM
);
606 if(T::InputCoverage
== SWR_INPUT_COVERAGE_INNER_CONSERVATIVE
)
608 work
.innerCoverageMask
>>= (SIMD_TILE_Y_DIM
* SIMD_TILE_X_DIM
);
611 #if USE_8x2_TILE_BACKEND
612 if (useAlternateOffset
)
614 for (uint32_t rt
= 0; rt
< state
.psState
.numRenderTargets
; ++rt
)
616 pColorBuffer
[rt
] += (2 * KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_COLOR_HOT_TILE_FORMAT
>::bpp
) / 8;
620 for (uint32_t rt
= 0; rt
< state
.psState
.numRenderTargets
; ++rt
)
622 pColorBuffer
[rt
] += (KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_COLOR_HOT_TILE_FORMAT
>::bpp
) / 8;
625 pDepthBuffer
+= (KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_DEPTH_HOT_TILE_FORMAT
>::bpp
) / 8;
626 pStencilBuffer
+= (KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_STENCIL_HOT_TILE_FORMAT
>::bpp
) / 8;
628 AR_END(BEEndTile
, 0);
630 psContext
.vX
.UL
= _simd_add_ps(psContext
.vX
.UL
, dx
);
631 psContext
.vX
.center
= _simd_add_ps(psContext
.vX
.center
, dx
);
634 psContext
.vY
.UL
= _simd_add_ps(psContext
.vY
.UL
, dy
);
635 psContext
.vY
.center
= _simd_add_ps(psContext
.vY
.center
, dy
);
638 AR_END(BESingleSampleBackend
, 0);
642 void BackendSampleRate(DRAW_CONTEXT
*pDC
, uint32_t workerId
, uint32_t x
, uint32_t y
, SWR_TRIANGLE_DESC
&work
, RenderOutputBuffers
&renderBuffers
)
644 SWR_CONTEXT
*pContext
= pDC
->pContext
;
646 AR_BEGIN(BESampleRateBackend
, pDC
->drawId
);
647 AR_BEGIN(BESetup
, pDC
->drawId
);
649 const API_STATE
&state
= GetApiState(pDC
);
651 BarycentricCoeffs coeffs
;
652 SetupBarycentricCoeffs(&coeffs
, work
);
654 uint8_t *pColorBuffer
[SWR_NUM_RENDERTARGETS
], *pDepthBuffer
, *pStencilBuffer
;
655 SetupRenderBuffers(pColorBuffer
, &pDepthBuffer
, &pStencilBuffer
, state
.psState
.numRenderTargets
, renderBuffers
);
657 SWR_PS_CONTEXT psContext
;
658 SetupPixelShaderContext
<T
>(&psContext
, work
);
662 psContext
.vY
.UL
= _simd_add_ps(vULOffsetsY
, _simd_set1_ps(static_cast<float>(y
)));
663 psContext
.vY
.center
= _simd_add_ps(vCenterOffsetsY
, _simd_set1_ps(static_cast<float>(y
)));
665 const simdscalar dy
= _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM
));
667 for (uint32_t yy
= y
; yy
< y
+ KNOB_TILE_Y_DIM
; yy
+= SIMD_TILE_Y_DIM
)
669 psContext
.vX
.UL
= _simd_add_ps(vULOffsetsX
, _simd_set1_ps(static_cast<float>(x
)));
670 psContext
.vX
.center
= _simd_add_ps(vCenterOffsetsX
, _simd_set1_ps(static_cast<float>(x
)));
672 const simdscalar dx
= _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM
));
674 for (uint32_t xx
= x
; xx
< x
+ KNOB_TILE_X_DIM
; xx
+= SIMD_TILE_X_DIM
)
676 #if USE_8x2_TILE_BACKEND
677 const bool useAlternateOffset
= ((xx
& SIMD_TILE_X_DIM
) != 0);
680 if (T::InputCoverage
!= SWR_INPUT_COVERAGE_NONE
)
682 const uint64_t* pCoverageMask
= (T::InputCoverage
== SWR_INPUT_COVERAGE_INNER_CONSERVATIVE
) ? &work
.innerCoverageMask
: &work
.coverageMask
[0];
684 generateInputCoverage
<T
, T::InputCoverage
>(pCoverageMask
, psContext
.inputMask
, state
.blendState
.sampleMask
);
687 AR_BEGIN(BEBarycentric
, pDC
->drawId
);
689 CalcPixelBarycentrics(coeffs
, psContext
);
691 CalcCentroid
<T
, false>(&psContext
, coeffs
, work
.coverageMask
, state
.blendState
.sampleMask
);
693 AR_END(BEBarycentric
, 0);
695 for (uint32_t sample
= 0; sample
< T::MultisampleT::numSamples
; sample
++)
697 simdmask coverageMask
= work
.coverageMask
[sample
] & MASK
;
701 // offset depth/stencil buffers current sample
702 uint8_t *pDepthSample
= pDepthBuffer
+ RasterTileDepthOffset(sample
);
703 uint8_t *pStencilSample
= pStencilBuffer
+ RasterTileStencilOffset(sample
);
705 if (state
.depthHottileEnable
&& state
.depthBoundsState
.depthBoundsTestEnable
)
707 static_assert(KNOB_DEPTH_HOT_TILE_FORMAT
== R32_FLOAT
, "Unsupported depth hot tile format");
709 const simdscalar z
= _simd_load_ps(reinterpret_cast<const float *>(pDepthSample
));
711 const float minz
= state
.depthBoundsState
.depthBoundsTestMinValue
;
712 const float maxz
= state
.depthBoundsState
.depthBoundsTestMaxValue
;
714 coverageMask
&= CalcDepthBoundsAcceptMask(z
, minz
, maxz
);
717 AR_BEGIN(BEBarycentric
, pDC
->drawId
);
719 // calculate per sample positions
720 psContext
.vX
.sample
= _simd_add_ps(psContext
.vX
.UL
, T::MultisampleT::vX(sample
));
721 psContext
.vY
.sample
= _simd_add_ps(psContext
.vY
.UL
, T::MultisampleT::vY(sample
));
723 CalcSampleBarycentrics(coeffs
, psContext
);
725 // interpolate and quantize z
726 psContext
.vZ
= vplaneps(coeffs
.vZa
, coeffs
.vZb
, coeffs
.vZc
, psContext
.vI
.sample
, psContext
.vJ
.sample
);
727 psContext
.vZ
= state
.pfnQuantizeDepth(psContext
.vZ
);
729 AR_END(BEBarycentric
, 0);
731 // interpolate user clip distance if available
732 if (state
.rastState
.clipDistanceMask
)
734 coverageMask
&= ~ComputeUserClipMask(state
.rastState
.clipDistanceMask
, work
.pUserClipBuffer
, psContext
.vI
.sample
, psContext
.vJ
.sample
);
737 simdscalar vCoverageMask
= vMask(coverageMask
);
738 simdscalar depthPassMask
= vCoverageMask
;
739 simdscalar stencilPassMask
= vCoverageMask
;
744 AR_BEGIN(BEEarlyDepthTest
, pDC
->drawId
);
745 depthPassMask
= DepthStencilTest(&state
, work
.triFlags
.frontFacing
, work
.triFlags
.viewportIndex
,
746 psContext
.vZ
, pDepthSample
, vCoverageMask
, pStencilSample
, &stencilPassMask
);
747 AR_EVENT(EarlyDepthStencilInfoSampleRate(_simd_movemask_ps(depthPassMask
), _simd_movemask_ps(vCoverageMask
), _simd_movemask_ps(stencilPassMask
)));
748 AR_END(BEEarlyDepthTest
, 0);
750 // early-exit if no samples passed depth or earlyZ is forced on.
751 if (state
.psState
.forceEarlyZ
|| !_simd_movemask_ps(depthPassMask
))
753 DepthStencilWrite(&state
.vp
[work
.triFlags
.viewportIndex
], &state
.depthStencilState
, work
.triFlags
.frontFacing
, psContext
.vZ
,
754 pDepthSample
, depthPassMask
, vCoverageMask
, pStencilSample
, stencilPassMask
);
756 if (!_simd_movemask_ps(depthPassMask
))
758 work
.coverageMask
[sample
] >>= (SIMD_TILE_Y_DIM
* SIMD_TILE_X_DIM
);
764 psContext
.sampleIndex
= sample
;
765 psContext
.activeMask
= _simd_castps_si(vCoverageMask
);
767 // execute pixel shader
768 AR_BEGIN(BEPixelShader
, pDC
->drawId
);
769 UPDATE_STAT_BE(PsInvocations
, _mm_popcnt_u32(_simd_movemask_ps(vCoverageMask
)));
770 state
.psState
.pfnPixelShader(GetPrivateState(pDC
), &psContext
);
771 AR_END(BEPixelShader
, 0);
773 vCoverageMask
= _simd_castsi_ps(psContext
.activeMask
);
778 AR_BEGIN(BELateDepthTest
, pDC
->drawId
);
779 depthPassMask
= DepthStencilTest(&state
, work
.triFlags
.frontFacing
, work
.triFlags
.viewportIndex
,
780 psContext
.vZ
, pDepthSample
, vCoverageMask
, pStencilSample
, &stencilPassMask
);
781 AR_EVENT(LateDepthStencilInfoSampleRate(_simd_movemask_ps(depthPassMask
), _simd_movemask_ps(vCoverageMask
), _simd_movemask_ps(stencilPassMask
)));
782 AR_END(BELateDepthTest
, 0);
784 if (!_simd_movemask_ps(depthPassMask
))
786 // need to call depth/stencil write for stencil write
787 DepthStencilWrite(&state
.vp
[work
.triFlags
.viewportIndex
], &state
.depthStencilState
, work
.triFlags
.frontFacing
, psContext
.vZ
,
788 pDepthSample
, depthPassMask
, vCoverageMask
, pStencilSample
, stencilPassMask
);
790 work
.coverageMask
[sample
] >>= (SIMD_TILE_Y_DIM
* SIMD_TILE_X_DIM
);
795 uint32_t statMask
= _simd_movemask_ps(depthPassMask
);
796 uint32_t statCount
= _mm_popcnt_u32(statMask
);
797 UPDATE_STAT_BE(DepthPassCount
, statCount
);
800 AR_BEGIN(BEOutputMerger
, pDC
->drawId
);
801 #if USE_8x2_TILE_BACKEND
802 OutputMerger(psContext
, pColorBuffer
, sample
, &state
.blendState
, state
.pfnBlendFunc
, vCoverageMask
, depthPassMask
, state
.psState
.numRenderTargets
, useAlternateOffset
);
804 OutputMerger(psContext
, pColorBuffer
, sample
, &state
.blendState
, state
.pfnBlendFunc
, vCoverageMask
, depthPassMask
, state
.psState
.numRenderTargets
);
807 // do final depth write after all pixel kills
808 if (!state
.psState
.forceEarlyZ
)
810 DepthStencilWrite(&state
.vp
[work
.triFlags
.viewportIndex
], &state
.depthStencilState
, work
.triFlags
.frontFacing
, psContext
.vZ
,
811 pDepthSample
, depthPassMask
, vCoverageMask
, pStencilSample
, stencilPassMask
);
813 AR_END(BEOutputMerger
, 0);
815 work
.coverageMask
[sample
] >>= (SIMD_TILE_Y_DIM
* SIMD_TILE_X_DIM
);
821 AR_BEGIN(BEEndTile
, pDC
->drawId
);
823 if(T::InputCoverage
== SWR_INPUT_COVERAGE_INNER_CONSERVATIVE
)
825 work
.innerCoverageMask
>>= (SIMD_TILE_Y_DIM
* SIMD_TILE_X_DIM
);
828 #if USE_8x2_TILE_BACKEND
829 if (useAlternateOffset
)
831 for (uint32_t rt
= 0; rt
< state
.psState
.numRenderTargets
; ++rt
)
833 pColorBuffer
[rt
] += (2 * KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_COLOR_HOT_TILE_FORMAT
>::bpp
) / 8;
837 for (uint32_t rt
= 0; rt
< state
.psState
.numRenderTargets
; ++rt
)
839 pColorBuffer
[rt
] += (KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_COLOR_HOT_TILE_FORMAT
>::bpp
) / 8;
842 pDepthBuffer
+= (KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_DEPTH_HOT_TILE_FORMAT
>::bpp
) / 8;
843 pStencilBuffer
+= (KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_STENCIL_HOT_TILE_FORMAT
>::bpp
) / 8;
845 AR_END(BEEndTile
, 0);
847 psContext
.vX
.UL
= _simd_add_ps(psContext
.vX
.UL
, dx
);
848 psContext
.vX
.center
= _simd_add_ps(psContext
.vX
.center
, dx
);
851 psContext
.vY
.UL
= _simd_add_ps(psContext
.vY
.UL
, dy
);
852 psContext
.vY
.center
= _simd_add_ps(psContext
.vY
.center
, dy
);
855 AR_END(BESampleRateBackend
, 0);
//////////////////////////////////////////////////////////////////////////
/// @brief Pixel-rate MSAA backend. Walks a tile in SIMD-tile steps; for each
///        step it runs the pixel shader once and then hands the shader result
///        to the output merger for each output-merger sample.
/// @param pDC - pointer to draw context; supplies drawId, pContext and the API state.
/// @param workerId - worker ID, forwarded to the PixelRateZTestLoop helper.
/// @param x - x coordinate the walk starts from.
/// @param y - y coordinate the walk starts from.
/// @param work - triangle descriptor; its coverage masks are shifted in place
///        as each SIMD tile is consumed.
/// @param renderBuffers - passed to SetupRenderBuffers to obtain the
///        color/depth/stencil pointers.
/// NOTE(review): this listing carries no brace, else, #else/#endif or label
/// lines, so the exact nesting of the statements below cannot be confirmed
/// from here.
859 void BackendPixelRate(DRAW_CONTEXT
*pDC
, uint32_t workerId
, uint32_t x
, uint32_t y
, SWR_TRIANGLE_DESC
&work
, RenderOutputBuffers
&renderBuffers
)
// pContext is not referenced by name in the visible statements; presumably
// the AR_* / UPDATE_STAT_BE macros use it -- TODO confirm.
861 SWR_CONTEXT
*pContext
= pDC
->pContext
;
863 AR_BEGIN(BEPixelRateBackend
, pDC
->drawId
);
864 AR_BEGIN(BESetup
, pDC
->drawId
);
// Per-triangle setup: API state, barycentric coefficients, buffer pointers
// and the pixel shader context.
866 const API_STATE
&state
= GetApiState(pDC
);
868 BarycentricCoeffs coeffs
;
869 SetupBarycentricCoeffs(&coeffs
, work
);
871 uint8_t *pColorBuffer
[SWR_NUM_RENDERTARGETS
], *pDepthBuffer
, *pStencilBuffer
;
872 SetupRenderBuffers(pColorBuffer
, &pDepthBuffer
, &pStencilBuffer
, state
.psState
.numRenderTargets
, renderBuffers
);
874 SWR_PS_CONTEXT psContext
;
875 SetupPixelShaderContext
<T
>(&psContext
, work
);
// Helper object invoked below for the early and late depth tests; the sample
// loop later reads its vCoverageMask / depthPassMask / vZ / stencilPassMask.
879 PixelRateZTestLoop
<T
> PixelRateZTest(pDC
, workerId
, work
, coeffs
, state
, pDepthBuffer
, pStencilBuffer
, state
.rastState
.clipDistanceMask
);
// Seed the Y positions (upper-left and center) from the y argument.
881 psContext
.vY
.UL
= _simd_add_ps(vULOffsetsY
, _simd_set1_ps(static_cast<float>(y
)));
882 psContext
.vY
.center
= _simd_add_ps(vCenterOffsetsY
, _simd_set1_ps(static_cast<float>(y
)));
884 const simdscalar dy
= _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM
));
// Walk KNOB_TILE_Y_DIM rows in SIMD_TILE_Y_DIM steps.
886 for(uint32_t yy
= y
; yy
< y
+ KNOB_TILE_Y_DIM
; yy
+= SIMD_TILE_Y_DIM
)
// Re-seed the X positions from the x argument.
888 psContext
.vX
.UL
= _simd_add_ps(vULOffsetsX
, _simd_set1_ps(static_cast<float>(x
)));
889 psContext
.vX
.center
= _simd_add_ps(vCenterOffsetsX
, _simd_set1_ps(static_cast<float>(x
)));
891 const simdscalar dx
= _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM
));
// Walk KNOB_TILE_X_DIM columns in SIMD_TILE_X_DIM steps.
893 for(uint32_t xx
= x
; xx
< x
+ KNOB_TILE_X_DIM
; xx
+= SIMD_TILE_X_DIM
)
895 #if USE_8x2_TILE_BACKEND
// True when the SIMD_TILE_X_DIM bit of xx is set.
896 const bool useAlternateOffset
= ((xx
& SIMD_TILE_X_DIM
) != 0);
899 simdscalar activeLanes
;
// No covered samples in this SIMD tile: jump to Endtile (the label line itself
// is not present in this listing).
900 if(!(work
.anyCoveredSamples
& MASK
)) {goto Endtile
;};
901 activeLanes
= vMask(work
.anyCoveredSamples
& MASK
);
// Build psContext.inputMask when the traits request input coverage;
// inner-conservative mode reads innerCoverageMask, otherwise coverageMask.
903 if (T::InputCoverage
!= SWR_INPUT_COVERAGE_NONE
)
905 const uint64_t* pCoverageMask
= (T::InputCoverage
== SWR_INPUT_COVERAGE_INNER_CONSERVATIVE
) ? &work
.innerCoverageMask
: &work
.coverageMask
[0];
907 generateInputCoverage
<T
, T::InputCoverage
>(pCoverageMask
, psContext
.inputMask
, state
.blendState
.sampleMask
);
910 AR_BEGIN(BEBarycentric
, pDC
->drawId
);
912 CalcPixelBarycentrics(coeffs
, psContext
);
914 CalcCentroid
<T
, false>(&psContext
, coeffs
, work
.coverageMask
, state
.blendState
.sampleMask
);
916 AR_END(BEBarycentric
, 0);
918 if(T::bForcedSampleCount
)
920 // candidate pixels (that passed coverage) will cause shader invocation if any bits in the samplemask are set
921 const simdscalar vSampleMask
= _simd_castsi_ps(_simd_cmpgt_epi32(_simd_set1_epi32(state
.blendState
.sampleMask
), _simd_setzero_si()));
922 activeLanes
= _simd_and_ps(activeLanes
, vSampleMask
);
// Early depth test: only when the traits allow early Z and the sample count
// is not forced.
926 if(T::bCanEarlyZ
&& !T::bForcedSampleCount
)
928 uint32_t depthPassCount
= PixelRateZTest(activeLanes
, psContext
, BEEarlyDepthTest
);
929 UPDATE_STAT_BE(DepthPassCount
, depthPassCount
);
930 AR_EVENT(EarlyDepthInfoPixelRate(depthPassCount
, _simd_movemask_ps(activeLanes
)));
933 // if we have no covered samples that passed depth at this point, go to next tile
934 if(!_simd_movemask_ps(activeLanes
)) { goto Endtile
; };
// Z is interpolated at the pixel center only when the shader reads source depth.
936 if(state
.psState
.usesSourceDepth
)
938 AR_BEGIN(BEBarycentric
, pDC
->drawId
);
939 // interpolate and quantize z
940 psContext
.vZ
= vplaneps(coeffs
.vZa
, coeffs
.vZb
, coeffs
.vZc
, psContext
.vI
.center
, psContext
.vJ
.center
);
941 psContext
.vZ
= state
.pfnQuantizeDepth(psContext
.vZ
);
942 AR_END(BEBarycentric
, 0);
945 // pixels that are currently active
946 psContext
.activeMask
= _simd_castps_si(activeLanes
);
947 psContext
.oMask
= T::MultisampleT::FullSampleMask();
949 // execute pixel shader
950 AR_BEGIN(BEPixelShader
, pDC
->drawId
);
951 state
.psState
.pfnPixelShader(GetPrivateState(pDC
), &psContext
);
952 UPDATE_STAT_BE(PsInvocations
, _mm_popcnt_u32(_simd_movemask_ps(activeLanes
)));
953 AR_END(BEPixelShader
, 0);
955 // update active lanes to remove any discarded or oMask'd pixels
956 activeLanes
= _simd_castsi_ps(_simd_and_si(psContext
.activeMask
, _simd_cmpgt_epi32(psContext
.oMask
, _simd_setzero_si())));
957 if(!_simd_movemask_ps(activeLanes
)) { goto Endtile
; };
// Late depth test: runs after the shader when early Z was not possible and
// the sample count is not forced.
960 if(!T::bCanEarlyZ
&& !T::bForcedSampleCount
)
962 uint32_t depthPassCount
= PixelRateZTest(activeLanes
, psContext
, BELateDepthTest
);
963 UPDATE_STAT_BE(DepthPassCount
, depthPassCount
);
964 AR_EVENT(LateDepthInfoPixelRate(depthPassCount
, _simd_movemask_ps(activeLanes
)));
967 // if we have no covered samples that passed depth at this point, skip OM and go to next tile
968 if(!_simd_movemask_ps(activeLanes
)) { goto Endtile
; };
971 // loop over all samples, broadcasting the results of the PS to all passing pixels
972 for(uint32_t sample
= 0; sample
< GetNumOMSamples
<T
>(state
.blendState
.sampleCount
); sample
++)
974 AR_BEGIN(BEOutputMerger
, pDC
->drawId
);
975 // center pattern does a single coverage/depth/stencil test, standard pattern tests all samples
976 uint32_t coverageSampleNum
= (T::bIsStandardPattern
) ? sample
: 0;
977 simdscalar coverageMask
, depthMask
;
// Forced sample count uses the post-shader active lanes for both masks; the
// statements after it take the masks from the Z-test helper instead
// (presumably the else branch -- the else line is not in this listing).
978 if(T::bForcedSampleCount
)
980 coverageMask
= depthMask
= activeLanes
;
984 coverageMask
= PixelRateZTest
.vCoverageMask
[coverageSampleNum
];
985 depthMask
= PixelRateZTest
.depthPassMask
[coverageSampleNum
];
986 if(!_simd_movemask_ps(depthMask
))
988 // stencil should already have been written in early/lateZ tests
989 AR_END(BEOutputMerger
, 0);
994 // broadcast the results of the PS to all passing pixels
// Two OutputMerger call forms follow; the first additionally passes
// useAlternateOffset and sits under the USE_8x2_TILE_BACKEND conditional.
995 #if USE_8x2_TILE_BACKEND
996 OutputMerger(psContext
, pColorBuffer
, sample
, &state
.blendState
, state
.pfnBlendFunc
, coverageMask
, depthMask
, state
.psState
.numRenderTargets
, useAlternateOffset
);
998 OutputMerger(psContext
, pColorBuffer
, sample
, &state
.blendState
, state
.pfnBlendFunc
, coverageMask
, depthMask
, state
.psState
.numRenderTargets
);
// Depth/stencil write for this sample, skipped when forceEarlyZ is set or the
// sample count is forced.
1001 if(!state
.psState
.forceEarlyZ
&& !T::bForcedSampleCount
)
1003 uint8_t *pDepthSample
= pDepthBuffer
+ RasterTileDepthOffset(sample
);
1004 uint8_t * pStencilSample
= pStencilBuffer
+ RasterTileStencilOffset(sample
);
1006 DepthStencilWrite(&state
.vp
[work
.triFlags
.viewportIndex
], &state
.depthStencilState
, work
.triFlags
.frontFacing
, PixelRateZTest
.vZ
[coverageSampleNum
],
1007 pDepthSample
, depthMask
, coverageMask
, pStencilSample
, PixelRateZTest
.stencilPassMask
[coverageSampleNum
]);
1009 AR_END(BEOutputMerger
, 0);
// End-of-tile bookkeeping: shift this SIMD tile's bits out of every coverage mask.
1012 AR_BEGIN(BEEndTile
, pDC
->drawId
);
1014 for(uint32_t sample
= 0; sample
< T::MultisampleT::numCoverageSamples
; sample
++)
1016 work
.coverageMask
[sample
] >>= (SIMD_TILE_Y_DIM
* SIMD_TILE_X_DIM
);
1019 if(T::InputCoverage
== SWR_INPUT_COVERAGE_INNER_CONSERVATIVE
)
1021 work
.innerCoverageMask
>>= (SIMD_TILE_Y_DIM
* SIMD_TILE_X_DIM
);
1023 work
.anyCoveredSamples
>>= (SIMD_TILE_Y_DIM
* SIMD_TILE_X_DIM
);
// Advance the buffer pointers; the 8x2 form steps the color pointers by two
// SIMD widths, and only when useAlternateOffset is set.
1025 #if USE_8x2_TILE_BACKEND
1026 if (useAlternateOffset
)
1028 for (uint32_t rt
= 0; rt
< state
.psState
.numRenderTargets
; ++rt
)
1030 pColorBuffer
[rt
] += (2 * KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_COLOR_HOT_TILE_FORMAT
>::bpp
) / 8;
1034 for(uint32_t rt
= 0; rt
< state
.psState
.numRenderTargets
; ++rt
)
1036 pColorBuffer
[rt
] += (KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_COLOR_HOT_TILE_FORMAT
>::bpp
) / 8;
1038 pDepthBuffer
+= (KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_DEPTH_HOT_TILE_FORMAT
>::bpp
) / 8;
1039 pStencilBuffer
+= (KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_STENCIL_HOT_TILE_FORMAT
>::bpp
) / 8;
1042 AR_END(BEEndTile
, 0);
// Step the X positions by one SIMD tile.
1044 psContext
.vX
.UL
= _simd_add_ps(psContext
.vX
.UL
, dx
);
1045 psContext
.vX
.center
= _simd_add_ps(psContext
.vX
.center
, dx
);
// Step the Y positions by one SIMD tile.
1048 psContext
.vY
.UL
= _simd_add_ps(psContext
.vY
.UL
, dy
);
1049 psContext
.vY
.center
= _simd_add_ps(psContext
.vY
.center
, dy
);
1052 AR_END(BEPixelRateBackend
, 0);
//////////////////////////////////////////////////////////////////////////
/// @brief Backend used with a NULL pixel shader: for every sample enabled in
///        the blend-state sample mask it interpolates Z and performs the
///        depth/stencil test and write. No pixel shader call and no color
///        output appear in the visible statements.
/// @tparam sampleCountT - sample count fed to SwrBackendTraits (always with
///         SWR_MSAA_STANDARD_PATTERN, see the todo below).
/// @param pDC - pointer to draw context; supplies drawId, pContext and the API state.
/// @param workerId - not referenced by name in the visible statements.
/// @param x - x coordinate the walk starts from.
/// @param y - y coordinate the walk starts from.
/// @param work - triangle descriptor; coverageMask[sample] is shifted in place.
/// @param renderBuffers - passed to SetupRenderBuffers to obtain the
///        depth/stencil pointers.
/// NOTE(review): this listing carries no brace, guard or label lines, so the
/// exact nesting of the statements below cannot be confirmed from here.
1054 // optimized backend flow with NULL PS
1055 template<uint32_t sampleCountT
>
1056 void BackendNullPS(DRAW_CONTEXT
*pDC
, uint32_t workerId
, uint32_t x
, uint32_t y
, SWR_TRIANGLE_DESC
&work
, RenderOutputBuffers
&renderBuffers
)
1058 SWR_CONTEXT
*pContext
= pDC
->pContext
;
1060 AR_BEGIN(BENullBackend
, pDC
->drawId
);
1061 ///@todo: handle center multisample pattern
1062 typedef SwrBackendTraits
<sampleCountT
, SWR_MSAA_STANDARD_PATTERN
> T
;
1063 AR_BEGIN(BESetup
, pDC
->drawId
);
1065 const API_STATE
&state
= GetApiState(pDC
);
1067 BarycentricCoeffs coeffs
;
1068 SetupBarycentricCoeffs(&coeffs
, work
);
// Only depth and stencil pointers are requested: the color array argument is
// NULL and the render target count is 0.
1070 uint8_t *pDepthBuffer
, *pStencilBuffer
;
1071 SetupRenderBuffers(NULL
, &pDepthBuffer
, &pStencilBuffer
, 0, renderBuffers
);
// psContext serves as scratch storage for sample positions, barycentrics and Z.
1073 SWR_PS_CONTEXT psContext
;
1074 // skip SetupPixelShaderContext(&psContext, ...); // not needed here
1078 simdscalar vYSamplePosUL
= _simd_add_ps(vULOffsetsY
, _simd_set1_ps(static_cast<float>(y
)));
1080 const simdscalar dy
= _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM
));
// Walk KNOB_TILE_Y_DIM rows in SIMD_TILE_Y_DIM steps.
1082 for (uint32_t yy
= y
; yy
< y
+ KNOB_TILE_Y_DIM
; yy
+= SIMD_TILE_Y_DIM
)
1084 simdscalar vXSamplePosUL
= _simd_add_ps(vULOffsetsX
, _simd_set1_ps(static_cast<float>(x
)));
1086 const simdscalar dx
= _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM
));
// Walk KNOB_TILE_X_DIM columns in SIMD_TILE_X_DIM steps.
1088 for (uint32_t xx
= x
; xx
< x
+ KNOB_TILE_X_DIM
; xx
+= SIMD_TILE_X_DIM
)
1090 // iterate over active samples
// Each pass takes the lowest set bit of the local sampleMask copy as the
// sample index and then clears that bit.
1091 unsigned long sample
= 0;
1092 uint32_t sampleMask
= state
.blendState
.sampleMask
;
1093 while (_BitScanForward(&sample
, sampleMask
))
1095 sampleMask
&= ~(1 << sample
);
1097 simdmask coverageMask
= work
.coverageMask
[sample
] & MASK
;
1101 // offset depth/stencil buffers current sample
1102 uint8_t *pDepthSample
= pDepthBuffer
+ RasterTileDepthOffset(sample
);
1103 uint8_t *pStencilSample
= pStencilBuffer
+ RasterTileStencilOffset(sample
);
// Depth bounds test: loads the stored depth values for this sample and drops
// coverage bits not accepted by CalcDepthBoundsAcceptMask for [minz, maxz].
1105 if (state
.depthHottileEnable
&& state
.depthBoundsState
.depthBoundsTestEnable
)
1107 static_assert(KNOB_DEPTH_HOT_TILE_FORMAT
== R32_FLOAT
, "Unsupported depth hot tile format");
1109 const simdscalar z
= _simd_load_ps(reinterpret_cast<const float *>(pDepthSample
));
1111 const float minz
= state
.depthBoundsState
.depthBoundsTestMinValue
;
1112 const float maxz
= state
.depthBoundsState
.depthBoundsTestMaxValue
;
1114 coverageMask
&= CalcDepthBoundsAcceptMask(z
, minz
, maxz
);
1117 AR_BEGIN(BEBarycentric
, pDC
->drawId
);
1119 // calculate per sample positions
1120 psContext
.vX
.sample
= _simd_add_ps(vXSamplePosUL
, T::MultisampleT::vX(sample
));
1121 psContext
.vY
.sample
= _simd_add_ps(vYSamplePosUL
, T::MultisampleT::vY(sample
));
1123 CalcSampleBarycentrics(coeffs
, psContext
);
1125 // interpolate and quantize z
1126 psContext
.vZ
= vplaneps(coeffs
.vZa
, coeffs
.vZb
, coeffs
.vZc
, psContext
.vI
.sample
, psContext
.vJ
.sample
);
1127 psContext
.vZ
= state
.pfnQuantizeDepth(psContext
.vZ
);
1129 AR_END(BEBarycentric
, 0);
1131 // interpolate user clip distance if available
1132 if (state
.rastState
.clipDistanceMask
)
1134 coverageMask
&= ~ComputeUserClipMask(state
.rastState
.clipDistanceMask
, work
.pUserClipBuffer
, psContext
.vI
.sample
, psContext
.vJ
.sample
);
// Expand the coverage bits into a SIMD lane mask; the stencil pass mask
// starts out equal to it and is passed to DepthStencilTest by pointer.
1137 simdscalar vCoverageMask
= vMask(coverageMask
);
1138 simdscalar stencilPassMask
= vCoverageMask
;
// Depth/stencil test immediately followed by the depth/stencil write.
1140 AR_BEGIN(BEEarlyDepthTest
, pDC
->drawId
);
1141 simdscalar depthPassMask
= DepthStencilTest(&state
, work
.triFlags
.frontFacing
, work
.triFlags
.viewportIndex
,
1142 psContext
.vZ
, pDepthSample
, vCoverageMask
, pStencilSample
, &stencilPassMask
);
1143 AR_EVENT(EarlyDepthStencilInfoNullPS(_simd_movemask_ps(depthPassMask
), _simd_movemask_ps(vCoverageMask
), _simd_movemask_ps(stencilPassMask
)));
1144 DepthStencilWrite(&state
.vp
[work
.triFlags
.viewportIndex
], &state
.depthStencilState
, work
.triFlags
.frontFacing
, psContext
.vZ
,
1145 pDepthSample
, depthPassMask
, vCoverageMask
, pStencilSample
, stencilPassMask
);
1146 AR_END(BEEarlyDepthTest
, 0);
// Count the lanes that passed depth and add them to the DepthPassCount stat.
1148 uint32_t statMask
= _simd_movemask_ps(depthPassMask
);
1149 uint32_t statCount
= _mm_popcnt_u32(statMask
);
1150 UPDATE_STAT_BE(DepthPassCount
, statCount
);
// Shift this SIMD tile's bits out of the sample's coverage mask.
1155 work
.coverageMask
[sample
] >>= (SIMD_TILE_Y_DIM
* SIMD_TILE_X_DIM
);
// Advance the depth/stencil pointers by one SIMD width of pixels.
1158 pDepthBuffer
+= (KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_DEPTH_HOT_TILE_FORMAT
>::bpp
) / 8;
1159 pStencilBuffer
+= (KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_STENCIL_HOT_TILE_FORMAT
>::bpp
) / 8;
// Step the upper-left sample positions in X and then Y by one SIMD tile.
1161 vXSamplePosUL
= _simd_add_ps(vXSamplePosUL
, dx
);
1164 vYSamplePosUL
= _simd_add_ps(vYSamplePosUL
, dy
);
1167 AR_END(BENullBackend
, 0);
1170 void InitClearTilesTable()
1172 memset(sClearTilesTable
, 0, sizeof(sClearTilesTable
));
1174 sClearTilesTable
[R8G8B8A8_UNORM
] = ClearMacroTile
<R8G8B8A8_UNORM
>;
1175 sClearTilesTable
[B8G8R8A8_UNORM
] = ClearMacroTile
<B8G8R8A8_UNORM
>;
1176 sClearTilesTable
[R32_FLOAT
] = ClearMacroTile
<R32_FLOAT
>;
1177 sClearTilesTable
[R32G32B32A32_FLOAT
] = ClearMacroTile
<R32G32B32A32_FLOAT
>;
1178 sClearTilesTable
[R8_UINT
] = ClearMacroTile
<R8_UINT
>;
1181 PFN_BACKEND_FUNC gBackendNullPs
[SWR_MULTISAMPLE_TYPE_COUNT
];
1182 PFN_BACKEND_FUNC gBackendSingleSample
[SWR_INPUT_COVERAGE_COUNT
]
1186 PFN_BACKEND_FUNC gBackendPixelRateTable
[SWR_MULTISAMPLE_TYPE_COUNT
]
1187 [SWR_MSAA_SAMPLE_PATTERN_COUNT
]
1188 [SWR_INPUT_COVERAGE_COUNT
]
1190 [2] // forcedSampleCount
1193 PFN_BACKEND_FUNC gBackendSampleRateTable
[SWR_MULTISAMPLE_TYPE_COUNT
]
1194 [SWR_INPUT_COVERAGE_COUNT
]
1199 // Recursive template used to auto-nest conditionals. Converts dynamic enum function
1200 // arguments to static template arguments.
1201 template <uint32_t... ArgsT
>
1204 // Last Arg Terminator
1205 static PFN_BACKEND_FUNC
GetFunc(SWR_BACKEND_FUNCS tArg
)
1209 case SWR_BACKEND_SINGLE_SAMPLE
: return BackendSingleSample
<SwrBackendTraits
<ArgsT
...>>; break;
1210 case SWR_BACKEND_MSAA_PIXEL_RATE
: return BackendPixelRate
<SwrBackendTraits
<ArgsT
...>>; break;
1211 case SWR_BACKEND_MSAA_SAMPLE_RATE
: return BackendSampleRate
<SwrBackendTraits
<ArgsT
...>>; break;
1213 SWR_ASSERT(0 && "Invalid backend func\n");
1219 // Recursively parse args
1220 template <typename
... TArgsT
>
1221 static PFN_BACKEND_FUNC
GetFunc(SWR_MSAA_SAMPLE_PATTERN tArg
, TArgsT
... remainingArgs
)
1225 case SWR_MSAA_CENTER_PATTERN
: return BEChooser
<ArgsT
..., SWR_MSAA_CENTER_PATTERN
>::GetFunc(remainingArgs
...); break;
1226 case SWR_MSAA_STANDARD_PATTERN
: return BEChooser
<ArgsT
..., SWR_MSAA_STANDARD_PATTERN
>::GetFunc(remainingArgs
...); break;
1228 SWR_ASSERT(0 && "Invalid sample pattern\n");
1229 return BEChooser
<ArgsT
..., SWR_MSAA_STANDARD_PATTERN
>::GetFunc(remainingArgs
...);
1234 // Recursively parse args
1235 template <typename
... TArgsT
>
1236 static PFN_BACKEND_FUNC
GetFunc(SWR_INPUT_COVERAGE tArg
, TArgsT
... remainingArgs
)
1240 case SWR_INPUT_COVERAGE_NONE
: return BEChooser
<ArgsT
..., SWR_INPUT_COVERAGE_NONE
>::GetFunc(remainingArgs
...); break;
1241 case SWR_INPUT_COVERAGE_NORMAL
: return BEChooser
<ArgsT
..., SWR_INPUT_COVERAGE_NORMAL
>::GetFunc(remainingArgs
...); break;
1242 case SWR_INPUT_COVERAGE_INNER_CONSERVATIVE
: return BEChooser
<ArgsT
..., SWR_INPUT_COVERAGE_INNER_CONSERVATIVE
>::GetFunc(remainingArgs
...); break;
1244 SWR_ASSERT(0 && "Invalid sample pattern\n");
1245 return BEChooser
<ArgsT
..., SWR_INPUT_COVERAGE_NONE
>::GetFunc(remainingArgs
...);
1250 // Recursively parse args
1251 template <typename
... TArgsT
>
1252 static PFN_BACKEND_FUNC
GetFunc(SWR_MULTISAMPLE_COUNT tArg
, TArgsT
... remainingArgs
)
1256 case SWR_MULTISAMPLE_1X
: return BEChooser
<ArgsT
..., SWR_MULTISAMPLE_1X
>::GetFunc(remainingArgs
...); break;
1257 case SWR_MULTISAMPLE_2X
: return BEChooser
<ArgsT
..., SWR_MULTISAMPLE_2X
>::GetFunc(remainingArgs
...); break;
1258 case SWR_MULTISAMPLE_4X
: return BEChooser
<ArgsT
..., SWR_MULTISAMPLE_4X
>::GetFunc(remainingArgs
...); break;
1259 case SWR_MULTISAMPLE_8X
: return BEChooser
<ArgsT
..., SWR_MULTISAMPLE_8X
>::GetFunc(remainingArgs
...); break;
1260 case SWR_MULTISAMPLE_16X
: return BEChooser
<ArgsT
..., SWR_MULTISAMPLE_16X
>::GetFunc(remainingArgs
...); break;
1262 SWR_ASSERT(0 && "Invalid sample count\n");
1263 return BEChooser
<ArgsT
..., SWR_MULTISAMPLE_1X
>::GetFunc(remainingArgs
...);
1268 // Recursively parse args
1269 template <typename
... TArgsT
>
1270 static PFN_BACKEND_FUNC
GetFunc(bool tArg
, TArgsT
... remainingArgs
)
1274 return BEChooser
<ArgsT
..., 1>::GetFunc(remainingArgs
...);
1277 return BEChooser
<ArgsT
..., 0>::GetFunc(remainingArgs
...);
1281 void InitBackendSingleFuncTable(PFN_BACKEND_FUNC (&table
)[SWR_INPUT_COVERAGE_COUNT
][2][2])
1283 for(uint32_t inputCoverage
= 0; inputCoverage
< SWR_INPUT_COVERAGE_COUNT
; inputCoverage
++)
1285 for(uint32_t isCentroid
= 0; isCentroid
< 2; isCentroid
++)
1287 for(uint32_t canEarlyZ
= 0; canEarlyZ
< 2; canEarlyZ
++)
1289 table
[inputCoverage
][isCentroid
][canEarlyZ
] =
1290 BEChooser
<>::GetFunc(SWR_MULTISAMPLE_1X
, SWR_MSAA_STANDARD_PATTERN
, (SWR_INPUT_COVERAGE
)inputCoverage
,
1291 (isCentroid
> 0), false, (canEarlyZ
> 0), SWR_BACKEND_SINGLE_SAMPLE
);
1297 void InitBackendPixelFuncTable(PFN_BACKEND_FUNC (&table
)[SWR_MULTISAMPLE_TYPE_COUNT
][SWR_MSAA_SAMPLE_PATTERN_COUNT
][SWR_INPUT_COVERAGE_COUNT
][2][2][2])
1299 for(uint32_t sampleCount
= SWR_MULTISAMPLE_1X
; sampleCount
< SWR_MULTISAMPLE_TYPE_COUNT
; sampleCount
++)
1301 for(uint32_t samplePattern
= SWR_MSAA_CENTER_PATTERN
; samplePattern
< SWR_MSAA_SAMPLE_PATTERN_COUNT
; samplePattern
++)
1303 for(uint32_t inputCoverage
= 0; inputCoverage
< SWR_INPUT_COVERAGE_COUNT
; inputCoverage
++)
1305 for(uint32_t isCentroid
= 0; isCentroid
< 2; isCentroid
++)
1307 for(uint32_t forcedSampleCount
= 0; forcedSampleCount
< 2; forcedSampleCount
++)
1309 for(uint32_t canEarlyZ
= 0; canEarlyZ
< 2; canEarlyZ
++)
1311 table
[sampleCount
][samplePattern
][inputCoverage
][isCentroid
][forcedSampleCount
][canEarlyZ
] =
1312 BEChooser
<>::GetFunc((SWR_MULTISAMPLE_COUNT
)sampleCount
, (SWR_MSAA_SAMPLE_PATTERN
)samplePattern
, (SWR_INPUT_COVERAGE
)inputCoverage
,
1313 (isCentroid
> 0), (forcedSampleCount
> 0), (canEarlyZ
> 0), SWR_BACKEND_MSAA_PIXEL_RATE
);
1322 void InitBackendSampleFuncTable(PFN_BACKEND_FUNC (&table
)[SWR_MULTISAMPLE_TYPE_COUNT
][SWR_INPUT_COVERAGE_COUNT
][2][2])
1324 for(uint32_t sampleCount
= SWR_MULTISAMPLE_1X
; sampleCount
< SWR_MULTISAMPLE_TYPE_COUNT
; sampleCount
++)
1326 for(uint32_t inputCoverage
= 0; inputCoverage
< SWR_INPUT_COVERAGE_COUNT
; inputCoverage
++)
1328 for(uint32_t centroid
= 0; centroid
< 2; centroid
++)
1330 for(uint32_t canEarlyZ
= 0; canEarlyZ
< 2; canEarlyZ
++)
1332 table
[sampleCount
][inputCoverage
][centroid
][canEarlyZ
] =
1333 BEChooser
<>::GetFunc((SWR_MULTISAMPLE_COUNT
)sampleCount
, SWR_MSAA_STANDARD_PATTERN
, (SWR_INPUT_COVERAGE
)inputCoverage
,
1334 (centroid
> 0), false, (canEarlyZ
> 0), (SWR_BACKEND_FUNCS
)SWR_BACKEND_MSAA_SAMPLE_RATE
);
1341 void InitBackendFuncTables()
1343 InitBackendSingleFuncTable(gBackendSingleSample
);
1344 InitBackendPixelFuncTable(gBackendPixelRateTable
);
1345 InitBackendSampleFuncTable(gBackendSampleRateTable
);
1347 gBackendNullPs
[SWR_MULTISAMPLE_1X
] = &BackendNullPS
< SWR_MULTISAMPLE_1X
> ;
1348 gBackendNullPs
[SWR_MULTISAMPLE_2X
] = &BackendNullPS
< SWR_MULTISAMPLE_2X
> ;
1349 gBackendNullPs
[SWR_MULTISAMPLE_4X
] = &BackendNullPS
< SWR_MULTISAMPLE_4X
> ;
1350 gBackendNullPs
[SWR_MULTISAMPLE_8X
] = &BackendNullPS
< SWR_MULTISAMPLE_8X
> ;
1351 gBackendNullPs
[SWR_MULTISAMPLE_16X
] = &BackendNullPS
< SWR_MULTISAMPLE_16X
> ;