1 /****************************************************************************
2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
25 * @brief Backend handles rasterization, pixel shading and output merger
28 ******************************************************************************/
30 #include <smmintrin.h>
34 #include "memory/tilingtraits.h"
35 #include "core/multisample.h"
39 typedef void(*PFN_CLEAR_TILES
)(DRAW_CONTEXT
*, SWR_RENDERTARGET_ATTACHMENT rt
, uint32_t, uint32_t, DWORD
[4], const SWR_RECT
& rect
);
40 static PFN_CLEAR_TILES sClearTilesTable
[NUM_SWR_FORMATS
];
43 //////////////////////////////////////////////////////////////////////////
44 /// @brief Process compute work.
45 /// @param pDC - pointer to draw context (dispatch).
46 /// @param workerId - The unique worker ID that is assigned to this thread.
47 /// @param threadGroupId - the linear index for the thread group within the dispatch.
48 void ProcessComputeBE(DRAW_CONTEXT
* pDC
, uint32_t workerId
, uint32_t threadGroupId
, void*& pSpillFillBuffer
, void*& pScratchSpace
)
50 SWR_CONTEXT
*pContext
= pDC
->pContext
;
52 AR_BEGIN(BEDispatch
, pDC
->drawId
);
54 const COMPUTE_DESC
* pTaskData
= (COMPUTE_DESC
*)pDC
->pDispatch
->GetTasksData();
55 SWR_ASSERT(pTaskData
!= nullptr);
57 // Ensure spill fill memory has been allocated.
58 size_t spillFillSize
= pDC
->pState
->state
.totalSpillFillSize
;
59 if (spillFillSize
&& pSpillFillBuffer
== nullptr)
61 pSpillFillBuffer
= pDC
->pArena
->AllocAlignedSync(spillFillSize
, KNOB_SIMD_BYTES
);
64 size_t scratchSpaceSize
= pDC
->pState
->state
.scratchSpaceSize
* pDC
->pState
->state
.scratchSpaceNumInstances
;
65 if (scratchSpaceSize
&& pScratchSpace
== nullptr)
67 pScratchSpace
= pDC
->pArena
->AllocAlignedSync(scratchSpaceSize
, KNOB_SIMD_BYTES
);
70 const API_STATE
& state
= GetApiState(pDC
);
72 SWR_CS_CONTEXT csContext
{ 0 };
73 csContext
.tileCounter
= threadGroupId
;
74 csContext
.dispatchDims
[0] = pTaskData
->threadGroupCountX
;
75 csContext
.dispatchDims
[1] = pTaskData
->threadGroupCountY
;
76 csContext
.dispatchDims
[2] = pTaskData
->threadGroupCountZ
;
77 csContext
.pTGSM
= pContext
->ppScratch
[workerId
];
78 csContext
.pSpillFillBuffer
= (uint8_t*)pSpillFillBuffer
;
79 csContext
.pScratchSpace
= (uint8_t*)pScratchSpace
;
80 csContext
.scratchSpacePerSimd
= pDC
->pState
->state
.scratchSpaceSize
;
82 state
.pfnCsFunc(GetPrivateState(pDC
), &csContext
);
84 UPDATE_STAT_BE(CsInvocations
, state
.totalThreadsInGroup
);
86 AR_END(BEDispatch
, 1);
89 //////////////////////////////////////////////////////////////////////////
90 /// @brief Process shutdown.
91 /// @param pDC - pointer to draw context (dispatch).
92 /// @param workerId - The unique worker ID that is assigned to this thread.
93 /// @param threadGroupId - the linear index for the thread group within the dispatch.
94 void ProcessShutdownBE(DRAW_CONTEXT
*pDC
, uint32_t workerId
, uint32_t macroTile
, void *pUserData
)
99 void ProcessSyncBE(DRAW_CONTEXT
*pDC
, uint32_t workerId
, uint32_t macroTile
, void *pUserData
)
102 MacroTileMgr::getTileIndices(macroTile
, x
, y
);
103 SWR_ASSERT(x
== 0 && y
== 0);
106 template<SWR_FORMAT format
>
107 void ClearRasterTile(uint8_t *pTileBuffer
, simdvector
&value
)
109 auto lambda
= [&](int32_t comp
)
111 FormatTraits
<format
>::storeSOA(comp
, pTileBuffer
, value
.v
[comp
]);
113 pTileBuffer
+= (KNOB_SIMD_WIDTH
* FormatTraits
<format
>::GetBPC(comp
) / 8);
116 const uint32_t numIter
= (KNOB_TILE_Y_DIM
/ SIMD_TILE_Y_DIM
) * (KNOB_TILE_X_DIM
/ SIMD_TILE_X_DIM
);
118 for (uint32_t i
= 0; i
< numIter
; ++i
)
120 UnrollerL
<0, FormatTraits
<format
>::numComps
, 1>::step(lambda
);
#if USE_8x2_TILE_BACKEND
//////////////////////////////////////////////////////////////////////////
/// @brief SIMD16 overload: fills one raster tile with a pre-packed clear value, one SIMD16 tile at a time.
/// @param pTileBuffer - start of the raster tile; taken by value, so the caller's pointer is not advanced.
/// @param value - clear value, one SIMD16 register per component, already in hot-tile format.
template<SWR_FORMAT format>
void ClearRasterTile(uint8_t *pTileBuffer, simd16vector &value)
{
    using Traits = FormatTraits<format>;

    // Number of SIMD16 tiles that make up a single raster tile.
    const uint32_t simdTilesPerRasterTile = (KNOB_TILE_Y_DIM / SIMD16_TILE_Y_DIM) * (KNOB_TILE_X_DIM / SIMD16_TILE_X_DIM);

    // Stores one component in SOA layout, then steps the write cursor past the bytes just written.
    auto storeComponent = [&pTileBuffer, &value](int32_t comp)
    {
        Traits::storeSOA(comp, pTileBuffer, value.v[comp]);

        pTileBuffer += (KNOB_SIMD16_WIDTH * Traits::GetBPC(comp) / 8);
    };

    for (uint32_t simdTile = 0; simdTile != simdTilesPerRasterTile; ++simdTile)
    {
        UnrollerL<0, Traits::numComps, 1>::step(storeComponent);
    }
}
#endif // USE_8x2_TILE_BACKEND
// ClearMacroTile<format>: clears the part of one macrotile's hot tile that is selected by `rect`.
// Steps visible below:
//   1. convert the clear value in `clear` (RGBA float/uint32 bits) to the hot-tile format: components
//      reported as normalized are scaled by fromFloat() and converted to integer, then each component
//      is packed and stored at its swizzled position in vClear;
//   2. build the macrotile rectangle, translate it to the hot tile's local origin and shift it down
//      to raster-tile units;
//   3. walk the covered raster tiles and clear every sample of each one with ClearRasterTile<format>;
//   4. mark the hot tile HOTTILE_DIRTY.
// NOTE(review): the declarations of vClear / vComp / clearTile, the #else/#endif belonging to the #if
// below, and the statements announced by the "intersect with clear rect" and "Make maximums inclusive"
// comments are not visible in this extract - confirm against the full file.
144 template<SWR_FORMAT format
>
145 INLINE
void ClearMacroTile(DRAW_CONTEXT
*pDC
, SWR_RENDERTARGET_ATTACHMENT rt
, uint32_t macroTile
, uint32_t renderTargetArrayIndex
, DWORD clear
[4], const SWR_RECT
& rect
)
147 // convert clear color to hottile format
148 // clear color is in RGBA float/uint32
149 #if USE_8x2_TILE_BACKEND
// SIMD16 variant of the conversion loop.
151 for (uint32_t comp
= 0; comp
< FormatTraits
<format
>::numComps
; ++comp
)
154 vComp
= _simd16_load1_ps((const float*)&clear
[comp
]);
155 if (FormatTraits
<format
>::isNormalized(comp
))
157 vComp
= _simd16_mul_ps(vComp
, _simd16_set1_ps(FormatTraits
<format
>::fromFloat(comp
)));
158 vComp
= _simd16_castsi_ps(_simd16_cvtps_epi32(vComp
));
160 vComp
= FormatTraits
<format
>::pack(comp
, vComp
);
161 vClear
.v
[FormatTraits
<format
>::swizzle(comp
)] = vComp
;
// Same conversion loop using the native-width SIMD intrinsics.
166 for (uint32_t comp
= 0; comp
< FormatTraits
<format
>::numComps
; ++comp
)
169 vComp
= _simd_load1_ps((const float*)&clear
[comp
]);
170 if (FormatTraits
<format
>::isNormalized(comp
))
172 vComp
= _simd_mul_ps(vComp
, _simd_set1_ps(FormatTraits
<format
>::fromFloat(comp
)));
173 vComp
= _simd_castsi_ps(_simd_cvtps_epi32(vComp
));
175 vComp
= FormatTraits
<format
>::pack(comp
, vComp
);
176 vClear
.v
[FormatTraits
<format
>::swizzle(comp
)] = vComp
;
180 uint32_t tileX
, tileY
;
181 MacroTileMgr::getTileIndices(macroTile
, tileX
, tileY
);
183 // Init to full macrotile
186 KNOB_MACROTILE_X_DIM
* int32_t(tileX
),
187 KNOB_MACROTILE_Y_DIM
* int32_t(tileY
),
188 KNOB_MACROTILE_X_DIM
* int32_t(tileX
+ 1),
189 KNOB_MACROTILE_Y_DIM
* int32_t(tileY
+ 1),
192 // intersect with clear rect
195 // translate to local hottile origin
196 clearTile
.Translate(-int32_t(tileX
) * KNOB_MACROTILE_X_DIM
, -int32_t(tileY
) * KNOB_MACROTILE_Y_DIM
);
198 // Make maximums inclusive (needed for convert to raster tiles)
202 // convert to raster tiles
203 clearTile
.ymin
>>= (KNOB_TILE_Y_DIM_SHIFT
);
204 clearTile
.ymax
>>= (KNOB_TILE_Y_DIM_SHIFT
);
205 clearTile
.xmin
>>= (KNOB_TILE_X_DIM_SHIFT
);
206 clearTile
.xmax
>>= (KNOB_TILE_X_DIM_SHIFT
);
208 const int32_t numSamples
= GetNumSamples(pDC
->pState
->state
.rastState
.sampleCount
);
209 // compute steps between raster tile samples / raster tiles / macro tile rows
210 const uint32_t rasterTileSampleStep
= KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* FormatTraits
<format
>::bpp
/ 8;
211 const uint32_t rasterTileStep
= (KNOB_TILE_X_DIM
* KNOB_TILE_Y_DIM
* (FormatTraits
<format
>::bpp
/ 8)) * numSamples
;
212 const uint32_t macroTileRowStep
= (KNOB_MACROTILE_X_DIM
/ KNOB_TILE_X_DIM
) * rasterTileStep
;
213 const uint32_t pitch
= (FormatTraits
<format
>::bpp
* KNOB_MACROTILE_X_DIM
/ 8);
215 HOTTILE
*pHotTile
= pDC
->pContext
->pHotTileMgr
->GetHotTile(pDC
->pContext
, pDC
, macroTile
, rt
, true, numSamples
, renderTargetArrayIndex
);
// The single-sample tile offset is scaled by numSamples, matching rasterTileStep above.
216 uint32_t rasterTileStartOffset
= (ComputeTileOffset2D
< TilingTraits
<SWR_TILE_SWRZ
, FormatTraits
<format
>::bpp
> >(pitch
, clearTile
.xmin
, clearTile
.ymin
)) * numSamples
;
217 uint8_t* pRasterTileRow
= pHotTile
->pBuffer
+ rasterTileStartOffset
; //(ComputeTileOffset2D< TilingTraits<SWR_TILE_SWRZ, FormatTraits<format>::bpp > >(pitch, x, y)) * numSamples;
219 // loop over all raster tiles in the current hot tile
220 for (int32_t y
= clearTile
.ymin
; y
<= clearTile
.ymax
; ++y
)
222 uint8_t* pRasterTile
= pRasterTileRow
;
223 for (int32_t x
= clearTile
.xmin
; x
<= clearTile
.xmax
; ++x
)
// Each sample of a raster tile is cleared separately; the cursor advances one sample plane per call.
225 for( int32_t sampleNum
= 0; sampleNum
< numSamples
; sampleNum
++)
227 ClearRasterTile
<format
>(pRasterTile
, vClear
);
228 pRasterTile
+= rasterTileSampleStep
;
231 pRasterTileRow
+= macroTileRowStep
;
// The cleared contents now differ from the backing surface, so the tile is marked dirty.
234 pHotTile
->state
= HOTTILE_DIRTY
;
// ProcessClearBE: back-end handler for a clear work item on one macrotile; pUserData is a CLEAR_DESC.
// Two code paths are visible below:
//   - a deferred path that copies the clear values into each selected hot tile's clearData and marks
//     the tile HOTTILE_CLEAR, without touching the tile contents;
//   - an immediate path that looks up the per-format routine in sClearTilesTable and clears the tile
//     contents inside pClear->rect.
// NOTE(review): the condition choosing between the two paths, the clearData declarations, the matching
// AR_END calls, and any statement removing the scanned bit from `mask` inside the _BitScanForward loops
// are not visible in this extract - confirm against the full file.
238 void ProcessClearBE(DRAW_CONTEXT
*pDC
, uint32_t workerId
, uint32_t macroTile
, void *pUserData
)
240 SWR_CONTEXT
*pContext
= pDC
->pContext
;
// ---- Deferred path: record the clear in the hot tiles. ----
244 CLEAR_DESC
*pClear
= (CLEAR_DESC
*)pUserData
;
245 SWR_MULTISAMPLE_COUNT sampleCount
= pDC
->pState
->state
.rastState
.sampleCount
;
246 uint32_t numSamples
= GetNumSamples(sampleCount
);
248 SWR_ASSERT(pClear
->attachmentMask
!= 0); // shouldn't be here without a reason.
250 AR_BEGIN(BEClear
, pDC
->drawId
);
252 if (pClear
->attachmentMask
& SWR_ATTACHMENT_MASK_COLOR
)
254 unsigned long rt
= 0;
255 uint32_t mask
= pClear
->attachmentMask
& SWR_ATTACHMENT_MASK_COLOR
;
256 while (_BitScanForward(&rt
, mask
))
260 HOTTILE
*pHotTile
= pContext
->pHotTileMgr
->GetHotTile(pContext
, pDC
, macroTile
, (SWR_RENDERTARGET_ATTACHMENT
)rt
, true, numSamples
, pClear
->renderTargetArrayIndex
);
262 // All we want to do here is to mark the hot tile as being in a "needs clear" state.
263 pHotTile
->clearData
[0] = *(DWORD
*)&(pClear
->clearRTColor
[0]);
264 pHotTile
->clearData
[1] = *(DWORD
*)&(pClear
->clearRTColor
[1]);
265 pHotTile
->clearData
[2] = *(DWORD
*)&(pClear
->clearRTColor
[2]);
266 pHotTile
->clearData
[3] = *(DWORD
*)&(pClear
->clearRTColor
[3]);
267 pHotTile
->state
= HOTTILE_CLEAR
;
271 if (pClear
->attachmentMask
& SWR_ATTACHMENT_DEPTH_BIT
)
273 HOTTILE
*pHotTile
= pContext
->pHotTileMgr
->GetHotTile(pContext
, pDC
, macroTile
, SWR_ATTACHMENT_DEPTH
, true, numSamples
, pClear
->renderTargetArrayIndex
);
// The depth value's raw bits are stored in the DWORD slot.
274 pHotTile
->clearData
[0] = *(DWORD
*)&pClear
->clearDepth
;
275 pHotTile
->state
= HOTTILE_CLEAR
;
278 if (pClear
->attachmentMask
& SWR_ATTACHMENT_STENCIL_BIT
)
280 HOTTILE
*pHotTile
= pContext
->pHotTileMgr
->GetHotTile(pContext
, pDC
, macroTile
, SWR_ATTACHMENT_STENCIL
, true, numSamples
, pClear
->renderTargetArrayIndex
);
282 pHotTile
->clearData
[0] = pClear
->clearStencil
;
283 pHotTile
->state
= HOTTILE_CLEAR
;
// ---- Immediate path: clear the hot tile contents now. ----
291 CLEAR_DESC
*pClear
= (CLEAR_DESC
*)pUserData
;
292 AR_BEGIN(BEClear
, pDC
->drawId
);
294 if (pClear
->attachmentMask
& SWR_ATTACHMENT_MASK_COLOR
)
297 clearData
[0] = *(DWORD
*)&(pClear
->clearRTColor
[0]);
298 clearData
[1] = *(DWORD
*)&(pClear
->clearRTColor
[1]);
299 clearData
[2] = *(DWORD
*)&(pClear
->clearRTColor
[2]);
300 clearData
[3] = *(DWORD
*)&(pClear
->clearRTColor
[3]);
302 PFN_CLEAR_TILES pfnClearTiles
= sClearTilesTable
[KNOB_COLOR_HOT_TILE_FORMAT
];
303 SWR_ASSERT(pfnClearTiles
!= nullptr);
305 unsigned long rt
= 0;
306 uint32_t mask
= pClear
->attachmentMask
& SWR_ATTACHMENT_MASK_COLOR
;
307 while (_BitScanForward(&rt
, mask
))
311 pfnClearTiles(pDC
, (SWR_RENDERTARGET_ATTACHMENT
)rt
, macroTile
, pClear
->renderTargetArrayIndex
, clearData
, pClear
->rect
);
315 if (pClear
->attachmentMask
& SWR_ATTACHMENT_DEPTH_BIT
)
318 clearData
[0] = *(DWORD
*)&pClear
->clearDepth
;
319 PFN_CLEAR_TILES pfnClearTiles
= sClearTilesTable
[KNOB_DEPTH_HOT_TILE_FORMAT
];
320 SWR_ASSERT(pfnClearTiles
!= nullptr);
322 pfnClearTiles(pDC
, SWR_ATTACHMENT_DEPTH
, macroTile
, pClear
->renderTargetArrayIndex
, clearData
, pClear
->rect
);
325 if (pClear
->attachmentMask
& SWR_ATTACHMENT_STENCIL_BIT
)
328 clearData
[0] = pClear
->clearStencil
;
// NOTE(review): unlike the color and depth lookups, no non-null assert on the table entry is visible here.
329 PFN_CLEAR_TILES pfnClearTiles
= sClearTilesTable
[KNOB_STENCIL_HOT_TILE_FORMAT
];
331 pfnClearTiles(pDC
, SWR_ATTACHMENT_STENCIL
, macroTile
, pClear
->renderTargetArrayIndex
, clearData
, pClear
->rect
);
338 void ProcessStoreTileBE(DRAW_CONTEXT
*pDC
, uint32_t workerId
, uint32_t macroTile
, STORE_TILES_DESC
* pDesc
,
339 SWR_RENDERTARGET_ATTACHMENT attachment
)
341 SWR_CONTEXT
*pContext
= pDC
->pContext
;
343 AR_BEGIN(BEStoreTiles
, pDC
->drawId
);
345 SWR_FORMAT srcFormat
;
348 case SWR_ATTACHMENT_COLOR0
:
349 case SWR_ATTACHMENT_COLOR1
:
350 case SWR_ATTACHMENT_COLOR2
:
351 case SWR_ATTACHMENT_COLOR3
:
352 case SWR_ATTACHMENT_COLOR4
:
353 case SWR_ATTACHMENT_COLOR5
:
354 case SWR_ATTACHMENT_COLOR6
:
355 case SWR_ATTACHMENT_COLOR7
: srcFormat
= KNOB_COLOR_HOT_TILE_FORMAT
; break;
356 case SWR_ATTACHMENT_DEPTH
: srcFormat
= KNOB_DEPTH_HOT_TILE_FORMAT
; break;
357 case SWR_ATTACHMENT_STENCIL
: srcFormat
= KNOB_STENCIL_HOT_TILE_FORMAT
; break;
358 default: SWR_INVALID("Unknown attachment: %d", attachment
); srcFormat
= KNOB_COLOR_HOT_TILE_FORMAT
; break;
362 MacroTileMgr::getTileIndices(macroTile
, x
, y
);
364 // Only need to store the hottile if it's been rendered to...
365 HOTTILE
*pHotTile
= pContext
->pHotTileMgr
->GetHotTileNoLoad(pContext
, pDC
, macroTile
, attachment
, false);
368 // clear if clear is pending (i.e., not rendered to), then mark as dirty for store.
369 if (pHotTile
->state
== HOTTILE_CLEAR
)
371 PFN_CLEAR_TILES pfnClearTiles
= sClearTilesTable
[srcFormat
];
372 SWR_ASSERT(pfnClearTiles
!= nullptr);
374 pfnClearTiles(pDC
, attachment
, macroTile
, pHotTile
->renderTargetArrayIndex
, pHotTile
->clearData
, pDesc
->rect
);
377 if (pHotTile
->state
== HOTTILE_DIRTY
|| pDesc
->postStoreTileState
== (SWR_TILE_STATE
)HOTTILE_DIRTY
)
379 int32_t destX
= KNOB_MACROTILE_X_DIM
* x
;
380 int32_t destY
= KNOB_MACROTILE_Y_DIM
* y
;
382 pContext
->pfnStoreTile(GetPrivateState(pDC
), srcFormat
,
383 attachment
, destX
, destY
, pHotTile
->renderTargetArrayIndex
, pHotTile
->pBuffer
);
387 if (pHotTile
->state
== HOTTILE_DIRTY
|| pHotTile
->state
== HOTTILE_RESOLVED
)
389 if (!(pDesc
->postStoreTileState
== (SWR_TILE_STATE
)HOTTILE_DIRTY
&& pHotTile
->state
== HOTTILE_RESOLVED
))
391 pHotTile
->state
= (HOTTILE_STATE
)pDesc
->postStoreTileState
;
395 AR_END(BEStoreTiles
, 1);
398 void ProcessStoreTilesBE(DRAW_CONTEXT
*pDC
, uint32_t workerId
, uint32_t macroTile
, void *pData
)
400 STORE_TILES_DESC
*pDesc
= (STORE_TILES_DESC
*)pData
;
402 unsigned long rt
= 0;
403 uint32_t mask
= pDesc
->attachmentMask
;
404 while (_BitScanForward(&rt
, mask
))
407 ProcessStoreTileBE(pDC
, workerId
, macroTile
, pDesc
, (SWR_RENDERTARGET_ATTACHMENT
)rt
);
411 void ProcessDiscardInvalidateTilesBE(DRAW_CONTEXT
*pDC
, uint32_t workerId
, uint32_t macroTile
, void *pData
)
413 DISCARD_INVALIDATE_TILES_DESC
*pDesc
= (DISCARD_INVALIDATE_TILES_DESC
*)pData
;
414 SWR_CONTEXT
*pContext
= pDC
->pContext
;
416 const int32_t numSamples
= GetNumSamples(pDC
->pState
->state
.rastState
.sampleCount
);
418 for (uint32_t i
= 0; i
< SWR_NUM_ATTACHMENTS
; ++i
)
420 if (pDesc
->attachmentMask
& (1 << i
))
422 HOTTILE
*pHotTile
= pContext
->pHotTileMgr
->GetHotTileNoLoad(
423 pContext
, pDC
, macroTile
, (SWR_RENDERTARGET_ATTACHMENT
)i
, pDesc
->createNewTiles
, numSamples
);
426 pHotTile
->state
= (HOTTILE_STATE
)pDesc
->newTileState
;
432 #if KNOB_SIMD_WIDTH == 8
433 const simdscalar vCenterOffsetsX
= __m256
{0.5, 1.5, 0.5, 1.5, 2.5, 3.5, 2.5, 3.5};
434 const simdscalar vCenterOffsetsY
= __m256
{0.5, 0.5, 1.5, 1.5, 0.5, 0.5, 1.5, 1.5};
435 const simdscalar vULOffsetsX
= __m256
{0.0, 1.0, 0.0, 1.0, 2.0, 3.0, 2.0, 3.0};
436 const simdscalar vULOffsetsY
= __m256
{0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0};
438 #error Unsupported vector width
441 simdmask
ComputeUserClipMask(uint8_t clipMask
, float* pUserClipBuffer
, simdscalar vI
, simdscalar vJ
)
443 simdscalar vClipMask
= _simd_setzero_ps();
444 uint32_t numClipDistance
= _mm_popcnt_u32(clipMask
);
446 for (uint32_t i
= 0; i
< numClipDistance
; ++i
)
448 // pull triangle clip distance values from clip buffer
449 simdscalar vA
= _simd_broadcast_ss(pUserClipBuffer
++);
450 simdscalar vB
= _simd_broadcast_ss(pUserClipBuffer
++);
451 simdscalar vC
= _simd_broadcast_ss(pUserClipBuffer
++);
454 simdscalar vInterp
= vplaneps(vA
, vB
, vC
, vI
, vJ
);
456 // clip if interpolated clip distance is < 0 || NAN
457 simdscalar vCull
= _simd_cmp_ps(_simd_setzero_ps(), vInterp
, _CMP_NLE_UQ
);
459 vClipMask
= _simd_or_ps(vClipMask
, vCull
);
462 return _simd_movemask_ps(vClipMask
);
// BackendSingleSample: shades one raster tile of a triangle at one sample per pixel.
// The tile at (x, y) is walked in SIMD_TILE_X_DIM x SIMD_TILE_Y_DIM steps. For each SIMD tile the
// visible code:
//   - takes the low bits of work.coverageMask[0], optionally narrowed by the depth-bounds test and
//     by user clip distances;
//   - computes pixel barycentrics and centroid, then interpolates and quantizes Z;
//   - runs the depth/stencil test (an early and a late call site are both visible), the pixel
//     shader, the output merger and the depth/stencil write;
//   - shifts the consumed bits out of the coverage masks and advances the color/depth/stencil
//     hot-tile pointers.
// NOTE(review): `T` and `MASK` are used but not declared in the visible lines, and the conditions
// selecting the early vs. late depth-test sections, the early-exit jumps and the #else/#endif of the
// #if blocks are not visible in this extract - confirm against the full file.
466 void BackendSingleSample(DRAW_CONTEXT
*pDC
, uint32_t workerId
, uint32_t x
, uint32_t y
, SWR_TRIANGLE_DESC
&work
, RenderOutputBuffers
&renderBuffers
)
468 SWR_CONTEXT
*pContext
= pDC
->pContext
;
470 AR_BEGIN(BESingleSampleBackend
, pDC
->drawId
);
471 AR_BEGIN(BESetup
, pDC
->drawId
);
473 const API_STATE
&state
= GetApiState(pDC
);
475 BarycentricCoeffs coeffs
;
476 SetupBarycentricCoeffs(&coeffs
, work
);
478 uint8_t *pColorBuffer
[SWR_NUM_RENDERTARGETS
], *pDepthBuffer
, *pStencilBuffer
;
479 SetupRenderBuffers(pColorBuffer
, &pDepthBuffer
, &pStencilBuffer
, state
.psState
.numRenderTargets
, renderBuffers
);
481 SWR_PS_CONTEXT psContext
;
482 const SWR_MULTISAMPLE_POS
& samplePos
= state
.rastState
.samplePositions
;
483 SetupPixelShaderContext
<T
>(&psContext
, samplePos
, work
);
// Per-lane Y positions for the first SIMD row; advanced by dy after each row.
487 psContext
.vY
.UL
= _simd_add_ps(vULOffsetsY
, _simd_set1_ps(static_cast<float>(y
)));
488 psContext
.vY
.center
= _simd_add_ps(vCenterOffsetsY
, _simd_set1_ps(static_cast<float>(y
)));
490 const simdscalar dy
= _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM
));
492 for (uint32_t yy
= y
; yy
< y
+ KNOB_TILE_Y_DIM
; yy
+= SIMD_TILE_Y_DIM
)
// X positions restart at the tile's left edge for every row; advanced by dx after each SIMD tile.
494 psContext
.vX
.UL
= _simd_add_ps(vULOffsetsX
, _simd_set1_ps(static_cast<float>(x
)));
495 psContext
.vX
.center
= _simd_add_ps(vCenterOffsetsX
, _simd_set1_ps(static_cast<float>(x
)));
497 const simdscalar dx
= _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM
));
499 for (uint32_t xx
= x
; xx
< x
+ KNOB_TILE_X_DIM
; xx
+= SIMD_TILE_X_DIM
)
501 #if USE_8x2_TILE_BACKEND
502 const bool useAlternateOffset
= ((xx
& SIMD_TILE_X_DIM
) != 0);
505 simdmask coverageMask
= work
.coverageMask
[0] & MASK
;
509 if (state
.depthHottileEnable
&& state
.depthBoundsState
.depthBoundsTestEnable
)
511 static_assert(KNOB_DEPTH_HOT_TILE_FORMAT
== R32_FLOAT
, "Unsupported depth hot tile format");
513 const simdscalar z
= _simd_load_ps(reinterpret_cast<const float *>(pDepthBuffer
));
515 const float minz
= state
.depthBoundsState
.depthBoundsTestMinValue
;
516 const float maxz
= state
.depthBoundsState
.depthBoundsTestMaxValue
;
518 coverageMask
&= CalcDepthBoundsAcceptMask(z
, minz
, maxz
);
521 if (T::InputCoverage
!= SWR_INPUT_COVERAGE_NONE
)
523 const uint64_t* pCoverageMask
= (T::InputCoverage
== SWR_INPUT_COVERAGE_INNER_CONSERVATIVE
) ? &work
.innerCoverageMask
: &work
.coverageMask
[0];
525 generateInputCoverage
<T
, T::InputCoverage
>(pCoverageMask
, psContext
.inputMask
, state
.blendState
.sampleMask
);
528 AR_BEGIN(BEBarycentric
, pDC
->drawId
);
530 CalcPixelBarycentrics(coeffs
, psContext
);
532 CalcCentroid
<T
, true>(&psContext
, samplePos
, coeffs
, work
.coverageMask
, state
.blendState
.sampleMask
);
534 // interpolate and quantize z
535 psContext
.vZ
= vplaneps(coeffs
.vZa
, coeffs
.vZb
, coeffs
.vZc
, psContext
.vI
.center
, psContext
.vJ
.center
);
536 psContext
.vZ
= state
.pfnQuantizeDepth(psContext
.vZ
);
538 AR_END(BEBarycentric
, 1);
540 // interpolate user clip distance if available
541 if (state
.rastState
.clipDistanceMask
)
543 coverageMask
&= ~ComputeUserClipMask(state
.rastState
.clipDistanceMask
, work
.pUserClipBuffer
, psContext
.vI
.center
, psContext
.vJ
.center
);
546 simdscalar vCoverageMask
= vMask(coverageMask
);
547 simdscalar depthPassMask
= vCoverageMask
;
548 simdscalar stencilPassMask
= vCoverageMask
;
// Early depth/stencil test section.
553 AR_BEGIN(BEEarlyDepthTest
, pDC
->drawId
);
554 depthPassMask
= DepthStencilTest(&state
, work
.triFlags
.frontFacing
, work
.triFlags
.viewportIndex
,
555 psContext
.vZ
, pDepthBuffer
, vCoverageMask
, pStencilBuffer
, &stencilPassMask
);
556 AR_EVENT(EarlyDepthStencilInfoSingleSample(_simd_movemask_ps(depthPassMask
), _simd_movemask_ps(stencilPassMask
), _simd_movemask_ps(vCoverageMask
)));
557 AR_END(BEEarlyDepthTest
, 0);
559 // early-exit if no pixels passed depth or earlyZ is forced on
560 if (state
.psState
.forceEarlyZ
|| !_simd_movemask_ps(depthPassMask
))
562 DepthStencilWrite(&state
.vp
[work
.triFlags
.viewportIndex
], &state
.depthStencilState
, work
.triFlags
.frontFacing
, psContext
.vZ
,
563 pDepthBuffer
, depthPassMask
, vCoverageMask
, pStencilBuffer
, stencilPassMask
);
565 if (!_simd_movemask_ps(depthPassMask
))
572 psContext
.sampleIndex
= 0;
573 psContext
.activeMask
= _simd_castps_si(vCoverageMask
);
575 // execute pixel shader
576 AR_BEGIN(BEPixelShader
, pDC
->drawId
);
577 UPDATE_STAT_BE(PsInvocations
, _mm_popcnt_u32(_simd_movemask_ps(vCoverageMask
)));
578 state
.psState
.pfnPixelShader(GetPrivateState(pDC
), &psContext
);
579 AR_END(BEPixelShader
, 0);
// The shader may have cleared lanes in activeMask; coverage is re-read from it.
581 vCoverageMask
= _simd_castsi_ps(psContext
.activeMask
);
// Late depth/stencil test section.
586 AR_BEGIN(BELateDepthTest
, pDC
->drawId
);
587 depthPassMask
= DepthStencilTest(&state
, work
.triFlags
.frontFacing
, work
.triFlags
.viewportIndex
,
588 psContext
.vZ
, pDepthBuffer
, vCoverageMask
, pStencilBuffer
, &stencilPassMask
);
589 AR_EVENT(LateDepthStencilInfoSingleSample(_simd_movemask_ps(depthPassMask
), _simd_movemask_ps(stencilPassMask
), _simd_movemask_ps(vCoverageMask
)));
590 AR_END(BELateDepthTest
, 0);
592 if (!_simd_movemask_ps(depthPassMask
))
594 // need to call depth/stencil write for stencil write
595 DepthStencilWrite(&state
.vp
[work
.triFlags
.viewportIndex
], &state
.depthStencilState
, work
.triFlags
.frontFacing
, psContext
.vZ
,
596 pDepthBuffer
, depthPassMask
, vCoverageMask
, pStencilBuffer
, stencilPassMask
);
600 // for early z, consolidate discards from shader
601 // into depthPassMask
602 depthPassMask
= _simd_and_ps(depthPassMask
, vCoverageMask
);
605 uint32_t statMask
= _simd_movemask_ps(depthPassMask
);
606 uint32_t statCount
= _mm_popcnt_u32(statMask
);
607 UPDATE_STAT_BE(DepthPassCount
, statCount
);
610 AR_BEGIN(BEOutputMerger
, pDC
->drawId
);
611 #if USE_8x2_TILE_BACKEND
612 OutputMerger8x2(psContext
, pColorBuffer
, 0, &state
.blendState
, state
.pfnBlendFunc
, vCoverageMask
, depthPassMask
, state
.psState
.numRenderTargets
, state
.colorHottileEnable
, useAlternateOffset
);
614 OutputMerger4x2(psContext
, pColorBuffer
, 0, &state
.blendState
, state
.pfnBlendFunc
, vCoverageMask
, depthPassMask
, state
.psState
.numRenderTargets
);
617 // do final depth write after all pixel kills
618 if (!state
.psState
.forceEarlyZ
)
620 DepthStencilWrite(&state
.vp
[work
.triFlags
.viewportIndex
], &state
.depthStencilState
, work
.triFlags
.frontFacing
, psContext
.vZ
,
621 pDepthBuffer
, depthPassMask
, vCoverageMask
, pStencilBuffer
, stencilPassMask
);
623 AR_END(BEOutputMerger
, 0);
627 AR_BEGIN(BEEndTile
, pDC
->drawId
);
// Drop the coverage bits consumed by this SIMD tile so the next tile's bits become the low bits.
629 work
.coverageMask
[0] >>= (SIMD_TILE_Y_DIM
* SIMD_TILE_X_DIM
);
630 if(T::InputCoverage
== SWR_INPUT_COVERAGE_INNER_CONSERVATIVE
)
632 work
.innerCoverageMask
>>= (SIMD_TILE_Y_DIM
* SIMD_TILE_X_DIM
);
635 #if USE_8x2_TILE_BACKEND
636 if (useAlternateOffset
)
638 for (uint32_t rt
= 0; rt
< state
.psState
.numRenderTargets
; ++rt
)
640 pColorBuffer
[rt
] += (2 * KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_COLOR_HOT_TILE_FORMAT
>::bpp
) / 8;
644 for (uint32_t rt
= 0; rt
< state
.psState
.numRenderTargets
; ++rt
)
646 pColorBuffer
[rt
] += (KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_COLOR_HOT_TILE_FORMAT
>::bpp
) / 8;
649 pDepthBuffer
+= (KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_DEPTH_HOT_TILE_FORMAT
>::bpp
) / 8;
650 pStencilBuffer
+= (KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_STENCIL_HOT_TILE_FORMAT
>::bpp
) / 8;
652 AR_END(BEEndTile
, 0);
654 psContext
.vX
.UL
= _simd_add_ps(psContext
.vX
.UL
, dx
);
655 psContext
.vX
.center
= _simd_add_ps(psContext
.vX
.center
, dx
);
658 psContext
.vY
.UL
= _simd_add_ps(psContext
.vY
.UL
, dy
);
659 psContext
.vY
.center
= _simd_add_ps(psContext
.vY
.center
, dy
);
662 AR_END(BESingleSampleBackend
, 0);
// BackendSampleRate: shades one raster tile of a triangle with the pixel shader run once per sample.
// The tile at (x, y) is walked in SIMD_TILE_X_DIM x SIMD_TILE_Y_DIM steps. For each SIMD tile the
// visible code computes pixel barycentrics and centroid once, then loops over
// T::MultisampleT::numSamples. Per sample it:
//   - takes the low bits of work.coverageMask[sample], optionally narrowed by the depth-bounds test
//     and by user clip distances;
//   - offsets the depth/stencil pointers to that sample, computes sample positions and barycentrics,
//     then interpolates and quantizes Z;
//   - runs the depth/stencil test (an early and a late call site are both visible), the pixel
//     shader, the output merger and the depth/stencil write;
//   - shifts the consumed bits out of work.coverageMask[sample].
// After the sample loop the hot-tile pointers and pixel positions advance to the next SIMD tile.
// NOTE(review): `T` and `MASK` are used but not declared in the visible lines, and the conditions
// selecting the early vs. late depth-test sections, the early-exit jumps and the #else/#endif of the
// #if blocks are not visible in this extract - confirm against the full file.
666 void BackendSampleRate(DRAW_CONTEXT
*pDC
, uint32_t workerId
, uint32_t x
, uint32_t y
, SWR_TRIANGLE_DESC
&work
, RenderOutputBuffers
&renderBuffers
)
668 SWR_CONTEXT
*pContext
= pDC
->pContext
;
670 AR_BEGIN(BESampleRateBackend
, pDC
->drawId
);
671 AR_BEGIN(BESetup
, pDC
->drawId
);
673 const API_STATE
&state
= GetApiState(pDC
);
675 BarycentricCoeffs coeffs
;
676 SetupBarycentricCoeffs(&coeffs
, work
);
678 uint8_t *pColorBuffer
[SWR_NUM_RENDERTARGETS
], *pDepthBuffer
, *pStencilBuffer
;
679 SetupRenderBuffers(pColorBuffer
, &pDepthBuffer
, &pStencilBuffer
, state
.psState
.numRenderTargets
, renderBuffers
);
681 SWR_PS_CONTEXT psContext
;
682 const SWR_MULTISAMPLE_POS
& samplePos
= state
.rastState
.samplePositions
;
683 SetupPixelShaderContext
<T
>(&psContext
, samplePos
, work
);
// Per-lane Y positions for the first SIMD row; advanced by dy after each row.
687 psContext
.vY
.UL
= _simd_add_ps(vULOffsetsY
, _simd_set1_ps(static_cast<float>(y
)));
688 psContext
.vY
.center
= _simd_add_ps(vCenterOffsetsY
, _simd_set1_ps(static_cast<float>(y
)));
690 const simdscalar dy
= _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM
));
692 for (uint32_t yy
= y
; yy
< y
+ KNOB_TILE_Y_DIM
; yy
+= SIMD_TILE_Y_DIM
)
// X positions restart at the tile's left edge for every row; advanced by dx after each SIMD tile.
694 psContext
.vX
.UL
= _simd_add_ps(vULOffsetsX
, _simd_set1_ps(static_cast<float>(x
)));
695 psContext
.vX
.center
= _simd_add_ps(vCenterOffsetsX
, _simd_set1_ps(static_cast<float>(x
)));
697 const simdscalar dx
= _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM
));
699 for (uint32_t xx
= x
; xx
< x
+ KNOB_TILE_X_DIM
; xx
+= SIMD_TILE_X_DIM
)
701 #if USE_8x2_TILE_BACKEND
702 const bool useAlternateOffset
= ((xx
& SIMD_TILE_X_DIM
) != 0);
705 if (T::InputCoverage
!= SWR_INPUT_COVERAGE_NONE
)
707 const uint64_t* pCoverageMask
= (T::InputCoverage
== SWR_INPUT_COVERAGE_INNER_CONSERVATIVE
) ? &work
.innerCoverageMask
: &work
.coverageMask
[0];
709 generateInputCoverage
<T
, T::InputCoverage
>(pCoverageMask
, psContext
.inputMask
, state
.blendState
.sampleMask
);
712 AR_BEGIN(BEBarycentric
, pDC
->drawId
);
714 CalcPixelBarycentrics(coeffs
, psContext
);
716 CalcCentroid
<T
, false>(&psContext
, samplePos
, coeffs
, work
.coverageMask
, state
.blendState
.sampleMask
);
718 AR_END(BEBarycentric
, 0);
// Per-sample shading loop.
720 for (uint32_t sample
= 0; sample
< T::MultisampleT::numSamples
; sample
++)
722 simdmask coverageMask
= work
.coverageMask
[sample
] & MASK
;
726 // offset depth/stencil buffers current sample
727 uint8_t *pDepthSample
= pDepthBuffer
+ RasterTileDepthOffset(sample
);
728 uint8_t *pStencilSample
= pStencilBuffer
+ RasterTileStencilOffset(sample
);
730 if (state
.depthHottileEnable
&& state
.depthBoundsState
.depthBoundsTestEnable
)
732 static_assert(KNOB_DEPTH_HOT_TILE_FORMAT
== R32_FLOAT
, "Unsupported depth hot tile format");
734 const simdscalar z
= _simd_load_ps(reinterpret_cast<const float *>(pDepthSample
));
736 const float minz
= state
.depthBoundsState
.depthBoundsTestMinValue
;
737 const float maxz
= state
.depthBoundsState
.depthBoundsTestMaxValue
;
739 coverageMask
&= CalcDepthBoundsAcceptMask(z
, minz
, maxz
);
742 AR_BEGIN(BEBarycentric
, pDC
->drawId
);
744 // calculate per sample positions
745 psContext
.vX
.sample
= _simd_add_ps(psContext
.vX
.UL
, samplePos
.vX(sample
));
746 psContext
.vY
.sample
= _simd_add_ps(psContext
.vY
.UL
, samplePos
.vY(sample
));
748 CalcSampleBarycentrics(coeffs
, psContext
);
750 // interpolate and quantize z
751 psContext
.vZ
= vplaneps(coeffs
.vZa
, coeffs
.vZb
, coeffs
.vZc
, psContext
.vI
.sample
, psContext
.vJ
.sample
);
752 psContext
.vZ
= state
.pfnQuantizeDepth(psContext
.vZ
);
754 AR_END(BEBarycentric
, 0);
756 // interpolate user clip distance if available
757 if (state
.rastState
.clipDistanceMask
)
759 coverageMask
&= ~ComputeUserClipMask(state
.rastState
.clipDistanceMask
, work
.pUserClipBuffer
, psContext
.vI
.sample
, psContext
.vJ
.sample
);
762 simdscalar vCoverageMask
= vMask(coverageMask
);
763 simdscalar depthPassMask
= vCoverageMask
;
764 simdscalar stencilPassMask
= vCoverageMask
;
// Early depth/stencil test section.
769 AR_BEGIN(BEEarlyDepthTest
, pDC
->drawId
);
770 depthPassMask
= DepthStencilTest(&state
, work
.triFlags
.frontFacing
, work
.triFlags
.viewportIndex
,
771 psContext
.vZ
, pDepthSample
, vCoverageMask
, pStencilSample
, &stencilPassMask
);
772 AR_EVENT(EarlyDepthStencilInfoSampleRate(_simd_movemask_ps(depthPassMask
), _simd_movemask_ps(stencilPassMask
), _simd_movemask_ps(vCoverageMask
)));
773 AR_END(BEEarlyDepthTest
, 0);
775 // early-exit if no samples passed depth or earlyZ is forced on.
776 if (state
.psState
.forceEarlyZ
|| !_simd_movemask_ps(depthPassMask
))
778 DepthStencilWrite(&state
.vp
[work
.triFlags
.viewportIndex
], &state
.depthStencilState
, work
.triFlags
.frontFacing
, psContext
.vZ
,
779 pDepthSample
, depthPassMask
, vCoverageMask
, pStencilSample
, stencilPassMask
);
781 if (!_simd_movemask_ps(depthPassMask
))
// The sample's coverage bits are consumed on this exit path as well.
783 work
.coverageMask
[sample
] >>= (SIMD_TILE_Y_DIM
* SIMD_TILE_X_DIM
);
789 psContext
.sampleIndex
= sample
;
790 psContext
.activeMask
= _simd_castps_si(vCoverageMask
);
792 // execute pixel shader
793 AR_BEGIN(BEPixelShader
, pDC
->drawId
);
794 UPDATE_STAT_BE(PsInvocations
, _mm_popcnt_u32(_simd_movemask_ps(vCoverageMask
)));
795 state
.psState
.pfnPixelShader(GetPrivateState(pDC
), &psContext
);
796 AR_END(BEPixelShader
, 0);
// The shader may have cleared lanes in activeMask; coverage is re-read from it.
798 vCoverageMask
= _simd_castsi_ps(psContext
.activeMask
);
// Late depth/stencil test section.
803 AR_BEGIN(BELateDepthTest
, pDC
->drawId
);
804 depthPassMask
= DepthStencilTest(&state
, work
.triFlags
.frontFacing
, work
.triFlags
.viewportIndex
,
805 psContext
.vZ
, pDepthSample
, vCoverageMask
, pStencilSample
, &stencilPassMask
);
806 AR_EVENT(LateDepthStencilInfoSampleRate(_simd_movemask_ps(depthPassMask
), _simd_movemask_ps(stencilPassMask
), _simd_movemask_ps(vCoverageMask
)));
807 AR_END(BELateDepthTest
, 0);
809 if (!_simd_movemask_ps(depthPassMask
))
811 // need to call depth/stencil write for stencil write
812 DepthStencilWrite(&state
.vp
[work
.triFlags
.viewportIndex
], &state
.depthStencilState
, work
.triFlags
.frontFacing
, psContext
.vZ
,
813 pDepthSample
, depthPassMask
, vCoverageMask
, pStencilSample
, stencilPassMask
);
815 work
.coverageMask
[sample
] >>= (SIMD_TILE_Y_DIM
* SIMD_TILE_X_DIM
);
820 uint32_t statMask
= _simd_movemask_ps(depthPassMask
);
821 uint32_t statCount
= _mm_popcnt_u32(statMask
);
822 UPDATE_STAT_BE(DepthPassCount
, statCount
);
825 AR_BEGIN(BEOutputMerger
, pDC
->drawId
);
826 #if USE_8x2_TILE_BACKEND
827 OutputMerger8x2(psContext
, pColorBuffer
, sample
, &state
.blendState
, state
.pfnBlendFunc
, vCoverageMask
, depthPassMask
, state
.psState
.numRenderTargets
, state
.colorHottileEnable
, useAlternateOffset
);
829 OutputMerger4x2(psContext
, pColorBuffer
, sample
, &state
.blendState
, state
.pfnBlendFunc
, vCoverageMask
, depthPassMask
, state
.psState
.numRenderTargets
);
832 // do final depth write after all pixel kills
833 if (!state
.psState
.forceEarlyZ
)
835 DepthStencilWrite(&state
.vp
[work
.triFlags
.viewportIndex
], &state
.depthStencilState
, work
.triFlags
.frontFacing
, psContext
.vZ
,
836 pDepthSample
, depthPassMask
, vCoverageMask
, pStencilSample
, stencilPassMask
);
838 AR_END(BEOutputMerger
, 0);
840 work
.coverageMask
[sample
] >>= (SIMD_TILE_Y_DIM
* SIMD_TILE_X_DIM
);
846 AR_BEGIN(BEEndTile
, pDC
->drawId
);
848 if(T::InputCoverage
== SWR_INPUT_COVERAGE_INNER_CONSERVATIVE
)
850 work
.innerCoverageMask
>>= (SIMD_TILE_Y_DIM
* SIMD_TILE_X_DIM
);
853 #if USE_8x2_TILE_BACKEND
854 if (useAlternateOffset
)
856 for (uint32_t rt
= 0; rt
< state
.psState
.numRenderTargets
; ++rt
)
858 pColorBuffer
[rt
] += (2 * KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_COLOR_HOT_TILE_FORMAT
>::bpp
) / 8;
862 for (uint32_t rt
= 0; rt
< state
.psState
.numRenderTargets
; ++rt
)
864 pColorBuffer
[rt
] += (KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_COLOR_HOT_TILE_FORMAT
>::bpp
) / 8;
867 pDepthBuffer
+= (KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_DEPTH_HOT_TILE_FORMAT
>::bpp
) / 8;
868 pStencilBuffer
+= (KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_STENCIL_HOT_TILE_FORMAT
>::bpp
) / 8;
870 AR_END(BEEndTile
, 0);
872 psContext
.vX
.UL
= _simd_add_ps(psContext
.vX
.UL
, dx
);
873 psContext
.vX
.center
= _simd_add_ps(psContext
.vX
.center
, dx
);
876 psContext
.vY
.UL
= _simd_add_ps(psContext
.vY
.UL
, dy
);
877 psContext
.vY
.center
= _simd_add_ps(psContext
.vY
.center
, dy
);
880 AR_END(BESampleRateBackend
, 0);
//////////////////////////////////////////////////////////////////////////
/// @brief Depth/stencil-only backend for the NULL pixel shader case. For each
///        SIMD tile visited by the yy/xx loops and each sample bit set in
///        state.blendState.sampleMask it interpolates and quantizes Z, runs
///        DepthStencilTest, calls DepthStencilWrite and adds the number of
///        depth-passing lanes to the DepthPassCount stat.
/// @tparam sampleCountT - multisample count this instantiation is registered
///         under in gBackendNullPs; no statement visible in this text names it.
/// @param pDC - pointer to draw context; supplies pContext, drawId and the API state.
/// @param workerId - worker id; not named directly by the visible statements
///        (NOTE(review): the stat/AR macros may consume it - confirm).
/// @param x - start of the X walk; xx runs over [x, x + KNOB_TILE_X_DIM).
/// @param y - start of the Y walk; yy runs over [y, y + KNOB_TILE_Y_DIM).
/// @param work - triangle descriptor; coverageMask[sample] is read and then
///        shifted right by SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM bits.
/// @param renderBuffers - handed to SetupRenderBuffers to obtain the depth and
///        stencil pointers.
/// @note NOTE(review): this text carries no braces and its embedded line
///       numbers skip values, so block nesting is inferred from statement
///       order only and some statements may be missing - verify against the
///       real file before relying on these comments.
882 // optimized backend flow with NULL PS
883 template<uint32_t sampleCountT
>
884 void BackendNullPS(DRAW_CONTEXT
*pDC
, uint32_t workerId
, uint32_t x
, uint32_t y
, SWR_TRIANGLE_DESC
&work
, RenderOutputBuffers
&renderBuffers
)
// NOTE(review): pContext is not named by any later visible statement;
// presumably the AR_* macros reference it - confirm.
886 SWR_CONTEXT
*pContext
= pDC
->pContext
;
// Open the BENullBackend region (closed by the AR_END at the bottom).
888 AR_BEGIN(BENullBackend
, pDC
->drawId
);
889 ///@todo: handle center multisample pattern
// NOTE(review): no AR_END(BESetup, ...) is visible in this text.
890 AR_BEGIN(BESetup
, pDC
->drawId
);
892 const API_STATE
&state
= GetApiState(pDC
);
// Barycentric coefficients for this triangle, filled from 'work'.
894 BarycentricCoeffs coeffs
;
895 SetupBarycentricCoeffs(&coeffs
, work
);
// Only depth and stencil pointers are requested; the first argument is NULL.
897 uint8_t *pDepthBuffer
, *pStencilBuffer
;
898 SetupRenderBuffers(NULL
, &pDepthBuffer
, &pStencilBuffer
, 0, renderBuffers
);
// psContext fields touched below: vX.sample, vY.sample, vI.sample,
// vJ.sample and vZ.
900 SWR_PS_CONTEXT psContext
;
901 // skip SetupPixelShaderContext(&psContext, ...); // not needed here
// Starting Y positions: vULOffsetsY plus y broadcast to all lanes.
905 simdscalar vYSamplePosUL
= _simd_add_ps(vULOffsetsY
, _simd_set1_ps(static_cast<float>(y
)));
// Y step added to vYSamplePosUL (see the add near the end).
907 const simdscalar dy
= _simd_set1_ps(static_cast<float>(SIMD_TILE_Y_DIM
));
908 const SWR_MULTISAMPLE_POS
& samplePos
= state
.rastState
.samplePositions
;
// Walk Y in steps of SIMD_TILE_Y_DIM.
909 for (uint32_t yy
= y
; yy
< y
+ KNOB_TILE_Y_DIM
; yy
+= SIMD_TILE_Y_DIM
)
// X positions are rebuilt from x here, i.e. restarted for each yy step.
911 simdscalar vXSamplePosUL
= _simd_add_ps(vULOffsetsX
, _simd_set1_ps(static_cast<float>(x
)));
// X step added to vXSamplePosUL (see the add near the end).
913 const simdscalar dx
= _simd_set1_ps(static_cast<float>(SIMD_TILE_X_DIM
));
// Walk X in steps of SIMD_TILE_X_DIM.
915 for (uint32_t xx
= x
; xx
< x
+ KNOB_TILE_X_DIM
; xx
+= SIMD_TILE_X_DIM
)
917 // iterate over active samples
918 unsigned long sample
= 0;
919 uint32_t sampleMask
= state
.blendState
.sampleMask
;
// _BitScanForward stores the index of a set bit in 'sample'; the loop
// runs until sampleMask has no bits left.
920 while (_BitScanForward(&sample
, sampleMask
))
// Clear the bit just found so the next scan moves on.
922 sampleMask
&= ~(1 << sample
);
// Coverage bits of this sample, ANDed with MASK (defined elsewhere).
924 simdmask coverageMask
= work
.coverageMask
[sample
] & MASK
;
928 // offset depth/stencil buffers current sample
929 uint8_t *pDepthSample
= pDepthBuffer
+ RasterTileDepthOffset(sample
);
930 uint8_t *pStencilSample
= pStencilBuffer
+ RasterTileStencilOffset(sample
);
// Optional depth-bounds test, taken only when both the depth hot tile and
// the depth-bounds test are enabled.
932 if (state
.depthHottileEnable
&& state
.depthBoundsState
.depthBoundsTestEnable
)
// The load below reinterprets the depth sample as float, hence R32_FLOAT.
934 static_assert(KNOB_DEPTH_HOT_TILE_FORMAT
== R32_FLOAT
, "Unsupported depth hot tile format");
936 const simdscalar z
= _simd_load_ps(reinterpret_cast<const float *>(pDepthSample
));
938 const float minz
= state
.depthBoundsState
.depthBoundsTestMinValue
;
939 const float maxz
= state
.depthBoundsState
.depthBoundsTestMaxValue
;
// Keep only the lanes whose bit is set in the accept mask.
941 coverageMask
&= CalcDepthBoundsAcceptMask(z
, minz
, maxz
);
944 AR_BEGIN(BEBarycentric
, pDC
->drawId
);
946 // calculate per sample positions
947 psContext
.vX
.sample
= _simd_add_ps(vXSamplePosUL
, samplePos
.vX(sample
));
948 psContext
.vY
.sample
= _simd_add_ps(vYSamplePosUL
, samplePos
.vY(sample
));
// NOTE(review): presumably fills psContext.vI.sample / vJ.sample, which
// are read just below - confirm in CalcSampleBarycentrics.
950 CalcSampleBarycentrics(coeffs
, psContext
);
952 // interpolate and quantize z
953 psContext
.vZ
= vplaneps(coeffs
.vZa
, coeffs
.vZb
, coeffs
.vZc
, psContext
.vI
.sample
, psContext
.vJ
.sample
);
954 psContext
.vZ
= state
.pfnQuantizeDepth(psContext
.vZ
);
956 AR_END(BEBarycentric
, 0);
958 // interpolate user clip distance if available
959 if (state
.rastState
.clipDistanceMask
)
// Remove the lanes whose bit is set in the ComputeUserClipMask result.
961 coverageMask
&= ~ComputeUserClipMask(state
.rastState
.clipDistanceMask
, work
.pUserClipBuffer
, psContext
.vI
.sample
, psContext
.vJ
.sample
);
// Expand the bit mask into a SIMD mask; the stencil pass mask starts out
// equal to coverage and is updated through the pointer passed to
// DepthStencilTest.
964 simdscalar vCoverageMask
= vMask(coverageMask
);
965 simdscalar stencilPassMask
= vCoverageMask
;
// Depth/stencil test followed directly by the write, both inside the
// BEEarlyDepthTest region.
967 AR_BEGIN(BEEarlyDepthTest
, pDC
->drawId
);
968 simdscalar depthPassMask
= DepthStencilTest(&state
, work
.triFlags
.frontFacing
, work
.triFlags
.viewportIndex
,
969 psContext
.vZ
, pDepthSample
, vCoverageMask
, pStencilSample
, &stencilPassMask
);
970 AR_EVENT(EarlyDepthStencilInfoNullPS(_simd_movemask_ps(depthPassMask
), _simd_movemask_ps(stencilPassMask
), _simd_movemask_ps(vCoverageMask
)));
971 DepthStencilWrite(&state
.vp
[work
.triFlags
.viewportIndex
], &state
.depthStencilState
, work
.triFlags
.frontFacing
, psContext
.vZ
,
972 pDepthSample
, depthPassMask
, vCoverageMask
, pStencilSample
, stencilPassMask
);
973 AR_END(BEEarlyDepthTest
, 0);
// Count the lanes that passed the depth test and add them to the
// DepthPassCount stat.
975 uint32_t statMask
= _simd_movemask_ps(depthPassMask
);
976 uint32_t statCount
= _mm_popcnt_u32(statMask
);
977 UPDATE_STAT_BE(DepthPassCount
, statCount
);
// Discard the coverage bits of the SIMD tile just processed for this sample.
982 work
.coverageMask
[sample
] >>= (SIMD_TILE_Y_DIM
* SIMD_TILE_X_DIM
);
// Advance the depth/stencil pointers by KNOB_SIMD_WIDTH elements of
// bpp / 8 bytes each.
985 pDepthBuffer
+= (KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_DEPTH_HOT_TILE_FORMAT
>::bpp
) / 8;
986 pStencilBuffer
+= (KNOB_SIMD_WIDTH
* FormatTraits
<KNOB_STENCIL_HOT_TILE_FORMAT
>::bpp
) / 8;
// Step the X positions by dx.
988 vXSamplePosUL
= _simd_add_ps(vXSamplePosUL
, dx
);
// Step the Y positions by dy.
991 vYSamplePosUL
= _simd_add_ps(vYSamplePosUL
, dy
);
// Close the BENullBackend region opened at the top.
994 AR_END(BENullBackend
, 0);
//////////////////////////////////////////////////////////////////////////
/// @brief Populates sClearTilesTable: zero-fills the whole table, then stores
///        the ClearMacroTile<format> instantiation for the five formats
///        listed below. Entries for every other format stay zeroed.
997 void InitClearTilesTable()
// Zero every entry first so formats without a clear routine are detectable.
999 memset(sClearTilesTable
, 0, sizeof(sClearTilesTable
));
// Formats that have a clear routine.
1001 sClearTilesTable
[R8G8B8A8_UNORM
] = ClearMacroTile
<R8G8B8A8_UNORM
>;
1002 sClearTilesTable
[B8G8R8A8_UNORM
] = ClearMacroTile
<B8G8R8A8_UNORM
>;
1003 sClearTilesTable
[R32_FLOAT
] = ClearMacroTile
<R32_FLOAT
>;
1004 sClearTilesTable
[R32G32B32A32_FLOAT
] = ClearMacroTile
<R32G32B32A32_FLOAT
>;
1005 sClearTilesTable
[R8_UINT
] = ClearMacroTile
<R8_UINT
>;
//////////////////////////////////////////////////////////////////////////
/// Global backend function-pointer tables.
/// NOTE(review): some array dimensions and the declaration terminators are
/// missing from this text. InitBackendSingleFuncTable takes a
/// [SWR_INPUT_COVERAGE_COUNT][2][2] table and InitBackendSampleFuncTable a
/// [SWR_MULTISAMPLE_TYPE_COUNT][SWR_INPUT_COVERAGE_COUNT][2][2] table, so the
/// full shapes presumably match those - confirm against the real file.

// NULL-PS backends, one per multisample count; filled in InitBackendFuncTables.
1008 PFN_BACKEND_FUNC gBackendNullPs
[SWR_MULTISAMPLE_TYPE_COUNT
];
// Single-sample backends; filled by InitBackendSingleFuncTable.
1009 PFN_BACKEND_FUNC gBackendSingleSample
[SWR_INPUT_COVERAGE_COUNT
]
// Pixel-rate MSAA backends. No visible code writes this table;
// NOTE(review): presumably InitBackendPixelRate0() fills it - confirm.
1013 PFN_BACKEND_FUNC gBackendPixelRateTable
[SWR_MULTISAMPLE_TYPE_COUNT
]
1014 [2] // isCenterPattern
1015 [SWR_INPUT_COVERAGE_COUNT
]
1017 [2] // forcedSampleCount
// Sample-rate MSAA backends; filled by InitBackendSampleFuncTable.
1020 PFN_BACKEND_FUNC gBackendSampleRateTable
[SWR_MULTISAMPLE_TYPE_COUNT
]
1021 [SWR_INPUT_COVERAGE_COUNT
]
//////////////////////////////////////////////////////////////////////////
/// @brief BEChooser<ArgsT...>: a chain of static GetFunc overloads. Each
///        overload looks at its first runtime argument (tArg), appends the
///        matching compile-time constant to ArgsT and calls GetFunc on that
///        larger BEChooser with the remaining arguments. The overload taking
///        SWR_BACKEND_FUNCS ends the chain by returning a backend function
///        instantiated on SwrBackendTraits<ArgsT...>.
/// @note NOTE(review): the struct header, the switch/if heads, the default
///       labels and all braces are absent from this text; the comments below
///       describe only the statements that are visible.
1026 // Recursive template used to auto-nest conditionals. Converts dynamic enum function
1027 // arguments to static template arguments.
1028 template <uint32_t... ArgsT
>
1031 // Last Arg Terminator
// Returns the backend entry point matching tArg, instantiated with every
// template argument accumulated so far.
1032 static PFN_BACKEND_FUNC
GetFunc(SWR_BACKEND_FUNCS tArg
)
// Each case returns immediately, so the trailing 'break' is never reached.
1036 case SWR_BACKEND_SINGLE_SAMPLE
: return BackendSingleSample
<SwrBackendTraits
<ArgsT
...>>; break;
1037 case SWR_BACKEND_MSAA_PIXEL_RATE
: return BackendPixelRate
<SwrBackendTraits
<ArgsT
...>>; break;
1038 case SWR_BACKEND_MSAA_SAMPLE_RATE
: return BackendSampleRate
<SwrBackendTraits
<ArgsT
...>>; break;
// NOTE(review): whatever follows this assert (e.g. a fallback return) is
// not visible in this text.
1040 SWR_ASSERT(0 && "Invalid backend func\n");
1046 // Recursively parse args
// Input-coverage step: appends the SWR_INPUT_COVERAGE value to ArgsT.
1047 template <typename
... TArgsT
>
1048 static PFN_BACKEND_FUNC
GetFunc(SWR_INPUT_COVERAGE tArg
, TArgsT
... remainingArgs
)
1052 case SWR_INPUT_COVERAGE_NONE
: return BEChooser
<ArgsT
..., SWR_INPUT_COVERAGE_NONE
>::GetFunc(remainingArgs
...); break;
1053 case SWR_INPUT_COVERAGE_NORMAL
: return BEChooser
<ArgsT
..., SWR_INPUT_COVERAGE_NORMAL
>::GetFunc(remainingArgs
...); break;
1054 case SWR_INPUT_COVERAGE_INNER_CONSERVATIVE
: return BEChooser
<ArgsT
..., SWR_INPUT_COVERAGE_INNER_CONSERVATIVE
>::GetFunc(remainingArgs
...); break;
// Unrecognized value: assert, then continue as SWR_INPUT_COVERAGE_NONE.
1056 SWR_ASSERT(0 && "Invalid sample pattern\n");
1057 return BEChooser
<ArgsT
..., SWR_INPUT_COVERAGE_NONE
>::GetFunc(remainingArgs
...);
1062 // Recursively parse args
// Sample-count step: appends the SWR_MULTISAMPLE_COUNT value to ArgsT.
1063 template <typename
... TArgsT
>
1064 static PFN_BACKEND_FUNC
GetFunc(SWR_MULTISAMPLE_COUNT tArg
, TArgsT
... remainingArgs
)
1068 case SWR_MULTISAMPLE_1X
: return BEChooser
<ArgsT
..., SWR_MULTISAMPLE_1X
>::GetFunc(remainingArgs
...); break;
1069 case SWR_MULTISAMPLE_2X
: return BEChooser
<ArgsT
..., SWR_MULTISAMPLE_2X
>::GetFunc(remainingArgs
...); break;
1070 case SWR_MULTISAMPLE_4X
: return BEChooser
<ArgsT
..., SWR_MULTISAMPLE_4X
>::GetFunc(remainingArgs
...); break;
1071 case SWR_MULTISAMPLE_8X
: return BEChooser
<ArgsT
..., SWR_MULTISAMPLE_8X
>::GetFunc(remainingArgs
...); break;
1072 case SWR_MULTISAMPLE_16X
: return BEChooser
<ArgsT
..., SWR_MULTISAMPLE_16X
>::GetFunc(remainingArgs
...); break;
// Unrecognized value: assert, then continue as SWR_MULTISAMPLE_1X.
1074 SWR_ASSERT(0 && "Invalid sample count\n");
1075 return BEChooser
<ArgsT
..., SWR_MULTISAMPLE_1X
>::GetFunc(remainingArgs
...);
1080 // Recursively parse args
// Boolean step: appends 1 or 0 to ArgsT.
// NOTE(review): the condition selecting between the two returns is not
// visible here; presumably it tests tArg - confirm.
1081 template <typename
... TArgsT
>
1082 static PFN_BACKEND_FUNC
GetFunc(bool tArg
, TArgsT
... remainingArgs
)
1086 return BEChooser
<ArgsT
..., 1>::GetFunc(remainingArgs
...);
1089 return BEChooser
<ArgsT
..., 0>::GetFunc(remainingArgs
...);
//////////////////////////////////////////////////////////////////////////
/// @brief Fills the single-sample backend table. For every combination of
///        inputCoverage, isCentroid and canEarlyZ it stores the function
///        returned by BEChooser<>::GetFunc for SWR_MULTISAMPLE_1X and
///        SWR_BACKEND_SINGLE_SAMPLE.
/// @param table - [SWR_INPUT_COVERAGE_COUNT][2][2] array of backend function
///        pointers, written in place.
1093 void InitBackendSingleFuncTable(PFN_BACKEND_FUNC (&table
)[SWR_INPUT_COVERAGE_COUNT
][2][2])
1095 for(uint32_t inputCoverage
= 0; inputCoverage
< SWR_INPUT_COVERAGE_COUNT
; inputCoverage
++)
1097 for(uint32_t isCentroid
= 0; isCentroid
< 2; isCentroid
++)
1099 for(uint32_t canEarlyZ
= 0; canEarlyZ
< 2; canEarlyZ
++)
// The 0/1 loop indices are converted to bool with '> 0'.
// NOTE(review): the two literal 'false' arguments presumably stand for the
// isCenterPattern and forcedSampleCount choices noted on
// gBackendPixelRateTable - confirm against the argument order.
1101 table
[inputCoverage
][isCentroid
][canEarlyZ
] =
1102 BEChooser
<>::GetFunc(SWR_MULTISAMPLE_1X
, false, (SWR_INPUT_COVERAGE
)inputCoverage
,
1103 (isCentroid
> 0), false, (canEarlyZ
> 0), SWR_BACKEND_SINGLE_SAMPLE
);
//////////////////////////////////////////////////////////////////////////
/// @brief Fills the sample-rate backend table. For every combination of
///        sampleCount, inputCoverage, centroid and canEarlyZ it stores the
///        function returned by BEChooser<>::GetFunc for
///        SWR_BACKEND_MSAA_SAMPLE_RATE.
/// @param table - [SWR_MULTISAMPLE_TYPE_COUNT][SWR_INPUT_COVERAGE_COUNT][2][2]
///        array of backend function pointers, written in place.
1109 void InitBackendSampleFuncTable(PFN_BACKEND_FUNC (&table
)[SWR_MULTISAMPLE_TYPE_COUNT
][SWR_INPUT_COVERAGE_COUNT
][2][2])
// Sample counts are walked from SWR_MULTISAMPLE_1X up to (not including)
// SWR_MULTISAMPLE_TYPE_COUNT.
1111 for(uint32_t sampleCount
= SWR_MULTISAMPLE_1X
; sampleCount
< SWR_MULTISAMPLE_TYPE_COUNT
; sampleCount
++)
1113 for(uint32_t inputCoverage
= 0; inputCoverage
< SWR_INPUT_COVERAGE_COUNT
; inputCoverage
++)
1115 for(uint32_t centroid
= 0; centroid
< 2; centroid
++)
1117 for(uint32_t canEarlyZ
= 0; canEarlyZ
< 2; canEarlyZ
++)
// Loop indices are cast back to their enum types, or converted to bool
// with '> 0', before being handed to BEChooser.
// NOTE(review): the two literal 'false' arguments presumably stand for the
// isCenterPattern and forcedSampleCount choices noted on
// gBackendPixelRateTable - confirm against the argument order.
1119 table
[sampleCount
][inputCoverage
][centroid
][canEarlyZ
] =
1120 BEChooser
<>::GetFunc((SWR_MULTISAMPLE_COUNT
)sampleCount
, false, (SWR_INPUT_COVERAGE
)inputCoverage
,
1121 (centroid
> 0), false, (canEarlyZ
> 0), (SWR_BACKEND_FUNCS
)SWR_BACKEND_MSAA_SAMPLE_RATE
);
// Forward declaration; the definition is not part of this text.
1128 void InitBackendPixelRate0();
//////////////////////////////////////////////////////////////////////////
/// @brief Initializes the global backend tables: fills gBackendSingleSample
///        and gBackendSampleRateTable through their init helpers, calls
///        InitBackendPixelRate0(), and stores one BackendNullPS instantiation
///        per multisample count (1X to 16X) in gBackendNullPs.
1129 void InitBackendFuncTables()
1131 InitBackendSingleFuncTable(gBackendSingleSample
);
// NOTE(review): presumably fills gBackendPixelRateTable - confirm in its
// definition.
1132 InitBackendPixelRate0();
1133 InitBackendSampleFuncTable(gBackendSampleRateTable
);
// NULL-PS backends, indexed by the same enum used as the template argument.
1135 gBackendNullPs
[SWR_MULTISAMPLE_1X
] = &BackendNullPS
< SWR_MULTISAMPLE_1X
> ;
1136 gBackendNullPs
[SWR_MULTISAMPLE_2X
] = &BackendNullPS
< SWR_MULTISAMPLE_2X
> ;
1137 gBackendNullPs
[SWR_MULTISAMPLE_4X
] = &BackendNullPS
< SWR_MULTISAMPLE_4X
> ;
1138 gBackendNullPs
[SWR_MULTISAMPLE_8X
] = &BackendNullPS
< SWR_MULTISAMPLE_8X
> ;
1139 gBackendNullPs
[SWR_MULTISAMPLE_16X
] = &BackendNullPS
< SWR_MULTISAMPLE_16X
> ;